├── 05-dataprocessing
    └── code
    │   ├── data
    │       ├── missingdata.csv
    │       ├── categoricaldata.csv
    │       ├── iris.csv
    │       └── iris_mod.csv
    │   ├── images
    │       ├── eda.pdf
    │       ├── decisionreg.pdf
    │       ├── estimator-api.pdf
    │       ├── estimator-api.png
    │       ├── holdout-tuning.pdf
    │       ├── holdout-tuning.png
    │       ├── iris-subsampling.pdf
    │       ├── iris-subsampling.png
    │       ├── sklearn-pipeline.pdf
    │       ├── sklearn-pipeline.png
    │       ├── transformer-api.pdf
    │       └── transformer-api.png
    │   ├── 5-1_reading-data.ipynb
    │   └── 5-3_python-oop.ipynb
├── 04-scientific-python
    ├── myplot.pdf
    ├── myplot.png
    └── images
    │   ├── output_171_0.png
    │   ├── output_173_0.png
    │   ├── output_174_0.png
    │   ├── output_176_0.png
    │   ├── output_178_0.png
    │   ├── output_180_0.png
    │   ├── output_181_0.png
    │   ├── output_183_0.png
    │   ├── output_185_0.png
    │   ├── output_188_0.png
    │   └── numpy-intro
    │       ├── array_1.png
    │       ├── array_2.png
    │       ├── matmul.png
    │       ├── ufunc.png
    │       ├── matmatmul.png
    │       ├── random_1.png
    │       ├── random_2.png
    │       ├── transpose.png
    │       ├── broadcasting-1.png
    │       ├── broadcasting-2.png
    │       └── numpy-nature-1.png
├── 06-decision-trees
    └── code
    │   └── tree.png
├── 09_eval2-resampling-and-CIs
    ├── CI_iris.pdf
    ├── CI_mnist.pdf
    └── 05_adv-bootstrap.ipynb
├── 11-nested-cross-validation
    ├── nested-cv-image.png
    └── 1_nested-cv_compact.ipynb
├── README.md
├── hw02-starter
    ├── dataset
    │   ├── DryBeanDataset
    │   │   ├── Dry_Bean_Dataset.xlsx
    │   │   └── Dry_Bean_Dataset.txt
    │   └── make-splits.ipynb
    └── hw-baseline.ipynb
├── hyperparameter-tuning-examples
    ├── figures
    │   └── orion-recommendations.png
    ├── 05.1-successive-halving-decisiontree.ipynb
    ├── 06.1-genetic-opt.ipynb
    ├── 03.1-hyperopt-decisiontree-example.ipynb
    ├── 05.2-successive-halving-stacking.ipynb
    └── 01.1-gridsearch-decisiontree-example.ipynb
├── LICENSE
├── .gitignore
├── 07-ensembles
    └── code
    │   ├── 07-01_majority-voting.ipynb
    │   ├── 07-05_random-forests.ipynb
    │   ├── 07-02_bagging.ipynb
    │   ├── 07-06_stacking.ipynb
    │   ├── 07-04_gradient-boosting.ipynb
    │   └── 07-03_adaboosting.ipynb
├── 02-knn
    └── code
    │   └── iris.csv
└── from-scratch-coding-exercises
    ├── bagging-from-scratch.ipynb
    └── bagging-solution.ipynb


/05-dataprocessing/code/data/missingdata.csv:
--------------------------------------------------------------------------------
1 | A,B,C,D
2 | 1.,2.,3.,4.
3 | 5.,6.,,8.
4 | 10.,11.,12.,
5 | 


--------------------------------------------------------------------------------
/04-scientific-python/myplot.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/myplot.pdf


--------------------------------------------------------------------------------
/04-scientific-python/myplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/myplot.png


--------------------------------------------------------------------------------
/06-decision-trees/code/tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/06-decision-trees/code/tree.png


--------------------------------------------------------------------------------
/05-dataprocessing/code/data/categoricaldata.csv:
--------------------------------------------------------------------------------
1 | color,size,price,classlabel
2 | green,M,10.1,class1
3 | red,L,13.5,class2
4 | blue,XXL,15.3,class1


--------------------------------------------------------------------------------
/05-dataprocessing/code/images/eda.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/05-dataprocessing/code/images/eda.pdf


--------------------------------------------------------------------------------
/09_eval2-resampling-and-CIs/CI_iris.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/09_eval2-resampling-and-CIs/CI_iris.pdf


--------------------------------------------------------------------------------
/09_eval2-resampling-and-CIs/CI_mnist.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/09_eval2-resampling-and-CIs/CI_mnist.pdf


--------------------------------------------------------------------------------
/04-scientific-python/images/output_171_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/output_171_0.png


--------------------------------------------------------------------------------
/04-scientific-python/images/output_173_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/output_173_0.png


--------------------------------------------------------------------------------
/04-scientific-python/images/output_174_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/output_174_0.png


--------------------------------------------------------------------------------
/04-scientific-python/images/output_176_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/output_176_0.png


--------------------------------------------------------------------------------
/04-scientific-python/images/output_178_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/output_178_0.png


--------------------------------------------------------------------------------
/04-scientific-python/images/output_180_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/output_180_0.png


--------------------------------------------------------------------------------
/04-scientific-python/images/output_181_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/output_181_0.png


--------------------------------------------------------------------------------
/04-scientific-python/images/output_183_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/output_183_0.png


--------------------------------------------------------------------------------
/04-scientific-python/images/output_185_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/output_185_0.png


--------------------------------------------------------------------------------
/04-scientific-python/images/output_188_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/output_188_0.png


--------------------------------------------------------------------------------
/05-dataprocessing/code/images/decisionreg.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/05-dataprocessing/code/images/decisionreg.pdf


--------------------------------------------------------------------------------
/11-nested-cross-validation/nested-cv-image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/11-nested-cross-validation/nested-cv-image.png


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # stat451-machine-learning-fs21
2 | 
3 | Course material for Stat 451: Introduction to Machine Learning and Statistical Pattern Classification
4 | 
5 | 


--------------------------------------------------------------------------------
/05-dataprocessing/code/images/estimator-api.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/05-dataprocessing/code/images/estimator-api.pdf


--------------------------------------------------------------------------------
/05-dataprocessing/code/images/estimator-api.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/05-dataprocessing/code/images/estimator-api.png


--------------------------------------------------------------------------------
/05-dataprocessing/code/images/holdout-tuning.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/05-dataprocessing/code/images/holdout-tuning.pdf


--------------------------------------------------------------------------------
/05-dataprocessing/code/images/holdout-tuning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/05-dataprocessing/code/images/holdout-tuning.png


--------------------------------------------------------------------------------
/04-scientific-python/images/numpy-intro/array_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/numpy-intro/array_1.png


--------------------------------------------------------------------------------
/04-scientific-python/images/numpy-intro/array_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/numpy-intro/array_2.png


--------------------------------------------------------------------------------
/04-scientific-python/images/numpy-intro/matmul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/numpy-intro/matmul.png


--------------------------------------------------------------------------------
/04-scientific-python/images/numpy-intro/ufunc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/numpy-intro/ufunc.png


--------------------------------------------------------------------------------
/05-dataprocessing/code/images/iris-subsampling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/05-dataprocessing/code/images/iris-subsampling.pdf


--------------------------------------------------------------------------------
/05-dataprocessing/code/images/iris-subsampling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/05-dataprocessing/code/images/iris-subsampling.png


--------------------------------------------------------------------------------
/05-dataprocessing/code/images/sklearn-pipeline.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/05-dataprocessing/code/images/sklearn-pipeline.pdf


--------------------------------------------------------------------------------
/05-dataprocessing/code/images/sklearn-pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/05-dataprocessing/code/images/sklearn-pipeline.png


--------------------------------------------------------------------------------
/05-dataprocessing/code/images/transformer-api.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/05-dataprocessing/code/images/transformer-api.pdf


--------------------------------------------------------------------------------
/05-dataprocessing/code/images/transformer-api.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/05-dataprocessing/code/images/transformer-api.png


--------------------------------------------------------------------------------
/04-scientific-python/images/numpy-intro/matmatmul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/numpy-intro/matmatmul.png


--------------------------------------------------------------------------------
/04-scientific-python/images/numpy-intro/random_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/numpy-intro/random_1.png


--------------------------------------------------------------------------------
/04-scientific-python/images/numpy-intro/random_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/numpy-intro/random_2.png


--------------------------------------------------------------------------------
/04-scientific-python/images/numpy-intro/transpose.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/numpy-intro/transpose.png


--------------------------------------------------------------------------------
/04-scientific-python/images/numpy-intro/broadcasting-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/numpy-intro/broadcasting-1.png


--------------------------------------------------------------------------------
/04-scientific-python/images/numpy-intro/broadcasting-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/numpy-intro/broadcasting-2.png


--------------------------------------------------------------------------------
/04-scientific-python/images/numpy-intro/numpy-nature-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/04-scientific-python/images/numpy-intro/numpy-nature-1.png


--------------------------------------------------------------------------------
/hw02-starter/dataset/DryBeanDataset/Dry_Bean_Dataset.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/hw02-starter/dataset/DryBeanDataset/Dry_Bean_Dataset.xlsx


--------------------------------------------------------------------------------
/hyperparameter-tuning-examples/figures/orion-recommendations.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/HEAD/hyperparameter-tuning-examples/figures/orion-recommendations.png


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Sebastian Raschka
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/hw02-starter/dataset/DryBeanDataset/Dry_Bean_Dataset.txt:
--------------------------------------------------------------------------------
 1 | Data Set Name:
 2 | Dry Bean Dataset
 3 | 
 4 | Abstract:
 5 | Images of 13,611 grains of 7 different registered dry beans were taken with a high-resolution camera. A total of 16 features; 12 dimensions and 4 shape forms, were obtained from the grains.
 6 | 
 7 | Source:
 8 | Murat KOKLU
 9 | Faculty of Technology,
10 | Selcuk University,
11 | TURKEY.
12 | ORCID : 0000-0002-2737-2360
13 | mkoklu@selcuk.edu.tr
14 | 
15 | Ilker Ali OZKAN
16 | Faculty of Technology,
17 | Selcuk University,
18 | TURKEY.
19 | ORCID : 0000-0002-5715-1040
20 | ilkerozkan@selcuk.edu.tr
21 | 
22 | Data Type:
23 | Multivariate
24 | 
25 | Task:
26 | Classification
27 | 
28 | Attribute Type:
29 | Categorical
30 | Integer
31 | Real
32 | 
33 | Area:
34 | CS / Engineering
35 | 
36 | Format Type:
37 | Matrix
38 | 
39 | Does your data set contain missing values?
40 | No
41 | 
42 | Number of Instances (records in your data set): 
43 | 13611
44 | 
45 | Number of Attributes (fields within each record): 
46 | 17
47 | 
48 | Relevant Information:
49 | Seven different types of dry beans were used in this research, taking into account the features such as form, shape, type, and structure by the market situation. A computer vision system was developed to distinguish seven different registered varieties of dry beans with similar features in order to obtain uniform seed classification. For the classification model, images of 13,611 grains of 7 different registered dry beans were taken with a high-resolution camera. Bean images obtained by computer vision system were subjected to segmentation and feature extraction stages, and a total of 16 features; 12 dimensions and 4 shape forms, were obtained from the grains.
50 | 
51 | Attribute Information:
52 | 1.) Area (A): The area of a bean zone and the number of pixels within its boundaries.
53 | 2.) Perimeter (P): Bean circumference is defined as the length of its border.
54 | 3.) Major axis length (L): The distance between the ends of the longest line that can be drawn from a bean.
55 | 4.) Minor axis length (l): The longest line that can be drawn from the bean while standing perpendicular to the main axis.
56 | 5.) Aspect ratio (K): Defines the relationship between L and l.
57 | 6.) Eccentricity (Ec): Eccentricity of the ellipse having the same moments as the region.
58 | 7.) Convex area (C): Number of pixels in the smallest convex polygon that can contain the area of a bean seed.
59 | 8.) Equivalent diameter (Ed): The diameter of a circle having the same area as a bean seed area.
60 | 9.) Extent (Ex): The ratio of the pixels in the bounding box to the bean area.
61 | 10.)Solidity (S): Also known as convexity. The ratio of the pixels in the convex shell to those found in beans.
62 | 11.)Roundness (R): Calculated with the following formula: (4piA)/(P^2)
63 | 12.)Compactness (CO): Measures the roundness of an object: Ed/L
64 | 13.)ShapeFactor1 (SF1)
65 | 14.)ShapeFactor2 (SF2)
66 | 15.)ShapeFactor3 (SF3)
67 | 16.)ShapeFactor4 (SF4)
68 | 17.)Class (Seker, Barbunya, Bombay, Cali, Dermosan, Horoz and Sira)
69 | 
70 | 
71 | 
72 | Relevant Papers:
73 | KOKLU, M. and OZKAN, I.A., (2020), “Multiclass Classification of Dry Beans Using Computer Vision and Machine Learning Techniques.” Computers and Electronics in Agriculture, 174, 105507.
74 | DOI: https://doi.org/10.1016/j.compag.2020.105507
75 | 
76 | Citation Requests / Acknowledgements:
77 | KOKLU, M. and OZKAN, I.A., (2020), “Multiclass Classification of Dry Beans Using Computer Vision and Machine Learning Techniques.” Computers and Electronics in Agriculture, 174, 105507.
78 | DOI: https://doi.org/10.1016/j.compag.2020.105507


--------------------------------------------------------------------------------
/07-ensembles/code/07-01_majority-voting.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# STAT451: Machine Learning -- L07: Ensemble Methods"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "STAT 451: Intro to Machine Learning (Fall 2021)  \n",
 15 |     "Instructor: Sebastian Raschka"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": 1,
 21 |    "metadata": {},
 22 |    "outputs": [
 23 |     {
 24 |      "name": "stdout",
 25 |      "output_type": "stream",
 26 |      "text": [
 27 |       "scikit-learn: 1.0\n",
 28 |       "mlxtend     : 0.19.0\n",
 29 |       "\n"
 30 |      ]
 31 |     }
 32 |    ],
 33 |    "source": [
 34 |     "%load_ext watermark\n",
 35 |     "%watermark -p scikit-learn,mlxtend"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "markdown",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "# Majority Voting"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 2,
 48 |    "metadata": {},
 49 |    "outputs": [
 50 |     {
 51 |      "name": "stdout",
 52 |      "output_type": "stream",
 53 |      "text": [
 54 |       "Train/Valid/Test sizes: 318 80 171\n"
 55 |      ]
 56 |     }
 57 |    ],
 58 |    "source": [
 59 |     "from sklearn import model_selection\n",
 60 |     "from sklearn.tree import DecisionTreeClassifier\n",
 61 |     "from sklearn.model_selection import train_test_split\n",
 62 |     "from sklearn import datasets\n",
 63 |     "from mlxtend.classifier import EnsembleVoteClassifier\n",
 64 |     "\n",
 65 |     "data = datasets.load_breast_cancer()\n",
 66 |     "X, y = data.data, data.target\n",
 67 |     "\n",
 68 |     "X_temp, X_test, y_temp, y_test = \\\n",
 69 |     "    train_test_split(X, y, test_size=0.3, random_state=123, stratify=y)\n",
 70 |     "\n",
 71 |     "X_train, X_valid, y_train, y_valid = \\\n",
 72 |     "    train_test_split(X_temp, y_temp, test_size=0.2, random_state=123, stratify=y_temp)\n",
 73 |     "\n",
 74 |     "print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 3,
 80 |    "metadata": {},
 81 |    "outputs": [
 82 |     {
 83 |      "name": "stdout",
 84 |      "output_type": "stream",
 85 |      "text": [
 86 |       "Validation Accuracy: 0.91 [Classifier 1]\n",
 87 |       "Validation Accuracy: 0.89 [Classifier 2]\n",
 88 |       "Validation Accuracy: 0.91 [Classifier 3]\n",
 89 |       "Validation Accuracy: 0.91 [Ensemble]\n",
 90 |       "Test Accuracy: 0.93\n"
 91 |      ]
 92 |     }
 93 |    ],
 94 |    "source": [
 95 |     "clf1 = DecisionTreeClassifier(random_state=1, max_depth=None)\n",
 96 |     "clf2 = DecisionTreeClassifier(random_state=1, max_depth=1)\n",
 97 |     "clf3 = DecisionTreeClassifier(random_state=1, max_depth=2)\n",
 98 |     "eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], weights=[1, 1, 1])\n",
 99 |     "\n",
100 |     "labels = ['Classifier 1', 'Classifier 2', 'Classifier 3', 'Ensemble']\n",
101 |     "for clf, label in zip([clf1, clf2, clf3, eclf], labels):\n",
102 |     "\n",
103 |     "    clf.fit(X_train, y_train)\n",
104 |     "    print(\"Validation Accuracy: %0.2f [%s]\" % (clf.score(X_valid, y_valid), label))\n",
105 |     "    \n",
106 |     "print(\"Test Accuracy: %0.2f\" % eclf.score(X_test, y_test))"
107 |    ]
108 |   }
109 |  ],
110 |  "metadata": {
111 |   "kernelspec": {
112 |    "display_name": "Python 3 (ipykernel)",
113 |    "language": "python",
114 |    "name": "python3"
115 |   },
116 |   "language_info": {
117 |    "codemirror_mode": {
118 |     "name": "ipython",
119 |     "version": 3
120 |    },
121 |    "file_extension": ".py",
122 |    "mimetype": "text/x-python",
123 |    "name": "python",
124 |    "nbconvert_exporter": "python",
125 |    "pygments_lexer": "ipython3",
126 |    "version": "3.9.6"
127 |   }
128 |  },
129 |  "nbformat": 4,
130 |  "nbformat_minor": 4
131 | }
132 | 


--------------------------------------------------------------------------------
/07-ensembles/code/07-05_random-forests.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# STAT451: Machine Learning -- L07: Ensemble Methods Part 1/2"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "STAT 451: Intro to Machine Learning (Fall 2021)  \n",
 15 |     "Instructor: Sebastian Raschka (sraschka@wisc.edu)  \n"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": 1,
 21 |    "metadata": {},
 22 |    "outputs": [
 23 |     {
 24 |      "name": "stdout",
 25 |      "output_type": "stream",
 26 |      "text": [
 27 |       "scikit-learn: 1.0\n",
 28 |       "mlxtend     : 0.19.0\n",
 29 |       "\n"
 30 |      ]
 31 |     }
 32 |    ],
 33 |    "source": [
 34 |     "%load_ext watermark\n",
 35 |     "%watermark -p scikit-learn,mlxtend"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "markdown",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "# Random Forests"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 2,
 48 |    "metadata": {},
 49 |    "outputs": [
 50 |     {
 51 |      "name": "stdout",
 52 |      "output_type": "stream",
 53 |      "text": [
 54 |       "Train/Valid/Test sizes: 318 80 171\n"
 55 |      ]
 56 |     }
 57 |    ],
 58 |    "source": [
 59 |     "from sklearn import model_selection\n",
 60 |     "from sklearn.model_selection import train_test_split\n",
 61 |     "from sklearn import datasets\n",
 62 |     "\n",
 63 |     "\n",
 64 |     "data = datasets.load_breast_cancer()\n",
 65 |     "X, y = data.data, data.target\n",
 66 |     "\n",
 67 |     "X_temp, X_test, y_temp, y_test = \\\n",
 68 |     "    train_test_split(X, y, test_size=0.3, random_state=123, stratify=y)\n",
 69 |     "\n",
 70 |     "X_train, X_valid, y_train, y_valid = \\\n",
 71 |     "    train_test_split(X_temp, y_temp, test_size=0.2, random_state=123, stratify=y_temp)\n",
 72 |     "\n",
 73 |     "print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": 3,
 79 |    "metadata": {},
 80 |    "outputs": [
 81 |     {
 82 |      "name": "stdout",
 83 |      "output_type": "stream",
 84 |      "text": [
 85 |       "Training Accuracy: 1.00\n",
 86 |       "Validation Accuracy: 0.95\n",
 87 |       "Test Accuracy: 0.98\n"
 88 |      ]
 89 |     }
 90 |    ],
 91 |    "source": [
 92 |     "from sklearn.ensemble import RandomForestClassifier\n",
 93 |     "\n",
 94 |     "\n",
 95 |     "forest = RandomForestClassifier(n_estimators=100,\n",
 96 |     "                                random_state=1)\n",
 97 |     "\n",
 98 |     "forest.fit(X_train, y_train)\n",
 99 |     "    \n",
100 |     "print(\"Training Accuracy: %0.2f\" % forest.score(X_train, y_train))\n",
101 |     "print(\"Validation Accuracy: %0.2f\" % forest.score(X_valid, y_valid))\n",
102 |     "print(\"Test Accuracy: %0.2f\" % forest.score(X_test, y_test))"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 4,
108 |    "metadata": {},
109 |    "outputs": [
110 |     {
111 |      "name": "stdout",
112 |      "output_type": "stream",
113 |      "text": [
114 |       "Training Accuracy: 1.00\n",
115 |       "Validation Accuracy: 1.00\n",
116 |       "Test Accuracy: 0.98\n"
117 |      ]
118 |     }
119 |    ],
120 |    "source": [
121 |     "from sklearn.ensemble import ExtraTreesClassifier\n",
122 |     "\n",
123 |     "\n",
124 |     "forest = ExtraTreesClassifier(n_estimators=100,\n",
125 |     "                              random_state=1)\n",
126 |     "\n",
127 |     "forest.fit(X_train, y_train)\n",
128 |     "    \n",
129 |     "    \n",
130 |     "print(\"Training Accuracy: %0.2f\" % forest.score(X_train, y_train))\n",
131 |     "print(\"Validation Accuracy: %0.2f\" % forest.score(X_valid, y_valid))\n",
132 |     "print(\"Test Accuracy: %0.2f\" % forest.score(X_test, y_test))"
133 |    ]
134 |   }
135 |  ],
136 |  "metadata": {
137 |   "kernelspec": {
138 |    "display_name": "Python 3 (ipykernel)",
139 |    "language": "python",
140 |    "name": "python3"
141 |   },
142 |   "language_info": {
143 |    "codemirror_mode": {
144 |     "name": "ipython",
145 |     "version": 3
146 |    },
147 |    "file_extension": ".py",
148 |    "mimetype": "text/x-python",
149 |    "name": "python",
150 |    "nbconvert_exporter": "python",
151 |    "pygments_lexer": "ipython3",
152 |    "version": "3.9.6"
153 |   }
154 |  },
155 |  "nbformat": 4,
156 |  "nbformat_minor": 4
157 | }
158 | 


--------------------------------------------------------------------------------
/07-ensembles/code/07-02_bagging.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# STAT451: Machine Learning -- L07: Ensemble Methods"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "STAT 451: Intro to Machine Learning (Fall 2021)  \n",
 15 |     "Instructor: Sebastian Raschka"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": 1,
 21 |    "metadata": {},
 22 |    "outputs": [
 23 |     {
 24 |      "name": "stdout",
 25 |      "output_type": "stream",
 26 |      "text": [
 27 |       "scikit-learn: 1.0\n",
 28 |       "mlxtend     : 0.19.0\n",
 29 |       "\n"
 30 |      ]
 31 |     }
 32 |    ],
 33 |    "source": [
 34 |     "%load_ext watermark\n",
 35 |     "%watermark -p scikit-learn,mlxtend"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "markdown",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "# Bagging"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 2,
 48 |    "metadata": {},
 49 |    "outputs": [
 50 |     {
 51 |      "name": "stdout",
 52 |      "output_type": "stream",
 53 |      "text": [
 54 |       "Train/Valid/Test sizes: 318 80 171\n"
 55 |      ]
 56 |     }
 57 |    ],
 58 |    "source": [
 59 |     "from sklearn import model_selection\n",
 60 |     "from sklearn.tree import DecisionTreeClassifier\n",
 61 |     "from sklearn.model_selection import train_test_split\n",
 62 |     "from sklearn import datasets\n",
 63 |     "from sklearn.ensemble import BaggingClassifier\n",
 64 |     "\n",
 65 |     "\n",
 66 |     "data = datasets.load_breast_cancer()\n",
 67 |     "X, y = data.data, data.target\n",
 68 |     "\n",
 69 |     "X_temp, X_test, y_temp, y_test = \\\n",
 70 |     "    train_test_split(X, y, test_size=0.3, random_state=123, stratify=y)\n",
 71 |     "\n",
 72 |     "X_train, X_valid, y_train, y_valid = \\\n",
 73 |     "    train_test_split(X_temp, y_temp, test_size=0.2, random_state=123, stratify=y_temp)\n",
 74 |     "\n",
 75 |     "print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 3,
 81 |    "metadata": {},
 82 |    "outputs": [
 83 |     {
 84 |      "name": "stdout",
 85 |      "output_type": "stream",
 86 |      "text": [
 87 |       "OOB Accuracy: 0.95\n",
 88 |       "Validation Accuracy: 0.96\n",
 89 |       "Test Accuracy: 0.96\n"
 90 |      ]
 91 |     }
 92 |    ],
 93 |    "source": [
 94 |     "tree = DecisionTreeClassifier(criterion='entropy',\n",
 95 |     "                              random_state=1,\n",
 96 |     "                              max_depth=None)\n",
 97 |     "\n",
 98 |     "\n",
 99 |     "bag = BaggingClassifier(base_estimator=tree,\n",
100 |     "                        n_estimators=500,\n",
101 |     "                        oob_score=True,\n",
102 |     "                        bootstrap=True,\n",
103 |     "                        bootstrap_features=False,\n",
104 |     "                        n_jobs=1,\n",
105 |     "                        random_state=1)\n",
106 |     "\n",
107 |     "bag.fit(X_train, y_train)\n",
108 |     "    \n",
109 |     "\n",
110 |     "print(\"OOB Accuracy: %0.2f\" % bag.oob_score_)\n",
111 |     "print(\"Validation Accuracy: %0.2f\" % bag.score(X_valid, y_valid))\n",
112 |     "print(\"Test Accuracy: %0.2f\" % bag.score(X_test, y_test))"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": 4,
118 |    "metadata": {},
119 |    "outputs": [
120 |     {
121 |      "name": "stdout",
122 |      "output_type": "stream",
123 |      "text": [
124 |       "Training Accuracy: 1.00\n",
125 |       "Test Accuracy: 0.93\n"
126 |      ]
127 |     }
128 |    ],
129 |    "source": [
130 |     "tree = DecisionTreeClassifier(criterion='entropy',\n",
131 |     "                              random_state=1,\n",
132 |     "                              max_depth=None)\n",
133 |     "\n",
134 |     "tree.fit(X_train, y_train)\n",
135 |     "\n",
136 |     "print(\"Training Accuracy: %0.2f\" % tree.score(X_train, y_train))\n",
137 |     "print(\"Test Accuracy: %0.2f\" % tree.score(X_test, y_test))"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": 5,
143 |    "metadata": {},
144 |    "outputs": [
145 |     {
146 |      "name": "stdout",
147 |      "output_type": "stream",
148 |      "text": [
149 |       "Training Accuracy: 0.92\n",
150 |       "Test Accuracy: 0.92\n"
151 |      ]
152 |     }
153 |    ],
154 |    "source": [
155 |     "tree = DecisionTreeClassifier(criterion='entropy',\n",
156 |     "                              random_state=1,\n",
157 |     "                              max_depth=1)\n",
158 |     "\n",
159 |     "tree.fit(X_train, y_train)\n",
160 |     "\n",
161 |     "print(\"Training Accuracy: %0.2f\" % tree.score(X_train, y_train))\n",
162 |     "print(\"Test Accuracy: %0.2f\" % tree.score(X_test, y_test))"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": null,
168 |    "metadata": {},
169 |    "outputs": [],
170 |    "source": []
171 |   }
172 |  ],
173 |  "metadata": {
174 |   "kernelspec": {
175 |    "display_name": "Python 3 (ipykernel)",
176 |    "language": "python",
177 |    "name": "python3"
178 |   },
179 |   "language_info": {
180 |    "codemirror_mode": {
181 |     "name": "ipython",
182 |     "version": 3
183 |    },
184 |    "file_extension": ".py",
185 |    "mimetype": "text/x-python",
186 |    "name": "python",
187 |    "nbconvert_exporter": "python",
188 |    "pygments_lexer": "ipython3",
189 |    "version": "3.9.6"
190 |   }
191 |  },
192 |  "nbformat": 4,
193 |  "nbformat_minor": 4
194 | }
195 | 


--------------------------------------------------------------------------------
/02-knn/code/iris.csv:
--------------------------------------------------------------------------------
  1 | Id,SepalLength[cm],SepalWidth[cm],PetalLength[cm],PetalWidth[cm],Species
  2 | 1,5.1,3.5,1.4,0.2,Iris-setosa
  3 | 2,4.9,3.0,1.4,0.2,Iris-setosa
  4 | 3,4.7,3.2,1.3,0.2,Iris-setosa
  5 | 4,4.6,3.1,1.5,0.2,Iris-setosa
  6 | 5,5.0,3.6,1.4,0.2,Iris-setosa
  7 | 6,5.4,3.9,1.7,0.4,Iris-setosa
  8 | 7,4.6,3.4,1.4,0.3,Iris-setosa
  9 | 8,5.0,3.4,1.5,0.2,Iris-setosa
 10 | 9,4.4,2.9,1.4,0.2,Iris-setosa
 11 | 10,4.9,3.1,1.5,0.1,Iris-setosa
 12 | 11,5.4,3.7,1.5,0.2,Iris-setosa
 13 | 12,4.8,3.4,1.6,0.2,Iris-setosa
 14 | 13,4.8,3.0,1.4,0.1,Iris-setosa
 15 | 14,4.3,3.0,1.1,0.1,Iris-setosa
 16 | 15,5.8,4.0,1.2,0.2,Iris-setosa
 17 | 16,5.7,4.4,1.5,0.4,Iris-setosa
 18 | 17,5.4,3.9,1.3,0.4,Iris-setosa
 19 | 18,5.1,3.5,1.4,0.3,Iris-setosa
 20 | 19,5.7,3.8,1.7,0.3,Iris-setosa
 21 | 20,5.1,3.8,1.5,0.3,Iris-setosa
 22 | 21,5.4,3.4,1.7,0.2,Iris-setosa
 23 | 22,5.1,3.7,1.5,0.4,Iris-setosa
 24 | 23,4.6,3.6,1.0,0.2,Iris-setosa
 25 | 24,5.1,3.3,1.7,0.5,Iris-setosa
 26 | 25,4.8,3.4,1.9,0.2,Iris-setosa
 27 | 26,5.0,3.0,1.6,0.2,Iris-setosa
 28 | 27,5.0,3.4,1.6,0.4,Iris-setosa
 29 | 28,5.2,3.5,1.5,0.2,Iris-setosa
 30 | 29,5.2,3.4,1.4,0.2,Iris-setosa
 31 | 30,4.7,3.2,1.6,0.2,Iris-setosa
 32 | 31,4.8,3.1,1.6,0.2,Iris-setosa
 33 | 32,5.4,3.4,1.5,0.4,Iris-setosa
 34 | 33,5.2,4.1,1.5,0.1,Iris-setosa
 35 | 34,5.5,4.2,1.4,0.2,Iris-setosa
 36 | 35,4.9,3.1,1.5,0.1,Iris-setosa
 37 | 36,5.0,3.2,1.2,0.2,Iris-setosa
 38 | 37,5.5,3.5,1.3,0.2,Iris-setosa
 39 | 38,4.9,3.1,1.5,0.1,Iris-setosa
 40 | 39,4.4,3.0,1.3,0.2,Iris-setosa
 41 | 40,5.1,3.4,1.5,0.2,Iris-setosa
 42 | 41,5.0,3.5,1.3,0.3,Iris-setosa
 43 | 42,4.5,2.3,1.3,0.3,Iris-setosa
 44 | 43,4.4,3.2,1.3,0.2,Iris-setosa
 45 | 44,5.0,3.5,1.6,0.6,Iris-setosa
 46 | 45,5.1,3.8,1.9,0.4,Iris-setosa
 47 | 46,4.8,3.0,1.4,0.3,Iris-setosa
 48 | 47,5.1,3.8,1.6,0.2,Iris-setosa
 49 | 48,4.6,3.2,1.4,0.2,Iris-setosa
 50 | 49,5.3,3.7,1.5,0.2,Iris-setosa
 51 | 50,5.0,3.3,1.4,0.2,Iris-setosa
 52 | 51,7.0,3.2,4.7,1.4,Iris-versicolor
 53 | 52,6.4,3.2,4.5,1.5,Iris-versicolor
 54 | 53,6.9,3.1,4.9,1.5,Iris-versicolor
 55 | 54,5.5,2.3,4.0,1.3,Iris-versicolor
 56 | 55,6.5,2.8,4.6,1.5,Iris-versicolor
 57 | 56,5.7,2.8,4.5,1.3,Iris-versicolor
 58 | 57,6.3,3.3,4.7,1.6,Iris-versicolor
 59 | 58,4.9,2.4,3.3,1.0,Iris-versicolor
 60 | 59,6.6,2.9,4.6,1.3,Iris-versicolor
 61 | 60,5.2,2.7,3.9,1.4,Iris-versicolor
 62 | 61,5.0,2.0,3.5,1.0,Iris-versicolor
 63 | 62,5.9,3.0,4.2,1.5,Iris-versicolor
 64 | 63,6.0,2.2,4.0,1.0,Iris-versicolor
 65 | 64,6.1,2.9,4.7,1.4,Iris-versicolor
 66 | 65,5.6,2.9,3.6,1.3,Iris-versicolor
 67 | 66,6.7,3.1,4.4,1.4,Iris-versicolor
 68 | 67,5.6,3.0,4.5,1.5,Iris-versicolor
 69 | 68,5.8,2.7,4.1,1.0,Iris-versicolor
 70 | 69,6.2,2.2,4.5,1.5,Iris-versicolor
 71 | 70,5.6,2.5,3.9,1.1,Iris-versicolor
 72 | 71,5.9,3.2,4.8,1.8,Iris-versicolor
 73 | 72,6.1,2.8,4.0,1.3,Iris-versicolor
 74 | 73,6.3,2.5,4.9,1.5,Iris-versicolor
 75 | 74,6.1,2.8,4.7,1.2,Iris-versicolor
 76 | 75,6.4,2.9,4.3,1.3,Iris-versicolor
 77 | 76,6.6,3.0,4.4,1.4,Iris-versicolor
 78 | 77,6.8,2.8,4.8,1.4,Iris-versicolor
 79 | 78,6.7,3.0,5.0,1.7,Iris-versicolor
 80 | 79,6.0,2.9,4.5,1.5,Iris-versicolor
 81 | 80,5.7,2.6,3.5,1.0,Iris-versicolor
 82 | 81,5.5,2.4,3.8,1.1,Iris-versicolor
 83 | 82,5.5,2.4,3.7,1.0,Iris-versicolor
 84 | 83,5.8,2.7,3.9,1.2,Iris-versicolor
 85 | 84,6.0,2.7,5.1,1.6,Iris-versicolor
 86 | 85,5.4,3.0,4.5,1.5,Iris-versicolor
 87 | 86,6.0,3.4,4.5,1.6,Iris-versicolor
 88 | 87,6.7,3.1,4.7,1.5,Iris-versicolor
 89 | 88,6.3,2.3,4.4,1.3,Iris-versicolor
 90 | 89,5.6,3.0,4.1,1.3,Iris-versicolor
 91 | 90,5.5,2.5,4.0,1.3,Iris-versicolor
 92 | 91,5.5,2.6,4.4,1.2,Iris-versicolor
 93 | 92,6.1,3.0,4.6,1.4,Iris-versicolor
 94 | 93,5.8,2.6,4.0,1.2,Iris-versicolor
 95 | 94,5.0,2.3,3.3,1.0,Iris-versicolor
 96 | 95,5.6,2.7,4.2,1.3,Iris-versicolor
 97 | 96,5.7,3.0,4.2,1.2,Iris-versicolor
 98 | 97,5.7,2.9,4.2,1.3,Iris-versicolor
 99 | 98,6.2,2.9,4.3,1.3,Iris-versicolor
100 | 99,5.1,2.5,3.0,1.1,Iris-versicolor
101 | 100,5.7,2.8,4.1,1.3,Iris-versicolor
102 | 101,6.3,3.3,6.0,2.5,Iris-virginica
103 | 102,5.8,2.7,5.1,1.9,Iris-virginica
104 | 103,7.1,3.0,5.9,2.1,Iris-virginica
105 | 104,6.3,2.9,5.6,1.8,Iris-virginica
106 | 105,6.5,3.0,5.8,2.2,Iris-virginica
107 | 106,7.6,3.0,6.6,2.1,Iris-virginica
108 | 107,4.9,2.5,4.5,1.7,Iris-virginica
109 | 108,7.3,2.9,6.3,1.8,Iris-virginica
110 | 109,6.7,2.5,5.8,1.8,Iris-virginica
111 | 110,7.2,3.6,6.1,2.5,Iris-virginica
112 | 111,6.5,3.2,5.1,2.0,Iris-virginica
113 | 112,6.4,2.7,5.3,1.9,Iris-virginica
114 | 113,6.8,3.0,5.5,2.1,Iris-virginica
115 | 114,5.7,2.5,5.0,2.0,Iris-virginica
116 | 115,5.8,2.8,5.1,2.4,Iris-virginica
117 | 116,6.4,3.2,5.3,2.3,Iris-virginica
118 | 117,6.5,3.0,5.5,1.8,Iris-virginica
119 | 118,7.7,3.8,6.7,2.2,Iris-virginica
120 | 119,7.7,2.6,6.9,2.3,Iris-virginica
121 | 120,6.0,2.2,5.0,1.5,Iris-virginica
122 | 121,6.9,3.2,5.7,2.3,Iris-virginica
123 | 122,5.6,2.8,4.9,2.0,Iris-virginica
124 | 123,7.7,2.8,6.7,2.0,Iris-virginica
125 | 124,6.3,2.7,4.9,1.8,Iris-virginica
126 | 125,6.7,3.3,5.7,2.1,Iris-virginica
127 | 126,7.2,3.2,6.0,1.8,Iris-virginica
128 | 127,6.2,2.8,4.8,1.8,Iris-virginica
129 | 128,6.1,3.0,4.9,1.8,Iris-virginica
130 | 129,6.4,2.8,5.6,2.1,Iris-virginica
131 | 130,7.2,3.0,5.8,1.6,Iris-virginica
132 | 131,7.4,2.8,6.1,1.9,Iris-virginica
133 | 132,7.9,3.8,6.4,2.0,Iris-virginica
134 | 133,6.4,2.8,5.6,2.2,Iris-virginica
135 | 134,6.3,2.8,5.1,1.5,Iris-virginica
136 | 135,6.1,2.6,5.6,1.4,Iris-virginica
137 | 136,7.7,3.0,6.1,2.3,Iris-virginica
138 | 137,6.3,3.4,5.6,2.4,Iris-virginica
139 | 138,6.4,3.1,5.5,1.8,Iris-virginica
140 | 139,6.0,3.0,4.8,1.8,Iris-virginica
141 | 140,6.9,3.1,5.4,2.1,Iris-virginica
142 | 141,6.7,3.1,5.6,2.4,Iris-virginica
143 | 142,6.9,3.1,5.1,2.3,Iris-virginica
144 | 143,5.8,2.7,5.1,1.9,Iris-virginica
145 | 144,6.8,3.2,5.9,2.3,Iris-virginica
146 | 145,6.7,3.3,5.7,2.5,Iris-virginica
147 | 146,6.7,3.0,5.2,2.3,Iris-virginica
148 | 147,6.3,2.5,5.0,1.9,Iris-virginica
149 | 148,6.5,3.0,5.2,2.0,Iris-virginica
150 | 149,6.2,3.4,5.4,2.3,Iris-virginica
151 | 150,5.9,3.0,5.1,1.8,Iris-virginica


--------------------------------------------------------------------------------
/05-dataprocessing/code/data/iris.csv:
--------------------------------------------------------------------------------
  1 | Id,SepalLength[cm],SepalWidth[cm],PetalLength[cm],PetalWidth[cm],Species
  2 | 1,5.1,3.5,1.4,0.2,Iris-setosa
  3 | 2,4.9,3.0,1.4,0.2,Iris-setosa
  4 | 3,4.7,3.2,1.3,0.2,Iris-setosa
  5 | 4,4.6,3.1,1.5,0.2,Iris-setosa
  6 | 5,5.0,3.6,1.4,0.2,Iris-setosa
  7 | 6,5.4,3.9,1.7,0.4,Iris-setosa
  8 | 7,4.6,3.4,1.4,0.3,Iris-setosa
  9 | 8,5.0,3.4,1.5,0.2,Iris-setosa
 10 | 9,4.4,2.9,1.4,0.2,Iris-setosa
 11 | 10,4.9,3.1,1.5,0.1,Iris-setosa
 12 | 11,5.4,3.7,1.5,0.2,Iris-setosa
 13 | 12,4.8,3.4,1.6,0.2,Iris-setosa
 14 | 13,4.8,3.0,1.4,0.1,Iris-setosa
 15 | 14,4.3,3.0,1.1,0.1,Iris-setosa
 16 | 15,5.8,4.0,1.2,0.2,Iris-setosa
 17 | 16,5.7,4.4,1.5,0.4,Iris-setosa
 18 | 17,5.4,3.9,1.3,0.4,Iris-setosa
 19 | 18,5.1,3.5,1.4,0.3,Iris-setosa
 20 | 19,5.7,3.8,1.7,0.3,Iris-setosa
 21 | 20,5.1,3.8,1.5,0.3,Iris-setosa
 22 | 21,5.4,3.4,1.7,0.2,Iris-setosa
 23 | 22,5.1,3.7,1.5,0.4,Iris-setosa
 24 | 23,4.6,3.6,1.0,0.2,Iris-setosa
 25 | 24,5.1,3.3,1.7,0.5,Iris-setosa
 26 | 25,4.8,3.4,1.9,0.2,Iris-setosa
 27 | 26,5.0,3.0,1.6,0.2,Iris-setosa
 28 | 27,5.0,3.4,1.6,0.4,Iris-setosa
 29 | 28,5.2,3.5,1.5,0.2,Iris-setosa
 30 | 29,5.2,3.4,1.4,0.2,Iris-setosa
 31 | 30,4.7,3.2,1.6,0.2,Iris-setosa
 32 | 31,4.8,3.1,1.6,0.2,Iris-setosa
 33 | 32,5.4,3.4,1.5,0.4,Iris-setosa
 34 | 33,5.2,4.1,1.5,0.1,Iris-setosa
 35 | 34,5.5,4.2,1.4,0.2,Iris-setosa
 36 | 35,4.9,3.1,1.5,0.1,Iris-setosa
 37 | 36,5.0,3.2,1.2,0.2,Iris-setosa
 38 | 37,5.5,3.5,1.3,0.2,Iris-setosa
 39 | 38,4.9,3.1,1.5,0.1,Iris-setosa
 40 | 39,4.4,3.0,1.3,0.2,Iris-setosa
 41 | 40,5.1,3.4,1.5,0.2,Iris-setosa
 42 | 41,5.0,3.5,1.3,0.3,Iris-setosa
 43 | 42,4.5,2.3,1.3,0.3,Iris-setosa
 44 | 43,4.4,3.2,1.3,0.2,Iris-setosa
 45 | 44,5.0,3.5,1.6,0.6,Iris-setosa
 46 | 45,5.1,3.8,1.9,0.4,Iris-setosa
 47 | 46,4.8,3.0,1.4,0.3,Iris-setosa
 48 | 47,5.1,3.8,1.6,0.2,Iris-setosa
 49 | 48,4.6,3.2,1.4,0.2,Iris-setosa
 50 | 49,5.3,3.7,1.5,0.2,Iris-setosa
 51 | 50,5.0,3.3,1.4,0.2,Iris-setosa
 52 | 51,7.0,3.2,4.7,1.4,Iris-versicolor
 53 | 52,6.4,3.2,4.5,1.5,Iris-versicolor
 54 | 53,6.9,3.1,4.9,1.5,Iris-versicolor
 55 | 54,5.5,2.3,4.0,1.3,Iris-versicolor
 56 | 55,6.5,2.8,4.6,1.5,Iris-versicolor
 57 | 56,5.7,2.8,4.5,1.3,Iris-versicolor
 58 | 57,6.3,3.3,4.7,1.6,Iris-versicolor
 59 | 58,4.9,2.4,3.3,1.0,Iris-versicolor
 60 | 59,6.6,2.9,4.6,1.3,Iris-versicolor
 61 | 60,5.2,2.7,3.9,1.4,Iris-versicolor
 62 | 61,5.0,2.0,3.5,1.0,Iris-versicolor
 63 | 62,5.9,3.0,4.2,1.5,Iris-versicolor
 64 | 63,6.0,2.2,4.0,1.0,Iris-versicolor
 65 | 64,6.1,2.9,4.7,1.4,Iris-versicolor
 66 | 65,5.6,2.9,3.6,1.3,Iris-versicolor
 67 | 66,6.7,3.1,4.4,1.4,Iris-versicolor
 68 | 67,5.6,3.0,4.5,1.5,Iris-versicolor
 69 | 68,5.8,2.7,4.1,1.0,Iris-versicolor
 70 | 69,6.2,2.2,4.5,1.5,Iris-versicolor
 71 | 70,5.6,2.5,3.9,1.1,Iris-versicolor
 72 | 71,5.9,3.2,4.8,1.8,Iris-versicolor
 73 | 72,6.1,2.8,4.0,1.3,Iris-versicolor
 74 | 73,6.3,2.5,4.9,1.5,Iris-versicolor
 75 | 74,6.1,2.8,4.7,1.2,Iris-versicolor
 76 | 75,6.4,2.9,4.3,1.3,Iris-versicolor
 77 | 76,6.6,3.0,4.4,1.4,Iris-versicolor
 78 | 77,6.8,2.8,4.8,1.4,Iris-versicolor
 79 | 78,6.7,3.0,5.0,1.7,Iris-versicolor
 80 | 79,6.0,2.9,4.5,1.5,Iris-versicolor
 81 | 80,5.7,2.6,3.5,1.0,Iris-versicolor
 82 | 81,5.5,2.4,3.8,1.1,Iris-versicolor
 83 | 82,5.5,2.4,3.7,1.0,Iris-versicolor
 84 | 83,5.8,2.7,3.9,1.2,Iris-versicolor
 85 | 84,6.0,2.7,5.1,1.6,Iris-versicolor
 86 | 85,5.4,3.0,4.5,1.5,Iris-versicolor
 87 | 86,6.0,3.4,4.5,1.6,Iris-versicolor
 88 | 87,6.7,3.1,4.7,1.5,Iris-versicolor
 89 | 88,6.3,2.3,4.4,1.3,Iris-versicolor
 90 | 89,5.6,3.0,4.1,1.3,Iris-versicolor
 91 | 90,5.5,2.5,4.0,1.3,Iris-versicolor
 92 | 91,5.5,2.6,4.4,1.2,Iris-versicolor
 93 | 92,6.1,3.0,4.6,1.4,Iris-versicolor
 94 | 93,5.8,2.6,4.0,1.2,Iris-versicolor
 95 | 94,5.0,2.3,3.3,1.0,Iris-versicolor
 96 | 95,5.6,2.7,4.2,1.3,Iris-versicolor
 97 | 96,5.7,3.0,4.2,1.2,Iris-versicolor
 98 | 97,5.7,2.9,4.2,1.3,Iris-versicolor
 99 | 98,6.2,2.9,4.3,1.3,Iris-versicolor
100 | 99,5.1,2.5,3.0,1.1,Iris-versicolor
101 | 100,5.7,2.8,4.1,1.3,Iris-versicolor
102 | 101,6.3,3.3,6.0,2.5,Iris-virginica
103 | 102,5.8,2.7,5.1,1.9,Iris-virginica
104 | 103,7.1,3.0,5.9,2.1,Iris-virginica
105 | 104,6.3,2.9,5.6,1.8,Iris-virginica
106 | 105,6.5,3.0,5.8,2.2,Iris-virginica
107 | 106,7.6,3.0,6.6,2.1,Iris-virginica
108 | 107,4.9,2.5,4.5,1.7,Iris-virginica
109 | 108,7.3,2.9,6.3,1.8,Iris-virginica
110 | 109,6.7,2.5,5.8,1.8,Iris-virginica
111 | 110,7.2,3.6,6.1,2.5,Iris-virginica
112 | 111,6.5,3.2,5.1,2.0,Iris-virginica
113 | 112,6.4,2.7,5.3,1.9,Iris-virginica
114 | 113,6.8,3.0,5.5,2.1,Iris-virginica
115 | 114,5.7,2.5,5.0,2.0,Iris-virginica
116 | 115,5.8,2.8,5.1,2.4,Iris-virginica
117 | 116,6.4,3.2,5.3,2.3,Iris-virginica
118 | 117,6.5,3.0,5.5,1.8,Iris-virginica
119 | 118,7.7,3.8,6.7,2.2,Iris-virginica
120 | 119,7.7,2.6,6.9,2.3,Iris-virginica
121 | 120,6.0,2.2,5.0,1.5,Iris-virginica
122 | 121,6.9,3.2,5.7,2.3,Iris-virginica
123 | 122,5.6,2.8,4.9,2.0,Iris-virginica
124 | 123,7.7,2.8,6.7,2.0,Iris-virginica
125 | 124,6.3,2.7,4.9,1.8,Iris-virginica
126 | 125,6.7,3.3,5.7,2.1,Iris-virginica
127 | 126,7.2,3.2,6.0,1.8,Iris-virginica
128 | 127,6.2,2.8,4.8,1.8,Iris-virginica
129 | 128,6.1,3.0,4.9,1.8,Iris-virginica
130 | 129,6.4,2.8,5.6,2.1,Iris-virginica
131 | 130,7.2,3.0,5.8,1.6,Iris-virginica
132 | 131,7.4,2.8,6.1,1.9,Iris-virginica
133 | 132,7.9,3.8,6.4,2.0,Iris-virginica
134 | 133,6.4,2.8,5.6,2.2,Iris-virginica
135 | 134,6.3,2.8,5.1,1.5,Iris-virginica
136 | 135,6.1,2.6,5.6,1.4,Iris-virginica
137 | 136,7.7,3.0,6.1,2.3,Iris-virginica
138 | 137,6.3,3.4,5.6,2.4,Iris-virginica
139 | 138,6.4,3.1,5.5,1.8,Iris-virginica
140 | 139,6.0,3.0,4.8,1.8,Iris-virginica
141 | 140,6.9,3.1,5.4,2.1,Iris-virginica
142 | 141,6.7,3.1,5.6,2.4,Iris-virginica
143 | 142,6.9,3.1,5.1,2.3,Iris-virginica
144 | 143,5.8,2.7,5.1,1.9,Iris-virginica
145 | 144,6.8,3.2,5.9,2.3,Iris-virginica
146 | 145,6.7,3.3,5.7,2.5,Iris-virginica
147 | 146,6.7,3.0,5.2,2.3,Iris-virginica
148 | 147,6.3,2.5,5.0,1.9,Iris-virginica
149 | 148,6.5,3.0,5.2,2.0,Iris-virginica
150 | 149,6.2,3.4,5.4,2.3,Iris-virginica
151 | 150,5.9,3.0,5.1,1.8,Iris-virginica


--------------------------------------------------------------------------------
/hyperparameter-tuning-examples/05.1-successive-halving-decisiontree.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "39329df3-1f99-4b11-9405-5969d52368a7",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Decision Tree & Successive Halving Random + Search Example"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": 1,
 14 |    "id": "7f61a90e-a119-4bd0-af21-38604c5b4eec",
 15 |    "metadata": {},
 16 |    "outputs": [
 17 |     {
 18 |      "name": "stdout",
 19 |      "output_type": "stream",
 20 |      "text": [
 21 |       "scikit-learn: 1.0\n",
 22 |       "mlxtend     : 0.19.0\n",
 23 |       "\n"
 24 |      ]
 25 |     }
 26 |    ],
 27 |    "source": [
 28 |     "%load_ext watermark\n",
 29 |     "%watermark -p scikit-learn,mlxtend"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "markdown",
 34 |    "id": "1f0489c2-dd9c-4e71-a78c-e01201762b37",
 35 |    "metadata": {},
 36 |    "source": [
 37 |     "## Dataset"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 2,
 43 |    "id": "271b17ff-5ea4-4161-8b7f-20ba8131d666",
 44 |    "metadata": {},
 45 |    "outputs": [
 46 |     {
 47 |      "name": "stdout",
 48 |      "output_type": "stream",
 49 |      "text": [
 50 |       "Train/Valid/Test sizes: 398 80 171\n"
 51 |      ]
 52 |     }
 53 |    ],
 54 |    "source": [
 55 |     "from sklearn import model_selection\n",
 56 |     "from sklearn.model_selection import train_test_split\n",
 57 |     "from sklearn import datasets\n",
 58 |     "\n",
 59 |     "\n",
 60 |     "data = datasets.load_breast_cancer()\n",
 61 |     "X, y = data.data, data.target\n",
 62 |     "\n",
 63 |     "X_train, X_test, y_train, y_test = \\\n",
 64 |     "    train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)\n",
 65 |     "\n",
 66 |     "X_train_sub, X_valid, y_train_sub, y_valid = \\\n",
 67 |     "    train_test_split(X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)\n",
 68 |     "\n",
 69 |     "print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "id": "0c922b01-86f0-4e83-9e36-446f99f6fe1b",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "## Successive Halving + Random Search"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "markdown",
 82 |    "id": "72e56f33-ec33-46dd-afa2-a1b3c8b3da0b",
 83 |    "metadata": {},
 84 |    "source": [
 85 |     "\n",
 86 |     "- More info: \n",
 87 |     "  - https://scikit-learn.org/stable/modules/grid_search.html#successive-halving-user-guide\n",
 88 |     "  - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.HalvingRandomSearchCV.html#sklearn.model_selection.HalvingRandomSearchCV"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": 3,
 94 |    "id": "96f0b4c1-803a-436f-93d5-31baab55faa5",
 95 |    "metadata": {},
 96 |    "outputs": [
 97 |     {
 98 |      "data": {
 99 |       "text/plain": [
100 |        "0.8882539682539681"
101 |       ]
102 |      },
103 |      "execution_count": 3,
104 |      "metadata": {},
105 |      "output_type": "execute_result"
106 |     }
107 |    ],
108 |    "source": [
109 |     "import numpy as np\n",
110 |     "import scipy.stats\n",
111 |     "\n",
112 |     "from sklearn.experimental import enable_halving_search_cv\n",
113 |     "from sklearn.model_selection import HalvingRandomSearchCV\n",
114 |     "\n",
115 |     "from sklearn.tree import DecisionTreeClassifier\n",
116 |     "\n",
117 |     "\n",
118 |     "clf = DecisionTreeClassifier(random_state=123)\n",
119 |     "\n",
120 |     "params =  {\n",
121 |     "    'min_samples_split': scipy.stats.randint(2, 12),\n",
122 |     "    'min_impurity_decrease': scipy.stats.uniform(0.0, 0.5),\n",
123 |     "    'max_depth': [6, 16, None]\n",
124 |     "}\n",
125 |     "\n",
126 |     "\n",
127 |     "search = HalvingRandomSearchCV(\n",
128 |     "    estimator=clf, \n",
129 |     "    param_distributions=params,\n",
130 |     "    n_candidates='exhaust',\n",
131 |     "    resource='n_samples',\n",
132 |     "    factor=3,\n",
133 |     "    random_state=123,\n",
134 |     "    n_jobs=1)\n",
135 |     "\n",
136 |     "\n",
137 |     "search.fit(X_train, y_train)\n",
138 |     "\n",
139 |     "search.best_score_"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": 4,
145 |    "id": "2c26399d-ebfc-4b06-86d9-36e49711e908",
146 |    "metadata": {},
147 |    "outputs": [
148 |     {
149 |      "data": {
150 |       "text/plain": [
151 |        "{'max_depth': None,\n",
152 |        " 'min_impurity_decrease': 0.029838948304784174,\n",
153 |        " 'min_samples_split': 2}"
154 |       ]
155 |      },
156 |      "execution_count": 4,
157 |      "metadata": {},
158 |      "output_type": "execute_result"
159 |     }
160 |    ],
161 |    "source": [
162 |     "search.best_params_"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": 5,
168 |    "id": "763e816b-6437-45a9-812f-8b429472d75e",
169 |    "metadata": {},
170 |    "outputs": [
171 |     {
172 |      "name": "stdout",
173 |      "output_type": "stream",
174 |      "text": [
175 |       "Training Accuracy: 0.95\n",
176 |       "Test Accuracy: 0.94\n"
177 |      ]
178 |     }
179 |    ],
180 |    "source": [
181 |     "print(f\"Training Accuracy: {search.best_estimator_.score(X_train, y_train):0.2f}\")\n",
182 |     "print(f\"Test Accuracy: {search.best_estimator_.score(X_test, y_test):0.2f}\")"
183 |    ]
184 |   }
185 |  ],
186 |  "metadata": {
187 |   "kernelspec": {
188 |    "display_name": "Python 3 (ipykernel)",
189 |    "language": "python",
190 |    "name": "python3"
191 |   },
192 |   "language_info": {
193 |    "codemirror_mode": {
194 |     "name": "ipython",
195 |     "version": 3
196 |    },
197 |    "file_extension": ".py",
198 |    "mimetype": "text/x-python",
199 |    "name": "python",
200 |    "nbconvert_exporter": "python",
201 |    "pygments_lexer": "ipython3",
202 |    "version": "3.9.6"
203 |   }
204 |  },
205 |  "nbformat": 4,
206 |  "nbformat_minor": 5
207 | }
208 | 


--------------------------------------------------------------------------------
/05-dataprocessing/code/data/iris_mod.csv:
--------------------------------------------------------------------------------
  1 | Id,SepalLength[cm],SepalWidth[cm],PetalLength[cm],PetalWidth[cm],Color_IMadeThisUp,Species
  2 | 1,5.1,3.5,1.4,0.2,red,Iris-setosa
  3 | 2,4.9,3,1.4,0.2,red,Iris-setosa
  4 | 3,4.7,3.2,1.3,0.2,red,Iris-setosa
  5 | 4,4.6,3.1,1.5,0.2,red,Iris-setosa
  6 | 5,5,3.6,1.4,0.2,red,Iris-setosa
  7 | 6,5.4,3.9,1.7,0.4,red,Iris-setosa
  8 | 7,4.6,3.4,1.4,0.3,red,Iris-setosa
  9 | 8,5,3.4,1.5,0.2,blue,Iris-setosa
 10 | 9,4.4,2.9,1.4,0.2,red,Iris-setosa
 11 | 10,4.9,3.1,1.5,0.1,red,Iris-setosa
 12 | 11,5.4,3.7,1.5,0.2,blue,Iris-setosa
 13 | 12,4.8,3.4,1.6,0.2,red,Iris-setosa
 14 | 13,4.8,3,1.4,0.1,red,Iris-setosa
 15 | 14,4.3,3,1.1,0.1,red,Iris-setosa
 16 | 15,5.8,4,1.2,0.2,red,Iris-setosa
 17 | 16,5.7,4.4,1.5,0.4,red,Iris-setosa
 18 | 17,5.4,3.9,1.3,0.4,red,Iris-setosa
 19 | 18,5.1,3.5,1.4,0.3,red,Iris-setosa
 20 | 19,5.7,3.8,1.7,0.3,red,Iris-setosa
 21 | 20,5.1,3.8,1.5,0.3,blue,Iris-setosa
 22 | 21,5.4,3.4,1.7,0.2,red,Iris-setosa
 23 | 22,5.1,3.7,1.5,0.4,red,Iris-setosa
 24 | 23,4.6,3.6,1,0.2,red,Iris-setosa
 25 | 24,5.1,3.3,1.7,0.5,blue,Iris-setosa
 26 | 25,4.8,3.4,1.9,0.2,red,Iris-setosa
 27 | 26,5,3,1.6,0.2,red,Iris-setosa
 28 | 27,5,3.4,1.6,0.4,red,Iris-setosa
 29 | 28,5.2,3.5,1.5,0.2,red,Iris-setosa
 30 | 29,5.2,3.4,1.4,0.2,red,Iris-setosa
 31 | 30,4.7,3.2,1.6,0.2,violet,Iris-setosa
 32 | 31,4.8,3.1,1.6,0.2,red,Iris-setosa
 33 | 32,5.4,3.4,1.5,0.4,red,Iris-setosa
 34 | 33,5.2,4.1,1.5,0.1,red,Iris-setosa
 35 | 34,5.5,4.2,1.4,0.2,red,Iris-setosa
 36 | 35,4.9,3.1,1.5,0.1,red,Iris-setosa
 37 | 36,5,3.2,1.2,0.2,violet,Iris-setosa
 38 | 37,5.5,3.5,1.3,0.2,red,Iris-setosa
 39 | 38,4.9,3.1,1.5,0.1,red,Iris-setosa
 40 | 39,4.4,3,1.3,0.2,red,Iris-setosa
 41 | 40,5.1,3.4,1.5,0.2,red,Iris-setosa
 42 | 41,5,3.5,1.3,0.3,red,Iris-setosa
 43 | 42,4.5,2.3,1.3,0.3,red,Iris-setosa
 44 | 43,4.4,3.2,1.3,0.2,red,Iris-setosa
 45 | 44,5,3.5,1.6,0.6,red,Iris-setosa
 46 | 45,5.1,3.8,1.9,0.4,red,Iris-setosa
 47 | 46,4.8,3,1.4,0.3,red,Iris-setosa
 48 | 47,5.1,3.8,1.6,0.2,red,Iris-setosa
 49 | 48,4.6,3.2,1.4,0.2,red,Iris-setosa
 50 | 49,5.3,3.7,1.5,0.2,red,Iris-setosa
 51 | 50,5,3.3,1.4,0.2,red,Iris-setosa
 52 | 51,7,3.2,4.7,1.4,blue,Iris-versicolor
 53 | 52,6.4,3.2,4.5,1.5,blue,Iris-versicolor
 54 | 53,6.9,3.1,4.9,1.5,blue,Iris-versicolor
 55 | 54,5.5,2.3,4,1.3,blue,Iris-versicolor
 56 | 55,6.5,2.8,4.6,1.5,blue,Iris-versicolor
 57 | 56,5.7,2.8,4.5,1.3,blue,Iris-versicolor
 58 | 57,6.3,3.3,4.7,1.6,blue,Iris-versicolor
 59 | 58,4.9,2.4,3.3,1,blue,Iris-versicolor
 60 | 59,6.6,2.9,4.6,1.3,blue,Iris-versicolor
 61 | 60,5.2,2.7,3.9,1.4,blue,Iris-versicolor
 62 | 61,5,2,3.5,1,blue,Iris-versicolor
 63 | 62,5.9,3,4.2,1.5,blue,Iris-versicolor
 64 | 63,6,2.2,4,1,blue,Iris-versicolor
 65 | 64,6.1,2.9,4.7,1.4,blue,Iris-versicolor
 66 | 65,5.6,2.9,3.6,1.3,blue,Iris-versicolor
 67 | 66,6.7,3.1,4.4,1.4,red,Iris-versicolor
 68 | 67,5.6,3,4.5,1.5,blue,Iris-versicolor
 69 | 68,5.8,2.7,4.1,1,blue,Iris-versicolor
 70 | 69,6.2,2.2,4.5,1.5,blue,Iris-versicolor
 71 | 70,5.6,2.5,3.9,1.1,violet,Iris-versicolor
 72 | 71,5.9,3.2,4.8,1.8,blue,Iris-versicolor
 73 | 72,6.1,2.8,4,1.3,blue,Iris-versicolor
 74 | 73,6.3,2.5,4.9,1.5,blue,Iris-versicolor
 75 | 74,6.1,2.8,4.7,1.2,blue,Iris-versicolor
 76 | 75,6.4,2.9,4.3,1.3,blue,Iris-versicolor
 77 | 76,6.6,3,4.4,1.4,blue,Iris-versicolor
 78 | 77,6.8,2.8,4.8,1.4,blue,Iris-versicolor
 79 | 78,6.7,3,5,1.7,blue,Iris-versicolor
 80 | 79,6,2.9,4.5,1.5,blue,Iris-versicolor
 81 | 80,5.7,2.6,3.5,1,violet,Iris-versicolor
 82 | 81,5.5,2.4,3.8,1.1,blue,Iris-versicolor
 83 | 82,5.5,2.4,3.7,1,red,Iris-versicolor
 84 | 83,5.8,2.7,3.9,1.2,blue,Iris-versicolor
 85 | 84,6,2.7,5.1,1.6,blue,Iris-versicolor
 86 | 85,5.4,3,4.5,1.5,blue,Iris-versicolor
 87 | 86,6,3.4,4.5,1.6,blue,Iris-versicolor
 88 | 87,6.7,3.1,4.7,1.5,blue,Iris-versicolor
 89 | 88,6.3,2.3,4.4,1.3,violet,Iris-versicolor
 90 | 89,5.6,3,4.1,1.3,blue,Iris-versicolor
 91 | 90,5.5,2.5,4,1.3,blue,Iris-versicolor
 92 | 91,5.5,2.6,4.4,1.2,blue,Iris-versicolor
 93 | 92,6.1,3,4.6,1.4,blue,Iris-versicolor
 94 | 93,5.8,2.6,4,1.2,violet,Iris-versicolor
 95 | 94,5,2.3,3.3,1,blue,Iris-versicolor
 96 | 95,5.6,2.7,4.2,1.3,violet,Iris-versicolor
 97 | 96,5.7,3,4.2,1.2,blue,Iris-versicolor
 98 | 97,5.7,2.9,4.2,1.3,blue,Iris-versicolor
 99 | 98,6.2,2.9,4.3,1.3,blue,Iris-versicolor
100 | 99,5.1,2.5,3,1.1,blue,Iris-versicolor
101 | 100,5.7,2.8,4.1,1.3,blue,Iris-versicolor
102 | 101,6.3,3.3,6,2.5,violet,Iris-virginica
103 | 102,5.8,2.7,5.1,1.9,violet,Iris-virginica
104 | 103,7.1,3,5.9,2.1,violet,Iris-virginica
105 | 104,6.3,2.9,5.6,1.8,violet,Iris-virginica
106 | 105,6.5,3,5.8,2.2,violet,Iris-virginica
107 | 106,7.6,3,6.6,2.1,violet,Iris-virginica
108 | 107,4.9,2.5,4.5,1.7,violet,Iris-virginica
109 | 108,7.3,2.9,6.3,1.8,violet,Iris-virginica
110 | 109,6.7,2.5,5.8,1.8,violet,Iris-virginica
111 | 110,7.2,3.6,6.1,2.5,violet,Iris-virginica
112 | 111,6.5,3.2,5.1,2,violet,Iris-virginica
113 | 112,6.4,2.7,5.3,1.9,violet,Iris-virginica
114 | 113,6.8,3,5.5,2.1,violet,Iris-virginica
115 | 114,5.7,2.5,5,2,violet,Iris-virginica
116 | 115,5.8,2.8,5.1,2.4,violet,Iris-virginica
117 | 116,6.4,3.2,5.3,2.3,violet,Iris-virginica
118 | 117,6.5,3,5.5,1.8,violet,Iris-virginica
119 | 118,7.7,3.8,6.7,2.2,violet,Iris-virginica
120 | 119,7.7,2.6,6.9,2.3,violet,Iris-virginica
121 | 120,6,2.2,5,1.5,violet,Iris-virginica
122 | 121,6.9,3.2,5.7,2.3,blue,Iris-virginica
123 | 122,5.6,2.8,4.9,2,violet,Iris-virginica
124 | 123,7.7,2.8,6.7,2,violet,Iris-virginica
125 | 124,6.3,2.7,4.9,1.8,violet,Iris-virginica
126 | 125,6.7,3.3,5.7,2.1,blue,Iris-virginica
127 | 126,7.2,3.2,6,1.8,violet,Iris-virginica
128 | 127,6.2,2.8,4.8,1.8,violet,Iris-virginica
129 | 128,6.1,3,4.9,1.8,violet,Iris-virginica
130 | 129,6.4,2.8,5.6,2.1,blue,Iris-virginica
131 | 130,7.2,3,5.8,1.6,violet,Iris-virginica
132 | 131,7.4,2.8,6.1,1.9,violet,Iris-virginica
133 | 132,7.9,3.8,6.4,2,violet,Iris-virginica
134 | 133,6.4,2.8,5.6,2.2,violet,Iris-virginica
135 | 134,6.3,2.8,5.1,1.5,red,Iris-virginica
136 | 135,6.1,2.6,5.6,1.4,violet,Iris-virginica
137 | 136,7.7,3,6.1,2.3,violet,Iris-virginica
138 | 137,6.3,3.4,5.6,2.4,violet,Iris-virginica
139 | 138,6.4,3.1,5.5,1.8,violet,Iris-virginica
140 | 139,6,3,4.8,1.8,blue,Iris-virginica
141 | 140,6.9,3.1,5.4,2.1,violet,Iris-virginica
142 | 141,6.7,3.1,5.6,2.4,violet,Iris-virginica
143 | 142,6.9,3.1,5.1,2.3,violet,Iris-virginica
144 | 143,5.8,2.7,5.1,1.9,violet,Iris-virginica
145 | 144,6.8,3.2,5.9,2.3,violet,Iris-virginica
146 | 145,6.7,3.3,5.7,2.5,violet,Iris-virginica
147 | 146,6.7,3,5.2,2.3,violet,Iris-virginica
148 | 147,6.3,2.5,5,1.9,violet,Iris-virginica
149 | 148,6.5,3,5.2,2,blue,Iris-virginica
150 | 149,6.2,3.4,5.4,2.3,violet,Iris-virginica
151 | 150,5.9,3,5.1,1.8,red,Iris-virginica


--------------------------------------------------------------------------------
/hw02-starter/hw-baseline.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "9f0cafc0-1993-411c-9247-d75ac091280b",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Performance Baselines"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": 1,
 14 |    "id": "67ce4a31-1781-4f46-a7ce-e2b1f6cedc8b",
 15 |    "metadata": {},
 16 |    "outputs": [
 17 |     {
 18 |      "name": "stdout",
 19 |      "output_type": "stream",
 20 |      "text": [
 21 |       "scikit-learn: 1.0\n",
 22 |       "mlxtend     : 0.19.0\n",
 23 |       "xgboost     : 1.5.0\n",
 24 |       "\n"
 25 |      ]
 26 |     }
 27 |    ],
 28 |    "source": [
 29 |     "%load_ext watermark\n",
 30 |     "%watermark -p scikit-learn,mlxtend,xgboost"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "markdown",
 35 |    "id": "381c55c2-1ec2-43fc-8c66-4a2acbc4b857",
 36 |    "metadata": {},
 37 |    "source": [
 38 |     "## Dataset"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "markdown",
 43 |    "id": "38f4522c-3671-4ce5-acff-bd29143e5392",
 44 |    "metadata": {},
 45 |    "source": [
 46 |     "Source: https://archive.ics.uci.edu/ml/datasets/Dry+Bean+Dataset"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 2,
 52 |    "id": "bf2e92fa-1bf4-4435-a1f3-4e9613ec83d0",
 53 |    "metadata": {},
 54 |    "outputs": [
 55 |     {
 56 |      "name": "stdout",
 57 |      "output_type": "stream",
 58 |      "text": [
 59 |       "X_train.shape: (9119, 16)\n",
 60 |       "y_train.shape: (9119,)\n",
 61 |       "X_test.shape: (4492, 16)\n",
 62 |       "y_test.shape: (4492,)\n"
 63 |      ]
 64 |     }
 65 |    ],
 66 |    "source": [
 67 |     "import pandas as pd\n",
 68 |     "\n",
 69 |     "\n",
 70 |     "X_train = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/X_train.csv', header=None).values\n",
 71 |     "y_train = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/y_train.csv', header=None).values.ravel().astype(int)\n",
 72 |     "\n",
 73 |     "X_test = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/X_test.csv', header=None).values\n",
 74 |     "y_test = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/y_test.csv', header=None).values.ravel().astype(int)\n",
 75 |     "\n",
 76 |     "print('X_train.shape:', X_train.shape)\n",
 77 |     "print('y_train.shape:', y_train.shape)\n",
 78 |     "print('X_test.shape:', X_test.shape)\n",
 79 |     "print('y_test.shape:', y_test.shape)"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": 3,
 85 |    "id": "dfd1ee6f-7163-48ba-ba58-542191985c84",
 86 |    "metadata": {},
 87 |    "outputs": [
 88 |     {
 89 |      "name": "stdout",
 90 |      "output_type": "stream",
 91 |      "text": [
 92 |       "Train/Valid/Test sizes: 9119 1824 4492\n"
 93 |      ]
 94 |     }
 95 |    ],
 96 |    "source": [
 97 |     "from sklearn.model_selection import train_test_split\n",
 98 |     "\n",
 99 |     "\n",
100 |     "X_train_sub, X_valid, y_train_sub, y_valid = \\\n",
101 |     "    train_test_split(X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)\n",
102 |     "\n",
103 |     "print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "markdown",
108 |    "id": "3285747b-edfb-4381-9b90-8212a04f6d85",
109 |    "metadata": {},
110 |    "source": [
111 |     "## Baselines"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "id": "1224925f-c4a2-4e28-9693-ff49c43ad694",
117 |    "metadata": {},
118 |    "source": [
119 |     "Compare hyperparameter settings on validation set:"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": 4,
125 |    "id": "2a49887e-6bd5-4d1f-bb5f-e833613c0f2f",
126 |    "metadata": {},
127 |    "outputs": [
128 |     {
129 |      "name": "stdout",
130 |      "output_type": "stream",
131 |      "text": [
132 |       "Train Accuracy: 79.657%\n",
133 |       "Valid Accuracy: 71.162%\n"
134 |      ]
135 |     }
136 |    ],
137 |    "source": [
138 |     "from sklearn.neighbors import KNeighborsClassifier\n",
139 |     "\n",
140 |     "\n",
141 |     "knn = KNeighborsClassifier(n_neighbors=5)\n",
142 |     "knn.fit(X_train_sub, y_train_sub)\n",
143 |     "print(f\"Train Accuracy: {knn.score(X_train_sub, y_train_sub)*100:0.3f}%\")\n",
144 |     "print(f\"Valid Accuracy: {knn.score(X_valid, y_valid)*100:0.3f}%\")"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": 5,
150 |    "id": "2c2c6e00-15cf-4b39-80db-2ddeb46409ad",
151 |    "metadata": {},
152 |    "outputs": [
153 |     {
154 |      "name": "stdout",
155 |      "output_type": "stream",
156 |      "text": [
157 |       "Train Accuracy: 84.003%\n",
158 |       "Valid Accuracy: 71.930%\n"
159 |      ]
160 |     }
161 |    ],
162 |    "source": [
163 |     "knn = KNeighborsClassifier(n_neighbors=3)\n",
164 |     "knn.fit(X_train_sub, y_train_sub)\n",
165 |     "print(f\"Train Accuracy: {knn.score(X_train_sub, y_train_sub)*100:0.3f}%\")\n",
166 |     "print(f\"Valid Accuracy: {knn.score(X_valid, y_valid)*100:0.3f}%\")"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": 6,
172 |    "id": "66df75b8-6358-480b-8b0e-914259a27aea",
173 |    "metadata": {},
174 |    "outputs": [
175 |     {
176 |      "name": "stdout",
177 |      "output_type": "stream",
178 |      "text": [
179 |       "Train Accuracy: 77.478%\n",
180 |       "Valid Accuracy: 69.518%\n"
181 |      ]
182 |     }
183 |    ],
184 |    "source": [
185 |     "knn = KNeighborsClassifier(n_neighbors=7)\n",
186 |     "knn.fit(X_train_sub, y_train_sub)\n",
187 |     "print(f\"Train Accuracy: {knn.score(X_train_sub, y_train_sub)*100:0.3f}%\")\n",
188 |     "print(f\"Valid Accuracy: {knn.score(X_valid, y_valid)*100:0.3f}%\")"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "markdown",
193 |    "id": "9e13b5e5-18fe-4acc-8b60-6c6be54ac460",
194 |    "metadata": {},
195 |    "source": [
196 |     "Choose best model and train on whole training set:"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": 7,
202 |    "id": "abc7a3c0-33f8-414c-9572-875bc657c919",
203 |    "metadata": {},
204 |    "outputs": [
205 |     {
206 |      "name": "stdout",
207 |      "output_type": "stream",
208 |      "text": [
209 |       "Train Accuracy: 84.965%\n",
210 |       "Test Accuracy: 71.305%\n"
211 |      ]
212 |     }
213 |    ],
214 |    "source": [
215 |     "model = KNeighborsClassifier(n_neighbors=3)\n",
216 |     "model.fit(X_train, y_train)\n",
217 |     "print(f\"Train Accuracy: {model.score(X_train, y_train)*100:0.3f}%\")\n",
218 |     "print(f\"Test Accuracy: {model.score(X_test, y_test)*100:0.3f}%\")"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "code",
223 |    "execution_count": null,
224 |    "id": "ece3d175-a886-4738-b299-175b326b1d54",
225 |    "metadata": {},
226 |    "outputs": [],
227 |    "source": []
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": null,
232 |    "id": "e142cdbb-2eca-4e73-bfd8-674241ce539e",
233 |    "metadata": {},
234 |    "outputs": [],
235 |    "source": []
236 |   }
237 |  ],
238 |  "metadata": {
239 |   "kernelspec": {
240 |    "display_name": "Python 3 (ipykernel)",
241 |    "language": "python",
242 |    "name": "python3"
243 |   },
244 |   "language_info": {
245 |    "codemirror_mode": {
246 |     "name": "ipython",
247 |     "version": 3
248 |    },
249 |    "file_extension": ".py",
250 |    "mimetype": "text/x-python",
251 |    "name": "python",
252 |    "nbconvert_exporter": "python",
253 |    "pygments_lexer": "ipython3",
254 |    "version": "3.9.6"
255 |   }
256 |  },
257 |  "nbformat": 4,
258 |  "nbformat_minor": 5
259 | }
260 | 


--------------------------------------------------------------------------------
/hyperparameter-tuning-examples/06.1-genetic-opt.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "39329df3-1f99-4b11-9405-5969d52368a7",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Genetic Programming-Based Hyperparameter Optimization of a Decision Tree"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "66496ba0-a0c7-4819-9c6d-13daf80c8c9c",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "This notebook shows how to use [`sklearn-genetic-opt`](https://sklearn-genetic-opt.readthedocs.io/en/stable/) for hyperparameter optimization based on genetic algorithms (evolutionary programming). If you are interested in understanding how it works, `sklearn-genetic-opt` is using [DEAP](https://deap.readthedocs.io/) under the hood. \n"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 1,
 22 |    "id": "7f61a90e-a119-4bd0-af21-38604c5b4eec",
 23 |    "metadata": {},
 24 |    "outputs": [
 25 |     {
 26 |      "name": "stdout",
 27 |      "output_type": "stream",
 28 |      "text": [
 29 |       "scikit-learn   : 1.0\n",
 30 |       "sklearn        : 1.0\n",
 31 |       "deap           : 1.3.1\n",
 32 |       "sklearn_genetic: 0.7.0\n",
 33 |       "\n"
 34 |      ]
 35 |     }
 36 |    ],
 37 |    "source": [
 38 |     "%load_ext watermark\n",
 39 |     "%watermark -p scikit-learn,sklearn,deap,sklearn_genetic"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "markdown",
 44 |    "id": "1f0489c2-dd9c-4e71-a78c-e01201762b37",
 45 |    "metadata": {},
 46 |    "source": [
 47 |     "## Dataset"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 2,
 53 |    "id": "271b17ff-5ea4-4161-8b7f-20ba8131d666",
 54 |    "metadata": {},
 55 |    "outputs": [
 56 |     {
 57 |      "name": "stdout",
 58 |      "output_type": "stream",
 59 |      "text": [
 60 |       "Train/Valid/Test sizes: 398 80 171\n"
 61 |      ]
 62 |     }
 63 |    ],
 64 |    "source": [
 65 |     "from sklearn import model_selection\n",
 66 |     "from sklearn.model_selection import train_test_split\n",
 67 |     "from sklearn import datasets\n",
 68 |     "\n",
 69 |     "\n",
 70 |     "data = datasets.load_breast_cancer()\n",
 71 |     "X, y = data.data, data.target\n",
 72 |     "\n",
 73 |     "X_train, X_test, y_train, y_test = \\\n",
 74 |     "    train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)\n",
 75 |     "\n",
 76 |     "X_train_sub, X_valid, y_train_sub, y_valid = \\\n",
 77 |     "    train_test_split(X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)\n",
 78 |     "\n",
 79 |     "print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "markdown",
 84 |    "id": "0c922b01-86f0-4e83-9e36-446f99f6fe1b",
 85 |    "metadata": {},
 86 |    "source": [
 87 |     "## sklearn-genetic-opt"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "markdown",
 92 |    "id": "72e56f33-ec33-46dd-afa2-a1b3c8b3da0b",
 93 |    "metadata": {},
 94 |    "source": [
 95 |     "- Install:  `pip install sklearn-genetic-opt[all]`\n",
 96 |     "\n",
 97 |     "- More info: https://sklearn-genetic-opt.readthedocs.io/en/stable/#"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": 3,
103 |    "id": "96f0b4c1-803a-436f-93d5-31baab55faa5",
104 |    "metadata": {},
105 |    "outputs": [
106 |     {
107 |      "name": "stdout",
108 |      "output_type": "stream",
109 |      "text": [
110 |       "gen\tnevals\tfitness \tfitness_std\tfitness_max\tfitness_min\n",
111 |       "0  \t15    \t0.773962\t0.131052   \t0.914778   \t0.628165   \n",
112 |       "1  \t28    \t0.888608\t0.0588224  \t0.914778   \t0.673165   \n",
113 |       "2  \t29    \t0.911424\t0.00855215 \t0.914778   \t0.88962    \n",
114 |       "3  \t28    \t0.914778\t4.44089e-16\t0.914778   \t0.914778   \n",
115 |       "4  \t28    \t0.914778\t4.44089e-16\t0.914778   \t0.914778   \n",
116 |       "5  \t28    \t0.914778\t4.44089e-16\t0.914778   \t0.914778   \n",
117 |       "6  \t29    \t0.914778\t4.44089e-16\t0.914778   \t0.914778   \n",
118 |       "7  \t27    \t0.918297\t0.00703797 \t0.932373   \t0.914778   \n",
119 |       "8  \t27    \t0.922989\t0.0087779  \t0.932373   \t0.914778   \n",
120 |       "9  \t29    \t0.928854\t0.00703797 \t0.932373   \t0.914778   \n",
121 |       "10 \t29    \t0.932373\t3.33067e-16\t0.932373   \t0.932373   \n",
122 |       "11 \t29    \t0.932373\t3.33067e-16\t0.932373   \t0.932373   \n",
123 |       "12 \t29    \t0.932373\t3.33067e-16\t0.932373   \t0.932373   \n",
124 |       "13 \t29    \t0.932861\t0.000974684\t0.93481    \t0.932373   \n",
125 |       "14 \t29    \t0.933023\t0.00107755 \t0.93481    \t0.932373   \n",
126 |       "15 \t28    \t0.93416 \t0.00107755 \t0.93481    \t0.932373   \n",
127 |       "16 \t29    \t0.93481 \t3.33067e-16\t0.93481    \t0.93481    \n",
128 |       "17 \t29    \t0.93481 \t3.33067e-16\t0.93481    \t0.93481    \n",
129 |       "18 \t29    \t0.93481 \t3.33067e-16\t0.93481    \t0.93481    \n",
130 |       "19 \t28    \t0.93481 \t3.33067e-16\t0.93481    \t0.93481    \n",
131 |       "20 \t29    \t0.93481 \t3.33067e-16\t0.93481    \t0.93481    \n"
132 |      ]
133 |     },
134 |     {
135 |      "data": {
136 |       "text/plain": [
137 |        "0.9348101265822784"
138 |       ]
139 |      },
140 |      "execution_count": 3,
141 |      "metadata": {},
142 |      "output_type": "execute_result"
143 |     }
144 |    ],
145 |    "source": [
146 |     "import numpy as np\n",
147 |     "import scipy.stats\n",
148 |     "\n",
149 |     "from sklearn_genetic import GASearchCV\n",
150 |     "from sklearn_genetic.space import Integer, Categorical, Continuous\n",
151 |     "from sklearn.tree import DecisionTreeClassifier\n",
152 |     "\n",
153 |     "\n",
154 |     "clf = DecisionTreeClassifier(random_state=123)\n",
155 |     "\n",
156 |     "params =  {\n",
157 |     "    'min_samples_split': Integer(2, 12),\n",
158 |     "    'min_impurity_decrease': Continuous(0.0, 0.5),\n",
159 |     "    'max_depth': Categorical([6, 16, None])\n",
160 |     "}\n",
161 |     "\n",
162 |     "search = GASearchCV(\n",
163 |     "    estimator=clf,\n",
164 |     "    cv=5,\n",
165 |     "    population_size=15,\n",
166 |     "    generations=20,\n",
167 |     "    tournament_size=3,\n",
168 |     "    elitism=True,\n",
169 |     "    keep_top_k=4,\n",
170 |     "    crossover_probability=0.9,\n",
171 |     "    mutation_probability=0.05,\n",
172 |     "    param_grid=params,\n",
173 |     "    criteria='max',\n",
174 |     "    algorithm='eaMuCommaLambda',\n",
175 |     "    n_jobs=-1)\n",
176 |     "\n",
177 |     "search.fit(X_train, y_train)\n",
178 |     "\n",
179 |     "search.best_score_"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 4,
185 |    "id": "2c26399d-ebfc-4b06-86d9-36e49711e908",
186 |    "metadata": {},
187 |    "outputs": [
188 |     {
189 |      "data": {
190 |       "text/plain": [
191 |        "{'min_samples_split': 8,\n",
192 |        " 'min_impurity_decrease': 0.006258039752250311,\n",
193 |        " 'max_depth': 16}"
194 |       ]
195 |      },
196 |      "execution_count": 4,
197 |      "metadata": {},
198 |      "output_type": "execute_result"
199 |     }
200 |    ],
201 |    "source": [
202 |     "search.best_params_"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": 5,
208 |    "id": "763e816b-6437-45a9-812f-8b429472d75e",
209 |    "metadata": {},
210 |    "outputs": [
211 |     {
212 |      "name": "stdout",
213 |      "output_type": "stream",
214 |      "text": [
215 |       "Training Accuracy: 0.99\n",
216 |       "Test Accuracy: 0.94\n"
217 |      ]
218 |     }
219 |    ],
220 |    "source": [
221 |     "print(f\"Training Accuracy: {search.best_estimator_.score(X_train, y_train):0.2f}\")\n",
222 |     "print(f\"Test Accuracy: {search.best_estimator_.score(X_test, y_test):0.2f}\")"
223 |    ]
224 |   }
225 |  ],
226 |  "metadata": {
227 |   "kernelspec": {
228 |    "display_name": "Python 3 (ipykernel)",
229 |    "language": "python",
230 |    "name": "python3"
231 |   },
232 |   "language_info": {
233 |    "codemirror_mode": {
234 |     "name": "ipython",
235 |     "version": 3
236 |    },
237 |    "file_extension": ".py",
238 |    "mimetype": "text/x-python",
239 |    "name": "python",
240 |    "nbconvert_exporter": "python",
241 |    "pygments_lexer": "ipython3",
242 |    "version": "3.9.6"
243 |   }
244 |  },
245 |  "nbformat": 4,
246 |  "nbformat_minor": 5
247 | }
248 | 


--------------------------------------------------------------------------------
/hyperparameter-tuning-examples/03.1-hyperopt-decisiontree-example.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "39329df3-1f99-4b11-9405-5969d52368a7",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Deciscion Tree & Hyperopt Example"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "c7d5f0ab-33cd-40f2-82e7-fb2747f04f89",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "Example showing how to use the Hyperopt library (http://hyperopt.github.io) for Bayesian hyperparameter optimization (via tree of parzen estimator)"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 1,
 22 |    "id": "7f61a90e-a119-4bd0-af21-38604c5b4eec",
 23 |    "metadata": {},
 24 |    "outputs": [
 25 |     {
 26 |      "name": "stdout",
 27 |      "output_type": "stream",
 28 |      "text": [
 29 |       "scikit-learn: 1.0\n",
 30 |       "hyperopt    : 0.2.5\n",
 31 |       "\n"
 32 |      ]
 33 |     }
 34 |    ],
 35 |    "source": [
 36 |     "%load_ext watermark\n",
 37 |     "%watermark -p scikit-learn,hyperopt"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "markdown",
 42 |    "id": "1f0489c2-dd9c-4e71-a78c-e01201762b37",
 43 |    "metadata": {},
 44 |    "source": [
 45 |     "## Dataset"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": 2,
 51 |    "id": "271b17ff-5ea4-4161-8b7f-20ba8131d666",
 52 |    "metadata": {},
 53 |    "outputs": [
 54 |     {
 55 |      "name": "stdout",
 56 |      "output_type": "stream",
 57 |      "text": [
 58 |       "Train/Valid/Test sizes: 398 80 171\n"
 59 |      ]
 60 |     }
 61 |    ],
 62 |    "source": [
 63 |     "from sklearn import model_selection\n",
 64 |     "from sklearn.model_selection import train_test_split\n",
 65 |     "from sklearn import datasets\n",
 66 |     "\n",
 67 |     "\n",
 68 |     "data = datasets.load_breast_cancer()\n",
 69 |     "X, y = data.data, data.target\n",
 70 |     "\n",
 71 |     "X_train, X_test, y_train, y_test = \\\n",
 72 |     "    train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)\n",
 73 |     "\n",
 74 |     "X_train_sub, X_valid, y_train_sub, y_valid = \\\n",
 75 |     "    train_test_split(X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)\n",
 76 |     "\n",
 77 |     "print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "markdown",
 82 |    "id": "0affc454-9f07-48e6-bcee-e6253d968247",
 83 |    "metadata": {},
 84 |    "source": [
 85 |     "## Hyperopt"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": 3,
 91 |    "id": "53282fd6-1292-4b4d-a0b7-980707d61c3c",
 92 |    "metadata": {},
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "from hyperopt import Trials, STATUS_OK, tpe, hp, fmin\n",
 96 |     "import hyperopt.pyll.stochastic"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "markdown",
101 |    "id": "5435889f-3cd7-45cd-abb2-632e3b034194",
102 |    "metadata": {},
103 |    "source": [
104 |     "Some random sampling examples:"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": 4,
110 |    "id": "7ca6f8f6-0c78-434a-8121-a83b5708e143",
111 |    "metadata": {},
112 |    "outputs": [
113 |     {
114 |      "data": {
115 |       "text/plain": [
116 |        "1.8925662130833578"
117 |       ]
118 |      },
119 |      "execution_count": 4,
120 |      "metadata": {},
121 |      "output_type": "execute_result"
122 |     }
123 |    ],
124 |    "source": [
125 |     "hyperopt.pyll.stochastic.sample(hp.loguniform('test', 1e-5, 1)) # range e^{low} to e^{high}"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": 5,
131 |    "id": "b2adc867-2d5a-44bd-8115-195ed53d6a7e",
132 |    "metadata": {},
133 |    "outputs": [
134 |     {
135 |      "data": {
136 |       "text/plain": [
137 |        "1.1"
138 |       ]
139 |      },
140 |      "execution_count": 5,
141 |      "metadata": {},
142 |      "output_type": "execute_result"
143 |     }
144 |    ],
145 |    "source": [
146 |     "hyperopt.pyll.stochastic.sample(hp.qloguniform('test', 1e-5, 1, 0.1)) # rounded to 0.1"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": 6,
152 |    "id": "9a6bb270-d2a1-4179-a770-39bad5a8332c",
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "from sklearn.model_selection import cross_val_score\n",
157 |     "from sklearn.tree import DecisionTreeClassifier\n",
158 |     "import numpy as np\n",
159 |     "\n",
160 |     "\n",
161 |     "\n",
162 |     "params =  {\n",
163 |     "    'min_samples_split': hp.choice('min_samples_split', np.arange(2, 10)),\n",
164 |     "    'min_impurity_decrease': hp.quniform('min_impurity_decrease', 0.0, 0.5, 0.05),\n",
165 |     "    'max_depth': hp.choice('max_depth', [6, 16, None])\n",
166 |     "}\n",
167 |     "\n",
168 |     "\n",
169 |     "\n",
170 |     "def optimization_objective(params):\n",
171 |     "\n",
172 |     "\n",
173 |     "    tree = DecisionTreeClassifier(random_state=123, **params)\n",
174 |     "    tree.fit(X_train, y_train)\n",
175 |     "    \n",
176 |     "    accuracies = cross_val_score(\n",
177 |     "        estimator=tree, X=X_train, y=y_train, cv=10, n_jobs=-1)\n",
178 |     "\n",
179 |     "    score = accuracies.mean()\n",
180 |     "\n",
181 |     "    return {'loss':1-score, 'status': STATUS_OK}"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": 7,
187 |    "id": "a51829c6-234f-401f-84ed-a005f71d0150",
188 |    "metadata": {},
189 |    "outputs": [
190 |     {
191 |      "name": "stdout",
192 |      "output_type": "stream",
193 |      "text": [
194 |       "100%|████████| 50/50 [00:01<00:00, 32.09trial/s, best loss: 0.06756410256410261]\n"
195 |      ]
196 |     }
197 |    ],
198 |    "source": [
199 |     "trials = Trials()\n",
200 |     "best = fmin(fn=optimization_objective,\n",
201 |     "            space=params,\n",
202 |     "            algo=tpe.suggest,\n",
203 |     "            max_evals=50,\n",
204 |     "            trials=trials)"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": 8,
210 |    "id": "2c26399d-ebfc-4b06-86d9-36e49711e908",
211 |    "metadata": {},
212 |    "outputs": [
213 |     {
214 |      "data": {
215 |       "text/plain": [
216 |        "{'max_depth': 2, 'min_impurity_decrease': 0.0, 'min_samples_split': 5}"
217 |       ]
218 |      },
219 |      "execution_count": 8,
220 |      "metadata": {},
221 |      "output_type": "execute_result"
222 |     }
223 |    ],
224 |    "source": [
225 |     "best"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "markdown",
230 |    "id": "42380f27-d982-4ae8-8981-17b7224ebb04",
231 |    "metadata": {},
232 |    "source": [
233 |     "- Attention, `fmin` returns results from `hp.choice` as an index!"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": 9,
239 |    "id": "83e99f85-9ce2-494e-99ea-20ab49dc0b15",
240 |    "metadata": {},
241 |    "outputs": [
242 |     {
243 |      "name": "stdout",
244 |      "output_type": "stream",
245 |      "text": [
246 |       "{'max_depth': None, 'min_impurity_decrease': 0.0, 'min_samples_split': 7}\n"
247 |      ]
248 |     }
249 |    ],
250 |    "source": [
251 |     "from hyperopt import space_eval\n",
252 |     "\n",
253 |     "best_params = space_eval(params, best)\n",
254 |     "print(best_params)"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": 10,
260 |    "id": "fbb610d8-4846-4e9f-a589-adacd0042603",
261 |    "metadata": {},
262 |    "outputs": [
263 |     {
264 |      "data": {
265 |       "text/plain": [
266 |        "DecisionTreeClassifier(min_samples_split=7, random_state=123)"
267 |       ]
268 |      },
269 |      "execution_count": 10,
270 |      "metadata": {},
271 |      "output_type": "execute_result"
272 |     }
273 |    ],
274 |    "source": [
275 |     "tree = DecisionTreeClassifier(random_state=123, **best_params)\n",
276 |     "tree.fit(X_train, y_train)"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "code",
281 |    "execution_count": 11,
282 |    "id": "763e816b-6437-45a9-812f-8b429472d75e",
283 |    "metadata": {},
284 |    "outputs": [
285 |     {
286 |      "name": "stdout",
287 |      "output_type": "stream",
288 |      "text": [
289 |       "Training Accuracy: 0.99\n",
290 |       "Test Accuracy: 0.94\n"
291 |      ]
292 |     }
293 |    ],
294 |    "source": [
295 |     "print(f\"Training Accuracy: {tree.score(X_train, y_train):0.2f}\")\n",
296 |     "print(f\"Test Accuracy: {tree.score(X_test, y_test):0.2f}\")"
297 |    ]
298 |   }
299 |  ],
300 |  "metadata": {
301 |   "kernelspec": {
302 |    "display_name": "Python 3 (ipykernel)",
303 |    "language": "python",
304 |    "name": "python3"
305 |   },
306 |   "language_info": {
307 |    "codemirror_mode": {
308 |     "name": "ipython",
309 |     "version": 3
310 |    },
311 |    "file_extension": ".py",
312 |    "mimetype": "text/x-python",
313 |    "name": "python",
314 |    "nbconvert_exporter": "python",
315 |    "pygments_lexer": "ipython3",
316 |    "version": "3.9.6"
317 |   }
318 |  },
319 |  "nbformat": 4,
320 |  "nbformat_minor": 5
321 | }
322 | 


--------------------------------------------------------------------------------
/07-ensembles/code/07-06_stacking.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# STAT451: Machine Learning -- L07: Ensemble Methods Part 1/2"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "STAT 451: Intro to Machine Learning (Fall 2021)  \n",
 15 |     "Instructor: Sebastian Raschka"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": 1,
 21 |    "metadata": {},
 22 |    "outputs": [
 23 |     {
 24 |      "name": "stdout",
 25 |      "output_type": "stream",
 26 |      "text": [
 27 |       "scikit-learn: 1.0\n",
 28 |       "mlxtend     : 0.19.0\n",
 29 |       "\n"
 30 |      ]
 31 |     }
 32 |    ],
 33 |    "source": [
 34 |     "%load_ext watermark\n",
 35 |     "%watermark -p scikit-learn,mlxtend"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "markdown",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "# Stacking"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "markdown",
 47 |    "metadata": {},
 48 |    "source": [
 49 |     "## Dataset"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": 2,
 55 |    "metadata": {},
 56 |    "outputs": [
 57 |     {
 58 |      "name": "stdout",
 59 |      "output_type": "stream",
 60 |      "text": [
 61 |       "Train/Valid/Test sizes: 318 80 171\n"
 62 |      ]
 63 |     }
 64 |    ],
 65 |    "source": [
 66 |     "from sklearn.model_selection import train_test_split\n",
 67 |     "from sklearn import datasets\n",
 68 |     "\n",
 69 |     "data = datasets.load_breast_cancer()\n",
 70 |     "X, y = data.data, data.target\n",
 71 |     "\n",
 72 |     "X_temp, X_test, y_temp, y_test = \\\n",
 73 |     "    train_test_split(X, y, test_size=0.3, random_state=123, stratify=y)\n",
 74 |     "\n",
 75 |     "X_train, X_valid, y_train, y_valid = \\\n",
 76 |     "    train_test_split(X_temp, y_temp, test_size=0.2, random_state=123, stratify=y_temp)\n",
 77 |     "\n",
 78 |     "print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "markdown",
 83 |    "metadata": {},
 84 |    "source": [
 85 |     "## MLxtend standard Stacking (prone to overfitting)"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": 3,
 91 |    "metadata": {},
 92 |    "outputs": [
 93 |     {
 94 |      "name": "stdout",
 95 |      "output_type": "stream",
 96 |      "text": [
 97 |       "Training Accuracy: 1.00\n",
 98 |       "Validation Accuracy: 0.96\n",
 99 |       "Test Accuracy: 0.98\n"
100 |      ]
101 |     }
102 |    ],
103 |    "source": [
104 |     "from sklearn import model_selection\n",
105 |     "from sklearn.linear_model import LogisticRegression\n",
106 |     "from sklearn.neighbors import KNeighborsClassifier\n",
107 |     "from sklearn.ensemble import AdaBoostClassifier\n",
108 |     "from sklearn.ensemble import RandomForestClassifier\n",
109 |     "from sklearn.tree import DecisionTreeClassifier\n",
110 |     "from sklearn.ensemble import HistGradientBoostingClassifier\n",
111 |     "from mlxtend.classifier import StackingClassifier\n",
112 |     "\n",
113 |     "\n",
114 |     "clf1 = KNeighborsClassifier(n_neighbors=5)\n",
115 |     "clf2 = RandomForestClassifier(random_state=123)\n",
116 |     "clf3 = HistGradientBoostingClassifier(random_state=123)\n",
117 |     "clf4 = AdaBoostClassifier(random_state=123)\n",
118 |     "clf5 = DecisionTreeClassifier(random_state=123,\n",
119 |     "                              max_depth=None)\n",
120 |     "\n",
121 |     "lr = LogisticRegression(random_state=123)\n",
122 |     "\n",
123 |     "sclf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5], \n",
124 |     "                          meta_classifier=lr)\n",
125 |     "\n",
126 |     "\n",
127 |     "sclf.fit(X_train, y_train)\n",
128 |     "print(\"Training Accuracy: %0.2f\" % sclf.score(X_train, y_train))\n",
129 |     "print(\"Validation Accuracy: %0.2f\" % sclf.score(X_valid, y_valid))\n",
130 |     "print(\"Test Accuracy: %0.2f\" % sclf.score(X_test, y_test))"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "markdown",
135 |    "metadata": {},
136 |    "source": [
137 |     "## MLxtend Stacking + CV"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": 4,
143 |    "metadata": {},
144 |    "outputs": [
145 |     {
146 |      "name": "stdout",
147 |      "output_type": "stream",
148 |      "text": [
149 |       "Training Accuracy: 1.00\n",
150 |       "Validation Accuracy: 0.97\n",
151 |       "Test Accuracy: 0.98\n"
152 |      ]
153 |     }
154 |    ],
155 |    "source": [
156 |     "from mlxtend.classifier import StackingCVClassifier\n",
157 |     "\n",
158 |     "\n",
159 |     "clf1 = KNeighborsClassifier(n_neighbors=5)\n",
160 |     "clf2 = RandomForestClassifier(random_state=123)\n",
161 |     "clf3 = HistGradientBoostingClassifier(random_state=123)\n",
162 |     "clf4 = AdaBoostClassifier(random_state=123)\n",
163 |     "clf5 = DecisionTreeClassifier(random_state=123,\n",
164 |     "                              max_depth=None)\n",
165 |     "\n",
166 |     "lr = LogisticRegression(random_state=123)\n",
167 |     "\n",
168 |     "sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5], \n",
169 |     "                            meta_classifier=lr, \n",
170 |     "                            cv=10,\n",
171 |     "                            random_state=123)\n",
172 |     "\n",
173 |     "\n",
174 |     "sclf.fit(X_train, y_train)\n",
175 |     "print(\"Training Accuracy: %0.2f\" % sclf.score(X_train, y_train))\n",
176 |     "print(\"Validation Accuracy: %0.2f\" % sclf.score(X_valid, y_valid))\n",
177 |     "print(\"Test Accuracy: %0.2f\" % sclf.score(X_test, y_test))"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "markdown",
182 |    "metadata": {},
183 |    "source": [
184 |     "## Stacking Classifier from scikit-learn (also includes CV)"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": 5,
190 |    "metadata": {},
191 |    "outputs": [
192 |     {
193 |      "name": "stdout",
194 |      "output_type": "stream",
195 |      "text": [
196 |       "Training Accuracy: 1.00\n",
197 |       "Validation Accuracy: 0.99\n",
198 |       "Test Accuracy: 0.98\n"
199 |      ]
200 |     }
201 |    ],
202 |    "source": [
203 |     "from sklearn.ensemble import StackingClassifier\n",
204 |     "\n",
205 |     "\n",
206 |     "clf1 = KNeighborsClassifier(n_neighbors=5)\n",
207 |     "clf2 = RandomForestClassifier(random_state=123)\n",
208 |     "clf3 = HistGradientBoostingClassifier(random_state=123)\n",
209 |     "clf4 = AdaBoostClassifier(random_state=123)\n",
210 |     "clf5 = DecisionTreeClassifier(random_state=123,\n",
211 |     "                              max_depth=None)\n",
212 |     "\n",
213 |     "lr = LogisticRegression(random_state=123)\n",
214 |     "\n",
215 |     "estimators = [('clf1', clf1),\n",
216 |     "              ('clf2', clf2),\n",
217 |     "              ('clf3', clf3),\n",
218 |     "              ('clf4', clf4),\n",
219 |     "              ('clf5', clf5)]\n",
220 |     "\n",
221 |     "sclf = StackingClassifier(estimators=estimators, \n",
222 |     "                          final_estimator=lr, \n",
223 |     "                          cv=10)\n",
224 |     "\n",
225 |     "\n",
226 |     "sclf.fit(X_train, y_train)\n",
227 |     "print(\"Training Accuracy: %0.2f\" % sclf.score(X_train, y_train))\n",
228 |     "print(\"Validation Accuracy: %0.2f\" % sclf.score(X_valid, y_valid))\n",
229 |     "print(\"Test Accuracy: %0.2f\" % sclf.score(X_test, y_test))"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "markdown",
234 |    "metadata": {},
235 |    "source": [
236 |     "## MLxtend StackingCVClassifier with same behavior as scikit-learn above"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": 6,
242 |    "metadata": {},
243 |    "outputs": [
244 |     {
245 |      "name": "stdout",
246 |      "output_type": "stream",
247 |      "text": [
248 |       "Training Accuracy: 1.00\n",
249 |       "Validation Accuracy: 0.99\n",
250 |       "Test Accuracy: 0.98\n"
251 |      ]
252 |     }
253 |    ],
254 |    "source": [
255 |     "# stack_method{‘auto’, ‘predict_proba’, ‘decision_function’, ‘predict’}, default=’auto’\n",
256 |     "\n",
257 |     "\n",
258 |     "from mlxtend.classifier import StackingCVClassifier\n",
259 |     "\n",
260 |     "\n",
261 |     "sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5], \n",
262 |     "                            meta_classifier=lr, \n",
263 |     "                            use_probas=True, # changed\n",
264 |     "                            drop_proba_col='last',\n",
265 |     "                            #use_features_in_secondary=True,\n",
266 |     "                            cv=10,\n",
267 |     "                            random_state=123)\n",
268 |     "\n",
269 |     "\n",
270 |     "sclf.fit(X_train, y_train)\n",
271 |     "print(\"Training Accuracy: %0.2f\" % sclf.score(X_train, y_train))\n",
272 |     "print(\"Validation Accuracy: %0.2f\" % sclf.score(X_valid, y_valid))\n",
273 |     "print(\"Test Accuracy: %0.2f\" % sclf.score(X_test, y_test))"
274 |    ]
275 |   },
276 |   {
277 |    "cell_type": "code",
278 |    "execution_count": null,
279 |    "metadata": {},
280 |    "outputs": [],
281 |    "source": []
282 |   }
283 |  ],
284 |  "metadata": {
285 |   "kernelspec": {
286 |    "display_name": "Python 3 (ipykernel)",
287 |    "language": "python",
288 |    "name": "python3"
289 |   },
290 |   "language_info": {
291 |    "codemirror_mode": {
292 |     "name": "ipython",
293 |     "version": 3
294 |    },
295 |    "file_extension": ".py",
296 |    "mimetype": "text/x-python",
297 |    "name": "python",
298 |    "nbconvert_exporter": "python",
299 |    "pygments_lexer": "ipython3",
300 |    "version": "3.9.6"
301 |   }
302 |  },
303 |  "nbformat": 4,
304 |  "nbformat_minor": 4
305 | }
306 | 


--------------------------------------------------------------------------------
/hyperparameter-tuning-examples/05.2-successive-halving-stacking.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "39329df3-1f99-4b11-9405-5969d52368a7",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Stacking & Successive Halving Random + Search Example"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": 1,
 14 |    "id": "7f61a90e-a119-4bd0-af21-38604c5b4eec",
 15 |    "metadata": {},
 16 |    "outputs": [
 17 |     {
 18 |      "name": "stdout",
 19 |      "output_type": "stream",
 20 |      "text": [
 21 |       "scikit-learn: 1.0\n",
 22 |       "mlxtend     : 0.19.0\n",
 23 |       "\n"
 24 |      ]
 25 |     }
 26 |    ],
 27 |    "source": [
 28 |     "%load_ext watermark\n",
 29 |     "%watermark -p scikit-learn,mlxtend"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "markdown",
 34 |    "id": "1f0489c2-dd9c-4e71-a78c-e01201762b37",
 35 |    "metadata": {},
 36 |    "source": [
 37 |     "## Dataset"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 2,
 43 |    "id": "271b17ff-5ea4-4161-8b7f-20ba8131d666",
 44 |    "metadata": {},
 45 |    "outputs": [
 46 |     {
 47 |      "name": "stdout",
 48 |      "output_type": "stream",
 49 |      "text": [
 50 |       "Train/Valid/Test sizes: 398 80 171\n"
 51 |      ]
 52 |     }
 53 |    ],
 54 |    "source": [
 55 |     "from sklearn import model_selection\n",
 56 |     "from sklearn.model_selection import train_test_split\n",
 57 |     "from sklearn import datasets\n",
 58 |     "\n",
 59 |     "\n",
 60 |     "data = datasets.load_breast_cancer()\n",
 61 |     "X, y = data.data, data.target\n",
 62 |     "\n",
 63 |     "X_train, X_test, y_train, y_test = \\\n",
 64 |     "    train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)\n",
 65 |     "\n",
 66 |     "X_train_sub, X_valid, y_train_sub, y_valid = \\\n",
 67 |     "    train_test_split(X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)\n",
 68 |     "\n",
 69 |     "print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "id": "4cc20172-b2c7-4a8a-b310-714f658d3e23",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "## Baseline"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 3,
 83 |    "id": "96f0b4c1-803a-436f-93d5-31baab55faa5",
 84 |    "metadata": {},
 85 |    "outputs": [],
 86 |    "source": [
 87 |     "from sklearn.ensemble import RandomForestClassifier\n",
 88 |     "from xgboost import XGBClassifier\n",
 89 |     "from mlxtend.classifier import StackingCVClassifier\n",
 90 |     "from sklearn.linear_model import LogisticRegression\n",
 91 |     "\n",
 92 |     "\n",
 93 |     "forest = RandomForestClassifier(n_estimators=100,\n",
 94 |     "                                random_state=123)\n",
 95 |     "\n",
 96 |     "boost = XGBClassifier(random_state=123, verbosity=0, use_label_encoder=False)\n",
 97 |     "\n",
 98 |     "metaclassifier = LogisticRegression(random_state=123)\n",
 99 |     "\n",
100 |     "sclf = StackingCVClassifier(classifiers=[forest, boost], \n",
101 |     "                            meta_classifier=metaclassifier, \n",
102 |     "                            random_state=123)\n"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "id": "8acc4951-e0de-4dfb-98d4-fd0a7e14a896",
108 |    "metadata": {},
109 |    "source": [
110 |     "Random forest:"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": 4,
116 |    "id": "30d17dbf-2a07-4482-a942-5e6d936f2bd0",
117 |    "metadata": {},
118 |    "outputs": [
119 |     {
120 |      "name": "stdout",
121 |      "output_type": "stream",
122 |      "text": [
123 |       "Training Accuracy: 1.00\n",
124 |       "Validation Accuracy: 0.95\n",
125 |       "Test Accuracy: 0.96\n"
126 |      ]
127 |     }
128 |    ],
129 |    "source": [
130 |     "forest.fit(X_train_sub, y_train_sub)\n",
131 |     "print(f\"Training Accuracy: {forest.score(X_train_sub, y_train_sub):0.2f}\")\n",
132 |     "print(f\"Validation Accuracy: {forest.score(X_valid, y_valid):0.2f}\")\n",
133 |     "print(f\"Test Accuracy: {forest.score(X_test, y_test):0.2f}\")"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "markdown",
138 |    "id": "b44ea7b8-ca93-452b-8f61-b4ea067fb883",
139 |    "metadata": {},
140 |    "source": [
141 |     "Gradient boosting:"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": 5,
147 |    "id": "524ec950-396c-40de-b5c1-87b8473ffcb7",
148 |    "metadata": {},
149 |    "outputs": [
150 |     {
151 |      "name": "stdout",
152 |      "output_type": "stream",
153 |      "text": [
154 |       "Training Accuracy: 1.00\n",
155 |       "Validation Accuracy: 0.97\n",
156 |       "Test Accuracy: 0.95\n"
157 |      ]
158 |     }
159 |    ],
160 |    "source": [
161 |     "boost.fit(X_train_sub, y_train_sub)\n",
162 |     "print(f\"Training Accuracy: {boost.score(X_train_sub, y_train_sub):0.2f}\")\n",
163 |     "print(f\"Validation Accuracy: {boost.score(X_valid, y_valid):0.2f}\")\n",
164 |     "print(f\"Test Accuracy: {boost.score(X_test, y_test):0.2f}\")"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "markdown",
169 |    "id": "5010e087-1d01-4487-9792-01ea22877cb1",
170 |    "metadata": {},
171 |    "source": [
172 |     "Stacking:"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "code",
177 |    "execution_count": 6,
178 |    "id": "85ea8745-5bde-4221-b56d-391d2ebb36f5",
179 |    "metadata": {},
180 |    "outputs": [
181 |     {
182 |      "name": "stdout",
183 |      "output_type": "stream",
184 |      "text": [
185 |       "Training Accuracy: 1.00\n",
186 |       "Validation Accuracy: 0.97\n",
187 |       "Test Accuracy: 0.95\n"
188 |      ]
189 |     }
190 |    ],
191 |    "source": [
192 |     "sclf.fit(X_train_sub, y_train_sub)\n",
193 |     "print(f\"Training Accuracy: {sclf.score(X_train_sub, y_train_sub):0.2f}\")\n",
194 |     "print(f\"Validation Accuracy: {sclf.score(X_valid, y_valid):0.2f}\")\n",
195 |     "print(f\"Test Accuracy: {sclf.score(X_test, y_test):0.2f}\")"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "markdown",
200 |    "id": "5a824414-2fc2-40bc-b9e8-972a80af153b",
201 |    "metadata": {},
202 |    "source": [
203 |     "## Successive Halving + Random Search"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "markdown",
208 |    "id": "81ea4cb4-f9e6-4f5d-9a0d-4ab4167d1883",
209 |    "metadata": {},
210 |    "source": [
211 |     "\n",
212 |     "- More info: \n",
213 |     "  - https://scikit-learn.org/stable/modules/grid_search.html#successive-halving-user-guide\n",
214 |     "  - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.HalvingRandomSearchCV.html#sklearn.model_selection.HalvingRandomSearchCV"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 7,
220 |    "id": "e80b4870-e6d2-4f62-91dd-2b53afaac49c",
221 |    "metadata": {},
222 |    "outputs": [
223 |     {
224 |      "data": {
225 |       "text/plain": [
226 |        "0.9495238095238095"
227 |       ]
228 |      },
229 |      "execution_count": 7,
230 |      "metadata": {},
231 |      "output_type": "execute_result"
232 |     }
233 |    ],
234 |    "source": [
235 |     "import numpy as np\n",
236 |     "import scipy.stats\n",
237 |     "\n",
238 |     "from sklearn.experimental import enable_halving_search_cv\n",
239 |     "from sklearn.model_selection import HalvingRandomSearchCV\n",
240 |     "\n",
241 |     "from sklearn.pipeline import make_pipeline\n",
242 |     "\n",
243 |     "\n",
244 |     "pipe = make_pipeline(sclf)\n",
245 |     "\n",
246 |     "params = {\n",
247 |     "    'stackingcvclassifier__use_probas': [True],\n",
248 |     "    'stackingcvclassifier__drop_proba_col': [None, 'last'],\n",
249 |     "    'stackingcvclassifier__xgbclassifier__reg_alpha': scipy.stats.loguniform(1e-5, 1),\n",
250 |     "    'stackingcvclassifier__xgbclassifier__max_depth': [2, 4, 6, 8],\n",
251 |     "    'stackingcvclassifier__randomforestclassifier__n_estimators': [10, 100]\n",
252 |     "}\n",
253 |     "\n",
254 |     "\n",
255 |     "search = HalvingRandomSearchCV(\n",
256 |     "    estimator=pipe, \n",
257 |     "    param_distributions=params,\n",
258 |     "    n_candidates='exhaust',\n",
259 |     "    resource='n_samples',\n",
260 |     "    factor=3,\n",
261 |     "    random_state=123,\n",
262 |     "    n_jobs=1)\n",
263 |     "\n",
264 |     "search.fit(X_train, y_train)\n",
265 |     "search.best_score_"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "code",
270 |    "execution_count": 8,
271 |    "id": "2c26399d-ebfc-4b06-86d9-36e49711e908",
272 |    "metadata": {},
273 |    "outputs": [
274 |     {
275 |      "data": {
276 |       "text/plain": [
277 |        "{'stackingcvclassifier__drop_proba_col': None,\n",
278 |        " 'stackingcvclassifier__randomforestclassifier__n_estimators': 10,\n",
279 |        " 'stackingcvclassifier__use_probas': True,\n",
280 |        " 'stackingcvclassifier__xgbclassifier__max_depth': 2,\n",
281 |        " 'stackingcvclassifier__xgbclassifier__reg_alpha': 0.0032298576306232034}"
282 |       ]
283 |      },
284 |      "execution_count": 8,
285 |      "metadata": {},
286 |      "output_type": "execute_result"
287 |     }
288 |    ],
289 |    "source": [
290 |     "search.best_params_"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "code",
295 |    "execution_count": 9,
296 |    "id": "763e816b-6437-45a9-812f-8b429472d75e",
297 |    "metadata": {},
298 |    "outputs": [
299 |     {
300 |      "name": "stdout",
301 |      "output_type": "stream",
302 |      "text": [
303 |       "Training Accuracy: 1.00\n",
304 |       "Test Accuracy: 0.96\n"
305 |      ]
306 |     }
307 |    ],
308 |    "source": [
309 |     "print(f\"Training Accuracy: {search.best_estimator_.score(X_train, y_train):0.2f}\")\n",
310 |     "print(f\"Test Accuracy: {search.best_estimator_.score(X_test, y_test):0.2f}\")"
311 |    ]
312 |   }
313 |  ],
314 |  "metadata": {
315 |   "kernelspec": {
316 |    "display_name": "Python 3 (ipykernel)",
317 |    "language": "python",
318 |    "name": "python3"
319 |   },
320 |   "language_info": {
321 |    "codemirror_mode": {
322 |     "name": "ipython",
323 |     "version": 3
324 |    },
325 |    "file_extension": ".py",
326 |    "mimetype": "text/x-python",
327 |    "name": "python",
328 |    "nbconvert_exporter": "python",
329 |    "pygments_lexer": "ipython3",
330 |    "version": "3.9.6"
331 |   }
332 |  },
333 |  "nbformat": 4,
334 |  "nbformat_minor": 5
335 | }
336 | 


--------------------------------------------------------------------------------
/07-ensembles/code/07-04_gradient-boosting.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# STAT451: Machine Learning -- L07: Ensemble Methods Part 2/3"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "STAT 451: Intro to Machine Learning (Fall 2021)  \n",
 15 |     "Instructor: Sebastian Raschka (sraschka@wisc.edu)  \n"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": 1,
 21 |    "metadata": {},
 22 |    "outputs": [
 23 |     {
 24 |      "name": "stdout",
 25 |      "output_type": "stream",
 26 |      "text": [
 27 |       "scikit-learn: 1.0\n",
 28 |       "\n"
 29 |      ]
 30 |     }
 31 |    ],
 32 |    "source": [
 33 |     "%load_ext watermark\n",
 34 |     "%watermark -p scikit-learn"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "# Gradient Boosting"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 2,
 47 |    "metadata": {},
 48 |    "outputs": [
 49 |     {
 50 |      "name": "stdout",
 51 |      "output_type": "stream",
 52 |      "text": [
 53 |       "Train/Valid/Test sizes: 318 80 171\n"
 54 |      ]
 55 |     }
 56 |    ],
 57 |    "source": [
 58 |     "from sklearn import model_selection\n",
 59 |     "from sklearn.model_selection import train_test_split\n",
 60 |     "from sklearn import datasets\n",
 61 |     "\n",
 62 |     "\n",
 63 |     "data = datasets.load_breast_cancer()\n",
 64 |     "X, y = data.data, data.target\n",
 65 |     "\n",
 66 |     "X_temp, X_test, y_temp, y_test = \\\n",
 67 |     "    train_test_split(X, y, test_size=0.3, random_state=123, stratify=y)\n",
 68 |     "\n",
 69 |     "X_train, X_valid, y_train, y_valid = \\\n",
 70 |     "    train_test_split(X_temp, y_temp, test_size=0.2, random_state=123, stratify=y_temp)\n",
 71 |     "\n",
 72 |     "print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "## Original gradient boosting"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": 3,
 85 |    "metadata": {},
 86 |    "outputs": [
 87 |     {
 88 |      "name": "stdout",
 89 |      "output_type": "stream",
 90 |      "text": [
 91 |       "Training Accuracy: 1.00\n",
 92 |       "Validation Accuracy: 0.90\n",
 93 |       "Test Accuracy: 0.92\n"
 94 |      ]
 95 |     }
 96 |    ],
 97 |    "source": [
 98 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
 99 |     "\n",
100 |     "\n",
101 |     "boost = GradientBoostingClassifier(\n",
102 |     "    learning_rate=0.1,\n",
103 |     "    n_estimators=100,\n",
104 |     "    max_depth=8,\n",
105 |     "    random_state=1)\n",
106 |     "\n",
107 |     "boost.fit(X_train, y_train)\n",
108 |     "    \n",
109 |     "    \n",
110 |     "print(\"Training Accuracy: %0.2f\" % boost.score(X_train, y_train))\n",
111 |     "print(\"Validation Accuracy: %0.2f\" % boost.score(X_valid, y_valid))\n",
112 |     "print(\"Test Accuracy: %0.2f\" % boost.score(X_test, y_test))"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "markdown",
117 |    "metadata": {},
118 |    "source": [
119 |     "## HistGradientBoostingClassifier (inspired by LightGBM)"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": 4,
125 |    "metadata": {},
126 |    "outputs": [
127 |     {
128 |      "name": "stdout",
129 |      "output_type": "stream",
130 |      "text": [
131 |       "Training Accuracy: 1.00\n",
132 |       "Validation Accuracy: 0.96\n",
133 |       "Test Accuracy: 0.97\n"
134 |      ]
135 |     }
136 |    ],
137 |    "source": [
138 |     "#from sklearn.experimental import enable_hist_gradient_boosting\n",
139 |     "from sklearn.ensemble import HistGradientBoostingClassifier\n",
140 |     "\n",
141 |     "\n",
142 |     "boost = HistGradientBoostingClassifier(\n",
143 |     "    learning_rate=0.1,\n",
144 |     "    #n_estimators=100,\n",
145 |     "    #max_depth=8,\n",
146 |     "    random_state=1)\n",
147 |     "\n",
148 |     "boost.fit(X_train, y_train)\n",
149 |     "    \n",
150 |     "print(\"Training Accuracy: %0.2f\" % boost.score(X_train, y_train))\n",
151 |     "print(\"Validation Accuracy: %0.2f\" % boost.score(X_valid, y_valid))\n",
152 |     "print(\"Test Accuracy: %0.2f\" % boost.score(X_test, y_test))"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "markdown",
157 |    "metadata": {},
158 |    "source": [
159 |     "```\n",
160 |     "import numpy as np\n",
161 |     "import xgboost as xgb\n",
162 |     "\n",
163 |     "\n",
164 |     "dtrain = xgb.DMatrix(X_train, label=y_train)\n",
165 |     "dtest = xgb.DMatrix(X_test, label=y_test)\n",
166 |     "\n",
167 |     "\n",
168 |     "param = {\n",
169 |     "    'max_depth': 8,\n",
170 |     "    'eta': 0.1,  # learning rate\n",
171 |     "    'objective': 'multi:softprob',  # loss function for multiclass\n",
172 |     "    'num_class': 3}  # number of classes\n",
173 |     "\n",
174 |     "boost = xgb.train(param, dtrain, num_boost_round=100)\n",
175 |     "\n",
176 |     "y_pred = boost.predict(dtest)\n",
177 |     "y_labels = np.argmax(y_pred, axis=1)\n",
178 |     "\n",
179 |     "\n",
180 |     "print(\"Test Accuracy: %0.2f\" % (y_labels == y_test).mean())\n",
181 |     "```"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "markdown",
186 |    "metadata": {},
187 |    "source": [
188 |     "## XGBoost"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": 5,
194 |    "metadata": {},
195 |    "outputs": [],
196 |    "source": [
197 |     "# https://xgboost.readthedocs.io/en/latest/build.html"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "code",
202 |    "execution_count": 6,
203 |    "metadata": {},
204 |    "outputs": [],
205 |    "source": [
206 |     "#!pip install xgboost"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": 7,
212 |    "metadata": {},
213 |    "outputs": [
214 |     {
215 |      "name": "stdout",
216 |      "output_type": "stream",
217 |      "text": [
218 |       "[07:41:34] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n",
219 |       "Training Accuracy: 1.00\n",
220 |       "Validation Accuracy: 0.95\n",
221 |       "Test Accuracy: 0.98\n"
222 |      ]
223 |     },
224 |     {
225 |      "name": "stderr",
226 |      "output_type": "stream",
227 |      "text": [
228 |       "/home/raschka/miniforge3/lib/python3.8/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n",
229 |       "  warnings.warn(label_encoder_deprecation_msg, UserWarning)\n"
230 |      ]
231 |     }
232 |    ],
233 |    "source": [
234 |     "import numpy as np\n",
235 |     "import xgboost as xgb\n",
236 |     "\n",
237 |     "\n",
238 |     "boost = xgb.XGBClassifier()\n",
239 |     "\n",
240 |     "boost.fit(X_train, y_train)\n",
241 |     "    \n",
242 |     "print(\"Training Accuracy: %0.2f\" % boost.score(X_train, y_train))\n",
243 |     "print(\"Validation Accuracy: %0.2f\" % boost.score(X_valid, y_valid))\n",
244 |     "print(\"Test Accuracy: %0.2f\" % boost.score(X_test, y_test))"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "markdown",
249 |    "metadata": {},
250 |    "source": [
251 |     "## LightGBM"
252 |    ]
253 |   },
254 |   {
255 |    "cell_type": "code",
256 |    "execution_count": null,
257 |    "metadata": {},
258 |    "outputs": [],
259 |    "source": [
260 |     "# https://lightgbm.readthedocs.io/en/latest/Installation-Guide.html\n",
261 |     "# conda install -c conda-forge lightgbm"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "code",
266 |    "execution_count": 8,
267 |    "metadata": {},
268 |    "outputs": [
269 |     {
270 |      "name": "stdout",
271 |      "output_type": "stream",
272 |      "text": [
273 |       "Training Accuracy: 1.00\n",
274 |       "Validation Accuracy: 0.96\n",
275 |       "Test Accuracy: 0.98\n"
276 |      ]
277 |     }
278 |    ],
279 |    "source": [
280 |     "import lightgbm as lgb\n",
281 |     "\n",
282 |     "\n",
283 |     "boost = lgb.LGBMClassifier()\n",
284 |     "\n",
285 |     "boost.fit(X_train, y_train)\n",
286 |     "\n",
287 |     "\n",
288 |     "print(\"Training Accuracy: %0.2f\" % boost.score(X_train, y_train))\n",
289 |     "print(\"Validation Accuracy: %0.2f\" % boost.score(X_valid, y_valid))\n",
290 |     "print(\"Test Accuracy: %0.2f\" % boost.score(X_test, y_test))"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "markdown",
295 |    "metadata": {},
296 |    "source": [
297 |     "## CatBoost"
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "code",
302 |    "execution_count": null,
303 |    "metadata": {},
304 |    "outputs": [],
305 |    "source": [
306 |     "# https://catboost.ai\n",
307 |     "# conda install -c conda-forge catboost"
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "code",
312 |    "execution_count": 13,
313 |    "metadata": {},
314 |    "outputs": [
315 |     {
316 |      "name": "stdout",
317 |      "output_type": "stream",
318 |      "text": [
319 |       "Training Accuracy: 1.00\n",
320 |       "Validation Accuracy: 0.97\n",
321 |       "Test Accuracy: 0.98\n"
322 |      ]
323 |     }
324 |    ],
325 |    "source": [
326 |     "from catboost import CatBoostClassifier\n",
327 |     "\n",
328 |     "\n",
329 |     "boost = CatBoostClassifier(verbose=0)\n",
330 |     "\n",
331 |     "boost.fit(X_train, y_train)\n",
332 |     "\n",
333 |     "\n",
334 |     "print(\"Training Accuracy: %0.2f\" % boost.score(X_train, y_train))\n",
335 |     "print(\"Validation Accuracy: %0.2f\" % boost.score(X_valid, y_valid))\n",
336 |     "print(\"Test Accuracy: %0.2f\" % boost.score(X_test, y_test))"
337 |    ]
338 |   },
339 |   {
340 |    "cell_type": "code",
341 |    "execution_count": null,
342 |    "metadata": {},
343 |    "outputs": [],
344 |    "source": []
345 |   }
346 |  ],
347 |  "metadata": {
348 |   "kernelspec": {
349 |    "display_name": "Python 3",
350 |    "language": "python",
351 |    "name": "python3"
352 |   },
353 |   "language_info": {
354 |    "codemirror_mode": {
355 |     "name": "ipython",
356 |     "version": 3
357 |    },
358 |    "file_extension": ".py",
359 |    "mimetype": "text/x-python",
360 |    "name": "python",
361 |    "nbconvert_exporter": "python",
362 |    "pygments_lexer": "ipython3",
363 |    "version": "3.8.8"
364 |   }
365 |  },
366 |  "nbformat": 4,
367 |  "nbformat_minor": 4
368 | }
369 | 


--------------------------------------------------------------------------------
/hw02-starter/dataset/make-splits.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "c1a5df29-369d-423a-a8ed-3755d0734002",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import pandas as pd"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 2,
 16 |    "id": "91189b30-5a7d-4817-af41-9d92cc2817b8",
 17 |    "metadata": {},
 18 |    "outputs": [
 19 |     {
 20 |      "data": {
 21 |       "text/html": [
 22 |        "<div>\n",
 23 |        "<style scoped>\n",
 24 |        "    .dataframe tbody tr th:only-of-type {\n",
 25 |        "        vertical-align: middle;\n",
 26 |        "    }\n",
 27 |        "\n",
 28 |        "    .dataframe tbody tr th {\n",
 29 |        "        vertical-align: top;\n",
 30 |        "    }\n",
 31 |        "\n",
 32 |        "    .dataframe thead th {\n",
 33 |        "        text-align: right;\n",
 34 |        "    }\n",
 35 |        "</style>\n",
 36 |        "<table border=\"1\" class=\"dataframe\">\n",
 37 |        "  <thead>\n",
 38 |        "    <tr style=\"text-align: right;\">\n",
 39 |        "      <th></th>\n",
 40 |        "      <th>Area</th>\n",
 41 |        "      <th>Perimeter</th>\n",
 42 |        "      <th>MajorAxisLength</th>\n",
 43 |        "      <th>MinorAxisLength</th>\n",
 44 |        "      <th>AspectRation</th>\n",
 45 |        "      <th>Eccentricity</th>\n",
 46 |        "      <th>ConvexArea</th>\n",
 47 |        "      <th>EquivDiameter</th>\n",
 48 |        "      <th>Extent</th>\n",
 49 |        "      <th>Solidity</th>\n",
 50 |        "      <th>roundness</th>\n",
 51 |        "      <th>Compactness</th>\n",
 52 |        "      <th>ShapeFactor1</th>\n",
 53 |        "      <th>ShapeFactor2</th>\n",
 54 |        "      <th>ShapeFactor3</th>\n",
 55 |        "      <th>ShapeFactor4</th>\n",
 56 |        "      <th>Class</th>\n",
 57 |        "    </tr>\n",
 58 |        "  </thead>\n",
 59 |        "  <tbody>\n",
 60 |        "    <tr>\n",
 61 |        "      <th>0</th>\n",
 62 |        "      <td>28395</td>\n",
 63 |        "      <td>610.291</td>\n",
 64 |        "      <td>208.178117</td>\n",
 65 |        "      <td>173.888747</td>\n",
 66 |        "      <td>1.197191</td>\n",
 67 |        "      <td>0.549812</td>\n",
 68 |        "      <td>28715</td>\n",
 69 |        "      <td>190.141097</td>\n",
 70 |        "      <td>0.763923</td>\n",
 71 |        "      <td>0.988856</td>\n",
 72 |        "      <td>0.958027</td>\n",
 73 |        "      <td>0.913358</td>\n",
 74 |        "      <td>0.007332</td>\n",
 75 |        "      <td>0.003147</td>\n",
 76 |        "      <td>0.834222</td>\n",
 77 |        "      <td>0.998724</td>\n",
 78 |        "      <td>SEKER</td>\n",
 79 |        "    </tr>\n",
 80 |        "    <tr>\n",
 81 |        "      <th>1</th>\n",
 82 |        "      <td>28734</td>\n",
 83 |        "      <td>638.018</td>\n",
 84 |        "      <td>200.524796</td>\n",
 85 |        "      <td>182.734419</td>\n",
 86 |        "      <td>1.097356</td>\n",
 87 |        "      <td>0.411785</td>\n",
 88 |        "      <td>29172</td>\n",
 89 |        "      <td>191.272750</td>\n",
 90 |        "      <td>0.783968</td>\n",
 91 |        "      <td>0.984986</td>\n",
 92 |        "      <td>0.887034</td>\n",
 93 |        "      <td>0.953861</td>\n",
 94 |        "      <td>0.006979</td>\n",
 95 |        "      <td>0.003564</td>\n",
 96 |        "      <td>0.909851</td>\n",
 97 |        "      <td>0.998430</td>\n",
 98 |        "      <td>SEKER</td>\n",
 99 |        "    </tr>\n",
100 |        "    <tr>\n",
101 |        "      <th>2</th>\n",
102 |        "      <td>29380</td>\n",
103 |        "      <td>624.110</td>\n",
104 |        "      <td>212.826130</td>\n",
105 |        "      <td>175.931143</td>\n",
106 |        "      <td>1.209713</td>\n",
107 |        "      <td>0.562727</td>\n",
108 |        "      <td>29690</td>\n",
109 |        "      <td>193.410904</td>\n",
110 |        "      <td>0.778113</td>\n",
111 |        "      <td>0.989559</td>\n",
112 |        "      <td>0.947849</td>\n",
113 |        "      <td>0.908774</td>\n",
114 |        "      <td>0.007244</td>\n",
115 |        "      <td>0.003048</td>\n",
116 |        "      <td>0.825871</td>\n",
117 |        "      <td>0.999066</td>\n",
118 |        "      <td>SEKER</td>\n",
119 |        "    </tr>\n",
120 |        "    <tr>\n",
121 |        "      <th>3</th>\n",
122 |        "      <td>30008</td>\n",
123 |        "      <td>645.884</td>\n",
124 |        "      <td>210.557999</td>\n",
125 |        "      <td>182.516516</td>\n",
126 |        "      <td>1.153638</td>\n",
127 |        "      <td>0.498616</td>\n",
128 |        "      <td>30724</td>\n",
129 |        "      <td>195.467062</td>\n",
130 |        "      <td>0.782681</td>\n",
131 |        "      <td>0.976696</td>\n",
132 |        "      <td>0.903936</td>\n",
133 |        "      <td>0.928329</td>\n",
134 |        "      <td>0.007017</td>\n",
135 |        "      <td>0.003215</td>\n",
136 |        "      <td>0.861794</td>\n",
137 |        "      <td>0.994199</td>\n",
138 |        "      <td>SEKER</td>\n",
139 |        "    </tr>\n",
140 |        "    <tr>\n",
141 |        "      <th>4</th>\n",
142 |        "      <td>30140</td>\n",
143 |        "      <td>620.134</td>\n",
144 |        "      <td>201.847882</td>\n",
145 |        "      <td>190.279279</td>\n",
146 |        "      <td>1.060798</td>\n",
147 |        "      <td>0.333680</td>\n",
148 |        "      <td>30417</td>\n",
149 |        "      <td>195.896503</td>\n",
150 |        "      <td>0.773098</td>\n",
151 |        "      <td>0.990893</td>\n",
152 |        "      <td>0.984877</td>\n",
153 |        "      <td>0.970516</td>\n",
154 |        "      <td>0.006697</td>\n",
155 |        "      <td>0.003665</td>\n",
156 |        "      <td>0.941900</td>\n",
157 |        "      <td>0.999166</td>\n",
158 |        "      <td>SEKER</td>\n",
159 |        "    </tr>\n",
160 |        "  </tbody>\n",
161 |        "</table>\n",
162 |        "</div>"
163 |       ],
164 |       "text/plain": [
165 |        "    Area  Perimeter  MajorAxisLength  MinorAxisLength  AspectRation  \\\n",
166 |        "0  28395    610.291       208.178117       173.888747      1.197191   \n",
167 |        "1  28734    638.018       200.524796       182.734419      1.097356   \n",
168 |        "2  29380    624.110       212.826130       175.931143      1.209713   \n",
169 |        "3  30008    645.884       210.557999       182.516516      1.153638   \n",
170 |        "4  30140    620.134       201.847882       190.279279      1.060798   \n",
171 |        "\n",
172 |        "   Eccentricity  ConvexArea  EquivDiameter    Extent  Solidity  roundness  \\\n",
173 |        "0      0.549812       28715     190.141097  0.763923  0.988856   0.958027   \n",
174 |        "1      0.411785       29172     191.272750  0.783968  0.984986   0.887034   \n",
175 |        "2      0.562727       29690     193.410904  0.778113  0.989559   0.947849   \n",
176 |        "3      0.498616       30724     195.467062  0.782681  0.976696   0.903936   \n",
177 |        "4      0.333680       30417     195.896503  0.773098  0.990893   0.984877   \n",
178 |        "\n",
179 |        "   Compactness  ShapeFactor1  ShapeFactor2  ShapeFactor3  ShapeFactor4  Class  \n",
180 |        "0     0.913358      0.007332      0.003147      0.834222      0.998724  SEKER  \n",
181 |        "1     0.953861      0.006979      0.003564      0.909851      0.998430  SEKER  \n",
182 |        "2     0.908774      0.007244      0.003048      0.825871      0.999066  SEKER  \n",
183 |        "3     0.928329      0.007017      0.003215      0.861794      0.994199  SEKER  \n",
184 |        "4     0.970516      0.006697      0.003665      0.941900      0.999166  SEKER  "
185 |       ]
186 |      },
187 |      "execution_count": 2,
188 |      "metadata": {},
189 |      "output_type": "execute_result"
190 |     }
191 |    ],
192 |    "source": [
193 |     "df = pd.read_excel('DryBeanDataset/Dry_Bean_Dataset.xlsx')\n",
194 |     "df.head()"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": 3,
200 |    "id": "207230c3-a272-4ed0-8653-0709726adb40",
201 |    "metadata": {},
202 |    "outputs": [],
203 |    "source": [
204 |     "X = df.iloc[:, :-1].values\n",
205 |     "y_str = df['Class'].values"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": 4,
211 |    "id": "7fa70fe3-6cb5-4514-852a-75c0b05e4317",
212 |    "metadata": {},
213 |    "outputs": [
214 |     {
215 |      "data": {
216 |       "text/plain": [
217 |        "array(['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA'],\n",
218 |        "      dtype=object)"
219 |       ]
220 |      },
221 |      "execution_count": 4,
222 |      "metadata": {},
223 |      "output_type": "execute_result"
224 |     }
225 |    ],
226 |    "source": [
227 |     "import numpy as np\n",
228 |     "\n",
229 |     "\n",
230 |     "np.unique(y_str)"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": 5,
236 |    "id": "b342a132-4599-4bf9-9bce-471bc6dffe86",
237 |    "metadata": {},
238 |    "outputs": [],
239 |    "source": [
240 |     "from sklearn.preprocessing import LabelEncoder\n",
241 |     "\n",
242 |     "\n",
243 |     "le = LabelEncoder()\n",
244 |     "y = le.fit_transform(y_str)"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": 6,
250 |    "id": "7379cf6e-48b7-4d66-942f-fa4fbba1387f",
251 |    "metadata": {},
252 |    "outputs": [],
253 |    "source": [
254 |     "from sklearn.model_selection import train_test_split\n",
255 |     "\n",
256 |     "\n",
257 |     "X_train, X_test, y_train, y_test = train_test_split(\n",
258 |     "        X, y, test_size=0.33, random_state=42, stratify=y)"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "code",
263 |    "execution_count": 7,
264 |    "id": "579336a6-bf57-4901-9607-6b785942129f",
265 |    "metadata": {},
266 |    "outputs": [],
267 |    "source": [
268 |     "np.savetxt(\"X_train.csv\", X_train, delimiter=\",\")\n",
269 |     "np.savetxt(\"y_train.csv\", y_train, delimiter=\",\")\n",
270 |     "np.savetxt(\"X_test.csv\", X_test, delimiter=\",\")\n",
271 |     "np.savetxt(\"y_test.csv\", y_test, delimiter=\",\")"
272 |    ]
273 |   },
274 |   {
275 |    "cell_type": "code",
276 |    "execution_count": null,
277 |    "id": "91c99a46-6941-4ae6-bbfc-385d7c332c9d",
278 |    "metadata": {},
279 |    "outputs": [],
280 |    "source": []
281 |   }
282 |  ],
283 |  "metadata": {
284 |   "kernelspec": {
285 |    "display_name": "Python 3 (ipykernel)",
286 |    "language": "python",
287 |    "name": "python3"
288 |   },
289 |   "language_info": {
290 |    "codemirror_mode": {
291 |     "name": "ipython",
292 |     "version": 3
293 |    },
294 |    "file_extension": ".py",
295 |    "mimetype": "text/x-python",
296 |    "name": "python",
297 |    "nbconvert_exporter": "python",
298 |    "pygments_lexer": "ipython3",
299 |    "version": "3.9.6"
300 |   }
301 |  },
302 |  "nbformat": 4,
303 |  "nbformat_minor": 5
304 | }
305 | 


--------------------------------------------------------------------------------
/11-nested-cross-validation/1_nested-cv_compact.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# L11: Model Evaluation 4 -- Algorithm Comparison (Nested Cross-Validation)"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "\n",
 15 |     "\n",
 16 |     "## -- Compact version"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "This notebook illustrates how to implement nested cross-validation in scikit-learn. This notebook is a more compact version of the other notebooks [./2_nested-cv_verbose1.ipynb](./2_nested-cv_verbose1.ipynb) and [./3_nested-cv_verbose2.ipynb](./3nested-cv_verbose2.ipynb).\n",
 24 |     "\n",
 25 |     "Note that due to using `cross_val_score`, we cannot see the best settings for all the outer training folds here. \n",
 26 |     "\n",
 27 |     "<img src=\"nested-cv-image.png\" width=400>"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 1,
 33 |    "metadata": {},
 34 |    "outputs": [
 35 |     {
 36 |      "name": "stdout",
 37 |      "output_type": "stream",
 38 |      "text": [
 39 |       "Author: Sebastian Raschka\n",
 40 |       "\n",
 41 |       "Python implementation: CPython\n",
 42 |       "Python version       : 3.9.6\n",
 43 |       "IPython version      : 7.29.0\n",
 44 |       "\n",
 45 |       "sklearn: 1.0\n",
 46 |       "mlxtend: 0.20.0dev\n",
 47 |       "\n"
 48 |      ]
 49 |     }
 50 |    ],
 51 |    "source": [
 52 |     "%load_ext watermark\n",
 53 |     "%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 2,
 59 |    "metadata": {},
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "import numpy as np\n",
 63 |     "from sklearn.model_selection import GridSearchCV\n",
 64 |     "from sklearn.model_selection import train_test_split\n",
 65 |     "from sklearn.model_selection import StratifiedKFold\n",
 66 |     "from sklearn.model_selection import cross_val_score\n",
 67 |     "from sklearn.pipeline import Pipeline\n",
 68 |     "from sklearn.preprocessing import StandardScaler\n",
 69 |     "from sklearn.linear_model import LogisticRegression\n",
 70 |     "from sklearn.neighbors import KNeighborsClassifier\n",
 71 |     "from sklearn.tree import DecisionTreeClassifier\n",
 72 |     "from sklearn.ensemble import RandomForestClassifier\n",
 73 |     "from sklearn.svm import SVC\n",
 74 |     "from mlxtend.data import mnist_data\n",
 75 |     "from sklearn.metrics import accuracy_score\n",
 76 |     "\n",
 77 |     "# Loading and splitting the dataset\n",
 78 |     "# Note that this is a small (stratified) subset\n",
 79 |     "# of MNIST; it consists of 5000 samples only, that is,\n",
 80 |     "# 10% of the original MNIST dataset\n",
 81 |     "# http://yann.lecun.com/exdb/mnist/\n",
 82 |     "X, y = mnist_data()\n",
 83 |     "X = X.astype(np.float32)\n",
 84 |     "X_train, X_test, y_train, y_test = train_test_split(X, y,\n",
 85 |     "                                                    test_size=0.2,\n",
 86 |     "                                                    random_state=1,\n",
 87 |     "                                                    stratify=y)\n",
 88 |     "\n",
 89 |     "# Initializing Classifiers\n",
 90 |     "#clf1 = LogisticRegression(random_state=1)\n",
 91 |     "clf2 = KNeighborsClassifier()\n",
 92 |     "clf3 = DecisionTreeClassifier(random_state=1)\n",
 93 |     "#clf4 = SVC(random_state=1)\n",
 94 |     "clf5 = RandomForestClassifier(random_state=1)\n",
 95 |     "\n",
 96 |     "# Building the pipelines\n",
 97 |     "#pipe1 = Pipeline([('std', StandardScaler()),\n",
 98 |     "#                  ('clf1', clf1)])\n",
 99 |     "\n",
100 |     "pipe2 = Pipeline([('std', StandardScaler()),\n",
101 |     "                  ('clf2', clf2)])\n",
102 |     "\n",
103 |     "#pipe4 = Pipeline([('std', StandardScaler()),\n",
104 |     "#                  ('clf4', clf4)])\n",
105 |     "\n",
106 |     "\n",
107 |     "# Setting up the parameter grids\n",
108 |     "#param_grid1 = [{'clf1__penalty': ['l2'],\n",
109 |     "#                'clf1__C': np.power(10., np.arange(-4, 4))}]\n",
110 |     "\n",
111 |     "param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),\n",
112 |     "                'clf2__p': [1, 2]}]\n",
113 |     "\n",
114 |     "param_grid3 = [{'max_depth': list(range(1, 10)) + [None],\n",
115 |     "                'criterion': ['gini', 'entropy']}]\n",
116 |     "\n",
117 |     "#param_grid4 = [{'clf4__kernel': ['rbf'],\n",
118 |     "#                'clf4__C': np.power(10., np.arange(-4, 4)),\n",
119 |     "#                'clf4__gamma': np.power(10., np.arange(-5, 0))},\n",
120 |     "#               {'clf4__kernel': ['linear'],\n",
121 |     "#                'clf4__C': np.power(10., np.arange(-4, 4))}]\n",
122 |     "\n",
123 |     "param_grid5 = [{'n_estimators': [10, 100, 500, 1000, 10000]}]"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": 3,
129 |    "metadata": {},
130 |    "outputs": [],
131 |    "source": [
132 |     "# Setting up multiple GridSearchCV objects, 1 for each algorithm\n",
133 |     "gridcvs = {}\n",
134 |     "inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)\n",
135 |     "\n",
136 |     "for pgrid, est, name in zip((param_grid2,\n",
137 |     "                             param_grid3, param_grid5),\n",
138 |     "                            (pipe2, clf3, clf5),\n",
139 |     "                            ('KNN', 'DTree', 'RForest')):\n",
140 |     "    gcv = GridSearchCV(estimator=est,\n",
141 |     "                       param_grid=pgrid,\n",
142 |     "                       scoring='accuracy',\n",
143 |     "                       n_jobs=1, # be careful to only set one n_jobs to -1\n",
144 |     "                       cv=inner_cv,\n",
145 |     "                       verbose=0,\n",
146 |     "                       refit=True)\n",
147 |     "    gridcvs[name] = gcv"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": 4,
153 |    "metadata": {},
154 |    "outputs": [
155 |     {
156 |      "name": "stdout",
157 |      "output_type": "stream",
158 |      "text": [
159 |       "DTree   | outer ACC 76.75% +/- 1.32\n",
160 |       "KNN     | outer ACC 91.10% +/- 0.58\n",
161 |       "RForest | outer ACC 93.98% +/- 0.98\n"
162 |      ]
163 |     }
164 |    ],
165 |    "source": [
166 |     "outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)\n",
167 |     "\n",
168 |     "\n",
169 |     "for name, gs_est in sorted(gridcvs.items()):\n",
170 |     "    nested_score = cross_val_score(gs_est, \n",
171 |     "                                   X=X_train, \n",
172 |     "                                   y=y_train, \n",
173 |     "                                   cv=outer_cv,\n",
174 |     "                                   n_jobs=1) # be careful to only set one n_jobs to -1\n",
175 |     "    print(f'{name:<7} | outer ACC {100*nested_score.mean():.2f}% +/- {100*nested_score.std():.2f}')"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "markdown",
180 |    "metadata": {},
181 |    "source": [
182 |     "------"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "markdown",
187 |    "metadata": {},
188 |    "source": [
189 |     "- Determine the best algorithm from the experiment above; e.g., we find that Random Forest is performing best\n",
190 |     "- Now, select a hyperparameters for the model based on regular k-fold on the whole training set"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": 5,
196 |    "metadata": {},
197 |    "outputs": [
198 |     {
199 |      "name": "stdout",
200 |      "output_type": "stream",
201 |      "text": [
202 |       "Fitting 2 folds for each of 5 candidates, totalling 10 fits\n"
203 |      ]
204 |     },
205 |     {
206 |      "data": {
207 |       "text/plain": [
208 |        "GridSearchCV(cv=StratifiedKFold(n_splits=2, random_state=1, shuffle=True),\n",
209 |        "             estimator=RandomForestClassifier(random_state=1), n_jobs=-1,\n",
210 |        "             param_grid=[{'n_estimators': [10, 100, 500, 1000, 10000]}],\n",
211 |        "             scoring='accuracy', verbose=1)"
212 |       ]
213 |      },
214 |      "execution_count": 5,
215 |      "metadata": {},
216 |      "output_type": "execute_result"
217 |     }
218 |    ],
219 |    "source": [
220 |     "gcv_model_select = GridSearchCV(estimator=clf5,\n",
221 |     "                                param_grid=param_grid5,\n",
222 |     "                                scoring='accuracy',\n",
223 |     "                                n_jobs=-1,\n",
224 |     "                                cv=inner_cv,\n",
225 |     "                                verbose=1,\n",
226 |     "                                refit=True)\n",
227 |     "\n",
228 |     "gcv_model_select.fit(X_train, y_train)"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": 6,
234 |    "metadata": {},
235 |    "outputs": [
236 |     {
237 |      "name": "stdout",
238 |      "output_type": "stream",
239 |      "text": [
240 |       "Nested CV Accuracy 93.30% (average over k-fold CV test folds)\n",
241 |       "Best Parameters: {'n_estimators': 10000}\n",
242 |       "Training Accuracy 100.00%\n",
243 |       "Test Accuracy 94.00%\n"
244 |      ]
245 |     }
246 |    ],
247 |    "source": [
248 |     "best_model = gcv_model_select.best_estimator_\n",
249 |     "\n",
250 |     "\n",
251 |     "## We can skip the next step because we set refit=True\n",
252 |     "## so scikit-learn has already fit the model to the\n",
253 |     "## whole training set\n",
254 |     "\n",
255 |     "# best_model.fit(X_train, y_train)\n",
256 |     "\n",
257 |     "\n",
258 |     "train_acc = accuracy_score(y_true=y_train, y_pred=best_model.predict(X_train))\n",
259 |     "test_acc = accuracy_score(y_true=y_test, y_pred=best_model.predict(X_test))\n",
260 |     "\n",
261 |     "print(f'Nested CV Accuracy {100 * gcv_model_select.best_score_:.2f}% (average over k-fold CV test folds)')\n",
262 |     "print(f'Best Parameters: {gcv_model_select.best_params_}')\n",
263 |     "\n",
264 |     "print(f'Training Accuracy {100 * train_acc:.2f}%')\n",
265 |     "print(f'Test Accuracy {100 * test_acc:.2f}%')"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "code",
270 |    "execution_count": null,
271 |    "metadata": {},
272 |    "outputs": [],
273 |    "source": []
274 |   }
275 |  ],
276 |  "metadata": {
277 |   "anaconda-cloud": {},
278 |   "kernelspec": {
279 |    "display_name": "Python 3 (ipykernel)",
280 |    "language": "python",
281 |    "name": "python3"
282 |   },
283 |   "language_info": {
284 |    "codemirror_mode": {
285 |     "name": "ipython",
286 |     "version": 3
287 |    },
288 |    "file_extension": ".py",
289 |    "mimetype": "text/x-python",
290 |    "name": "python",
291 |    "nbconvert_exporter": "python",
292 |    "pygments_lexer": "ipython3",
293 |    "version": "3.9.6"
294 |   }
295 |  },
296 |  "nbformat": 4,
297 |  "nbformat_minor": 4
298 | }
299 | 


--------------------------------------------------------------------------------
/05-dataprocessing/code/5-1_reading-data.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "a0fa450b-13b2-4c32-9dd1-ae60a44e8225",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "STAT 451: Machine Learning (Fall 2021)  \n",
  9 |     "Instructor: Sebastian Raschka (sraschka@wisc.edu)  "
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "id": "46d2da68-18e9-4c45-9158-2cdeb29a6981",
 15 |    "metadata": {
 16 |     "tags": []
 17 |    },
 18 |    "source": [
 19 |     "# L05 - Data Preprocessing and Machine Learning with Scikit-Learn"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "markdown",
 24 |    "id": "f73773f2-50b1-4362-bc32-950abd134842",
 25 |    "metadata": {},
 26 |    "source": [
 27 |     "# 5.1 Reading a Dataset from a Tabular Text File"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 1,
 33 |    "id": "91bdac45-ffd1-4610-9af7-5d7a6487b704",
 34 |    "metadata": {},
 35 |    "outputs": [
 36 |     {
 37 |      "name": "stdout",
 38 |      "output_type": "stream",
 39 |      "text": [
 40 |       "Author: Sebastian Raschka\n",
 41 |       "\n",
 42 |       "Python implementation: CPython\n",
 43 |       "Python version       : 3.9.6\n",
 44 |       "IPython version      : 7.27.0\n",
 45 |       "\n",
 46 |       "pandas: 1.3.2\n",
 47 |       "\n"
 48 |      ]
 49 |     }
 50 |    ],
 51 |    "source": [
 52 |     "%load_ext watermark\n",
 53 |     "%watermark -v -a 'Sebastian Raschka' -p pandas"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "markdown",
 58 |    "id": "f6573b7c-84a8-41b5-874b-ad167d8d8a91",
 59 |    "metadata": {},
 60 |    "source": [
 61 |     "## Overview"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "markdown",
 66 |    "id": "fc3c6665-3697-4e70-bc8e-664da84e36bd",
 67 |    "metadata": {},
 68 |    "source": [
 69 |     "In this lecture, we are closing the \"Computational Foundation\" section by introducing yet another Python library, pandas, which is extremely handy for data (pre)processing. The second focus of this lecture is on the [Scikit-learn](http://scikit-learn.org) machine learning library, which is widely considered as the most mature and most well-designed general machine learning library."
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "id": "58c02f5c-d0df-4427-ab59-9755b5d3f569",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "## Pandas -- A Python Library for Working with Data Frames"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "markdown",
 82 |    "id": "266c5db8-fcce-4886-9294-19c9176395ca",
 83 |    "metadata": {},
 84 |    "source": [
 85 |     "- Pandas is probably the most popular and convenient data wrangling library for Python (official website: https://pandas.pydata.org) \n",
 86 |     "- Pandas stands for PANel-DAta-S.\n",
 87 |     "- Relativ similar to data frames in R.\n",
 88 |     "- How is it different from NumPy arrays? \n",
 89 |     "    - Allows for heterogenous data (columns can have different data types)\n",
 90 |     "    - Adds some more convenient functions on top that are handy for data processing"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "markdown",
 95 |    "id": "b12f3f67-f528-4ef9-9113-1125d83a5545",
 96 |    "metadata": {},
 97 |    "source": [
 98 |     "### Loading Tabular Datasets from Text Files"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "markdown",
103 |    "id": "ab8814b0-6cf1-4a74-9f05-8feb02eed31f",
104 |    "metadata": {},
105 |    "source": [
106 |     "- Here, we are working with structured data, data which is organized similar to a \"design matrix\" (see lecture 1) -- that is, examples as rows and features as columns (in contrast: unstructured data such as text or images, etc.).\n",
107 |     "- CSV stands for \"comma separated values\" (also common: TSV, tab seperated values).\n",
108 |     "- The `head` command is a Linux/Unix command that shows the first 10 rows by default; the `!` denotes that Jupyter/the IPython kernel should execute it as a shell command (`!`-commands may not work if you are on Windows, but it is not really important)."
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 2,
114 |    "id": "301dfef8-3fd5-4d32-9ac2-51428910cd62",
115 |    "metadata": {},
116 |    "outputs": [
117 |     {
118 |      "name": "stdout",
119 |      "output_type": "stream",
120 |      "text": [
121 |       "Id,SepalLength[cm],SepalWidth[cm],PetalLength[cm],PetalWidth[cm],Species\n",
122 |       "1,5.1,3.5,1.4,0.2,Iris-setosa\n",
123 |       "2,4.9,3.0,1.4,0.2,Iris-setosa\n",
124 |       "3,4.7,3.2,1.3,0.2,Iris-setosa\n",
125 |       "4,4.6,3.1,1.5,0.2,Iris-setosa\n",
126 |       "5,5.0,3.6,1.4,0.2,Iris-setosa\n",
127 |       "6,5.4,3.9,1.7,0.4,Iris-setosa\n",
128 |       "7,4.6,3.4,1.4,0.3,Iris-setosa\n",
129 |       "8,5.0,3.4,1.5,0.2,Iris-setosa\n",
130 |       "9,4.4,2.9,1.4,0.2,Iris-setosa\n"
131 |      ]
132 |     }
133 |    ],
134 |    "source": [
135 |     "!head data/iris.csv"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "markdown",
140 |    "id": "7b2c22eb-010f-4768-88e0-262d855c3e4d",
141 |    "metadata": {},
142 |    "source": [
143 |     "- We use the `read_csv` command to load the CSV file into a pandas data frame object f of the class `DataFrame`.\n",
144 |     "- Data frames also have a `head` command; here it shows the first 5 rows."
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": 3,
150 |    "id": "36000139-3942-4f60-b171-d232e52f66d3",
151 |    "metadata": {},
152 |    "outputs": [
153 |     {
154 |      "data": {
155 |       "text/html": [
156 |        "<div>\n",
157 |        "<style scoped>\n",
158 |        "    .dataframe tbody tr th:only-of-type {\n",
159 |        "        vertical-align: middle;\n",
160 |        "    }\n",
161 |        "\n",
162 |        "    .dataframe tbody tr th {\n",
163 |        "        vertical-align: top;\n",
164 |        "    }\n",
165 |        "\n",
166 |        "    .dataframe thead th {\n",
167 |        "        text-align: right;\n",
168 |        "    }\n",
169 |        "</style>\n",
170 |        "<table border=\"1\" class=\"dataframe\">\n",
171 |        "  <thead>\n",
172 |        "    <tr style=\"text-align: right;\">\n",
173 |        "      <th></th>\n",
174 |        "      <th>Id</th>\n",
175 |        "      <th>SepalLength[cm]</th>\n",
176 |        "      <th>SepalWidth[cm]</th>\n",
177 |        "      <th>PetalLength[cm]</th>\n",
178 |        "      <th>PetalWidth[cm]</th>\n",
179 |        "      <th>Species</th>\n",
180 |        "    </tr>\n",
181 |        "  </thead>\n",
182 |        "  <tbody>\n",
183 |        "    <tr>\n",
184 |        "      <th>0</th>\n",
185 |        "      <td>1</td>\n",
186 |        "      <td>5.1</td>\n",
187 |        "      <td>3.5</td>\n",
188 |        "      <td>1.4</td>\n",
189 |        "      <td>0.2</td>\n",
190 |        "      <td>Iris-setosa</td>\n",
191 |        "    </tr>\n",
192 |        "    <tr>\n",
193 |        "      <th>1</th>\n",
194 |        "      <td>2</td>\n",
195 |        "      <td>4.9</td>\n",
196 |        "      <td>3.0</td>\n",
197 |        "      <td>1.4</td>\n",
198 |        "      <td>0.2</td>\n",
199 |        "      <td>Iris-setosa</td>\n",
200 |        "    </tr>\n",
201 |        "    <tr>\n",
202 |        "      <th>2</th>\n",
203 |        "      <td>3</td>\n",
204 |        "      <td>4.7</td>\n",
205 |        "      <td>3.2</td>\n",
206 |        "      <td>1.3</td>\n",
207 |        "      <td>0.2</td>\n",
208 |        "      <td>Iris-setosa</td>\n",
209 |        "    </tr>\n",
210 |        "    <tr>\n",
211 |        "      <th>3</th>\n",
212 |        "      <td>4</td>\n",
213 |        "      <td>4.6</td>\n",
214 |        "      <td>3.1</td>\n",
215 |        "      <td>1.5</td>\n",
216 |        "      <td>0.2</td>\n",
217 |        "      <td>Iris-setosa</td>\n",
218 |        "    </tr>\n",
219 |        "    <tr>\n",
220 |        "      <th>4</th>\n",
221 |        "      <td>5</td>\n",
222 |        "      <td>5.0</td>\n",
223 |        "      <td>3.6</td>\n",
224 |        "      <td>1.4</td>\n",
225 |        "      <td>0.2</td>\n",
226 |        "      <td>Iris-setosa</td>\n",
227 |        "    </tr>\n",
228 |        "  </tbody>\n",
229 |        "</table>\n",
230 |        "</div>"
231 |       ],
232 |       "text/plain": [
233 |        "   Id  SepalLength[cm]  SepalWidth[cm]  PetalLength[cm]  PetalWidth[cm]  \\\n",
234 |        "0   1              5.1             3.5              1.4             0.2   \n",
235 |        "1   2              4.9             3.0              1.4             0.2   \n",
236 |        "2   3              4.7             3.2              1.3             0.2   \n",
237 |        "3   4              4.6             3.1              1.5             0.2   \n",
238 |        "4   5              5.0             3.6              1.4             0.2   \n",
239 |        "\n",
240 |        "       Species  \n",
241 |        "0  Iris-setosa  \n",
242 |        "1  Iris-setosa  \n",
243 |        "2  Iris-setosa  \n",
244 |        "3  Iris-setosa  \n",
245 |        "4  Iris-setosa  "
246 |       ]
247 |      },
248 |      "execution_count": 3,
249 |      "metadata": {},
250 |      "output_type": "execute_result"
251 |     }
252 |    ],
253 |    "source": [
254 |     "import pandas as pd\n",
255 |     "\n",
256 |     "\n",
257 |     "df = pd.read_csv('data/iris.csv')\n",
258 |     "df.head()"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "code",
263 |    "execution_count": 4,
264 |    "id": "842e84a7-b62c-45a5-84b9-c0df11aae6e3",
265 |    "metadata": {},
266 |    "outputs": [
267 |     {
268 |      "data": {
269 |       "text/plain": [
270 |        "pandas.core.frame.DataFrame"
271 |       ]
272 |      },
273 |      "execution_count": 4,
274 |      "metadata": {},
275 |      "output_type": "execute_result"
276 |     }
277 |    ],
278 |    "source": [
279 |     "type(df)"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "markdown",
284 |    "id": "7b281c57-dcbd-4ffe-a1a4-d8e7228db90b",
285 |    "metadata": {},
286 |    "source": [
287 |     "- It is always good to double check the dimensions and see if they are what we expect. \n",
288 |     "- The `DataFrame` `shape` attribute works the same way as the NumPy array `shape` attribute (Lecture 04)."
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": 5,
294 |    "id": "a6440e8c-315b-416d-b36b-63b32842a68d",
295 |    "metadata": {},
296 |    "outputs": [
297 |     {
298 |      "data": {
299 |       "text/plain": [
300 |        "(150, 6)"
301 |       ]
302 |      },
303 |      "execution_count": 5,
304 |      "metadata": {},
305 |      "output_type": "execute_result"
306 |     }
307 |    ],
308 |    "source": [
309 |     "df.shape"
310 |    ]
311 |   }
312 |  ],
313 |  "metadata": {
314 |   "kernelspec": {
315 |    "display_name": "Python 3.9.2 64-bit ('base': conda)",
316 |    "language": "python",
317 |    "name": "python392jvsc74a57bd0249cfc85c6a0073df6bca89c83e3180d730f84f7e1f446fbe710b75104ecfa4f"
318 |   },
319 |   "language_info": {
320 |    "codemirror_mode": {
321 |     "name": "ipython",
322 |     "version": 3
323 |    },
324 |    "file_extension": ".py",
325 |    "mimetype": "text/x-python",
326 |    "name": "python",
327 |    "nbconvert_exporter": "python",
328 |    "pygments_lexer": "ipython3",
329 |    "version": "3.9.6"
330 |   }
331 |  },
332 |  "nbformat": 4,
333 |  "nbformat_minor": 5
334 | }
335 | 


--------------------------------------------------------------------------------
/from-scratch-coding-exercises/bagging-from-scratch.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Optional Coding Exercise \n",
  8 |     "\n",
  9 |     "## -- Implementing a Bagging Algorithm from Scratch"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [
 17 |     {
 18 |      "name": "stdout",
 19 |      "output_type": "stream",
 20 |      "text": [
 21 |       "Author: Sebastian Raschka\n",
 22 |       "\n",
 23 |       "Last updated: 2021-12-08\n",
 24 |       "\n",
 25 |       "Python implementation: CPython\n",
 26 |       "Python version       : 3.9.6\n",
 27 |       "IPython version      : 7.29.0\n",
 28 |       "\n",
 29 |       "numpy     : 1.21.2\n",
 30 |       "scipy     : 1.7.0\n",
 31 |       "matplotlib: 3.4.3\n",
 32 |       "sklearn   : 1.0\n",
 33 |       "\n"
 34 |      ]
 35 |     }
 36 |    ],
 37 |    "source": [
 38 |     "%load_ext watermark\n",
 39 |     "%watermark  -d -u -a 'Sebastian Raschka' -v -p numpy,scipy,matplotlib,sklearn"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 2,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "import numpy as np"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "markdown",
 53 |    "metadata": {},
 54 |    "source": [
 55 |     "<br>\n",
 56 |     "<br>\n",
 57 |     "<br>\n",
 58 |     "<br>\n",
 59 |     "<br>\n",
 60 |     "<br>"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "## 2) Bagging"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "markdown",
 72 |    "metadata": {},
 73 |    "source": [
 74 |     "In this coding exercise, you will be combining multiple decision trees to a bagging classifier. This time, we will be using the decision tree algorithm implemented in scikit-learn (which is some variant of the CART algorithm for binary splits, as implemented earlier and discussed in class)."
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "markdown",
 79 |    "metadata": {},
 80 |    "source": [
 81 |     "### 2.1 Bootrapping"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "markdown",
 86 |    "metadata": {},
 87 |    "source": [
 88 |     "As you remember, bagging relies on bootstrap sampling. So, as a first step, your task is to implement a function for generating bootstrap samples. In this exercise, for simplicity, we will perform the computations based on the Iris dataset.\n",
 89 |     "\n",
 90 |     "On an interesting side note, scikit-learn recently updated their version of the Iris dataset since it was discovered that the Iris version hosted on the UCI machine learning repository (https://archive.ics.uci.edu/ml/datasets/Iris/) has two data points that are different from R. Fisher's original paper (Fisher,R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).) and changed it in their most recent version. Since most students may not have the latest scikit-learn version installed, we will be working with the Iris dataset that is deposited on UCI, which has become quite the standard in the Python machine learning community for benchmarking algorithms. Instead of manually downloading it, we will be fetching it through the `mlxtend` (http://rasbt.github.io/mlxtend/) library that you installed in the last homework."
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": 17,
 96 |    "metadata": {},
 97 |    "outputs": [
 98 |     {
 99 |      "name": "stdout",
100 |      "output_type": "stream",
101 |      "text": [
102 |       "Number of examples: 150\n",
103 |       "Number of features: 4\n",
104 |       "Unique class labels: [0 1 2]\n"
105 |      ]
106 |     }
107 |    ],
108 |    "source": [
109 |     "# DO NOT EDIT OR DELETE THIS CELL\n",
110 |     "\n",
111 |     "from mlxtend.data import iris_data\n",
112 |     "X, y = iris_data()\n",
113 |     "\n",
114 |     "print('Number of examples:', X.shape[0])\n",
115 |     "print('Number of features:', X.shape[1])\n",
116 |     "print('Unique class labels:', np.unique(y))"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "markdown",
121 |    "metadata": {},
122 |    "source": [
123 |     "Use scikit-learn's `train_test_split` function to divide the dataset into a training and a test set.\n",
124 |     "\n",
125 |     "- The test set should contain 45 examples, and the training set should contain 105 examples.\n",
126 |     "- To ensure reproducible results, use `123` as a random seed.\n",
127 |     "- Perform a stratified split."
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": null,
133 |    "metadata": {},
134 |    "outputs": [],
135 |    "source": [
136 |     "# EDIT THIS CELL\n",
137 |     "\n",
138 |     "\n",
139 |     "from sklearn.model_selection import train_test_split\n",
140 |     "\n",
141 |     "\n",
142 |     "X_train, X_test, y_train, y_test = train_test_split(X, y,\n",
143 |     "                                                    # YOUR CODE\n",
144 |     "                                                   )\n",
145 |     "\n",
146 |     "print('Number of training examples:', X_train.shape[0])\n",
147 |     "print('Number of test examples:', X_test.shape[0])"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "markdown",
152 |    "metadata": {},
153 |    "source": [
154 |     "Next we are implementing a function to generate bootstrap samples of the training set. In particular, we will perform the bootstrapping as follows:\n",
155 |     "\n",
156 |     "- Create an index array with values 0, ..., 104.\n",
157 |     "- Draw a random sample (with replacement) from this index array using the `choice` method of a NumPy `RandomState` object that is passed to the function as `rng`. \n",
158 |     "- Select training examples from the X array and labels from the y array using the new sample of indices."
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": null,
164 |    "metadata": {},
165 |    "outputs": [],
166 |    "source": [
167 |     "# EDIT THIS CELL\n",
168 |     "\n",
169 |     "def draw_bootstrap_sample(rng, X, y):\n",
170 |     "    sample_indices = np.arange(X.shape[0])\n",
171 |     "    bootstrap_indices = rng.choice(\n",
172 |     "                                    # YOUR CODE\n",
173 |     "                                    )\n",
174 |     "    return X[bootstrap_indices], y[bootstrap_indices]"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "markdown",
179 |    "metadata": {},
180 |    "source": [
181 |     "I added the following code cell for your convenience to double-check your solution. If your results don't match the results shown below, there is a bug in your implementation of the `draw_bootstrap_sample` function."
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": 20,
187 |    "metadata": {},
188 |    "outputs": [
189 |     {
190 |      "name": "stdout",
191 |      "output_type": "stream",
192 |      "text": [
193 |       "Number of training inputs from bootstrap round: 105\n",
194 |       "Number of training labels from bootstrap round: 105\n",
195 |       "Labels:\n",
196 |       " [0 0 1 0 0 1 2 0 2 1 0 0 2 1 1 1 1 2 1 1 2 0 2 1 2 1 1 1 0 1 0 0 1 2 0 0 0\n",
197 |       " 0 2 1 1 2 1 2 1 1 2 1 2 0 1 1 2 2 1 0 1 0 2 2 0 1 0 2 0 0 0 0 1 2 0 0 1 0\n",
198 |       " 1 1 0 1 1 2 2 0 2 0 2 0 1 1 2 2 0 2 2 2 0 1 0 1 2 2 2 1 0 0 0]\n"
199 |      ]
200 |     }
201 |    ],
202 |    "source": [
203 |     "# DO NOT EDIT OR DELETE THIS CELL\n",
204 |     "\n",
205 |     "rng = np.random.RandomState(123)\n",
206 |     "X_boot, y_boot = draw_bootstrap_sample(rng, X_train, y_train)\n",
207 |     "\n",
208 |     "print('Number of training inputs from bootstrap round:', X_boot.shape[0])\n",
209 |     "print('Number of training labels from bootstrap round:', y_boot.shape[0])\n",
210 |     "print('Labels:\\n', y_boot)"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "markdown",
215 |    "metadata": {},
216 |    "source": [
217 |     "### 2.2 Bagging classifier from decision trees (4 pts)"
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "markdown",
222 |    "metadata": {},
223 |    "source": [
224 |     "In this section, you will implement a Bagging algorithm based on the `DecisionTreeClassifier`. I provided a partial solution for you. "
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": null,
230 |    "metadata": {},
231 |    "outputs": [],
232 |    "source": [
233 |     "# EDIT THIS CELL\n",
234 |     "\n",
235 |     "\n",
236 |     "from sklearn.tree import DecisionTreeClassifier\n",
237 |     "\n",
238 |     "\n",
239 |     "class BaggingClassifier(object):\n",
240 |     "    \n",
241 |     "    def __init__(self, num_trees=10, random_state=123):\n",
242 |     "        self.num_trees = num_trees\n",
243 |     "        self.rng = np.random.RandomState(random_state)\n",
244 |     "        \n",
245 |     "        \n",
246 |     "    def fit(self, X, y):\n",
247 |     "        self.trees_ = [DecisionTreeClassifier(random_state=self.rng) for i in range(self.num_trees)]\n",
248 |     "        for i in range(self.num_trees):\n",
249 |     "            X_boot, y_boot = # YOUR CODE to draw a bootstrap sample\n",
250 |     "            # YOUR CODE to\n",
251 |     "            # fit the trees in self.trees_ on the bootstrap samples\n",
252 |     "        \n",
253 |     "    def predict(self, X):\n",
254 |     "        ary = np.zeros((X.shape[0], len(self.trees_)), dtype=np.int64)\n",
255 |     "        for i in range(len(self.trees_)):\n",
256 |     "            ary[:, i] = self.trees_[i].predict(X)\n",
257 |     "\n",
258 |     "        maj = np.apply_along_axis(lambda x:\n",
259 |     "                                  np.argmax(np.bincount(x)),\n",
260 |     "                                            axis=1,\n",
261 |     "                                            arr=ary)\n",
262 |     "        return maj"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "markdown",
267 |    "metadata": {},
268 |    "source": [
269 |     "I added the following code cell for your convenience to double-check your solution. If your results don't match the results shown below, there is a bug in your implementation of the `BaggingClassifier()`."
270 |    ]
271 |   },
272 |   {
273 |    "cell_type": "code",
274 |    "execution_count": 22,
275 |    "metadata": {},
276 |    "outputs": [
277 |     {
278 |      "name": "stdout",
279 |      "output_type": "stream",
280 |      "text": [
281 |       "Individual Tree Accuracies:\n",
282 |       "88.9%\n",
283 |       "93.3%\n",
284 |       "97.8%\n",
285 |       "93.3%\n",
286 |       "93.3%\n",
287 |       "93.3%\n",
288 |       "91.1%\n",
289 |       "97.8%\n",
290 |       "97.8%\n",
291 |       "97.8%\n",
292 |       "\n",
293 |       "Bagging Test Accuracy: 97.8%\n"
294 |      ]
295 |     }
296 |    ],
297 |    "source": [
298 |     "# DO NOT EDIT OR DELETE THIS CELL\n",
299 |     "\n",
300 |     "model = BaggingClassifier()\n",
301 |     "model.fit(X_train, y_train)\n",
302 |     "\n",
303 |     "predictions = model.predict(X_test)\n",
304 |     "\n",
305 |     "print('Individual Tree Accuracies:')\n",
306 |     "for tree in model.trees_:\n",
307 |     "    predictions = tree.predict(X_test) \n",
308 |     "    print('%.1f%%' % ((predictions == y_test).sum() / X_test.shape[0] * 100))\n",
309 |     "\n",
310 |     "print('\\nBagging Test Accuracy: %.1f%%' % ((predictions == y_test).sum() / X_test.shape[0] * 100))"
311 |    ]
312 |   }
313 |  ],
314 |  "metadata": {
315 |   "kernelspec": {
316 |    "display_name": "Python 3 (ipykernel)",
317 |    "language": "python",
318 |    "name": "python3"
319 |   },
320 |   "language_info": {
321 |    "codemirror_mode": {
322 |     "name": "ipython",
323 |     "version": 3
324 |    },
325 |    "file_extension": ".py",
326 |    "mimetype": "text/x-python",
327 |    "name": "python",
328 |    "nbconvert_exporter": "python",
329 |    "pygments_lexer": "ipython3",
330 |    "version": "3.9.6"
331 |   }
332 |  },
333 |  "nbformat": 4,
334 |  "nbformat_minor": 4
335 | }
336 | 


--------------------------------------------------------------------------------
/07-ensembles/code/07-03_adaboosting.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# STAT451: Machine Learning -- L07: Ensemble Methods Part 2/3"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "STAT 451: Intro to Machine Learning (Fall 2021)  \n",
 15 |     "Instructor: Sebastian Raschka (sraschka@wisc.edu)  \n"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": 1,
 21 |    "metadata": {},
 22 |    "outputs": [
 23 |     {
 24 |      "name": "stdout",
 25 |      "output_type": "stream",
 26 |      "text": [
 27 |       "scikit-learn: 1.0\n",
 28 |       "mlxtend     : 0.19.0\n",
 29 |       "\n"
 30 |      ]
 31 |     }
 32 |    ],
 33 |    "source": [
 34 |     "%load_ext watermark\n",
 35 |     "%watermark -p scikit-learn,mlxtend"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "markdown",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "# AdaBoost"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 2,
 48 |    "metadata": {},
 49 |    "outputs": [
 50 |     {
 51 |      "name": "stdout",
 52 |      "output_type": "stream",
 53 |      "text": [
 54 |       "Train/Valid/Test sizes: 318 80 171\n"
 55 |      ]
 56 |     }
 57 |    ],
 58 |    "source": [
 59 |     "from sklearn import model_selection\n",
 60 |     "from sklearn.tree import DecisionTreeClassifier\n",
 61 |     "from sklearn.model_selection import train_test_split\n",
 62 |     "from sklearn import datasets\n",
 63 |     "from sklearn.ensemble import AdaBoostClassifier\n",
 64 |     "\n",
 65 |     "data = datasets.load_breast_cancer()\n",
 66 |     "X, y = data.data, data.target\n",
 67 |     "\n",
 68 |     "X_temp, X_test, y_temp, y_test = \\\n",
 69 |     "    train_test_split(X, y, test_size=0.3, random_state=123, stratify=y)\n",
 70 |     "\n",
 71 |     "X_train, X_valid, y_train, y_valid = \\\n",
 72 |     "    train_test_split(X_temp, y_temp, test_size=0.2, random_state=123, stratify=y_temp)\n",
 73 |     "\n",
 74 |     "print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 3,
 80 |    "metadata": {},
 81 |    "outputs": [
 82 |     {
 83 |      "name": "stdout",
 84 |      "output_type": "stream",
 85 |      "text": [
 86 |       "Valid Accuracy: 0.97\n",
 87 |       "Test Accuracy: 0.98\n"
 88 |      ]
 89 |     }
 90 |    ],
 91 |    "source": [
 92 |     "tree = DecisionTreeClassifier(criterion='entropy',\n",
 93 |     "                              random_state=1,\n",
 94 |     "                              max_depth=1)\n",
 95 |     "\n",
 96 |     "\n",
 97 |     "boost = AdaBoostClassifier(base_estimator=tree,\n",
 98 |     "                           n_estimators=500,\n",
 99 |     "                           algorithm='SAMME',\n",
100 |     "                           #n_jobs=1,\n",
101 |     "                           random_state=1)\n",
102 |     "\n",
103 |     "boost.fit(X_train, y_train)\n",
104 |     "    \n",
105 |     "    \n",
106 |     "print(\"Valid Accuracy: %0.2f\" % boost.score(X_valid, y_valid))\n",
107 |     "print(\"Test Accuracy: %0.2f\" % boost.score(X_test, y_test))"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": 4,
113 |    "metadata": {},
114 |    "outputs": [
115 |     {
116 |      "data": {
117 |       "text/plain": [
118 |        "array([2.50552594, 1.92278773, 1.55587214, 1.25187013, 0.83252642,\n",
119 |        "       1.08142915, 0.47669717, 0.52052104, 0.95125412, 0.80934036,\n",
120 |        "       0.55109052, 1.05012036, 0.55221499, 0.63914069, 0.59736276,\n",
121 |        "       0.73008531, 0.90592534, 0.70073463, 0.65650764, 0.61296891,\n",
122 |        "       0.60313281, 0.25326461, 0.43373898, 0.78441489, 0.61236155,\n",
123 |        "       1.0663583 , 0.90082217, 0.74116791, 0.56945746, 0.82543076,\n",
124 |        "       0.7878422 , 0.64458085, 0.64797598, 0.87901285, 0.95523461,\n",
125 |        "       0.76467063, 0.28370611, 0.76030775, 0.21207145, 0.68804838,\n",
126 |        "       0.20617217, 0.3279255 , 0.62017215, 0.49812938, 0.51195149,\n",
127 |        "       0.7236535 , 0.9534974 , 0.5063253 , 0.59993466, 0.33345466,\n",
128 |        "       0.4770671 , 0.42432093, 0.32997923, 0.28444983, 0.66587665,\n",
129 |        "       0.85117955, 0.52725978, 0.75103776, 0.57712158, 0.72617161,\n",
130 |        "       0.68159588, 0.70714949, 0.61346881, 0.95838569, 0.49603377,\n",
131 |        "       0.65835912, 0.41575424, 0.75348008, 0.60231214, 0.75623471,\n",
132 |        "       0.878401  , 0.43486019, 0.74713028, 0.6171537 , 0.551583  ,\n",
133 |        "       0.7560625 , 0.85264749, 0.43766225, 0.80434382, 0.67149846,\n",
134 |        "       0.72092654, 1.00914516, 0.65784733, 0.72799295, 0.50237923,\n",
135 |        "       0.60017745, 0.59266233, 0.70446006, 0.7924262 , 0.87568356,\n",
136 |        "       0.77335321, 0.51540746, 0.65297512, 0.52861535, 0.56790548,\n",
137 |        "       0.68888948, 0.74751924, 0.5212341 , 0.65688859, 0.5272307 ,\n",
138 |        "       0.31189559, 0.7388376 , 0.41601534, 0.82160166, 0.83059175,\n",
139 |        "       0.43767452, 0.79215531, 0.75203738, 0.77539321, 0.81177688,\n",
140 |        "       0.63884703, 0.61604461, 0.47070834, 0.91495938, 0.73168304,\n",
141 |        "       0.50626363, 0.81990622, 0.75567365, 0.36968088, 0.64176768,\n",
142 |        "       0.4828037 , 0.55188373, 0.6089235 , 0.8216975 , 0.28359622,\n",
143 |        "       0.36553302, 0.37281069, 0.556874  , 0.61090634, 0.77409846,\n",
144 |        "       0.4858811 , 0.30036674, 0.38805685, 0.56586083, 0.82437763,\n",
145 |        "       0.46828801, 0.63366345, 0.87910602, 0.66317906, 0.27858858,\n",
146 |        "       0.36329466, 0.59641648, 0.45700963, 0.63091301, 0.77312518,\n",
147 |        "       0.70985075, 0.78478871, 0.71851454, 0.28828766, 0.30743493,\n",
148 |        "       0.63303008, 0.62926748, 0.46454714, 0.68304948, 0.38213245,\n",
149 |        "       0.54995245, 0.61497835, 0.40412321, 0.74808218, 0.31895306,\n",
150 |        "       0.2816644 , 0.47135939, 0.29163432, 0.47048706, 0.18544358,\n",
151 |        "       0.6616607 , 0.89724109, 0.73602104, 0.7967473 , 0.34984654,\n",
152 |        "       0.75165934, 0.51948597, 0.5146689 , 0.79593669, 0.77882929,\n",
153 |        "       0.89795802, 0.60184265, 0.54933154, 0.48117024, 0.71350232,\n",
154 |        "       0.34533628, 0.53080835, 0.21716551, 0.58050033, 0.69650235,\n",
155 |        "       0.76385342, 0.81026516, 0.38808633, 0.29880956, 0.30509973,\n",
156 |        "       0.49297223, 0.23450843, 0.58333009, 0.4035195 , 0.53738891,\n",
157 |        "       0.48818332, 0.86679803, 0.40315325, 0.5020812 , 0.7185822 ,\n",
158 |        "       0.53893692, 0.39556036, 0.5471129 , 0.76919386, 0.78093477,\n",
159 |        "       0.71208154, 0.39008568, 0.47491871, 0.66689463, 0.64283009,\n",
160 |        "       0.68453453, 0.79772148, 0.50244554, 0.69062109, 0.42708303,\n",
161 |        "       0.82702257, 0.40925005, 0.51324953, 0.63000334, 0.84796778,\n",
162 |        "       0.83247992, 0.78032733, 0.7501293 , 0.25136797, 0.39726184,\n",
163 |        "       0.73273764, 0.77979841, 0.56296215, 0.6899653 , 0.77935536,\n",
164 |        "       0.69487053, 0.76949942, 0.56052286, 0.84229398, 0.56867596,\n",
165 |        "       0.56258893, 0.44446383, 0.69291978, 0.46064667, 0.71933103,\n",
166 |        "       0.74873055, 0.52891418, 0.38122275, 0.33761612, 0.52618052,\n",
167 |        "       0.64418716, 0.6076475 , 0.56244067, 0.95852124, 0.47361947,\n",
168 |        "       0.58244582, 0.48415847, 0.42135536, 0.23216241, 0.61715051,\n",
169 |        "       0.43513235, 0.47840852, 0.20734576, 0.26533765, 0.52268137,\n",
170 |        "       0.5600532 , 0.85742643, 0.39564925, 0.29297077, 0.36238727,\n",
171 |        "       0.51156803, 0.28142217, 0.41027353, 0.90528738, 0.78347096,\n",
172 |        "       0.77480841, 0.75056414, 0.49986959, 0.49270847, 0.61766635,\n",
173 |        "       0.83141839, 0.90030128, 0.4318063 , 0.33301924, 0.43186752,\n",
174 |        "       0.53863208, 0.57840945, 0.96663885, 0.34554928, 0.58101888,\n",
175 |        "       0.36115854, 0.58208305, 0.44322345, 0.44144567, 0.55601302,\n",
176 |        "       0.46164016, 0.8341194 , 0.69980753, 0.56958929, 0.63266352,\n",
177 |        "       0.84392574, 0.49339212, 0.47439085, 0.56743615, 0.70565729,\n",
178 |        "       0.70541742, 0.40657449, 0.49026461, 0.3531605 , 0.57473951,\n",
179 |        "       0.47415586, 0.62914306, 0.71547112, 0.70462982, 0.49657606,\n",
180 |        "       0.54464089, 0.79564052, 0.66244532, 0.26373275, 0.36536601,\n",
181 |        "       0.58208978, 0.57146492, 0.64993923, 0.62768067, 0.45158713,\n",
182 |        "       0.85325834, 0.53231378, 0.79233168, 0.38511095, 0.75284982,\n",
183 |        "       0.74706233, 0.51575888, 0.81909575, 0.90603209, 0.24386135,\n",
184 |        "       0.76649866, 0.43377406, 0.61029083, 0.81090616, 0.47636237,\n",
185 |        "       0.61872684, 0.33218728, 0.62857641, 0.38149671, 0.63049088,\n",
186 |        "       0.84099336, 0.68290934, 0.37364869, 0.7612564 , 0.6721415 ,\n",
187 |        "       0.59878955, 0.75516227, 0.51940095, 0.56673766, 0.65583281,\n",
188 |        "       0.38326285, 0.50038392, 0.59113044, 0.63058081, 0.79691382,\n",
189 |        "       0.30948438, 0.31440909, 0.39276869, 0.6169601 , 0.41067144,\n",
190 |        "       0.38914378, 0.61105762, 0.84578571, 0.76666481, 0.20038221,\n",
191 |        "       0.63232166, 0.49269969, 0.62061495, 0.5165723 , 0.44577444,\n",
192 |        "       0.52215064, 0.88869613, 0.65128026, 0.68651296, 0.56536919,\n",
193 |        "       0.82830332, 0.4810854 , 0.85682689, 0.76105398, 0.49925319,\n",
194 |        "       0.63811212, 0.82590671, 0.79741426, 0.46258058, 0.69548031,\n",
195 |        "       0.4772494 , 0.62100381, 0.38127053, 0.31111932, 0.31090082,\n",
196 |        "       0.68859918, 0.41593675, 0.22091296, 0.25879888, 0.22901721,\n",
197 |        "       0.25660411, 0.39305061, 0.64236859, 0.3687151 , 0.27744108,\n",
198 |        "       0.6484749 , 0.64670298, 0.67547747, 0.6585325 , 0.65795449,\n",
199 |        "       0.63569417, 0.47156199, 0.7453016 , 0.82213752, 0.67220655,\n",
200 |        "       0.69153595, 0.92503448, 0.66472388, 0.33919138, 0.73938646,\n",
201 |        "       0.68659464, 0.84517312, 0.63620677, 0.66356759, 0.7771215 ,\n",
202 |        "       0.67677419, 0.86014288, 0.81826428, 0.78236964, 0.79556257,\n",
203 |        "       0.89657375, 0.82004767, 0.47509016, 0.71750845, 0.50758669,\n",
204 |        "       0.60770523, 0.24365722, 0.34110615, 0.57642176, 0.39441335,\n",
205 |        "       0.71666005, 0.43751309, 0.5926216 , 0.91341033, 0.29772593,\n",
206 |        "       0.39997662, 0.73657033, 0.71875337, 0.63415102, 0.4963237 ,\n",
207 |        "       0.61470616, 0.68514349, 0.49779751, 0.37211042, 0.80201592,\n",
208 |        "       0.6695872 , 0.30821091, 0.7400185 , 0.52286022, 0.93431586,\n",
209 |        "       0.45770812, 0.57900751, 0.72182711, 0.19983035, 0.27639864,\n",
210 |        "       0.24267346, 0.28814664, 0.47518657, 0.4816818 , 0.66318791,\n",
211 |        "       0.82110072, 0.64901859, 0.67720318, 0.85010379, 0.83883734,\n",
212 |        "       0.68183937, 0.30411931, 0.4001568 , 0.78175708, 0.68773236,\n",
213 |        "       0.54684759, 0.54964308, 0.58641857, 0.46835477, 0.46285774,\n",
214 |        "       0.65778772, 0.69239561, 0.75293829, 0.73931576, 0.52851791,\n",
215 |        "       0.55692036, 0.81739817, 0.54968139, 0.22737988, 0.29434696,\n",
216 |        "       0.69304954, 0.62216185, 0.51364148, 0.76889355, 0.41801246,\n",
217 |        "       0.22162885, 0.30292322, 0.465463  , 0.48224517, 0.46706297])"
218 |       ]
219 |      },
220 |      "execution_count": 4,
221 |      "metadata": {},
222 |      "output_type": "execute_result"
223 |     }
224 |    ],
225 |    "source": [
226 |     "boost.estimator_weights_"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": null,
232 |    "metadata": {},
233 |    "outputs": [],
234 |    "source": []
235 |   }
236 |  ],
237 |  "metadata": {
238 |   "kernelspec": {
239 |    "display_name": "Python 3 (ipykernel)",
240 |    "language": "python",
241 |    "name": "python3"
242 |   },
243 |   "language_info": {
244 |    "codemirror_mode": {
245 |     "name": "ipython",
246 |     "version": 3
247 |    },
248 |    "file_extension": ".py",
249 |    "mimetype": "text/x-python",
250 |    "name": "python",
251 |    "nbconvert_exporter": "python",
252 |    "pygments_lexer": "ipython3",
253 |    "version": "3.9.6"
254 |   }
255 |  },
256 |  "nbformat": 4,
257 |  "nbformat_minor": 4
258 | }
259 | 


--------------------------------------------------------------------------------
/from-scratch-coding-exercises/bagging-solution.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Optional Coding Exercise \n",
  8 |     "\n",
  9 |     "## -- Implementing a Bagging Algorithm from Scratch"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [
 17 |     {
 18 |      "name": "stdout",
 19 |      "output_type": "stream",
 20 |      "text": [
 21 |       "Author: Sebastian Raschka\n",
 22 |       "\n",
 23 |       "Last updated: 2021-12-17\n",
 24 |       "\n",
 25 |       "Python implementation: CPython\n",
 26 |       "Python version       : 3.9.6\n",
 27 |       "IPython version      : 7.29.0\n",
 28 |       "\n",
 29 |       "numpy     : 1.21.2\n",
 30 |       "scipy     : 1.7.0\n",
 31 |       "matplotlib: 3.4.3\n",
 32 |       "sklearn   : 1.0\n",
 33 |       "\n"
 34 |      ]
 35 |     }
 36 |    ],
 37 |    "source": [
 38 |     "%load_ext watermark\n",
 39 |     "%watermark  -d -u -a 'Sebastian Raschka' -v -p numpy,scipy,matplotlib,sklearn"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 2,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "import numpy as np"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "markdown",
 53 |    "metadata": {},
 54 |    "source": [
 55 |     "## 2) Bagging"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "markdown",
 60 |    "metadata": {},
 61 |    "source": [
 62 |     "In this coding exercise, you will be combining multiple decision trees to a bagging classifier. This time, we will be using the decision tree algorithm implemented in scikit-learn (which is some variant of the CART algorithm for binary splits, as implemented earlier and discussed in class)."
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "markdown",
 67 |    "metadata": {},
 68 |    "source": [
 69 |     "### 2.1 Bootrapping"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "As you remember, bagging relies on bootstrap sampling. So, as a first step, your task is to implement a function for generating bootstrap samples. In this exercise, for simplicity, we will perform the computations based on the Iris dataset.\n",
 77 |     "\n",
 78 |     "On an interesting side note, scikit-learn recently updated their version of the Iris dataset since it was discovered that the Iris version hosted on the UCI machine learning repository (https://archive.ics.uci.edu/ml/datasets/Iris/) has two data points that are different from R. Fisher's original paper (Fisher,R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).) and changed it in their most recent version. Since most students may not have the latest scikit-learn version installed, we will be working with the Iris dataset that is deposited on UCI, which has become quite the standard in the Python machine learning community for benchmarking algorithms. Instead of manually downloading it, we will be fetching it through the `mlxtend` (http://rasbt.github.io/mlxtend/) library that you installed in the last homework."
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": 3,
 84 |    "metadata": {},
 85 |    "outputs": [
 86 |     {
 87 |      "name": "stdout",
 88 |      "output_type": "stream",
 89 |      "text": [
 90 |       "Number of examples: 150\n",
 91 |       "Number of features: 4\n",
 92 |       "Unique class labels: [0 1 2]\n"
 93 |      ]
 94 |     }
 95 |    ],
 96 |    "source": [
 97 |     "# DO NOT EDIT OR DELETE THIS CELL\n",
 98 |     "\n",
 99 |     "from mlxtend.data import iris_data\n",
100 |     "X, y = iris_data()\n",
101 |     "\n",
102 |     "print('Number of examples:', X.shape[0])\n",
103 |     "print('Number of features:', X.shape[1])\n",
104 |     "print('Unique class labels:', np.unique(y))"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "metadata": {},
110 |    "source": [
111 |     "Use scikit-learn's `train_test_split` function to divide the dataset into a training and a test set.\n",
112 |     "\n",
113 |     "- The test set should contain 45 examples, and the training set should contain 105 examples.\n",
114 |     "- To ensure reproducible results, use `123` as a random seed.\n",
115 |     "- Perform a stratified split."
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 4,
121 |    "metadata": {},
122 |    "outputs": [
123 |     {
124 |      "name": "stdout",
125 |      "output_type": "stream",
126 |      "text": [
127 |       "Number of training examples: 105\n",
128 |       "Number of test examples: 45\n"
129 |      ]
130 |     }
131 |    ],
132 |    "source": [
133 |     "# SOLUTION\n",
134 |     "from sklearn.model_selection import train_test_split\n",
135 |     "\n",
136 |     "\n",
137 |     "X_train, X_test, y_train, y_test = train_test_split(X, y,\n",
138 |     "                                                    test_size=0.3,\n",
139 |     "                                                    random_state=123,\n",
140 |     "                                                    shuffle=True,\n",
141 |     "                                                    stratify=y)\n",
142 |     "\n",
143 |     "print('Number of training examples:', X_train.shape[0])\n",
144 |     "print('Number of test examples:', X_test.shape[0])"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": 5,
150 |    "metadata": {},
151 |    "outputs": [
152 |     {
153 |      "data": {
154 |       "text/plain": [
155 |        "array([[5. , 2. , 3.5, 1. ],\n",
156 |        "       [5.4, 3.9, 1.3, 0.4],\n",
157 |        "       [5.6, 3. , 4.1, 1.3],\n",
158 |        "       [7.4, 2.8, 6.1, 1.9],\n",
159 |        "       [4.6, 3.4, 1.4, 0.3]])"
160 |       ]
161 |      },
162 |      "execution_count": 5,
163 |      "metadata": {},
164 |      "output_type": "execute_result"
165 |     }
166 |    ],
167 |    "source": [
168 |     "X_train[:5]"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "markdown",
173 |    "metadata": {},
174 |    "source": [
175 |     "Next we are implementing a function to generate bootstrap samples of the training set. In particular, we will perform the bootstrapping as follows:\n",
176 |     "\n",
177 |     "- Create an index array with values 0, ..., 104.\n",
178 |     "- Draw a random sample (with replacement) from this index array using the `choice` method of a NumPy `RandomState` object that is passed to the function as `rng`. \n",
179 |     "- Select training examples from the X array and labels from the y array using the new sample of indices."
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 6,
185 |    "metadata": {},
186 |    "outputs": [],
187 |    "source": [
188 |     "# SOLUTION\n",
189 |     "\n",
190 |     "def draw_bootstrap_sample(rng, X, y):\n",
191 |     "    sample_indices = np.arange(X.shape[0])\n",
192 |     "    bootstrap_indices = rng.choice(sample_indices,\n",
193 |     "                                   size=sample_indices.shape[0],\n",
194 |     "                                   replace=True)\n",
195 |     "    return X[bootstrap_indices], y[bootstrap_indices]"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "markdown",
200 |    "metadata": {},
201 |    "source": [
202 |     "I added the following code cell for your convenience to double-check your solution. If your results don't match the results shown below, there is a bug in your implementation of the `draw_bootstrap_sample` function."
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": 7,
208 |    "metadata": {},
209 |    "outputs": [
210 |     {
211 |      "name": "stdout",
212 |      "output_type": "stream",
213 |      "text": [
214 |       "Number of training inputs from bootstrap round: 105\n",
215 |       "Number of training labels from bootstrap round: 105\n",
216 |       "Labels:\n",
217 |       " [0 0 1 0 0 1 2 0 2 1 0 0 2 1 1 1 1 2 1 1 2 0 2 1 2 1 1 1 0 1 0 0 1 2 0 0 0\n",
218 |       " 0 2 1 1 2 1 2 1 1 2 1 2 0 1 1 2 2 1 0 1 0 2 2 0 1 0 2 0 0 0 0 1 2 0 0 1 0\n",
219 |       " 1 1 0 1 1 2 2 0 2 0 2 0 1 1 2 2 0 2 2 2 0 1 0 1 2 2 2 1 0 0 0]\n"
220 |      ]
221 |     }
222 |    ],
223 |    "source": [
224 |     "# DO NOT EDIT OR DELETE THIS CELL\n",
225 |     "\n",
226 |     "rng = np.random.RandomState(123)\n",
227 |     "X_boot, y_boot = draw_bootstrap_sample(rng, X_train, y_train)\n",
228 |     "\n",
229 |     "print('Number of training inputs from bootstrap round:', X_boot.shape[0])\n",
230 |     "print('Number of training labels from bootstrap round:', y_boot.shape[0])\n",
231 |     "print('Labels:\\n', y_boot)"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "markdown",
236 |    "metadata": {},
237 |    "source": [
238 |     "### 2.2 Baggging classifier from decision trees (4 pts)"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "markdown",
243 |    "metadata": {},
244 |    "source": [
245 |     "In this section, you will implement a Bagging algorithm based on the `DecisionTreeClassifier`. I provided a partial solution for you. "
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": 9,
251 |    "metadata": {},
252 |    "outputs": [],
253 |    "source": [
254 |     "# SOLUTION\n",
255 |     "\n",
256 |     "from sklearn.tree import DecisionTreeClassifier\n",
257 |     "\n",
258 |     "\n",
259 |     "class BaggingClassifier(object):\n",
260 |     "    \n",
261 |     "    def __init__(self, num_trees=10, random_state=123):\n",
262 |     "        self.num_trees = num_trees\n",
263 |     "        self.rng = np.random.RandomState(random_state)\n",
264 |     "        \n",
265 |     "    def fit(self, X, y):\n",
266 |     "        self.trees_ = [DecisionTreeClassifier(random_state=self.rng) for i in range(self.num_trees)]\n",
267 |     "        for i in range(self.num_trees):\n",
268 |     "            X_boot, y_boot = draw_bootstrap_sample(self.rng, X, y)\n",
269 |     "            self.trees_[i].fit(X_boot, y_boot)\n",
270 |     "        \n",
271 |     "    def predict(self, X):\n",
272 |     "        ary = np.zeros((X.shape[0], len(self.trees_)), dtype=np.int64)\n",
273 |     "        for i in range(len(self.trees_)):\n",
274 |     "            ary[:, i] = self.trees_[i].predict(X)\n",
275 |     "\n",
276 |     "        maj = np.apply_along_axis(lambda x:\n",
277 |     "                                  np.argmax(np.bincount(x)),\n",
278 |     "                                            axis=1,\n",
279 |     "                                            arr=ary)\n",
280 |     "        return maj"
281 |    ]
282 |   },
283 |   {
284 |    "cell_type": "markdown",
285 |    "metadata": {},
286 |    "source": [
287 |     "I added the following code cell for your convenience to double-check your solution. If your results don't match the results shown below, there is a bug in your implementation of the `BaggingClassifier()`."
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": 10,
293 |    "metadata": {},
294 |    "outputs": [
295 |     {
296 |      "name": "stdout",
297 |      "output_type": "stream",
298 |      "text": [
299 |       "Individual Tree Accuracies:\n",
300 |       "88.9%\n",
301 |       "93.3%\n",
302 |       "97.8%\n",
303 |       "93.3%\n",
304 |       "93.3%\n",
305 |       "93.3%\n",
306 |       "91.1%\n",
307 |       "97.8%\n",
308 |       "97.8%\n",
309 |       "97.8%\n",
310 |       "\n",
311 |       "Bagging Test Accuracy: 97.8%\n"
312 |      ]
313 |     }
314 |    ],
315 |    "source": [
316 |     "# DO NOT EDIT OR DELETE THIS CELL\n",
317 |     "\n",
318 |     "model = BaggingClassifier()\n",
319 |     "model.fit(X_train, y_train)\n",
320 |     "\n",
321 |     "predictions = model.predict(X_test)\n",
322 |     "\n",
323 |     "print('Individual Tree Accuracies:')\n",
324 |     "for tree in model.trees_:\n",
325 |     "    predictions = tree.predict(X_test) \n",
326 |     "    print('%.1f%%' % ((predictions == y_test).sum() / X_test.shape[0] * 100))\n",
327 |     "\n",
328 |     "print('\\nBagging Test Accuracy: %.1f%%' % ((predictions == y_test).sum() / X_test.shape[0] * 100))"
329 |    ]
330 |   }
331 |  ],
332 |  "metadata": {
333 |   "kernelspec": {
334 |    "display_name": "Python 3 (ipykernel)",
335 |    "language": "python",
336 |    "name": "python3"
337 |   },
338 |   "language_info": {
339 |    "codemirror_mode": {
340 |     "name": "ipython",
341 |     "version": 3
342 |    },
343 |    "file_extension": ".py",
344 |    "mimetype": "text/x-python",
345 |    "name": "python",
346 |    "nbconvert_exporter": "python",
347 |    "pygments_lexer": "ipython3",
348 |    "version": "3.9.6"
349 |   }
350 |  },
351 |  "nbformat": 4,
352 |  "nbformat_minor": 4
353 | }
354 | 


--------------------------------------------------------------------------------
/09_eval2-resampling-and-CIs/05_adv-bootstrap.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "STAT 451: Intro to Machine Learning (Fall 2021)  \n",
  8 |     "Instructor: Sebastian Raschka (sraschka@wisc.edu)  \n",
  9 |     "\n",
 10 |     "\n",
 11 |     "\n"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "markdown",
 16 |    "metadata": {},
 17 |    "source": [
 18 |     "# L09: Model Evaluation 2 -- Confidence Intervals and Resampling"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "metadata": {},
 24 |    "source": [
 25 |     "<br>\n",
 26 |     "<br>\n",
 27 |     "<br>"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "# 5. Out-of-Bag Bootstrap"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "In this section, we are going to look at the OOB bootstrap method, which I recently implemented in mlxtend."
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 1,
 47 |    "metadata": {},
 48 |    "outputs": [
 49 |     {
 50 |      "name": "stdout",
 51 |      "output_type": "stream",
 52 |      "text": [
 53 |       "[3 4 0 1 3] [2]\n",
 54 |       "[0 0 1 4 4] [2 3]\n",
 55 |       "[1 2 4 2 4] [0 3]\n"
 56 |      ]
 57 |     }
 58 |    ],
 59 |    "source": [
 60 |     "from mlxtend.evaluate import BootstrapOutOfBag\n",
 61 |     "import numpy as np\n",
 62 |     "\n",
 63 |     "\n",
 64 |     "\n",
 65 |     "\n",
 66 |     "oob = BootstrapOutOfBag(n_splits=3, random_seed=1)\n",
 67 |     "for train, test in oob.split(np.array([1, 2, 3, 4, 5])):\n",
 68 |     "    print(train, test)"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "markdown",
 73 |    "metadata": {},
 74 |    "source": [
 75 |     "The reason why I chose a object-oriented implementation is that we can plug it into scikit-learn's `cross_val_score` function, which is super convenient."
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 2,
 81 |    "metadata": {},
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "from mlxtend.data import iris_data\n",
 85 |     "from sklearn.tree import DecisionTreeClassifier\n",
 86 |     "from sklearn.model_selection import cross_val_score\n",
 87 |     "from sklearn.model_selection import train_test_split\n",
 88 |     "\n",
 89 |     "\n",
 90 |     "X, y = iris_data()\n",
 91 |     "\n",
 92 |     "X_train, X_test, y_train, y_test = train_test_split(\n",
 93 |     "    X, y, test_size=0.4, random_state=123, stratify=y)\n",
 94 |     "\n",
 95 |     "\n",
 96 |     "model = DecisionTreeClassifier(random_state=123)"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "markdown",
101 |    "metadata": {},
102 |    "source": [
103 |     "Below, we are using the standard approach for `cross_val_score` first, which will perform 5-fold cross validation by setting `cv=5`. Note that \n",
104 |     "\n",
105 |     "- if the model is a scikit-learn classifier, stratified k-fold cross validation will be performed by default, and the reported evaluation metric is the prediction accuracy;\n",
106 |     "- if the model is a scikit-learn regressor, standard k-fold cross validation will be performed by default, and the reported evaluation metric is the $R^2$ score on the test folds."
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": 3,
112 |    "metadata": {},
113 |    "outputs": [
114 |     {
115 |      "name": "stdout",
116 |      "output_type": "stream",
117 |      "text": [
118 |       "CV scores [0.94444444 1.         1.         0.88888889 0.94444444]\n",
119 |       "Mean CV score 0.9555555555555555\n",
120 |       "CV score Std 0.04157397096415492\n"
121 |      ]
122 |     }
123 |    ],
124 |    "source": [
125 |     "cv_scores = cross_val_score(model, X_train, y_train, cv=5)\n",
126 |     "print('CV scores', cv_scores)\n",
127 |     "print('Mean CV score', np.mean(cv_scores))\n",
128 |     "print('CV score Std', np.std(cv_scores))"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "markdown",
133 |    "metadata": {},
134 |    "source": [
135 |     "Now, let's plug in our OOB object into the `cross_val_score` function:"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": 4,
141 |    "metadata": {},
142 |    "outputs": [
143 |     {
144 |      "name": "stdout",
145 |      "output_type": "stream",
146 |      "text": [
147 |       "Bootstrap scores [0.93548387 0.96774194 0.96875    0.93023256 0.97058824]\n",
148 |       "Mean Bootstrap score 0.9545593199770531\n",
149 |       "Score Std 0.017819915677477555\n"
150 |      ]
151 |     }
152 |    ],
153 |    "source": [
154 |     "# 5 splits\n",
155 |     "\n",
156 |     "bootstrap_scores = \\\n",
157 |     "    cross_val_score(model, X_train, y_train, \n",
158 |     "                    cv=BootstrapOutOfBag(n_splits=5, random_seed=123))\n",
159 |     "\n",
160 |     "print('Bootstrap scores', bootstrap_scores)\n",
161 |     "print('Mean Bootstrap score', np.mean(bootstrap_scores))\n",
162 |     "print('Score Std', np.std(bootstrap_scores))"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": 5,
168 |    "metadata": {},
169 |    "outputs": [
170 |     {
171 |      "name": "stdout",
172 |      "output_type": "stream",
173 |      "text": [
174 |       "Mean Bootstrap score 0.9483980861793887\n",
175 |       "Score Std 0.039817322453014004\n"
176 |      ]
177 |     }
178 |    ],
179 |    "source": [
180 |     "bootstrap_scores = \\\n",
181 |     "    cross_val_score(model, X_train, y_train, \n",
182 |     "                    cv=BootstrapOutOfBag(n_splits=200, random_seed=123))\n",
183 |     "\n",
184 |     "print('Mean Bootstrap score', np.mean(bootstrap_scores))\n",
185 |     "print('Score Std', np.std(bootstrap_scores))"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "code",
190 |    "execution_count": 6,
191 |    "metadata": {},
192 |    "outputs": [
193 |     {
194 |      "name": "stdout",
195 |      "output_type": "stream",
196 |      "text": [
197 |       "95% Confidence interval: [83.33, 100.00]\n"
198 |      ]
199 |     }
200 |    ],
201 |    "source": [
202 |     "lower = np.percentile(bootstrap_scores, 2.5)\n",
203 |     "upper = np.percentile(bootstrap_scores, 97.5)\n",
204 |     "print('95%% Confidence interval: [%.2f, %.2f]' % (100*lower, 100*upper))"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": 7,
210 |    "metadata": {},
211 |    "outputs": [
212 |     {
213 |      "data": {
214 |       "text/plain": [
215 |        "0.95"
216 |       ]
217 |      },
218 |      "execution_count": 7,
219 |      "metadata": {},
220 |      "output_type": "execute_result"
221 |     }
222 |    ],
223 |    "source": [
224 |     "model.fit(X_train, y_train)\n",
225 |     "model.score(X_test, y_test)"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "markdown",
230 |    "metadata": {},
231 |    "source": [
232 |     "<br>\n",
233 |     "<br>\n",
234 |     "<br>"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "markdown",
239 |    "metadata": {},
240 |    "source": [
241 |     "## MLxtend functional bootstrap API"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "markdown",
246 |    "metadata": {},
247 |    "source": [
248 |     "###  OOB Bootstrap"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "markdown",
253 |    "metadata": {},
254 |    "source": [
255 |     "Below is a more convenient way to compute the OOB Boostrap. Note that it has a tendency to be over-pessimistic."
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": 8,
261 |    "metadata": {},
262 |    "outputs": [
263 |     {
264 |      "name": "stdout",
265 |      "output_type": "stream",
266 |      "text": [
267 |       "Mean Bootstrap score 0.9483980861793887\n",
268 |       "Score Std 0.039817322453014004\n"
269 |      ]
270 |     }
271 |    ],
272 |    "source": [
273 |     "from mlxtend.evaluate import bootstrap_point632_score\n",
274 |     "\n",
275 |     "bootstrap_scores = bootstrap_point632_score(model, \n",
276 |     "                                            X_train, y_train, \n",
277 |     "                                            n_splits=200, \n",
278 |     "                                            method='oob',\n",
279 |     "                                            random_seed=123)\n",
280 |     "\n",
281 |     "print('Mean Bootstrap score', np.mean(bootstrap_scores))\n",
282 |     "print('Score Std', np.std(bootstrap_scores))"
283 |    ]
284 |   },
285 |   {
286 |    "cell_type": "code",
287 |    "execution_count": 9,
288 |    "metadata": {},
289 |    "outputs": [
290 |     {
291 |      "name": "stdout",
292 |      "output_type": "stream",
293 |      "text": [
294 |       "95% Confidence interval: [83.33, 100.00]\n"
295 |      ]
296 |     }
297 |    ],
298 |    "source": [
299 |     "lower = np.percentile(bootstrap_scores, 2.5)\n",
300 |     "upper = np.percentile(bootstrap_scores, 97.5)\n",
301 |     "print('95%% Confidence interval: [%.2f, %.2f]' % (100*lower, 100*upper))"
302 |    ]
303 |   },
304 |   {
305 |    "cell_type": "markdown",
306 |    "metadata": {},
307 |    "source": [
308 |     "###  .632 Bootstrap"
309 |    ]
310 |   },
311 |   {
312 |    "cell_type": "markdown",
313 |    "metadata": {},
314 |    "source": [
315 |     "The .632 Bootstrap is the default setting of `bootstrap_point632_score`; it tends to be overly optimistic."
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "code",
320 |    "execution_count": 10,
321 |    "metadata": {},
322 |    "outputs": [
323 |     {
324 |      "name": "stdout",
325 |      "output_type": "stream",
326 |      "text": [
327 |       "Mean Bootstrap score 0.960416034909818\n",
328 |       "Score Std 0.030496672371341522\n"
329 |      ]
330 |     }
331 |    ],
332 |    "source": [
333 |     "bootstrap_scores = bootstrap_point632_score(model, \n",
334 |     "                                            X_train, y_train, \n",
335 |     "                                            n_splits=200,\n",
336 |     "                                            random_seed=123)\n",
337 |     "print('Mean Bootstrap score', np.mean(bootstrap_scores))\n",
338 |     "print('Score Std', np.std(bootstrap_scores))"
339 |    ]
340 |   },
341 |   {
342 |    "cell_type": "code",
343 |    "execution_count": 11,
344 |    "metadata": {},
345 |    "outputs": [
346 |     {
347 |      "name": "stdout",
348 |      "output_type": "stream",
349 |      "text": [
350 |       "95% Confidence interval: [87.29, 100.00]\n"
351 |      ]
352 |     }
353 |    ],
354 |    "source": [
355 |     "lower = np.percentile(bootstrap_scores, 2.5)\n",
356 |     "upper = np.percentile(bootstrap_scores, 97.5)\n",
357 |     "print('95%% Confidence interval: [%.2f, %.2f]' % (100*lower, 100*upper))"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "markdown",
362 |    "metadata": {},
363 |    "source": [
364 |     "###  .632+ Bootstrap"
365 |    ]
366 |   },
367 |   {
368 |    "cell_type": "markdown",
369 |    "metadata": {},
370 |    "source": [
371 |     "The .632+ Boostrap method attempts to address the optimistic bias of the regular .632 Boostrap."
372 |    ]
373 |   },
374 |   {
375 |    "cell_type": "code",
376 |    "execution_count": 12,
377 |    "metadata": {},
378 |    "outputs": [
379 |     {
380 |      "name": "stdout",
381 |      "output_type": "stream",
382 |      "text": [
383 |       "Mean Bootstrap score 0.9597458481541713\n",
384 |       "Score Std 0.03165130213692469\n"
385 |      ]
386 |     }
387 |    ],
388 |    "source": [
389 |     "bootstrap_scores = bootstrap_point632_score(model, X_train, y_train, \n",
390 |     "                                            n_splits=200, \n",
391 |     "                                            method='.632+',\n",
392 |     "                                            random_seed=123)\n",
393 |     "print('Mean Bootstrap score', np.mean(bootstrap_scores))\n",
394 |     "print('Score Std', np.std(bootstrap_scores))"
395 |    ]
396 |   },
397 |   {
398 |    "cell_type": "code",
399 |    "execution_count": 13,
400 |    "metadata": {},
401 |    "outputs": [
402 |     {
403 |      "name": "stdout",
404 |      "output_type": "stream",
405 |      "text": [
406 |       "95% Confidence interval: [86.91, 100.00]\n"
407 |      ]
408 |     }
409 |    ],
410 |    "source": [
411 |     "lower = np.percentile(bootstrap_scores, 2.5)\n",
412 |     "upper = np.percentile(bootstrap_scores, 97.5)\n",
413 |     "print('95%% Confidence interval: [%.2f, %.2f]' % (100*lower, 100*upper))"
414 |    ]
415 |   },
416 |   {
417 |    "cell_type": "code",
418 |    "execution_count": null,
419 |    "metadata": {},
420 |    "outputs": [],
421 |    "source": []
422 |   }
423 |  ],
424 |  "metadata": {
425 |   "kernelspec": {
426 |    "display_name": "Python 3 (ipykernel)",
427 |    "language": "python",
428 |    "name": "python3"
429 |   },
430 |   "language_info": {
431 |    "codemirror_mode": {
432 |     "name": "ipython",
433 |     "version": 3
434 |    },
435 |    "file_extension": ".py",
436 |    "mimetype": "text/x-python",
437 |    "name": "python",
438 |    "nbconvert_exporter": "python",
439 |    "pygments_lexer": "ipython3",
440 |    "version": "3.9.6"
441 |   }
442 |  },
443 |  "nbformat": 4,
444 |  "nbformat_minor": 4
445 | }
446 | 


--------------------------------------------------------------------------------
/hyperparameter-tuning-examples/01.1-gridsearch-decisiontree-example.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "39329df3-1f99-4b11-9405-5969d52368a7",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Decision Tree & Grid Search Example"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "bd7def2e-0fa1-41d8-b5e5-efb08845fc38",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "This notebook shows how to tune a simple classifier like a decision tree via GridSearch."
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 1,
 22 |    "id": "7f61a90e-a119-4bd0-af21-38604c5b4eec",
 23 |    "metadata": {},
 24 |    "outputs": [
 25 |     {
 26 |      "name": "stdout",
 27 |      "output_type": "stream",
 28 |      "text": [
 29 |       "scikit-learn: 1.0\n",
 30 |       "mlxtend     : 0.19.0\n",
 31 |       "xgboost     : 1.5.0\n",
 32 |       "\n"
 33 |      ]
 34 |     }
 35 |    ],
 36 |    "source": [
 37 |     "%load_ext watermark\n",
 38 |     "%watermark -p scikit-learn,mlxtend,xgboost"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "markdown",
 43 |    "id": "1f0489c2-dd9c-4e71-a78c-e01201762b37",
 44 |    "metadata": {},
 45 |    "source": [
 46 |     "## Dataset"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 2,
 52 |    "id": "271b17ff-5ea4-4161-8b7f-20ba8131d666",
 53 |    "metadata": {},
 54 |    "outputs": [
 55 |     {
 56 |      "name": "stdout",
 57 |      "output_type": "stream",
 58 |      "text": [
 59 |       "Train/Valid/Test sizes: 398 80 171\n"
 60 |      ]
 61 |     }
 62 |    ],
 63 |    "source": [
 64 |     "from sklearn import model_selection\n",
 65 |     "from sklearn.model_selection import train_test_split\n",
 66 |     "from sklearn import datasets\n",
 67 |     "\n",
 68 |     "\n",
 69 |     "data = datasets.load_breast_cancer()\n",
 70 |     "X, y = data.data, data.target\n",
 71 |     "\n",
 72 |     "X_train, X_test, y_train, y_test = \\\n",
 73 |     "    train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)\n",
 74 |     "\n",
 75 |     "X_train_sub, X_valid, y_train_sub, y_valid = \\\n",
 76 |     "    train_test_split(X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)\n",
 77 |     "\n",
 78 |     "print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "markdown",
 83 |    "id": "0affc454-9f07-48e6-bcee-e6253d968247",
 84 |    "metadata": {},
 85 |    "source": [
 86 |     "## Grid Search"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": 3,
 92 |    "id": "e80b4870-e6d2-4f62-91dd-2b53afaac49c",
 93 |    "metadata": {},
 94 |    "outputs": [
 95 |     {
 96 |      "name": "stdout",
 97 |      "output_type": "stream",
 98 |      "text": [
 99 |       "Fitting 10 folds for each of 9 candidates, totalling 90 fits\n",
100 |       "[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s\n",
101 |       "[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s\n",
102 |       "[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s\n",
103 |       "[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s\n",
104 |       "[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s\n",
105 |       "[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s\n",
106 |       "[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s\n",
107 |       "[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s\n",
108 |       "[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s\n",
109 |       "[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s\n",
110 |       "[CV] END ...................max_depth=6, min_samples_split=3; total time=   0.0s\n",
111 |       "[CV] END ...................max_depth=6, min_samples_split=3; total time=   0.0s\n",
112 |       "[CV] END ...................max_depth=6, min_samples_split=3; total time=   0.0s\n",
113 |       "[CV] END ...................max_depth=6, min_samples_split=3; total time=   0.0s\n",
114 |       "[CV] END ...................max_depth=6, min_samples_split=3; total time=   0.0s\n",
115 |       "[CV] END ...................max_depth=6, min_samples_split=3; total time=   0.0s\n",
116 |       "[CV] END ...................max_depth=6, min_samples_split=3; total time=   0.0s\n",
117 |       "[CV] END ...................max_depth=6, min_samples_split=3; total time=   0.0s\n",
118 |       "[CV] END ...................max_depth=6, min_samples_split=3; total time=   0.0s\n",
119 |       "[CV] END ...................max_depth=6, min_samples_split=3; total time=   0.0s\n",
120 |       "[CV] END ...................max_depth=6, min_samples_split=4; total time=   0.0s\n",
121 |       "[CV] END ...................max_depth=6, min_samples_split=4; total time=   0.0s\n",
122 |       "[CV] END ...................max_depth=6, min_samples_split=4; total time=   0.0s\n",
123 |       "[CV] END ...................max_depth=6, min_samples_split=4; total time=   0.0s\n",
124 |       "[CV] END ...................max_depth=6, min_samples_split=4; total time=   0.0s\n",
125 |       "[CV] END ...................max_depth=6, min_samples_split=4; total time=   0.0s\n",
126 |       "[CV] END ...................max_depth=6, min_samples_split=4; total time=   0.0s\n",
127 |       "[CV] END ...................max_depth=6, min_samples_split=4; total time=   0.0s\n",
128 |       "[CV] END ...................max_depth=6, min_samples_split=4; total time=   0.0s\n",
129 |       "[CV] END ...................max_depth=6, min_samples_split=4; total time=   0.0s\n",
130 |       "[CV] END ..................max_depth=16, min_samples_split=2; total time=   0.0s\n",
131 |       "[CV] END ..................max_depth=16, min_samples_split=2; total time=   0.0s\n",
132 |       "[CV] END ..................max_depth=16, min_samples_split=2; total time=   0.0s\n",
133 |       "[CV] END ..................max_depth=16, min_samples_split=2; total time=   0.0s\n",
134 |       "[CV] END ..................max_depth=16, min_samples_split=2; total time=   0.0s\n",
135 |       "[CV] END ..................max_depth=16, min_samples_split=2; total time=   0.0s\n",
136 |       "[CV] END ..................max_depth=16, min_samples_split=2; total time=   0.0s\n",
137 |       "[CV] END ..................max_depth=16, min_samples_split=2; total time=   0.0s\n",
138 |       "[CV] END ..................max_depth=16, min_samples_split=2; total time=   0.0s\n",
139 |       "[CV] END ..................max_depth=16, min_samples_split=2; total time=   0.0s\n",
140 |       "[CV] END ..................max_depth=16, min_samples_split=3; total time=   0.0s\n",
141 |       "[CV] END ..................max_depth=16, min_samples_split=3; total time=   0.0s\n",
142 |       "[CV] END ..................max_depth=16, min_samples_split=3; total time=   0.0s\n",
143 |       "[CV] END ..................max_depth=16, min_samples_split=3; total time=   0.0s\n",
144 |       "[CV] END ..................max_depth=16, min_samples_split=3; total time=   0.0s\n",
145 |       "[CV] END ..................max_depth=16, min_samples_split=3; total time=   0.0s\n",
146 |       "[CV] END ..................max_depth=16, min_samples_split=3; total time=   0.0s\n",
147 |       "[CV] END ..................max_depth=16, min_samples_split=3; total time=   0.0s\n",
148 |       "[CV] END ..................max_depth=16, min_samples_split=3; total time=   0.0s\n",
149 |       "[CV] END ..................max_depth=16, min_samples_split=3; total time=   0.0s\n",
150 |       "[CV] END ..................max_depth=16, min_samples_split=4; total time=   0.0s\n",
151 |       "[CV] END ..................max_depth=16, min_samples_split=4; total time=   0.0s\n",
152 |       "[CV] END ..................max_depth=16, min_samples_split=4; total time=   0.0s\n",
153 |       "[CV] END ..................max_depth=16, min_samples_split=4; total time=   0.0s\n",
154 |       "[CV] END ..................max_depth=16, min_samples_split=4; total time=   0.0s\n",
155 |       "[CV] END ..................max_depth=16, min_samples_split=4; total time=   0.0s\n",
156 |       "[CV] END ..................max_depth=16, min_samples_split=4; total time=   0.0s\n",
157 |       "[CV] END ..................max_depth=16, min_samples_split=4; total time=   0.0s\n",
158 |       "[CV] END ..................max_depth=16, min_samples_split=4; total time=   0.0s\n",
159 |       "[CV] END ..................max_depth=16, min_samples_split=4; total time=   0.0s\n",
160 |       "[CV] END ................max_depth=None, min_samples_split=2; total time=   0.0s\n",
161 |       "[CV] END ................max_depth=None, min_samples_split=2; total time=   0.0s\n",
162 |       "[CV] END ................max_depth=None, min_samples_split=2; total time=   0.0s\n",
163 |       "[CV] END ................max_depth=None, min_samples_split=2; total time=   0.0s\n",
164 |       "[CV] END ................max_depth=None, min_samples_split=2; total time=   0.0s\n",
165 |       "[CV] END ................max_depth=None, min_samples_split=2; total time=   0.0s\n",
166 |       "[CV] END ................max_depth=None, min_samples_split=2; total time=   0.0s\n",
167 |       "[CV] END ................max_depth=None, min_samples_split=2; total time=   0.0s\n",
168 |       "[CV] END ................max_depth=None, min_samples_split=2; total time=   0.0s\n",
169 |       "[CV] END ................max_depth=None, min_samples_split=2; total time=   0.0s\n",
170 |       "[CV] END ................max_depth=None, min_samples_split=3; total time=   0.0s\n",
171 |       "[CV] END ................max_depth=None, min_samples_split=3; total time=   0.0s\n",
172 |       "[CV] END ................max_depth=None, min_samples_split=3; total time=   0.0s\n",
173 |       "[CV] END ................max_depth=None, min_samples_split=3; total time=   0.0s\n",
174 |       "[CV] END ................max_depth=None, min_samples_split=3; total time=   0.0s\n",
175 |       "[CV] END ................max_depth=None, min_samples_split=3; total time=   0.0s\n",
176 |       "[CV] END ................max_depth=None, min_samples_split=3; total time=   0.0s\n",
177 |       "[CV] END ................max_depth=None, min_samples_split=3; total time=   0.0s\n",
178 |       "[CV] END ................max_depth=None, min_samples_split=3; total time=   0.0s\n",
179 |       "[CV] END ................max_depth=None, min_samples_split=3; total time=   0.0s\n",
180 |       "[CV] END ................max_depth=None, min_samples_split=4; total time=   0.0s\n",
181 |       "[CV] END ................max_depth=None, min_samples_split=4; total time=   0.0s\n",
182 |       "[CV] END ................max_depth=None, min_samples_split=4; total time=   0.0s\n",
183 |       "[CV] END ................max_depth=None, min_samples_split=4; total time=   0.0s\n",
184 |       "[CV] END ................max_depth=None, min_samples_split=4; total time=   0.0s\n",
185 |       "[CV] END ................max_depth=None, min_samples_split=4; total time=   0.0s\n",
186 |       "[CV] END ................max_depth=None, min_samples_split=4; total time=   0.0s\n",
187 |       "[CV] END ................max_depth=None, min_samples_split=4; total time=   0.0s\n",
188 |       "[CV] END ................max_depth=None, min_samples_split=4; total time=   0.0s\n",
189 |       "[CV] END ................max_depth=None, min_samples_split=4; total time=   0.0s\n"
190 |      ]
191 |     },
192 |     {
193 |      "data": {
194 |       "text/plain": [
195 |        "0.9274358974358975"
196 |       ]
197 |      },
198 |      "execution_count": 3,
199 |      "metadata": {},
200 |      "output_type": "execute_result"
201 |     }
202 |    ],
203 |    "source": [
204 |     "import numpy as np\n",
205 |     "from sklearn.model_selection import GridSearchCV\n",
206 |     "from sklearn.tree import DecisionTreeClassifier\n",
207 |     "\n",
208 |     "\n",
209 |     "clf = DecisionTreeClassifier(random_state=123)\n",
210 |     "\n",
211 |     "params =  {\n",
212 |     "    'min_samples_split': [2, 3, 4],\n",
213 |     "    'max_depth': [6, 16, None]\n",
214 |     "}\n",
215 |     "\n",
216 |     "\n",
217 |     "grid = GridSearchCV(estimator=clf,\n",
218 |     "                    param_grid=params,\n",
219 |     "                    cv=10,\n",
220 |     "                    n_jobs=1,\n",
221 |     "                    verbose=2)\n",
222 |     "\n",
223 |     "grid.fit(X_train, y_train)\n",
224 |     "\n",
225 |     "grid.best_score_"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": 4,
231 |    "id": "2c26399d-ebfc-4b06-86d9-36e49711e908",
232 |    "metadata": {},
233 |    "outputs": [
234 |     {
235 |      "data": {
236 |       "text/plain": [
237 |        "{'max_depth': 16, 'min_samples_split': 4}"
238 |       ]
239 |      },
240 |      "execution_count": 4,
241 |      "metadata": {},
242 |      "output_type": "execute_result"
243 |     }
244 |    ],
245 |    "source": [
246 |     "grid.best_params_"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": 5,
252 |    "id": "763e816b-6437-45a9-812f-8b429472d75e",
253 |    "metadata": {},
254 |    "outputs": [
255 |     {
256 |      "name": "stdout",
257 |      "output_type": "stream",
258 |      "text": [
259 |       "Training Accuracy: 1.00\n",
260 |       "Test Accuracy: 0.94\n"
261 |      ]
262 |     }
263 |    ],
264 |    "source": [
265 |     "print(f\"Training Accuracy: {grid.best_estimator_.score(X_train, y_train):0.2f}\")\n",
266 |     "#print(f\"Validation Accuracy: {grid.best_estimator_.score(X_valid, y_valid):0.2f}\")\n",
267 |     "print(f\"Test Accuracy: {grid.best_estimator_.score(X_test, y_test):0.2f}\")"
268 |    ]
269 |   }
270 |  ],
271 |  "metadata": {
272 |   "kernelspec": {
273 |    "display_name": "Python 3 (ipykernel)",
274 |    "language": "python",
275 |    "name": "python3"
276 |   },
277 |   "language_info": {
278 |    "codemirror_mode": {
279 |     "name": "ipython",
280 |     "version": 3
281 |    },
282 |    "file_extension": ".py",
283 |    "mimetype": "text/x-python",
284 |    "name": "python",
285 |    "nbconvert_exporter": "python",
286 |    "pygments_lexer": "ipython3",
287 |    "version": "3.9.6"
288 |   }
289 |  },
290 |  "nbformat": 4,
291 |  "nbformat_minor": 5
292 | }
293 | 


--------------------------------------------------------------------------------
/05-dataprocessing/code/5-3_python-oop.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "a0fa450b-13b2-4c32-9dd1-ae60a44e8225",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "STAT 451: Machine Learning (Fall 2021)  \n",
  9 |     "Instructor: Sebastian Raschka (sraschka@wisc.edu)  "
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "id": "46d2da68-18e9-4c45-9158-2cdeb29a6981",
 15 |    "metadata": {
 16 |     "tags": []
 17 |    },
 18 |    "source": [
 19 |     "# L05 - Data Preprocessing and Machine Learning with Scikit-Learn"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "markdown",
 24 |    "id": "d60abc6b-9f90-42ff-9735-c17a3980114c",
 25 |    "metadata": {},
 26 |    "source": [
 27 |     "# 5.3 Object Oriented Programming (OOP) & Python Classes"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "id": "9c46fc49-5c08-4f14-9765-379d9d56965b",
 33 |    "metadata": {
 34 |     "tags": []
 35 |    },
 36 |    "source": [
 37 |     "## Python Classes"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "markdown",
 42 |    "id": "bb9d593a-49b9-4d83-ae19-e7cb0b4260be",
 43 |    "metadata": {},
 44 |    "source": [
 45 |     "- This section illustrates the concept of \"classes\" in Python, which is relevant for understanding how the scikit-learn API works on a fundamental level later in this lecture.\n",
 46 |     "- Note that Python is an object oriented language, and everything in Python is an object.\n",
 47 |     "- Classes are \"templates\" for creating objects (this is called \"instantiating\" objects).\n",
 48 |     "- An object is a collection of special \"functions\" (a \"function\" of an object or class is called \"method\") and attributes.\n",
 49 |     "- Note that the `self` attribute is a special keyword for referring to a class or an instantiated object of a class, \"itself.\""
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": 1,
 55 |    "id": "01cb8732-4b2c-4bdb-862b-966d13b493f5",
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "class VehicleClass():\n",
 60 |     "    \n",
 61 |     "    def __init__(self, horsepower):\n",
 62 |     "        \"This is the 'init' method\"\n",
 63 |     "        # this is a class attribute:\n",
 64 |     "        self.horsepower = horsepower\n",
 65 |     "        \n",
 66 |     "    def horsepower_to_torque(self, rpm):\n",
 67 |     "        \"This is a regular method\"\n",
 68 |     "        torque = self.horsepower * rpm / 5252\n",
 69 |     "        return torque\n",
 70 |     "    \n",
 71 |     "    def tune_motor(self):\n",
 72 |     "        self.horsepower *= 2\n",
 73 |     "    \n",
 74 |     "    def _private_method(self):\n",
 75 |     "        print('this is private')\n",
 76 |     "    \n",
 77 |     "    def __very_private_method(self):\n",
 78 |     "        print('this is very private')"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": 2,
 84 |    "id": "4cfe8ab5-49da-4565-ad40-800b541e27fe",
 85 |    "metadata": {},
 86 |    "outputs": [
 87 |     {
 88 |      "name": "stdout",
 89 |      "output_type": "stream",
 90 |      "text": [
 91 |       "123\n"
 92 |      ]
 93 |     }
 94 |    ],
 95 |    "source": [
 96 |     "# instantiate an object:\n",
 97 |     "car1 = VehicleClass(horsepower=123)\n",
 98 |     "print(car1.horsepower)"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 3,
104 |    "id": "50adb1c9-b251-4c76-97c7-d392d6df83fd",
105 |    "metadata": {},
106 |    "outputs": [
107 |     {
108 |      "data": {
109 |       "text/plain": [
110 |        "117.0982482863671"
111 |       ]
112 |      },
113 |      "execution_count": 3,
114 |      "metadata": {},
115 |      "output_type": "execute_result"
116 |     }
117 |    ],
118 |    "source": [
119 |     "car1.horsepower_to_torque(rpm=5000)"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": 4,
125 |    "id": "9e9e9956-d26b-4243-87f2-7cd809cbcf68",
126 |    "metadata": {},
127 |    "outputs": [
128 |     {
129 |      "data": {
130 |       "text/plain": [
131 |        "234.1964965727342"
132 |       ]
133 |      },
134 |      "execution_count": 4,
135 |      "metadata": {},
136 |      "output_type": "execute_result"
137 |     }
138 |    ],
139 |    "source": [
140 |     "car1.tune_motor()\n",
141 |     "car1.horsepower_to_torque(rpm=5000)"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": 5,
147 |    "id": "ae10555e-20fd-4f57-a3b3-5b45d5206bf0",
148 |    "metadata": {},
149 |    "outputs": [
150 |     {
151 |      "name": "stdout",
152 |      "output_type": "stream",
153 |      "text": [
154 |       "this is private\n"
155 |      ]
156 |     }
157 |    ],
158 |    "source": [
159 |     "car1._private_method()"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "markdown",
164 |    "id": "872bc81b-c96e-4075-8bbd-4be4d7b4c4de",
165 |    "metadata": {},
166 |    "source": [
167 |     "- Python has the motto \"we are all adults here,\" which means that a user can do the same things as a developer (in contrast to other programming languages, e.g., Java).\n",
168 |     "- A preceding underscore is an indicator that a method is considered \"private\" -- this means, this method is meant to be used internally but not by the user directly (also, it does not show up in the \"help\" documentation)\n",
169 |     "- a preceding double-underscore is a \"stronger\" indicator for methods that are supposed to be private, and while users can access these (adhering to the \"we are all adults here\" moto), we have to refer to \"name mangling.\""
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": 6,
175 |    "id": "0c0003a2-2dfd-41d8-860c-5b99ad20b7c7",
176 |    "metadata": {},
177 |    "outputs": [],
178 |    "source": [
179 |     "# Excecuting the following would raise an error:\n",
180 |     "# car1.__very_private_method()"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": 7,
186 |    "id": "1194b154-ee84-48df-a066-76420c2eb27f",
187 |    "metadata": {},
188 |    "outputs": [
189 |     {
190 |      "name": "stdout",
191 |      "output_type": "stream",
192 |      "text": [
193 |       "this is very private\n"
194 |      ]
195 |     }
196 |    ],
197 |    "source": [
198 |     "# If we use \"name mangling\" we can access this private method:\n",
199 |     "car1._VehicleClass__very_private_method()"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "markdown",
204 |    "id": "93593965-aa3b-4bdd-8bd1-12c704d0a9fd",
205 |    "metadata": {},
206 |    "source": [
207 |     "- Another useful aspect of using classes is the concept of \"inheritance.\"\n",
208 |     "- Using inheritance, we can \"inherit\" methods and attributes from a parent class for re-use.\n",
209 |     "- For instance, consider the `VehicleClass` as a more general class than the `CarClass` -- i.e., a car, truck, or motorbike are specific cases of a vehicle.\n",
210 |     "- Below is an example of a `CarClass` that inherits the methods from the `VehicleClass` and adds a specific `self.num_wheels=4` attribute -- if we were to create a `BikeClass`, we could set this to `self.num_wheels=2`, for example.\n",
211 |     "- All-in-all, this is a very simple demonstration of class inheritance, however, it's a concept that is very useful for writing \"clean code\" and structuring projects -- the scikit-learn machine learning library makes heavy use of this concept internally (we, as users, don't have to worry about it too much though, it is useful to know though in case you would like to modify or contribute to the library)."
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": 8,
217 |    "id": "bf612d76-b551-4488-98ea-3904acdd5030",
218 |    "metadata": {},
219 |    "outputs": [
220 |     {
221 |      "name": "stdout",
222 |      "output_type": "stream",
223 |      "text": [
224 |       "Number of wheels: 4\n",
225 |       "Horsepower: 123\n",
226 |       "Horsepower: 246\n"
227 |      ]
228 |     }
229 |    ],
230 |    "source": [
231 |     "class CarClass(VehicleClass):\n",
232 |     "\n",
233 |     "    def __init__(self, horsepower):\n",
234 |     "        super().__init__(horsepower)\n",
235 |     "        self.num_wheels = 4\n",
236 |     "    \n",
237 |     "new_car = CarClass(horsepower=123)\n",
238 |     "print('Number of wheels:', new_car.num_wheels)\n",
239 |     "print('Horsepower:', new_car.horsepower)\n",
240 |     "new_car.tune_motor()\n",
241 |     "print('Horsepower:', new_car.horsepower)"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "markdown",
246 |    "id": "8f994066-ff33-4b9b-bf9f-72aec051aade",
247 |    "metadata": {},
248 |    "source": [
249 |     "## K-Nearest Neighbors Implementation"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "markdown",
254 |    "id": "bef9e923-36a7-4012-ab9b-5a612a51f483",
255 |    "metadata": {},
256 |    "source": [
257 |     "- Below is a very simple implementation of a K-nearest Neighbor classifier.\n",
258 |     "- This is a very slow and inefficient implementation, and in real-world problems, it is always recommended to use established libraries (like scikit-learn) instead of implementing algorithms from scratch.\n",
259 |     "- The scikit-learn library, for example, implements *k*NN much more efficiently and robustly -- using advanced data structures (KD-Tree and Ball-Tree, which we briefly discussed in Lecture 02).\n",
260 |     "- A scenario where it is useful to implement algorithms from scratch is for learning and teaching purposes, or if we want to try out new algorithms, hence, the implementation below, which gently introduces how things are implemented in scikit-learn."
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "code",
265 |    "execution_count": 9,
266 |    "id": "a179e213-7b08-488a-a7f0-7a36f871a564",
267 |    "metadata": {},
268 |    "outputs": [],
269 |    "source": [
270 |     "class KNNClassifier(object):\n",
271 |     "    def __init__(self, k, dist_fn=None):\n",
272 |     "        self.k = k\n",
273 |     "        if dist_fn is None:\n",
274 |     "            self.dist_fn = self._euclidean_dist\n",
275 |     "    \n",
276 |     "    def _euclidean_dist(self, a, b):\n",
277 |     "        dist = 0.\n",
278 |     "        for ele_i, ele_j in zip(a, b):\n",
279 |     "            dist += ((ele_i - ele_j)**2)\n",
280 |     "        dist = dist**0.5\n",
281 |     "        return dist\n",
282 |     "        \n",
283 |     "    def _find_nearest(self, x):\n",
284 |     "        dist_idx_pairs = []\n",
285 |     "        for j in range(self.dataset_.shape[0]):\n",
286 |     "            d = self.dist_fn(x, self.dataset_[j])\n",
287 |     "            dist_idx_pairs.append((d, j))\n",
288 |     "            \n",
289 |     "        sorted_dist_idx_pairs = sorted(dist_idx_pairs)\n",
290 |     "\n",
291 |     "        return sorted_dist_idx_pairs\n",
292 |     "    \n",
293 |     "    def fit(self, X, y):\n",
294 |     "        self.dataset_ = X.copy()\n",
295 |     "        self.labels_ = y.copy()\n",
296 |     "        self.possible_labels_ = np.unique(y)\n",
297 |     "\n",
298 |     "    def predict(self, X):\n",
299 |     "        predictions = np.zeros(X.shape[0], dtype=int)\n",
300 |     "        for i in range(X.shape[0]):\n",
301 |     "            k_nearest = self._find_nearest(X[i])[:self.k]\n",
302 |     "            indices = [entry[1] for entry in k_nearest]\n",
303 |     "            k_labels = self.labels_[indices]\n",
304 |     "            counts = np.bincount(k_labels,\n",
305 |     "                                 minlength=self.possible_labels_.shape[0])\n",
306 |     "            pred_label = np.argmax(counts)\n",
307 |     "            predictions[i] = pred_label\n",
308 |     "        return predictions"
309 |    ]
310 |   },
311 |   {
312 |    "cell_type": "code",
313 |    "execution_count": 10,
314 |    "id": "202cc3ae-b00b-40db-b936-c151fdaf66ae",
315 |    "metadata": {},
316 |    "outputs": [
317 |     {
318 |      "name": "stdout",
319 |      "output_type": "stream",
320 |      "text": [
321 |       "X_train.shape: (97, 4)\n",
322 |       "X_valid.shape: (22, 4)\n",
323 |       "X_test.shape: (31, 4)\n"
324 |      ]
325 |     }
326 |    ],
327 |    "source": [
328 |     "# Code repeated from 5-2-basic-data-handling.ipynb\n",
329 |     "\n",
330 |     "import pandas as pd\n",
331 |     "import numpy as np\n",
332 |     "\n",
333 |     "\n",
334 |     "df = pd.read_csv('data/iris.csv')\n",
335 |     "\n",
336 |     "d = {'Iris-setosa': 0,\n",
337 |     "     'Iris-versicolor': 1,\n",
338 |     "     'Iris-virginica': 2}\n",
339 |     "df['Species'] = df['Species'].map(d)\n",
340 |     "\n",
341 |     "X = df.iloc[:, 1:5].values\n",
342 |     "y = df['Species'].values\n",
343 |     "\n",
344 |     "indices = np.arange(X.shape[0])\n",
345 |     "rng = np.random.RandomState(123)\n",
346 |     "permuted_indices = rng.permutation(indices)\n",
347 |     "\n",
348 |     "train_size, valid_size = int(0.65*X.shape[0]), int(0.15*X.shape[0])\n",
349 |     "test_size = X.shape[0] - (train_size + valid_size)\n",
350 |     "train_ind = permuted_indices[:train_size]\n",
351 |     "valid_ind = permuted_indices[train_size:(train_size + valid_size)]\n",
352 |     "test_ind = permuted_indices[(train_size + valid_size):]\n",
353 |     "X_train, y_train = X[train_ind], y[train_ind]\n",
354 |     "X_valid, y_valid = X[valid_ind], y[valid_ind]\n",
355 |     "X_test, y_test = X[test_ind], y[test_ind]\n",
356 |     "\n",
357 |     "print(f'X_train.shape: {X_train.shape}')\n",
358 |     "print(f'X_valid.shape: {X_valid.shape}')\n",
359 |     "print(f'X_test.shape: {X_test.shape}')"
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "code",
364 |    "execution_count": 11,
365 |    "id": "c3009c86-55eb-4f49-b23f-4da4ce948c97",
366 |    "metadata": {},
367 |    "outputs": [
368 |     {
369 |      "name": "stdout",
370 |      "output_type": "stream",
371 |      "text": [
372 |       "[0 1 2 1 1 1 0 0 1 2 0 0 1 1 1 2 1 1 1 2 0 0]\n"
373 |      ]
374 |     }
375 |    ],
376 |    "source": [
377 |     "knn_model = KNNClassifier(k=3)\n",
378 |     "knn_model.fit(X_train, y_train)\n",
379 |     "\n",
380 |     "\n",
381 |     "print(knn_model.predict(X_valid))"
382 |    ]
383 |   },
384 |   {
385 |    "cell_type": "markdown",
386 |    "id": "f328a87b-c63c-437d-ab51-4dea64741aa4",
387 |    "metadata": {},
388 |    "source": [
389 |     "Note that there are class attributes with a `_` suffix in the implementation above -- this is not a typo.\n",
390 |     "- The trailing `_` (e.g., here: `self.dataset_`) is a scikit-learn convention and indicates that these are \"fit\" attributes -- that is, attributes that are available only *after* calling the `fit` method."
391 |    ]
392 |   }
393 |  ],
394 |  "metadata": {
395 |   "kernelspec": {
396 |    "display_name": "Python 3.9.2 64-bit ('base': conda)",
397 |    "language": "python",
398 |    "name": "python392jvsc74a57bd0249cfc85c6a0073df6bca89c83e3180d730f84f7e1f446fbe710b75104ecfa4f"
399 |   },
400 |   "language_info": {
401 |    "codemirror_mode": {
402 |     "name": "ipython",
403 |     "version": 3
404 |    },
405 |    "file_extension": ".py",
406 |    "mimetype": "text/x-python",
407 |    "name": "python",
408 |    "nbconvert_exporter": "python",
409 |    "pygments_lexer": "ipython3",
410 |    "version": "3.9.6"
411 |   }
412 |  },
413 |  "nbformat": 4,
414 |  "nbformat_minor": 5
415 | }
416 | 


--------------------------------------------------------------------------------