├── .gitignore ├── L01 ├── 01-ml-overview__notes.pdf └── 01-ml-overview__slides.pdf ├── L02 ├── 02-knn__notes.pdf ├── 02-knn__slides.pdf └── code │ ├── 02_knn_demo.ipynb │ └── iris.csv ├── L03 └── 03-python__notes.pdf ├── L04 ├── 04_scipython__code.ipynb └── images │ ├── numpy-intro │ ├── array_1.png │ ├── array_2.png │ ├── broadcasting-1.png │ ├── broadcasting-2.png │ ├── matmatmul.png │ ├── matmul.png │ ├── numpy-nature-1.png │ ├── random_1.png │ ├── random_2.png │ ├── transpose.png │ └── ufunc.png │ ├── output_171_0.png │ ├── output_173_0.png │ ├── output_174_0.png │ ├── output_176_0.png │ ├── output_178_0.png │ ├── output_180_0.png │ ├── output_181_0.png │ ├── output_183_0.png │ ├── output_185_0.png │ └── output_188_0.png ├── L05 └── code │ ├── 05-bonus-column-transformer.ipynb │ ├── 05-preprocessing-and-sklearn__notes.ipynb │ ├── 05-preprocessing-and-sklearn__slides.pdf │ ├── data │ ├── categoricaldata.csv │ ├── iris.csv │ ├── iris_mod.csv │ └── missingdata.csv │ └── images │ ├── decisionreg.pdf │ ├── eda.pdf │ ├── estimator-api.pdf │ ├── estimator-api.png │ ├── holdout-tuning.pdf │ ├── holdout-tuning.png │ ├── iris-subsampling.pdf │ ├── iris-subsampling.png │ ├── sklearn-pipeline.pdf │ ├── sklearn-pipeline.png │ ├── transformer-api.pdf │ └── transformer-api.png ├── L06 ├── 06-trees__notes.pdf ├── 06-trees__slides.pdf └── code │ ├── 06-trees_demo.ipynb │ └── 06-trees_demo_without_graphviz.ipynb ├── L07 ├── 07-ensembles__notes.pdf ├── 07-ensembles__slides.pdf └── code │ └── 07_code-from-slides.ipynb ├── L08 ├── 08-model-eval-1-intro__notes.pdf └── 08-model-eval-1-intro__slides.pdf ├── L09 ├── 09-eval2-ci__notes.pdf ├── 09-eval2-ci__slides.pdf └── code │ ├── 09-eval2-ci__1_distribution-and-subsampling.ipynb │ ├── 09-eval2-ci__2_holdout-and-repeated-sampling.ipynb │ ├── 09-eval2-ci__3_pessimistic-bias-in-holdout.ipynb │ ├── 09-eval2-ci__4-confidence-intervals_iris.ipynb │ ├── 09-eval2-ci__4-confidence-intervals_mnist.ipynb │ └── 09-eval2-ci__5.ipynb ├── L10 ├── 
10_eval3-cv__notes.pdf ├── 10_eval3-cv__slides.pdf └── code │ ├── 10_04_kfold-eval.ipynb │ ├── 10_06_kfold-sele.ipynb │ └── 10_08_1stderr.ipynb ├── L11 ├── 11-eval4-algo__notes.pdf ├── 11-eval4-algo__slides.pdf └── code │ ├── 11-eval4-algo__nested-cv_compact.ipynb │ ├── 11-eval4-algo__nested-cv_verbose1.ipynb │ ├── 11-eval4-algo__nested-cv_verbose2.ipynb │ └── nested-cv-image.png ├── L12 ├── 12_eval5-metrics__slides.pdf └── code │ ├── 12_1_confusion-matrix.ipynb │ ├── 12_2_pre-recall-f1.ipynb │ ├── 12_3_balanced-acc-Copy1.ipynb │ ├── 12_4_roc.ipynb │ ├── wdbc.data │ └── wdbc.names.txt ├── README.md └── report-template ├── examples ├── example-presentations.md ├── example-proposal.pdf └── example-report.pdf ├── project-presentation-assessment.md ├── project-proposal-assessment.md ├── project-report-assessment.md ├── proposal-latex ├── bibliography.bib ├── figures │ ├── google-scholar.pdf │ └── not-own-figure.pdf ├── ieee.bst ├── proposal.pdf ├── proposal.tex └── statcourse.sty └── report-latex ├── bibliography.bib ├── figures └── google-scholar.pdf ├── ieee.bst ├── report.pdf ├── report.tex └── statcourse.sty /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /L01/01-ml-overview__notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L01/01-ml-overview__notes.pdf -------------------------------------------------------------------------------- /L01/01-ml-overview__slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L01/01-ml-overview__slides.pdf -------------------------------------------------------------------------------- /L02/02-knn__notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L02/02-knn__notes.pdf -------------------------------------------------------------------------------- /L02/02-knn__slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L02/02-knn__slides.pdf -------------------------------------------------------------------------------- 
/L02/code/iris.csv: -------------------------------------------------------------------------------- 1 | Id,SepalLength[cm],SepalWidth[cm],PetalLength[cm],PetalWidth[cm],Species 2 | 1,5.1,3.5,1.4,0.2,Iris-setosa 3 | 2,4.9,3.0,1.4,0.2,Iris-setosa 4 | 3,4.7,3.2,1.3,0.2,Iris-setosa 5 | 4,4.6,3.1,1.5,0.2,Iris-setosa 6 | 5,5.0,3.6,1.4,0.2,Iris-setosa 7 | 6,5.4,3.9,1.7,0.4,Iris-setosa 8 | 7,4.6,3.4,1.4,0.3,Iris-setosa 9 | 8,5.0,3.4,1.5,0.2,Iris-setosa 10 | 9,4.4,2.9,1.4,0.2,Iris-setosa 11 | 10,4.9,3.1,1.5,0.1,Iris-setosa 12 | 11,5.4,3.7,1.5,0.2,Iris-setosa 13 | 12,4.8,3.4,1.6,0.2,Iris-setosa 14 | 13,4.8,3.0,1.4,0.1,Iris-setosa 15 | 14,4.3,3.0,1.1,0.1,Iris-setosa 16 | 15,5.8,4.0,1.2,0.2,Iris-setosa 17 | 16,5.7,4.4,1.5,0.4,Iris-setosa 18 | 17,5.4,3.9,1.3,0.4,Iris-setosa 19 | 18,5.1,3.5,1.4,0.3,Iris-setosa 20 | 19,5.7,3.8,1.7,0.3,Iris-setosa 21 | 20,5.1,3.8,1.5,0.3,Iris-setosa 22 | 21,5.4,3.4,1.7,0.2,Iris-setosa 23 | 22,5.1,3.7,1.5,0.4,Iris-setosa 24 | 23,4.6,3.6,1.0,0.2,Iris-setosa 25 | 24,5.1,3.3,1.7,0.5,Iris-setosa 26 | 25,4.8,3.4,1.9,0.2,Iris-setosa 27 | 26,5.0,3.0,1.6,0.2,Iris-setosa 28 | 27,5.0,3.4,1.6,0.4,Iris-setosa 29 | 28,5.2,3.5,1.5,0.2,Iris-setosa 30 | 29,5.2,3.4,1.4,0.2,Iris-setosa 31 | 30,4.7,3.2,1.6,0.2,Iris-setosa 32 | 31,4.8,3.1,1.6,0.2,Iris-setosa 33 | 32,5.4,3.4,1.5,0.4,Iris-setosa 34 | 33,5.2,4.1,1.5,0.1,Iris-setosa 35 | 34,5.5,4.2,1.4,0.2,Iris-setosa 36 | 35,4.9,3.1,1.5,0.1,Iris-setosa 37 | 36,5.0,3.2,1.2,0.2,Iris-setosa 38 | 37,5.5,3.5,1.3,0.2,Iris-setosa 39 | 38,4.9,3.1,1.5,0.1,Iris-setosa 40 | 39,4.4,3.0,1.3,0.2,Iris-setosa 41 | 40,5.1,3.4,1.5,0.2,Iris-setosa 42 | 41,5.0,3.5,1.3,0.3,Iris-setosa 43 | 42,4.5,2.3,1.3,0.3,Iris-setosa 44 | 43,4.4,3.2,1.3,0.2,Iris-setosa 45 | 44,5.0,3.5,1.6,0.6,Iris-setosa 46 | 45,5.1,3.8,1.9,0.4,Iris-setosa 47 | 46,4.8,3.0,1.4,0.3,Iris-setosa 48 | 47,5.1,3.8,1.6,0.2,Iris-setosa 49 | 48,4.6,3.2,1.4,0.2,Iris-setosa 50 | 49,5.3,3.7,1.5,0.2,Iris-setosa 51 | 50,5.0,3.3,1.4,0.2,Iris-setosa 52 | 
51,7.0,3.2,4.7,1.4,Iris-versicolor 53 | 52,6.4,3.2,4.5,1.5,Iris-versicolor 54 | 53,6.9,3.1,4.9,1.5,Iris-versicolor 55 | 54,5.5,2.3,4.0,1.3,Iris-versicolor 56 | 55,6.5,2.8,4.6,1.5,Iris-versicolor 57 | 56,5.7,2.8,4.5,1.3,Iris-versicolor 58 | 57,6.3,3.3,4.7,1.6,Iris-versicolor 59 | 58,4.9,2.4,3.3,1.0,Iris-versicolor 60 | 59,6.6,2.9,4.6,1.3,Iris-versicolor 61 | 60,5.2,2.7,3.9,1.4,Iris-versicolor 62 | 61,5.0,2.0,3.5,1.0,Iris-versicolor 63 | 62,5.9,3.0,4.2,1.5,Iris-versicolor 64 | 63,6.0,2.2,4.0,1.0,Iris-versicolor 65 | 64,6.1,2.9,4.7,1.4,Iris-versicolor 66 | 65,5.6,2.9,3.6,1.3,Iris-versicolor 67 | 66,6.7,3.1,4.4,1.4,Iris-versicolor 68 | 67,5.6,3.0,4.5,1.5,Iris-versicolor 69 | 68,5.8,2.7,4.1,1.0,Iris-versicolor 70 | 69,6.2,2.2,4.5,1.5,Iris-versicolor 71 | 70,5.6,2.5,3.9,1.1,Iris-versicolor 72 | 71,5.9,3.2,4.8,1.8,Iris-versicolor 73 | 72,6.1,2.8,4.0,1.3,Iris-versicolor 74 | 73,6.3,2.5,4.9,1.5,Iris-versicolor 75 | 74,6.1,2.8,4.7,1.2,Iris-versicolor 76 | 75,6.4,2.9,4.3,1.3,Iris-versicolor 77 | 76,6.6,3.0,4.4,1.4,Iris-versicolor 78 | 77,6.8,2.8,4.8,1.4,Iris-versicolor 79 | 78,6.7,3.0,5.0,1.7,Iris-versicolor 80 | 79,6.0,2.9,4.5,1.5,Iris-versicolor 81 | 80,5.7,2.6,3.5,1.0,Iris-versicolor 82 | 81,5.5,2.4,3.8,1.1,Iris-versicolor 83 | 82,5.5,2.4,3.7,1.0,Iris-versicolor 84 | 83,5.8,2.7,3.9,1.2,Iris-versicolor 85 | 84,6.0,2.7,5.1,1.6,Iris-versicolor 86 | 85,5.4,3.0,4.5,1.5,Iris-versicolor 87 | 86,6.0,3.4,4.5,1.6,Iris-versicolor 88 | 87,6.7,3.1,4.7,1.5,Iris-versicolor 89 | 88,6.3,2.3,4.4,1.3,Iris-versicolor 90 | 89,5.6,3.0,4.1,1.3,Iris-versicolor 91 | 90,5.5,2.5,4.0,1.3,Iris-versicolor 92 | 91,5.5,2.6,4.4,1.2,Iris-versicolor 93 | 92,6.1,3.0,4.6,1.4,Iris-versicolor 94 | 93,5.8,2.6,4.0,1.2,Iris-versicolor 95 | 94,5.0,2.3,3.3,1.0,Iris-versicolor 96 | 95,5.6,2.7,4.2,1.3,Iris-versicolor 97 | 96,5.7,3.0,4.2,1.2,Iris-versicolor 98 | 97,5.7,2.9,4.2,1.3,Iris-versicolor 99 | 98,6.2,2.9,4.3,1.3,Iris-versicolor 100 | 99,5.1,2.5,3.0,1.1,Iris-versicolor 101 | 100,5.7,2.8,4.1,1.3,Iris-versicolor 
102 | 101,6.3,3.3,6.0,2.5,Iris-virginica 103 | 102,5.8,2.7,5.1,1.9,Iris-virginica 104 | 103,7.1,3.0,5.9,2.1,Iris-virginica 105 | 104,6.3,2.9,5.6,1.8,Iris-virginica 106 | 105,6.5,3.0,5.8,2.2,Iris-virginica 107 | 106,7.6,3.0,6.6,2.1,Iris-virginica 108 | 107,4.9,2.5,4.5,1.7,Iris-virginica 109 | 108,7.3,2.9,6.3,1.8,Iris-virginica 110 | 109,6.7,2.5,5.8,1.8,Iris-virginica 111 | 110,7.2,3.6,6.1,2.5,Iris-virginica 112 | 111,6.5,3.2,5.1,2.0,Iris-virginica 113 | 112,6.4,2.7,5.3,1.9,Iris-virginica 114 | 113,6.8,3.0,5.5,2.1,Iris-virginica 115 | 114,5.7,2.5,5.0,2.0,Iris-virginica 116 | 115,5.8,2.8,5.1,2.4,Iris-virginica 117 | 116,6.4,3.2,5.3,2.3,Iris-virginica 118 | 117,6.5,3.0,5.5,1.8,Iris-virginica 119 | 118,7.7,3.8,6.7,2.2,Iris-virginica 120 | 119,7.7,2.6,6.9,2.3,Iris-virginica 121 | 120,6.0,2.2,5.0,1.5,Iris-virginica 122 | 121,6.9,3.2,5.7,2.3,Iris-virginica 123 | 122,5.6,2.8,4.9,2.0,Iris-virginica 124 | 123,7.7,2.8,6.7,2.0,Iris-virginica 125 | 124,6.3,2.7,4.9,1.8,Iris-virginica 126 | 125,6.7,3.3,5.7,2.1,Iris-virginica 127 | 126,7.2,3.2,6.0,1.8,Iris-virginica 128 | 127,6.2,2.8,4.8,1.8,Iris-virginica 129 | 128,6.1,3.0,4.9,1.8,Iris-virginica 130 | 129,6.4,2.8,5.6,2.1,Iris-virginica 131 | 130,7.2,3.0,5.8,1.6,Iris-virginica 132 | 131,7.4,2.8,6.1,1.9,Iris-virginica 133 | 132,7.9,3.8,6.4,2.0,Iris-virginica 134 | 133,6.4,2.8,5.6,2.2,Iris-virginica 135 | 134,6.3,2.8,5.1,1.5,Iris-virginica 136 | 135,6.1,2.6,5.6,1.4,Iris-virginica 137 | 136,7.7,3.0,6.1,2.3,Iris-virginica 138 | 137,6.3,3.4,5.6,2.4,Iris-virginica 139 | 138,6.4,3.1,5.5,1.8,Iris-virginica 140 | 139,6.0,3.0,4.8,1.8,Iris-virginica 141 | 140,6.9,3.1,5.4,2.1,Iris-virginica 142 | 141,6.7,3.1,5.6,2.4,Iris-virginica 143 | 142,6.9,3.1,5.1,2.3,Iris-virginica 144 | 143,5.8,2.7,5.1,1.9,Iris-virginica 145 | 144,6.8,3.2,5.9,2.3,Iris-virginica 146 | 145,6.7,3.3,5.7,2.5,Iris-virginica 147 | 146,6.7,3.0,5.2,2.3,Iris-virginica 148 | 147,6.3,2.5,5.0,1.9,Iris-virginica 149 | 148,6.5,3.0,5.2,2.0,Iris-virginica 150 | 
149,6.2,3.4,5.4,2.3,Iris-virginica 151 | 150,5.9,3.0,5.1,1.8,Iris-virginica -------------------------------------------------------------------------------- /L03/03-python__notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L03/03-python__notes.pdf -------------------------------------------------------------------------------- /L04/images/numpy-intro/array_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/numpy-intro/array_1.png -------------------------------------------------------------------------------- /L04/images/numpy-intro/array_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/numpy-intro/array_2.png -------------------------------------------------------------------------------- /L04/images/numpy-intro/broadcasting-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/numpy-intro/broadcasting-1.png -------------------------------------------------------------------------------- /L04/images/numpy-intro/broadcasting-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/numpy-intro/broadcasting-2.png -------------------------------------------------------------------------------- /L04/images/numpy-intro/matmatmul.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/numpy-intro/matmatmul.png -------------------------------------------------------------------------------- /L04/images/numpy-intro/matmul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/numpy-intro/matmul.png -------------------------------------------------------------------------------- /L04/images/numpy-intro/numpy-nature-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/numpy-intro/numpy-nature-1.png -------------------------------------------------------------------------------- /L04/images/numpy-intro/random_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/numpy-intro/random_1.png -------------------------------------------------------------------------------- /L04/images/numpy-intro/random_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/numpy-intro/random_2.png -------------------------------------------------------------------------------- /L04/images/numpy-intro/transpose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/numpy-intro/transpose.png 
-------------------------------------------------------------------------------- /L04/images/numpy-intro/ufunc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/numpy-intro/ufunc.png -------------------------------------------------------------------------------- /L04/images/output_171_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/output_171_0.png -------------------------------------------------------------------------------- /L04/images/output_173_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/output_173_0.png -------------------------------------------------------------------------------- /L04/images/output_174_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/output_174_0.png -------------------------------------------------------------------------------- /L04/images/output_176_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/output_176_0.png -------------------------------------------------------------------------------- /L04/images/output_178_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/output_178_0.png 
-------------------------------------------------------------------------------- /L04/images/output_180_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/output_180_0.png -------------------------------------------------------------------------------- /L04/images/output_181_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/output_181_0.png -------------------------------------------------------------------------------- /L04/images/output_183_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/output_183_0.png -------------------------------------------------------------------------------- /L04/images/output_185_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/output_185_0.png -------------------------------------------------------------------------------- /L04/images/output_188_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L04/images/output_188_0.png -------------------------------------------------------------------------------- /L05/code/05-bonus-column-transformer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "STAT 451: Machine Learning (Fall 2020) 
\n", 8 | "Instructor: Sebastian Raschka (sraschka@wisc.edu) \n", 9 | "\n", 10 | "Course website: http://pages.stat.wisc.edu/~sraschka/teaching/stat451-fs2020/" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# L05 - Bonus Notebook: Working with Heterogenous Datasets" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import pandas as pd\n", 27 | "from sklearn.preprocessing import StandardScaler\n", 28 | "from sklearn.preprocessing import OneHotEncoder\n", 29 | "from sklearn.neighbors import KNeighborsClassifier\n", 30 | "from sklearn.decomposition import PCA\n", 31 | "from sklearn.pipeline import Pipeline\n", 32 | "from sklearn.compose import ColumnTransformer\n", 33 | "from sklearn.model_selection import train_test_split" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "- Suppose you have a dataset that has both numerical and categorical features as follows: " 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/html": [ 51 | "
\n", 52 | "\n", 65 | "\n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | "
SepalLength[cm]SepalWidth[cm]PetalLength[cm]PetalWidth[cm]Color_IMadeThisUpSpecies
Id
15.13.51.40.2redIris-setosa
24.93.01.40.2redIris-setosa
34.73.21.30.2redIris-setosa
44.63.11.50.2redIris-setosa
55.03.61.40.2redIris-setosa
\n", 134 | "
" 135 | ], 136 | "text/plain": [ 137 | " SepalLength[cm] SepalWidth[cm] PetalLength[cm] PetalWidth[cm] \\\n", 138 | "Id \n", 139 | "1 5.1 3.5 1.4 0.2 \n", 140 | "2 4.9 3.0 1.4 0.2 \n", 141 | "3 4.7 3.2 1.3 0.2 \n", 142 | "4 4.6 3.1 1.5 0.2 \n", 143 | "5 5.0 3.6 1.4 0.2 \n", 144 | "\n", 145 | " Color_IMadeThisUp Species \n", 146 | "Id \n", 147 | "1 red Iris-setosa \n", 148 | "2 red Iris-setosa \n", 149 | "3 red Iris-setosa \n", 150 | "4 red Iris-setosa \n", 151 | "5 red Iris-setosa " 152 | ] 153 | }, 154 | "execution_count": 2, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "df = pd.read_csv('data/iris_mod.csv', index_col='Id')\n", 161 | "df.head()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "- As usual, we first tranform the class labels into an integer format:" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 3, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "X = df.drop('Species', axis=1)\n", 178 | "y = df['Species']\n", 179 | "\n", 180 | "label_dict = {'Iris-setosa': 0,\n", 181 | " 'Iris-versicolor': 1,\n", 182 | " 'Iris-virginica': 2}\n", 183 | "\n", 184 | "y = y.map(label_dict)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "- Next, we are going to set up a `Pipeline` that performs certain preprocessing steps only on the numerical features:" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 4, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "numeric_features = ['SepalLength[cm]', 'SepalWidth[cm]', 'PetalLength[cm]', 'PetalWidth[cm]']\n", 201 | "\n", 202 | "numeric_transformer = Pipeline(steps=[\n", 203 | " ('scaler', StandardScaler()),\n", 204 | " ('feature_extraction', PCA(n_components=2))])" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "- 
Above, we weren't interested in performing these preprocessing steps on the categorical feature(s); instead, we apply **different** preprocessing steps to the categorical variable like so:" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 5, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "categorical_features = ['Color_IMadeThisUp']\n", 221 | "categorical_transformer = Pipeline(steps=[\n", 222 | " ('onehot', OneHotEncoder())])" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "- Scikit-learn's `ColumnTransformer` now allows us to merge these 2 seperate preprocessing pipelines, which operate on different feature sets in our dataset:" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 6, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "preprocessor = ColumnTransformer(\n", 239 | " transformers=[\n", 240 | " ('num', numeric_transformer, numeric_features),\n", 241 | " ('cat', categorical_transformer, categorical_features)])" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "- As a result, we get a 5 dimensional feature array (design matrix) if we apply this preprocessor. What are these 5 columns?" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 7, 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "data": { 258 | "text/plain": [ 259 | "(150, 5)" 260 | ] 261 | }, 262 | "execution_count": 7, 263 | "metadata": {}, 264 | "output_type": "execute_result" 265 | } 266 | ], 267 | "source": [ 268 | "temp = preprocessor.fit_transform(X)\n", 269 | "temp.shape" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 8, 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "data": { 279 | "text/plain": [ 280 | "array([[-2.26454173, 0.5057039 , 0. , 1. , 0. ],\n", 281 | " [-2.0864255 , -0.65540473, 0. , 1. , 0. 
],\n", 282 | " [-2.36795045, -0.31847731, 0. , 1. , 0. ],\n", 283 | " [-2.30419716, -0.57536771, 0. , 1. , 0. ],\n", 284 | " [-2.38877749, 0.6747674 , 0. , 1. , 0. ]])" 285 | ] 286 | }, 287 | "execution_count": 8, 288 | "metadata": {}, 289 | "output_type": "execute_result" 290 | } 291 | ], 292 | "source": [ 293 | "temp[:5]" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "- The preprocessor can now also be conveniently be used in a Scikit-learn pipeline as shown below:" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 9, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "X_train, X_test, y_train, y_test = train_test_split(X, y, \n", 310 | " test_size=0.2,\n", 311 | " random_state=0)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 10, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "name": "stdout", 321 | "output_type": "stream", 322 | "text": [ 323 | "Test accuracy: 100.0%\n" 324 | ] 325 | } 326 | ], 327 | "source": [ 328 | "clf = Pipeline(steps=[('preprocessor', preprocessor),\n", 329 | " ('classifier', KNeighborsClassifier(p=3))])\n", 330 | "\n", 331 | "\n", 332 | "clf.fit(X_train, y_train)\n", 333 | "print(f'Test accuracy: {clf.score(X_test, y_test)*100}%')" 334 | ] 335 | } 336 | ], 337 | "metadata": { 338 | "kernelspec": { 339 | "display_name": "Python 3", 340 | "language": "python", 341 | "name": "python3" 342 | }, 343 | "language_info": { 344 | "codemirror_mode": { 345 | "name": "ipython", 346 | "version": 3 347 | }, 348 | "file_extension": ".py", 349 | "mimetype": "text/x-python", 350 | "name": "python", 351 | "nbconvert_exporter": "python", 352 | "pygments_lexer": "ipython3", 353 | "version": "3.8.3" 354 | } 355 | }, 356 | "nbformat": 4, 357 | "nbformat_minor": 4 358 | } 359 | -------------------------------------------------------------------------------- /L05/code/05-preprocessing-and-sklearn__slides.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L05/code/05-preprocessing-and-sklearn__slides.pdf -------------------------------------------------------------------------------- /L05/code/data/categoricaldata.csv: -------------------------------------------------------------------------------- 1 | color,size,price,classlabel 2 | green,M,10.1,class1 3 | red,L,13.5,class2 4 | blue,XXL,15.3,class1 -------------------------------------------------------------------------------- /L05/code/data/iris.csv: -------------------------------------------------------------------------------- 1 | Id,SepalLength[cm],SepalWidth[cm],PetalLength[cm],PetalWidth[cm],Species 2 | 1,5.1,3.5,1.4,0.2,Iris-setosa 3 | 2,4.9,3.0,1.4,0.2,Iris-setosa 4 | 3,4.7,3.2,1.3,0.2,Iris-setosa 5 | 4,4.6,3.1,1.5,0.2,Iris-setosa 6 | 5,5.0,3.6,1.4,0.2,Iris-setosa 7 | 6,5.4,3.9,1.7,0.4,Iris-setosa 8 | 7,4.6,3.4,1.4,0.3,Iris-setosa 9 | 8,5.0,3.4,1.5,0.2,Iris-setosa 10 | 9,4.4,2.9,1.4,0.2,Iris-setosa 11 | 10,4.9,3.1,1.5,0.1,Iris-setosa 12 | 11,5.4,3.7,1.5,0.2,Iris-setosa 13 | 12,4.8,3.4,1.6,0.2,Iris-setosa 14 | 13,4.8,3.0,1.4,0.1,Iris-setosa 15 | 14,4.3,3.0,1.1,0.1,Iris-setosa 16 | 15,5.8,4.0,1.2,0.2,Iris-setosa 17 | 16,5.7,4.4,1.5,0.4,Iris-setosa 18 | 17,5.4,3.9,1.3,0.4,Iris-setosa 19 | 18,5.1,3.5,1.4,0.3,Iris-setosa 20 | 19,5.7,3.8,1.7,0.3,Iris-setosa 21 | 20,5.1,3.8,1.5,0.3,Iris-setosa 22 | 21,5.4,3.4,1.7,0.2,Iris-setosa 23 | 22,5.1,3.7,1.5,0.4,Iris-setosa 24 | 23,4.6,3.6,1.0,0.2,Iris-setosa 25 | 24,5.1,3.3,1.7,0.5,Iris-setosa 26 | 25,4.8,3.4,1.9,0.2,Iris-setosa 27 | 26,5.0,3.0,1.6,0.2,Iris-setosa 28 | 27,5.0,3.4,1.6,0.4,Iris-setosa 29 | 28,5.2,3.5,1.5,0.2,Iris-setosa 30 | 29,5.2,3.4,1.4,0.2,Iris-setosa 31 | 30,4.7,3.2,1.6,0.2,Iris-setosa 32 | 31,4.8,3.1,1.6,0.2,Iris-setosa 33 | 32,5.4,3.4,1.5,0.4,Iris-setosa 34 | 33,5.2,4.1,1.5,0.1,Iris-setosa 35 | 
34,5.5,4.2,1.4,0.2,Iris-setosa 36 | 35,4.9,3.1,1.5,0.1,Iris-setosa 37 | 36,5.0,3.2,1.2,0.2,Iris-setosa 38 | 37,5.5,3.5,1.3,0.2,Iris-setosa 39 | 38,4.9,3.1,1.5,0.1,Iris-setosa 40 | 39,4.4,3.0,1.3,0.2,Iris-setosa 41 | 40,5.1,3.4,1.5,0.2,Iris-setosa 42 | 41,5.0,3.5,1.3,0.3,Iris-setosa 43 | 42,4.5,2.3,1.3,0.3,Iris-setosa 44 | 43,4.4,3.2,1.3,0.2,Iris-setosa 45 | 44,5.0,3.5,1.6,0.6,Iris-setosa 46 | 45,5.1,3.8,1.9,0.4,Iris-setosa 47 | 46,4.8,3.0,1.4,0.3,Iris-setosa 48 | 47,5.1,3.8,1.6,0.2,Iris-setosa 49 | 48,4.6,3.2,1.4,0.2,Iris-setosa 50 | 49,5.3,3.7,1.5,0.2,Iris-setosa 51 | 50,5.0,3.3,1.4,0.2,Iris-setosa 52 | 51,7.0,3.2,4.7,1.4,Iris-versicolor 53 | 52,6.4,3.2,4.5,1.5,Iris-versicolor 54 | 53,6.9,3.1,4.9,1.5,Iris-versicolor 55 | 54,5.5,2.3,4.0,1.3,Iris-versicolor 56 | 55,6.5,2.8,4.6,1.5,Iris-versicolor 57 | 56,5.7,2.8,4.5,1.3,Iris-versicolor 58 | 57,6.3,3.3,4.7,1.6,Iris-versicolor 59 | 58,4.9,2.4,3.3,1.0,Iris-versicolor 60 | 59,6.6,2.9,4.6,1.3,Iris-versicolor 61 | 60,5.2,2.7,3.9,1.4,Iris-versicolor 62 | 61,5.0,2.0,3.5,1.0,Iris-versicolor 63 | 62,5.9,3.0,4.2,1.5,Iris-versicolor 64 | 63,6.0,2.2,4.0,1.0,Iris-versicolor 65 | 64,6.1,2.9,4.7,1.4,Iris-versicolor 66 | 65,5.6,2.9,3.6,1.3,Iris-versicolor 67 | 66,6.7,3.1,4.4,1.4,Iris-versicolor 68 | 67,5.6,3.0,4.5,1.5,Iris-versicolor 69 | 68,5.8,2.7,4.1,1.0,Iris-versicolor 70 | 69,6.2,2.2,4.5,1.5,Iris-versicolor 71 | 70,5.6,2.5,3.9,1.1,Iris-versicolor 72 | 71,5.9,3.2,4.8,1.8,Iris-versicolor 73 | 72,6.1,2.8,4.0,1.3,Iris-versicolor 74 | 73,6.3,2.5,4.9,1.5,Iris-versicolor 75 | 74,6.1,2.8,4.7,1.2,Iris-versicolor 76 | 75,6.4,2.9,4.3,1.3,Iris-versicolor 77 | 76,6.6,3.0,4.4,1.4,Iris-versicolor 78 | 77,6.8,2.8,4.8,1.4,Iris-versicolor 79 | 78,6.7,3.0,5.0,1.7,Iris-versicolor 80 | 79,6.0,2.9,4.5,1.5,Iris-versicolor 81 | 80,5.7,2.6,3.5,1.0,Iris-versicolor 82 | 81,5.5,2.4,3.8,1.1,Iris-versicolor 83 | 82,5.5,2.4,3.7,1.0,Iris-versicolor 84 | 83,5.8,2.7,3.9,1.2,Iris-versicolor 85 | 84,6.0,2.7,5.1,1.6,Iris-versicolor 86 | 
85,5.4,3.0,4.5,1.5,Iris-versicolor 87 | 86,6.0,3.4,4.5,1.6,Iris-versicolor 88 | 87,6.7,3.1,4.7,1.5,Iris-versicolor 89 | 88,6.3,2.3,4.4,1.3,Iris-versicolor 90 | 89,5.6,3.0,4.1,1.3,Iris-versicolor 91 | 90,5.5,2.5,4.0,1.3,Iris-versicolor 92 | 91,5.5,2.6,4.4,1.2,Iris-versicolor 93 | 92,6.1,3.0,4.6,1.4,Iris-versicolor 94 | 93,5.8,2.6,4.0,1.2,Iris-versicolor 95 | 94,5.0,2.3,3.3,1.0,Iris-versicolor 96 | 95,5.6,2.7,4.2,1.3,Iris-versicolor 97 | 96,5.7,3.0,4.2,1.2,Iris-versicolor 98 | 97,5.7,2.9,4.2,1.3,Iris-versicolor 99 | 98,6.2,2.9,4.3,1.3,Iris-versicolor 100 | 99,5.1,2.5,3.0,1.1,Iris-versicolor 101 | 100,5.7,2.8,4.1,1.3,Iris-versicolor 102 | 101,6.3,3.3,6.0,2.5,Iris-virginica 103 | 102,5.8,2.7,5.1,1.9,Iris-virginica 104 | 103,7.1,3.0,5.9,2.1,Iris-virginica 105 | 104,6.3,2.9,5.6,1.8,Iris-virginica 106 | 105,6.5,3.0,5.8,2.2,Iris-virginica 107 | 106,7.6,3.0,6.6,2.1,Iris-virginica 108 | 107,4.9,2.5,4.5,1.7,Iris-virginica 109 | 108,7.3,2.9,6.3,1.8,Iris-virginica 110 | 109,6.7,2.5,5.8,1.8,Iris-virginica 111 | 110,7.2,3.6,6.1,2.5,Iris-virginica 112 | 111,6.5,3.2,5.1,2.0,Iris-virginica 113 | 112,6.4,2.7,5.3,1.9,Iris-virginica 114 | 113,6.8,3.0,5.5,2.1,Iris-virginica 115 | 114,5.7,2.5,5.0,2.0,Iris-virginica 116 | 115,5.8,2.8,5.1,2.4,Iris-virginica 117 | 116,6.4,3.2,5.3,2.3,Iris-virginica 118 | 117,6.5,3.0,5.5,1.8,Iris-virginica 119 | 118,7.7,3.8,6.7,2.2,Iris-virginica 120 | 119,7.7,2.6,6.9,2.3,Iris-virginica 121 | 120,6.0,2.2,5.0,1.5,Iris-virginica 122 | 121,6.9,3.2,5.7,2.3,Iris-virginica 123 | 122,5.6,2.8,4.9,2.0,Iris-virginica 124 | 123,7.7,2.8,6.7,2.0,Iris-virginica 125 | 124,6.3,2.7,4.9,1.8,Iris-virginica 126 | 125,6.7,3.3,5.7,2.1,Iris-virginica 127 | 126,7.2,3.2,6.0,1.8,Iris-virginica 128 | 127,6.2,2.8,4.8,1.8,Iris-virginica 129 | 128,6.1,3.0,4.9,1.8,Iris-virginica 130 | 129,6.4,2.8,5.6,2.1,Iris-virginica 131 | 130,7.2,3.0,5.8,1.6,Iris-virginica 132 | 131,7.4,2.8,6.1,1.9,Iris-virginica 133 | 132,7.9,3.8,6.4,2.0,Iris-virginica 134 | 133,6.4,2.8,5.6,2.2,Iris-virginica 135 | 
134,6.3,2.8,5.1,1.5,Iris-virginica 136 | 135,6.1,2.6,5.6,1.4,Iris-virginica 137 | 136,7.7,3.0,6.1,2.3,Iris-virginica 138 | 137,6.3,3.4,5.6,2.4,Iris-virginica 139 | 138,6.4,3.1,5.5,1.8,Iris-virginica 140 | 139,6.0,3.0,4.8,1.8,Iris-virginica 141 | 140,6.9,3.1,5.4,2.1,Iris-virginica 142 | 141,6.7,3.1,5.6,2.4,Iris-virginica 143 | 142,6.9,3.1,5.1,2.3,Iris-virginica 144 | 143,5.8,2.7,5.1,1.9,Iris-virginica 145 | 144,6.8,3.2,5.9,2.3,Iris-virginica 146 | 145,6.7,3.3,5.7,2.5,Iris-virginica 147 | 146,6.7,3.0,5.2,2.3,Iris-virginica 148 | 147,6.3,2.5,5.0,1.9,Iris-virginica 149 | 148,6.5,3.0,5.2,2.0,Iris-virginica 150 | 149,6.2,3.4,5.4,2.3,Iris-virginica 151 | 150,5.9,3.0,5.1,1.8,Iris-virginica -------------------------------------------------------------------------------- /L05/code/data/iris_mod.csv: -------------------------------------------------------------------------------- 1 | Id,SepalLength[cm],SepalWidth[cm],PetalLength[cm],PetalWidth[cm],Color_IMadeThisUp,Species 2 | 1,5.1,3.5,1.4,0.2,red,Iris-setosa 3 | 2,4.9,3,1.4,0.2,red,Iris-setosa 4 | 3,4.7,3.2,1.3,0.2,red,Iris-setosa 5 | 4,4.6,3.1,1.5,0.2,red,Iris-setosa 6 | 5,5,3.6,1.4,0.2,red,Iris-setosa 7 | 6,5.4,3.9,1.7,0.4,red,Iris-setosa 8 | 7,4.6,3.4,1.4,0.3,red,Iris-setosa 9 | 8,5,3.4,1.5,0.2,blue,Iris-setosa 10 | 9,4.4,2.9,1.4,0.2,red,Iris-setosa 11 | 10,4.9,3.1,1.5,0.1,red,Iris-setosa 12 | 11,5.4,3.7,1.5,0.2,blue,Iris-setosa 13 | 12,4.8,3.4,1.6,0.2,red,Iris-setosa 14 | 13,4.8,3,1.4,0.1,red,Iris-setosa 15 | 14,4.3,3,1.1,0.1,red,Iris-setosa 16 | 15,5.8,4,1.2,0.2,red,Iris-setosa 17 | 16,5.7,4.4,1.5,0.4,red,Iris-setosa 18 | 17,5.4,3.9,1.3,0.4,red,Iris-setosa 19 | 18,5.1,3.5,1.4,0.3,red,Iris-setosa 20 | 19,5.7,3.8,1.7,0.3,red,Iris-setosa 21 | 20,5.1,3.8,1.5,0.3,blue,Iris-setosa 22 | 21,5.4,3.4,1.7,0.2,red,Iris-setosa 23 | 22,5.1,3.7,1.5,0.4,red,Iris-setosa 24 | 23,4.6,3.6,1,0.2,red,Iris-setosa 25 | 24,5.1,3.3,1.7,0.5,blue,Iris-setosa 26 | 25,4.8,3.4,1.9,0.2,red,Iris-setosa 27 | 26,5,3,1.6,0.2,red,Iris-setosa 28 | 
27,5,3.4,1.6,0.4,red,Iris-setosa 29 | 28,5.2,3.5,1.5,0.2,red,Iris-setosa 30 | 29,5.2,3.4,1.4,0.2,red,Iris-setosa 31 | 30,4.7,3.2,1.6,0.2,violet,Iris-setosa 32 | 31,4.8,3.1,1.6,0.2,red,Iris-setosa 33 | 32,5.4,3.4,1.5,0.4,red,Iris-setosa 34 | 33,5.2,4.1,1.5,0.1,red,Iris-setosa 35 | 34,5.5,4.2,1.4,0.2,red,Iris-setosa 36 | 35,4.9,3.1,1.5,0.1,red,Iris-setosa 37 | 36,5,3.2,1.2,0.2,violet,Iris-setosa 38 | 37,5.5,3.5,1.3,0.2,red,Iris-setosa 39 | 38,4.9,3.1,1.5,0.1,red,Iris-setosa 40 | 39,4.4,3,1.3,0.2,red,Iris-setosa 41 | 40,5.1,3.4,1.5,0.2,red,Iris-setosa 42 | 41,5,3.5,1.3,0.3,red,Iris-setosa 43 | 42,4.5,2.3,1.3,0.3,red,Iris-setosa 44 | 43,4.4,3.2,1.3,0.2,red,Iris-setosa 45 | 44,5,3.5,1.6,0.6,red,Iris-setosa 46 | 45,5.1,3.8,1.9,0.4,red,Iris-setosa 47 | 46,4.8,3,1.4,0.3,red,Iris-setosa 48 | 47,5.1,3.8,1.6,0.2,red,Iris-setosa 49 | 48,4.6,3.2,1.4,0.2,red,Iris-setosa 50 | 49,5.3,3.7,1.5,0.2,red,Iris-setosa 51 | 50,5,3.3,1.4,0.2,red,Iris-setosa 52 | 51,7,3.2,4.7,1.4,blue,Iris-versicolor 53 | 52,6.4,3.2,4.5,1.5,blue,Iris-versicolor 54 | 53,6.9,3.1,4.9,1.5,blue,Iris-versicolor 55 | 54,5.5,2.3,4,1.3,blue,Iris-versicolor 56 | 55,6.5,2.8,4.6,1.5,blue,Iris-versicolor 57 | 56,5.7,2.8,4.5,1.3,blue,Iris-versicolor 58 | 57,6.3,3.3,4.7,1.6,blue,Iris-versicolor 59 | 58,4.9,2.4,3.3,1,blue,Iris-versicolor 60 | 59,6.6,2.9,4.6,1.3,blue,Iris-versicolor 61 | 60,5.2,2.7,3.9,1.4,blue,Iris-versicolor 62 | 61,5,2,3.5,1,blue,Iris-versicolor 63 | 62,5.9,3,4.2,1.5,blue,Iris-versicolor 64 | 63,6,2.2,4,1,blue,Iris-versicolor 65 | 64,6.1,2.9,4.7,1.4,blue,Iris-versicolor 66 | 65,5.6,2.9,3.6,1.3,blue,Iris-versicolor 67 | 66,6.7,3.1,4.4,1.4,red,Iris-versicolor 68 | 67,5.6,3,4.5,1.5,blue,Iris-versicolor 69 | 68,5.8,2.7,4.1,1,blue,Iris-versicolor 70 | 69,6.2,2.2,4.5,1.5,blue,Iris-versicolor 71 | 70,5.6,2.5,3.9,1.1,violet,Iris-versicolor 72 | 71,5.9,3.2,4.8,1.8,blue,Iris-versicolor 73 | 72,6.1,2.8,4,1.3,blue,Iris-versicolor 74 | 73,6.3,2.5,4.9,1.5,blue,Iris-versicolor 75 | 
74,6.1,2.8,4.7,1.2,blue,Iris-versicolor 76 | 75,6.4,2.9,4.3,1.3,blue,Iris-versicolor 77 | 76,6.6,3,4.4,1.4,blue,Iris-versicolor 78 | 77,6.8,2.8,4.8,1.4,blue,Iris-versicolor 79 | 78,6.7,3,5,1.7,blue,Iris-versicolor 80 | 79,6,2.9,4.5,1.5,blue,Iris-versicolor 81 | 80,5.7,2.6,3.5,1,violet,Iris-versicolor 82 | 81,5.5,2.4,3.8,1.1,blue,Iris-versicolor 83 | 82,5.5,2.4,3.7,1,red,Iris-versicolor 84 | 83,5.8,2.7,3.9,1.2,blue,Iris-versicolor 85 | 84,6,2.7,5.1,1.6,blue,Iris-versicolor 86 | 85,5.4,3,4.5,1.5,blue,Iris-versicolor 87 | 86,6,3.4,4.5,1.6,blue,Iris-versicolor 88 | 87,6.7,3.1,4.7,1.5,blue,Iris-versicolor 89 | 88,6.3,2.3,4.4,1.3,violet,Iris-versicolor 90 | 89,5.6,3,4.1,1.3,blue,Iris-versicolor 91 | 90,5.5,2.5,4,1.3,blue,Iris-versicolor 92 | 91,5.5,2.6,4.4,1.2,blue,Iris-versicolor 93 | 92,6.1,3,4.6,1.4,blue,Iris-versicolor 94 | 93,5.8,2.6,4,1.2,violet,Iris-versicolor 95 | 94,5,2.3,3.3,1,blue,Iris-versicolor 96 | 95,5.6,2.7,4.2,1.3,violet,Iris-versicolor 97 | 96,5.7,3,4.2,1.2,blue,Iris-versicolor 98 | 97,5.7,2.9,4.2,1.3,blue,Iris-versicolor 99 | 98,6.2,2.9,4.3,1.3,blue,Iris-versicolor 100 | 99,5.1,2.5,3,1.1,blue,Iris-versicolor 101 | 100,5.7,2.8,4.1,1.3,blue,Iris-versicolor 102 | 101,6.3,3.3,6,2.5,violet,Iris-virginica 103 | 102,5.8,2.7,5.1,1.9,violet,Iris-virginica 104 | 103,7.1,3,5.9,2.1,violet,Iris-virginica 105 | 104,6.3,2.9,5.6,1.8,violet,Iris-virginica 106 | 105,6.5,3,5.8,2.2,violet,Iris-virginica 107 | 106,7.6,3,6.6,2.1,violet,Iris-virginica 108 | 107,4.9,2.5,4.5,1.7,violet,Iris-virginica 109 | 108,7.3,2.9,6.3,1.8,violet,Iris-virginica 110 | 109,6.7,2.5,5.8,1.8,violet,Iris-virginica 111 | 110,7.2,3.6,6.1,2.5,violet,Iris-virginica 112 | 111,6.5,3.2,5.1,2,violet,Iris-virginica 113 | 112,6.4,2.7,5.3,1.9,violet,Iris-virginica 114 | 113,6.8,3,5.5,2.1,violet,Iris-virginica 115 | 114,5.7,2.5,5,2,violet,Iris-virginica 116 | 115,5.8,2.8,5.1,2.4,violet,Iris-virginica 117 | 116,6.4,3.2,5.3,2.3,violet,Iris-virginica 118 | 117,6.5,3,5.5,1.8,violet,Iris-virginica 119 | 
118,7.7,3.8,6.7,2.2,violet,Iris-virginica 120 | 119,7.7,2.6,6.9,2.3,violet,Iris-virginica 121 | 120,6,2.2,5,1.5,violet,Iris-virginica 122 | 121,6.9,3.2,5.7,2.3,blue,Iris-virginica 123 | 122,5.6,2.8,4.9,2,violet,Iris-virginica 124 | 123,7.7,2.8,6.7,2,violet,Iris-virginica 125 | 124,6.3,2.7,4.9,1.8,violet,Iris-virginica 126 | 125,6.7,3.3,5.7,2.1,blue,Iris-virginica 127 | 126,7.2,3.2,6,1.8,violet,Iris-virginica 128 | 127,6.2,2.8,4.8,1.8,violet,Iris-virginica 129 | 128,6.1,3,4.9,1.8,violet,Iris-virginica 130 | 129,6.4,2.8,5.6,2.1,blue,Iris-virginica 131 | 130,7.2,3,5.8,1.6,violet,Iris-virginica 132 | 131,7.4,2.8,6.1,1.9,violet,Iris-virginica 133 | 132,7.9,3.8,6.4,2,violet,Iris-virginica 134 | 133,6.4,2.8,5.6,2.2,violet,Iris-virginica 135 | 134,6.3,2.8,5.1,1.5,red,Iris-virginica 136 | 135,6.1,2.6,5.6,1.4,violet,Iris-virginica 137 | 136,7.7,3,6.1,2.3,violet,Iris-virginica 138 | 137,6.3,3.4,5.6,2.4,violet,Iris-virginica 139 | 138,6.4,3.1,5.5,1.8,violet,Iris-virginica 140 | 139,6,3,4.8,1.8,blue,Iris-virginica 141 | 140,6.9,3.1,5.4,2.1,violet,Iris-virginica 142 | 141,6.7,3.1,5.6,2.4,violet,Iris-virginica 143 | 142,6.9,3.1,5.1,2.3,violet,Iris-virginica 144 | 143,5.8,2.7,5.1,1.9,violet,Iris-virginica 145 | 144,6.8,3.2,5.9,2.3,violet,Iris-virginica 146 | 145,6.7,3.3,5.7,2.5,violet,Iris-virginica 147 | 146,6.7,3,5.2,2.3,violet,Iris-virginica 148 | 147,6.3,2.5,5,1.9,violet,Iris-virginica 149 | 148,6.5,3,5.2,2,blue,Iris-virginica 150 | 149,6.2,3.4,5.4,2.3,violet,Iris-virginica 151 | 150,5.9,3,5.1,1.8,red,Iris-virginica -------------------------------------------------------------------------------- /L05/code/data/missingdata.csv: -------------------------------------------------------------------------------- 1 | A,B,C,D 2 | 1.,2.,3.,4. 3 | 5.,6.,,8. 
4 | 10.,11.,12., 5 | -------------------------------------------------------------------------------- /L05/code/images/decisionreg.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L05/code/images/decisionreg.pdf -------------------------------------------------------------------------------- /L05/code/images/eda.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L05/code/images/eda.pdf -------------------------------------------------------------------------------- /L05/code/images/estimator-api.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L05/code/images/estimator-api.pdf -------------------------------------------------------------------------------- /L05/code/images/estimator-api.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L05/code/images/estimator-api.png -------------------------------------------------------------------------------- /L05/code/images/holdout-tuning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L05/code/images/holdout-tuning.pdf -------------------------------------------------------------------------------- /L05/code/images/holdout-tuning.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L05/code/images/holdout-tuning.png -------------------------------------------------------------------------------- /L05/code/images/iris-subsampling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L05/code/images/iris-subsampling.pdf -------------------------------------------------------------------------------- /L05/code/images/iris-subsampling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L05/code/images/iris-subsampling.png -------------------------------------------------------------------------------- /L05/code/images/sklearn-pipeline.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L05/code/images/sklearn-pipeline.pdf -------------------------------------------------------------------------------- /L05/code/images/sklearn-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L05/code/images/sklearn-pipeline.png -------------------------------------------------------------------------------- /L05/code/images/transformer-api.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L05/code/images/transformer-api.pdf -------------------------------------------------------------------------------- 
/L05/code/images/transformer-api.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L05/code/images/transformer-api.png -------------------------------------------------------------------------------- /L06/06-trees__notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L06/06-trees__notes.pdf -------------------------------------------------------------------------------- /L06/06-trees__slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L06/06-trees__slides.pdf -------------------------------------------------------------------------------- /L06/code/06-trees_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# STAT451: Machine Learning -- L06: Decision Trees in Scikit-Learn Demo" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "STAT 451: Machine Learning (Fall 2020) \n", 15 | "Instructor: Sebastian Raschka (sraschka@wisc.edu) \n", 16 | "\n", 17 | "Course website: http://stat.wisc.edu/~sraschka/teaching/stat451-fs2020/ " 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "Sebastian Raschka \n", 30 | "last updated: 2020-10-15 \n", 31 | "\n", 32 | "CPython 3.8.2\n", 33 | "IPython 7.18.1\n", 34 | "\n", 35 | "numpy 1.19.1\n", 36 | "scipy 1.5.0\n", 37 | "matplotlib 3.3.1\n", 38 | "sklearn 0.23.2\n" 39 | ] 40 | 
} 41 | ], 42 | "source": [ 43 | "%load_ext watermark\n", 44 | "%watermark -d -u -a 'Sebastian Raschka' -v -p numpy,scipy,matplotlib,sklearn" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "Class labels: [0 1 2]\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "from sklearn import datasets\n", 62 | "import numpy as np\n", 63 | "\n", 64 | "\n", 65 | "iris = datasets.load_iris()\n", 66 | "X = iris.data[:, [2, 3]]\n", 67 | "y = iris.target\n", 68 | "\n", 69 | "print('Class labels:', np.unique(y))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 3, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "from sklearn.model_selection import train_test_split\n", 79 | "\n", 80 | "\n", 81 | "X_train, X_test, y_train, y_test = train_test_split(\n", 82 | " X, y, test_size=0.3, random_state=1, stratify=y)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "Labels counts in y: [50 50 50]\n", 95 | "Labels counts in y_train: [35 35 35]\n", 96 | "Labels counts in y_test: [15 15 15]\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "print('Labels counts in y:', np.bincount(y))\n", 102 | "print('Labels counts in y_train:', np.bincount(y_train))\n", 103 | "print('Labels counts in y_test:', np.bincount(y_test))" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 7, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAagAAAEYCAYAAAAJeGK1AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAuUUlEQVR4nO3de3xU9bX38c/KBQKEmwJyVVAoWnwUFLEWq55avPIItR7Ea+3lUK22tbWPx9o+p7WnzzmeHuurVqxIvaJWREWkVEWOrRdoVZByEUTuGi4CieESIECS9fwxOyEJk2QnmZk9M/m+X695Zfae3/z2mtRmsfdes37m7oiIiKSbnKgDEBERiUcJSkRE0pISlIiIpCUlKBERSUtKUCIikpaUoEREJC1FlqDMrMDM3jOzpWa2wszuijPmPDPbZWZLgse/RRGriIikXl6Exz4AfNndy8wsH5hvZq+4+zv1xr3t7mMjiE9ERCIUWYLy2DeEy4LN/OChbw2LiAgQ7RkUZpYLvA8MBh5w93fjDDvLzJYCW4Afu/uKBuaaBEwCeOhXPzx90kSddIkk2//507OcPuaLUYchGW7isK9bvP2RJih3rwSGm1k34EUzO9ndP6g1ZDFwXHAZ8BJgFjCkgbmmAlMB+Nv9zra4eUxEEmjDB+uVoCRp0qKKz913Am8AF9Xbv9vdy4LnLwP5ZtYj5QGKiEjKRVnF1zM4c8LMOgBfAVbVG9PbzCx4PopYvCUpDlVERCIQ5SW+PsATwX2oHGCGu88xsxsB3H0KcAVwk5lVAPuBia726yIibUKUVXzLgBFx9k+p9XwyMDkRxztEPpsKPkd5TqdETJcUBVV76V++mnwORR2KiEjkIi2SSKVNBZ+jc8/jGNitE8FVw7Ti7pTs3MumHTCoXAUeIiJpUSSRCuU5nTg6TZMTgJlxdLdOaX2GJyKSSm0mQQFpm5yqpXt8IiKp1KYSlIiIZA4lqBR6dd5fGDpiNINP/QJ3/+b+qMMREUlrSlApUllZyc23/YRXZv6RlQvf4pnnX2Tlqo+iDktEJG21mSq+5hg1ZjzFpbuP2N+jexfemzerRXO+t+gfDD5+EMcPOg6AiV8bz0tz5vL5E4e2JlQRkaylBBVHceluht105NevVjx4S4vn3Lx1KwP69a3Z7t+vD+8uWtzi+UREsp0u8aVIvAYYqtoTEWmYElSK9O/bl6LNW2q2N23eSt/evSOMSEQkvSlBpcgZpw9nzbr1bNj4MQcPHmT6C7O47NILog5LRCRt6R5UiuTl5TH5nv/gwvFXUVlVyTevu4phJ50YdVgiImlLCSqOHt27xC2I6NG9S6vmveTCr3DJhV9p1RwiIm2FElQcLS0lFxGRxNE9KBERSUtKUCIikpaUoEREJC1FlqDMrMDM3jOzpWa2wszuijPGzOx3ZrbWzJaZ2WlRxCoiIqkXZZHEAeDL7l5mZvnAfDN7xd3fqTXmYmBI8DgTeDD4KSIiWS6yMyiPKQs284NH/X5A44Bpwdh3gG5m1ieVcSbSN2+6lV6DhnHyqHOjDkVEJO1Feg/KzHLNbAmwHZjn7u/WG9IPKKq1vSnYl5FuuOZKXn3xmajDEBHJCJEmKHevdPfhQH9glJmdXG9IvG6qR3ZdBcxskpktMrNFU19akJD4iotL+NqV11BS8llC5jvn7LM4qnu3hMwlIpLt0qKKz913Am8AF9V7aRMwoNZ2f2ALcbj7VHcf6e4jJ40bnZC4pj31FKUblvLEk08mZD4REQkvyiq+nmbWLXjeAfgKsKresNnA9UE13xeAXe6+NRXxFReXMOfF53lwwgDmvPh8ws6iREQknCjPoPoAfzWzZcBCYveg5pjZjWZ2YzDmZWA9sBb4A/DdVAU37amnGHsCDD2mA2NPQGdRIiIpFlmZubsvA0bE2T+l1nMHbk5lXHD47GnGxG4AXH9GNyZMf56vX3cdRx99VKrDERFpk9LiHlS6qT576lGYD8R+JuIs6qpv3Mh
Z54/lozXr6D90BI888cdEhCsikpXUzTyON958my1F5fxxWVGd/X1L3+ZHt/6gxfM+89iUpgeJiAigBBXX7BefjzoEEZE2T5f4REQkLSlBiYhIWlKCEhGRtKQEJSIiaUkJSkRE0pISVIoUbdrMP11yOSed/iWGnXEO9/3+D1GHJCKS1lRmniJ5eXn85j9+wWnDT2HPnjJO/9IFjPnyOXz+xKFRhyYikpZ0BtWAV+b9hQuvnMDnzjqLC6+cwCvz/tKq+fr0PobThp8CQOfOhZw0dAibt3yaiFBFRLKSzqDieGXeX7j9N7/g+K/24+xBp1KyYRe3/+YXAFw85sutnn/jx5/wj2UfcObI01o9l4hIttIZVBy/fXgKx3+1Hz0HdycnN4eeg7tz/Ff78duHW9+qqKxsL1+79tv89u5f0qVL5wREKyKSnZSg4tjwSRFHD+paZ9/Rg7qy4ZOiBt4RzqFDh/jatd/imgmXc/m4S1s1l4hItlOCimPQsQMo2bCrzr6SDbsYdOyABt7RNHfnWzf/kJOGDuFH37ux6TeIiLRxSlBx3PrtG1n/4mZ2rC2lqrKKHWtLWf/iZm79dssTy4K/v8eTzzzPX96cz/Avns/wL57Py3P/J4FRi4hkFxVJxFFdCPHbh6cw/5OlDDp2AL++7RetKpA4+4tn4ntUtSciEpYSVAMuHvPlhFTsiYhIy0SWoMxsADAN6A1UAVPd/b56Y84DXgI2BLtmuvsvUximiLRhS+Yv49UZr7FjSzE9+/bgc8OGsHrFmprtiyZcwPCzT4k6zKwV5RlUBXCbuy82s87A+2Y2z91X1hv3truPTcQB3R0zS8RUSeHuUYcgIoEl85fx7GMzGDi+DwMHnsS2lSXMfXoux53bjzP+5SR2bSzj2cdmAChJJUlkRRLuvtXdFwfP9wAfAv2SdbyCqr2U7NybtknA3SnZuZeCqr1RhyIiwKszXmPg+D50P6ELObk5WNcqBk3sQ/HKUnJyc+h+QhcGju/DqzNeizrUrJUW96DMbCAwAng3zstnmdlSYAvwY3df0cAck4BJAA/dfiWTxo2u83r/8tVs2gE7SjolMvSEKqjaS//y1VGHISLAji3FDBx4Us12xaFDdB1cyMbp22r2dR1YyOotrft+pDQs8gRlZoXAC8Ct7r673suLgePcvczMLgFmAUPizePuU4GpAPzt/iNOk/I5xKDyuLlNROQIPfv2YNfGMrqf0AWAvPx8dq0to2PPgpoxuzaW0bNvj6hCzHqRJigzyyeWnJ5295n1X6+dsNz9ZTP7vZn1cPfiVMYpIolVv/ggkcUGMx96iXmzXqd8bzkFnQoYM/58Lv/OuGbPc9GEC2L3mMbHzpR8Vw4bpm/luHP7UVVZxa6NZWyctZUrvzEhIXHLkaKs4jPgEeBDd7+3gTG9gW3u7mY2itg9s5IUhikiCVa/+CCRxQYzH3qJuX+ey6Dr+tB1cCG71pYxd/pcgGYnqepYXp3xGqu3FNGzbw8uvPRCVq9Yw8K7PqRn3x5c+Y0JKpBIoijPoEYD1wHLzWxJsO9O4FgAd58CXAHcZGYVwH5goqdrlYOIhFK7+ACI/Rwf29/aP/bzZr3OoOv60P3EWCPm7id2hokw78nXW3QWNfzsU5SAIhRZgnL3+UCjNd/uPhmYnJqIRCQV6hcfQOKKDcr3ltN1cGHduQcXUr5XhQyZSL34RCSlqosPaktUsUFBpwJ2ra0399oyCjoVNPAOSWdKUCKSUhdNuICNs7ZSum43VZVVlK7bzcZZW7lowgWtnnvM+PPZMH0rpav2UFXhlK7aw4bpWxkz/vwERC6pFnmZuYi0LfGKD6or4e7+/j2taitUfZ9p3pOvU763iIJOBVw4/sJQ95/CVBYms/qwpdIxpkRRghKRlKtffJDItkKXf2dcswsiwlQWJrP6sKXSMaZE0iU+EYlc1G2F6h8/3vHCjEm1dIwpkZSgRCRyO7YU03Xg4eq76rZC+3aU1+zrOrC
QHVuS8x39+sePd7wwY1ItHWNKJCUoEYlc/cq+VLcVClNZmMzqw5ZKx5gSSfegRCRyiWwrFLZooPa49u3bs2P6dj438Ti6DiyMe7z6Maai1VFTnyWKmFJJCUpEIpeotkJhiwbijfvw8Q0UPb+D1XuL4h6voerDZBUjhPksqY4p1ZSgRCQtJKKtUNg2SvHGnXTDIErn7edXT/wiqTGGFfazZHM7Jt2DEpGsEbZoIBOKCzIhxmRTghKRrBG2aCATigsyIcZk0yU+EckYYYoGnnzwKY4+rzPte+ZyYEclJW/sYfQ5o4/oUrFw1sK0Ky5obuFGtlOCEpGMELYAorK8ik//WsKBskO0L8ynco/z1mtvc9INg2ret3DWQs4YdQar561Jm+KClhRuZDslKBHJCGGKBl6d8Ron3TCoZgzA2gUb+fSvJUe8b/W8Ndzxux+n+mM0qKWFG9lM96BEJCO0tNtD+565HCg71Oj70oGKIo6kBCUiGaGl3R4O7KikfWF+o+9LByqKOFJkl/jMbAAwDegNVAFT3f2+emMMuA+4BNgH3ODui1Mdq4gkVkuWiAjTNSHemJI39pBzII/SdbsjKzYI83njxf7h4xsoLCzktivuyLqlNMJoMEGZ2eUh3l/u7i+38NgVwG3uvtjMOgPvm9k8d19Za8zFwJDgcSbwYPBTRDJUS5eICNM1Id6Y6266tsn3JVPYz1s/9vbt25NbkMOAK3rWJKxsWkojDHP3+C+YlQAvAdbI+89x9xMSEojZS8Bkd59Xa99DwBvu/kyw/RFwnrtvbXSyv90f/0OJSEJd8ehsrvjhtc16z93fv4fuYzrUKWQoXbeb0nn706poIVFa+nnb0u9p4rCvx80zjV3ie8Xdv9nYpGb2VKuiOjzPQGAE8G69l/oBRbW2NwX7jkhQZjYJmATw0O1XMmnc6ESEJiIJtmNLMQMHnlRnX9eBhazeUtTAOzJbSz9vW/s9xdNgkYS7N/nPojBjmmJmhcALwK3uvrv+y/EO20AsU919pLuPVHISSV9trRigpZ+3rf2e4mmyis/Mcs3sMjP7vpn9qPqRiIObWT6x5PS0u8+MM2QTMKDWdn9gSyKOLSLRuGjCBWyctZXSdbupqqyidN1uNs7aykUTLog6tKRo6edta7+neMJU8f0JKAeWE6u2S4igQu8R4EN3v7eBYbOBW8xsOrHiiF1N3n8SkbRTv4qtkxXyzj1LqayoIjcvh5GjT8+IG/8tqT5s6ZIY2b6URhhhElR/d0/Gb2Q0cB2w3MyWBPvuBI4FcPcpwMvESszXEisz/0YS4hCRJKpfxfbRnzZQtOBTBn+zH0ed1IVda8tYNn0pMx96icu/My7qcBvU0urD6tdbkliyeSmNMMIkqFfM7AJ3fy2RB3b3+TReIYjHSgxvTuRxRSS16rfw2bHyM46/ug/tu7cjJ8/ofmJnmAjznnw9rRNU2PWZJHHCdJJ4B3jRzPab2W4z22Nm9YsZRETiqt/Cp/yzg3Q5oSNVFYfvGHQdXEj53vIowgtNrYhSL0yC+g1wFtDR3bu4e2d379LUm0RE4MhqtIKj2rF73T5y8g7/+dm1toyCTgVRhBeaqupSL8wlvjXAB97QN3pFpM36cMl6brzoe+zfs58OnTtwwVe/wvHDBtUpJKi/9lLPzx/F+j9uZdCEPnTsUcCutWVsmL6VC8df2KIYZj70EvNmvU753nIKOhVw8ohh7Cnb06xChoZofaZoNdhJomaA2ePA8cArwIHq/Y1U3kVPnSREku7fH3+VXz33GifeeCyFx7Wn7OMDrH18M526d2LY9SfU+SN+xqgzWL1iTU3S6FzYmQ/+saImqYwZf36L7j/NfOgl5v55LoMm9qHr4EKKXtvO9r+XcuKVA+k/ok+dJNLcJFW7KKJ+b7z9e8vbZG+8ZGlJJ4lqG4JHu+AhIsLkV96m+8md6XxCB9rlgp3QgXZH53P0OZ1StvbSvFmvM+i6PrFCC6Bs3T6Ov7oPFFa
Sk5vTqkIGrc8UvSYTlLvflYpARCSz7Nl7gB7dCsg1yDHINajYW0nBMXX/HZvM9jzle8vpOrhWAUZxrABj39aaiz0tPr5aDUUvTCeJeWbWrdZ2dzObm9SoRCTtFXZsx4HPDlJd65CXA3kdc9i/7WCdccksJCjoFLuHVbPdIyjAyDl8xailx1dRRPTCVPH1dPed1RvuXgr0SlpEIpIRTu/Xl50ryti5Zi+VFc7ONXs5UHKQbXNL47bn2V26h/u+dy97dpY1PXlIY8afz4bpWyldtYeqCqfwhI6s/+NW2JXb6vZAajUUvTD3oCrN7Fh3/wTAzI6jgYatItJ25OdA7gFYOWUzlZVObq5RaPkc1aE7pfP2H9Ge55VH55CzZSPzZ77Bxd8cm5AYqgsr5j35OuV7iyjoVMCIESPYs2gPC2d/2Kr2QGo1FL0wVXwXAVOBN4Nd5wCT3D19L/Opik8kJcKuB7W7dA9Tv/sfPDC2EzfP2ct3HvwpnbsVNvk+aRsaquJr8hKfu78KnAY8C8wATk/r5CQiaWfBi29y2eAchhzTnssG5zB/5htRhyQZoMEEZWa9q5+7e7G7z3H3P7l7cbwxIiLx7C7dw4p5C7jq9NgZ01WnF7Ji3oKE3ouS7NTYGdTLId4fZoyIZIjinWV87Y4plOzam7A5q8+eji6M3fI+ujCPywbn8PrTcxNeNNEcS+Yv4+7v38NtV9zB3d+/hyXzl0UShzSssSKJU5toCmuAmsaKZJFpf/4bpZ8W8cScBfzomsRUq616bwXvfXqAZ5Zvq7P/0NJ3OSa/PKFFE2G1ZukMSZ0GE5S756YyEBGJVvHOMua8uZAHL+/BTXMW8vWxozm6a6dWz/vDKf96xL7qool7x/bi5jkLOPvy81JaNKGlMzJDmO9BiUgbMO3Pf2Ps4ByG9mrP2ME5PDFnQdKOFXXRhJbOyAxKUCJSc/Z0/WmxM6brT+vEnDcXJvReVLV0KJpQl4jMEGmCMrNHzWy7mX3QwOvnmdkuM1sSPP4t1TGKtAXVZ089gkKGHoV5STuLaqhoIpVnUeoSkRnCdJLAzHKBY2qPr+4s0UqPA5OBaY2MedvdU3sHVSTDFe8s4zt3P8XUn1wX6j7SG4tXs2X7AR5euJmi4j0M6NGZjgX59N22+ohiidpzN2TLhq3897f+g9sf+yl9jutdZ12l8j17ydlfyW/ml3Cooor8vBy65LWj744VKSuWUJeIzNBkgjKz7wE/B7YB1Ws0O9Dq/yXd/S0zG9jaeUSkruZW483+zS0AjLjulww5ai9e0J5FT8a/YFF77oY8d8/T9M3dx4xfP8W5V11Qp2Ju28oSVj69luPOHcDxF/SvWbPpn669uGUftoWGn32KElKaC3OJ7wfAUHcf5u7/K3ik8n/Vs8xsqZm9YmbDUnhckYxUuxqvOfeR3lm+gdKSEh4d14HSkhLeW7mxybkPHDh0xJgtG7ay7YOP+MP4QrZ98BGzp82pqZjLyc3BulYxaGIfileW1qzZNHB8H16d8VprP7pkmTAJqgjYlexAGrAYOM7dTwXuB2Y1NNDMJpnZIjNbNPWl5FUfiaS7llbj3fTrJ7nmlHyG987jmlPy+c5/Hnnlvf7cn3y06Ygxz93zNFefnMcpffK4+uQ8PllTVKdiruLQIboOLmTfjvKafaqgk3gaa3X0IzP7EbAeeMPMflK9L9ifdO6+293LgucvA/lmFrfMxt2nuvtIdx85adzoVIQnknZaWo1XffZ08xmxxQZvPqPdEWdR8ebesXFrneq76rOnG0cVAHDjqAJyKg5RsmZnzZi8/Hx2rS2jY8+Cmn2qoJN4GjuD6hw8PgHmEVvuvXpfSr5RZ2a9zcyC56OIxVuSimOLpIPmth5qaTVe9dlT386x7+f37ZzLNafk861fPV5z/Oq5uxbksHbTDrp1yGHoUVan+q767Kl359iflt6dc7i0fw4rnlhbUzHnu3LYMH0rPT7fPSkVdGphlD0a6yRxF4C
Z/bO7P1f7NTP750Qc3MyeAc4DepjZJmLFGPnB8acAVwA3mVkFsB+Y6E2tDyKSRZpb7FBdjffH5dvr7I9XjVdb0aelPLKtikcW172ndLCylJ7tDvHEnAU1c095Zxf795fTocN+PjtQxb73DlfffbxiPY8ePMij/6i7qq7ZwTprRF146YWsXrGGhXe1bs2m+tTCKLuEWQ9qsbuf1tS+tKL1oCQLFO8sY8Lt9/Hg2I7cNGcfz/33rQlpPdSa47t7nX2dRpzINXd8I2UxNeXu799D9zEdaloYAZSu203pvP3c8bsfRxiZNKah9aAaPIMys4uBS4B+Zva7Wi91ASoSG56I1Fe3IKE8oQ1cW3p8oO6+OEUSUdqxpZiBA0+qs6/rwEJWbymKKCJpjcbuQW0B3gfKg5/Vj9nAhckPTaTtSmXrobDHf+kv7/Hi6+82WiQRNbUwyi4NJih3X+rujwOD3f2JWo+Z7l6auhBFMl+iih0eeO6vR8zz0cfbGDT+TtYUHb7vVP948Y7f2L4HZ75xREHEuf0O0iO/vE5M9YskoqYWRtmlsUt8y4l1jCAopKsjxV/WFcloiSp2qKh6n6PyD9aZ544HnueovP3cfv9zvPjrm+MeL97xG9v3wqbt5FlVnYKI8oOHOFQJIx84HNPGzyrrFElETS2MsktjrY6q/4u7Ofj5ZPDzGmBf0iISyTItWWepuvVQ/XliBQpdauYp3lnG8lXrmDmhE5fPWMeaou1079yxzvHGfmn4Ecd39yb27WPKz27kxl/9gQfHHtNgkcYVj87mih9em9DfV2uphVH2aKzM/GMAMxvt7rW/+XqHmS0Afpns4ESyQaKKHeLN8/bSNbGuDb3zufrkPG6//zm+dOqQOuP+dfJzTRc7xNkX732pLNIQCdPqqJOZnV29YWZfBFJX6yqSwRJV7BBvnmfn/p3lq9ZxU9C14aZRBSxZuZaZ//NOzbhLhxawfNU6rh7eseZ9M19/l5f++l6jBRBXndqR5avWMfbEglbFLdIaYRLUt4AHzGyjmW0Efg98M6lRiWSJ1q6zVFO08EKsaAHga48VYWYc3F/GlcNy6VMY6/7QpzCXE482zuq1v2bcjKW7ufrkPDi0v+b4PfPLObfvwUYLIKxiP1efnMeclWVHxN3cgg+Rlmryi7o1A826BOOjahwbnr6oK2nistsms2X7kU1Q+/bqEfc+U333Pv0ac+a9yWeH2pGX43y2Zz/d8g6ys6IdO8vKaZ8LOTmH/51ZfqiKdrlGhw7t6ZZ3kKJdTrs8Izc3h17dOwOwuXgP+blQ0C4/KIAoqCmA6NcjNmZ76R4qK6uoIqdmX3Xc5532OebMe5OxY87lbwfK0+4elGSelnxR91p3f6p+Y9jqij53vzehEYpkoTBJqCF1iyv2MeVn/xIULQSdHZ74t7jFFmE6UBwe03ABREMxTbj9vpriik4jTmzx5xNpSmOX+Kr/a+3cwENEkqj+0hZ1ixYavkwYZrmNli7JEWa5DZFEaeyLug8FT//L3e+q/0hRfCJtUv2iiHjFDvGKFsIUZbS0cCPMchsiiRSmSOIDM1tgZneb2SVm1jXpUYm0cfWLK/78YdkRxQ7xznzCFGW0tHAj3vvSrZOEZJfGvqgLgLsPNrNjgS8R+/Lu781sp7sPT3ZwIm1J8c4yvnP3U0z9yXU1nSSmLfmUrZ/txXByzXl82Q56dT+8Em39ZTTCLLfR0iU54r0v3TpJSHYJs9xGf2LJ6VzgVOAzYL67/2fyw2shVfFJBqqu2Bs75tyaRBFvXzpJx04SknkaquILc4nvE+BW4BV3P8vdL03r5CSSgWpX7FXfD4q3T6QtCZOgRgDTgKvN7O9mNs3MvpXkuETalHhVdS2ttBPJFk0mKHdfCjwBPAb8hdilvv+b5LhE2ox41XHx2hHpLEramiYTlJktAv4OfBVYBZzj7gMTcXAze9TMtpvZBw28bmb2OzNba2bLzCx9l5mXNiFMm5946zM1Nld
1G6Pa1XHV7YjCVNqp9ZBkqyar+ICL3X1Hko7/ODCZ2CXEuMcGhgSPM4EHg58ikQizrlO89Zkam+uFom3k5Xid6rjNxeW8vxle3tB0pV1z15oSyRRhysyTlZxw97fMbGAjQ8YB0zxWaviOmXUzsz7uvjVZMYk0JMy6Th99vO2I9ZmGDOjVxFzhWw21JCaRTBWmSCJK/YCiWtubgn1HMLNJZrbIzBZNfUk3kyXxwhQt3PHA80esz9TSuRIVk0imSvcEFa82Pu53nNx9qruPdPeRk8aNjjdEpMXCtAeqPnuqvT7T8lXrjrgXlcw1olRIIdmkwQRlZpc39khRfJuAAbW2+wNbUnRskRph2gNVnz3VXp8p3llUa9eISvQ8IumqsXtQ/7uR1xyYmeBY4pkN3GJm04kVR+zS/SeJQpj2QP/4qIj3Dh7ikX/srDMmv11Rne2WthpqSUwimSz0goVJObjZM8B5QA9gG/BzIB/A3adYbPGpycBFwD7gG+6+qMmJ1epIJCXU6kgSodkLFtZmZpcCw4CC6n3u/svWBuXuVzXxugMN1+mKiEjWCvNF3SnAlcD3iBUt/DNwXJLjEhGRNi5MFd8X3f16oDRYqPAs6hYuiIiIJFyYBLU/+LnPzPoCh4BByQtJREQk3D2oOWbWDfhvYDGxCr6HkxmUiIhImAT1a3c/ALxgZnOIFUqUN/EeERGRVglzie/v1U/c/YC776q9T0REJBkaPIMys97E+t51MLMRHG471AXomILYWmzj1pKoQxBpE7ZvLubdue9EHYZksF79esW+xBRHY5f4LgRuINZe6N5a+3cDdyYotqT478X5UYcgkvXcoV/PU9kwf1vUoUgG63XmwAZfa7KThJl9zd1fSHBMSfWHt9ark4SISAY4sU9nvjSkZ9xOEmHuQS0ws0fM7BUAM/u8mX0roRGKiIjUEyZBPQbMBfoG26uBW5MVkIiICIRLUD3cfQZQBeDuFUBlUqMSEZE2L0yC2mtmRxMsFGhmXwB2JTUqERFp88J8UfdHxNZlOsHMFgA9gSuSGpWIiLR5TSYod19sZucCQ4l9F+ojdz+U9MhERKRNazJBmVkB8F3gbGKX+d42synurnZHIiKSNGEu8U0D9gD3B9tXAU8SWxdKREQkKcIkqKHufmqt7b+a2dJEHNzMLgLuA3KBh9397nqvnwe8BGwIds1MxEq+cth/3nIVZWV7jthfWNiZn0x+JoKIRERiwiSof5jZF9z9HQAzOxNY0NoDm1ku8AAwBtgELDSz2e6+st7Qt919bGuPJ/GVle3h+G/ff8T+9Q9/L4JoREQOC5OgzgSuN7NPgu1jgQ/NbDng7n5KC489Cljr7usBzGw6MA6on6BERKQNCpOgLkrSsfsBRbW2NxFLhvWdFVxS3AL82N1XxJvMzCYBkwCuve1XnHPZVQkOV0REUilMmfnHSTp2vOaA9Zu8LgaOc/cyM7sEmAUMiTeZu08FpoKaxYqIZIMwZ1DJsgkYUGu7P7GzpBruvrvW85fN7Pdm1sPdi1MUY0b7/mVnUlFVN1dXHTqI5eZyVK8+AJQWb2fZ5JvILejIsG//Ju48KqQQkShEmaAWAkPMbBCwGZgIXF17QLBo4jZ3dzMbRaw1k1YjDKmiyjnulml19hU99gOOvvgHHDv4JAA+LVpPZWUln07/WZ3CiMLCzjXPVUghIlGILEG5e4WZ3UKsU3ou8Ki7rzCzG4PXpxBrqXSTmVUA+4GJ3tQCVtIsvQccD8CBHr34f4/PiTgaEZHDojyDwt1fBl6ut29KreeTgcmpjktERKIXppu5iIhIyilBiYhIWor0Ep+0XP0KvarKCsCgqoKc/PbBvkrW3zsBy8uvGeeVFWx79mdUHBNbIHlXSTFVXoV5FT+94XDDjtoVeoWFneMWRNQupBARSTQlqAxVv0Jv36fraddrEFsf/wEDvvk7AA4dPMDmh/6FkT95ts571z/8vZqCiJ/eMLbJCj2VkotIFHSJT0R
E0pISlIiIpCUlKBERSUu6BxWxMG2E4rUsqjx4gE8evrnum3JyqdxTzMEDscWOtz/3c7yqksW//XadYRW7i7nxktNjGw47f/ttLK8dvcb+sGbMrpLD3aTU6khEoqAEFbEwbYTitSzacN/V9Ljs9sM7qioB2Pbsz9gy9V8A8Koqek24C8vNr/Pebc/cyYBbngLgUMkm3KvY9sydlMy5t2aMV1U0K0YRkURTgspYRv7RAzCLNYWvOnSAnPz2WE4u/W+OJbMtj96C5ebTrsexNe+qOnQAyzl8ZTfvqL6YxbZPueXBmv1KPiISNd2DEhGRtKQEJSIiaUmX+NLMiodvo7J8H4fKPqvp7FBVVcnmp39Cv2v+s1lzVe3fA1WVHCz+pM5+r6pKWLwiIsmiBBWx+m2Eyou303vir8jNza1ZCiN/7Yd8OuP/8vHk62vGeeUhNk2+7oj5vPIQRfdfGzyvoHjOPdRfvNgrKyiafG2wcfh9Da0HpVZHIhIFy8bllTJ5yffGWg81d72mRM4lIpIMJ/bpzJeG9LR4r+kelIiIpCUlKBERSUuRJigzu8jMPjKztWZ2R5zXzcx+F7y+zMxOiyJOERFJvciKJMwsF3gAGANsAhaa2Wx3X1lr2MXAkOBxJvBg8DNrJbIgQcUNIpLJoqziGwWsdff1AGY2HRgH1E5Q44BpHqvkeMfMuplZH3ffmvpwUyORve3UJ09EMlmUl/j6AUW1tjcF+5o7BgAzm2Rmi8xs0Vuz9YdZRCTTRXkGFa+ssH55eJgxsZ3uU4GpkNll5iIiEhPlGdQmYECt7f7AlhaMERGRLBRlgloIDDGzQWbWDpgIzK43ZjZwfVDN9wVgVzbffxIRkcMiu8Tn7hVmdgswF8gFHnX3FWZ2Y/D6FOBl4BJgLbAP+EZU8YqISGpF2ovP3V8mloRq75tS67kDN9d/n4iIZD91khARkbSkBCUiImlJCUpERNKSEpSIiKQlJSgREUlLSlAiIpKWlKBERCQtKUGJiEhaUoISEZG0pAQlIiJpSQlKRETSkhKUiIikJSUoERFJS0pQIiKSlpSgREQkLSlBiYhIWlKCEhGRtKQEJSIiaSmSJd/N7CjgWWAgsBGY4O6lccZtBPYAlUCFu49MXZQiIhKlqM6g7gBed/chwOvBdkP+yd2HKzmJiLQtUSWoccATwfMngPERxSEiImkqqgR1jLtvBQh+9mpgnAOvmdn7ZjapsQnNbJKZLTKzRW/NfibB4YqISKol7R6Umf0P0DvOSz9txjSj3X2LmfUC5pnZKnd/K95Ad58KTAX4w1vrvdkBi4hIWklagnL3rzT0mpltM7M+7r7VzPoA2xuYY0vwc7uZvQiMAuImKBERyS5RXeKbDXw9eP514KX6A8ysk5l1rn4OXAB8kLIIRUQkUlElqLuBMWa2BhgTbGNmfc3s5WDMMcB8M1sKvAf82d1fjSRaERFJuUi+B+XuJcD5cfZvAS4Jnq8HTk1xaCIikibUSUJERNKSEpSIiKQlJSgREUlLSlAiIpKWlKBERCQtKUGJiEhaUoISEZG0pAQlIiJpSQlKRETSkhKUiIikJSUoERFJS0pQIiKSlpSgREQkLSlBiYhIWopkuY1k69G5XdQhiIhICIXtG05D5u4pDCU1zGySu0+NOo7mUtyppbhTL1NjV9zRyNZLfJOiDqCFFHdqKe7Uy9TYFXcEsjVBiYhIhlOCEhGRtJStCSpTr7kq7tRS3KmXqbEr7ghkZZGEiIhkvmw9gxIRkQynBCUiImkpqxKUmV1kZh+Z2VozuyPqeMIys0fNbLuZfRB1LM1hZgPM7K9m9qGZrTCzH0QdUxhmVmBm75nZ0iDuu6KOqTnMLNfM/mFmc6KOJSwz22hmy81siZktijqesMysm5k9b2argv/Oz4o6pjDMbGjwu65+7DazW6OOq7my5h6UmeUCq4ExwCZgIXCVu6+MNLAQzOwcoAyY5u4nRx1PWGbWB+jj7ovNrDPwPjA+3X/
nZmZAJ3cvM7N8YD7wA3d/J+LQQjGzHwEjgS7uPjbqeMIws43ASHcvjjqW5jCzJ4C33f1hM2sHdHT3nRGH1SzB38bNwJnu/nHU8TRHNp1BjQLWuvt6dz8ITAfGRRxTKO7+FvBZ1HE0l7tvdffFwfM9wIdAv2ijaprHlAWb+cEjI/6lZmb9gUuBh6OOJduZWRfgHOARAHc/mGnJKXA+sC7TkhNkV4LqBxTV2t5EBvyxzBZmNhAYAbwbcSihBJfJlgDbgXnunhFxA78FbgeqIo6juRx4zczeN7NM6W5wPLADeCy4pPqwmXWKOqgWmAg8E3UQLZFNCcri7MuIfxVnOjMrBF4AbnX33VHHE4a7V7r7cKA/MMrM0v7SqpmNBba7+/tRx9ICo939NOBi4Obgsna6ywNOAx509xHAXiBj7m0DBJclLwOeizqWlsimBLUJGFBruz+wJaJY2ozgHs4LwNPuPjPqeJoruGTzBnBRtJGEMhq4LLifMx34spk9FW1I4bj7luDnduBFYpfk090mYFOts+vniSWsTHIxsNjdt0UdSEtkU4JaCAwxs0HBvxomArMjjimrBcUGjwAfuvu9UccTlpn1NLNuwfMOwFeAVZEGFYK7/8Td+7v7QGL/ff/F3a+NOKwmmVmnoIiG4BLZBUDaV6y6+6dAkZkNDXadD6R1AVAcV5Ghl/cgi9aDcvcKM7sFmAvkAo+6+4qIwwrFzJ4BzgN6mNkm4Ofu/ki0UYUyGrgOWB7czwG4091fji6kUPoATwTVTTnADHfPmJLtDHQM8GLs3zPkAX9091ejDSm07wFPB//oXQ98I+J4QjOzjsSqmr8TdSwtlTVl5iIikl2y6RKfiIhkESUoERFJS0pQIiKSlpSgREQkLSlBiYhIWlKCEmkmM7vBzPqGGPe4mV0Rdn8C4rqz1vOBYbrjB7FsMLMbE3D8K4OVBFSyLwmhBCXSfDcATSaoCNzZ9JC4/o+7T2ntwd39WeDbrZ1HpJoSlLRpwZnGKjN7wsyWBWv/dAxeO93M3gwanM41sz7Bmc9IYl/eXGJmHczs38xsoZl9YGZTgw4bYY9/xDGC/W+Y2X8F61atNrMvBfs7mtmMINZnzexdMxtpZncDHYKYng6mzzWzP1hszavXgq4ZTcVzjJm9aLG1spaa2Rdr/Y4eDj7j02b2FTNbYGZrzCwT2hZJBlKCEoGhwFR3PwXYDXw36DF4P3CFu58OPAr8P3d/HlgEXOPuw919PzDZ3c8I1vLqAIRao6mhY9Qakufuo4BbgZ8H+74LlAax/jtwOoC73wHsD2K6Jhg7BHjA3YcBO4GvhQjrd8Cb7n4qsb5z1d1YBgP3AacAJwJXA2cDP6blZ24ijcqaVkcirVDk7guC508B3wdeBU4G5gUnRLnA1gbe/09mdjvQETiK2B/1P4U47tAmjlHdfPd9YGDw/GxiiQJ3/8DMljUy/wZ3XxJnjsZ8Gbg+mL8S2GVm3YO5lgOY2QrgdXd3M1secl6RZlOCEjlyWRYntnzLCndvdIlvMysAfk9stdgiM/sFUBDyuE0d40Dws5LD/18Nffmw1vur52jyEl/IuapqbVehvyOSJLrEJwLHmll1kriK2BLwHwE9q/ebWb6ZDQvG7AE6B8+rk1FxsC5Wc6rzGjtGQ+YDE4Lxnwf+V63XDgWXDVvjdeCmYP5ci60qKxIJJSiR2FL1Xw8ulx1FbIG6g8SSzX+Z2VJgCfDFYPzjwJSgg/sB4A/AcmAWsWVfQmniGA35PbGktgz4V2AZsCt4bSqwrFaRREv8gNgly+XELgs2lTBFkkbdzKVNs9hS9XOCAoe0FywRku/u5WZ2ArEzns8Fya4l8z1O7PM/n6D4zgN+7O6hCkVEGqNrxyKZpSPw1+BSngE3tTQ5BXYB/25mPVr7XSgzu5JYtWEmLkkvaUhnUCIikpZ0D0pERNKSEpSIiKQlJSgREUlLSlAiIpKWlKBERCQt/X/RoeoPLSkUyAAAAABJRU5
ErkJggg==\n", 114 | "text/plain": [ 115 | "
" 116 | ] 117 | }, 118 | "metadata": { 119 | "needs_background": "light" 120 | }, 121 | "output_type": "display_data" 122 | } 123 | ], 124 | "source": [ 125 | "%matplotlib inline\n", 126 | "import matplotlib.pyplot as plt\n", 127 | "from sklearn.tree import DecisionTreeClassifier\n", 128 | "from mlxtend.plotting import plot_decision_regions\n", 129 | "\n", 130 | "\n", 131 | "tree = DecisionTreeClassifier(criterion='entropy', \n", 132 | " max_depth=2, \n", 133 | " random_state=1)\n", 134 | "tree.fit(X_train, y_train)\n", 135 | "\n", 136 | "\n", 137 | "plot_decision_regions(X_train, y_train, tree)\n", 138 | "\n", 139 | "plt.xlabel('petal length [cm]')\n", 140 | "plt.ylabel('petal width [cm]')\n", 141 | "plt.legend(loc='upper left')\n", 142 | "plt.tight_layout()\n", 143 | "plt.show()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 8, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# you may need to run\n", 153 | "\n", 154 | "# conda install pydotplus\n", 155 | "# conda install graphviz\n", 156 | "\n", 157 | "# in your command line" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 10, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "text/plain": [ 168 | "True" 169 | ] 170 | }, 171 | "execution_count": 10, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "from pydotplus import graph_from_dot_data\n", 178 | "from sklearn.tree import export_graphviz\n", 179 | "\n", 180 | "\n", 181 | "dot_data = export_graphviz(tree,\n", 182 | " filled=True, \n", 183 | " rounded=True,\n", 184 | " class_names=['Setosa', \n", 185 | " 'Versicolor',\n", 186 | " 'Virginica'],\n", 187 | " feature_names=['petal length', \n", 188 | " 'petal width'],\n", 189 | " out_file=None) \n", 190 | "graph = graph_from_dot_data(dot_data) \n", 191 | "graph.write_png('tree.png')" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | 
"source": [ 198 | "from IPython.display import Image\n", 199 | "\n", 200 | "\n", 201 | "Image('tree.png')" 202 | ] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.8.2" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 4 226 | } 227 | -------------------------------------------------------------------------------- /L07/07-ensembles__notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L07/07-ensembles__notes.pdf -------------------------------------------------------------------------------- /L07/07-ensembles__slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L07/07-ensembles__slides.pdf -------------------------------------------------------------------------------- /L08/08-model-eval-1-intro__notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L08/08-model-eval-1-intro__notes.pdf -------------------------------------------------------------------------------- /L08/08-model-eval-1-intro__slides.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L08/08-model-eval-1-intro__slides.pdf -------------------------------------------------------------------------------- /L09/09-eval2-ci__notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L09/09-eval2-ci__notes.pdf -------------------------------------------------------------------------------- /L09/09-eval2-ci__slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L09/09-eval2-ci__slides.pdf -------------------------------------------------------------------------------- /L09/code/09-eval2-ci__5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "STAT 451: Machine Learning (Fall 2020) \n", 8 | "Instructor: Sebastian Raschka (sraschka@wisc.edu) \n", 9 | "\n", 10 | "Course website: http://pages.stat.wisc.edu/~sraschka/teaching/stat451-fs2020/" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# L09: Model Evaluation 2 -- Confidence Intervals and Resampling" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "
\n", 25 | "
\n", 26 | "
" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "# 5. Out-of-Bag Bootstrap" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "In this section, we are going to look at the OOB bootstrap method, which I recently implemented in mlxtend." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 1, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "[3 4 0 1 3] [2]\n", 53 | "[0 0 1 4 4] [2 3]\n", 54 | "[1 2 4 2 4] [0 3]\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "from mlxtend.evaluate import BootstrapOutOfBag\n", 60 | "import numpy as np\n", 61 | "\n", 62 | "\n", 63 | "\n", 64 | "\n", 65 | "oob = BootstrapOutOfBag(n_splits=3, random_seed=1)\n", 66 | "for train, test in oob.split(np.array([1, 2, 3, 4, 5])):\n", 67 | " print(train, test)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "The reason why I chose a object-oriented implementation is that we can plug it into scikit-learn's `cross_val_score` function, which is super convenient." 
75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 2, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "from mlxtend.data import iris_data\n", 84 | "from sklearn.tree import DecisionTreeClassifier\n", 85 | "from sklearn.model_selection import cross_val_score\n", 86 | "from sklearn.model_selection import train_test_split\n", 87 | "\n", 88 | "\n", 89 | "X, y = iris_data()\n", 90 | "\n", 91 | "X_train, X_test, y_train, y_test = train_test_split(\n", 92 | " X, y, test_size=0.4, random_state=123, stratify=y)\n", 93 | "\n", 94 | "\n", 95 | "model = DecisionTreeClassifier(random_state=123)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Below, we are using the standard approach for `cross_val_score` first, which will perform 5-fold cross validation by setting `cv=5`. Note that \n", 103 | "\n", 104 | "- if the model is a scikit-learn classifier, stratified k-fold cross validation will be performed by default, and the reported evaluation metric is the prediction accuracy;\n", 105 | "- if the model is a scikit-learn regressor, standard k-fold cross validation will be performed by default, and the reported evaluation metric is the $R^2$ score on the test folds." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 3, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "CV scores [0.94444444 1. 1. 
0.88888889 0.94444444]\n", 118 | "Mean CV score 0.9555555555555555\n", 119 | "CV score Std 0.04157397096415492\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "cv_scores = cross_val_score(model, X_train, y_train, cv=5)\n", 125 | "print('CV scores', cv_scores)\n", 126 | "print('Mean CV score', np.mean(cv_scores))\n", 127 | "print('CV score Std', np.std(cv_scores))" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Now, let's plug in our OOB object into the `cross_val_score` function:" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 4, 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "Bootstrap scores [0.93548387 0.96774194 0.96875 0.93023256 0.97058824]\n", 147 | "Mean Bootstrap score 0.9545593199770531\n", 148 | "Score Std 0.017819915677477555\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "# 5 splits\n", 154 | "\n", 155 | "bootstrap_scores = \\\n", 156 | " cross_val_score(model, X_train, y_train, \n", 157 | " cv=BootstrapOutOfBag(n_splits=5, random_seed=123))\n", 158 | "\n", 159 | "print('Bootstrap scores', bootstrap_scores)\n", 160 | "print('Mean Bootstrap score', np.mean(bootstrap_scores))\n", 161 | "print('Score Std', np.std(bootstrap_scores))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 5, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "Mean Bootstrap score 0.9483980861793887\n", 174 | "Score Std 0.039817322453014004\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "bootstrap_scores = \\\n", 180 | " cross_val_score(model, X_train, y_train, \n", 181 | " cv=BootstrapOutOfBag(n_splits=200, random_seed=123))\n", 182 | "\n", 183 | "print('Mean Bootstrap score', np.mean(bootstrap_scores))\n", 184 | "print('Score Std', np.std(bootstrap_scores))" 185 | ] 186 | }, 187 | { 188 | "cell_type": 
"code", 189 | "execution_count": 6, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "95% Confidence interval: [83.33, 100.00]\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "lower = np.percentile(bootstrap_scores, 2.5)\n", 202 | "upper = np.percentile(bootstrap_scores, 97.5)\n", 203 | "print('95%% Confidence interval: [%.2f, %.2f]' % (100*lower, 100*upper))" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 7, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "0.95" 215 | ] 216 | }, 217 | "execution_count": 7, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "model.fit(X_train, y_train)\n", 224 | "model.score(X_test, y_test)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "
\n", 232 | "
\n", 233 | "
" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "## MLxtend functional bootstrap API" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "### OOB Bootstrap" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "Below is a more convenient way to compute the OOB Boostrap. Note that it has a tendency to be over-pessimistic." 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 8, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "name": "stdout", 264 | "output_type": "stream", 265 | "text": [ 266 | "Mean Bootstrap score 0.9483980861793887\n", 267 | "Score Std 0.039817322453014004\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "from mlxtend.evaluate import bootstrap_point632_score\n", 273 | "\n", 274 | "bootstrap_scores = bootstrap_point632_score(model, \n", 275 | " X_train, y_train, \n", 276 | " n_splits=200, \n", 277 | " method='oob',\n", 278 | " random_seed=123)\n", 279 | "\n", 280 | "print('Mean Bootstrap score', np.mean(bootstrap_scores))\n", 281 | "print('Score Std', np.std(bootstrap_scores))" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 9, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "name": "stdout", 291 | "output_type": "stream", 292 | "text": [ 293 | "95% Confidence interval: [83.33, 100.00]\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "lower = np.percentile(bootstrap_scores, 2.5)\n", 299 | "upper = np.percentile(bootstrap_scores, 97.5)\n", 300 | "print('95%% Confidence interval: [%.2f, %.2f]' % (100*lower, 100*upper))" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "### .632 Bootstrap" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "The .632 Bootstrap is the default setting of `bootstrap_point632_score`; it tends to 
be overly optimistic." 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 10, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "name": "stdout", 324 | "output_type": "stream", 325 | "text": [ 326 | "Mean Bootstrap score 0.9673875904653735\n", 327 | "Score Std 0.02516454779030485\n" 328 | ] 329 | } 330 | ], 331 | "source": [ 332 | "bootstrap_scores = bootstrap_point632_score(model, \n", 333 | " X_train, y_train, \n", 334 | " n_splits=200,\n", 335 | " random_seed=123)\n", 336 | "print('Mean Bootstrap score', np.mean(bootstrap_scores))\n", 337 | "print('Score Std', np.std(bootstrap_scores))" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 11, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "95% Confidence interval: [89.47, 100.00]\n" 350 | ] 351 | } 352 | ], 353 | "source": [ 354 | "lower = np.percentile(bootstrap_scores, 2.5)\n", 355 | "upper = np.percentile(bootstrap_scores, 97.5)\n", 356 | "print('95%% Confidence interval: [%.2f, %.2f]' % (100*lower, 100*upper))" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "### .632+ Bootstrap" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "The .632+ Boostrap method attempts to address the optimistic bias of the regular .632 Boostrap." 
371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 12, 376 | "metadata": {}, 377 | "outputs": [ 378 | { 379 | "name": "stdout", 380 | "output_type": "stream", 381 | "text": [ 382 | "Mean Bootstrap score 0.9658029542600898\n", 383 | "Score Std 0.027801366648921747\n" 384 | ] 385 | } 386 | ], 387 | "source": [ 388 | "bootstrap_scores = bootstrap_point632_score(model, X_train, y_train, \n", 389 | " n_splits=200, \n", 390 | " method='.632+',\n", 391 | " random_seed=123)\n", 392 | "print('Mean Bootstrap score', np.mean(bootstrap_scores))\n", 393 | "print('Score Std', np.std(bootstrap_scores))" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 13, 399 | "metadata": {}, 400 | "outputs": [ 401 | { 402 | "name": "stdout", 403 | "output_type": "stream", 404 | "text": [ 405 | "95% Confidence interval: [88.40, 100.00]\n" 406 | ] 407 | } 408 | ], 409 | "source": [ 410 | "lower = np.percentile(bootstrap_scores, 2.5)\n", 411 | "upper = np.percentile(bootstrap_scores, 97.5)\n", 412 | "print('95%% Confidence interval: [%.2f, %.2f]' % (100*lower, 100*upper))" 413 | ] 414 | } 415 | ], 416 | "metadata": { 417 | "kernelspec": { 418 | "display_name": "Python 3", 419 | "language": "python", 420 | "name": "python3" 421 | }, 422 | "language_info": { 423 | "codemirror_mode": { 424 | "name": "ipython", 425 | "version": 3 426 | }, 427 | "file_extension": ".py", 428 | "mimetype": "text/x-python", 429 | "name": "python", 430 | "nbconvert_exporter": "python", 431 | "pygments_lexer": "ipython3", 432 | "version": "3.8.2" 433 | } 434 | }, 435 | "nbformat": 4, 436 | "nbformat_minor": 4 437 | } 438 | -------------------------------------------------------------------------------- /L10/10_eval3-cv__notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L10/10_eval3-cv__notes.pdf 
-------------------------------------------------------------------------------- /L10/10_eval3-cv__slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L10/10_eval3-cv__slides.pdf -------------------------------------------------------------------------------- /L10/code/10_04_kfold-eval.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "STAT 451: Machine Learning (Fall 2020) \n", 8 | "Instructor: Sebastian Raschka (sraschka@wisc.edu) \n", 9 | "\n", 10 | "Course website: http://pages.stat.wisc.edu/~sraschka/teaching/stat451-fs2020/" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# L10: Model Evaluation 3 -- Cross-Validation and Model Selection" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "%load_ext watermark\n", 27 | "%watermark -a 'Sebastian Raschka' -u -d -v -p numpy,mlxtend,matplotlib,sklearn" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import numpy as np\n", 37 | "import matplotlib.pyplot as plt" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "

\n", 45 | "\n", 46 | "## K-fold Cross-Validation in Scikit-Learn" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "- Simple demonstration of using a cross-validation iterator in scikit-learn" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "from sklearn.model_selection import KFold\n", 63 | "\n", 64 | "\n", 65 | "rng = np.random.RandomState(123)\n", 66 | "\n", 67 | "y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])\n", 68 | "X = rng.random_sample((y.shape[0], 4))\n", 69 | "\n", 70 | "\n", 71 | "cv = KFold(n_splits=5)\n", 72 | "\n", 73 | "for k in cv.split(X, y):\n", 74 | " print(k)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "

\n", 82 | "\n", 83 | "- In practice, we are usually interested in shuffling the dataset, because if the data records are ordered by class label, this would result in cases where the classes are not well represented in the training and test folds" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "cv = KFold(n_splits=5, random_state=123, shuffle=True)\n", 93 | "\n", 94 | "for k in cv.split(X, y):\n", 95 | " print(k)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "

\n", 103 | "\n", 104 | "- Note that the `KFold` iterator only provides us with the array indices; in practice, we are actually interested in the array values (feature values and class labels)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "cv = KFold(n_splits=5, random_state=123, shuffle=True)\n", 114 | "\n", 115 | "for train_idx, valid_idx in cv.split(X, y):\n", 116 | " print('train labels with shuffling', y[train_idx])" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "

\n", 124 | "\n", 125 | "- As discussed in the lecture, it's important to stratify the splits (very crucial for small datasets!)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "from sklearn.model_selection import StratifiedKFold\n", 135 | "\n", 136 | "cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)\n", 137 | "\n", 138 | "for train_idx, valid_idx in cv.split(X, y):\n", 139 | " print('train labels', y[train_idx])" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "

\n", 147 | "\n", 148 | "- After the illustrations of cross-validation above, the next cell demonstrates how we can actually use the iterators provided through scikit-learn to fit and evaluate a learning algorithm" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "from sklearn.tree import DecisionTreeClassifier\n", 158 | "from mlxtend.data import iris_data\n", 159 | "from sklearn.model_selection import train_test_split\n", 160 | "\n", 161 | "\n", 162 | "X, y = iris_data()\n", 163 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size=0.15, \n", 164 | " shuffle=True, stratify=y)\n", 165 | "\n", 166 | "\n", 167 | "\n", 168 | "cv = StratifiedKFold(n_splits=10, random_state=123, shuffle=True)\n", 169 | "\n", 170 | "kfold_acc = 0.\n", 171 | "for train_idx, valid_idx in cv.split(X_train, y_train):\n", 172 | " clf = DecisionTreeClassifier(random_state=123, max_depth=3).fit(X_train[train_idx], y_train[train_idx])\n", 173 | " y_pred = clf.predict(X_train[valid_idx])\n", 174 | " acc = np.mean(y_pred == y_train[valid_idx])*100\n", 175 | " kfold_acc += acc\n", 176 | "kfold_acc /= 10\n", 177 | " \n", 178 | "clf = DecisionTreeClassifier(random_state=123, max_depth=3).fit(X_train, y_train)\n", 179 | "y_pred = clf.predict(X_test)\n", 180 | "test_acc = np.mean(y_pred == y_test)*100\n", 181 | " \n", 182 | "print('Kfold Accuracy: %.2f%%' % kfold_acc)\n", 183 | "print('Test Accuracy: %.2f%%' % test_acc)\n", 184 | "\n" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "

\n", 192 | "\n", 193 | "- Usually, a more convenient way to use cross-validation through scikit-learn is to use the `cross_val_score` function (note that it performs stratifies splitting for classification by default)\n", 194 | "- (remember to ask students about whitespaces according to pep8)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "from sklearn.model_selection import cross_val_score\n", 204 | "\n", 205 | "\n", 206 | "cv_acc = cross_val_score(estimator=DecisionTreeClassifier(random_state=123, max_depth=3),\n", 207 | " X=X_train,\n", 208 | " y=y_train,\n", 209 | " cv=10,\n", 210 | " n_jobs=-1)\n", 211 | "\n", 212 | "print('Kfold Accuracy: %.2f%%' % (np.mean(cv_acc)*100))" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "

\n", 220 | "\n", 221 | "- `cross_val_score` has unfortunately no way to specify a random seed; this is not an issue in regular use cases, but it is not useful if you want to do \"repeated cross-validation\"\n", 222 | "- The next cell illustrates how we can provide our own cross-validation iterator for convenience (note that the results match or \"manual\" `StratifiedKFold` approach we performed earlier)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "from sklearn.model_selection import cross_val_score\n", 232 | "\n", 233 | "\n", 234 | "cv_acc = cross_val_score(estimator=DecisionTreeClassifier(random_state=123, max_depth=3),\n", 235 | " X=X_train,\n", 236 | " y=y_train,\n", 237 | " cv=StratifiedKFold(n_splits=10, random_state=123, shuffle=True),\n", 238 | " n_jobs=-1)\n", 239 | "\n", 240 | "print('Kfold Accuracy: %.2f%%' % (np.mean(cv_acc)*100))" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "

\n", 248 | "\n", 249 | "## Bootstrap" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "- Recall Bootstrapping from 2 lectures ago? Here I is an iterator I implemented analogous to `KFold`" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "from mlxtend.evaluate import BootstrapOutOfBag\n", 266 | "\n", 267 | "oob = BootstrapOutOfBag(n_splits=5, random_seed=99)\n", 268 | "for train, test in oob.split(np.array([1, 2, 3, 4, 5])):\n", 269 | " print(train, test)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "

\n", 277 | "\n", 278 | "- Analagous the `KFold` iterator, we can use it in the `cross_val_score` function for convenience" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "cv_acc = cross_val_score(estimator=DecisionTreeClassifier(random_state=99, max_depth=3),\n", 288 | " X=X_train,\n", 289 | " y=y_train,\n", 290 | " cv=BootstrapOutOfBag(n_splits=200, random_seed=99),\n", 291 | " n_jobs=-1)\n", 292 | "\n", 293 | "print('OOB Bootstrap Accuracy: %.2f%%' % (np.mean(cv_acc)*100))" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "

\n", 301 | "\n", 302 | "- Analagous to the `cross_val_score` method, you can use the `bootstrap_point632_score`, which implements the .632-Bootstrap method (which is less pesimistically biased than the out-of-bag bootstrap)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "from mlxtend.evaluate import bootstrap_point632_score\n", 312 | "\n", 313 | "\n", 314 | "cv_acc = bootstrap_point632_score(estimator=DecisionTreeClassifier(random_state=123, max_depth=3),\n", 315 | " X=X_train,\n", 316 | " y=y_train,\n", 317 | " random_seed=99)\n", 318 | "\n", 319 | "print('OOB Bootstrap Accuracy: %.2f%%' % (np.mean(cv_acc)*100))" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "- By default, `bootstrap_point632_score` uses the setting `method='.632'`\n", 327 | "- By setting `method='.632+'`, we can also perform the .632+ bootstrap, which corrects for optimism bias, which is shown below" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "cv_acc = bootstrap_point632_score(estimator=DecisionTreeClassifier(random_state=123, max_depth=3),\n", 337 | " X=X_train,\n", 338 | " y=y_train,\n", 339 | " method='.632+',\n", 340 | " n_splits=200,\n", 341 | " random_seed=99)\n", 342 | "\n", 343 | "print('OOB Bootstrap Accuracy: %.2f%%' % (np.mean(cv_acc)*100))" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "- Finally, for your convenience, you can also set `method='oob'`, to run a regular Out-of-bag boostrap:" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "cv_acc = bootstrap_point632_score(estimator=DecisionTreeClassifier(random_state=123, max_depth=3),\n", 360 | " X=X_train,\n", 361 | 
" y=y_train,\n", 362 | " method='oob',\n", 363 | " n_splits=200,\n", 364 | " random_seed=99)\n", 365 | "\n", 366 | "print('OOB Bootstrap Accuracy: %.2f%%' % (np.mean(cv_acc)*100))" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [] 375 | } 376 | ], 377 | "metadata": { 378 | "kernelspec": { 379 | "display_name": "Python 3", 380 | "language": "python", 381 | "name": "python3" 382 | }, 383 | "language_info": { 384 | "codemirror_mode": { 385 | "name": "ipython", 386 | "version": 3 387 | }, 388 | "file_extension": ".py", 389 | "mimetype": "text/x-python", 390 | "name": "python", 391 | "nbconvert_exporter": "python", 392 | "pygments_lexer": "ipython3", 393 | "version": "3.8.2" 394 | } 395 | }, 396 | "nbformat": 4, 397 | "nbformat_minor": 4 398 | } 399 | -------------------------------------------------------------------------------- /L11/11-eval4-algo__notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L11/11-eval4-algo__notes.pdf -------------------------------------------------------------------------------- /L11/11-eval4-algo__slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L11/11-eval4-algo__slides.pdf -------------------------------------------------------------------------------- /L11/code/11-eval4-algo__nested-cv_compact.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "STAT 451: Machine Learning (Fall 2020) \n", 8 | "Instructor: Sebastian Raschka (sraschka@wisc.edu) \n", 9 | "\n", 10 | "Course website: 
http://pages.stat.wisc.edu/~sraschka/teaching/stat451-fs2020/" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# L11: Model Evaluation 4 -- Algorithm Comparison (Nested Cross-Validation)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "\n", 25 | "\n", 26 | "## -- Compact version" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "This notebook illustrates how to implement nested cross-validation in scikit-learn. This notebook is a more compact version of the other notebooks [./11-eval4-algo__nested-cv_verbose1.ipynb](./11-eval4-algo__nested-cv_verbose1.ipynb) and [./11-eval4-algo__nested-cv_verbose2.ipynb](./11-eval4-algo__nested-cv_verbose2.ipynb).\n", 34 | "\n", 35 | "Note that due to using `cross_val_score`, we cannot see the best settings for all the outer training folds here. \n", 36 | "\n", 37 | "" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 1, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "Sebastian Raschka 2020-11-24 \n", 50 | "\n", 51 | "CPython 3.8.2\n", 52 | "IPython 7.18.1\n", 53 | "\n", 54 | "sklearn 0.23.2\n", 55 | "mlxtend 0.18.0.dev0\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "%load_ext watermark\n", 61 | "%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 2, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "import numpy as np\n", 71 | "from sklearn.model_selection import GridSearchCV\n", 72 | "from sklearn.model_selection import train_test_split\n", 73 | "from sklearn.model_selection import StratifiedKFold\n", 74 | "from sklearn.model_selection import cross_val_score\n", 75 | "from sklearn.pipeline import Pipeline\n", 76 | "from sklearn.preprocessing import StandardScaler\n", 77 | "from sklearn.linear_model import 
LogisticRegression\n", 78 | "from sklearn.neighbors import KNeighborsClassifier\n", 79 | "from sklearn.tree import DecisionTreeClassifier\n", 80 | "from sklearn.ensemble import RandomForestClassifier\n", 81 | "from sklearn.svm import SVC\n", 82 | "from mlxtend.data import mnist_data\n", 83 | "from sklearn.metrics import accuracy_score\n", 84 | "\n", 85 | "# Loading and splitting the dataset\n", 86 | "# Note that this is a small (stratified) subset\n", 87 | "# of MNIST; it consists of 5000 samples only, that is,\n", 88 | "# 10% of the original MNIST dataset\n", 89 | "# http://yann.lecun.com/exdb/mnist/\n", 90 | "X, y = mnist_data()\n", 91 | "X = X.astype(np.float32)\n", 92 | "X_train, X_test, y_train, y_test = train_test_split(X, y,\n", 93 | " test_size=0.2,\n", 94 | " random_state=1,\n", 95 | " stratify=y)\n", 96 | "\n", 97 | "# Initializing Classifiers\n", 98 | "clf1 = LogisticRegression(multi_class='multinomial',\n", 99 | " solver='newton-cg',\n", 100 | " random_state=1)\n", 101 | "clf2 = KNeighborsClassifier(algorithm='ball_tree',\n", 102 | " leaf_size=50)\n", 103 | "clf3 = DecisionTreeClassifier(random_state=1)\n", 104 | "clf4 = SVC(random_state=1)\n", 105 | "clf5 = RandomForestClassifier(random_state=1)\n", 106 | "\n", 107 | "# Building the pipelines\n", 108 | "pipe1 = Pipeline([('std', StandardScaler()),\n", 109 | " ('clf1', clf1)])\n", 110 | "\n", 111 | "pipe2 = Pipeline([('std', StandardScaler()),\n", 112 | " ('clf2', clf2)])\n", 113 | "\n", 114 | "pipe4 = Pipeline([('std', StandardScaler()),\n", 115 | " ('clf4', clf4)])\n", 116 | "\n", 117 | "\n", 118 | "# Setting up the parameter grids\n", 119 | "param_grid1 = [{'clf1__penalty': ['l2'],\n", 120 | " 'clf1__C': np.power(10., np.arange(-4, 4))}]\n", 121 | "\n", 122 | "param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),\n", 123 | " 'clf2__p': [1, 2]}]\n", 124 | "\n", 125 | "param_grid3 = [{'max_depth': list(range(1, 10)) + [None],\n", 126 | " 'criterion': ['gini', 'entropy']}]\n", 127 | "\n", 128 | 
"param_grid4 = [{'clf4__kernel': ['rbf'],\n", 129 | " 'clf4__C': np.power(10., np.arange(-4, 4)),\n", 130 | " 'clf4__gamma': np.power(10., np.arange(-5, 0))},\n", 131 | " {'clf4__kernel': ['linear'],\n", 132 | " 'clf4__C': np.power(10., np.arange(-4, 4))}]\n", 133 | "\n", 134 | "param_grid5 = [{'n_estimators': [10, 100, 500, 1000, 10000]}]" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 3, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# Setting up multiple GridSearchCV objects, 1 for each algorithm\n", 144 | "gridcvs = {}\n", 145 | "inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)\n", 146 | "\n", 147 | "for pgrid, est, name in zip((param_grid1, param_grid2,\n", 148 | " param_grid3, param_grid4, param_grid5),\n", 149 | " (pipe1, pipe2, clf3, pipe4, clf5),\n", 150 | " ('Softmax', 'KNN', 'DTree', 'SVM', 'RForest')):\n", 151 | " gcv = GridSearchCV(estimator=est,\n", 152 | " param_grid=pgrid,\n", 153 | " scoring='accuracy',\n", 154 | " n_jobs=-1,\n", 155 | " cv=inner_cv,\n", 156 | " verbose=0,\n", 157 | " refit=True)\n", 158 | " gridcvs[name] = gcv" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 4, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "DTree | outer ACC 76.75% +/- 1.32\n", 171 | "KNN | outer ACC 91.10% +/- 0.58\n", 172 | "RForest | outer ACC 93.98% +/- 0.98\n", 173 | "SVM | outer ACC 91.80% +/- 1.00\n", 174 | "Softmax | outer ACC 89.97% +/- 0.57\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)\n", 180 | "\n", 181 | "\n", 182 | "for name, gs_est in sorted(gridcvs.items()):\n", 183 | " nested_score = cross_val_score(gs_est, \n", 184 | " X=X_train, \n", 185 | " y=y_train, \n", 186 | " cv=outer_cv,\n", 187 | " n_jobs=-1)\n", 188 | " print('%s | outer ACC %.2f%% +/- %.2f' % \n", 189 | " (name, nested_score.mean() * 
100, nested_score.std() * 100))" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "------" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "- Determine the best algorithm from the experiment above; e.g., we find that Random Forest is performing best\n", 204 | "- Now, select hyperparameters for the model based on regular k-fold on the whole training set" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 5, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "Fitting 2 folds for each of 5 candidates, totalling 10 fits\n" 217 | ] 218 | }, 219 | { 220 | "name": "stderr", 221 | "output_type": "stream", 222 | "text": [ 223 | "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.\n", 224 | "[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.5min finished\n" 225 | ] 226 | }, 227 | { 228 | "data": { 229 | "text/plain": [ 230 | "GridSearchCV(cv=StratifiedKFold(n_splits=2, random_state=1, shuffle=True),\n", 231 | "             estimator=RandomForestClassifier(random_state=1), n_jobs=-1,\n", 232 | "             param_grid=[{'n_estimators': [10, 100, 500, 1000, 10000]}],\n", 233 | "             scoring='accuracy', verbose=1)" 234 | ] 235 | }, 236 | "execution_count": 5, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "gcv_model_select = GridSearchCV(estimator=clf5,\n", 243 | "                                param_grid=param_grid5,\n", 244 | "                                scoring='accuracy',\n", 245 | "                                n_jobs=-1,\n", 246 | "                                cv=inner_cv,\n", 247 | "                                verbose=1,\n", 248 | "                                refit=True)\n", 249 | "\n", 250 | "gcv_model_select.fit(X_train, y_train)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 6, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "Accuracy 93.30% (average over k-fold CV 
test folds)\n", 263 | "Best Parameters: {'n_estimators': 10000}\n", 264 | "Training Accuracy: 100.00%\n", 265 | "Test Accuracy: 94.00%\n" 266 | ] 267 | } 268 | ], 269 | "source": [ 270 | "best_model = gcv_model_select.best_estimator_\n", 271 | "\n", 272 | "\n", 273 | "## We can skip the next step because we set refit=True\n", 274 | "## so scikit-learn has already fit the model to the\n", 275 | "## whole training set\n", 276 | "\n", 277 | "# best_model.fit(X_train, y_train)\n", 278 | "\n", 279 | "\n", 280 | "train_acc = accuracy_score(y_true=y_train, y_pred=best_model.predict(X_train))\n", 281 | "test_acc = accuracy_score(y_true=y_test, y_pred=best_model.predict(X_test))\n", 282 | "\n", 283 | "print('Accuracy %.2f%% (average over k-fold CV test folds)' %\n", 284 | " (100 * gcv_model_select.best_score_))\n", 285 | "print('Best Parameters: %s' % gcv_model_select.best_params_)\n", 286 | "\n", 287 | "print('Training Accuracy: %.2f%%' % (100 * train_acc))\n", 288 | "print('Test Accuracy: %.2f%%' % (100 * test_acc))" 289 | ] 290 | } 291 | ], 292 | "metadata": { 293 | "anaconda-cloud": {}, 294 | "kernelspec": { 295 | "display_name": "Python 3", 296 | "language": "python", 297 | "name": "python3" 298 | }, 299 | "language_info": { 300 | "codemirror_mode": { 301 | "name": "ipython", 302 | "version": 3 303 | }, 304 | "file_extension": ".py", 305 | "mimetype": "text/x-python", 306 | "name": "python", 307 | "nbconvert_exporter": "python", 308 | "pygments_lexer": "ipython3", 309 | "version": "3.8.2" 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 4 314 | } 315 | -------------------------------------------------------------------------------- /L11/code/11-eval4-algo__nested-cv_verbose1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "STAT 451: Machine Learning (Fall 2020) \n", 8 | "Instructor: Sebastian Raschka (sraschka@wisc.edu) 
\n", 9 | "\n", 10 | "Course website: http://pages.stat.wisc.edu/~sraschka/teaching/stat451-fs2020/" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# L11: Model Evaluation 4 -- Algorithm Comparison (Nested Cross-Validation)\n", 18 | "\n", 19 | "\n", 20 | "## verbose version 1 (using `StratifiedKFold` directly)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "This notebook illustrates how to implement nested cross-validation in scikit-learn.\n", 28 | "\n", 29 | "\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "Sebastian Raschka 2020-11-24 \n", 42 | "\n", 43 | "CPython 3.8.2\n", 44 | "IPython 7.18.1\n", 45 | "\n", 46 | "sklearn 0.23.2\n", 47 | "mlxtend 0.18.0.dev0\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "%load_ext watermark\n", 53 | "%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "- Setting up classifiers (or pipelines) and the parameter grids for model tuning\n", 61 | "- Remember, the hyperparameter tuning takes place in the inner loop" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 2, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "import numpy as np\n", 71 | "from sklearn.model_selection import GridSearchCV\n", 72 | "from sklearn.model_selection import train_test_split\n", 73 | "from sklearn.model_selection import StratifiedKFold\n", 74 | "from sklearn.model_selection import cross_val_score\n", 75 | "from sklearn.pipeline import Pipeline\n", 76 | "from sklearn.preprocessing import StandardScaler\n", 77 | "from sklearn.linear_model import LogisticRegression\n", 78 | "from sklearn.neighbors import KNeighborsClassifier\n", 79 | "from sklearn.tree import 
DecisionTreeClassifier\n", 80 | "from sklearn.ensemble import RandomForestClassifier\n", 81 | "from sklearn.svm import SVC\n", 82 | "from mlxtend.data import mnist_data\n", 83 | "from sklearn.metrics import accuracy_score\n", 84 | "\n", 85 | "# Loading and splitting the dataset\n", 86 | "# Note that this is a small (stratified) subset\n", 87 | "# of MNIST; it consists of 5000 samples only, that is,\n", 88 | "# 10% of the original MNIST dataset\n", 89 | "# http://yann.lecun.com/exdb/mnist/\n", 90 | "X, y = mnist_data()\n", 91 | "X = X.astype(np.float32)\n", 92 | "X_train, X_test, y_train, y_test = train_test_split(X, y,\n", 93 | " test_size=0.2,\n", 94 | " random_state=1,\n", 95 | " stratify=y)\n", 96 | "\n", 97 | "# Initializing Classifiers\n", 98 | "clf1 = LogisticRegression(multi_class='multinomial',\n", 99 | " solver='newton-cg',\n", 100 | " random_state=1)\n", 101 | "clf2 = KNeighborsClassifier(algorithm='ball_tree',\n", 102 | " leaf_size=50)\n", 103 | "clf3 = DecisionTreeClassifier(random_state=1)\n", 104 | "clf4 = SVC(random_state=1)\n", 105 | "clf5 = RandomForestClassifier(random_state=1)\n", 106 | "\n", 107 | "# Building the pipelines\n", 108 | "pipe1 = Pipeline([('std', StandardScaler()),\n", 109 | " ('clf1', clf1)])\n", 110 | "\n", 111 | "pipe2 = Pipeline([('std', StandardScaler()),\n", 112 | " ('clf2', clf2)])\n", 113 | "\n", 114 | "pipe4 = Pipeline([('std', StandardScaler()),\n", 115 | " ('clf4', clf4)])\n", 116 | "\n", 117 | "\n", 118 | "# Setting up the parameter grids\n", 119 | "param_grid1 = [{'clf1__penalty': ['l2'],\n", 120 | " 'clf1__C': np.power(10., np.arange(-4, 4))}]\n", 121 | "\n", 122 | "param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),\n", 123 | " 'clf2__p': [1, 2]}]\n", 124 | "\n", 125 | "param_grid3 = [{'max_depth': list(range(1, 10)) + [None],\n", 126 | " 'criterion': ['gini', 'entropy']}]\n", 127 | "\n", 128 | "param_grid4 = [{'clf4__kernel': ['rbf'],\n", 129 | " 'clf4__C': np.power(10., np.arange(-4, 4)),\n", 130 | " 
'clf4__gamma': np.power(10., np.arange(-5, 0))},\n", 131 | " {'clf4__kernel': ['linear'],\n", 132 | " 'clf4__C': np.power(10., np.arange(-4, 4))}]\n", 133 | "\n", 134 | "param_grid5 = [{'n_estimators': [10, 100, 500, 1000, 10000]}]" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 3, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# Setting up multiple GridSearchCV objects, 1 for each algorithm\n", 144 | "gridcvs = {}\n", 145 | "inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)\n", 146 | "\n", 147 | "for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3, param_grid4, param_grid5),\n", 148 | " (pipe1, pipe2, clf3, pipe4, clf5),\n", 149 | " ('Softmax', 'KNN', 'DTree', 'SVM', 'RForest')):\n", 150 | " gcv = GridSearchCV(estimator=est,\n", 151 | " param_grid=pgrid,\n", 152 | " scoring='accuracy',\n", 153 | " n_jobs=-1,\n", 154 | " cv=inner_cv,\n", 155 | " verbose=0,\n", 156 | " refit=True)\n", 157 | " gridcvs[name] = gcv" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "- Next, we define the outer loop\n", 165 | "- The training folds from the outer loop will be used in the inner loop for model tuning\n", 166 | "- The inner loop selects the best hyperparameter setting\n", 167 | "- This best hyperparameter setting can be evaluated on both the avg. over the inner test folds and the 1 corresponding test fold of the outer loop" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 4, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "-------------------------------------------------- \n", 180 | "\n", 181 | "Algorithm: DTree\n", 182 | " Inner loop:\n", 183 | "\n", 184 | " Best ACC (avg. 
of inner test folds) 72.59%\n", 185 | " Best parameters: {'criterion': 'gini', 'max_depth': None}\n", 186 | " ACC (on outer test fold) 75.50%\n", 187 | "\n", 188 | " Best ACC (avg. of inner test folds) 74.03%\n", 189 | " Best parameters: {'criterion': 'entropy', 'max_depth': 7}\n", 190 | " ACC (on outer test fold) 78.25%\n", 191 | "\n", 192 | " Best ACC (avg. of inner test folds) 73.88%\n", 193 | " Best parameters: {'criterion': 'entropy', 'max_depth': 9}\n", 194 | " ACC (on outer test fold) 77.38%\n", 195 | "\n", 196 | " Best ACC (avg. of inner test folds) 73.38%\n", 197 | " Best parameters: {'criterion': 'entropy', 'max_depth': 8}\n", 198 | " ACC (on outer test fold) 74.88%\n", 199 | "\n", 200 | " Best ACC (avg. of inner test folds) 73.91%\n", 201 | " Best parameters: {'criterion': 'entropy', 'max_depth': 8}\n", 202 | " ACC (on outer test fold) 77.75%\n", 203 | "\n", 204 | " Outer Loop:\n", 205 | " ACC 76.75% +/- 1.32\n", 206 | "-------------------------------------------------- \n", 207 | "\n", 208 | "Algorithm: KNN\n", 209 | " Inner loop:\n", 210 | "\n", 211 | " Best ACC (avg. of inner test folds) 88.38%\n", 212 | " Best parameters: {'clf2__n_neighbors': 1, 'clf2__p': 1}\n", 213 | " ACC (on outer test fold) 91.62%\n", 214 | "\n", 215 | " Best ACC (avg. of inner test folds) 88.75%\n", 216 | " Best parameters: {'clf2__n_neighbors': 1, 'clf2__p': 1}\n", 217 | " ACC (on outer test fold) 91.88%\n", 218 | "\n", 219 | " Best ACC (avg. of inner test folds) 89.84%\n", 220 | " Best parameters: {'clf2__n_neighbors': 1, 'clf2__p': 1}\n", 221 | " ACC (on outer test fold) 90.88%\n", 222 | "\n", 223 | " Best ACC (avg. of inner test folds) 89.50%\n", 224 | " Best parameters: {'clf2__n_neighbors': 1, 'clf2__p': 1}\n", 225 | " ACC (on outer test fold) 90.88%\n", 226 | "\n", 227 | " Best ACC (avg. 
of inner test folds) 89.06%\n", 228 | " Best parameters: {'clf2__n_neighbors': 1, 'clf2__p': 1}\n", 229 | " ACC (on outer test fold) 90.25%\n", 230 | "\n", 231 | " Outer Loop:\n", 232 | " ACC 91.10% +/- 0.58\n", 233 | "-------------------------------------------------- \n", 234 | "\n", 235 | "Algorithm: RForest\n", 236 | " Inner loop:\n", 237 | "\n", 238 | " Best ACC (avg. of inner test folds) 92.59%\n", 239 | " Best parameters: {'n_estimators': 1000}\n", 240 | " ACC (on outer test fold) 95.00%\n", 241 | "\n", 242 | " Best ACC (avg. of inner test folds) 92.59%\n", 243 | " Best parameters: {'n_estimators': 10000}\n", 244 | " ACC (on outer test fold) 94.75%\n", 245 | "\n", 246 | " Best ACC (avg. of inner test folds) 92.94%\n", 247 | " Best parameters: {'n_estimators': 10000}\n", 248 | " ACC (on outer test fold) 94.50%\n", 249 | "\n", 250 | " Best ACC (avg. of inner test folds) 93.00%\n", 251 | " Best parameters: {'n_estimators': 10000}\n", 252 | " ACC (on outer test fold) 92.50%\n", 253 | "\n", 254 | " Best ACC (avg. of inner test folds) 92.75%\n", 255 | " Best parameters: {'n_estimators': 500}\n", 256 | " ACC (on outer test fold) 93.12%\n", 257 | "\n", 258 | " Outer Loop:\n", 259 | " ACC 93.98% +/- 0.98\n", 260 | "-------------------------------------------------- \n", 261 | "\n", 262 | "Algorithm: SVM\n", 263 | " Inner loop:\n", 264 | "\n", 265 | " Best ACC (avg. of inner test folds) 90.75%\n", 266 | " Best parameters: {'clf4__C': 10.0, 'clf4__gamma': 0.001, 'clf4__kernel': 'rbf'}\n", 267 | " ACC (on outer test fold) 92.12%\n", 268 | "\n", 269 | " Best ACC (avg. of inner test folds) 90.22%\n", 270 | " Best parameters: {'clf4__C': 0.01, 'clf4__kernel': 'linear'}\n", 271 | " ACC (on outer test fold) 92.88%\n", 272 | "\n", 273 | " Best ACC (avg. of inner test folds) 90.91%\n", 274 | " Best parameters: {'clf4__C': 0.01, 'clf4__kernel': 'linear'}\n", 275 | " ACC (on outer test fold) 90.50%\n", 276 | "\n", 277 | " Best ACC (avg. 
of inner test folds) 90.53%\n", 278 | " Best parameters: {'clf4__C': 10.0, 'clf4__gamma': 0.001, 'clf4__kernel': 'rbf'}\n", 279 | " ACC (on outer test fold) 92.75%\n", 280 | "\n", 281 | " Best ACC (avg. of inner test folds) 90.12%\n", 282 | " Best parameters: {'clf4__C': 0.001, 'clf4__kernel': 'linear'}\n", 283 | " ACC (on outer test fold) 90.75%\n", 284 | "\n", 285 | " Outer Loop:\n", 286 | " ACC 91.80% +/- 1.00\n", 287 | "-------------------------------------------------- \n", 288 | "\n", 289 | "Algorithm: Softmax\n", 290 | " Inner loop:\n" 291 | ] 292 | }, 293 | { 294 | "name": "stderr", 295 | "output_type": "stream", 296 | "text": [ 297 | "/Users/sebastian/miniconda3/lib/python3.8/site-packages/scipy/optimize/linesearch.py:327: LineSearchWarning: The line search algorithm did not converge\n", 298 | " warn('The line search algorithm did not converge', LineSearchWarning)\n", 299 | "/Users/sebastian/miniconda3/lib/python3.8/site-packages/sklearn/utils/optimize.py:204: UserWarning: Line Search failed\n", 300 | " warnings.warn('Line Search failed')\n" 301 | ] 302 | }, 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "\n", 308 | " Best ACC (avg. of inner test folds) 88.91%\n", 309 | " Best parameters: {'clf1__C': 0.01, 'clf1__penalty': 'l2'}\n", 310 | " ACC (on outer test fold) 90.00%\n" 311 | ] 312 | }, 313 | { 314 | "name": "stderr", 315 | "output_type": "stream", 316 | "text": [ 317 | "/Users/sebastian/miniconda3/lib/python3.8/site-packages/scipy/optimize/linesearch.py:327: LineSearchWarning: The line search algorithm did not converge\n", 318 | " warn('The line search algorithm did not converge', LineSearchWarning)\n", 319 | "/Users/sebastian/miniconda3/lib/python3.8/site-packages/sklearn/utils/optimize.py:204: UserWarning: Line Search failed\n", 320 | " warnings.warn('Line Search failed')\n" 321 | ] 322 | }, 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | "\n", 328 | " Best ACC (avg. 
of inner test folds) 88.75%\n", 329 | " Best parameters: {'clf1__C': 0.01, 'clf1__penalty': 'l2'}\n", 330 | " ACC (on outer test fold) 91.00%\n" 331 | ] 332 | }, 333 | { 334 | "name": "stderr", 335 | "output_type": "stream", 336 | "text": [ 337 | "/Users/sebastian/miniconda3/lib/python3.8/site-packages/scipy/optimize/linesearch.py:327: LineSearchWarning: The line search algorithm did not converge\n", 338 | " warn('The line search algorithm did not converge', LineSearchWarning)\n", 339 | "/Users/sebastian/miniconda3/lib/python3.8/site-packages/sklearn/utils/optimize.py:204: UserWarning: Line Search failed\n", 340 | " warnings.warn('Line Search failed')\n" 341 | ] 342 | }, 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "\n", 348 | " Best ACC (avg. of inner test folds) 89.31%\n", 349 | " Best parameters: {'clf1__C': 0.01, 'clf1__penalty': 'l2'}\n", 350 | " ACC (on outer test fold) 90.00%\n" 351 | ] 352 | }, 353 | { 354 | "name": "stderr", 355 | "output_type": "stream", 356 | "text": [ 357 | "/Users/sebastian/miniconda3/lib/python3.8/site-packages/scipy/optimize/linesearch.py:327: LineSearchWarning: The line search algorithm did not converge\n", 358 | " warn('The line search algorithm did not converge', LineSearchWarning)\n", 359 | "/Users/sebastian/miniconda3/lib/python3.8/site-packages/sklearn/utils/optimize.py:204: UserWarning: Line Search failed\n", 360 | " warnings.warn('Line Search failed')\n" 361 | ] 362 | }, 363 | { 364 | "name": "stdout", 365 | "output_type": "stream", 366 | "text": [ 367 | "\n", 368 | " Best ACC (avg. of inner test folds) 88.59%\n", 369 | " Best parameters: {'clf1__C': 0.1, 'clf1__penalty': 'l2'}\n", 370 | " ACC (on outer test fold) 89.38%\n", 371 | "\n", 372 | " Best ACC (avg. 
of inner test folds) 88.66%\n", 373 | " Best parameters: {'clf1__C': 0.01, 'clf1__penalty': 'l2'}\n", 374 | " ACC (on outer test fold) 89.50%\n", 375 | "\n", 376 | " Outer Loop:\n", 377 | " ACC 89.97% +/- 0.57\n" 378 | ] 379 | }, 380 | { 381 | "name": "stderr", 382 | "output_type": "stream", 383 | "text": [ 384 | "/Users/sebastian/miniconda3/lib/python3.8/site-packages/scipy/optimize/linesearch.py:327: LineSearchWarning: The line search algorithm did not converge\n", 385 | " warn('The line search algorithm did not converge', LineSearchWarning)\n", 386 | "/Users/sebastian/miniconda3/lib/python3.8/site-packages/sklearn/utils/optimize.py:204: UserWarning: Line Search failed\n", 387 | " warnings.warn('Line Search failed')\n" 388 | ] 389 | } 390 | ], 391 | "source": [ 392 | "for name, gs_est in sorted(gridcvs.items()):\n", 393 | "\n", 394 | " print(50 * '-', '\\n')\n", 395 | " print('Algorithm:', name)\n", 396 | " print(' Inner loop:')\n", 397 | " \n", 398 | " outer_scores = []\n", 399 | " outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)\n", 400 | " \n", 401 | " \n", 402 | " for train_idx, valid_idx in outer_cv.split(X_train, y_train):\n", 403 | " \n", 404 | " gridcvs[name].fit(X_train[train_idx], y_train[train_idx]) # run inner loop hyperparam tuning\n", 405 | " print('\\n Best ACC (avg. 
of inner test folds) %.2f%%' % (gridcvs[name].best_score_ * 100))\n", 406 | "        print('        Best parameters:', gridcvs[name].best_params_)\n", 407 | "        \n", 408 | "        # perf on test fold (valid_idx)\n", 409 | "        outer_scores.append(gridcvs[name].best_estimator_.score(X_train[valid_idx], y_train[valid_idx]))\n", 410 | "        print('        ACC (on outer test fold) %.2f%%' % (outer_scores[-1]*100))\n", 411 | "        \n", 412 | "    print('\\n    Outer Loop:')\n", 413 | "    print('        ACC %.2f%% +/- %.2f' % \n", 414 | "          (np.mean(outer_scores) * 100, np.std(outer_scores) * 100))" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "------" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "- Determine the best algorithm from the experiment above; e.g., we find that Random Forest is performing best\n", 429 | "- Now, select hyperparameters for the model based on regular k-fold on the whole training set" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 5, 435 | "metadata": {}, 436 | "outputs": [ 437 | { 438 | "name": "stderr", 439 | "output_type": "stream", 440 | "text": [ 441 | "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.\n" 442 | ] 443 | }, 444 | { 445 | "name": "stdout", 446 | "output_type": "stream", 447 | "text": [ 448 | "Fitting 2 folds for each of 5 candidates, totalling 10 fits\n" 449 | ] 450 | }, 451 | { 452 | "name": "stderr", 453 | "output_type": "stream", 454 | "text": [ 455 | "[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.3min finished\n" 456 | ] 457 | }, 458 | { 459 | "name": "stdout", 460 | "output_type": "stream", 461 | "text": [ 462 | "Best CV accuracy: 93.30%\n", 463 | "Best parameters: {'n_estimators': 10000}\n" 464 | ] 465 | } 466 | ], 467 | "source": [ 468 | "gcv_model_select = GridSearchCV(estimator=clf5,\n", 469 | "                                param_grid=param_grid5,\n", 470 | "                                scoring='accuracy',\n", 471 | "                                n_jobs=-1,\n", 472 | "                                
cv=inner_cv,\n", 473 | "                                verbose=1,\n", 474 | "                                refit=True)\n", 475 | "\n", 476 | "gcv_model_select.fit(X_train, y_train)\n", 477 | "print('Best CV accuracy: %.2f%%' % (gcv_model_select.best_score_*100))\n", 478 | "print('Best parameters:', gcv_model_select.best_params_)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": {}, 484 | "source": [ 485 | "- Using these settings, we can now train the best model on the whole training set" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 6, 491 | "metadata": {}, 492 | "outputs": [ 493 | { 494 | "name": "stdout", 495 | "output_type": "stream", 496 | "text": [ 497 | "Training Accuracy: 100.00%\n", 498 | "Test Accuracy: 94.00%\n" 499 | ] 500 | } 501 | ], 502 | "source": [ 503 | "## We can skip the next step because we set refit=True\n", 504 | "## so scikit-learn has already fit the model to the\n", 505 | "## whole training set\n", 506 | "\n", 507 | "# gcv_model_select.fit(X_train, y_train)\n", 508 | "\n", 509 | "train_acc = accuracy_score(y_true=y_train, y_pred=gcv_model_select.predict(X_train))\n", 510 | "test_acc = accuracy_score(y_true=y_test, y_pred=gcv_model_select.predict(X_test))\n", 511 | "\n", 512 | "print('Training Accuracy: %.2f%%' % (100 * train_acc))\n", 513 | "print('Test Accuracy: %.2f%%' % (100 * test_acc))" 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": {}, 519 | "source": [ 520 | "For comparison, previously, we have seen that using this algorithm, the avg. 
outer fold accuracy was \n", 521 | "\n", 522 | " ACC 93.98% +/- 0.98" 523 | ] 524 | } 525 | ], 526 | "metadata": { 527 | "anaconda-cloud": {}, 528 | "kernelspec": { 529 | "display_name": "Python 3", 530 | "language": "python", 531 | "name": "python3" 532 | }, 533 | "language_info": { 534 | "codemirror_mode": { 535 | "name": "ipython", 536 | "version": 3 537 | }, 538 | "file_extension": ".py", 539 | "mimetype": "text/x-python", 540 | "name": "python", 541 | "nbconvert_exporter": "python", 542 | "pygments_lexer": "ipython3", 543 | "version": "3.8.2" 544 | } 545 | }, 546 | "nbformat": 4, 547 | "nbformat_minor": 4 548 | } 549 | -------------------------------------------------------------------------------- /L11/code/11-eval4-algo__nested-cv_verbose2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "STAT 451: Machine Learning (Fall 2020) \n", 8 | "Instructor: Sebastian Raschka (sraschka@wisc.edu) \n", 9 | "\n", 10 | "Course website: http://pages.stat.wisc.edu/~sraschka/teaching/stat451-fs2020/" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# L11: Model Evaluation 4 -- Algorithm Comparison (Nested Cross-Validation)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "\n", 25 | "\n", 26 | "## -- verbose version 2 (using `cross_validate`)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "This notebook illustrates how to implement nested cross-validation in scikit-learn. This notebook is a more compact version of the other notebook [./11-eval4-algo__nested-cv_verbose1.ipynb](./11-eval4-algo__nested-cv_verbose1.ipynb). 
Here, instead of using `StratifiedKFold` directly and iterate over the splits, we use the `cross_validate` function.\n", 34 | "\n", 35 | "" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 1, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "Sebastian Raschka 2020-11-24 \n", 48 | "\n", 49 | "CPython 3.8.2\n", 50 | "IPython 7.18.1\n", 51 | "\n", 52 | "sklearn 0.23.2\n", 53 | "mlxtend 0.18.0.dev0\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "%load_ext watermark\n", 59 | "%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "import numpy as np\n", 69 | "from sklearn.model_selection import GridSearchCV\n", 70 | "from sklearn.model_selection import train_test_split\n", 71 | "from sklearn.model_selection import StratifiedKFold\n", 72 | "from sklearn.model_selection import cross_validate\n", 73 | "from sklearn.pipeline import Pipeline\n", 74 | "from sklearn.preprocessing import StandardScaler\n", 75 | "from sklearn.linear_model import LogisticRegression\n", 76 | "from sklearn.neighbors import KNeighborsClassifier\n", 77 | "from sklearn.tree import DecisionTreeClassifier\n", 78 | "from sklearn.ensemble import RandomForestClassifier\n", 79 | "from sklearn.svm import SVC\n", 80 | "from mlxtend.data import mnist_data\n", 81 | "from sklearn.metrics import accuracy_score\n", 82 | "\n", 83 | "# Loading and splitting the dataset\n", 84 | "# Note that this is a small (stratified) subset\n", 85 | "# of MNIST; it consists of 5000 samples only, that is,\n", 86 | "# 10% of the original MNIST dataset\n", 87 | "# http://yann.lecun.com/exdb/mnist/\n", 88 | "X, y = mnist_data()\n", 89 | "X = X.astype(np.float32)\n", 90 | "X_train, X_test, y_train, y_test = train_test_split(X, y,\n", 91 | " test_size=0.2,\n", 92 | " random_state=1,\n", 93 | " 
stratify=y)\n", 94 | "\n", 95 | "# Initializing Classifiers\n", 96 | "clf1 = LogisticRegression(multi_class='multinomial',\n", 97 | " solver='newton-cg',\n", 98 | " random_state=1)\n", 99 | "clf2 = KNeighborsClassifier(algorithm='ball_tree',\n", 100 | " leaf_size=50)\n", 101 | "clf3 = DecisionTreeClassifier(random_state=1)\n", 102 | "clf4 = SVC(random_state=1)\n", 103 | "clf5 = RandomForestClassifier(random_state=1)\n", 104 | "\n", 105 | "# Building the pipelines\n", 106 | "pipe1 = Pipeline([('std', StandardScaler()),\n", 107 | " ('clf1', clf1)])\n", 108 | "\n", 109 | "pipe2 = Pipeline([('std', StandardScaler()),\n", 110 | " ('clf2', clf2)])\n", 111 | "\n", 112 | "pipe4 = Pipeline([('std', StandardScaler()),\n", 113 | " ('clf4', clf4)])\n", 114 | "\n", 115 | "\n", 116 | "# Setting up the parameter grids\n", 117 | "param_grid1 = [{'clf1__penalty': ['l2'],\n", 118 | " 'clf1__C': np.power(10., np.arange(-4, 4))}]\n", 119 | "\n", 120 | "param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),\n", 121 | " 'clf2__p': [1, 2]}]\n", 122 | "\n", 123 | "param_grid3 = [{'max_depth': list(range(1, 10)) + [None],\n", 124 | " 'criterion': ['gini', 'entropy']}]\n", 125 | "\n", 126 | "param_grid4 = [{'clf4__kernel': ['rbf'],\n", 127 | " 'clf4__C': np.power(10., np.arange(-4, 4)),\n", 128 | " 'clf4__gamma': np.power(10., np.arange(-5, 0))},\n", 129 | " {'clf4__kernel': ['linear'],\n", 130 | " 'clf4__C': np.power(10., np.arange(-4, 4))}]\n", 131 | "\n", 132 | "param_grid5 = [{'n_estimators': [10, 100, 500, 1000, 10000]}]" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 3, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "# Setting up multiple GridSearchCV objects, 1 for each algorithm\n", 142 | "gridcvs = {}\n", 143 | "inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)\n", 144 | "\n", 145 | "for pgrid, est, name in zip((param_grid1, param_grid2,\n", 146 | " param_grid3, param_grid4, param_grid5),\n", 147 | " (pipe1, 
pipe2, clf3, pipe4, clf5),\n", 148 | " ('Softmax', 'KNN', 'DTree', 'SVM', 'RForest')):\n", 149 | " gcv = GridSearchCV(estimator=est,\n", 150 | " param_grid=pgrid,\n", 151 | " scoring='accuracy',\n", 152 | " n_jobs=-1,\n", 153 | " cv=inner_cv,\n", 154 | " verbose=0,\n", 155 | " refit=True)\n", 156 | " gridcvs[name] = gcv" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 4, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "-------------------------------------------------- \n", 169 | "\n", 170 | "Algorithm: DTree\n", 171 | " Inner loop:\n", 172 | "\n", 173 | " Best ACC (avg. of inner test folds) 72.59%\n", 174 | " Best parameters: DecisionTreeClassifier(random_state=1)\n", 175 | " ACC (on outer test fold) 75.50%\n", 176 | "\n", 177 | " Best ACC (avg. of inner test folds) 74.03%\n", 178 | " Best parameters: DecisionTreeClassifier(criterion='entropy', max_depth=7, random_state=1)\n", 179 | " ACC (on outer test fold) 78.25%\n", 180 | "\n", 181 | " Best ACC (avg. of inner test folds) 73.88%\n", 182 | " Best parameters: DecisionTreeClassifier(criterion='entropy', max_depth=9, random_state=1)\n", 183 | " ACC (on outer test fold) 77.38%\n", 184 | "\n", 185 | " Best ACC (avg. of inner test folds) 73.38%\n", 186 | " Best parameters: DecisionTreeClassifier(criterion='entropy', max_depth=8, random_state=1)\n", 187 | " ACC (on outer test fold) 74.88%\n", 188 | "\n", 189 | " Best ACC (avg. of inner test folds) 73.91%\n", 190 | " Best parameters: DecisionTreeClassifier(criterion='entropy', max_depth=8, random_state=1)\n", 191 | " ACC (on outer test fold) 77.75%\n", 192 | "\n", 193 | "DTree | outer ACC 76.75% +/- 1.32\n", 194 | "-------------------------------------------------- \n", 195 | "\n", 196 | "Algorithm: KNN\n", 197 | " Inner loop:\n", 198 | "\n", 199 | " Best ACC (avg. 
of inner test folds) 88.38%\n", 200 | " Best parameters: Pipeline(steps=[('std', StandardScaler()),\n", 201 | " ('clf2',\n", 202 | " KNeighborsClassifier(algorithm='ball_tree', leaf_size=50,\n", 203 | " n_neighbors=1, p=1))])\n", 204 | " ACC (on outer test fold) 91.62%\n", 205 | "\n", 206 | " Best ACC (avg. of inner test folds) 88.75%\n", 207 | " Best parameters: Pipeline(steps=[('std', StandardScaler()),\n", 208 | " ('clf2',\n", 209 | " KNeighborsClassifier(algorithm='ball_tree', leaf_size=50,\n", 210 | " n_neighbors=1, p=1))])\n", 211 | " ACC (on outer test fold) 91.88%\n", 212 | "\n", 213 | " Best ACC (avg. of inner test folds) 89.84%\n", 214 | " Best parameters: Pipeline(steps=[('std', StandardScaler()),\n", 215 | " ('clf2',\n", 216 | " KNeighborsClassifier(algorithm='ball_tree', leaf_size=50,\n", 217 | " n_neighbors=1, p=1))])\n", 218 | " ACC (on outer test fold) 90.88%\n", 219 | "\n", 220 | " Best ACC (avg. of inner test folds) 89.50%\n", 221 | " Best parameters: Pipeline(steps=[('std', StandardScaler()),\n", 222 | " ('clf2',\n", 223 | " KNeighborsClassifier(algorithm='ball_tree', leaf_size=50,\n", 224 | " n_neighbors=1, p=1))])\n", 225 | " ACC (on outer test fold) 90.88%\n", 226 | "\n", 227 | " Best ACC (avg. of inner test folds) 89.06%\n", 228 | " Best parameters: Pipeline(steps=[('std', StandardScaler()),\n", 229 | " ('clf2',\n", 230 | " KNeighborsClassifier(algorithm='ball_tree', leaf_size=50,\n", 231 | " n_neighbors=1, p=1))])\n", 232 | " ACC (on outer test fold) 90.25%\n", 233 | "\n", 234 | "KNN | outer ACC 91.10% +/- 0.58\n", 235 | "-------------------------------------------------- \n", 236 | "\n", 237 | "Algorithm: RForest\n", 238 | " Inner loop:\n", 239 | "\n", 240 | " Best ACC (avg. of inner test folds) 92.59%\n", 241 | " Best parameters: RandomForestClassifier(n_estimators=1000, random_state=1)\n", 242 | " ACC (on outer test fold) 95.00%\n", 243 | "\n", 244 | " Best ACC (avg. 
of inner test folds) 92.59%\n", 245 | " Best parameters: RandomForestClassifier(n_estimators=10000, random_state=1)\n", 246 | " ACC (on outer test fold) 94.75%\n", 247 | "\n", 248 | " Best ACC (avg. of inner test folds) 92.94%\n", 249 | " Best parameters: RandomForestClassifier(n_estimators=10000, random_state=1)\n", 250 | " ACC (on outer test fold) 94.50%\n", 251 | "\n", 252 | " Best ACC (avg. of inner test folds) 93.00%\n", 253 | " Best parameters: RandomForestClassifier(n_estimators=10000, random_state=1)\n", 254 | " ACC (on outer test fold) 92.50%\n", 255 | "\n", 256 | " Best ACC (avg. of inner test folds) 92.75%\n", 257 | " Best parameters: RandomForestClassifier(n_estimators=500, random_state=1)\n", 258 | " ACC (on outer test fold) 93.12%\n", 259 | "\n", 260 | "RForest | outer ACC 93.98% +/- 0.98\n", 261 | "-------------------------------------------------- \n", 262 | "\n", 263 | "Algorithm: SVM\n", 264 | " Inner loop:\n", 265 | "\n", 266 | " Best ACC (avg. of inner test folds) 90.75%\n", 267 | " Best parameters: Pipeline(steps=[('std', StandardScaler()),\n", 268 | " ('clf4', SVC(C=10.0, gamma=0.001, random_state=1))])\n", 269 | " ACC (on outer test fold) 92.12%\n", 270 | "\n", 271 | " Best ACC (avg. of inner test folds) 90.22%\n", 272 | " Best parameters: Pipeline(steps=[('std', StandardScaler()),\n", 273 | " ('clf4', SVC(C=0.01, kernel='linear', random_state=1))])\n", 274 | " ACC (on outer test fold) 92.88%\n", 275 | "\n", 276 | " Best ACC (avg. of inner test folds) 90.91%\n", 277 | " Best parameters: Pipeline(steps=[('std', StandardScaler()),\n", 278 | " ('clf4', SVC(C=0.01, kernel='linear', random_state=1))])\n", 279 | " ACC (on outer test fold) 90.50%\n", 280 | "\n", 281 | " Best ACC (avg. of inner test folds) 90.53%\n", 282 | " Best parameters: Pipeline(steps=[('std', StandardScaler()),\n", 283 | " ('clf4', SVC(C=10.0, gamma=0.001, random_state=1))])\n", 284 | " ACC (on outer test fold) 92.75%\n", 285 | "\n", 286 | " Best ACC (avg. 
of inner test folds) 90.12%\n", 287 | " Best parameters: Pipeline(steps=[('std', StandardScaler()),\n", 288 | " ('clf4', SVC(C=0.001, kernel='linear', random_state=1))])\n", 289 | " ACC (on outer test fold) 90.75%\n", 290 | "\n", 291 | "SVM | outer ACC 91.80% +/- 1.00\n", 292 | "-------------------------------------------------- \n", 293 | "\n", 294 | "Algorithm: Softmax\n", 295 | " Inner loop:\n", 296 | "\n", 297 | " Best ACC (avg. of inner test folds) 88.91%\n", 298 | " Best parameters: Pipeline(steps=[('std', StandardScaler()),\n", 299 | " ('clf1',\n", 300 | " LogisticRegression(C=0.01, multi_class='multinomial',\n", 301 | " random_state=1, solver='newton-cg'))])\n", 302 | " ACC (on outer test fold) 90.00%\n", 303 | "\n", 304 | " Best ACC (avg. of inner test folds) 88.75%\n", 305 | " Best parameters: Pipeline(steps=[('std', StandardScaler()),\n", 306 | " ('clf1',\n", 307 | " LogisticRegression(C=0.01, multi_class='multinomial',\n", 308 | " random_state=1, solver='newton-cg'))])\n", 309 | " ACC (on outer test fold) 91.00%\n", 310 | "\n", 311 | " Best ACC (avg. of inner test folds) 89.31%\n", 312 | " Best parameters: Pipeline(steps=[('std', StandardScaler()),\n", 313 | " ('clf1',\n", 314 | " LogisticRegression(C=0.01, multi_class='multinomial',\n", 315 | " random_state=1, solver='newton-cg'))])\n", 316 | " ACC (on outer test fold) 90.00%\n", 317 | "\n", 318 | " Best ACC (avg. of inner test folds) 88.59%\n", 319 | " Best parameters: Pipeline(steps=[('std', StandardScaler()),\n", 320 | " ('clf1',\n", 321 | " LogisticRegression(C=0.1, multi_class='multinomial',\n", 322 | " random_state=1, solver='newton-cg'))])\n", 323 | " ACC (on outer test fold) 89.38%\n", 324 | "\n", 325 | " Best ACC (avg. 
of inner test folds) 88.66%\n", 326 | " Best parameters: Pipeline(steps=[('std', StandardScaler()),\n", 327 | " ('clf1',\n", 328 | " LogisticRegression(C=0.01, multi_class='multinomial',\n", 329 | " random_state=1, solver='newton-cg'))])\n", 330 | " ACC (on outer test fold) 89.50%\n", 331 | "\n", 332 | "Softmax | outer ACC 89.97% +/- 0.57\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)\n", 338 | "\n", 339 | "\n", 340 | "for name, gs_est in sorted(gridcvs.items()):\n", 341 | " scores_dict = cross_validate(gs_est, \n", 342 | " X=X_train, \n", 343 | " y=y_train, \n", 344 | " cv=outer_cv,\n", 345 | " return_estimator=True,\n", 346 | " n_jobs=-1)\n", 347 | "\n", 348 | " print(50 * '-', '\\n')\n", 349 | " print('Algorithm:', name)\n", 350 | " print(' Inner loop:')\n", 351 | " \n", 352 | " \n", 353 | " for i in range(scores_dict['test_score'].shape[0]):\n", 354 | "\n", 355 | " print('\\n Best ACC (avg. of inner test folds) %.2f%%' % (scores_dict['estimator'][i].best_score_ * 100))\n", 356 | " print(' Best parameters:', scores_dict['estimator'][i].best_estimator_)\n", 357 | " print(' ACC (on outer test fold) %.2f%%' % (scores_dict['test_score'][i]*100))\n", 358 | "\n", 359 | " print('\\n%s | outer ACC %.2f%% +/- %.2f' % \n", 360 | " (name, scores_dict['test_score'].mean() * 100, \n", 361 | " scores_dict['test_score'].std() * 100))" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "------" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "- Determine the best algorithm from the experiment above; e.g., we find that Random Forest is performing best\n", 376 | "- Now, select a hyperparameters for the model based on regular k-fold on the whole training set" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 5, 382 | "metadata": {}, 383 | "outputs": [ 384 | { 385 | "name": 
"stdout", 386 | "output_type": "stream", 387 | "text": [ 388 | "Fitting 2 folds for each of 5 candidates, totalling 10 fits\n" 389 | ] 390 | }, 391 | { 392 | "name": "stderr", 393 | "output_type": "stream", 394 | "text": [ 395 | "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.\n", 396 | "[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 1.4min finished\n" 397 | ] 398 | }, 399 | { 400 | "data": { 401 | "text/plain": [ 402 | "GridSearchCV(cv=StratifiedKFold(n_splits=2, random_state=1, shuffle=True),\n", 403 | " estimator=RandomForestClassifier(random_state=1), n_jobs=-1,\n", 404 | " param_grid=[{'n_estimators': [10, 100, 500, 1000, 10000]}],\n", 405 | " scoring='accuracy', verbose=1)" 406 | ] 407 | }, 408 | "execution_count": 5, 409 | "metadata": {}, 410 | "output_type": "execute_result" 411 | } 412 | ], 413 | "source": [ 414 | "gcv_model_select = GridSearchCV(estimator=clf5,\n", 415 | " param_grid=param_grid5,\n", 416 | " scoring='accuracy',\n", 417 | " n_jobs=-1,\n", 418 | " cv=inner_cv,\n", 419 | " verbose=1,\n", 420 | " refit=True)\n", 421 | "\n", 422 | "gcv_model_select.fit(X_train, y_train)" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 6, 428 | "metadata": {}, 429 | "outputs": [ 430 | { 431 | "name": "stdout", 432 | "output_type": "stream", 433 | "text": [ 434 | "Accuracy 93.30% (average over k-fold CV test folds)\n", 435 | "Best Parameters: {'n_estimators': 10000}\n", 436 | "Training Accuracy: 100.00%\n", 437 | "Test Accuracy: 94.00%\n" 438 | ] 439 | } 440 | ], 441 | "source": [ 442 | "best_model = gcv_model_select.best_estimator_\n", 443 | "\n", 444 | "\n", 445 | "## We can skip the next step because we set refit=True\n", 446 | "## so scikit-learn has already fit the model to the\n", 447 | "## whole training set\n", 448 | "\n", 449 | "# best_model.fit(X_train, y_train)\n", 450 | "\n", 451 | "\n", 452 | "train_acc = accuracy_score(y_true=y_train, y_pred=best_model.predict(X_train))\n", 453 | 
"test_acc = accuracy_score(y_true=y_test, y_pred=best_model.predict(X_test))\n", 454 | "\n", 455 | "print('Accuracy %.2f%% (average over k-fold CV test folds)' %\n", 456 | " (100 * gcv_model_select.best_score_))\n", 457 | "print('Best Parameters: %s' % gcv_model_select.best_params_)\n", 458 | "\n", 459 | "print('Training Accuracy: %.2f%%' % (100 * train_acc))\n", 460 | "print('Test Accuracy: %.2f%%' % (100 * test_acc))" 461 | ] 462 | } 463 | ], 464 | "metadata": { 465 | "anaconda-cloud": {}, 466 | "kernelspec": { 467 | "display_name": "Python 3", 468 | "language": "python", 469 | "name": "python3" 470 | }, 471 | "language_info": { 472 | "codemirror_mode": { 473 | "name": "ipython", 474 | "version": 3 475 | }, 476 | "file_extension": ".py", 477 | "mimetype": "text/x-python", 478 | "name": "python", 479 | "nbconvert_exporter": "python", 480 | "pygments_lexer": "ipython3", 481 | "version": "3.8.2" 482 | } 483 | }, 484 | "nbformat": 4, 485 | "nbformat_minor": 4 486 | } 487 | -------------------------------------------------------------------------------- /L11/code/nested-cv-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L11/code/nested-cv-image.png -------------------------------------------------------------------------------- /L12/12_eval5-metrics__slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/L12/12_eval5-metrics__slides.pdf -------------------------------------------------------------------------------- /L12/code/12_2_pre-recall-f1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "STAT 451: Machine Learning (Fall 2020) \n", 
8 | "Instructor: Sebastian Raschka (sraschka@wisc.edu) \n", 9 | "\n", 10 | "Course website: http://pages.stat.wisc.edu/~sraschka/teaching/stat451-fs2020/" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# 2) Precision, Recall, F1 Score" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Loading the Breast Cancer Wisconsin dataset" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "- In the Breast Cancer Wisconsin dataset, the firt column in this dataset stores the unique ID numbers of patients\n", 32 | "- The second column stores the corresponding cancer diagnoses (M = malignant, B = benign)\n", 33 | "- Columns 3-32 contain features that were extracted from digitized images of the nuclei of the cancer cells, which can be used to build a model to predict whether a tumor is benign or malignant.\n", 34 | "- The Breast Cancer Wisconsin dataset has been deposited in the UCI Machine Learning Repository, and more detailed information about this dataset can be found at https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 1, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/html": [ 45 | "
\n", 46 | "\n", 59 | "\n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | "
0123456789...22232425262728293031
0842302M17.9910.38122.801001.00.118400.277600.30010.14710...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
1842517M20.5717.77132.901326.00.084740.078640.08690.07017...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
284300903M19.6921.25130.001203.00.109600.159900.19740.12790...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
384348301M11.4220.3877.58386.10.142500.283900.24140.10520...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
484358402M20.2914.34135.101297.00.100300.132800.19800.10430...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", 209 | "

5 rows × 32 columns

\n", 210 | "
" 211 | ], 212 | "text/plain": [ 213 | " 0 1 2 3 4 5 6 7 8 \\\n", 214 | "0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 \n", 215 | "1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 \n", 216 | "2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 \n", 217 | "3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 \n", 218 | "4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 \n", 219 | "\n", 220 | " 9 ... 22 23 24 25 26 27 28 29 \\\n", 221 | "0 0.14710 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 \n", 222 | "1 0.07017 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 \n", 223 | "2 0.12790 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 \n", 224 | "3 0.10520 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 \n", 225 | "4 0.10430 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 \n", 226 | "\n", 227 | " 30 31 \n", 228 | "0 0.4601 0.11890 \n", 229 | "1 0.2750 0.08902 \n", 230 | "2 0.3613 0.08758 \n", 231 | "3 0.6638 0.17300 \n", 232 | "4 0.2364 0.07678 \n", 233 | "\n", 234 | "[5 rows x 32 columns]" 235 | ] 236 | }, 237 | "execution_count": 1, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "import pandas as pd\n", 244 | "\n", 245 | "df = pd.read_csv('https://archive.ics.uci.edu/ml/'\n", 246 | " 'machine-learning-databases'\n", 247 | " '/breast-cancer-wisconsin/wdbc.data', header=None)\n", 248 | "\n", 249 | "df.head()" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 2, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "data": { 259 | "text/plain": [ 260 | "(569, 32)" 261 | ] 262 | }, 263 | "execution_count": 2, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "df.shape" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "
" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "- First, we are converting the class labels from a string format into integers" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 3, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "data": { 293 | "text/plain": [ 294 | "array(['B', 'M'], dtype=object)" 295 | ] 296 | }, 297 | "execution_count": 3, 298 | "metadata": {}, 299 | "output_type": "execute_result" 300 | } 301 | ], 302 | "source": [ 303 | "from sklearn.preprocessing import LabelEncoder\n", 304 | "\n", 305 | "X = df.loc[:, 2:].values\n", 306 | "y = df.loc[:, 1].values\n", 307 | "le = LabelEncoder()\n", 308 | "y = le.fit_transform(y)\n", 309 | "le.classes_" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "- Here, class \"M\" (malignant cancer) will be converted to class 1, and \"B\" will be converted into class 0 (the order the class labels are mapped depends on the alphabetical order of the string labels)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 4, 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "data": { 326 | "text/plain": [ 327 | "array([1, 0])" 328 | ] 329 | }, 330 | "execution_count": 4, 331 | "metadata": {}, 332 | "output_type": "execute_result" 333 | } 334 | ], 335 | "source": [ 336 | "le.transform(['M', 'B'])" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "- Next, we split the data into 80% training data and 20% test data, using a stratified split" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 5, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "from sklearn.model_selection import train_test_split\n", 353 | "\n", 354 | "X_train, X_test, y_train, y_test = \\\n", 355 | " train_test_split(X, y, \n", 356 | " test_size=0.20,\n", 357 | " stratify=y,\n", 358 | " 
random_state=1)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "## 2) Precision, Recall, F1 Score" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 6, 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "name": "stdout", 375 | "output_type": "stream", 376 | "text": [ 377 | "[[71 1]\n", 378 | " [ 3 39]]\n" 379 | ] 380 | } 381 | ], 382 | "source": [ 383 | "from sklearn.preprocessing import StandardScaler\n", 384 | "from sklearn.neighbors import KNeighborsClassifier\n", 385 | "from sklearn.pipeline import make_pipeline\n", 386 | "from mlxtend.evaluate import confusion_matrix\n", 387 | "\n", 388 | "\n", 389 | "pipe_knn = make_pipeline(StandardScaler(),\n", 390 | " KNeighborsClassifier(n_neighbors=5))\n", 391 | "\n", 392 | "pipe_knn.fit(X_train, y_train)\n", 393 | "\n", 394 | "y_pred = pipe_knn.predict(X_test)\n", 395 | "\n", 396 | "confmat = confusion_matrix(y_test, y_pred)\n", 397 | "\n", 398 | "print(confmat)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 7, 404 | "metadata": {}, 405 | "outputs": [ 406 | { 407 | "name": "stdout", 408 | "output_type": "stream", 409 | "text": [ 410 | "Accuracy: 0.965\n", 411 | "Precision: 0.975\n", 412 | "Recall: 0.929\n", 413 | "F1: 0.951\n", 414 | "MCC: 0.925\n" 415 | ] 416 | } 417 | ], 418 | "source": [ 419 | "from sklearn.metrics import accuracy_score, precision_score, \\\n", 420 | " recall_score, f1_score, matthews_corrcoef\n", 421 | "\n", 422 | "\n", 423 | "print('Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_pred))\n", 424 | "print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred))\n", 425 | "print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred))\n", 426 | "print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred))\n", 427 | "print('MCC: %.3f' % matthews_corrcoef(y_true=y_test, y_pred=y_pred))" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 
| "source": [ 434 | "## 3) Using those Metrics in GridSearch" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 8, 440 | "metadata": {}, 441 | "outputs": [ 442 | { 443 | "name": "stdout", 444 | "output_type": "stream", 445 | "text": [ 446 | "0.9564099246736818\n", 447 | "{'kneighborsclassifier__n_neighbors': 5}\n" 448 | ] 449 | } 450 | ], 451 | "source": [ 452 | "from sklearn.model_selection import GridSearchCV\n", 453 | "\n", 454 | "\n", 455 | "param_range = [3, 5, 7, 9, 15, 21, 31]\n", 456 | "\n", 457 | "pipe_knn = make_pipeline(StandardScaler(),\n", 458 | " KNeighborsClassifier())\n", 459 | "\n", 460 | "param_grid = [{'kneighborsclassifier__n_neighbors': param_range}]\n", 461 | "\n", 462 | "\n", 463 | "gs = GridSearchCV(estimator=pipe_knn,\n", 464 | " param_grid=param_grid,\n", 465 | " scoring='f1',\n", 466 | " cv=10,\n", 467 | " n_jobs=-1)\n", 468 | "\n", 469 | "\n", 470 | "gs = gs.fit(X_train, y_train)\n", 471 | "print(gs.best_score_)\n", 472 | "print(gs.best_params_)" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 9, 478 | "metadata": {}, 479 | "outputs": [ 480 | { 481 | "name": "stdout", 482 | "output_type": "stream", 483 | "text": [ 484 | "0.9597306397306398\n", 485 | "{'kneighborsclassifier__n_neighbors': 15}\n" 486 | ] 487 | } 488 | ], 489 | "source": [ 490 | "from sklearn.metrics import make_scorer\n", 491 | "from mlxtend.data import iris_data\n", 492 | "\n", 493 | "\n", 494 | "X_iris, y_iris = iris_data()\n", 495 | "\n", 496 | "\n", 497 | "# for multiclass:\n", 498 | "scorer = make_scorer(f1_score, average='macro')\n", 499 | "\n", 500 | "\n", 501 | "from sklearn.model_selection import GridSearchCV\n", 502 | "\n", 503 | "\n", 504 | "param_range = [3, 5, 7, 9, 15, 21, 31]\n", 505 | "\n", 506 | "pipe_knn = make_pipeline(StandardScaler(),\n", 507 | " KNeighborsClassifier())\n", 508 | "\n", 509 | "param_grid = [{'kneighborsclassifier__n_neighbors': param_range}]\n", 510 | "\n", 511 | "\n", 512 | "gs = 
GridSearchCV(estimator=pipe_knn,\n", 513 | " param_grid=param_grid,\n", 514 | " scoring=scorer,\n", 515 | " cv=10,\n", 516 | " n_jobs=-1)\n", 517 | "\n", 518 | "\n", 519 | "gs = gs.fit(X_iris, y_iris)\n", 520 | "print(gs.best_score_)\n", 521 | "print(gs.best_params_)" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [] 530 | } 531 | ], 532 | "metadata": { 533 | "kernelspec": { 534 | "display_name": "Python 3", 535 | "language": "python", 536 | "name": "python3" 537 | }, 538 | "language_info": { 539 | "codemirror_mode": { 540 | "name": "ipython", 541 | "version": 3 542 | }, 543 | "file_extension": ".py", 544 | "mimetype": "text/x-python", 545 | "name": "python", 546 | "nbconvert_exporter": "python", 547 | "pygments_lexer": "ipython3", 548 | "version": "3.8.2" 549 | } 550 | }, 551 | "nbformat": 4, 552 | "nbformat_minor": 4 553 | } 554 | -------------------------------------------------------------------------------- /L12/code/12_3_balanced-acc-Copy1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "STAT 451: Machine Learning (Fall 2020) \n", 8 | "Instructor: Sebastian Raschka (sraschka@wisc.edu) \n", 9 | "\n", 10 | "Course website: http://pages.stat.wisc.edu/~sraschka/teaching/stat451-fs2020/" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# Balanced Accuracy" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 23, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "from mlxtend.evaluate import confusion_matrix\n", 27 | "from mlxtend.evaluate import accuracy_score\n", 28 | "import numpy as np" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 24, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "y_targ = np.array(3*[0] + 69*[1] 
+ 18*[2])\n", 38 | "y_pred = np.array(10*[0] + 50*[1] + 30*[2])" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 25, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "Standard accuracy: 78.89%\n", 51 | "Class 0 accuracy: 92.22%\n", 52 | "Class 1 accuracy: 78.89%\n", 53 | "Class 2 accuracy: 86.67%\n", 54 | "Average per-class accuracy: 85.93%\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "std_acc = accuracy_score(y_targ, y_pred)\n", 60 | "\n", 61 | "bin_acc0 = accuracy_score(y_targ, y_pred, method='binary', pos_label=0)\n", 62 | "bin_acc1 = accuracy_score(y_targ, y_pred, method='binary', pos_label=1)\n", 63 | "bin_acc2 = accuracy_score(y_targ, y_pred, method='binary', pos_label=2)\n", 64 | "\n", 65 | "avg_acc = accuracy_score(y_targ, y_pred, method='average')\n", 66 | "\n", 67 | "print(f'Standard accuracy: {std_acc*100:.2f}%')\n", 68 | "print(f'Class 0 accuracy: {bin_acc0*100:.2f}%')\n", 69 | "print(f'Class 1 accuracy: {bin_acc1*100:.2f}%')\n", 70 | "print(f'Class 2 accuracy: {bin_acc2*100:.2f}%')\n", 71 | "print(f'Average per-class accuracy: {avg_acc*100:.2f}%')" 72 | ] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 83 | "name": "ipython", 84 | "version": 3 85 | }, 86 | "file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.8.2" 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 4 96 | } 97 | -------------------------------------------------------------------------------- /L12/code/wdbc.names.txt: -------------------------------------------------------------------------------- 1 | 1. Title: Wisconsin Diagnostic Breast Cancer (WDBC) 2 | 3 | 2. Source Information 4 | 5 | a) Creators: 6 | 7 | Dr. William H. 
Wolberg, General Surgery Dept., University of 8 | Wisconsin, Clinical Sciences Center, Madison, WI 53792 9 | wolberg@eagle.surgery.wisc.edu 10 | 11 | W. Nick Street, Computer Sciences Dept., University of 12 | Wisconsin, 1210 West Dayton St., Madison, WI 53706 13 | street@cs.wisc.edu 608-262-6619 14 | 15 | Olvi L. Mangasarian, Computer Sciences Dept., University of 16 | Wisconsin, 1210 West Dayton St., Madison, WI 53706 17 | olvi@cs.wisc.edu 18 | 19 | b) Donor: Nick Street 20 | 21 | c) Date: November 1995 22 | 23 | 3. Past Usage: 24 | 25 | first usage: 26 | 27 | W.N. Street, W.H. Wolberg and O.L. Mangasarian 28 | Nuclear feature extraction for breast tumor diagnosis. 29 | IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science 30 | and Technology, volume 1905, pages 861-870, San Jose, CA, 1993. 31 | 32 | OR literature: 33 | 34 | O.L. Mangasarian, W.N. Street and W.H. Wolberg. 35 | Breast cancer diagnosis and prognosis via linear programming. 36 | Operations Research, 43(4), pages 570-577, July-August 1995. 37 | 38 | Medical literature: 39 | 40 | W.H. Wolberg, W.N. Street, and O.L. Mangasarian. 41 | Machine learning techniques to diagnose breast cancer from 42 | fine-needle aspirates. 43 | Cancer Letters 77 (1994) 163-171. 44 | 45 | W.H. Wolberg, W.N. Street, and O.L. Mangasarian. 46 | Image analysis and machine learning applied to breast cancer 47 | diagnosis and prognosis. 48 | Analytical and Quantitative Cytology and Histology, Vol. 17 49 | No. 2, pages 77-87, April 1995. 50 | 51 | W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian. 52 | Computerized breast cancer diagnosis and prognosis from fine 53 | needle aspirates. 54 | Archives of Surgery 1995;130:511-516. 55 | 56 | W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian. 57 | Computer-derived nuclear features distinguish malignant from 58 | benign breast cytology. 59 | Human Pathology, 26:792--796, 1995. 
60 | 61 | See also: 62 | http://www.cs.wisc.edu/~olvi/uwmp/mpml.html 63 | http://www.cs.wisc.edu/~olvi/uwmp/cancer.html 64 | 65 | Results: 66 | 67 | - predicting field 2, diagnosis: B = benign, M = malignant 68 | - sets are linearly separable using all 30 input features 69 | - best predictive accuracy obtained using one separating plane 70 | in the 3-D space of Worst Area, Worst Smoothness and 71 | Mean Texture. Estimated accuracy 97.5% using repeated 72 | 10-fold crossvalidations. Classifier has correctly 73 | diagnosed 176 consecutive new patients as of November 74 | 1995. 75 | 76 | 4. Relevant information 77 | 78 | Features are computed from a digitized image of a fine needle 79 | aspirate (FNA) of a breast mass. They describe 80 | characteristics of the cell nuclei present in the image. 81 | A few of the images can be found at 82 | http://www.cs.wisc.edu/~street/images/ 83 | 84 | Separating plane described above was obtained using 85 | Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree 86 | Construction Via Linear Programming." Proceedings of the 4th 87 | Midwest Artificial Intelligence and Cognitive Science Society, 88 | pp. 97-101, 1992], a classification method which uses linear 89 | programming to construct a decision tree. Relevant features 90 | were selected using an exhaustive search in the space of 1-4 91 | features and 1-3 separating planes. 92 | 93 | The actual linear program used to obtain the separating plane 94 | in the 3-dimensional space is that described in: 95 | [K. P. Bennett and O. L. Mangasarian: "Robust Linear 96 | Programming Discrimination of Two Linearly Inseparable Sets", 97 | Optimization Methods and Software 1, 1992, 23-34]. 98 | 99 | 100 | This database is also available through the UW CS ftp server: 101 | 102 | ftp ftp.cs.wisc.edu 103 | cd math-prog/cpo-dataset/machine-learn/WDBC/ 104 | 105 | 5. Number of instances: 569 106 | 107 | 6. Number of attributes: 32 (ID, diagnosis, 30 real-valued input features) 108 | 109 | 7. 
Attribute information 110 | 111 | 1) ID number 112 | 2) Diagnosis (M = malignant, B = benign) 113 | 3-32) 114 | 115 | Ten real-valued features are computed for each cell nucleus: 116 | 117 | a) radius (mean of distances from center to points on the perimeter) 118 | b) texture (standard deviation of gray-scale values) 119 | c) perimeter 120 | d) area 121 | e) smoothness (local variation in radius lengths) 122 | f) compactness (perimeter^2 / area - 1.0) 123 | g) concavity (severity of concave portions of the contour) 124 | h) concave points (number of concave portions of the contour) 125 | i) symmetry 126 | j) fractal dimension ("coastline approximation" - 1) 127 | 128 | Several of the papers listed above contain detailed descriptions of 129 | how these features are computed. 130 | 131 | The mean, standard error, and "worst" or largest (mean of the three 132 | largest values) of these features were computed for each image, 133 | resulting in 30 features. For instance, field 3 is Mean Radius, field 134 | 13 is Radius SE, field 23 is Worst Radius. 135 | 136 | All feature values are recoded with four significant digits. 137 | 138 | 8. Missing attribute values: none 139 | 140 | 9. 
Class distribution: 357 benign, 212 malignant -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/rasbt/stat451-machine-learning-fs20/master) 2 | 3 | # stat451-machine-learning-fs20 4 | 5 | STAT 451: Intro to Machine Learning @ UW-Madison (Fall 2020) 6 | 7 | - [Lecture 01](L01): Course overview, introduction to machine learning 8 | - [Lecture 02](L02): Nearest Neighbor Methods 9 | - [Lecture 03](L03): Python 10 | - [Lecture 04](L04): Scientific Computing in Python 11 | - [Lecture 05](L05): Scikit-learn 12 | - [Lecture 06](L06): Decision Trees 13 | - [Lecture 07](L07): Ensemble Methods 14 | - [Lecture 08](L08): Model Evaluation 1: Overfitting and Underfitting 15 | - [Lecture 09](L09): Model Evaluation 2: Resampling Methods and Confidence Intervals 16 | - [Lecture 10](L10): Model Evaluation 3: Cross-Validation and Model Selection 17 | - [Lecture 11](L11): Model Evaluation 4: Algorithm Comparison 18 | - [Lecture 12](L12): Model Evaluation 5: Performance Metrics -------------------------------------------------------------------------------- /report-template/examples/example-presentations.md: -------------------------------------------------------------------------------- 1 | See videos that students volunteered to share on YouTube: [https://www.youtube.com/watch?v=e_I0q3mmfw4] 2 | 3 | 4 | (PS: This is from the deep learning, not machine learning class, so the topics are different. However, the presentation style & expectation is the same.) 
-------------------------------------------------------------------------------- /report-template/examples/example-proposal.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/report-template/examples/example-proposal.pdf -------------------------------------------------------------------------------- /report-template/examples/example-report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/report-template/examples/example-report.pdf -------------------------------------------------------------------------------- /report-template/project-presentation-assessment.md: -------------------------------------------------------------------------------- 1 | # Project Presentation Assessment 2 | 3 | - 10 pts: Is there a motivation for the project given? 4 | - 40 pts: Is the project described well enough that a general audience, familiar with machine learning, can understand the project? 5 | - 20 pts: Figures are all legible and explained well 6 | - 20 pts: Are the results presented adequately discussed? 7 | - 10 pts: Did all team members contribute to the presentation? -------------------------------------------------------------------------------- /report-template/project-proposal-assessment.md: -------------------------------------------------------------------------------- 1 | # Project Proposal Assessment 2 | 3 | The proposal will be graded based on completeness of each of the 5 sections (Introduction, Motivation, Evaluation, Resources, and Contributions) and *not* be based on language, style, and how "exciting" or "interesting" the project is. For each section, you can receive a maximum of 10 points, totalling 50 pts for the proposal overall. 
4 | 5 | Also, it is important to make sure that you acknowledge previous work and use citations properly when referring to other people's work. Even minor forms of plagiarism (e.g., copying sentences from other texts) will result in a subtraction of at least 10 pts each per incidence. And university guidelines dictate that severe incidents need to be reported. If you are unsure about what constitutes plagiarism and how to avoid it, please see the helpful guides at https://conduct.students.wisc.edu/plagiarism/ 6 | 7 | -------------------------------------------------------------------------------- /report-template/project-report-assessment.md: -------------------------------------------------------------------------------- 1 | # Project Report Assessment 2 | 3 | 4 | ### Abstract: 15 pts 5 | 6 | - Is enough information provided to get a clear idea about the subject matter? 7 | - Is the abstract conveying the findings? 8 | - Are the main points of the report described succinctly? 9 | 10 | ### Introduction: 15 pts 11 | 12 | - Does the introduction cover the required background information to understand the work? 13 | - Is the introduction well organized: it starts out general and becomes more specific towards the end? 14 | - Is there a motivation explaining why this project is relevant, important, and/or interesting? 15 | 16 | ### Related Work: 15 pts 17 | 18 | - Is the similar and related work discussed adequately? 19 | - Are references cited properly (here, but also throughout the whole paper)? 20 | - Is the discussion or paragraph comparing this project with other people's work adequate? 21 | 22 | 23 | ### Proposed Method: 25 pts 24 | 25 | - Are there any missing descriptions of symbols used in mathematical notations (if applicable)? 26 | - Are the main algorithms described well enough so that they can be implemented by a knowledgeable reader?
27 | 28 | ### Experiments: 25 pts 29 | 30 | - Is the experimental setup and methodology described well enough so that it can be repeated? 31 | - If datasets are used, are they referenced appropriately? 32 | 33 | ### Results and Discussion: 30 pts 34 | 35 | - Are the results described clearly? 36 | - Is the data analyzed well, and are the results logical? 37 | - Are the figures clear and have no missing labels? 38 | - Do the figure captions have sufficient information to understand the figure? 39 | - Is each figure referenced in the text? 40 | - Is the discussion critical/honest, and are potential weaknesses/shortcomings discussed as well? 41 | 42 | ### Conclusions: 15 pts 43 | 44 | - Do the authors describe whether the initial motivation/task was accomplished or not based on the results? 45 | - Is it discussed adequately how the results relate to previous work? 46 | - If applicable, are potential future directions given? 47 | 48 | ### Contributions: 10 pts 49 | 50 | - Are all contributions listed clearly? 51 | - Did each member contribute approximately equally to the project? 52 | 53 | ### Length, Formatting, and Citations: 54 | 55 | - -25 pts if you submit the report in some arbitrary format and didn't use the report template. 56 | - -10 pts for each page that goes over the 8-page limit (references are not counted; so you may have 8 pages of text + an infinite number of reference pages). 57 | - -10 pts for each page below the 6-page minimum requirement (references are not counted; so you may have 6 pages of text + 1 to infinitely many reference pages). 58 | - -10 pts for each missing image reference -- this means, if you are using an image that was not made by yourself and you don't cite the source, 10 pts will be deducted for each missing reference. 59 | - -10 pts will be deducted where a sentence from a book or website is copied without citation.
For example, consider the following sentence from [https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm) 60 | 61 | - In pattern recognition, the k-nearest neighbors algorithm (k-NN) is a non-parametric method used for classification and regression. 62 | 63 | If you use it in your text in the following way 64 | 65 | > This section describes the machine learning methods used in this study. As a baseline model, the k-nearest neighbors algorithm was used. In pattern recognition, the k-nearest neighbors algorithm (k-NN) is a non-parametric method used for classification and regression. The distance metric ... 66 | 67 | I will deduct 10 pts because you didn't indicate that you obtained the sentence from Wikipedia. However, the following is ok: 68 | 69 | a) 70 | 71 | > This section describes the machine learning methods used in this study. As a baseline model, the k-nearest neighbors algorithm was used. "In pattern recognition, the k-nearest neighbors algorithm (k-NN) is a non-parametric method used for classification and regression."\footcite{\url{https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm}} The distance metric ... 72 | 73 | the following is also okay -- because you rewrote the sentence you don't need the quotation marks: 74 | 75 | b) 76 | 77 | > This section describes the machine learning methods used in this study. As a baseline model, the k-nearest neighbors algorithm was used. The k-nearest neighbors algorithm is a so-called \textit{lazy} machine learning algorithm and non-parametric method that can be used for classification and regression.\footcite{\url{https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm}} The distance metric ...
78 | 79 | -------------------------------------------------------------------------------- /report-template/proposal-latex/bibliography.bib: -------------------------------------------------------------------------------- 1 | @article{Raschka2020PythonTrends, 2 | title={Machine learning in python: Main developments and technology trends in data science, machine learning, and artificial intelligence}, 3 | author={Raschka, Sebastian and Patterson, Joshua and Nolet, Corey}, 4 | volume={11}, 5 | number={7}, 6 | pages={345}, 7 | year={2020}, 8 | journal={Information}, 9 | publisher={MDPI} 10 | } -------------------------------------------------------------------------------- /report-template/proposal-latex/figures/google-scholar.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/report-template/proposal-latex/figures/google-scholar.pdf -------------------------------------------------------------------------------- /report-template/proposal-latex/figures/not-own-figure.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/report-template/proposal-latex/figures/not-own-figure.pdf -------------------------------------------------------------------------------- /report-template/proposal-latex/proposal.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/report-template/proposal-latex/proposal.pdf -------------------------------------------------------------------------------- /report-template/proposal-latex/proposal.tex: -------------------------------------------------------------------------------- 1 | 
\documentclass[10pt,twocolumn,letterpaper]{article} 2 | 3 | \usepackage{statcourse} 4 | \usepackage{times} 5 | \usepackage{epsfig} 6 | \usepackage{graphicx} 7 | \usepackage{amsmath} 8 | \usepackage{amssymb} 9 | 10 | % Include other packages here, before hyperref. 11 | 12 | % If you comment hyperref and then uncomment it, you should delete 13 | % egpaper.aux before re-running latex. (Or just hit 'q' on the first latex 14 | % run, let it finish, and you should be clear). 15 | \usepackage[breaklinks=true,bookmarks=false]{hyperref} 16 | 17 | 18 | \statcoursefinalcopy 19 | 20 | 21 | \setcounter{page}{1} 22 | \begin{document} 23 | 24 | 25 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 26 | % DO NOT EDIT ANYTHING ABOVE THIS LINE 27 | % EXCEPT IF YOU LIKE TO USE ADDITIONAL PACKAGES 28 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 29 | 30 | 31 | 32 | %%%%%%%%% TITLE 33 | \title{\LaTeX\ Template for STAT451 Project Proposal (replace with your project title)} 34 | 35 | \author{First Author\\ 36 | {\tt\small firstauthor@wisc.edu} 37 | \and 38 | Second Author\\ 39 | {\tt\small secondauthor@wisc.edu} 40 | \and 41 | Third Author\\ 42 | {\tt\small thirdauthor@wisc.edu} 43 | } 44 | 45 | \maketitle 46 | %\thispagestyle{empty} 47 | 48 | 49 | 50 | % MAIN ARTICLE GOES BELOW 51 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 52 | 53 | 54 | 55 | %%%%%%%%% BODY TEXT 56 | 57 | 58 | 59 | \begin{itemize} 60 | 61 | 62 | \item The information in this template is very minimal, and this file should serve you as a framework for writing your proposal. You may prefer to use a more collaboration-friendly tool while drafting the report with your classmates before you prepare the final report for submission. Remember that you only need to turn in the PDF file on Canvas. Also, \textbf{only one member per team} needs to submit the project proposal. 
63 | 64 | \item The project proposal is a 2-4 page document excluding references\footnote{This means, references should of course be included but do not count towards the page limit}. 65 | 66 | \item You are encouraged (not required) to use 1-2 figures to illustrate technical concepts. 67 | 68 | \item The proposal must be formatted and submitted as a PDF document on Canvas (the submission deadline will be later announced on Canvas). 69 | 70 | \item Please 71 | check out the text in the sections below for further information. 72 | 73 | \end{itemize} 74 | 75 | 76 | 77 | 78 | \section{Introduction} 79 | 80 | 81 | In this section, describe what you are planning to do. Also, briefly describe related work. 82 | 83 | \subsection{Notes about Citations} 84 | 85 | When discussing related work, do not forget to include appropriate references. This is an example of a citation \cite{Raschka2020PythonTrends}. To format the citations properly, put the 86 | corresponding references into the ``bibliography.bib`` file. You can obtain 87 | BibTeX-formatted references for the "bib" file from Google Scholar 88 | (\url{https://scholar.google.com}), for example, by clicking on the 89 | double-quote character under a citation and then selecting \mbox{"BibTeX"} as 90 | shown in Figure \ref{fig:google-scholar-1col} and 91 | Figure \ref{fig:google-scholar-2col}. 92 | 93 | To avoid plagiarism, any sentence that is copied from other articles or sources (internet, papers, etc.) must be put in quotation marks. The next sentence provides an example that uses an existing sentence verbatim. 94 | 95 | According to \cite{Raschka2020PythonTrends}, "The development of machine learning algorithms that operate on a set of values (as opposed to a single value) at a time is also commonly known as vectorization." 96 | 97 | Direct quotes should be used sparingly, and it is usually better to rephrase sentences in your own words. The next sentence provides an example.
98 | 99 | Vectorization is a programming approach utilizing functions that operate on multiple values simultaneously to speed up computation \cite{Raschka2020PythonTrends}. 100 | 101 | \begin{figure}[t] 102 | \begin{center} 103 | \includegraphics[width=0.8\linewidth]{figures/google-scholar.pdf} 104 | \end{center} 105 | \caption{Example illustrating how to get BibTeX references from 106 | Google Scholar as a 1-column figure.} 107 | \label{fig:google-scholar-1col} 108 | \end{figure} 109 | 110 | \subsection{Notes about Figures} 111 | 112 | Figure~\ref{fig:google-scholar-1col} shows an example of a 1-column figure. 113 | 114 | You can create two-column figures, too, as shown in Figure \ref{fig:google-scholar-2col}. Please note that you can reuse figures from other papers or lecture material, but for every figure that is not your own, you have to include the "Source" as shown in Figure~\ref{fig:other-figure}. 115 | 116 | \begin{figure*} 117 | \begin{center} 118 | \includegraphics[width=0.8\linewidth]{figures/google-scholar.pdf} 119 | \end{center} 120 | \caption{Example of a 2-column figure.} 121 | \label{fig:google-scholar-2col} 122 | \end{figure*} 123 | 124 | \begin{figure*} 125 | \begin{center} 126 | \includegraphics[width=0.8\linewidth]{figures/not-own-figure.pdf} 127 | \end{center} 128 | \caption{Figure not created by yourself. Image source: \cite{Raschka2020PythonTrends}. (If the source is a website, not a paper, please use the URL link instead of the paper reference. Image source: \url{https://www.mdpi.com/2078-2489/11/4/193}.)} 129 | \label{fig:other-figure} 130 | \end{figure*} 131 | 132 | 133 | \section{Motivation} 134 | 135 | Describe why your project is interesting. E.g., you can describe why your project could have a broader societal impact. Or, you may describe the motivation from a personal learning perspective. 136 | 137 | \section{Evaluation} 138 | 139 | What would the successful outcome of your project look like?
In other words, under which circumstances would you consider your project to be “successful?” 140 | 141 | How do you measure success, specific to this project, from a technical standpoint? 142 | 143 | \section{Resources} 144 | 145 | What resources are you going to use (datasets, computer hardware, computational tools, etc.)? 146 | 147 | \section{Contributions} 148 | 149 | You are expected to share the workload evenly, and every group member is expected to participate in both the experiments and writing. (As a group, you only need to submit one proposal and one report, though. So you need to work together and coordinate your efforts.) 150 | 151 | Clearly indicate what computational and writing tasks each member of your group will be participating in. 152 | 153 | 154 | {\small 155 | \bibliographystyle{ieee} 156 | \bibliography{bibliography.bib} 157 | } 158 | 159 | \end{document} 160 | -------------------------------------------------------------------------------- /report-template/proposal-latex/statcourse.sty: -------------------------------------------------------------------------------- 1 | % --------------------------------------------------------------- 2 | % 3 | % $Id: statcourse.sty,v 1.3 2005/10/24 19:56:15 awf Exp $ 4 | % 5 | % by Paolo.Ienne@di.epfl.ch 6 | % some mods by awf@acm.org 7 | % 8 | % --------------------------------------------------------------- 9 | % 10 | % no guarantee is given that the format corresponds perfectly to 11 | % IEEE 8.5" x 11" Proceedings, but most features should be ok. 
12 | % 13 | % --------------------------------------------------------------- 14 | % with LaTeX2e: 15 | % ============= 16 | % 17 | % use as 18 | % \documentclass[times,10pt,twocolumn]{article} 19 | % \usepackage{latex8} 20 | % \usepackage{times} 21 | % 22 | % --------------------------------------------------------------- 23 | 24 | % with LaTeX 2.09: 25 | % ================ 26 | % 27 | % use as 28 | % \documentstyle[times,art10,twocolumn,latex8]{article} 29 | % 30 | % --------------------------------------------------------------- 31 | % with both versions: 32 | % =================== 33 | % 34 | % specify \statcoursefinalcopy to emit the final camera-ready copy 35 | % 36 | % specify references as 37 | % \bibliographystyle{ieee} 38 | % \bibliography{...your files...} 39 | % 40 | % --------------------------------------------------------------- 41 | 42 | \usepackage{eso-pic} 43 | \usepackage{xspace} 44 | 45 | \typeout{CVPR 8.5 x 11-Inch Proceedings Style `statcourse.sty'.} 46 | 47 | % ten point helvetica bold required for captions 48 | % eleven point times bold required for second-order headings 49 | % in some sites the name of the fonts may differ, 50 | % change the name here: 51 | \font\statcoursetenhv = phvb at 8pt % *** IF THIS FAILS, SEE statcourse.sty *** 52 | \font\elvbf = ptmb scaled 1100 53 | 54 | % If the above lines give an error message, try to comment them and 55 | % uncomment these: 56 | %\font\statcoursetenhv = phvb7t at 8pt 57 | %\font\elvbf = ptmb7t scaled 1100 58 | 59 | % set dimensions of columns, gap between columns, and paragraph indent 60 | \setlength{\textheight}{8.875in} 61 | \setlength{\textwidth}{6.875in} 62 | \setlength{\columnsep}{0.3125in} 63 | \setlength{\topmargin}{0in} 64 | \setlength{\headheight}{0in} 65 | \setlength{\headsep}{0in} 66 | \setlength{\parindent}{1pc} 67 | \setlength{\oddsidemargin}{-.304in} 68 | \setlength{\evensidemargin}{-.304in} 69 | 70 | \newif\ifstatcoursefinal 71 | \statcoursefinalfalse 72 | 
\def\statcoursefinalcopy{\global\statcoursefinaltrue} 73 | 74 | % memento from size10.clo 75 | % \normalsize{\@setfontsize\normalsize\@xpt\@xiipt} 76 | % \small{\@setfontsize\small\@ixpt{11}} 77 | % \footnotesize{\@setfontsize\footnotesize\@viiipt{9.5}} 78 | % \scriptsize{\@setfontsize\scriptsize\@viipt\@viiipt} 79 | % \tiny{\@setfontsize\tiny\@vpt\@vipt} 80 | % \large{\@setfontsize\large\@xiipt{14}} 81 | % \Large{\@setfontsize\Large\@xivpt{18}} 82 | % \LARGE{\@setfontsize\LARGE\@xviipt{22}} 83 | % \huge{\@setfontsize\huge\@xxpt{25}} 84 | % \Huge{\@setfontsize\Huge\@xxvpt{30}} 85 | 86 | \def\@maketitle 87 | { 88 | \newpage 89 | \null 90 | \vskip .375in 91 | \begin{center} 92 | {\Large \bf \@title \par} 93 | % additional two empty lines at the end of the title 94 | \vspace*{24pt} 95 | { 96 | \large 97 | \lineskip .5em 98 | \begin{tabular}[t]{c} 99 | \ifstatcoursefinal\@author\else Anonymous CVPR submission\\ 100 | \vspace*{1pt}\\%This space will need to be here in the final copy, so don't squeeze it out for the review copy. 
101 | Paper ID \statcoursePaperID \fi 102 | \end{tabular} 103 | \par 104 | } 105 | % additional small space at the end of the author name 106 | \vskip .5em 107 | % additional empty line at the end of the title block 108 | \vspace*{12pt} 109 | \end{center} 110 | } 111 | 112 | \def\abstract 113 | {% 114 | \centerline{\large\bf Abstract}% 115 | \vspace*{12pt}% 116 | \it% 117 | } 118 | 119 | \def\endabstract 120 | { 121 | % additional empty line at the end of the abstract 122 | \vspace*{12pt} 123 | } 124 | 125 | \def\affiliation#1{\gdef\@affiliation{#1}} \gdef\@affiliation{} 126 | 127 | \newlength{\@ctmp} 128 | \newlength{\@figindent} 129 | \setlength{\@figindent}{1pc} 130 | 131 | \long\def\@makecaption#1#2{ 132 | \setbox\@tempboxa\hbox{\small \noindent #1.~#2} 133 | \setlength{\@ctmp}{\hsize} 134 | \addtolength{\@ctmp}{-\@figindent}\addtolength{\@ctmp}{-\@figindent} 135 | % IF longer than one indented paragraph line 136 | \ifdim \wd\@tempboxa >\@ctmp 137 | % THEN DON'T set as an indented paragraph 138 | {\small #1.~#2\par} 139 | \else 140 | % ELSE center 141 | \hbox to\hsize{\hfil\box\@tempboxa\hfil} 142 | \fi} 143 | 144 | % correct heading spacing and type 145 | \def\statcoursesection{\@startsection {section}{1}{\z@} 146 | {10pt plus 2pt minus 2pt}{7pt} {\large\bf}} 147 | \def\statcoursessect#1{\statcoursesection*{#1}} 148 | \def\statcoursesect#1{\statcoursesection{\hskip -1em.~#1}} 149 | \def\section{\@ifstar\statcoursessect\statcoursesect} 150 | 151 | \def\statcoursesubsection{\@startsection {subsection}{2}{\z@} 152 | {8pt plus 2pt minus 2pt}{6pt} {\elvbf}} 153 | \def\statcoursessubsect#1{\statcoursesubsection*{#1}} 154 | \def\statcoursesubsect#1{\statcoursesubsection{\hskip -1em.~#1}} 155 | \def\subsection{\@ifstar\statcoursessubsect\statcoursesubsect} 156 | 157 | %% --------- Page background marks: Ruler and confidentiality 158 | 159 | % ----- define vruler 160 | \makeatletter 161 | \newbox\statcourserulerbox 162 | \newcount\statcourserulercount 163 | 
\newdimen\statcourseruleroffset 164 | \newdimen\cv@lineheight 165 | \newdimen\cv@boxheight 166 | \newbox\cv@tmpbox 167 | \newcount\cv@refno 168 | \newcount\cv@tot 169 | % NUMBER with left flushed zeros \fillzeros[] 170 | \newcount\cv@tmpc@ \newcount\cv@tmpc 171 | \def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi 172 | \cv@tmpc=1 % 173 | \loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi 174 | \ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat 175 | \ifnum#2<0\advance\cv@tmpc1\relax-\fi 176 | \loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat 177 | \cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}% 178 | % \makevruler[][][][][] 179 | \def\makevruler[#1][#2][#3][#4][#5]{\begingroup\offinterlineskip 180 | \textheight=#5\vbadness=10000\vfuzz=120ex\overfullrule=0pt% 181 | \global\setbox\statcourserulerbox=\vbox to \textheight{% 182 | {\parskip=0pt\hfuzz=150em\cv@boxheight=\textheight 183 | \cv@lineheight=#1\global\statcourserulercount=#2% 184 | \cv@tot\cv@boxheight\divide\cv@tot\cv@lineheight\advance\cv@tot2% 185 | \cv@refno1\vskip-\cv@lineheight\vskip1ex% 186 | \loop\setbox\cv@tmpbox=\hbox to0cm{{\statcoursetenhv\hfil\fillzeros[#4]\statcourserulercount}}% 187 | \ht\cv@tmpbox\cv@lineheight\dp\cv@tmpbox0pt\box\cv@tmpbox\break 188 | \advance\cv@refno1\global\advance\statcourserulercount#3\relax 189 | \ifnum\cv@refno<\cv@tot\repeat}}\endgroup}% 190 | \makeatother 191 | % ----- end of vruler 192 | 193 | % \makevruler[][][][][] 194 | \def\statcourseruler#1{\makevruler[12pt][#1][1][3][0.993\textheight]\usebox{\statcourserulerbox}} 195 | \AddToShipoutPicture{% 196 | \ifstatcoursefinal\else 197 | %\AtTextLowerLeft{% 198 | % \color[gray]{.15}\framebox(\LenToUnit{\textwidth},\LenToUnit{\textheight}){} 199 | %} 200 | \statcourseruleroffset=\textheight 201 | \advance\statcourseruleroffset by -3.7pt 202 | \color[rgb]{.5,.5,1} 203 | 
\AtTextUpperLeft{% 204 | \put(\LenToUnit{-35pt},\LenToUnit{-\statcourseruleroffset}){%left ruler 205 | \statcourseruler{\statcourserulercount}} 206 | \put(\LenToUnit{\textwidth\kern 30pt},\LenToUnit{-\statcourseruleroffset}){%right ruler 207 | \statcourseruler{\statcourserulercount}} 208 | } 209 | \def\pid{\parbox{1in}{\begin{center}\bf\sf{\small CVPR}\\\#\statcoursePaperID\end{center}}} 210 | \AtTextUpperLeft{%paperID in corners 211 | \put(\LenToUnit{-65pt},\LenToUnit{45pt}){\pid} 212 | \put(\LenToUnit{\textwidth\kern-8pt},\LenToUnit{45pt}){\pid} 213 | } 214 | \AtTextUpperLeft{%confidential 215 | \put(0,\LenToUnit{1cm}){\parbox{\textwidth}{\centering\statcoursetenhv 216 | CVPR 2018 Submission \#\statcoursePaperID. CONFIDENTIAL REVIEW COPY. DO NOT DISTRIBUTE.}} 217 | } 218 | \fi 219 | } 220 | 221 | %%% Make figure placement a little more predictable. 222 | % We trust the user to move figures if this results 223 | % in ugliness. 224 | % Minimize bad page breaks at figures 225 | \renewcommand{\textfraction}{0.01} 226 | \renewcommand{\floatpagefraction}{0.99} 227 | \renewcommand{\topfraction}{0.99} 228 | \renewcommand{\bottomfraction}{0.99} 229 | \renewcommand{\dblfloatpagefraction}{0.99} 230 | \renewcommand{\dbltopfraction}{0.99} 231 | \setcounter{totalnumber}{99} 232 | \setcounter{topnumber}{99} 233 | \setcounter{bottomnumber}{99} 234 | 235 | % Add a period to the end of an abbreviation unless there's one 236 | % already, then \xspace. 
237 | \makeatletter 238 | \DeclareRobustCommand\onedot{\futurelet\@let@token\@onedot} 239 | \def\@onedot{\ifx\@let@token.\else.\null\fi\xspace} 240 | 241 | \def\eg{\emph{e.g}\onedot} \def\Eg{\emph{E.g}\onedot} 242 | \def\ie{\emph{i.e}\onedot} \def\Ie{\emph{I.e}\onedot} 243 | \def\cf{\emph{c.f}\onedot} \def\Cf{\emph{C.f}\onedot} 244 | \def\etc{\emph{etc}\onedot} \def\vs{\emph{vs}\onedot} 245 | \def\wrt{w.r.t\onedot} \def\dof{d.o.f\onedot} 246 | \def\etal{\emph{et al}\onedot} 247 | \makeatother 248 | 249 | % --------------------------------------------------------------- 250 | -------------------------------------------------------------------------------- /report-template/report-latex/bibliography.bib: -------------------------------------------------------------------------------- 1 | @article{Raschka2020PythonTrends, 2 | title={Machine learning in python: Main developments and technology trends in data science, machine learning, and artificial intelligence}, 3 | author={Raschka, Sebastian and Patterson, Joshua and Nolet, Corey}, 4 | volume={11}, 5 | number={7}, 6 | pages={345}, 7 | year={2020}, 8 | journal={Information}, 9 | publisher={MDPI} 10 | } -------------------------------------------------------------------------------- /report-template/report-latex/figures/google-scholar.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/report-template/report-latex/figures/google-scholar.pdf -------------------------------------------------------------------------------- /report-template/report-latex/report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs20/51ae6db167ec9ccae555e973179c31be0d111804/report-template/report-latex/report.pdf -------------------------------------------------------------------------------- 
/report-template/report-latex/report.tex: -------------------------------------------------------------------------------- 1 | \documentclass[10pt,twocolumn,letterpaper]{article} 2 | 3 | \usepackage{statcourse} 4 | \usepackage{times} 5 | \usepackage{epsfig} 6 | \usepackage{graphicx} 7 | \usepackage{amsmath} 8 | \usepackage{amssymb} 9 | 10 | % Include other packages here, before hyperref. 11 | 12 | % If you comment hyperref and then uncomment it, you should delete 13 | % egpaper.aux before re-running latex. (Or just hit 'q' on the first latex 14 | % run, let it finish, and you should be clear). 15 | \usepackage[breaklinks=true,bookmarks=false]{hyperref} 16 | 17 | 18 | \statcoursefinalcopy 19 | 20 | 21 | \setcounter{page}{1} 22 | \begin{document} 23 | 24 | 25 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 26 | % DO NOT EDIT ANYTHING ABOVE THIS LINE 27 | % EXCEPT IF YOU LIKE TO USE ADDITIONAL PACKAGES 28 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 29 | 30 | 31 | 32 | %%%%%%%%% TITLE 33 | \title{\LaTeX\ Template for STAT451 Project Report (replace with your project title)} 34 | 35 | \author{First Author\\ 36 | {\tt\small firstauthor@wisc.edu} 37 | \and 38 | Second Author\\ 39 | {\tt\small secondauthor@wisc.edu} 40 | \and 41 | Third Author\\ 42 | {\tt\small thirdauthor@wisc.edu} 43 | } 44 | 45 | \maketitle 46 | %\thispagestyle{empty} 47 | 48 | 49 | 50 | % MAIN ARTICLE GOES BELOW 51 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 52 | 53 | 54 | %%%%%%%%% ABSTRACT 55 | \begin{abstract} 56 | The abstract for your project goes here. The length of the abstract 57 | should be between 200-250 words. Tips for writing a good abstract 58 | can be found at \url{https://writing.wisc.edu/Handbook/presentations_abstracts.html}. 
59 | \end{abstract} 60 | 61 | %%%%%%%%% BODY TEXT 62 | 63 | %------------------------------------------------- 64 | \section{Introduction} 65 | %------------------------------------------------- 66 | 67 | \noindent\textit{Recommended length: 1/2 to 1 pages.}\vspace{1cm} 68 | 69 | For the report, the same rules and guidelines apply as for the proposal. This is an example of a citation \cite{Raschka2020PythonTrends}; if you use the "cite{}" function in LaTeX, the References section will be created automatically at the end of the document. Please read through the {"proposal-latex/proposal.pdf"} for a refresher on how to use citations and figures properly. 70 | 71 | Note that the sections for this report are different, and some additional information is contained in this template document, so please read it carefully before you start writing. 72 | 73 | This is an example of a mathematical equation: 74 | 75 | $$f(\mathbf{x}; \mathbf{w}) = \sum_{i=1}^{n} w_ix_i.$$ 76 | 77 | This is a mathematical expression, $h(\mathbf{x}) = \hat{y}$ formatted in text. 78 | 79 | The project report should be 6-8 pages long (not counting references) 80 | and should contain the sections that are already provided in this paper. Please 81 | check out the text in these sections for further information. 82 | 83 | 84 | \subsection{Subsection} 85 | 86 | You can use paragraphs or subsections to further structure your 87 | main sections. This is an example of a subsection. 88 | 89 | \paragraph{This is a paragraph title.} This is an example of a paragraph. 90 | 91 | Ideally, your report should contain all the major sections provided in this report template. Please also consult the "report-template/project-report-assessment.md" for further information on these sections and grading. 
92 | 93 | 94 | 95 | %------------------------------------------------- 96 | \section{Related Work} 97 | %------------------------------------------------- 98 | 99 | \noindent\textit{Recommended length: 1/2 to 1 pages.}\vspace{1cm} 100 | 101 | Related work should be discussed here. This should be a short (1/2 to 1 page) discussion of work (from research papers and articles) that explored similar questions. For example, if you plan to predict COVID-19 from chest X-ray images, discuss previous work that was about a similar project. If the focus of your project is on analyzing the behavior of certain machine learning on a variety of different datasets, and the comparison itself (rather application) is the focus of your paper, discuss other papers that analyzed different algorithms. 102 | 103 | %------------------------------------------------- 104 | \section{Proposed Method} 105 | %------------------------------------------------- 106 | 107 | \noindent\textit{Recommended length: 1 to 2 pages.}\vspace{1cm} 108 | 109 | Describe the method(s) you are proposing, developing, or using. Most students will not propose new or modified machine learning methods or algorithms. In this case, describe how the main algorithms you are using work. This may include mathematical details. 110 | 111 | %------------------------------------------------- 112 | \section{Experiments} 113 | %------------------------------------------------- 114 | 115 | \noindent\textit{Recommended length: 1/2 to 1 pages.}\vspace{1cm} 116 | 117 | Describe the experiments you performed to address specific questions. This includes information about the dataset and software, which are listed as subsections below. Please do not remove these subsections. 118 | 119 | \subsection{Dataset} 120 | 121 | Briefly describe your dataset in a separate subsection. 122 | 123 | 124 | Table \ref{tab:some-table} shows an example for formatting a table. 
125 | 126 | \begin{table} 127 | \begin{center} 128 | \begin{tabular}{|l|c|} 129 | \hline 130 | Method & Accuracy \\ 131 | \hline\hline 132 | Method 1 & $70 \pm 3$ \% \\ 133 | Method 2 & $76 \pm 3$ \% \\ 134 | \hline 135 | \end{tabular} 136 | \end{center} 137 | \caption{This is an example of a table.} 138 | \label{tab:some-table} 139 | \end{table} 140 | 141 | 142 | 143 | 144 | \subsection{Software} 145 | 146 | Briefly list (and cite) the software you used. 147 | 148 | \subsection{Hardware} 149 | 150 | If relevant, list hardware resources you used. 151 | 152 | %------------------------------------------------- 153 | \section{Results and Discussion} 154 | %------------------------------------------------- 155 | 156 | \noindent\textit{Recommended length: 2 to 3 pages.}\vspace{1cm} 157 | 158 | Describe the results you obtained from the experiments and interpret them. 159 | Optionally, you could split "Results and Discussion" into two separate 160 | sections, but it is often easier to present the results and discuss them at the same time. In this section, you will likely want to create several subsections that address your specific research questions. As an example for structuring the Results and Discussion section, you can take a look at the following paper: \url{https://www.mdpi.com/2078-2489/11/7/345}. 161 | 162 | %------------------------------------------------- 163 | \section{Conclusions} 164 | %------------------------------------------------- 165 | 166 | \noindent\textit{Recommended length: 1/3 to 1/2 page.}\vspace{1cm} 167 | 168 | Describe your conclusions here. If there are any future directions, you can 169 | describe them here, or you can create a new section for future directions. 170 | 171 | %------------------------------------------------- 172 | \section{Acknowledgements} 173 | %------------------------------------------------- 174 | 175 | \noindent\textit{Recommended length: 2-4 sentences.}\vspace{1cm} 176 | 177 | List acknowledgements if any. 
For example, if someone provided you a dataset, or 178 | you used someone else's resources, this is a good place to acknowledge 179 | the help or support you received. 180 | 181 | %------------------------------------------------- 182 | \section{Contributions} 183 | %------------------------------------------------- 184 | 185 | \noindent\textit{Recommended length: 1/3 to 1/2 page.}\vspace{1cm} 186 | 187 | Describe the contributions of each team member who worked on this project. 188 | 189 | 190 | {\small 191 | \bibliographystyle{ieee} 192 | \bibliography{bibliography.bib} 193 | } 194 | 195 | \end{document} 196 | -------------------------------------------------------------------------------- /report-template/report-latex/statcourse.sty: -------------------------------------------------------------------------------- 1 | % --------------------------------------------------------------- 2 | % 3 | % $Id: statcourse.sty,v 1.3 2005/10/24 19:56:15 awf Exp $ 4 | % 5 | % by Paolo.Ienne@di.epfl.ch 6 | % some mods by awf@acm.org 7 | % 8 | % --------------------------------------------------------------- 9 | % 10 | % no guarantee is given that the format corresponds perfectly to 11 | % IEEE 8.5" x 11" Proceedings, but most features should be ok. 
12 | % 13 | % --------------------------------------------------------------- 14 | % with LaTeX2e: 15 | % ============= 16 | % 17 | % use as 18 | % \documentclass[times,10pt,twocolumn]{article} 19 | % \usepackage{latex8} 20 | % \usepackage{times} 21 | % 22 | % --------------------------------------------------------------- 23 | 24 | % with LaTeX 2.09: 25 | % ================ 26 | % 27 | % use as 28 | % \documentstyle[times,art10,twocolumn,latex8]{article} 29 | % 30 | % --------------------------------------------------------------- 31 | % with both versions: 32 | % =================== 33 | % 34 | % specify \statcoursefinalcopy to emit the final camera-ready copy 35 | % 36 | % specify references as 37 | % \bibliographystyle{ieee} 38 | % \bibliography{...your files...} 39 | % 40 | % --------------------------------------------------------------- 41 | 42 | \usepackage{eso-pic} 43 | \usepackage{xspace} 44 | 45 | \typeout{CVPR 8.5 x 11-Inch Proceedings Style `statcourse.sty'.} 46 | 47 | % ten point helvetica bold required for captions 48 | % eleven point times bold required for second-order headings 49 | % in some sites the name of the fonts may differ, 50 | % change the name here: 51 | \font\statcoursetenhv = phvb at 8pt % *** IF THIS FAILS, SEE statcourse.sty *** 52 | \font\elvbf = ptmb scaled 1100 53 | 54 | % If the above lines give an error message, try to comment them and 55 | % uncomment these: 56 | %\font\statcoursetenhv = phvb7t at 8pt 57 | %\font\elvbf = ptmb7t scaled 1100 58 | 59 | % set dimensions of columns, gap between columns, and paragraph indent 60 | \setlength{\textheight}{8.875in} 61 | \setlength{\textwidth}{6.875in} 62 | \setlength{\columnsep}{0.3125in} 63 | \setlength{\topmargin}{0in} 64 | \setlength{\headheight}{0in} 65 | \setlength{\headsep}{0in} 66 | \setlength{\parindent}{1pc} 67 | \setlength{\oddsidemargin}{-.304in} 68 | \setlength{\evensidemargin}{-.304in} 69 | 70 | \newif\ifstatcoursefinal 71 | \statcoursefinalfalse 72 | 
\def\statcoursefinalcopy{\global\statcoursefinaltrue} 73 | 74 | % memento from size10.clo 75 | % \normalsize{\@setfontsize\normalsize\@xpt\@xiipt} 76 | % \small{\@setfontsize\small\@ixpt{11}} 77 | % \footnotesize{\@setfontsize\footnotesize\@viiipt{9.5}} 78 | % \scriptsize{\@setfontsize\scriptsize\@viipt\@viiipt} 79 | % \tiny{\@setfontsize\tiny\@vpt\@vipt} 80 | % \large{\@setfontsize\large\@xiipt{14}} 81 | % \Large{\@setfontsize\Large\@xivpt{18}} 82 | % \LARGE{\@setfontsize\LARGE\@xviipt{22}} 83 | % \huge{\@setfontsize\huge\@xxpt{25}} 84 | % \Huge{\@setfontsize\Huge\@xxvpt{30}} 85 | 86 | \def\@maketitle 87 | { 88 | \newpage 89 | \null 90 | \vskip .375in 91 | \begin{center} 92 | {\Large \bf \@title \par} 93 | % additional two empty lines at the end of the title 94 | \vspace*{24pt} 95 | { 96 | \large 97 | \lineskip .5em 98 | \begin{tabular}[t]{c} 99 | \ifstatcoursefinal\@author\else Anonymous CVPR submission\\ 100 | \vspace*{1pt}\\%This space will need to be here in the final copy, so don't squeeze it out for the review copy. 
101 | Paper ID \statcoursePaperID \fi 102 | \end{tabular} 103 | \par 104 | } 105 | % additional small space at the end of the author name 106 | \vskip .5em 107 | % additional empty line at the end of the title block 108 | \vspace*{12pt} 109 | \end{center} 110 | } 111 | 112 | \def\abstract 113 | {% 114 | \centerline{\large\bf Abstract}% 115 | \vspace*{12pt}% 116 | \it% 117 | } 118 | 119 | \def\endabstract 120 | { 121 | % additional empty line at the end of the abstract 122 | \vspace*{12pt} 123 | } 124 | 125 | \def\affiliation#1{\gdef\@affiliation{#1}} \gdef\@affiliation{} 126 | 127 | \newlength{\@ctmp} 128 | \newlength{\@figindent} 129 | \setlength{\@figindent}{1pc} 130 | 131 | \long\def\@makecaption#1#2{ 132 | \setbox\@tempboxa\hbox{\small \noindent #1.~#2} 133 | \setlength{\@ctmp}{\hsize} 134 | \addtolength{\@ctmp}{-\@figindent}\addtolength{\@ctmp}{-\@figindent} 135 | % IF longer than one indented paragraph line 136 | \ifdim \wd\@tempboxa >\@ctmp 137 | % THEN DON'T set as an indented paragraph 138 | {\small #1.~#2\par} 139 | \else 140 | % ELSE center 141 | \hbox to\hsize{\hfil\box\@tempboxa\hfil} 142 | \fi} 143 | 144 | % correct heading spacing and type 145 | \def\statcoursesection{\@startsection {section}{1}{\z@} 146 | {10pt plus 2pt minus 2pt}{7pt} {\large\bf}} 147 | \def\statcoursessect#1{\statcoursesection*{#1}} 148 | \def\statcoursesect#1{\statcoursesection{\hskip -1em.~#1}} 149 | \def\section{\@ifstar\statcoursessect\statcoursesect} 150 | 151 | \def\statcoursesubsection{\@startsection {subsection}{2}{\z@} 152 | {8pt plus 2pt minus 2pt}{6pt} {\elvbf}} 153 | \def\statcoursessubsect#1{\statcoursesubsection*{#1}} 154 | \def\statcoursesubsect#1{\statcoursesubsection{\hskip -1em.~#1}} 155 | \def\subsection{\@ifstar\statcoursessubsect\statcoursesubsect} 156 | 157 | %% --------- Page background marks: Ruler and confidentiality 158 | 159 | % ----- define vruler 160 | \makeatletter 161 | \newbox\statcourserulerbox 162 | \newcount\statcourserulercount 163 | 
\newdimen\statcourseruleroffset 164 | \newdimen\cv@lineheight 165 | \newdimen\cv@boxheight 166 | \newbox\cv@tmpbox 167 | \newcount\cv@refno 168 | \newcount\cv@tot 169 | % NUMBER with left flushed zeros \fillzeros[] 170 | \newcount\cv@tmpc@ \newcount\cv@tmpc 171 | \def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi 172 | \cv@tmpc=1 % 173 | \loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi 174 | \ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat 175 | \ifnum#2<0\advance\cv@tmpc1\relax-\fi 176 | \loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat 177 | \cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}% 178 | % \makevruler[][][][][] 179 | \def\makevruler[#1][#2][#3][#4][#5]{\begingroup\offinterlineskip 180 | \textheight=#5\vbadness=10000\vfuzz=120ex\overfullrule=0pt% 181 | \global\setbox\statcourserulerbox=\vbox to \textheight{% 182 | {\parskip=0pt\hfuzz=150em\cv@boxheight=\textheight 183 | \cv@lineheight=#1\global\statcourserulercount=#2% 184 | \cv@tot\cv@boxheight\divide\cv@tot\cv@lineheight\advance\cv@tot2% 185 | \cv@refno1\vskip-\cv@lineheight\vskip1ex% 186 | \loop\setbox\cv@tmpbox=\hbox to0cm{{\statcoursetenhv\hfil\fillzeros[#4]\statcourserulercount}}% 187 | \ht\cv@tmpbox\cv@lineheight\dp\cv@tmpbox0pt\box\cv@tmpbox\break 188 | \advance\cv@refno1\global\advance\statcourserulercount#3\relax 189 | \ifnum\cv@refno<\cv@tot\repeat}}\endgroup}% 190 | \makeatother 191 | % ----- end of vruler 192 | 193 | % \makevruler[][][][][] 194 | \def\statcourseruler#1{\makevruler[12pt][#1][1][3][0.993\textheight]\usebox{\statcourserulerbox}} 195 | \AddToShipoutPicture{% 196 | \ifstatcoursefinal\else 197 | %\AtTextLowerLeft{% 198 | % \color[gray]{.15}\framebox(\LenToUnit{\textwidth},\LenToUnit{\textheight}){} 199 | %} 200 | \statcourseruleroffset=\textheight 201 | \advance\statcourseruleroffset by -3.7pt 202 | \color[rgb]{.5,.5,1} 203 | 
\AtTextUpperLeft{% 204 | \put(\LenToUnit{-35pt},\LenToUnit{-\statcourseruleroffset}){%left ruler 205 | \statcourseruler{\statcourserulercount}} 206 | \put(\LenToUnit{\textwidth\kern 30pt},\LenToUnit{-\statcourseruleroffset}){%right ruler 207 | \statcourseruler{\statcourserulercount}} 208 | } 209 | \def\pid{\parbox{1in}{\begin{center}\bf\sf{\small CVPR}\\\#\statcoursePaperID\end{center}}} 210 | \AtTextUpperLeft{%paperID in corners 211 | \put(\LenToUnit{-65pt},\LenToUnit{45pt}){\pid} 212 | \put(\LenToUnit{\textwidth\kern-8pt},\LenToUnit{45pt}){\pid} 213 | } 214 | \AtTextUpperLeft{%confidential 215 | \put(0,\LenToUnit{1cm}){\parbox{\textwidth}{\centering\statcoursetenhv 216 | CVPR 2018 Submission \#\statcoursePaperID. CONFIDENTIAL REVIEW COPY. DO NOT DISTRIBUTE.}} 217 | } 218 | \fi 219 | } 220 | 221 | %%% Make figure placement a little more predictable. 222 | % We trust the user to move figures if this results 223 | % in ugliness. 224 | % Minimize bad page breaks at figures 225 | \renewcommand{\textfraction}{0.01} 226 | \renewcommand{\floatpagefraction}{0.99} 227 | \renewcommand{\topfraction}{0.99} 228 | \renewcommand{\bottomfraction}{0.99} 229 | \renewcommand{\dblfloatpagefraction}{0.99} 230 | \renewcommand{\dbltopfraction}{0.99} 231 | \setcounter{totalnumber}{99} 232 | \setcounter{topnumber}{99} 233 | \setcounter{bottomnumber}{99} 234 | 235 | % Add a period to the end of an abbreviation unless there's one 236 | % already, then \xspace. 
237 | \makeatletter 238 | \DeclareRobustCommand\onedot{\futurelet\@let@token\@onedot} 239 | \def\@onedot{\ifx\@let@token.\else.\null\fi\xspace} 240 | 241 | \def\eg{\emph{e.g}\onedot} \def\Eg{\emph{E.g}\onedot} 242 | \def\ie{\emph{i.e}\onedot} \def\Ie{\emph{I.e}\onedot} 243 | \def\cf{\emph{c.f}\onedot} \def\Cf{\emph{C.f}\onedot} 244 | \def\etc{\emph{etc}\onedot} \def\vs{\emph{vs}\onedot} 245 | \def\wrt{w.r.t\onedot} \def\dof{d.o.f\onedot} 246 | \def\etal{\emph{et al}\onedot} 247 | \makeatother 248 | 249 | % --------------------------------------------------------------- 250 | --------------------------------------------------------------------------------