├── dl-course-info.md
├── hw_02
│   ├── some_digit.png
│   ├── tree-viz-1.png
│   └── hw02.ipynb
├── 02_knn
│   ├── 02_knn_notes.pdf
│   ├── 02_knn_slides.pdf
│   └── iris.csv
├── hw_03
│   ├── images
│   │   ├── conf-1.png
│   │   ├── conf-2.png
│   │   └── hint-1.png
│   ├── helper.py
│   ├── data
│   │   └── wine.data
│   └── hw3.ipynb
├── 05_sklearn
│   ├── images
│   │   ├── eda.pdf
│   │   ├── decisionreg.pdf
│   │   ├── estimator-api.pdf
│   │   ├── estimator-api.png
│   │   ├── holdout-tuning.pdf
│   │   ├── holdout-tuning.png
│   │   ├── iris-subsampling.pdf
│   │   ├── iris-subsampling.png
│   │   ├── sklearn-pipeline.pdf
│   │   ├── sklearn-pipeline.png
│   │   ├── transformer-api.pdf
│   │   └── transformer-api.png
│   └── 05_sklearn_slides.pdf
├── report-template
│   ├── report.pdf
│   ├── figures
│   │   └── google-scholar.pdf
│   ├── bibliography.bib
│   ├── project-presentation-assessment.md
│   ├── project-report-assessment.md
│   ├── report.tex
│   ├── statcourse.sty
│   └── ieee.bst
├── 06_trees
│   ├── 06_trees_notes.pdf
│   └── 06_trees_slides.pdf
├── 03_python
│   └── 03_python_notes.pdf
├── 09_eval-ci
│   ├── 09_eval-ci_notes.pdf
│   └── 09_eval-ci_slides.pdf
├── 10_eval-cv
│   ├── 10_eval-cv_notes.pdf
│   └── 10_eval-cv_slides.pdf
├── other
│   ├── stat479-fs18-awards.jpg
│   └── dl-course-info.md
├── 01_overview
│   ├── 01_ml-overview_notes.pdf
│   └── 01_ml-overview_slides.pdf
├── 04_scipython
│   ├── 04_scipython_notes.pdf
│   └── images
│       └── numpy-intro
│           ├── ufunc.png
│           ├── array_1.png
│           ├── array_2.png
│           ├── matmul.png
│           ├── matmatmul.png
│           ├── random_1.png
│           ├── random_2.png
│           ├── transpose.png
│           ├── broadcasting-1.png
│           └── broadcasting-2.png
├── 07_ensembles
│   ├── 07_ensembles_notes.pdf
│   └── 07_ensembles_slides.pdf
├── 11_eval-algo
│   ├── 11_eval-algo_notes.pdf
│   ├── 11_eval-algo_slides.pdf
│   └── 11_eval-algo_code.ipynb
├── 13_feat-sele
│   ├── 13_feat-sele_slides.pdf
│   └── code-figures
│       ├── logreg.png
│       └── multinomial-logreg.png
├── 08_eval-intro
│   ├── 08_eval-intro_notes.pdf
│   └── 08_eval-intro_slides.pdf
├── 12_eval-metrics
│   └── 12_eval-metrics_slides.pdf
├── 14_feat-extract
│   └── 14_feat-extract_slides.pdf
├── .gitignore
├── README.md
└── hw_01
    ├── test_data.txt
    └── train_data.txt
/dl-course-info.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/hw_02/some_digit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/hw_02/some_digit.png
--------------------------------------------------------------------------------
/hw_02/tree-viz-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/hw_02/tree-viz-1.png
--------------------------------------------------------------------------------
/02_knn/02_knn_notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/02_knn/02_knn_notes.pdf
--------------------------------------------------------------------------------
/hw_03/images/conf-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/hw_03/images/conf-1.png
--------------------------------------------------------------------------------
/hw_03/images/conf-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/hw_03/images/conf-2.png
--------------------------------------------------------------------------------
/hw_03/images/hint-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/hw_03/images/hint-1.png
--------------------------------------------------------------------------------
/02_knn/02_knn_slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/02_knn/02_knn_slides.pdf
--------------------------------------------------------------------------------
/05_sklearn/images/eda.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/05_sklearn/images/eda.pdf
--------------------------------------------------------------------------------
/report-template/report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/report-template/report.pdf
--------------------------------------------------------------------------------
/06_trees/06_trees_notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/06_trees/06_trees_notes.pdf
--------------------------------------------------------------------------------
/06_trees/06_trees_slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/06_trees/06_trees_slides.pdf
--------------------------------------------------------------------------------
/03_python/03_python_notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/03_python/03_python_notes.pdf
--------------------------------------------------------------------------------
/09_eval-ci/09_eval-ci_notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/09_eval-ci/09_eval-ci_notes.pdf
--------------------------------------------------------------------------------
/10_eval-cv/10_eval-cv_notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/10_eval-cv/10_eval-cv_notes.pdf
--------------------------------------------------------------------------------
/other/stat479-fs18-awards.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/other/stat479-fs18-awards.jpg
--------------------------------------------------------------------------------
/05_sklearn/05_sklearn_slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/05_sklearn/05_sklearn_slides.pdf
--------------------------------------------------------------------------------
/05_sklearn/images/decisionreg.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/05_sklearn/images/decisionreg.pdf
--------------------------------------------------------------------------------
/09_eval-ci/09_eval-ci_slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/09_eval-ci/09_eval-ci_slides.pdf
--------------------------------------------------------------------------------
/10_eval-cv/10_eval-cv_slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/10_eval-cv/10_eval-cv_slides.pdf
--------------------------------------------------------------------------------
/01_overview/01_ml-overview_notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/01_overview/01_ml-overview_notes.pdf
--------------------------------------------------------------------------------
/04_scipython/04_scipython_notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/04_scipython/04_scipython_notes.pdf
--------------------------------------------------------------------------------
/05_sklearn/images/estimator-api.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/05_sklearn/images/estimator-api.pdf
--------------------------------------------------------------------------------
/05_sklearn/images/estimator-api.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/05_sklearn/images/estimator-api.png
--------------------------------------------------------------------------------
/05_sklearn/images/holdout-tuning.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/05_sklearn/images/holdout-tuning.pdf
--------------------------------------------------------------------------------
/05_sklearn/images/holdout-tuning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/05_sklearn/images/holdout-tuning.png
--------------------------------------------------------------------------------
/07_ensembles/07_ensembles_notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/07_ensembles/07_ensembles_notes.pdf
--------------------------------------------------------------------------------
/07_ensembles/07_ensembles_slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/07_ensembles/07_ensembles_slides.pdf
--------------------------------------------------------------------------------
/11_eval-algo/11_eval-algo_notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/11_eval-algo/11_eval-algo_notes.pdf
--------------------------------------------------------------------------------
/11_eval-algo/11_eval-algo_slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/11_eval-algo/11_eval-algo_slides.pdf
--------------------------------------------------------------------------------
/13_feat-sele/13_feat-sele_slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/13_feat-sele/13_feat-sele_slides.pdf
--------------------------------------------------------------------------------
/13_feat-sele/code-figures/logreg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/13_feat-sele/code-figures/logreg.png
--------------------------------------------------------------------------------
/01_overview/01_ml-overview_slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/01_overview/01_ml-overview_slides.pdf
--------------------------------------------------------------------------------
/05_sklearn/images/iris-subsampling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/05_sklearn/images/iris-subsampling.pdf
--------------------------------------------------------------------------------
/05_sklearn/images/iris-subsampling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/05_sklearn/images/iris-subsampling.png
--------------------------------------------------------------------------------
/05_sklearn/images/sklearn-pipeline.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/05_sklearn/images/sklearn-pipeline.pdf
--------------------------------------------------------------------------------
/05_sklearn/images/sklearn-pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/05_sklearn/images/sklearn-pipeline.png
--------------------------------------------------------------------------------
/05_sklearn/images/transformer-api.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/05_sklearn/images/transformer-api.pdf
--------------------------------------------------------------------------------
/05_sklearn/images/transformer-api.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/05_sklearn/images/transformer-api.png
--------------------------------------------------------------------------------
/08_eval-intro/08_eval-intro_notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/08_eval-intro/08_eval-intro_notes.pdf
--------------------------------------------------------------------------------
/08_eval-intro/08_eval-intro_slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/08_eval-intro/08_eval-intro_slides.pdf
--------------------------------------------------------------------------------
/04_scipython/images/numpy-intro/ufunc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/04_scipython/images/numpy-intro/ufunc.png
--------------------------------------------------------------------------------
/04_scipython/images/numpy-intro/array_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/04_scipython/images/numpy-intro/array_1.png
--------------------------------------------------------------------------------
/04_scipython/images/numpy-intro/array_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/04_scipython/images/numpy-intro/array_2.png
--------------------------------------------------------------------------------
/04_scipython/images/numpy-intro/matmul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/04_scipython/images/numpy-intro/matmul.png
--------------------------------------------------------------------------------
/12_eval-metrics/12_eval-metrics_slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/12_eval-metrics/12_eval-metrics_slides.pdf
--------------------------------------------------------------------------------
/14_feat-extract/14_feat-extract_slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/14_feat-extract/14_feat-extract_slides.pdf
--------------------------------------------------------------------------------
/report-template/figures/google-scholar.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/report-template/figures/google-scholar.pdf
--------------------------------------------------------------------------------
/04_scipython/images/numpy-intro/matmatmul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/04_scipython/images/numpy-intro/matmatmul.png
--------------------------------------------------------------------------------
/04_scipython/images/numpy-intro/random_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/04_scipython/images/numpy-intro/random_1.png
--------------------------------------------------------------------------------
/04_scipython/images/numpy-intro/random_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/04_scipython/images/numpy-intro/random_2.png
--------------------------------------------------------------------------------
/04_scipython/images/numpy-intro/transpose.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/04_scipython/images/numpy-intro/transpose.png
--------------------------------------------------------------------------------
/13_feat-sele/code-figures/multinomial-logreg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/13_feat-sele/code-figures/multinomial-logreg.png
--------------------------------------------------------------------------------
/04_scipython/images/numpy-intro/broadcasting-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/04_scipython/images/numpy-intro/broadcasting-1.png
--------------------------------------------------------------------------------
/04_scipython/images/numpy-intro/broadcasting-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/stat479-machine-learning-fs18/HEAD/04_scipython/images/numpy-intro/broadcasting-2.png
--------------------------------------------------------------------------------
/report-template/bibliography.bib:
--------------------------------------------------------------------------------
1 | @article{mirjalili2018gender,
2 | title={Gender Privacy: An Ensemble of Semi Adversarial Networks for Confounding Arbitrary Gender Classifiers},
3 | author={Mirjalili, Vahid and Raschka, Sebastian and Ross, Arun},
4 | journal={arXiv preprint arXiv:1807.11936},
5 | year={2018}
6 | }
--------------------------------------------------------------------------------
/report-template/project-presentation-assessment.md:
--------------------------------------------------------------------------------
1 | # Project Presentation Assessment
2 |
3 | - 10 pts: Is there a motivation for the project given?
4 | - 40 pts: Is the project described well enough that a general audience, familiar with machine learning, can understand the project?
5 | - 20 pts: Are all figures legible and explained well?
6 | - 20 pts: Are the results presented adequately discussed?
7 | - 10 pts: Did all team members contribute to the presentation?
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Datasets
2 | list_attr_celeba.txt
3 | *.zip
4 | *.npz
5 | *.npy
6 | *ubyte.gz
7 | *archive.ics.uci.edu*
8 |
9 | # Binary PyTorch models
10 | *.pt
11 |
12 | # Temporary OS files
13 | .DS_Store
14 |
15 | # TensorFlow Checkpoint files
16 | checkpoint
17 | code/*/*.data-?????-of-?????
18 | code/*/*.index
19 | code/*/*.meta
20 | code/model_zoo/tensorflow_ipynb/*.data-?????-of-?????
21 | code/model_zoo/tensorflow_ipynb/*.index
22 | code/model_zoo/tensorflow_ipynb/*.meta
23 | code/model_zoo/tensorflow_ipynb/cifar-10/*
24 |
25 | # Byte-compiled / optimized / DLL files
26 | __pycache__/
27 | *.py[cod]
28 | *$py.class
29 |
30 | # C extensions
31 | *.so
32 |
33 | # Distribution / packaging
34 | .Python
35 | env/
36 | build/
37 | develop-eggs/
38 | dist/
39 | downloads/
40 | eggs/
41 | .eggs/
42 | lib/
43 | lib64/
44 | parts/
45 | sdist/
46 | var/
47 | *.egg-info/
48 | .installed.cfg
49 | *.egg
50 |
51 | # PyInstaller
52 | # Usually these files are written by a python script from a template
53 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
54 | *.manifest
55 | *.spec
56 |
57 | # Installer logs
58 | pip-log.txt
59 | pip-delete-this-directory.txt
60 |
61 | # Unit test / coverage reports
62 | htmlcov/
63 | .tox/
64 | .coverage
65 | .coverage.*
66 | .cache
67 | nosetests.xml
68 | coverage.xml
69 | *.cover
70 | .hypothesis/
71 |
72 | # Translations
73 | *.mo
74 | *.pot
75 |
76 | # Django stuff:
77 | *.log
78 | local_settings.py
79 |
80 | # Flask stuff:
81 | instance/
82 | .webassets-cache
83 |
84 | # Scrapy stuff:
85 | .scrapy
86 |
87 | # Sphinx documentation
88 | docs/_build/
89 |
90 | # PyBuilder
91 | target/
92 |
93 | # IPython Notebook
94 | .ipynb_checkpoints
95 |
96 | # pyenv
97 | .python-version
98 |
99 | # celery beat schedule file
100 | celerybeat-schedule
101 |
102 | # dotenv
103 | .env
104 |
105 | # virtualenv
106 | venv/
107 | ENV/
108 |
109 | # Spyder project settings
110 | .spyderproject
111 |
112 | # Rope project settings
113 | .ropeproject
114 |
115 | # Datasets
116 | MNIST*
117 |
--------------------------------------------------------------------------------
/report-template/project-report-assessment.md:
--------------------------------------------------------------------------------
1 | # Project Report Assessment
2 |
3 |
4 | ### Abstract: 15 pts
5 |
6 | - Is enough information provided to get a clear idea about the subject matter?
7 | - Does the abstract convey the findings?
8 | - Are the main points of the report described succinctly?
9 |
10 | ### Introduction: 15 pts
11 |
12 | - Does the introduction cover the required background information to understand the work?
13 | - Is the introduction well organized: it starts out general and becomes more specific towards the end?
14 | - Is there a motivation explaining why this project is relevant, important, and/or interesting?
15 |
16 | ### Related Work: 15 pts
17 |
18 | - Is similar and related work discussed adequately?
19 | - Are references cited properly (here, but also throughout the whole paper)?
20 | - Is the discussion or paragraph comparing this project with other people's work adequate?
21 |
22 |
23 | ### Proposed Method: 25 pts
24 |
25 | - Are there any missing descriptions of symbols used in mathematical notations (if applicable)?
26 | - Are the main algorithms described well enough so that they can be implemented by a knowledgeable reader?
27 |
28 | ### Experiments: 25 pts
29 |
30 | - Is the experimental setup and methodology described well enough so that it can be repeated?
31 | - If datasets are used, are they referenced appropriately?
32 |
33 | ### Results and Discussion: 30 pts
34 |
35 | - Are the results described clearly?
36 | - Is the data analyzed well, and are the results logical?
37 | - Are the figures clear and have no missing labels?
38 | - Do the figure captions have sufficient information to understand the figure?
39 | - Is each figure referenced in the text?
40 | - Is the discussion critical/honest, and are potential weaknesses/shortcomings discussed as well?
41 |
42 | ### Conclusions: 15 pts
43 |
44 | - Do the authors describe whether the initial motivation/task was accomplished or not based on the results?
45 | - Is it discussed adequately how the results relate to previous work?
46 | - If applicable, are potential future directions given?
47 |
48 | ### Contributions: 10 pts
49 |
50 | - Are all contributions listed clearly?
51 | - Did each member contribute approximately equally to the project?
52 |
53 |
--------------------------------------------------------------------------------
/hw_03/helper.py:
--------------------------------------------------------------------------------
1 | # Copyright Sebastian Raschka 2018
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 |
6 |
7 | def plot_confusion_matrix(conf_mat,
8 | hide_spines=False,
9 | hide_ticks=False,
10 | figsize=None,
11 | cmap=None,
12 | colorbar=False,
13 | show_absolute=True,
14 | show_normed=False):
15 |
16 | if not (show_absolute or show_normed):
17 | raise AssertionError('Both show_absolute and show_normed are False')
18 |
19 | total_samples = conf_mat.sum(axis=1)[:, np.newaxis]
20 | normed_conf_mat = conf_mat.astype('float') / total_samples
21 |
22 |     if figsize is None:  # default size must be set before creating the figure
23 |         figsize = (len(conf_mat)*1.25, len(conf_mat)*1.25)
24 | 
25 |     fig, ax = plt.subplots(figsize=figsize)
26 |     ax.grid(False)
27 |     if cmap is None:
28 |         cmap = plt.cm.Blues
29 |
30 | if show_absolute:
31 | matshow = ax.matshow(conf_mat, cmap=cmap)
32 | else:
33 | matshow = ax.matshow(normed_conf_mat, cmap=cmap)
34 |
35 | if colorbar:
36 | fig.colorbar(matshow)
37 |
38 | for i in range(conf_mat.shape[0]):
39 | for j in range(conf_mat.shape[1]):
40 | cell_text = ""
41 | if show_absolute:
42 | cell_text += format(conf_mat[i, j], 'd')
43 | if show_normed:
44 | cell_text += "\n" + '('
45 | cell_text += format(normed_conf_mat[i, j], '.2f') + ')'
46 | else:
47 | cell_text += format(normed_conf_mat[i, j], '.2f')
48 | ax.text(x=j,
49 | y=i,
50 | s=cell_text,
51 | va='center',
52 | ha='center',
53 | color="white" if normed_conf_mat[i, j] > 0.5 else "black")
54 |
55 | if hide_spines:
56 | ax.spines['right'].set_visible(False)
57 | ax.spines['top'].set_visible(False)
58 | ax.spines['left'].set_visible(False)
59 | ax.spines['bottom'].set_visible(False)
60 | ax.yaxis.set_ticks_position('left')
61 | ax.xaxis.set_ticks_position('bottom')
62 | if hide_ticks:
63 | ax.axes.get_yaxis().set_ticks([])
64 | ax.axes.get_xaxis().set_ticks([])
65 |
66 | plt.xlabel('predicted label')
67 | plt.ylabel('true label')
68 | return fig, ax
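69 | 
70 | 
71 | # Minimal usage sketch (hypothetical 2x2 confusion matrix, not taken
72 | # from any homework set): rows are true labels, columns are predictions.
73 | if __name__ == '__main__':
74 |     cm = np.array([[13, 2],
75 |                    [1, 14]])
76 |     fig, ax = plot_confusion_matrix(cm, show_absolute=True, show_normed=True)
77 |     plt.show()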
--------------------------------------------------------------------------------
/other/dl-course-info.md:
--------------------------------------------------------------------------------
1 | # STAT 479 SS 2019: Deep Learning
2 |
3 | ## Abstract
4 |
5 | Deep learning is an exciting, young field that specializes in discovering and extracting intricate structures in large, unstructured datasets for parameterizing artificial neural networks with many layers. Since deep learning has pushed the state-of-the-art in many applications, it's become indispensable for modern technology. This is owed to the vast utility of deep learning for tackling complex tasks in the fields of computer vision and natural language processing -- tasks that humans are good at but are traditionally challenging for computers. This includes tasks such as image classification, object detection, and speech recognition.
6 |
7 | The focus of this course will be on understanding artificial neural networks and deep learning algorithmically (discussing the math behind these methods on a basic level) and implementing network models in code as well as applying these to real-world datasets. Some of the topics that will be covered include convolutional neural networks for image classification and object detection, recurrent neural networks for modeling text, and generative adversarial networks for generating new data.
8 |
9 | Familiarity with general machine learning concepts (such as the FS2018 STAT479: Machine Learning course) is recommended but not required. We will review some relevant background concepts, which include general machine learning concepts such as supervised learning, classification, model evaluation, etc. Furthermore, some lectures will focus on reviewing the use of Python's stack for scientific computing (NumPy, SciPy, matplotlib) prior to the introduction of PyTorch as the main computational deep learning library that we are going to use in this course.
10 |
11 |
12 | ## Tentative List of Topics
13 |
14 | - brief history of neural networks and what makes deep learning different from "classic machine learning"
15 | - introducing the concept of neural networks by connecting it to familiar concepts such as logistic regression and multinomial logistic regression (which can be seen as special cases: single-layer neural nets; a minimal code sketch follows below)
16 | - modeling and deriving non-convex loss functions through computation graphs
17 | - introduction to automatic differentiation and PyTorch for efficient data manipulation using GPUs
18 | - convolutional neural networks for analyzing unstructured data (image analysis)
19 | - using 1D convolutions for sequence analysis
20 | - more advanced sequence analysis using recurrent neural networks
21 | - introducing generative models to sample from input distributions: autoencoders, variational autoencoders, and generative adversarial neural networks
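22 | 
23 | As a minimal sketch of the "single-layer neural net" view of logistic regression mentioned above (assuming PyTorch; the toy data and variable names are hypothetical and for illustration only):
24 | 
25 | ```python
26 | import torch
27 | import torch.nn as nn
28 | 
29 | # Binary logistic regression: one linear layer followed by a sigmoid.
30 | model = nn.Sequential(nn.Linear(2, 1), nn.Sigmoid())
31 | loss_fn = nn.BCELoss()
32 | optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
33 | 
34 | # Toy data: 4 examples with 2 features each, binary labels.
35 | X = torch.tensor([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
36 | y = torch.tensor([[0.], [0.], [0.], [1.]])
37 | 
38 | for _ in range(100):
39 |     optimizer.zero_grad()
40 |     loss = loss_fn(model(X), y)  # forward pass builds the computation graph
41 |     loss.backward()              # gradients via automatic differentiation
42 |     optimizer.step()
43 | ```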
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # STAT479: Machine Learning (Fall 2018)
2 |
3 | Instructor: Sebastian Raschka
4 |
5 | Lecture material for the Machine Learning course (STAT 479) at the University of Wisconsin-Madison. For details, please see the course website at http://pages.stat.wisc.edu/~sraschka/teaching/stat479-fs2018/
6 |
7 |
8 |
9 | **Part I: Introduction**
10 |
11 | - [Lecture 1](01_overview): What is Machine Learning? An Overview.
12 | - [Lecture 2](02_knn): Intro to Supervised Learning: KNN
13 |
14 | **Part II: Computational Foundations**
15 |
16 | - [Lecture 3](03_python): Using Python, Anaconda, IPython, Jupyter Notebooks
17 | - [Lecture 4](04_scipython): Scientific Computing with NumPy, SciPy, and Matplotlib
18 | - [Lecture 5](05_sklearn): Data Preprocessing and Machine Learning with Scikit-Learn
19 |
20 | **Part III: Tree-Based Methods**
21 |
22 | - [Lecture 6](06_trees): Decision Trees
23 | - [Lecture 7](07_ensembles): Ensemble Methods
24 |
25 | **Part IV: Evaluation**
26 |
27 | - [Lecture 8](08_eval-intro): Model Evaluation 1: Introduction to Overfitting and Underfitting
28 | - [Lecture 9](09_eval-ci): Model Evaluation 2: Uncertainty Estimates and Resampling
29 | - [Lecture 10](10_eval-cv): Model Evaluation 3: Model Selection and Cross-Validation
30 | - [Lecture 11](11_eval-algo): Model Evaluation 4: Algorithm Selection and Statistical Tests
31 | - [Lecture 12](12_eval-metrics): Model Evaluation 5: Performance Metrics
32 |
33 | **Part V: Dimensionality Reduction**
34 |
35 | - [Lecture 13](13_feat-sele): Feature Selection
36 | - [Lecture 14](14_feat-extract): Feature Extraction
37 |
38 | **Due to time constraints, the following topics could unfortunately not be covered:**
39 |
40 | **Part VI: Bayesian Learning**
41 |
42 | - Bayes Classifiers
43 | - Text Data & Sentiment Analysis
44 | - Naive Bayes Classification
45 |
46 | **Part VII: Regression and Unsupervised Learning**
47 |
48 | - Regression Analysis
49 | - Clustering
50 |
51 | **The following topics will be covered at the beginning of the
52 | Deep Learning class next Spring.** [Tentative outline of the DL course](./other/dl-course-info.md).
53 |
54 | **Part VIII: Introduction to Artificial Neural Networks**
55 |
56 | - Perceptron
57 | - Adaline & Logistic Regression
58 | - SVM
59 | - Multilayer Perceptron
60 |
61 |
62 | This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.
63 |
64 |
65 |
66 |
67 |
68 |
69 | Teaching this class was a pleasure, and I am especially happy about how awesome the class projects turned out. Listed below are the winners of the three award categories as determined by ~210 votes. Congratulations!
70 |
71 | 
--------------------------------------------------------------------------------
/hw_01/test_data.txt:
--------------------------------------------------------------------------------
1 | x1 x2 y
2 | -5.75 -6.83 0
3 | 5.51 3.67 1
4 | 5.11 5.32 1
5 | 0.85 -4.11 0
6 | -0.50 -0.45 1
7 | -12.65 -12.05 0
8 | -4.22 -6.39 0
9 | -0.56 -10.23 0
10 | 2.82 1.68 1
11 | 3.44 -7.70 0
12 | 9.56 -7.29 1
13 | 11.22 5.10 1
14 | -2.90 -8.44 0
15 | 3.65 -10.13 0
16 | -5.95 -6.79 0
17 | 10.30 6.20 1
18 | 11.59 5.99 1
19 | -8.87 -2.64 0
20 | -2.63 -6.28 0
21 | 14.82 5.55 1
22 | 4.70 2.81 1
23 | -5.90 2.11 0
24 | -3.98 -8.53 0
25 | 10.52 -0.67 1
26 | -6.96 -3.70 0
27 | -4.06 -1.97 1
28 | 7.40 -0.49 1
29 | -2.08 -3.87 0
30 | -4.07 -2.24 0
31 | 7.31 0.19 1
32 | 2.26 3.73 1
33 | -6.76 -9.25 0
34 | 2.80 0.13 0
35 | -6.79 -5.64 0
36 | 5.54 9.07 1
37 | 0.36 3.12 1
38 | -0.09 -5.57 0
39 | -2.43 -8.09 0
40 | -0.77 7.97 1
41 | -2.36 -3.81 0
42 | -2.96 -1.82 0
43 | -7.74 -4.67 0
44 | -4.85 -12.71 0
45 | 1.07 -4.86 0
46 | -4.71 -2.16 0
47 | -5.00 -6.76 0
48 | -11.60 4.64 0
49 | 4.39 0.39 1
50 | 0.14 0.06 1
51 | 7.64 5.08 1
52 | 8.37 3.39 1
53 | 1.59 9.37 1
54 | 7.96 7.02 1
55 | 3.73 -4.61 0
56 | -8.17 -9.61 0
57 | -1.95 -4.46 0
58 | 0.93 -1.05 1
59 | -14.65 -1.69 0
60 | -7.93 -7.95 0
61 | 7.68 9.08 1
62 | 9.50 -2.88 1
63 | 5.17 7.50 1
64 | -4.86 -6.51 0
65 | 1.94 1.10 1
66 | -0.32 -12.92 0
67 | 7.44 -0.90 1
68 | 10.65 3.87 1
69 | -10.45 -2.66 0
70 | 7.48 -2.95 1
71 | 0.28 -0.52 0
72 | 3.18 -13.24 0
73 | 8.39 0.84 1
74 | 8.86 4.78 1
75 | 0.49 10.36 1
76 | 2.36 -12.78 0
77 | -1.97 -7.52 0
78 | 1.87 -8.03 0
79 | 3.50 5.48 1
80 | -5.58 -2.99 0
81 | 6.99 -8.59 1
82 | -6.34 -3.89 0
83 | 11.34 2.99 1
84 | -0.56 -10.16 0
85 | 8.08 6.18 1
86 | 8.94 2.05 1
87 | -11.12 -2.71 0
88 | 10.76 2.59 1
89 | 0.03 1.11 1
90 | 0.84 2.83 1
91 | 8.36 8.34 1
92 | -4.38 -4.40 0
93 | -6.94 -8.48 0
94 | -11.82 1.06 0
95 | -7.66 -5.78 0
96 | 3.29 -0.30 1
97 | 6.47 7.38 1
98 | 2.08 -6.21 0
99 | 5.97 4.18 1
100 | -1.57 -6.36 0
101 | -1.53 -3.74 0
102 | -2.84 -0.15 0
103 | 12.69 -4.20 1
104 | -7.43 -4.21 0
105 | 3.81 -8.34 0
106 | 4.76 0.32 1
107 | 11.87 6.52 1
108 | -2.01 3.78 0
109 | 1.95 0.55 1
110 | 3.51 -6.28 1
111 | -3.27 -2.19 0
112 | -5.74 1.53 0
113 | 6.98 2.86 1
114 | -7.02 -7.18 0
115 | 2.49 8.94 1
116 | -3.52 1.14 0
117 | 9.68 0.98 1
118 | -13.70 -7.31 0
119 | 11.38 4.25 1
120 | -5.46 -4.15 0
121 | -0.68 -8.03 0
122 | 0.10 -3.51 0
123 | 10.43 6.93 1
124 | 2.74 -4.24 0
125 | -2.99 -6.52 0
126 | -4.69 1.39 0
127 | 6.87 9.68 1
128 | 6.20 4.20 1
129 | 6.75 -1.85 1
130 | 6.32 9.44 1
131 | -6.92 -8.03 0
132 | 12.44 2.15 1
133 | -7.26 -1.17 0
134 | -11.95 1.21 0
135 | -3.93 -5.76 0
136 | 0.84 8.70 1
137 | 0.45 -0.26 1
138 | -0.82 -8.39 0
139 | -7.75 -12.57 0
140 | 7.03 -2.10 1
141 | -4.95 -13.39 0
142 | 5.64 1.28 1
143 | 5.47 6.38 1
144 | 3.04 -4.91 1
145 | -3.33 -3.80 0
146 | -5.89 0.18 0
147 | 8.61 10.52 1
148 | -1.91 -2.04 1
149 | 3.86 5.78 1
150 | -3.50 -5.25 0
151 | 0.78 2.49 1
152 | 8.84 3.60 1
153 | -3.50 0.86 0
154 | -7.13 -8.24 0
155 | 2.82 -8.17 0
156 | 6.67 3.99 1
157 | 10.19 3.48 1
158 | 9.79 -2.40 1
159 | 2.12 -3.79 0
160 | 11.98 5.16 1
161 | 10.65 7.99 1
162 | 9.95 0.36 1
163 | 6.19 0.89 1
164 | -3.94 -10.17 0
165 | -4.30 -9.05 0
166 | 12.59 -3.56 1
167 | 5.04 2.32 1
168 | -9.20 -14.65 0
169 | -8.35 -0.15 0
170 | -5.98 -4.62 0
171 | 4.39 1.88 1
172 | 1.01 8.72 1
173 | 0.25 5.29 1
174 | 7.30 -1.07 1
175 | -2.65 -5.44 0
176 | 12.10 -6.39 1
177 | 8.95 -1.73 1
178 | 8.79 3.18 1
179 | 3.42 12.11 1
180 | 8.71 6.47 1
181 | -15.19 -2.76 0
182 | -3.15 -9.35 0
183 | -3.26 -7.77 0
184 | 12.06 -1.95 1
185 | -1.07 -2.64 0
186 | 0.80 5.37 1
187 | 4.76 -7.93 0
188 | -2.68 -16.15 0
189 | -2.63 -8.02 0
190 | 13.31 -3.46 1
191 | 8.58 -4.67 1
192 | 4.69 2.50 1
193 | 3.25 5.99 1
194 | 1.29 6.16 1
195 | -3.17 -5.06 0
196 | -2.64 -3.66 0
197 | -3.89 -12.56 0
198 | 3.14 5.05 1
199 | 8.05 7.63 1
200 | -4.87 -6.22 0
201 | -12.42 -6.33 0
202 |
--------------------------------------------------------------------------------
/02_knn/iris.csv:
--------------------------------------------------------------------------------
1 | Id,SepalLength[cm],SepalWidth[cm],PetalLength[cm],PetalWidth[cm],Species
2 | 1,5.1,3.5,1.4,0.2,Iris-setosa
3 | 2,4.9,3.0,1.4,0.2,Iris-setosa
4 | 3,4.7,3.2,1.3,0.2,Iris-setosa
5 | 4,4.6,3.1,1.5,0.2,Iris-setosa
6 | 5,5.0,3.6,1.4,0.2,Iris-setosa
7 | 6,5.4,3.9,1.7,0.4,Iris-setosa
8 | 7,4.6,3.4,1.4,0.3,Iris-setosa
9 | 8,5.0,3.4,1.5,0.2,Iris-setosa
10 | 9,4.4,2.9,1.4,0.2,Iris-setosa
11 | 10,4.9,3.1,1.5,0.1,Iris-setosa
12 | 11,5.4,3.7,1.5,0.2,Iris-setosa
13 | 12,4.8,3.4,1.6,0.2,Iris-setosa
14 | 13,4.8,3.0,1.4,0.1,Iris-setosa
15 | 14,4.3,3.0,1.1,0.1,Iris-setosa
16 | 15,5.8,4.0,1.2,0.2,Iris-setosa
17 | 16,5.7,4.4,1.5,0.4,Iris-setosa
18 | 17,5.4,3.9,1.3,0.4,Iris-setosa
19 | 18,5.1,3.5,1.4,0.3,Iris-setosa
20 | 19,5.7,3.8,1.7,0.3,Iris-setosa
21 | 20,5.1,3.8,1.5,0.3,Iris-setosa
22 | 21,5.4,3.4,1.7,0.2,Iris-setosa
23 | 22,5.1,3.7,1.5,0.4,Iris-setosa
24 | 23,4.6,3.6,1.0,0.2,Iris-setosa
25 | 24,5.1,3.3,1.7,0.5,Iris-setosa
26 | 25,4.8,3.4,1.9,0.2,Iris-setosa
27 | 26,5.0,3.0,1.6,0.2,Iris-setosa
28 | 27,5.0,3.4,1.6,0.4,Iris-setosa
29 | 28,5.2,3.5,1.5,0.2,Iris-setosa
30 | 29,5.2,3.4,1.4,0.2,Iris-setosa
31 | 30,4.7,3.2,1.6,0.2,Iris-setosa
32 | 31,4.8,3.1,1.6,0.2,Iris-setosa
33 | 32,5.4,3.4,1.5,0.4,Iris-setosa
34 | 33,5.2,4.1,1.5,0.1,Iris-setosa
35 | 34,5.5,4.2,1.4,0.2,Iris-setosa
36 | 35,4.9,3.1,1.5,0.1,Iris-setosa
37 | 36,5.0,3.2,1.2,0.2,Iris-setosa
38 | 37,5.5,3.5,1.3,0.2,Iris-setosa
39 | 38,4.9,3.1,1.5,0.1,Iris-setosa
40 | 39,4.4,3.0,1.3,0.2,Iris-setosa
41 | 40,5.1,3.4,1.5,0.2,Iris-setosa
42 | 41,5.0,3.5,1.3,0.3,Iris-setosa
43 | 42,4.5,2.3,1.3,0.3,Iris-setosa
44 | 43,4.4,3.2,1.3,0.2,Iris-setosa
45 | 44,5.0,3.5,1.6,0.6,Iris-setosa
46 | 45,5.1,3.8,1.9,0.4,Iris-setosa
47 | 46,4.8,3.0,1.4,0.3,Iris-setosa
48 | 47,5.1,3.8,1.6,0.2,Iris-setosa
49 | 48,4.6,3.2,1.4,0.2,Iris-setosa
50 | 49,5.3,3.7,1.5,0.2,Iris-setosa
51 | 50,5.0,3.3,1.4,0.2,Iris-setosa
52 | 51,7.0,3.2,4.7,1.4,Iris-versicolor
53 | 52,6.4,3.2,4.5,1.5,Iris-versicolor
54 | 53,6.9,3.1,4.9,1.5,Iris-versicolor
55 | 54,5.5,2.3,4.0,1.3,Iris-versicolor
56 | 55,6.5,2.8,4.6,1.5,Iris-versicolor
57 | 56,5.7,2.8,4.5,1.3,Iris-versicolor
58 | 57,6.3,3.3,4.7,1.6,Iris-versicolor
59 | 58,4.9,2.4,3.3,1.0,Iris-versicolor
60 | 59,6.6,2.9,4.6,1.3,Iris-versicolor
61 | 60,5.2,2.7,3.9,1.4,Iris-versicolor
62 | 61,5.0,2.0,3.5,1.0,Iris-versicolor
63 | 62,5.9,3.0,4.2,1.5,Iris-versicolor
64 | 63,6.0,2.2,4.0,1.0,Iris-versicolor
65 | 64,6.1,2.9,4.7,1.4,Iris-versicolor
66 | 65,5.6,2.9,3.6,1.3,Iris-versicolor
67 | 66,6.7,3.1,4.4,1.4,Iris-versicolor
68 | 67,5.6,3.0,4.5,1.5,Iris-versicolor
69 | 68,5.8,2.7,4.1,1.0,Iris-versicolor
70 | 69,6.2,2.2,4.5,1.5,Iris-versicolor
71 | 70,5.6,2.5,3.9,1.1,Iris-versicolor
72 | 71,5.9,3.2,4.8,1.8,Iris-versicolor
73 | 72,6.1,2.8,4.0,1.3,Iris-versicolor
74 | 73,6.3,2.5,4.9,1.5,Iris-versicolor
75 | 74,6.1,2.8,4.7,1.2,Iris-versicolor
76 | 75,6.4,2.9,4.3,1.3,Iris-versicolor
77 | 76,6.6,3.0,4.4,1.4,Iris-versicolor
78 | 77,6.8,2.8,4.8,1.4,Iris-versicolor
79 | 78,6.7,3.0,5.0,1.7,Iris-versicolor
80 | 79,6.0,2.9,4.5,1.5,Iris-versicolor
81 | 80,5.7,2.6,3.5,1.0,Iris-versicolor
82 | 81,5.5,2.4,3.8,1.1,Iris-versicolor
83 | 82,5.5,2.4,3.7,1.0,Iris-versicolor
84 | 83,5.8,2.7,3.9,1.2,Iris-versicolor
85 | 84,6.0,2.7,5.1,1.6,Iris-versicolor
86 | 85,5.4,3.0,4.5,1.5,Iris-versicolor
87 | 86,6.0,3.4,4.5,1.6,Iris-versicolor
88 | 87,6.7,3.1,4.7,1.5,Iris-versicolor
89 | 88,6.3,2.3,4.4,1.3,Iris-versicolor
90 | 89,5.6,3.0,4.1,1.3,Iris-versicolor
91 | 90,5.5,2.5,4.0,1.3,Iris-versicolor
92 | 91,5.5,2.6,4.4,1.2,Iris-versicolor
93 | 92,6.1,3.0,4.6,1.4,Iris-versicolor
94 | 93,5.8,2.6,4.0,1.2,Iris-versicolor
95 | 94,5.0,2.3,3.3,1.0,Iris-versicolor
96 | 95,5.6,2.7,4.2,1.3,Iris-versicolor
97 | 96,5.7,3.0,4.2,1.2,Iris-versicolor
98 | 97,5.7,2.9,4.2,1.3,Iris-versicolor
99 | 98,6.2,2.9,4.3,1.3,Iris-versicolor
100 | 99,5.1,2.5,3.0,1.1,Iris-versicolor
101 | 100,5.7,2.8,4.1,1.3,Iris-versicolor
102 | 101,6.3,3.3,6.0,2.5,Iris-virginica
103 | 102,5.8,2.7,5.1,1.9,Iris-virginica
104 | 103,7.1,3.0,5.9,2.1,Iris-virginica
105 | 104,6.3,2.9,5.6,1.8,Iris-virginica
106 | 105,6.5,3.0,5.8,2.2,Iris-virginica
107 | 106,7.6,3.0,6.6,2.1,Iris-virginica
108 | 107,4.9,2.5,4.5,1.7,Iris-virginica
109 | 108,7.3,2.9,6.3,1.8,Iris-virginica
110 | 109,6.7,2.5,5.8,1.8,Iris-virginica
111 | 110,7.2,3.6,6.1,2.5,Iris-virginica
112 | 111,6.5,3.2,5.1,2.0,Iris-virginica
113 | 112,6.4,2.7,5.3,1.9,Iris-virginica
114 | 113,6.8,3.0,5.5,2.1,Iris-virginica
115 | 114,5.7,2.5,5.0,2.0,Iris-virginica
116 | 115,5.8,2.8,5.1,2.4,Iris-virginica
117 | 116,6.4,3.2,5.3,2.3,Iris-virginica
118 | 117,6.5,3.0,5.5,1.8,Iris-virginica
119 | 118,7.7,3.8,6.7,2.2,Iris-virginica
120 | 119,7.7,2.6,6.9,2.3,Iris-virginica
121 | 120,6.0,2.2,5.0,1.5,Iris-virginica
122 | 121,6.9,3.2,5.7,2.3,Iris-virginica
123 | 122,5.6,2.8,4.9,2.0,Iris-virginica
124 | 123,7.7,2.8,6.7,2.0,Iris-virginica
125 | 124,6.3,2.7,4.9,1.8,Iris-virginica
126 | 125,6.7,3.3,5.7,2.1,Iris-virginica
127 | 126,7.2,3.2,6.0,1.8,Iris-virginica
128 | 127,6.2,2.8,4.8,1.8,Iris-virginica
129 | 128,6.1,3.0,4.9,1.8,Iris-virginica
130 | 129,6.4,2.8,5.6,2.1,Iris-virginica
131 | 130,7.2,3.0,5.8,1.6,Iris-virginica
132 | 131,7.4,2.8,6.1,1.9,Iris-virginica
133 | 132,7.9,3.8,6.4,2.0,Iris-virginica
134 | 133,6.4,2.8,5.6,2.2,Iris-virginica
135 | 134,6.3,2.8,5.1,1.5,Iris-virginica
136 | 135,6.1,2.6,5.6,1.4,Iris-virginica
137 | 136,7.7,3.0,6.1,2.3,Iris-virginica
138 | 137,6.3,3.4,5.6,2.4,Iris-virginica
139 | 138,6.4,3.1,5.5,1.8,Iris-virginica
140 | 139,6.0,3.0,4.8,1.8,Iris-virginica
141 | 140,6.9,3.1,5.4,2.1,Iris-virginica
142 | 141,6.7,3.1,5.6,2.4,Iris-virginica
143 | 142,6.9,3.1,5.1,2.3,Iris-virginica
144 | 143,5.8,2.7,5.1,1.9,Iris-virginica
145 | 144,6.8,3.2,5.9,2.3,Iris-virginica
146 | 145,6.7,3.3,5.7,2.5,Iris-virginica
147 | 146,6.7,3.0,5.2,2.3,Iris-virginica
148 | 147,6.3,2.5,5.0,1.9,Iris-virginica
149 | 148,6.5,3.0,5.2,2.0,Iris-virginica
150 | 149,6.2,3.4,5.4,2.3,Iris-virginica
151 | 150,5.9,3.0,5.1,1.8,Iris-virginica
--------------------------------------------------------------------------------
/report-template/report.tex:
--------------------------------------------------------------------------------
1 | \documentclass[10pt,twocolumn,letterpaper]{article}
2 |
3 | \usepackage{statcourse}
4 | \usepackage{times}
5 | \usepackage{epsfig}
6 | \usepackage{graphicx}
7 | \usepackage{amsmath}
8 | \usepackage{amssymb}
9 |
10 | % Include other packages here, before hyperref.
11 |
12 | % If you comment hyperref and then uncomment it, you should delete
13 | % egpaper.aux before re-running latex. (Or just hit 'q' on the first latex
14 | % run, let it finish, and you should be clear).
15 | \usepackage[breaklinks=true,bookmarks=false]{hyperref}
16 |
17 |
18 | \statcoursefinalcopy
19 |
20 |
21 | \setcounter{page}{1}
22 | \begin{document}
23 |
24 |
25 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
26 | % DO NOT EDIT ANYTHING ABOVE THIS LINE
27 | % EXCEPT IF YOU LIKE TO USE ADDITIONAL PACKAGES
28 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
29 |
30 |
31 |
32 | %%%%%%%%% TITLE
33 | \title{\LaTeX\ Template for STAT479 Project Report}
34 |
35 | \author{First Author\\
36 | {\tt\small firstauthor@wisc.edu}
37 | \and
38 | Second Author\\
39 | {\tt\small secondauthor@wisc.edu}
40 | \and
41 | Third Author\\
42 | {\tt\small thirdauthor@wisc.edu}
43 | }
44 |
45 | \maketitle
46 | %\thispagestyle{empty}
47 |
48 |
49 |
50 | % MAIN ARTICLE GOES BELOW
51 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
52 |
53 |
54 | %%%%%%%%% ABSTRACT
55 | \begin{abstract}
56 | The abstract for your project goes here. The length of the abstract
57 | should be between 200 and 250 words. Tips for writing a good abstract
58 | can be found at \url{https://writing.wisc.edu/Handbook/presentations_abstracts.html}.
59 | \end{abstract}
60 |
61 | %%%%%%%%% BODY TEXT
62 | \section{Introduction}
63 |
64 | This template is based on the CVPR conference template\footnote{\url{http://statcourse2018.thecvf.com/submission/main_conference/author_guidelines}}.
65 |
66 | The information in this template is very minimal, and this file should serve you as a framework for writing your report. You may prefer to use a more collaboration-friendly tool while drafting the report with your classmates before you prepare the final report for submission. Remember that you should \textbf{submit both the report and code} you used for this project via Canvas. Also, \textbf{only one member per team} needs to submit the project material.
67 |
68 |
69 | This is an example of a mathematical equation:
70 |
71 | $$f(\mathbf{x}; \mathbf{w}) = \sum_{i=1}^{n} w_ix_i.$$
72 |
73 | This is a mathematical expression, $h(\mathbf{x}) = \hat{y}$ formatted in text.
74 |
75 | The project report should be 6-8 pages long (not counting references)
76 | and should contain the sections that are already provided in this paper. Please
77 | check out the text in these sections for further information.
78 |
79 |
80 | \subsection{Subsection}
81 |
82 | You can use paragraphs or subsections to further structure your
83 | main sections. This is an example of a subsection.
84 |
85 | \paragraph{This is a paragraph title.} This is an example of a paragraph.
86 |
87 | \section{Related Work}
88 |
89 | Related work should be discussed here. This is an example of a citation \cite{mirjalili2018gender}. To format the citations properly, put the
90 | corresponding references into the bibliography.bib file. You can obtain
91 | BibTeX-formatted references for the "bib" file from Google Scholar
92 | (\url{https://scholar.google.com}), for example, by clicking on the
93 | double-quote character under a citation and then selecting \mbox{"BibTeX"} as
94 | shown in Figure \ref{fig:google-scholar-1col} and
95 | Figure \ref{fig:google-scholar-2col}.
96 |
97 | \begin{figure}[t]
98 | \begin{center}
99 | \includegraphics[width=0.8\linewidth]{figures/google-scholar.pdf}
100 | \end{center}
101 | \caption{Example illustrating how to get BibTeX references from
102 | Google Scholar as a 1-column figure.}
103 | \label{fig:google-scholar-1col}
104 | \end{figure}
105 |
106 |
107 | \begin{figure*}
108 | \begin{center}
109 | \includegraphics[width=0.8\linewidth]{figures/google-scholar.pdf}
110 | \end{center}
111 | \caption{Example illustrating how to get BibTeX references from
112 | Google Scholar as a 2-column figure.}
113 | \label{fig:google-scholar-2col}
114 | \end{figure*}
115 |
116 | Table \ref{tab:some-table} shows an example for formatting a table.
117 |
118 | \begin{table}
119 | \begin{center}
120 | \begin{tabular}{|l|c|}
121 | \hline
122 | Method & Accuracy \\
123 | \hline\hline
124 | Method 1 & $70 \pm 3$ \% \\
125 | Method 2 & $76 \pm 3$ \% \\
126 | \hline
127 | \end{tabular}
128 | \end{center}
129 | \caption{This is an example of a table.}
130 | \label{tab:some-table}
131 | \end{table}
132 |
133 |
134 | \section{Proposed Method}
135 |
136 | Describe the method(s) you are proposing, developing, or using. I.e., details
137 | of the algorithms may be included here.
138 |
139 | \section{Experiments}
140 |
141 | Describe the experiments you performed. You may want to create separate
142 | subsections to further structure this section.
143 |
144 | \subsection{Dataset}
145 |
146 | Briefly describe your dataset in a separate subsection.
147 |
148 |
149 | \subsection{Software}
150 |
151 | Briefly list (and cite) the software you used.
152 |
153 | \subsection{Hardware}
154 |
155 | If relevant, list hardware resources you used.
156 |
157 |
158 | \section{Results and Discussion}
159 |
160 | Describe the results you obtained from the experiments and interpret them.
161 | Optionally, you could split "Results and Discussion" into two separate
162 | sections.
163 |
164 | \section{Conclusions}
165 |
166 | Describe your conclusions here. If there are any future directions, you can
167 | describe them here, or you can create a new section for future directions.
168 |
169 | \section{Acknowledgements}
170 |
171 | List acknowledgements if any. For example, if someone provided you a dataset, or
172 | you used someone else's resources, this is a good place to acknowledge
173 | the help or support you received.
174 |
175 | \section{Contributions}
176 |
177 | Describe the contributions of each team member who worked on this project.
178 |
179 |
180 | {\small
181 | \bibliographystyle{ieee}
182 | \bibliography{bibliography.bib}
183 | }
184 |
185 | \end{document}
186 |
--------------------------------------------------------------------------------
/11_eval-algo/11_eval-algo_code.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "STAT 479: Machine Learning (Fall 2018) \n",
8 | "Instructor: Sebastian Raschka (sraschka@wisc.edu) \n",
9 | "Course website: http://pages.stat.wisc.edu/~sraschka/teaching/stat479-fs2018/"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "# L11: Model Evaluation 4 -- Algorithm Comparison"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 1,
22 | "metadata": {},
23 | "outputs": [
24 | {
25 | "name": "stdout",
26 | "output_type": "stream",
27 | "text": [
28 | "Sebastian Raschka 2018-11-07 \n",
29 | "\n",
30 | "CPython 3.6.7\n",
31 | "IPython 6.5.0\n",
32 | "\n",
33 | "sklearn 0.20.0\n",
34 | "mlxtend 0.14.0dev\n"
35 | ]
36 | }
37 | ],
38 | "source": [
39 | "%load_ext watermark\n",
40 | "%watermark -a 'Sebastian Raschka' -d -p sklearn,mlxtend -v"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 2,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "import numpy as np\n",
50 | "from sklearn.model_selection import GridSearchCV\n",
51 | "from sklearn.model_selection import train_test_split\n",
52 | "from sklearn.model_selection import StratifiedKFold\n",
53 | "from sklearn.model_selection import cross_val_score\n",
54 | "from sklearn.pipeline import Pipeline\n",
55 | "from sklearn.preprocessing import StandardScaler\n",
56 | "from sklearn.linear_model import LogisticRegression\n",
57 | "from sklearn.neighbors import KNeighborsClassifier\n",
58 | "from sklearn.tree import DecisionTreeClassifier\n",
59 | "from sklearn.svm import SVC\n",
60 | "from mlxtend.data import mnist_data\n",
61 | "from sklearn.metrics import accuracy_score\n",
62 | "\n",
63 | "# Loading and splitting the dataset\n",
64 | "# Note that this is a small (stratified) subset\n",
65 | "# of MNIST; it consists of 5000 samples only, that is,\n",
66 | "# 10% of the original MNIST dataset\n",
67 | "# http://yann.lecun.com/exdb/mnist/\n",
68 | "X, y = mnist_data()\n",
69 | "X = X.astype(np.float32)\n",
70 | "X_train, X_test, y_train, y_test = train_test_split(X, y,\n",
71 | " test_size=0.2,\n",
72 | " random_state=1,\n",
73 | " stratify=y)\n",
74 | "\n",
75 | "# Initializing Classifiers\n",
76 | "clf1 = LogisticRegression(multi_class='multinomial',\n",
77 | " solver='newton-cg',\n",
78 | " random_state=1)\n",
79 | "clf2 = KNeighborsClassifier(algorithm='ball_tree',\n",
80 | " leaf_size=50)\n",
81 | "clf3 = DecisionTreeClassifier(random_state=1)\n",
82 | "clf4 = SVC(random_state=1)\n",
83 | "\n",
84 | "# Building the pipelines\n",
85 | "pipe1 = Pipeline([('std', StandardScaler()),\n",
86 | " ('clf1', clf1)])\n",
87 | "\n",
88 | "pipe2 = Pipeline([('std', StandardScaler()),\n",
89 | " ('clf2', clf2)])\n",
90 | "\n",
91 | "pipe4 = Pipeline([('std', StandardScaler()),\n",
92 | " ('clf4', clf4)])\n",
93 | "\n",
94 | "\n",
95 | "# Setting up the parameter grids\n",
96 | "param_grid1 = [{'clf1__penalty': ['l2'],\n",
97 | " 'clf1__C': np.power(10., np.arange(-4, 4))}]\n",
98 | "\n",
99 | "param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),\n",
100 | " 'clf2__p': [1, 2]}]\n",
101 | "\n",
102 | "param_grid3 = [{'max_depth': list(range(1, 10)) + [None],\n",
103 | " 'criterion': ['gini', 'entropy']}]\n",
104 | "\n",
105 | "param_grid4 = [{'clf4__kernel': ['rbf'],\n",
106 | " 'clf4__C': np.power(10., np.arange(-4, 4)),\n",
107 | " 'clf4__gamma': np.power(10., np.arange(-5, 0))},\n",
108 | " {'clf4__kernel': ['linear'],\n",
109 | " 'clf4__C': np.power(10., np.arange(-4, 4))}]\n",
110 | "\n",
111 | "# Setting up multiple GridSearchCV objects, 1 for each algorithm\n",
112 | "gridcvs = {}\n",
113 | "inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)\n",
114 | "\n",
115 | "for pgrid, est, name in zip((param_grid1, param_grid2,\n",
116 | " param_grid3, param_grid4),\n",
117 | " (pipe1, pipe2, clf3, pipe4),\n",
118 | " ('Softmax', 'KNN', 'DTree', 'SVM')):\n",
119 | " gcv = GridSearchCV(estimator=est,\n",
120 | " param_grid=pgrid,\n",
121 | " scoring='accuracy',\n",
122 | " n_jobs=1,\n",
123 | " cv=inner_cv,\n",
124 | " verbose=0,\n",
125 | " refit=True)\n",
126 | " gridcvs[name] = gcv"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 3,
132 | "metadata": {},
133 | "outputs": [
134 | {
135 | "name": "stdout",
136 | "output_type": "stream",
137 | "text": [
138 | "DTree | outer ACC 77.25% +/- 2.05\n",
139 | "KNN | outer ACC 91.17% +/- 1.07\n",
140 | "SVM | outer ACC 91.93% +/- 1.38\n",
141 | "Softmax | outer ACC 90.25% +/- 1.31\n"
142 | ]
143 | }
144 | ],
145 | "source": [
146 | "outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)\n",
147 | "\n",
148 | "for name, gs_est in sorted(gridcvs.items()):\n",
149 | " nested_score = cross_val_score(gs_est, \n",
150 | " X=X_train, \n",
151 | " y=y_train, \n",
152 | " cv=outer_cv,\n",
153 | " n_jobs=-1)\n",
154 | " print('%s | outer ACC %.2f%% +/- %.2f' % \n",
155 | " (name, nested_score.mean() * 100, nested_score.std() * 100))"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 4,
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "name": "stdout",
165 | "output_type": "stream",
166 | "text": [
167 | "Accuracy 91.30% (average over CV test folds)\n",
168 | "Best Parameters: {'clf4__C': 100.0, 'clf4__gamma': 0.001, 'clf4__kernel': 'rbf'}\n",
169 | "Training Accuracy: 100.00%\n",
170 | "Test Accuracy: 93.00%\n"
171 | ]
172 | }
173 | ],
174 | "source": [
175 | "# Fitting a model to the whole training set\n",
176 | "# using the \"best\" algorithm\n",
177 | "best_algo = gridcvs['SVM']\n",
178 | "\n",
179 | "best_algo.fit(X_train, y_train)\n",
180 | "train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))\n",
181 | "test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))\n",
182 | "\n",
183 | "print('Accuracy %.2f%% (average over CV test folds)' %\n",
184 | " (100 * best_algo.best_score_))\n",
185 | "print('Best Parameters: %s' % gridcvs['SVM'].best_params_)\n",
186 | "print('Training Accuracy: %.2f%%' % (100 * train_acc))\n",
187 | "print('Test Accuracy: %.2f%%' % (100 * test_acc))"
188 | ]
189 | }
190 | ],
191 | "metadata": {
192 | "anaconda-cloud": {},
193 | "kernelspec": {
194 | "display_name": "Python 3",
195 | "language": "python",
196 | "name": "python3"
197 | },
198 | "language_info": {
199 | "codemirror_mode": {
200 | "name": "ipython",
201 | "version": 3
202 | },
203 | "file_extension": ".py",
204 | "mimetype": "text/x-python",
205 | "name": "python",
206 | "nbconvert_exporter": "python",
207 | "pygments_lexer": "ipython3",
208 | "version": "3.6.7"
209 | }
210 | },
211 | "nbformat": 4,
212 | "nbformat_minor": 1
213 | }
214 |
--------------------------------------------------------------------------------
/report-template/statcourse.sty:
--------------------------------------------------------------------------------
1 | % ---------------------------------------------------------------
2 | %
3 | % $Id: statcourse.sty,v 1.3 2005/10/24 19:56:15 awf Exp $
4 | %
5 | % by Paolo.Ienne@di.epfl.ch
6 | % some mods by awf@acm.org
7 | %
8 | % ---------------------------------------------------------------
9 | %
10 | % no guarantee is given that the format corresponds perfectly to
11 | % IEEE 8.5" x 11" Proceedings, but most features should be ok.
12 | %
13 | % ---------------------------------------------------------------
14 | % with LaTeX2e:
15 | % =============
16 | %
17 | % use as
18 | % \documentclass[times,10pt,twocolumn]{article}
19 | % \usepackage{latex8}
20 | % \usepackage{times}
21 | %
22 | % ---------------------------------------------------------------
23 |
24 | % with LaTeX 2.09:
25 | % ================
26 | %
27 | % use as
28 | % \documentstyle[times,art10,twocolumn,statcourse]{article}
29 | %
30 | % ---------------------------------------------------------------
31 | % with both versions:
32 | % ===================
33 | %
34 | % specify \statcoursefinalcopy to emit the final camera-ready copy
35 | %
36 | % specify references as
37 | % \bibliographystyle{ieee}
38 | % \bibliography{...your files...}
39 | %
40 | % ---------------------------------------------------------------
41 |
42 | \usepackage{eso-pic}
43 | \usepackage{xspace}
44 |
45 | \typeout{CVPR 8.5 x 11-Inch Proceedings Style `statcourse.sty'.}
46 |
47 | % ten point helvetica bold required for captions
48 | % eleven point times bold required for second-order headings
49 | % in some sites the name of the fonts may differ,
50 | % change the name here:
51 | \font\statcoursetenhv = phvb at 8pt % *** IF THIS FAILS, SEE statcourse.sty ***
52 | \font\elvbf = ptmb scaled 1100
53 |
54 | % If the above lines give an error message, try to comment them and
55 | % uncomment these:
56 | %\font\statcoursetenhv = phvb7t at 8pt
57 | %\font\elvbf = ptmb7t scaled 1100
58 |
59 | % set dimensions of columns, gap between columns, and paragraph indent
60 | \setlength{\textheight}{8.875in}
61 | \setlength{\textwidth}{6.875in}
62 | \setlength{\columnsep}{0.3125in}
63 | \setlength{\topmargin}{0in}
64 | \setlength{\headheight}{0in}
65 | \setlength{\headsep}{0in}
66 | \setlength{\parindent}{1pc}
67 | \setlength{\oddsidemargin}{-.304in}
68 | \setlength{\evensidemargin}{-.304in}
69 |
70 | \newif\ifstatcoursefinal
71 | \statcoursefinalfalse
72 | \def\statcoursefinalcopy{\global\statcoursefinaltrue}
73 |
74 | % memento from size10.clo
75 | % \normalsize{\@setfontsize\normalsize\@xpt\@xiipt}
76 | % \small{\@setfontsize\small\@ixpt{11}}
77 | % \footnotesize{\@setfontsize\footnotesize\@viiipt{9.5}}
78 | % \scriptsize{\@setfontsize\scriptsize\@viipt\@viiipt}
79 | % \tiny{\@setfontsize\tiny\@vpt\@vipt}
80 | % \large{\@setfontsize\large\@xiipt{14}}
81 | % \Large{\@setfontsize\Large\@xivpt{18}}
82 | % \LARGE{\@setfontsize\LARGE\@xviipt{22}}
83 | % \huge{\@setfontsize\huge\@xxpt{25}}
84 | % \Huge{\@setfontsize\Huge\@xxvpt{30}}
85 |
86 | \def\@maketitle
87 | {
88 | \newpage
89 | \null
90 | \vskip .375in
91 | \begin{center}
92 | {\Large \bf \@title \par}
93 | % additional two empty lines at the end of the title
94 | \vspace*{24pt}
95 | {
96 | \large
97 | \lineskip .5em
98 | \begin{tabular}[t]{c}
99 | \ifstatcoursefinal\@author\else Anonymous CVPR submission\\
100 | \vspace*{1pt}\\%This space will need to be here in the final copy, so don't squeeze it out for the review copy.
101 | Paper ID \statcoursePaperID \fi
102 | \end{tabular}
103 | \par
104 | }
105 | % additional small space at the end of the author name
106 | \vskip .5em
107 | % additional empty line at the end of the title block
108 | \vspace*{12pt}
109 | \end{center}
110 | }
111 |
112 | \def\abstract
113 | {%
114 | \centerline{\large\bf Abstract}%
115 | \vspace*{12pt}%
116 | \it%
117 | }
118 |
119 | \def\endabstract
120 | {
121 | % additional empty line at the end of the abstract
122 | \vspace*{12pt}
123 | }
124 |
125 | \def\affiliation#1{\gdef\@affiliation{#1}} \gdef\@affiliation{}
126 |
127 | \newlength{\@ctmp}
128 | \newlength{\@figindent}
129 | \setlength{\@figindent}{1pc}
130 |
131 | \long\def\@makecaption#1#2{
132 | \setbox\@tempboxa\hbox{\small \noindent #1.~#2}
133 | \setlength{\@ctmp}{\hsize}
134 | \addtolength{\@ctmp}{-\@figindent}\addtolength{\@ctmp}{-\@figindent}
135 | % IF longer than one indented paragraph line
136 | \ifdim \wd\@tempboxa >\@ctmp
137 | % THEN DON'T set as an indented paragraph
138 | {\small #1.~#2\par}
139 | \else
140 | % ELSE center
141 | \hbox to\hsize{\hfil\box\@tempboxa\hfil}
142 | \fi}
143 |
144 | % correct heading spacing and type
145 | \def\statcoursesection{\@startsection {section}{1}{\z@}
146 | {10pt plus 2pt minus 2pt}{7pt} {\large\bf}}
147 | \def\statcoursessect#1{\statcoursesection*{#1}}
148 | \def\statcoursesect#1{\statcoursesection{\hskip -1em.~#1}}
149 | \def\section{\@ifstar\statcoursessect\statcoursesect}
150 |
151 | \def\statcoursesubsection{\@startsection {subsection}{2}{\z@}
152 | {8pt plus 2pt minus 2pt}{6pt} {\elvbf}}
153 | \def\statcoursessubsect#1{\statcoursesubsection*{#1}}
154 | \def\statcoursesubsect#1{\statcoursesubsection{\hskip -1em.~#1}}
155 | \def\subsection{\@ifstar\statcoursessubsect\statcoursesubsect}
156 |
157 | %% --------- Page background marks: Ruler and confidentiality
158 |
159 | % ----- define vruler
160 | \makeatletter
161 | \newbox\statcourserulerbox
162 | \newcount\statcourserulercount
163 | \newdimen\statcourseruleroffset
164 | \newdimen\cv@lineheight
165 | \newdimen\cv@boxheight
166 | \newbox\cv@tmpbox
167 | \newcount\cv@refno
168 | \newcount\cv@tot
169 | % NUMBER with left flushed zeros \fillzeros[]
170 | \newcount\cv@tmpc@ \newcount\cv@tmpc
171 | \def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi
172 | \cv@tmpc=1 %
173 | \loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi
174 | \ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat
175 | \ifnum#2<0\advance\cv@tmpc1\relax-\fi
176 | \loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat
177 | \cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}%
178 | % \makevruler[][][][][]
179 | \def\makevruler[#1][#2][#3][#4][#5]{\begingroup\offinterlineskip
180 | \textheight=#5\vbadness=10000\vfuzz=120ex\overfullrule=0pt%
181 | \global\setbox\statcourserulerbox=\vbox to \textheight{%
182 | {\parskip=0pt\hfuzz=150em\cv@boxheight=\textheight
183 | \cv@lineheight=#1\global\statcourserulercount=#2%
184 | \cv@tot\cv@boxheight\divide\cv@tot\cv@lineheight\advance\cv@tot2%
185 | \cv@refno1\vskip-\cv@lineheight\vskip1ex%
186 | \loop\setbox\cv@tmpbox=\hbox to0cm{{\statcoursetenhv\hfil\fillzeros[#4]\statcourserulercount}}%
187 | \ht\cv@tmpbox\cv@lineheight\dp\cv@tmpbox0pt\box\cv@tmpbox\break
188 | \advance\cv@refno1\global\advance\statcourserulercount#3\relax
189 | \ifnum\cv@refno<\cv@tot\repeat}}\endgroup}%
190 | \makeatother
191 | % ----- end of vruler
192 |
193 | % \makevruler[][][][][]
194 | \def\statcourseruler#1{\makevruler[12pt][#1][1][3][0.993\textheight]\usebox{\statcourserulerbox}}
195 | \AddToShipoutPicture{%
196 | \ifstatcoursefinal\else
197 | %\AtTextLowerLeft{%
198 | % \color[gray]{.15}\framebox(\LenToUnit{\textwidth},\LenToUnit{\textheight}){}
199 | %}
200 | \statcourseruleroffset=\textheight
201 | \advance\statcourseruleroffset by -3.7pt
202 | \color[rgb]{.5,.5,1}
203 | \AtTextUpperLeft{%
204 | \put(\LenToUnit{-35pt},\LenToUnit{-\statcourseruleroffset}){%left ruler
205 | \statcourseruler{\statcourserulercount}}
206 | \put(\LenToUnit{\textwidth\kern 30pt},\LenToUnit{-\statcourseruleroffset}){%right ruler
207 | \statcourseruler{\statcourserulercount}}
208 | }
209 | \def\pid{\parbox{1in}{\begin{center}\bf\sf{\small CVPR}\\\#\statcoursePaperID\end{center}}}
210 | \AtTextUpperLeft{%paperID in corners
211 | \put(\LenToUnit{-65pt},\LenToUnit{45pt}){\pid}
212 | \put(\LenToUnit{\textwidth\kern-8pt},\LenToUnit{45pt}){\pid}
213 | }
214 | \AtTextUpperLeft{%confidential
215 | \put(0,\LenToUnit{1cm}){\parbox{\textwidth}{\centering\statcoursetenhv
216 | CVPR 2018 Submission \#\statcoursePaperID. CONFIDENTIAL REVIEW COPY. DO NOT DISTRIBUTE.}}
217 | }
218 | \fi
219 | }
220 |
221 | %%% Make figure placement a little more predictable.
222 | % We trust the user to move figures if this results
223 | % in ugliness.
224 | % Minimize bad page breaks at figures
225 | \renewcommand{\textfraction}{0.01}
226 | \renewcommand{\floatpagefraction}{0.99}
227 | \renewcommand{\topfraction}{0.99}
228 | \renewcommand{\bottomfraction}{0.99}
229 | \renewcommand{\dblfloatpagefraction}{0.99}
230 | \renewcommand{\dbltopfraction}{0.99}
231 | \setcounter{totalnumber}{99}
232 | \setcounter{topnumber}{99}
233 | \setcounter{bottomnumber}{99}
234 |
235 | % Add a period to the end of an abbreviation unless there's one
236 | % already, then \xspace.
237 | \makeatletter
238 | \DeclareRobustCommand\onedot{\futurelet\@let@token\@onedot}
239 | \def\@onedot{\ifx\@let@token.\else.\null\fi\xspace}
240 |
241 | \def\eg{\emph{e.g}\onedot} \def\Eg{\emph{E.g}\onedot}
242 | \def\ie{\emph{i.e}\onedot} \def\Ie{\emph{I.e}\onedot}
243 | \def\cf{\emph{c.f}\onedot} \def\Cf{\emph{C.f}\onedot}
244 | \def\etc{\emph{etc}\onedot} \def\vs{\emph{vs}\onedot}
245 | \def\wrt{w.r.t\onedot} \def\dof{d.o.f\onedot}
246 | \def\etal{\emph{et al}\onedot}
247 | \makeatother
248 |
249 | % ---------------------------------------------------------------
250 |
--------------------------------------------------------------------------------
/hw_01/train_data.txt:
--------------------------------------------------------------------------------
1 | x1 x2 y
2 | -3.84 -4.40 0
3 | 16.36 6.54 1
4 | -2.73 -5.13 0
5 | 4.83 7.22 1
6 | 3.66 -5.34 0
7 | -0.25 3.12 1
8 | -4.05 -5.13 0
9 | 5.92 4.12 1
10 | 5.55 -1.74 1
11 | 5.68 3.40 1
12 | 10.18 8.89 1
13 | -5.23 -6.67 0
14 | -2.94 -7.10 0
15 | 3.17 6.16 1
16 | 1.82 -1.63 1
17 | -9.18 -1.19 0
18 | 1.28 -4.73 0
19 | -1.49 -2.72 0
20 | 7.21 1.48 1
21 | 0.83 6.78 1
22 | -13.54 -1.02 0
23 | 3.14 1.96 1
24 | 0.94 0.11 1
25 | -4.76 -8.73 0
26 | 5.20 7.22 1
27 | 4.49 4.01 1
28 | 5.28 -2.48 1
29 | 6.70 -6.34 0
30 | 5.42 -2.77 1
31 | -0.43 -3.38 0
32 | -5.37 -3.82 0
33 | -0.09 -8.31 0
34 | -10.86 -9.11 0
35 | 2.16 4.69 1
36 | -1.67 0.07 0
37 | 0.18 -9.78 0
38 | 4.27 -13.91 0
39 | 3.71 9.04 1
40 | 9.27 1.85 1
41 | 1.80 4.61 1
42 | -7.37 -11.87 0
43 | -0.37 -7.59 0
44 | -0.96 -5.23 0
45 | -3.35 -6.77 0
46 | 4.13 -7.18 0
47 | 10.44 -6.05 1
48 | -4.22 -7.05 0
49 | 3.72 8.25 1
50 | 2.76 -0.68 1
51 | -3.50 -5.68 0
52 | 5.95 -10.07 0
53 | -5.17 -4.59 0
54 | -1.76 -0.97 0
55 | -7.83 -3.18 0
56 | -1.57 0.57 1
57 | 9.14 4.46 1
58 | -10.80 -4.57 0
59 | -0.08 3.66 0
60 | -3.28 -1.54 1
61 | -1.04 -5.42 0
62 | 10.21 3.82 1
63 | 3.71 2.54 1
64 | 12.28 -0.10 1
65 | -0.84 -3.87 0
66 | 6.53 0.10 1
67 | 8.97 2.10 1
68 | -3.97 -4.71 0
69 | 2.84 -7.89 0
70 | -4.31 -2.16 0
71 | -2.30 -4.22 0
72 | -3.62 -7.97 0
73 | 11.72 -3.33 1
74 | 0.79 -4.98 0
75 | 11.03 -7.03 0
76 | -3.30 -2.64 0
77 | 7.84 -5.64 0
78 | -5.49 -1.57 0
79 | -8.69 -9.69 0
80 | -5.89 -5.96 0
81 | 5.36 2.73 1
82 | 1.53 -4.95 0
83 | -1.05 4.01 1
84 | -4.65 -7.61 0
85 | -4.66 -0.78 0
86 | 1.18 -9.71 0
87 | 4.03 5.24 1
88 | 4.09 4.61 1
89 | -0.88 -4.48 0
90 | 0.56 -5.17 0
91 | 12.29 -2.51 1
92 | 9.77 6.69 1
93 | -4.52 -11.13 0
94 | 0.80 -8.83 0
95 | -4.89 -8.58 0
96 | 3.40 -2.12 1
97 | 3.25 3.71 1
98 | 4.78 0.08 1
99 | 6.11 4.34 1
100 | -7.67 -10.05 0
101 | 2.69 -0.84 1
102 | -3.69 -10.78 0
103 | 0.04 -2.91 1
104 | 8.93 7.30 1
105 | 2.85 1.86 1
106 | 10.66 -2.37 1
107 | 4.36 -2.10 1
108 | 2.53 1.89 1
109 | 8.36 10.60 1
110 | 9.12 -1.53 1
111 | 2.06 -8.03 0
112 | 0.02 -5.39 0
113 | 12.79 8.90 1
114 | -5.52 -9.25 0
115 | 3.61 5.99 1
116 | -5.45 -5.48 0
117 | 2.74 11.48 1
118 | -8.05 1.79 0
119 | 8.87 -3.80 1
120 | 2.33 7.95 1
121 | 5.22 7.43 1
122 | 4.34 0.68 1
123 | 6.33 3.30 1
124 | 9.39 3.89 1
125 | 6.83 2.22 1
126 | 5.69 6.50 1
127 | -6.70 -10.23 0
128 | 0.89 3.70 1
129 | 2.74 -9.34 0
130 | -0.40 6.67 1
131 | 0.63 -0.58 0
132 | -0.97 -0.19 0
133 | -0.38 -13.55 0
134 | 7.35 1.79 1
135 | 3.10 -11.50 0
136 | -1.53 -7.31 0
137 | -5.52 -4.68 0
138 | 4.38 -5.04 0
139 | 2.22 -0.00 1
140 | -1.05 -3.75 0
141 | 1.53 -12.24 0
142 | 6.83 -2.38 1
143 | -3.96 -9.17 0
144 | 3.77 1.20 1
145 | 10.50 -1.03 1
146 | 7.93 0.80 1
147 | 7.26 -6.40 0
148 | 4.84 3.15 1
149 | 10.10 2.34 1
150 | -4.68 -8.24 0
151 | 14.16 2.35 1
152 | -3.83 -0.51 0
153 | -1.74 -7.86 0
154 | 7.38 7.20 1
155 | -5.17 -1.23 0
156 | 3.13 3.11 1
157 | -5.92 -10.49 0
158 | 15.94 9.48 1
159 | -3.12 -9.22 0
160 | 11.43 -4.44 1
161 | -0.05 -4.04 0
162 | 4.63 6.95 1
163 | 4.13 5.42 1
164 | 4.24 -6.61 0
165 | 14.14 -6.83 1
166 | -14.85 -2.24 0
167 | 11.43 1.90 1
168 | 12.33 1.21 1
169 | 4.59 4.69 1
170 | 4.03 0.40 1
171 | 1.64 -2.76 0
172 | 5.90 1.57 1
173 | 2.83 6.11 1
174 | -2.02 -3.45 0
175 | 7.11 8.73 1
176 | 7.76 3.95 1
177 | 5.94 3.97 1
178 | 7.00 4.18 1
179 | -8.12 -12.72 0
180 | -3.11 -4.88 0
181 | 6.72 5.81 1
182 | -8.97 -4.16 0
183 | 6.42 0.60 1
184 | -8.41 -5.61 0
185 | -4.09 -2.59 0
186 | -0.63 -2.20 1
187 | -0.02 -12.95 0
188 | -1.45 -12.04 0
189 | -10.99 4.08 0
190 | 14.14 2.09 1
191 | 1.37 3.49 1
192 | -11.21 -12.60 0
193 | -6.72 -2.12 0
194 | 9.90 2.87 1
195 | 1.43 -10.15 0
196 | -4.91 -8.80 0
197 | -0.15 -6.41 0
198 | -1.50 -5.15 0
199 | -3.31 -6.48 0
200 | 4.82 -2.20 1
201 | 4.88 4.83 1
202 | -4.89 -0.84 0
203 | -2.56 -1.44 0
204 | -5.38 -3.27 0
205 | 5.31 1.29 1
206 | 2.40 -8.01 0
207 | -3.84 1.85 0
208 | -8.64 0.75 0
209 | 6.58 6.45 1
210 | -6.61 -7.82 0
211 | -2.16 -5.64 0
212 | 7.00 1.84 1
213 | 3.56 -7.63 0
214 | 4.14 -3.39 1
215 | 1.21 -5.49 0
216 | 9.53 0.58 1
217 | -8.63 -3.64 0
218 | 10.51 0.32 1
219 | 12.28 8.25 1
220 | 6.30 9.16 1
221 | -8.06 -7.50 0
222 | -8.03 -9.91 0
223 | 6.51 6.24 1
224 | -6.99 -12.41 0
225 | -7.52 -1.73 0
226 | -3.81 -6.57 0
227 | -8.33 0.31 0
228 | -3.07 -0.45 0
229 | 6.49 4.80 1
230 | -2.00 -0.73 0
231 | 5.91 3.55 1
232 | 4.41 -5.24 1
233 | 4.69 -2.42 1
234 | -0.44 -0.16 1
235 | 4.42 3.53 1
236 | 1.96 -2.66 0
237 | 7.35 -1.35 1
238 | -6.70 -1.99 0
239 | -2.80 -2.71 0
240 | -4.58 -6.58 0
241 | -3.40 -3.48 0
242 | -1.53 -0.63 0
243 | -5.97 -2.88 0
244 | 4.52 -3.56 0
245 | -2.74 -3.33 0
246 | -8.16 -0.73 0
247 | 2.88 -1.97 1
248 | -0.15 -5.59 0
249 | 7.59 3.10 1
250 | 5.66 2.11 1
251 | 6.66 1.61 1
252 | -6.10 -8.85 0
253 | 8.85 4.87 1
254 | -0.23 -2.25 0
255 | 5.42 6.79 1
256 | 3.95 -1.02 0
257 | -1.68 6.95 1
258 | 9.08 1.09 1
259 | -6.78 -6.66 0
260 | -2.70 -2.01 0
261 | 8.34 0.42 1
262 | 1.72 0.18 1
263 | 7.00 8.32 1
264 | 7.93 8.65 1
265 | 5.25 8.99 1
266 | 8.60 8.71 1
267 | 6.35 3.75 1
268 | 11.18 -7.69 1
269 | 4.05 7.97 1
270 | -6.92 3.60 0
271 | 9.77 1.08 1
272 | 1.00 -4.85 1
273 | -3.50 -3.90 0
274 | -5.00 -6.54 0
275 | 9.92 8.11 1
276 | 10.27 2.32 1
277 | 12.08 2.77 1
278 | -8.65 -3.61 0
279 | 6.10 -3.14 0
280 | 12.19 1.87 1
281 | 11.21 -0.54 1
282 | 2.47 -2.72 1
283 | 5.38 -2.78 1
284 | 5.18 1.96 1
285 | 10.55 0.84 1
286 | 3.82 9.14 1
287 | -6.08 -14.13 0
288 | -2.09 -2.07 0
289 | 0.05 0.24 1
290 | -3.57 -3.27 0
291 | 0.50 -6.19 0
292 | -5.03 0.37 0
293 | -9.77 -6.21 0
294 | -2.97 -5.53 0
295 | -5.04 -12.17 0
296 | 2.59 -4.90 0
297 | 6.53 0.61 1
298 | 5.29 3.97 1
299 | 1.32 0.07 1
300 | 3.03 7.38 1
301 | -5.93 1.51 0
302 | -0.79 -12.55 0
303 | -4.89 -3.07 0
304 | -2.02 -8.23 0
305 | -1.91 0.51 0
306 | 1.28 -8.06 0
307 | -2.17 -0.35 0
308 | -5.11 -0.12 0
309 | -0.39 -3.54 0
310 | -2.81 -11.67 0
311 | 5.85 5.42 1
312 | 5.46 10.15 1
313 | -3.51 -7.83 0
314 | 3.84 8.11 1
315 | -4.96 -4.69 0
316 | 1.93 9.17 1
317 | 15.33 4.70 1
318 | 7.52 8.67 1
319 | -2.23 -8.06 0
320 | -6.72 -10.20 0
321 | -6.04 -4.30 0
322 | 1.96 -7.93 0
323 | 7.78 -5.09 1
324 | 5.82 3.20 1
325 | 0.76 5.85 1
326 | -6.11 -9.28 0
327 | 3.83 10.35 1
328 | -8.57 -4.99 0
329 | 8.56 5.87 1
330 | 6.15 0.12 1
331 | 4.00 1.99 1
332 | 3.48 -0.73 1
333 | -11.02 -5.98 0
334 | 6.14 5.43 1
335 | -3.27 -2.94 0
336 | 2.18 3.36 1
337 | 0.49 3.84 1
338 | 2.08 1.81 1
339 | 17.31 0.60 1
340 | 2.98 8.29 1
341 | 2.05 5.49 1
342 | 2.29 0.69 0
343 | -3.56 0.85 0
344 | 8.20 -1.62 1
345 | -5.60 -3.07 0
346 | 6.52 3.71 1
347 | -7.34 -3.16 0
348 | -6.43 -7.56 0
349 | -8.50 -7.98 0
350 | 1.36 -0.27 1
351 | 7.82 -3.16 1
352 | 4.59 -1.90 1
353 | 7.24 -5.03 1
354 | -5.51 -6.32 0
355 | 0.34 -4.44 0
356 | 2.02 -2.24 0
357 | -7.31 -4.34 0
358 | -0.46 8.11 1
359 | -1.79 -1.83 0
360 | -11.32 -6.57 0
361 | 2.50 4.13 1
362 | 2.92 8.44 1
363 | 0.69 0.32 1
364 | 10.97 -0.40 1
365 | -1.04 -12.37 0
366 | 3.66 3.09 1
367 | -2.28 -6.20 0
368 | 3.73 -1.49 1
369 | -1.29 -7.59 0
370 | 5.97 -1.52 1
371 | -1.93 0.49 0
372 | 3.40 -2.34 1
373 | 8.66 4.40 1
374 | -2.75 13.66 1
375 | 1.60 -13.26 0
376 | 14.95 4.36 1
377 | 3.86 -1.50 1
378 | 13.71 2.04 1
379 | 2.72 4.63 1
380 | 6.24 -0.43 1
381 | 4.38 -1.27 1
382 | 9.06 9.67 1
383 | 3.83 5.15 1
384 | 4.14 -11.07 0
385 | -4.44 -6.76 0
386 | -6.64 -9.32 0
387 | -4.65 1.24 0
388 | 4.55 0.21 1
389 | 5.57 8.57 1
390 | -4.79 -5.34 0
391 | 2.97 -4.13 1
392 | 5.99 -2.15 1
393 | -4.93 -3.56 0
394 | -8.14 -12.20 0
395 | -0.14 -6.42 0
396 | -4.79 -3.73 0
397 | 0.68 -3.48 0
398 | -4.16 -3.25 0
399 | 10.64 2.00 1
400 | -8.16 -7.55 0
401 | 5.96 5.37 1
402 | 11.09 -3.39 1
403 | 7.46 -4.72 1
404 | -0.42 2.09 0
405 | -1.40 1.66 0
406 | 9.24 -0.16 1
407 | -2.97 -11.87 0
408 | 2.60 -10.34 0
409 | -1.24 -7.76 0
410 | -2.84 -7.49 0
411 | 10.89 9.67 1
412 | 1.16 -5.77 1
413 | 1.94 -5.81 0
414 | 10.42 -0.43 1
415 | -2.81 -3.98 0
416 | 3.73 -4.75 1
417 | 6.19 -2.02 1
418 | 10.06 3.45 1
419 | -1.59 -3.61 0
420 | -0.19 6.68 1
421 | 7.74 5.71 1
422 | 4.56 3.95 1
423 | -3.00 0.04 0
424 | 5.94 1.09 1
425 | -7.53 -2.33 0
426 | 4.57 5.36 1
427 | 5.10 1.44 1
428 | 0.20 -6.57 0
429 | 1.37 8.58 1
430 | -1.90 -12.73 0
431 | -4.96 -9.93 0
432 | -1.05 4.67 1
433 | 0.52 6.56 1
434 | -1.27 -5.65 0
435 | -0.93 0.78 1
436 | -2.12 3.12 1
437 | -3.87 -2.52 0
438 | 3.61 5.72 1
439 | -1.07 -8.50 0
440 | -1.38 -2.40 0
441 | 13.24 1.52 1
442 | -5.94 -6.61 0
443 | 7.74 -6.51 1
444 | 2.35 2.45 1
445 | -1.94 -4.15 0
446 | -6.16 -5.45 0
447 | 6.09 -0.46 1
448 | 1.99 -10.66 0
449 | -4.25 -5.11 0
450 | 4.65 1.91 1
451 | 2.85 5.48 1
452 | -1.24 -10.13 0
453 | 0.93 -12.92 0
454 | 7.44 -4.40 1
455 | 4.18 2.07 1
456 | -1.03 1.92 1
457 | -9.23 -5.69 0
458 | -8.26 -5.02 0
459 | 5.56 -0.05 1
460 | 11.94 5.48 1
461 | 3.57 0.19 1
462 | -4.58 -1.32 0
463 | 2.34 5.58 1
464 | 0.71 -14.05 0
465 | -0.77 -6.71 0
466 | -8.67 -3.51 0
467 | 8.42 -2.26 1
468 | -0.81 -11.36 0
469 | 0.15 -12.24 0
470 | 1.08 2.51 1
471 | 3.28 4.80 1
472 | -0.77 -3.27 1
473 | 4.47 0.62 1
474 | -4.60 -4.60 0
475 | -0.24 1.90 1
476 | -2.33 -5.57 0
477 | -7.92 -7.43 0
478 | 2.76 -5.48 0
479 | 10.90 7.48 1
480 | -4.81 -2.50 0
481 | -9.87 0.80 0
482 | 14.55 3.38 1
483 | -3.02 -5.36 0
484 | -5.06 -10.03 0
485 | 2.62 10.62 1
486 | 6.75 1.53 1
487 | 6.57 4.42 1
488 | -5.56 -3.18 0
489 | -3.70 -7.45 0
490 | 8.03 3.40 1
491 | -3.10 -2.89 0
492 | -3.57 -11.72 0
493 | 2.87 -0.17 1
494 | 5.93 2.04 1
495 | -9.25 -2.29 0
496 | 5.21 10.32 1
497 | -5.71 -2.44 0
498 | -0.46 2.13 0
499 | -1.83 -6.59 0
500 | 4.24 -0.65 1
501 | 5.84 2.89 1
502 | -4.12 -3.02 0
503 | 5.04 6.66 1
504 | -8.33 3.65 0
505 | 6.01 5.23 1
506 | 6.95 -0.67 1
507 | 4.10 2.33 1
508 | -3.21 -9.92 0
509 | -9.60 -9.94 0
510 | -0.52 -0.78 0
511 | 7.93 8.80 1
512 | -9.67 -5.47 0
513 | 3.25 -1.47 1
514 | 10.65 5.04 1
515 | -5.51 2.58 1
516 | -1.96 -1.92 0
517 | -7.86 0.78 0
518 | -1.39 -8.28 0
519 | -2.48 -9.59 0
520 | 10.16 3.82 1
521 | 2.95 3.52 1
522 | 5.94 6.22 1
523 | -2.61 -3.42 0
524 | -10.44 -0.81 0
525 | -3.32 1.46 0
526 | 9.07 0.55 1
527 | 4.19 3.70 1
528 | 1.46 0.04 1
529 | 7.85 3.80 1
530 | 0.84 -5.74 0
531 | -0.22 1.19 1
532 | 9.63 9.58 1
533 | 9.67 2.25 1
534 | 4.58 11.08 1
535 | -8.67 -3.77 0
536 | 8.11 5.11 1
537 | -0.07 -0.68 0
538 | -1.64 -2.83 0
539 | 3.16 0.57 1
540 | -10.26 -12.83 0
541 | -6.24 -3.93 0
542 | -9.27 -7.59 0
543 | 9.04 -4.97 1
544 | -2.17 -9.35 0
545 | -6.71 -6.63 0
546 | 3.85 7.37 1
547 | 1.86 6.00 1
548 | 9.99 6.05 1
549 | -0.42 -4.97 0
550 | -8.11 -8.39 0
551 | -4.51 0.34 0
552 | -4.18 -3.82 0
553 | -6.86 -5.77 0
554 | 9.11 -0.19 1
555 | -1.96 0.63 0
556 | 14.16 -5.06 1
557 | -2.73 -11.75 0
558 | 6.44 3.08 1
559 | 1.01 -2.94 0
560 | -0.31 -0.05 0
561 | -0.63 -6.24 1
562 | -7.52 -4.67 0
563 | -2.70 -2.01 0
564 | 6.00 1.02 1
565 | -5.10 -4.24 0
566 | 11.22 2.00 1
567 | 5.33 -0.79 0
568 | -2.94 0.57 0
569 | 1.86 -2.52 0
570 | -7.77 -4.05 0
571 | 4.95 4.44 1
572 | -10.64 -5.98 0
573 | 8.72 -1.71 1
574 | -0.91 -9.54 0
575 | -2.29 -2.71 1
576 | -7.20 -15.09 0
577 | -4.73 -2.52 0
578 | 5.78 7.52 1
579 | 8.60 2.52 1
580 | 5.55 4.51 1
581 | 2.44 -4.08 0
582 | 0.75 -8.00 0
583 | 12.48 5.19 1
584 | -4.74 3.36 0
585 | 1.39 2.07 1
586 | 5.83 2.80 1
587 | -6.47 -0.05 0
588 | 6.04 5.53 1
589 | -0.94 -12.89 0
590 | 1.00 -10.54 0
591 | -12.01 -0.84 0
592 | 4.10 6.69 1
593 | 6.33 9.37 1
594 | -10.23 -0.92 0
595 | 6.39 -3.54 1
596 | -0.75 -0.03 1
597 | -1.03 -5.81 0
598 | 1.11 4.33 1
599 | -3.33 -5.00 0
600 | 3.58 1.97 1
601 | 5.41 4.52 1
602 |
--------------------------------------------------------------------------------
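`train_data.txt` (and the matching `test_data.txt`) is whitespace-delimited with a one-line `x1 x2 y` header. A minimal loading sketch, assuming the file layout shown above:

    import numpy as np

    data = np.loadtxt('hw_01/train_data.txt', skiprows=1)  # skip the 'x1 x2 y' header
    X_train, y_train = data[:, :2], data[:, 2].astype(int)
    print(X_train.shape, y_train.shape)  # (600, 2) (600,)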
/hw_03/data/wine.data:
--------------------------------------------------------------------------------
1 | 1,14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065
2 | 1,13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050
3 | 1,13.16,2.36,2.67,18.6,101,2.8,3.24,.3,2.81,5.68,1.03,3.17,1185
4 | 1,14.37,1.95,2.5,16.8,113,3.85,3.49,.24,2.18,7.8,.86,3.45,1480
5 | 1,13.24,2.59,2.87,21,118,2.8,2.69,.39,1.82,4.32,1.04,2.93,735
6 | 1,14.2,1.76,2.45,15.2,112,3.27,3.39,.34,1.97,6.75,1.05,2.85,1450
7 | 1,14.39,1.87,2.45,14.6,96,2.5,2.52,.3,1.98,5.25,1.02,3.58,1290
8 | 1,14.06,2.15,2.61,17.6,121,2.6,2.51,.31,1.25,5.05,1.06,3.58,1295
9 | 1,14.83,1.64,2.17,14,97,2.8,2.98,.29,1.98,5.2,1.08,2.85,1045
10 | 1,13.86,1.35,2.27,16,98,2.98,3.15,.22,1.85,7.22,1.01,3.55,1045
11 | 1,14.1,2.16,2.3,18,105,2.95,3.32,.22,2.38,5.75,1.25,3.17,1510
12 | 1,14.12,1.48,2.32,16.8,95,2.2,2.43,.26,1.57,5,1.17,2.82,1280
13 | 1,13.75,1.73,2.41,16,89,2.6,2.76,.29,1.81,5.6,1.15,2.9,1320
14 | 1,14.75,1.73,2.39,11.4,91,3.1,3.69,.43,2.81,5.4,1.25,2.73,1150
15 | 1,14.38,1.87,2.38,12,102,3.3,3.64,.29,2.96,7.5,1.2,3,1547
16 | 1,13.63,1.81,2.7,17.2,112,2.85,2.91,.3,1.46,7.3,1.28,2.88,1310
17 | 1,14.3,1.92,2.72,20,120,2.8,3.14,.33,1.97,6.2,1.07,2.65,1280
18 | 1,13.83,1.57,2.62,20,115,2.95,3.4,.4,1.72,6.6,1.13,2.57,1130
19 | 1,14.19,1.59,2.48,16.5,108,3.3,3.93,.32,1.86,8.7,1.23,2.82,1680
20 | 1,13.64,3.1,2.56,15.2,116,2.7,3.03,.17,1.66,5.1,.96,3.36,845
21 | 1,14.06,1.63,2.28,16,126,3,3.17,.24,2.1,5.65,1.09,3.71,780
22 | 1,12.93,3.8,2.65,18.6,102,2.41,2.41,.25,1.98,4.5,1.03,3.52,770
23 | 1,13.71,1.86,2.36,16.6,101,2.61,2.88,.27,1.69,3.8,1.11,4,1035
24 | 1,12.85,1.6,2.52,17.8,95,2.48,2.37,.26,1.46,3.93,1.09,3.63,1015
25 | 1,13.5,1.81,2.61,20,96,2.53,2.61,.28,1.66,3.52,1.12,3.82,845
26 | 1,13.05,2.05,3.22,25,124,2.63,2.68,.47,1.92,3.58,1.13,3.2,830
27 | 1,13.39,1.77,2.62,16.1,93,2.85,2.94,.34,1.45,4.8,.92,3.22,1195
28 | 1,13.3,1.72,2.14,17,94,2.4,2.19,.27,1.35,3.95,1.02,2.77,1285
29 | 1,13.87,1.9,2.8,19.4,107,2.95,2.97,.37,1.76,4.5,1.25,3.4,915
30 | 1,14.02,1.68,2.21,16,96,2.65,2.33,.26,1.98,4.7,1.04,3.59,1035
31 | 1,13.73,1.5,2.7,22.5,101,3,3.25,.29,2.38,5.7,1.19,2.71,1285
32 | 1,13.58,1.66,2.36,19.1,106,2.86,3.19,.22,1.95,6.9,1.09,2.88,1515
33 | 1,13.68,1.83,2.36,17.2,104,2.42,2.69,.42,1.97,3.84,1.23,2.87,990
34 | 1,13.76,1.53,2.7,19.5,132,2.95,2.74,.5,1.35,5.4,1.25,3,1235
35 | 1,13.51,1.8,2.65,19,110,2.35,2.53,.29,1.54,4.2,1.1,2.87,1095
36 | 1,13.48,1.81,2.41,20.5,100,2.7,2.98,.26,1.86,5.1,1.04,3.47,920
37 | 1,13.28,1.64,2.84,15.5,110,2.6,2.68,.34,1.36,4.6,1.09,2.78,880
38 | 1,13.05,1.65,2.55,18,98,2.45,2.43,.29,1.44,4.25,1.12,2.51,1105
39 | 1,13.07,1.5,2.1,15.5,98,2.4,2.64,.28,1.37,3.7,1.18,2.69,1020
40 | 1,14.22,3.99,2.51,13.2,128,3,3.04,.2,2.08,5.1,.89,3.53,760
41 | 1,13.56,1.71,2.31,16.2,117,3.15,3.29,.34,2.34,6.13,.95,3.38,795
42 | 1,13.41,3.84,2.12,18.8,90,2.45,2.68,.27,1.48,4.28,.91,3,1035
43 | 1,13.88,1.89,2.59,15,101,3.25,3.56,.17,1.7,5.43,.88,3.56,1095
44 | 1,13.24,3.98,2.29,17.5,103,2.64,2.63,.32,1.66,4.36,.82,3,680
45 | 1,13.05,1.77,2.1,17,107,3,3,.28,2.03,5.04,.88,3.35,885
46 | 1,14.21,4.04,2.44,18.9,111,2.85,2.65,.3,1.25,5.24,.87,3.33,1080
47 | 1,14.38,3.59,2.28,16,102,3.25,3.17,.27,2.19,4.9,1.04,3.44,1065
48 | 1,13.9,1.68,2.12,16,101,3.1,3.39,.21,2.14,6.1,.91,3.33,985
49 | 1,14.1,2.02,2.4,18.8,103,2.75,2.92,.32,2.38,6.2,1.07,2.75,1060
50 | 1,13.94,1.73,2.27,17.4,108,2.88,3.54,.32,2.08,8.90,1.12,3.1,1260
51 | 1,13.05,1.73,2.04,12.4,92,2.72,3.27,.17,2.91,7.2,1.12,2.91,1150
52 | 1,13.83,1.65,2.6,17.2,94,2.45,2.99,.22,2.29,5.6,1.24,3.37,1265
53 | 1,13.82,1.75,2.42,14,111,3.88,3.74,.32,1.87,7.05,1.01,3.26,1190
54 | 1,13.77,1.9,2.68,17.1,115,3,2.79,.39,1.68,6.3,1.13,2.93,1375
55 | 1,13.74,1.67,2.25,16.4,118,2.6,2.9,.21,1.62,5.85,.92,3.2,1060
56 | 1,13.56,1.73,2.46,20.5,116,2.96,2.78,.2,2.45,6.25,.98,3.03,1120
57 | 1,14.22,1.7,2.3,16.3,118,3.2,3,.26,2.03,6.38,.94,3.31,970
58 | 1,13.29,1.97,2.68,16.8,102,3,3.23,.31,1.66,6,1.07,2.84,1270
59 | 1,13.72,1.43,2.5,16.7,108,3.4,3.67,.19,2.04,6.8,.89,2.87,1285
60 | 2,12.37,.94,1.36,10.6,88,1.98,.57,.28,.42,1.95,1.05,1.82,520
61 | 2,12.33,1.1,2.28,16,101,2.05,1.09,.63,.41,3.27,1.25,1.67,680
62 | 2,12.64,1.36,2.02,16.8,100,2.02,1.41,.53,.62,5.75,.98,1.59,450
63 | 2,13.67,1.25,1.92,18,94,2.1,1.79,.32,.73,3.8,1.23,2.46,630
64 | 2,12.37,1.13,2.16,19,87,3.5,3.1,.19,1.87,4.45,1.22,2.87,420
65 | 2,12.17,1.45,2.53,19,104,1.89,1.75,.45,1.03,2.95,1.45,2.23,355
66 | 2,12.37,1.21,2.56,18.1,98,2.42,2.65,.37,2.08,4.6,1.19,2.3,678
67 | 2,13.11,1.01,1.7,15,78,2.98,3.18,.26,2.28,5.3,1.12,3.18,502
68 | 2,12.37,1.17,1.92,19.6,78,2.11,2,.27,1.04,4.68,1.12,3.48,510
69 | 2,13.34,.94,2.36,17,110,2.53,1.3,.55,.42,3.17,1.02,1.93,750
70 | 2,12.21,1.19,1.75,16.8,151,1.85,1.28,.14,2.5,2.85,1.28,3.07,718
71 | 2,12.29,1.61,2.21,20.4,103,1.1,1.02,.37,1.46,3.05,.906,1.82,870
72 | 2,13.86,1.51,2.67,25,86,2.95,2.86,.21,1.87,3.38,1.36,3.16,410
73 | 2,13.49,1.66,2.24,24,87,1.88,1.84,.27,1.03,3.74,.98,2.78,472
74 | 2,12.99,1.67,2.6,30,139,3.3,2.89,.21,1.96,3.35,1.31,3.5,985
75 | 2,11.96,1.09,2.3,21,101,3.38,2.14,.13,1.65,3.21,.99,3.13,886
76 | 2,11.66,1.88,1.92,16,97,1.61,1.57,.34,1.15,3.8,1.23,2.14,428
77 | 2,13.03,.9,1.71,16,86,1.95,2.03,.24,1.46,4.6,1.19,2.48,392
78 | 2,11.84,2.89,2.23,18,112,1.72,1.32,.43,.95,2.65,.96,2.52,500
79 | 2,12.33,.99,1.95,14.8,136,1.9,1.85,.35,2.76,3.4,1.06,2.31,750
80 | 2,12.7,3.87,2.4,23,101,2.83,2.55,.43,1.95,2.57,1.19,3.13,463
81 | 2,12,.92,2,19,86,2.42,2.26,.3,1.43,2.5,1.38,3.12,278
82 | 2,12.72,1.81,2.2,18.8,86,2.2,2.53,.26,1.77,3.9,1.16,3.14,714
83 | 2,12.08,1.13,2.51,24,78,2,1.58,.4,1.4,2.2,1.31,2.72,630
84 | 2,13.05,3.86,2.32,22.5,85,1.65,1.59,.61,1.62,4.8,.84,2.01,515
85 | 2,11.84,.89,2.58,18,94,2.2,2.21,.22,2.35,3.05,.79,3.08,520
86 | 2,12.67,.98,2.24,18,99,2.2,1.94,.3,1.46,2.62,1.23,3.16,450
87 | 2,12.16,1.61,2.31,22.8,90,1.78,1.69,.43,1.56,2.45,1.33,2.26,495
88 | 2,11.65,1.67,2.62,26,88,1.92,1.61,.4,1.34,2.6,1.36,3.21,562
89 | 2,11.64,2.06,2.46,21.6,84,1.95,1.69,.48,1.35,2.8,1,2.75,680
90 | 2,12.08,1.33,2.3,23.6,70,2.2,1.59,.42,1.38,1.74,1.07,3.21,625
91 | 2,12.08,1.83,2.32,18.5,81,1.6,1.5,.52,1.64,2.4,1.08,2.27,480
92 | 2,12,1.51,2.42,22,86,1.45,1.25,.5,1.63,3.6,1.05,2.65,450
93 | 2,12.69,1.53,2.26,20.7,80,1.38,1.46,.58,1.62,3.05,.96,2.06,495
94 | 2,12.29,2.83,2.22,18,88,2.45,2.25,.25,1.99,2.15,1.15,3.3,290
95 | 2,11.62,1.99,2.28,18,98,3.02,2.26,.17,1.35,3.25,1.16,2.96,345
96 | 2,12.47,1.52,2.2,19,162,2.5,2.27,.32,3.28,2.6,1.16,2.63,937
97 | 2,11.81,2.12,2.74,21.5,134,1.6,.99,.14,1.56,2.5,.95,2.26,625
98 | 2,12.29,1.41,1.98,16,85,2.55,2.5,.29,1.77,2.9,1.23,2.74,428
99 | 2,12.37,1.07,2.1,18.5,88,3.52,3.75,.24,1.95,4.5,1.04,2.77,660
100 | 2,12.29,3.17,2.21,18,88,2.85,2.99,.45,2.81,2.3,1.42,2.83,406
101 | 2,12.08,2.08,1.7,17.5,97,2.23,2.17,.26,1.4,3.3,1.27,2.96,710
102 | 2,12.6,1.34,1.9,18.5,88,1.45,1.36,.29,1.35,2.45,1.04,2.77,562
103 | 2,12.34,2.45,2.46,21,98,2.56,2.11,.34,1.31,2.8,.8,3.38,438
104 | 2,11.82,1.72,1.88,19.5,86,2.5,1.64,.37,1.42,2.06,.94,2.44,415
105 | 2,12.51,1.73,1.98,20.5,85,2.2,1.92,.32,1.48,2.94,1.04,3.57,672
106 | 2,12.42,2.55,2.27,22,90,1.68,1.84,.66,1.42,2.7,.86,3.3,315
107 | 2,12.25,1.73,2.12,19,80,1.65,2.03,.37,1.63,3.4,1,3.17,510
108 | 2,12.72,1.75,2.28,22.5,84,1.38,1.76,.48,1.63,3.3,.88,2.42,488
109 | 2,12.22,1.29,1.94,19,92,2.36,2.04,.39,2.08,2.7,.86,3.02,312
110 | 2,11.61,1.35,2.7,20,94,2.74,2.92,.29,2.49,2.65,.96,3.26,680
111 | 2,11.46,3.74,1.82,19.5,107,3.18,2.58,.24,3.58,2.9,.75,2.81,562
112 | 2,12.52,2.43,2.17,21,88,2.55,2.27,.26,1.22,2,.9,2.78,325
113 | 2,11.76,2.68,2.92,20,103,1.75,2.03,.6,1.05,3.8,1.23,2.5,607
114 | 2,11.41,.74,2.5,21,88,2.48,2.01,.42,1.44,3.08,1.1,2.31,434
115 | 2,12.08,1.39,2.5,22.5,84,2.56,2.29,.43,1.04,2.9,.93,3.19,385
116 | 2,11.03,1.51,2.2,21.5,85,2.46,2.17,.52,2.01,1.9,1.71,2.87,407
117 | 2,11.82,1.47,1.99,20.8,86,1.98,1.6,.3,1.53,1.95,.95,3.33,495
118 | 2,12.42,1.61,2.19,22.5,108,2,2.09,.34,1.61,2.06,1.06,2.96,345
119 | 2,12.77,3.43,1.98,16,80,1.63,1.25,.43,.83,3.4,.7,2.12,372
120 | 2,12,3.43,2,19,87,2,1.64,.37,1.87,1.28,.93,3.05,564
121 | 2,11.45,2.4,2.42,20,96,2.9,2.79,.32,1.83,3.25,.8,3.39,625
122 | 2,11.56,2.05,3.23,28.5,119,3.18,5.08,.47,1.87,6,.93,3.69,465
123 | 2,12.42,4.43,2.73,26.5,102,2.2,2.13,.43,1.71,2.08,.92,3.12,365
124 | 2,13.05,5.8,2.13,21.5,86,2.62,2.65,.3,2.01,2.6,.73,3.1,380
125 | 2,11.87,4.31,2.39,21,82,2.86,3.03,.21,2.91,2.8,.75,3.64,380
126 | 2,12.07,2.16,2.17,21,85,2.6,2.65,.37,1.35,2.76,.86,3.28,378
127 | 2,12.43,1.53,2.29,21.5,86,2.74,3.15,.39,1.77,3.94,.69,2.84,352
128 | 2,11.79,2.13,2.78,28.5,92,2.13,2.24,.58,1.76,3,.97,2.44,466
129 | 2,12.37,1.63,2.3,24.5,88,2.22,2.45,.4,1.9,2.12,.89,2.78,342
130 | 2,12.04,4.3,2.38,22,80,2.1,1.75,.42,1.35,2.6,.79,2.57,580
131 | 3,12.86,1.35,2.32,18,122,1.51,1.25,.21,.94,4.1,.76,1.29,630
132 | 3,12.88,2.99,2.4,20,104,1.3,1.22,.24,.83,5.4,.74,1.42,530
133 | 3,12.81,2.31,2.4,24,98,1.15,1.09,.27,.83,5.7,.66,1.36,560
134 | 3,12.7,3.55,2.36,21.5,106,1.7,1.2,.17,.84,5,.78,1.29,600
135 | 3,12.51,1.24,2.25,17.5,85,2,.58,.6,1.25,5.45,.75,1.51,650
136 | 3,12.6,2.46,2.2,18.5,94,1.62,.66,.63,.94,7.1,.73,1.58,695
137 | 3,12.25,4.72,2.54,21,89,1.38,.47,.53,.8,3.85,.75,1.27,720
138 | 3,12.53,5.51,2.64,25,96,1.79,.6,.63,1.1,5,.82,1.69,515
139 | 3,13.49,3.59,2.19,19.5,88,1.62,.48,.58,.88,5.7,.81,1.82,580
140 | 3,12.84,2.96,2.61,24,101,2.32,.6,.53,.81,4.92,.89,2.15,590
141 | 3,12.93,2.81,2.7,21,96,1.54,.5,.53,.75,4.6,.77,2.31,600
142 | 3,13.36,2.56,2.35,20,89,1.4,.5,.37,.64,5.6,.7,2.47,780
143 | 3,13.52,3.17,2.72,23.5,97,1.55,.52,.5,.55,4.35,.89,2.06,520
144 | 3,13.62,4.95,2.35,20,92,2,.8,.47,1.02,4.4,.91,2.05,550
145 | 3,12.25,3.88,2.2,18.5,112,1.38,.78,.29,1.14,8.21,.65,2,855
146 | 3,13.16,3.57,2.15,21,102,1.5,.55,.43,1.3,4,.6,1.68,830
147 | 3,13.88,5.04,2.23,20,80,.98,.34,.4,.68,4.9,.58,1.33,415
148 | 3,12.87,4.61,2.48,21.5,86,1.7,.65,.47,.86,7.65,.54,1.86,625
149 | 3,13.32,3.24,2.38,21.5,92,1.93,.76,.45,1.25,8.42,.55,1.62,650
150 | 3,13.08,3.9,2.36,21.5,113,1.41,1.39,.34,1.14,9.40,.57,1.33,550
151 | 3,13.5,3.12,2.62,24,123,1.4,1.57,.22,1.25,8.60,.59,1.3,500
152 | 3,12.79,2.67,2.48,22,112,1.48,1.36,.24,1.26,10.8,.48,1.47,480
153 | 3,13.11,1.9,2.75,25.5,116,2.2,1.28,.26,1.56,7.1,.61,1.33,425
154 | 3,13.23,3.3,2.28,18.5,98,1.8,.83,.61,1.87,10.52,.56,1.51,675
155 | 3,12.58,1.29,2.1,20,103,1.48,.58,.53,1.4,7.6,.58,1.55,640
156 | 3,13.17,5.19,2.32,22,93,1.74,.63,.61,1.55,7.9,.6,1.48,725
157 | 3,13.84,4.12,2.38,19.5,89,1.8,.83,.48,1.56,9.01,.57,1.64,480
158 | 3,12.45,3.03,2.64,27,97,1.9,.58,.63,1.14,7.5,.67,1.73,880
159 | 3,14.34,1.68,2.7,25,98,2.8,1.31,.53,2.7,13,.57,1.96,660
160 | 3,13.48,1.67,2.64,22.5,89,2.6,1.1,.52,2.29,11.75,.57,1.78,620
161 | 3,12.36,3.83,2.38,21,88,2.3,.92,.5,1.04,7.65,.56,1.58,520
162 | 3,13.69,3.26,2.54,20,107,1.83,.56,.5,.8,5.88,.96,1.82,680
163 | 3,12.85,3.27,2.58,22,106,1.65,.6,.6,.96,5.58,.87,2.11,570
164 | 3,12.96,3.45,2.35,18.5,106,1.39,.7,.4,.94,5.28,.68,1.75,675
165 | 3,13.78,2.76,2.3,22,90,1.35,.68,.41,1.03,9.58,.7,1.68,615
166 | 3,13.73,4.36,2.26,22.5,88,1.28,.47,.52,1.15,6.62,.78,1.75,520
167 | 3,13.45,3.7,2.6,23,111,1.7,.92,.43,1.46,10.68,.85,1.56,695
168 | 3,12.82,3.37,2.3,19.5,88,1.48,.66,.4,.97,10.26,.72,1.75,685
169 | 3,13.58,2.58,2.69,24.5,105,1.55,.84,.39,1.54,8.66,.74,1.8,750
170 | 3,13.4,4.6,2.86,25,112,1.98,.96,.27,1.11,8.5,.67,1.92,630
171 | 3,12.2,3.03,2.32,19,96,1.25,.49,.4,.73,5.5,.66,1.83,510
172 | 3,12.77,2.39,2.28,19.5,86,1.39,.51,.48,.64,9.899999,.57,1.63,470
173 | 3,14.16,2.51,2.48,20,91,1.68,.7,.44,1.24,9.7,.62,1.71,660
174 | 3,13.71,5.65,2.45,20.5,95,1.68,.61,.52,1.06,7.7,.64,1.74,740
175 | 3,13.4,3.91,2.48,23,102,1.8,.75,.43,1.41,7.3,.7,1.56,750
176 | 3,13.27,4.28,2.26,20,120,1.59,.69,.43,1.35,10.2,.59,1.56,835
177 | 3,13.17,2.59,2.37,20,120,1.65,.68,.53,1.46,9.3,.6,1.62,840
178 | 3,14.13,4.1,2.74,24.5,96,2.05,.76,.56,1.35,9.2,.61,1.6,560
179 |
--------------------------------------------------------------------------------
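`wine.data` follows the UCI convention visible above: comma-separated, no header row, with the class label (1-3) in the first column followed by 13 numeric features. A minimal loading sketch under that assumption:

    import pandas as pd

    df = pd.read_csv('hw_03/data/wine.data', header=None)
    y = df.iloc[:, 0].values   # class labels 1, 2, 3
    X = df.iloc[:, 1:].values  # 13 continuous features
    print(X.shape, y.shape)    # (178, 13) (178,)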
/report-template/ieee.bst:
--------------------------------------------------------------------------------
1 |
2 | % ---------------------------------------------------------------
3 | %
4 | % ieee.bst,v 1.0 2002/04/16
5 | %
6 | % by Glenn Paulley (paulley@acm.org)
7 | %
8 | % Modified from latex8.bst 1995/09/15 15:13:49 ienne Exp $
9 | %
10 | % by Paolo.Ienne@di.epfl.ch
11 | %
12 | %
13 | % ---------------------------------------------------------------
14 | %
15 | % no guarantee is given that the format corresponds perfectly to
16 | % IEEE 8.5" x 11" Proceedings, but most features should be ok.
17 | %
18 | % ---------------------------------------------------------------
19 | %
20 | % `ieee' from BibTeX standard bibliography style `abbrv'
21 | % version 0.99a for BibTeX versions 0.99a or later, LaTeX version 2.09.
22 | % Copyright (C) 1985, all rights reserved.
23 | % Copying of this file is authorized only if either
24 | % (1) you make absolutely no changes to your copy, including name, or
25 | % (2) if you do make changes, you name it something other than
26 | % btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst.
27 | % This restriction helps ensure that all standard styles are identical.
28 | % The file btxbst.doc has the documentation for this style.
29 |
30 | ENTRY
31 | { address
32 | author
33 | booktitle
34 | chapter
35 | edition
36 | editor
37 | howpublished
38 | institution
39 | journal
40 | key
41 | month
42 | note
43 | number
44 | organization
45 | pages
46 | publisher
47 | school
48 | series
49 | title
50 | type
51 | volume
52 | year
53 | }
54 | {}
55 | { label }
56 |
57 | INTEGERS { output.state before.all mid.sentence after.sentence after.block }
58 |
59 | FUNCTION {init.state.consts}
60 | { #0 'before.all :=
61 | #1 'mid.sentence :=
62 | #2 'after.sentence :=
63 | #3 'after.block :=
64 | }
65 |
66 | STRINGS { s t }
67 |
68 | FUNCTION {output.nonnull}
69 | { 's :=
70 | output.state mid.sentence =
71 | { ", " * write$ }
72 | { output.state after.block =
73 | { add.period$ write$
74 | newline$
75 | "\newblock " write$
76 | }
77 | { output.state before.all =
78 | 'write$
79 | { add.period$ " " * write$ }
80 | if$
81 | }
82 | if$
83 | mid.sentence 'output.state :=
84 | }
85 | if$
86 | s
87 | }
88 |
89 | FUNCTION {output}
90 | { duplicate$ empty$
91 | 'pop$
92 | 'output.nonnull
93 | if$
94 | }
95 |
96 | FUNCTION {output.check}
97 | { 't :=
98 | duplicate$ empty$
99 | { pop$ "empty " t * " in " * cite$ * warning$ }
100 | 'output.nonnull
101 | if$
102 | }
103 |
104 | FUNCTION {output.bibitem}
105 | { newline$
106 | "\bibitem{" write$
107 | cite$ write$
108 | "}" write$
109 | newline$
110 | ""
111 | before.all 'output.state :=
112 | }
113 |
114 | FUNCTION {fin.entry}
115 | { add.period$
116 | write$
117 | newline$
118 | }
119 |
120 | FUNCTION {new.block}
121 | { output.state before.all =
122 | 'skip$
123 | { after.block 'output.state := }
124 | if$
125 | }
126 |
127 | FUNCTION {new.sentence}
128 | { output.state after.block =
129 | 'skip$
130 | { output.state before.all =
131 | 'skip$
132 | { after.sentence 'output.state := }
133 | if$
134 | }
135 | if$
136 | }
137 |
138 | FUNCTION {not}
139 | { { #0 }
140 | { #1 }
141 | if$
142 | }
143 |
144 | FUNCTION {and}
145 | { 'skip$
146 | { pop$ #0 }
147 | if$
148 | }
149 |
150 | FUNCTION {or}
151 | { { pop$ #1 }
152 | 'skip$
153 | if$
154 | }
155 |
156 | FUNCTION {new.block.checka}
157 | { empty$
158 | 'skip$
159 | 'new.block
160 | if$
161 | }
162 |
163 | FUNCTION {new.block.checkb}
164 | { empty$
165 | swap$ empty$
166 | and
167 | 'skip$
168 | 'new.block
169 | if$
170 | }
171 |
172 | FUNCTION {new.sentence.checka}
173 | { empty$
174 | 'skip$
175 | 'new.sentence
176 | if$
177 | }
178 |
179 | FUNCTION {new.sentence.checkb}
180 | { empty$
181 | swap$ empty$
182 | and
183 | 'skip$
184 | 'new.sentence
185 | if$
186 | }
187 |
188 | FUNCTION {field.or.null}
189 | { duplicate$ empty$
190 | { pop$ "" }
191 | 'skip$
192 | if$
193 | }
194 |
195 | FUNCTION {emphasize}
196 | { duplicate$ empty$
197 | { pop$ "" }
198 | { "{\em " swap$ * "}" * }
199 | if$
200 | }
201 |
202 | INTEGERS { nameptr namesleft numnames }
203 |
204 | FUNCTION {format.names}
205 | { 's :=
206 | #1 'nameptr :=
207 | s num.names$ 'numnames :=
208 | numnames 'namesleft :=
209 | { namesleft #0 > }
210 | { s nameptr "{f.~}{vv~}{ll}{, jj}" format.name$ 't :=
211 | nameptr #1 >
212 | { namesleft #1 >
213 | { ", " * t * }
214 | { numnames #2 >
215 | { "," * }
216 | 'skip$
217 | if$
218 | t "others" =
219 | { " et~al." * }
220 | { " and " * t * }
221 | if$
222 | }
223 | if$
224 | }
225 | 't
226 | if$
227 | nameptr #1 + 'nameptr :=
228 |
229 | namesleft #1 - 'namesleft :=
230 | }
231 | while$
232 | }
233 |
234 | FUNCTION {format.authors}
235 | { author empty$
236 | { "" }
237 | { author format.names }
238 | if$
239 | }
240 |
241 | FUNCTION {format.editors}
242 | { editor empty$
243 | { "" }
244 | { editor format.names
245 | editor num.names$ #1 >
246 | { ", editors" * }
247 | { ", editor" * }
248 | if$
249 | }
250 | if$
251 | }
252 |
253 | FUNCTION {format.title}
254 | { title empty$
255 | { "" }
256 | { title "t" change.case$ }
257 | if$
258 | }
259 |
260 | FUNCTION {n.dashify}
261 | { 't :=
262 | ""
263 | { t empty$ not }
264 | { t #1 #1 substring$ "-" =
265 | { t #1 #2 substring$ "--" = not
266 | { "--" *
267 | t #2 global.max$ substring$ 't :=
268 | }
269 | { { t #1 #1 substring$ "-" = }
270 | { "-" *
271 | t #2 global.max$ substring$ 't :=
272 | }
273 | while$
274 | }
275 | if$
276 | }
277 | { t #1 #1 substring$ *
278 | t #2 global.max$ substring$ 't :=
279 | }
280 | if$
281 | }
282 | while$
283 | }
284 |
285 | FUNCTION {format.date}
286 | { year empty$
287 | { month empty$
288 | { "" }
289 | { "there's a month but no year in " cite$ * warning$
290 | month
291 | }
292 | if$
293 | }
294 | { month empty$
295 | 'year
296 | { month " " * year * }
297 | if$
298 | }
299 | if$
300 | }
301 |
302 | FUNCTION {format.btitle}
303 | { title emphasize
304 | }
305 |
306 | FUNCTION {tie.or.space.connect}
307 | { duplicate$ text.length$ #3 <
308 | { "~" }
309 | { " " }
310 | if$
311 | swap$ * *
312 | }
313 |
314 | FUNCTION {either.or.check}
315 | { empty$
316 | 'pop$
317 | { "can't use both " swap$ * " fields in " * cite$ * warning$ }
318 | if$
319 | }
320 |
321 | FUNCTION {format.bvolume}
322 | { volume empty$
323 | { "" }
324 | { "volume" volume tie.or.space.connect
325 | series empty$
326 | 'skip$
327 | { " of " * series emphasize * }
328 | if$
329 | "volume and number" number either.or.check
330 | }
331 | if$
332 | }
333 |
334 | FUNCTION {format.number.series}
335 | { volume empty$
336 | { number empty$
337 | { series field.or.null }
338 | { output.state mid.sentence =
339 | { "number" }
340 | { "Number" }
341 | if$
342 | number tie.or.space.connect
343 | series empty$
344 | { "there's a number but no series in " cite$ * warning$ }
345 | { " in " * series * }
346 | if$
347 | }
348 | if$
349 | }
350 | { "" }
351 | if$
352 | }
353 |
354 | FUNCTION {format.edition}
355 | { edition empty$
356 | { "" }
357 | { output.state mid.sentence =
358 | { edition "l" change.case$ " edition" * }
359 | { edition "t" change.case$ " edition" * }
360 | if$
361 | }
362 | if$
363 | }
364 |
365 | INTEGERS { multiresult }
366 |
367 | FUNCTION {multi.page.check}
368 | { 't :=
369 | #0 'multiresult :=
370 | { multiresult not
371 | t empty$ not
372 | and
373 | }
374 | { t #1 #1 substring$
375 | duplicate$ "-" =
376 | swap$ duplicate$ "," =
377 | swap$ "+" =
378 | or or
379 | { #1 'multiresult := }
380 | { t #2 global.max$ substring$ 't := }
381 | if$
382 | }
383 | while$
384 | multiresult
385 | }
386 |
387 | FUNCTION {format.pages}
388 | { pages empty$
389 | { "" }
390 | { pages multi.page.check
391 | { "pages" pages n.dashify tie.or.space.connect }
392 | { "page" pages tie.or.space.connect }
393 | if$
394 | }
395 | if$
396 | }
397 |
398 | FUNCTION {format.vol.num.pages}
399 | { volume field.or.null
400 | number empty$
401 | 'skip$
402 | { "(" number * ")" * *
403 | volume empty$
404 | { "there's a number but no volume in " cite$ * warning$ }
405 | 'skip$
406 | if$
407 | }
408 | if$
409 | pages empty$
410 | 'skip$
411 | { duplicate$ empty$
412 | { pop$ format.pages }
413 | { ":" * pages n.dashify * }
414 | if$
415 | }
416 | if$
417 | }
418 |
419 | FUNCTION {format.chapter.pages}
420 | { chapter empty$
421 | 'format.pages
422 | { type empty$
423 | { "chapter" }
424 | { type "l" change.case$ }
425 | if$
426 | chapter tie.or.space.connect
427 | pages empty$
428 | 'skip$
429 | { ", " * format.pages * }
430 | if$
431 | }
432 | if$
433 | }
434 |
435 | FUNCTION {format.in.ed.booktitle}
436 | { booktitle empty$
437 | { "" }
438 | { editor empty$
439 | { "In " booktitle emphasize * }
440 | { "In " format.editors * ", " * booktitle emphasize * }
441 | if$
442 | }
443 | if$
444 | }
445 |
446 | FUNCTION {empty.misc.check}
447 |
448 | { author empty$ title empty$ howpublished empty$
449 | month empty$ year empty$ note empty$
450 | and and and and and
451 | key empty$ not and
452 | { "all relevant fields are empty in " cite$ * warning$ }
453 | 'skip$
454 | if$
455 | }
456 |
457 | FUNCTION {format.thesis.type}
458 | { type empty$
459 | 'skip$
460 | { pop$
461 | type "t" change.case$
462 | }
463 | if$
464 | }
465 |
466 | FUNCTION {format.tr.number}
467 | { type empty$
468 | { "Technical Report" }
469 | 'type
470 | if$
471 | number empty$
472 | { "t" change.case$ }
473 | { number tie.or.space.connect }
474 | if$
475 | }
476 |
477 | FUNCTION {format.article.crossref}
478 | { key empty$
479 | { journal empty$
480 | { "need key or journal for " cite$ * " to crossref " * crossref *
481 | warning$
482 | ""
483 | }
484 | { "In {\em " journal * "\/}" * }
485 | if$
486 | }
487 | { "In " key * }
488 | if$
489 | " \cite{" * crossref * "}" *
490 | }
491 |
492 | FUNCTION {format.crossref.editor}
493 | { editor #1 "{vv~}{ll}" format.name$
494 | editor num.names$ duplicate$
495 | #2 >
496 | { pop$ " et~al." * }
497 | { #2 <
498 | 'skip$
499 | { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
500 | { " et~al." * }
501 | { " and " * editor #2 "{vv~}{ll}" format.name$ * }
502 | if$
503 | }
504 | if$
505 | }
506 | if$
507 | }
508 |
509 | FUNCTION {format.book.crossref}
510 | { volume empty$
511 | { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
512 | "In "
513 | }
514 | { "Volume" volume tie.or.space.connect
515 | " of " *
516 | }
517 | if$
518 | editor empty$
519 | editor field.or.null author field.or.null =
520 | or
521 | { key empty$
522 | { series empty$
523 | { "need editor, key, or series for " cite$ * " to crossref " *
524 | crossref * warning$
525 | "" *
526 | }
527 | { "{\em " * series * "\/}" * }
528 | if$
529 | }
530 | { key * }
531 | if$
532 | }
533 | { format.crossref.editor * }
534 | if$
535 | " \cite{" * crossref * "}" *
536 | }
537 |
538 | FUNCTION {format.incoll.inproc.crossref}
539 | { editor empty$
540 | editor field.or.null author field.or.null =
541 | or
542 | { key empty$
543 | { booktitle empty$
544 | { "need editor, key, or booktitle for " cite$ * " to crossref " *
545 | crossref * warning$
546 | ""
547 | }
548 | { "In {\em " booktitle * "\/}" * }
549 | if$
550 | }
551 | { "In " key * }
552 | if$
553 | }
554 | { "In " format.crossref.editor * }
555 | if$
556 | " \cite{" * crossref * "}" *
557 | }
558 |
559 | FUNCTION {article}
560 | { output.bibitem
561 | format.authors "author" output.check
562 | new.block
563 | format.title "title" output.check
564 | new.block
565 | crossref missing$
566 | { journal emphasize "journal" output.check
567 | format.vol.num.pages output
568 | format.date "year" output.check
569 | }
570 | { format.article.crossref output.nonnull
571 | format.pages output
572 | }
573 | if$
574 | new.block
575 | note output
576 | fin.entry
577 | }
578 |
579 | FUNCTION {book}
580 | { output.bibitem
581 | author empty$
582 | { format.editors "author and editor" output.check }
583 | { format.authors output.nonnull
584 | crossref missing$
585 | { "author and editor" editor either.or.check }
586 | 'skip$
587 | if$
588 | }
589 | if$
590 | new.block
591 | format.btitle "title" output.check
592 | crossref missing$
593 | { format.bvolume output
594 | new.block
595 | format.number.series output
596 | new.sentence
597 | publisher "publisher" output.check
598 | address output
599 | }
600 | { new.block
601 | format.book.crossref output.nonnull
602 | }
603 | if$
604 | format.edition output
605 | format.date "year" output.check
606 | new.block
607 | note output
608 | fin.entry
609 | }
610 |
611 | FUNCTION {booklet}
612 | { output.bibitem
613 | format.authors output
614 | new.block
615 | format.title "title" output.check
616 | howpublished address new.block.checkb
617 | howpublished output
618 | address output
619 | format.date output
620 | new.block
621 | note output
622 | fin.entry
623 | }
624 |
625 | FUNCTION {inbook}
626 | { output.bibitem
627 | author empty$
628 | { format.editors "author and editor" output.check }
629 | { format.authors output.nonnull
630 |
631 | crossref missing$
632 | { "author and editor" editor either.or.check }
633 | 'skip$
634 | if$
635 | }
636 | if$
637 | new.block
638 | format.btitle "title" output.check
639 | crossref missing$
640 | { format.bvolume output
641 | format.chapter.pages "chapter and pages" output.check
642 | new.block
643 | format.number.series output
644 | new.sentence
645 | publisher "publisher" output.check
646 | address output
647 | }
648 | { format.chapter.pages "chapter and pages" output.check
649 | new.block
650 | format.book.crossref output.nonnull
651 | }
652 | if$
653 | format.edition output
654 | format.date "year" output.check
655 | new.block
656 | note output
657 | fin.entry
658 | }
659 |
660 | FUNCTION {incollection}
661 | { output.bibitem
662 | format.authors "author" output.check
663 | new.block
664 | format.title "title" output.check
665 | new.block
666 | crossref missing$
667 | { format.in.ed.booktitle "booktitle" output.check
668 | format.bvolume output
669 | format.number.series output
670 | format.chapter.pages output
671 | new.sentence
672 | publisher "publisher" output.check
673 | address output
674 | format.edition output
675 | format.date "year" output.check
676 | }
677 | { format.incoll.inproc.crossref output.nonnull
678 | format.chapter.pages output
679 | }
680 | if$
681 | new.block
682 | note output
683 | fin.entry
684 | }
685 |
686 | FUNCTION {inproceedings}
687 | { output.bibitem
688 | format.authors "author" output.check
689 | new.block
690 | format.title "title" output.check
691 | new.block
692 | crossref missing$
693 | { format.in.ed.booktitle "booktitle" output.check
694 | format.bvolume output
695 | format.number.series output
696 | format.pages output
697 | address empty$
698 | { organization publisher new.sentence.checkb
699 | organization output
700 | publisher output
701 | format.date "year" output.check
702 | }
703 | { address output.nonnull
704 | format.date "year" output.check
705 | new.sentence
706 | organization output
707 | publisher output
708 | }
709 | if$
710 | }
711 | { format.incoll.inproc.crossref output.nonnull
712 | format.pages output
713 | }
714 | if$
715 | new.block
716 | note output
717 | fin.entry
718 | }
719 |
720 | FUNCTION {conference} { inproceedings }
721 |
722 | FUNCTION {manual}
723 | { output.bibitem
724 | author empty$
725 | { organization empty$
726 | 'skip$
727 | { organization output.nonnull
728 | address output
729 | }
730 | if$
731 | }
732 | { format.authors output.nonnull }
733 | if$
734 | new.block
735 | format.btitle "title" output.check
736 | author empty$
737 | { organization empty$
738 | { address new.block.checka
739 | address output
740 | }
741 | 'skip$
742 | if$
743 | }
744 | { organization address new.block.checkb
745 | organization output
746 | address output
747 | }
748 | if$
749 | format.edition output
750 | format.date output
751 | new.block
752 | note output
753 | fin.entry
754 | }
755 |
756 | FUNCTION {mastersthesis}
757 | { output.bibitem
758 | format.authors "author" output.check
759 | new.block
760 | format.title "title" output.check
761 | new.block
762 | "Master's thesis" format.thesis.type output.nonnull
763 | school "school" output.check
764 | address output
765 | format.date "year" output.check
766 | new.block
767 | note output
768 | fin.entry
769 | }
770 |
771 | FUNCTION {misc}
772 | { output.bibitem
773 | format.authors output
774 | title howpublished new.block.checkb
775 | format.title output
776 | howpublished new.block.checka
777 | howpublished output
778 | format.date output
779 | new.block
780 | note output
781 | fin.entry
782 | empty.misc.check
783 | }
784 |
785 | FUNCTION {phdthesis}
786 | { output.bibitem
787 | format.authors "author" output.check
788 | new.block
789 | format.btitle "title" output.check
790 | new.block
791 | "PhD thesis" format.thesis.type output.nonnull
792 | school "school" output.check
793 | address output
794 | format.date "year" output.check
795 | new.block
796 | note output
797 | fin.entry
798 | }
799 |
800 | FUNCTION {proceedings}
801 | { output.bibitem
802 | editor empty$
803 | { organization output }
804 | { format.editors output.nonnull }
805 |
806 | if$
807 | new.block
808 | format.btitle "title" output.check
809 | format.bvolume output
810 | format.number.series output
811 | address empty$
812 | { editor empty$
813 | { publisher new.sentence.checka }
814 | { organization publisher new.sentence.checkb
815 | organization output
816 | }
817 | if$
818 | publisher output
819 | format.date "year" output.check
820 | }
821 | { address output.nonnull
822 | format.date "year" output.check
823 | new.sentence
824 | editor empty$
825 | 'skip$
826 | { organization output }
827 | if$
828 | publisher output
829 | }
830 | if$
831 | new.block
832 | note output
833 | fin.entry
834 | }
835 |
836 | FUNCTION {techreport}
837 | { output.bibitem
838 | format.authors "author" output.check
839 | new.block
840 | format.title "title" output.check
841 | new.block
842 | format.tr.number output.nonnull
843 | institution "institution" output.check
844 | address output
845 | format.date "year" output.check
846 | new.block
847 | note output
848 | fin.entry
849 | }
850 |
851 | FUNCTION {unpublished}
852 | { output.bibitem
853 | format.authors "author" output.check
854 | new.block
855 | format.title "title" output.check
856 | new.block
857 | note "note" output.check
858 | format.date output
859 | fin.entry
860 | }
861 |
862 | FUNCTION {default.type} { misc }
863 |
864 | MACRO {jan} {"Jan."}
865 |
866 | MACRO {feb} {"Feb."}
867 |
868 | MACRO {mar} {"Mar."}
869 |
870 | MACRO {apr} {"Apr."}
871 |
872 | MACRO {may} {"May"}
873 |
874 | MACRO {jun} {"June"}
875 |
876 | MACRO {jul} {"July"}
877 |
878 | MACRO {aug} {"Aug."}
879 |
880 | MACRO {sep} {"Sept."}
881 |
882 | MACRO {oct} {"Oct."}
883 |
884 | MACRO {nov} {"Nov."}
885 |
886 | MACRO {dec} {"Dec."}
887 |
888 | MACRO {acmcs} {"ACM Comput. Surv."}
889 |
890 | MACRO {acta} {"Acta Inf."}
891 |
892 | MACRO {cacm} {"Commun. ACM"}
893 |
894 | MACRO {ibmjrd} {"IBM J. Res. Dev."}
895 |
896 | MACRO {ibmsj} {"IBM Syst.~J."}
897 |
898 | MACRO {ieeese} {"IEEE Trans. Softw. Eng."}
899 |
900 | MACRO {ieeetc} {"IEEE Trans. Comput."}
901 |
902 | MACRO {ieeetcad}
903 | {"IEEE Trans. Comput.-Aided Design Integrated Circuits"}
904 |
905 | MACRO {ipl} {"Inf. Process. Lett."}
906 |
907 | MACRO {jacm} {"J.~ACM"}
908 |
909 | MACRO {jcss} {"J.~Comput. Syst. Sci."}
910 |
911 | MACRO {scp} {"Sci. Comput. Programming"}
912 |
913 | MACRO {sicomp} {"SIAM J. Comput."}
914 |
915 | MACRO {tocs} {"ACM Trans. Comput. Syst."}
916 |
917 | MACRO {tods} {"ACM Trans. Database Syst."}
918 |
919 | MACRO {tog} {"ACM Trans. Gr."}
920 |
921 | MACRO {toms} {"ACM Trans. Math. Softw."}
922 |
923 | MACRO {toois} {"ACM Trans. Office Inf. Syst."}
924 |
925 | MACRO {toplas} {"ACM Trans. Prog. Lang. Syst."}
926 |
927 | MACRO {tcs} {"Theoretical Comput. Sci."}
928 |
929 | READ
930 |
931 | FUNCTION {sortify}
932 | { purify$
933 | "l" change.case$
934 | }
935 |
936 | INTEGERS { len }
937 |
938 | FUNCTION {chop.word}
939 | { 's :=
940 | 'len :=
941 | s #1 len substring$ =
942 | { s len #1 + global.max$ substring$ }
943 | 's
944 | if$
945 | }
946 |
947 | FUNCTION {sort.format.names}
948 | { 's :=
949 | #1 'nameptr :=
950 | ""
951 | s num.names$ 'numnames :=
952 | numnames 'namesleft :=
953 | { namesleft #0 > }
954 | { nameptr #1 >
955 | { " " * }
956 | 'skip$
957 | if$
958 | s nameptr "{vv{ } }{ll{ }}{ f{ }}{ jj{ }}" format.name$ 't :=
959 | nameptr numnames = t "others" = and
960 | { "et al" * }
961 | { t sortify * }
962 | if$
963 | nameptr #1 + 'nameptr :=
964 | namesleft #1 - 'namesleft :=
965 | }
966 | while$
967 | }
968 |
969 | FUNCTION {sort.format.title}
970 | { 't :=
971 | "A " #2
972 | "An " #3
973 | "The " #4 t chop.word
974 | chop.word
975 | chop.word
976 | sortify
977 | #1 global.max$ substring$
978 | }
979 |
980 | FUNCTION {author.sort}
981 | { author empty$
982 | { key empty$
983 | { "to sort, need author or key in " cite$ * warning$
984 | ""
985 | }
986 | { key sortify }
987 | if$
988 | }
989 | { author sort.format.names }
990 | if$
991 | }
992 |
993 | FUNCTION {author.editor.sort}
994 | { author empty$
995 | { editor empty$
996 | { key empty$
997 | { "to sort, need author, editor, or key in " cite$ * warning$
998 | ""
999 | }
1000 | { key sortify }
1001 | if$
1002 | }
1003 | { editor sort.format.names }
1004 | if$
1005 | }
1006 | { author sort.format.names }
1007 | if$
1008 | }
1009 |
1010 | FUNCTION {author.organization.sort}
1011 | { author empty$
1012 |
1013 | { organization empty$
1014 | { key empty$
1015 | { "to sort, need author, organization, or key in " cite$ * warning$
1016 | ""
1017 | }
1018 | { key sortify }
1019 | if$
1020 | }
1021 | { "The " #4 organization chop.word sortify }
1022 | if$
1023 | }
1024 | { author sort.format.names }
1025 | if$
1026 | }
1027 |
1028 | FUNCTION {editor.organization.sort}
1029 | { editor empty$
1030 | { organization empty$
1031 | { key empty$
1032 | { "to sort, need editor, organization, or key in " cite$ * warning$
1033 | ""
1034 | }
1035 | { key sortify }
1036 | if$
1037 | }
1038 | { "The " #4 organization chop.word sortify }
1039 | if$
1040 | }
1041 | { editor sort.format.names }
1042 | if$
1043 | }
1044 |
1045 | FUNCTION {presort}
1046 | { type$ "book" =
1047 | type$ "inbook" =
1048 | or
1049 | 'author.editor.sort
1050 | { type$ "proceedings" =
1051 | 'editor.organization.sort
1052 | { type$ "manual" =
1053 | 'author.organization.sort
1054 | 'author.sort
1055 | if$
1056 | }
1057 | if$
1058 | }
1059 | if$
1060 | " "
1061 | *
1062 | year field.or.null sortify
1063 | *
1064 | " "
1065 | *
1066 | title field.or.null
1067 | sort.format.title
1068 | *
1069 | #1 entry.max$ substring$
1070 | 'sort.key$ :=
1071 | }
1072 |
1073 | ITERATE {presort}
1074 |
1075 | SORT
1076 |
1077 | STRINGS { longest.label }
1078 |
1079 | INTEGERS { number.label longest.label.width }
1080 |
1081 | FUNCTION {initialize.longest.label}
1082 | { "" 'longest.label :=
1083 | #1 'number.label :=
1084 | #0 'longest.label.width :=
1085 | }
1086 |
1087 | FUNCTION {longest.label.pass}
1088 | { number.label int.to.str$ 'label :=
1089 | number.label #1 + 'number.label :=
1090 | label width$ longest.label.width >
1091 | { label 'longest.label :=
1092 | label width$ 'longest.label.width :=
1093 | }
1094 | 'skip$
1095 | if$
1096 | }
1097 |
1098 | EXECUTE {initialize.longest.label}
1099 |
1100 | ITERATE {longest.label.pass}
1101 |
1102 | FUNCTION {begin.bib}
1103 | { preamble$ empty$
1104 | 'skip$
1105 | { preamble$ write$ newline$ }
1106 | if$
1107 | "\begin{thebibliography}{" longest.label * "}" *
1108 | "\itemsep=-1pt" * % Compact the entries a little.
1109 | write$ newline$
1110 | }
1111 |
1112 | EXECUTE {begin.bib}
1113 |
1114 | EXECUTE {init.state.consts}
1115 |
1116 | ITERATE {call.type$}
1117 |
1118 | FUNCTION {end.bib}
1119 | { newline$
1120 | "\end{thebibliography}" write$ newline$
1121 | }
1122 |
1123 | EXECUTE {end.bib}
1124 |
1125 | % end of file ieee.bst
1126 | % ---------------------------------------------------------------
1127 |
1128 |
1129 |
1130 |
--------------------------------------------------------------------------------
/hw_02/hw02.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Problem Set 2"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "STAT 479: Machine Learning (Fall 2018) \n",
15 | "Instructor: Sebastian Raschka (sraschka@wisc.edu) \n",
16 | "Course website: http://pages.stat.wisc.edu/~sraschka/teaching/stat479-fs2018/\n",
17 | "\n",
18 | "**Due**: Nov 08, before class (before 8:00 am).\n",
19 | "\n",
20 | "**How to submit**\n",
21 | "\n",
22 | "As mentioned in the lecture, you need to submit the `.ipynb` file with your answers plus an `.html` file, which will serve as a backup for us in case the `.ipynb` file cannot be opened on my or the TA's computer. In addition, you may also export the notebook as PDF and upload it as well.\n",
23 | "\n",
24 | "This time, we will be using the Canvas platform, so you need to submit your homework there. You should be able to resubmit the homework as many times as you like before the due date."
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "**You are highly encouraged to use Piazza to ask questions and help each other while working on the homework. However, do not share any solutions with other students as this would be a violation of the Academic Integrity guidelines (for more info, see http://pages.stat.wisc.edu/~sraschka/teaching/stat479-fs2018/#other-important-course-information)**\n",
32 | "\n",
33 | "\n",
34 | "For example, a resonable question & answer would be:\n",
35 | "\n",
36 | "- Q: When I am asked to implement the code for majority voting, my code produces an array that has the wrong dimensions (I get the following dimensions ...). \n",
37 | "- A: Hm, I suspect you compute the `argmax` over rows, not columns. Maybe check that you specify the correct dimension for the `axis` parameter in the `argmax` function.\n",
38 | "\n",
39 | "Not ok would be:\n",
40 | "\n",
41 | "- Q: Here is my code and solution for exercise XXX. Is this correct? "
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 1,
47 | "metadata": {},
48 | "outputs": [
49 | {
50 | "name": "stdout",
51 | "output_type": "stream",
52 | "text": [
53 | " \n",
54 | "last updated: 2018-10-21 \n",
55 | "\n",
56 | "CPython 3.6.6\n",
57 | "IPython 6.5.0\n",
58 | "\n",
59 | "numpy 1.15.1\n",
60 | "scipy 1.1.0\n",
61 | "matplotlib 2.2.3\n",
62 | "sklearn 0.20.0\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "%load_ext watermark\n",
68 | "%watermark -d -u -a '' -v -p numpy,scipy,matplotlib,sklearn"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 2,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "import numpy as np"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "
\n",
85 | "
\n",
86 | "
\n",
87 | "
\n",
88 | "
\n",
89 | "
"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "## 1) Implementing an ID3 Decision Tree"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "In this first part of the homework, you are going to implement the ID3 decision tree algorithm we discussed in class. This decision tree algorithm will support multi-category splits, but just like the original ID3 algorithm, it will only support categorical feature values for simplicity. Here, categorical feature values will be represented by integer numbers. \n",
104 | "\n",
105 | "\n",
106 | "Implementing machine learning algorithms from scratch is a very important skill, and this homework will provide exercises that will help you to develop this skill. Even if you are interested in the more theoretical aspects of machine learning, being comfortable with implementing and trying out algorithms is vital for doing research, since even the more theoretical papers in machine learning are usually accompanied by experiments or simulations to a) verify results and b) to compare algorithms with the state-of-the art.\n",
107 | "\n",
108 | "Since many students are not expert Python programmers (yet), I will provide partial solutions to the homework tasks such that you have a framework or guide to implement the solutions. Areas that you need to fill in will be marked with comments (e.g., `# your code`). For these partial solutions, I first implemented the functions myself, and then I deleted parts you need to fill in by these comments. However, note that you can, of course, use more or fewer lines of code than I did. In other words, all that matter is that the function you write can create the same outputs as the ones I provide. How many lines of code you need to implement that function, and how efficient it is, does not matter here. The expected outputs for the respective functions will be provided so that you can double-check your solutions. "
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "### 1.1) Splitting a node (10 pts)"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "First, we are going to implement a function that splits a dataset along a feature axis into sub-datasets. Since we are going to implement a decision tree that only supports categorical features (like ID3) for simplicity, you do not need to account for continuous feature variables. In other words, the splitting function only needs to support integer NumPy arrays. \n",
123 | "\n",
124 | "To provide an intuitive example, suppose you are given the following NumPy array with four feature values, feature values 0-3:\n",
125 | "\n",
126 | " np.array([0, 1, 2, 1, 0, 3, 1, 0, 1, 2])\n",
127 | " \n",
128 | "The function you are going to implement should return a dictionary, where each dictionary key represents a unique value in the array, and the values are the indices in that array that map to the respective feature value. Hence, based on the feature array above, your `split` function should return the following dictionary:\n",
129 | "\n",
130 | " {0: array([0, 4, 7]), \n",
131 | " 1: array([1, 3, 6, 8]), \n",
132 | " 2: array([2, 9]), \n",
133 | " 3: array([5])}"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "Tip: I recommend you to use `np.where` and `np.unique` functions to make the implementation easier. If you do not remember these functions from the \"computational foundations\" lectures, you can either look up those functions in the NumPy documentation online, or you can execute `np.where?` and `np.unique?` in a new code cell to get more information."
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "def split(array):\n",
150 | " # your code to generate dictionary\n",
151 | " return # return the dictionary variable"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "I added the following code cell for your convenience to double-check your solution. If your results don't match the results shown below, there is a bug in your implementation."
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 4,
164 | "metadata": {},
165 | "outputs": [
166 | {
167 | "name": "stdout",
168 | "output_type": "stream",
169 | "text": [
170 | "{0: array([0]), 1: array([1]), 2: array([2])}\n",
171 | "{0: array([1, 3, 4, 6]), 1: array([0, 2, 5])}\n",
172 | "{0: array([1, 4]), 1: array([0, 5, 6]), 2: array([3]), 3: array([2])}\n"
173 | ]
174 | }
175 | ],
176 | "source": [
177 | "# DO NOT EDIT OR DELETE THIS CELL\n",
178 | "\n",
179 | "print(split(np.array([0, 1, 2])))\n",
180 | "print(split(np.array([1, 0, 1, 0, 0, 1, 0])))\n",
181 | "print(split(np.array([1, 0, 3, 2, 0, 1, 1])))"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | "### 1.2) Implement Entropy (10 pts)"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "After implementing the splitting function, we are now have to implement a criterion function so that we can compare splits on different features, to decide which feature is the best feature to split for growing the decision tree. As discussed in class, our splitting criterion will be Information Gain. However, before we implement an Information Gain function, we need to implement a function that computes the entropy at each node, which we need to compute Information Gain.\n",
196 | "\n",
197 | "For your reference, we defined entropy (i.e., Shannon Entropy) as follows:\n",
198 | "\n",
199 | "$$H(p) = \\sum_i p_i \\log_2 (1/p_i) = - \\sum_i p_i \\log_2 (p_i)$$\n",
200 | "\n",
201 | "where you can think of $p_i$ as the proportion of examples with class label $i$ at a given node."
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "def entropy(array):\n",
211 | " # your code\n",
212 | " # your code\n",
213 | " return # return a scalar"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "I added the following code cell for your convenience to double-check your solution. If your results don't match the results shown below, there is a bug in your implementation of the `entropy` function."
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 8,
226 | "metadata": {},
227 | "outputs": [
228 | {
229 | "name": "stdout",
230 | "output_type": "stream",
231 | "text": [
232 | "1.0\n",
233 | "1.0\n",
234 | "0.0\n",
235 | "0.4395\n",
236 | "0.0\n",
237 | "1.6577\n"
238 | ]
239 | }
240 | ],
241 | "source": [
242 | "# DO NOT EDIT OR DELETE THIS CELL\n",
243 | "\n",
244 | "print(round(entropy(np.array([0, 1, 0, 1, 1, 0])), 4))\n",
245 | "print(round(entropy(np.array([1, 2])), 4))\n",
246 | "print(round(entropy(np.array([1, 1])), 4))\n",
247 | "print(round(entropy(np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])), 4))\n",
248 | "print(round(entropy(np.array([0, 0, 0])), 4))\n",
249 | "print(round(entropy(np.array([1, 1, 1, 0, 1, 4, 4, 2, 1])), 4))"
250 | ]
251 | },
252 | {
253 | "cell_type": "markdown",
254 | "metadata": {},
255 | "source": [
256 | "### 1.3) Implement Information Gain (10 pts)"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {},
262 | "source": [
263 | "Now that you have a working solution for the `entropy` function, the next step is to compute the Information Gain. For your reference, information gain is computed as\n",
264 | "\n",
265 | "$$GAIN(\\mathcal{D}, x_j) = H(\\mathcal{D}) - \\sum_{v \\in Values(x_j)} \\frac{|\\mathcal{D}_v|}{|\\mathcal{D}|} H(\\mathcal{D}_v).$$"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": null,
271 | "metadata": {},
272 | "outputs": [],
273 | "source": [
274 | "def information_gain(x_array, y_array):\n",
275 | " parent_entropy = # your code\n",
276 | "\n",
277 | " split_dict = # your code\n",
278 | " \n",
279 | " for val in split_dict:\n",
280 | " freq = # your code\n",
281 | " child_entropy = # your code\n",
282 | " parent_entropy -= # your code\n",
283 | " \n",
284 | " return parent_entropy"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "I added the following code cell for your convenience to double-check your solution. If your results don't match the results shown below, there is a bug in your implementation of the `information_gain` function."
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 11,
297 | "metadata": {},
298 | "outputs": [
299 | {
300 | "name": "stdout",
301 | "output_type": "stream",
302 | "text": [
303 | "0.4591\n",
304 | "0.2516\n"
305 | ]
306 | }
307 | ],
308 | "source": [
309 | "# DO NOT EDIT OR DELETE THIS CELL\n",
310 | "\n",
311 | "x = np.array([0, 1, 0, 1, 0, 1])\n",
312 | "y = np.array([0, 1, 0, 1, 1, 1])\n",
313 | "print(round(information_gain(x, y), 4))\n",
314 | "\n",
315 | "x = np.array([0, 0, 1, 1, 2, 2])\n",
316 | "y = np.array([0, 1, 0, 1, 1, 1])\n",
317 | "print(round(information_gain(x, y), 4))"
318 | ]
319 | },
320 | {
321 | "cell_type": "markdown",
322 | "metadata": {},
323 | "source": [
324 | "(You may notice that these are actually the feature arrays from the midterm exam, Q 14.)"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {},
330 | "source": [
331 | "### 1.4) Decision Tree Splitting (10 pts)"
332 | ]
333 | },
334 | {
335 | "cell_type": "markdown",
336 | "metadata": {},
337 | "source": [
338 | "Now, we should have all the main components that we need for implementing the ID3 decision tree algorithm: a `split` function, an `entropy` function, and an `information_gain` function based on the `entropy` function. \n",
339 | "\n",
340 | "The next task is combine these functions to recursively split a dataset on its different features to construct a decision tree that separate the examples from different classes well. We will call this function `make_tree`. \n",
341 | "\n",
342 | "For simplicity, the decision tree returned by the `make_tree` function will be represented by a Python dictionary. To illustrate this, consider the following dataset:\n",
343 | "\n",
344 | "```\n",
345 | "Inputs:\n",
346 | " [[0 0]\n",
347 | " [0 1]\n",
348 | " [1 0]\n",
349 | " [1 1]\n",
350 | " [2 0]\n",
351 | " [2 1]]\n",
352 | "\n",
353 | "Labels:\n",
354 | " [0 1 0 1 1 1]\n",
355 | "```\n",
356 | " \n",
357 | "This is a dataset with 6 training examples and two features. (Again, this is an example from the midterm exam.) The decision tree in form of the Python dictionary should look like as follows:\n",
358 | "\n",
359 | "\n",
360 | "\n",
361 | "You should return a dictionary with the following form:\n",
362 | "\n",
363 | "```\n",
364 | "{'X_1 = 0': {'X_0 = 0': array([0]),\n",
365 | " 'X_0 = 1': array([0]),\n",
366 | " 'X_0 = 2': array([1])},\n",
367 | " 'X_1 = 1': array([1, 1, 1])}\n",
368 | " ```\n",
369 | " \n",
370 | "Let me further illustrate what the different parts of the dictionary mean. Here, the `'X_1'` in `'X_1 = 0'` refers feature 2 (the first column of the NumPy array; remember that Python starts the index at 0, in contrast to R). \n",
371 | "\n",
372 | "- 'X_1 = 0': For training examples stored in this node, the second feature has the value 0\n",
373 | "- 'X_1 = 1': For training examples stored in this node, the second feature has the value 1\n",
374 | "\n",
375 | "The \"array\" is a NumPy array that stores the class labels of the training examples at that node. In the case of `'X_1 = 0'` we actually store actually a sub-dictionary, because this node can be split further. If you have trouble understanding this dictionary representation, the following illustration might help:\n",
376 | "\n",
377 | "\n",
378 | ""
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "metadata": {},
385 | "outputs": [],
386 | "source": [
387 | "def make_tree(X, y):\n",
388 | " \n",
389 | " # Return array if node is empty or pure (1 example in leaf node)\n",
390 | " if y.shape[0] == 1 or y.shape[0] == 0:\n",
391 | " return y\n",
392 | "\n",
393 | " # Compute information gain for each feature\n",
394 | " gains = # YOUR CODE\n",
395 | "\n",
396 | " # Early stopping if there is no information gain\n",
397 | " if (gains <= 1e-05).all():\n",
398 | " return # YOUR CODE\n",
399 | " \n",
400 | " # Else, get best feature\n",
401 | " best_feature = np.argmax(gains)\n",
402 | "\n",
403 | " \n",
404 | " results = {}\n",
405 | " \n",
406 | " # Use the `split` function to split on the best feature\n",
407 | " subset_dict = split(X[:, best_feature])\n",
408 | "\n",
409 | " # Note that each entry in the dictionary returned by \n",
410 | " # split is an attribute_value:array_indices pair.\n",
411 | " # here, we are going to iterate over these key-value\n",
412 | " # pairs and select the respective examples for the\n",
413 | " # new child nodes\n",
414 | " \n",
415 | " for feature_value, train_example_indices in subset_dict.items():\n",
416 | " child_y_subset = # YOUR CODE\n",
417 | " child_x_subset = # YOUR CODE\n",
418 | "\n",
419 | " # Next, we are using \"recursion,\" that is, calling the same\n",
420 | " # tree_split function on the child subset(s)\n",
421 | " \n",
422 | " results[\"X_%d = %d\" % (best_feature, feature_value)] = \\\n",
423 | " make_tree(child_x_subset, child_y_subset)\n",
424 | "\n",
425 | " \n",
426 | " return results"
427 | ]
428 | },
429 | {
430 | "cell_type": "markdown",
431 | "metadata": {},
432 | "source": [
433 | "I added the following code cell for your convenience to double-check your solution. If your results don't match the results shown below, there is a bug in your implementation of the `make_tree` function."
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": 10,
439 | "metadata": {},
440 | "outputs": [
441 | {
442 | "name": "stdout",
443 | "output_type": "stream",
444 | "text": [
445 | "Inputs:\n",
446 | " [[0 0]\n",
447 | " [0 1]\n",
448 | " [1 0]\n",
449 | " [1 1]\n",
450 | " [2 0]\n",
451 | " [2 1]]\n",
452 | "\n",
453 | "Labels:\n",
454 | " [0 1 0 1 1 1]\n",
455 | "\n",
456 | "Decision tree:\n",
457 | " {'X_1 = 0': {'X_0 = 0': array([0]), 'X_0 = 1': array([0]), 'X_0 = 2': array([1])}, 'X_1 = 1': array([1, 1, 1])}\n"
458 | ]
459 | }
460 | ],
461 | "source": [
462 | "# DO NOT EDIT OR DELETE THIS CELL\n",
463 | "\n",
464 | "x1 = np.array([0, 0, 1, 1, 2, 2])\n",
465 | "x2 = np.array([0, 1, 0, 1, 0, 1])\n",
466 | "X = np.array([x1, x2]).T\n",
467 | "y = np.array([0, 1, 0, 1, 1, 1])\n",
468 | "\n",
469 | "print('Inputs:\\n', X)\n",
470 | "print('\\nLabels:\\n', y)\n",
471 | "\n",
472 | "print('\\nDecision tree:\\n', make_tree(X, y))"
473 | ]
474 | },
475 | {
476 | "cell_type": "markdown",
477 | "metadata": {},
478 | "source": [
479 | "### 1.5) Building a Decision Tree API (10 pts)"
480 | ]
481 | },
482 | {
483 | "cell_type": "markdown",
484 | "metadata": {},
485 | "source": [
486 | "The final step of this part of the homework is now to write an API around our decision tree code so that we can use is for making predictions. Here, we will use the common convention, established by scikit-learn, to implement the decision tree as a Python class with \n",
487 | "\n",
488 | "- a `fit` method that learns the decision tree model from a training set via the `make_tree` function we already implemented;\n",
489 | "- a `predict` method to predict the class labels of training examples or any unseen data points.\n",
490 | "\n",
491 | "For making predictions, since not all leaf nodes are guaranteed to be single training examples, we will use a majority voting function to predict the class label as discussed in class. I already implemented a `_traverse` method, which will recursively traverse a decision tree dictionary that is produced by the `make_tree` function.\n",
492 | "\n",
493 | "Note that for simplicity, the `predict` method will only be able to accept one data point at a time (instead of a collection of data points). Hence `x` is a vector of size $\\mathbb{R}^m$, where $m$ is the number of features. I use capital letters `X` to denote a matrix of size $\\mathbb{R}^{n\\times m}$, where $n$ is the number of training examples."
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": null,
499 | "metadata": {},
500 | "outputs": [],
501 | "source": [
502 | "class ID3DecisionTreeClassifer(object):\n",
503 | " \n",
504 | " def __init__(self):\n",
505 | " pass\n",
506 | " \n",
507 | " def fit(self, X, y):\n",
508 | " self.splits_ = # YOUR CODE to generate the decision tree dictionary\n",
509 | " \n",
510 | " def _majority_vote(self, label_array):\n",
511 | " return # YOUR CODE\n",
512 | " \n",
513 | " def _traverse(self, x, d):\n",
514 | " if isinstance(d, np.ndarray):\n",
515 | " return d\n",
516 | " for key in d:\n",
517 | " name, value = key.split(' = ')\n",
518 | " feature_idx = int(name.split('_')[-1])\n",
519 | " value = int(value)\n",
520 | " if x[feature_idx] == value:\n",
521 | " return self._traverse(x, d[key])\n",
522 | " \n",
523 | " def predict(self, x):\n",
524 | " \n",
525 | " label_array = # YOUR CODE to get class labels from the target node\n",
526 | " return #YOUR CODE to predict the class label via majority voting from label_array"
527 | ]
528 | },
529 | {
530 | "cell_type": "markdown",
531 | "metadata": {},
532 | "source": [
533 | "I added the following code cell for your convenience to double-check your solution. If your results don't match the results shown below, there is a bug in your implementation of the `make_tree` function."
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 12,
539 | "metadata": {},
540 | "outputs": [
541 | {
542 | "name": "stdout",
543 | "output_type": "stream",
544 | "text": [
545 | "0\n",
546 | "1\n",
547 | "0\n",
548 | "0\n",
549 | "1\n",
550 | "1\n",
551 | "1\n"
552 | ]
553 | }
554 | ],
555 | "source": [
556 | "# DO NOT EDIT OR DELETE THIS CELL\n",
557 | "\n",
558 | "tree = ID3DecisionTreeClassifer()\n",
559 | "tree.fit(X, y)\n",
560 | "\n",
561 | "print(tree.predict(np.array([0, 0])))\n",
562 | "print(tree.predict(np.array([0, 1])))\n",
563 | "print(tree.predict(np.array([1, 0])))\n",
564 | "print(tree.predict(np.array([1, 0])))\n",
565 | "print(tree.predict(np.array([1, 1])))\n",
566 | "print(tree.predict(np.array([2, 0])))\n",
567 | "print(tree.predict(np.array([2, 1])))"
568 | ]
569 | },
570 | {
571 | "cell_type": "markdown",
572 | "metadata": {},
573 | "source": [
574 | "
\n",
575 | "
\n",
576 | "
\n",
577 | "
\n",
578 | "
\n",
579 | "
"
580 | ]
581 | },
582 | {
583 | "cell_type": "markdown",
584 | "metadata": {},
585 | "source": [
586 | "## 2) Bagging"
587 | ]
588 | },
589 | {
590 | "cell_type": "markdown",
591 | "metadata": {},
592 | "source": [
593 | "In this second part of this homework, you will be combining multiple decision trees to a bagging classifier. This time, we will be using the decision tree algorithm implemented in scikit-learn (which is some variant of the CART algorithm for binary splits, as discussed in class)."
594 | ]
595 | },
596 | {
597 | "cell_type": "markdown",
598 | "metadata": {},
599 | "source": [
600 | "### 2.1 Bootrapping (10 pts)"
601 | ]
602 | },
603 | {
604 | "cell_type": "markdown",
605 | "metadata": {},
606 | "source": [
607 | "As you remember, bagging relies on bootstrap sampling. So, as a first step, your task is to implement a function for generating bootstrap samples. In this exercise, for simplicity, we will perform the computations based on the Iris dataset.\n",
608 | "\n",
609 | "On an interesting side note, scikit-learn recently updated their version of the Iris dataset since it was discovered that the Iris version hosted on the UCI machine learning repository (https://archive.ics.uci.edu/ml/datasets/Iris/) has two data points that are different from R. Fisher's original paper (Fisher,R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).) and changed it in their most recent version. Since most students may not have the latest scikit-learn version installed, we will be working with the Iris dataset that is deposited on UCI, which has become quite the standard in the Python machine learning community for benchmarking algorithms. Instead of manually downloading it, we will be fetching it through the `mlxtend` (http://rasbt.github.io/mlxtend/) library that you installed in the last homework."
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": 13,
615 | "metadata": {},
616 | "outputs": [
617 | {
618 | "name": "stdout",
619 | "output_type": "stream",
620 | "text": [
621 | "Number of examples: 150\n",
622 | "Number of features: 4\n",
623 | "Unique class labels: [0 1 2]\n"
624 | ]
625 | }
626 | ],
627 | "source": [
628 | "# DO NOT EDIT OR DELETE THIS CELL\n",
629 | "\n",
630 | "from mlxtend.data import iris_data\n",
631 | "X, y = iris_data()\n",
632 | "\n",
633 | "print('Number of examples:', X.shape[0])\n",
634 | "print('Number of features:', X.shape[1])\n",
635 | "print('Unique class labels:', np.unique(y))"
636 | ]
637 | },
638 | {
639 | "cell_type": "markdown",
640 | "metadata": {},
641 | "source": [
642 | "Use scikit-learn's `train_test_split` function to divide the dataset into a training and a test set.\n",
643 | "\n",
644 | "- The test set should contain 45 examples, and the training set should contain 105 examples.\n",
645 | "- To ensure reproducible results, use `123` as a random seed.\n",
646 | "- Perform a stratified split."
647 | ]
648 | },
649 | {
650 | "cell_type": "code",
651 | "execution_count": null,
652 | "metadata": {},
653 | "outputs": [],
654 | "source": [
655 | "from sklearn.model_selection import # YOUR CODE\n",
656 | "\n",
657 | "\n",
658 | "X_train, X_test, y_train, y_test = # YOUR CODE\n",
659 | "\n",
660 | "print('Number of training examples:', X_train.shape[0])\n",
661 | "print('Number of test examples:', X_test.shape[0])"
662 | ]
663 | },
664 | {
665 | "cell_type": "markdown",
666 | "metadata": {},
667 | "source": [
668 | "I added the following code cell for your convenience to double-check your solution. If your results don't match the results shown below, there is a bug in your implementation of the `make_tree` function."
669 | ]
670 | },
671 | {
672 | "cell_type": "code",
673 | "execution_count": 15,
674 | "metadata": {},
675 | "outputs": [
676 | {
677 | "name": "stdout",
678 | "output_type": "stream",
679 | "text": [
680 | "Number of training examples: 105\n",
681 | "Number of test examples: 45\n"
682 | ]
683 | }
684 | ],
685 | "source": [
686 | "# DO NOT EDIT OR DELETE THIS CELL\n",
687 | "\n",
688 | "print('Number of training examples:', X_train.shape[0])\n",
689 | "print('Number of test examples:', X_test.shape[0])"
690 | ]
691 | },
692 | {
693 | "cell_type": "markdown",
694 | "metadata": {},
695 | "source": [
696 | "Next we are implementing a function to generate bootstrap samples of the training set. In particular, we will perform the bootstrapping as follows:\n",
697 | "\n",
698 | "- Create an index array with values 0, ..., 104.\n",
699 | "- Draw a random sample (with replacement) from this index array using the `choice` method of a NumPy `RandomState` object that is passed to the function as `rng`. \n",
700 | "- Select training examples from the X array and labels from the y array using the new sample of indices."
701 | ]
702 | },
703 | {
704 | "cell_type": "code",
705 | "execution_count": null,
706 | "metadata": {},
707 | "outputs": [],
708 | "source": [
709 | "def draw_bootstrap_sample(rng, X, y):\n",
710 | " sample_indices = # YOUR CODE\n",
711 | " bootstrap_indices = rng.choice( # YOUR CODE )\n",
712 | " return X[# YOUR CODE], y[# YOUR CODE]"
713 | ]
714 | },
715 | {
716 | "cell_type": "markdown",
717 | "metadata": {},
718 | "source": [
719 | "I added the following code cell for your convenience to double-check your solution. If your results don't match the results shown below, there is a bug in your implementation of the `draw_bootstrap_sample` function."
720 | ]
721 | },
722 | {
723 | "cell_type": "code",
724 | "execution_count": 17,
725 | "metadata": {},
726 | "outputs": [
727 | {
728 | "name": "stdout",
729 | "output_type": "stream",
730 | "text": [
731 | "Number of training inputs from bootstrap round: 105\n",
732 | "Number of training labels from bootstrap round: 105\n",
733 | "Labels:\n",
734 | " [0 0 1 0 0 1 2 0 2 1 0 0 2 1 1 1 1 2 1 1 2 0 2 1 2 1 1 1 0 1 0 0 1 2 0 0 0\n",
735 | " 0 2 1 1 2 1 2 1 1 2 1 2 0 1 1 2 2 1 0 1 0 2 2 0 1 0 2 0 0 0 0 1 2 0 0 1 0\n",
736 | " 1 1 0 1 1 2 2 0 2 0 2 0 1 1 2 2 0 2 2 2 0 1 0 1 2 2 2 1 0 0 0]\n"
737 | ]
738 | }
739 | ],
740 | "source": [
741 | "# DO NOT EDIT OR DELETE THIS CELL\n",
742 | "\n",
743 | "rng = np.random.RandomState(123)\n",
744 | "X_boot, y_boot = draw_bootstrap_sample(rng, X_train, y_train)\n",
745 | "\n",
746 | "print('Number of training inputs from bootstrap round:', X_boot.shape[0])\n",
747 | "print('Number of training labels from bootstrap round:', y_boot.shape[0])\n",
748 | "print('Labels:\\n', y_boot)"
749 | ]
750 | },
751 | {
752 | "cell_type": "markdown",
753 | "metadata": {},
754 | "source": [
755 | "### 2.2 Baggging classifier from decision trees (10 pts)"
756 | ]
757 | },
758 | {
759 | "cell_type": "markdown",
760 | "metadata": {},
761 | "source": [
762 | "In this section, you will implement a Bagging algorithm based on the `DecisionTreeClassifier`. I provided a partial solution for you. "
763 | ]
764 | },
765 | {
766 | "cell_type": "code",
767 | "execution_count": null,
768 | "metadata": {},
769 | "outputs": [],
770 | "source": [
771 | "from sklearn.tree import DecisionTreeClassifier\n",
772 | "\n",
773 | "\n",
774 | "class BaggingClassifier(object):\n",
775 | " \n",
776 | " def __init__(self, num_trees=10, random_state=123):\n",
777 | " self.num_trees = num_trees\n",
778 | " self.rng = np.random.RandomState(random_state)\n",
779 | " \n",
780 | " \n",
781 | " def fit(self, X, y):\n",
782 | " self.trees_ = [DecisionTreeClassifier(random_state=self.rng) for i in range(self.num_trees)]\n",
783 | " for i in range(self.num_trees):\n",
784 | " X_boot, y_boot = # YOUR CODE to draw a bootstrap sample\n",
785 | " # YOUR CODE to\n",
786 | " # fit the trees in self.trees_ on the bootstrap samples\n",
787 | " \n",
788 | " def predict(self, X):\n",
789 | " ary = np.zeros((X.shape[0], len(self.trees_)), dtype=np.int)\n",
790 | " for i in range(len(self.trees_)):\n",
791 | " ary[:, i] = self.trees_[i].predict(X)\n",
792 | "\n",
793 | " maj = np.apply_along_axis(lambda x:\n",
794 | " np.argmax(np.bincount(x)),\n",
795 | " axis=1,\n",
796 | " arr=ary)\n",
797 | " return maj"
798 | ]
799 | },
800 | {
801 | "cell_type": "markdown",
802 | "metadata": {},
803 | "source": [
804 | "I added the following code cell for your convenience to double-check your solution. If your results don't match the results shown below, there is a bug in your implementation of the `BaggingClassifier()`."
805 | ]
806 | },
807 | {
808 | "cell_type": "code",
809 | "execution_count": 29,
810 | "metadata": {},
811 | "outputs": [
812 | {
813 | "name": "stdout",
814 | "output_type": "stream",
815 | "text": [
816 | "Individual Tree Accuracies:\n",
817 | "88.9%\n",
818 | "93.3%\n",
819 | "97.8%\n",
820 | "93.3%\n",
821 | "93.3%\n",
822 | "93.3%\n",
823 | "91.1%\n",
824 | "97.8%\n",
825 | "97.8%\n",
826 | "97.8%\n",
827 | "\n",
828 | "Bagging Test Accuracy: 97.8%\n"
829 | ]
830 | }
831 | ],
832 | "source": [
833 | "# DO NOT EDIT OR DELETE THIS CELL\n",
834 | "\n",
835 | "model = BaggingClassifier()\n",
836 | "model.fit(X_train, y_train)\n",
837 | "\n",
838 | "predictions = model.predict(X_test)\n",
839 | "\n",
840 | "print('Individual Tree Accuracies:')\n",
841 | "for tree in model.trees_:\n",
842 | " predictions = tree.predict(X_test) \n",
843 | " print('%.1f%%' % ((predictions == y_test).sum() / X_test.shape[0] * 100))\n",
844 | "\n",
845 | "print('\\nBagging Test Accuracy: %.1f%%' % ((predictions == y_test).sum() / X_test.shape[0] * 100))"
846 | ]
847 | },
848 | {
849 | "cell_type": "markdown",
850 | "metadata": {},
851 | "source": [
852 | "
\n",
853 | "
\n",
854 | "
\n",
855 | "
\n",
856 | "
\n",
857 | "
"
858 | ]
859 | },
860 | {
861 | "cell_type": "markdown",
862 | "metadata": {},
863 | "source": [
864 | "## 3) Bias-Variance Decomposition"
865 | ]
866 | },
867 | {
868 | "cell_type": "markdown",
869 | "metadata": {},
870 | "source": [
871 | "In this exercise you will be asked to compute the variance and bias components of the 0-1 loss that we discussed in class. \n",
872 | "\n",
873 | "- In particular, you will compute the average bias and the average variance over all test examples (instead of a single test example. \n",
874 | "\n",
875 | "- The dataset you will be using as training set(s) and test set is the Iris dataset that you already divided into `X_train` / `y_train` and `X_test` / `y_test` earlier.\n",
876 | "\n",
877 | "- Since we do not have unlimited training datasets to estimate the parameters (think back of the estimation over the training sets), we will use bootstrapping to simulate \"new\" training sets. \n"
878 | ]
879 | },
880 | {
881 | "cell_type": "markdown",
882 | "metadata": {},
883 | "source": [
884 | "### 3.1 Bias-Variance decomposition of the 0-1 Loss for Decision Trees (10 pts)"
885 | ]
886 | },
887 | {
888 | "cell_type": "markdown",
889 | "metadata": {},
890 | "source": [
891 | "In this first part, you will be computing the averaged bias and variance components over the test set examples for the decision tree algorithm implemented in scikit-learn on the Iris data. \n",
892 | "\n",
893 | "I already implemented the code for computing the \"main prediction\" for you:"
894 | ]
895 | },
896 | {
897 | "cell_type": "code",
898 | "execution_count": 20,
899 | "metadata": {},
900 | "outputs": [],
901 | "source": [
902 | "# DO NOT EDIT OR DELETE THIS CELL\n",
903 | "\n",
904 | "rng = np.random.RandomState(123)\n",
905 | "\n",
906 | "num_bootstrap = 200\n",
907 | "\n",
908 | "all_pred = np.zeros((num_bootstrap, y_test.shape[0]), dtype=np.int)\n",
909 | "\n",
910 | "for i in range(num_bootstrap):\n",
911 | " X_boot, y_boot = draw_bootstrap_sample(rng, X_train, y_train)\n",
912 | " pred = DecisionTreeClassifier(random_state=66).fit(X_boot, y_boot).predict(X_test)\n",
913 | " all_pred[i] = pred\n",
914 | " \n",
915 | "main_predictions = np.apply_along_axis(lambda x:\n",
916 | " np.argmax(np.bincount(x)),\n",
917 | " axis=0,\n",
918 | " arr=all_pred)"
919 | ]
920 | },
921 | {
922 | "cell_type": "markdown",
923 | "metadata": {},
924 | "source": [
925 | "Note that `all_pred` is a 2D array of dimension $\\mathbb{R}^{b \\times n_{test}}$, where $m$ is the number of bootstrap rounds and $n_{test}$ is the number of test examples in the test set. In other words, each of the 200 rows in this array stores the predictions of one particular decision tree hypothesis for all 45 test data points.\n",
926 | "\n",
927 | "Your first task is to compute the average bias over all test examples:"
928 | ]
929 | },
930 | {
931 | "cell_type": "code",
932 | "execution_count": null,
933 | "metadata": {},
934 | "outputs": [],
935 | "source": [
936 | "# YOUR CODE\n",
937 | "\n",
938 | "\n",
939 | "print('Average bias:', bias)"
940 | ]
941 | },
942 | {
943 | "cell_type": "markdown",
944 | "metadata": {},
945 | "source": [
946 | "Your second task is to compute the average variance over all test examples:"
947 | ]
948 | },
949 | {
950 | "cell_type": "code",
951 | "execution_count": null,
952 | "metadata": {},
953 | "outputs": [],
954 | "source": [
955 | "# YOUR CODE\n",
956 | "# you probably need multiple\n",
957 | "# lines of code and a for-loop\n",
958 | "\n",
959 | "print('Average variance:', var)"
960 | ]
961 | },
962 | {
963 | "cell_type": "markdown",
964 | "metadata": {},
965 | "source": [
966 | "Hint: The average bias and variance values are both scalars, not vectors or matrices. In other words, for each of the code cells above, you should return a real number (float)."
967 | ]
968 | },
969 | {
970 | "cell_type": "markdown",
971 | "metadata": {},
972 | "source": [
973 | "### 3.2 Bias-Variance decomposition of the 0-1 Loss for Bagging (10 pts)"
974 | ]
975 | },
976 | {
977 | "cell_type": "markdown",
978 | "metadata": {},
979 | "source": [
980 | "Use the code from the previous section, 3.1, to compare the decision tree algorithm with a BaggingClassifier from scikit-learn.\n",
981 | "\n",
982 | "- Report both the average bias and average variance just like before, but use the `BaggingClassifier` in scikit-learn instead of the `DecisionTreeClassifier`. You can use the default values of `BaggingClassifier`."
983 | ]
984 | },
985 | {
986 | "cell_type": "code",
987 | "execution_count": null,
988 | "metadata": {},
989 | "outputs": [],
990 | "source": [
991 | "# YOUR SOLUTION\n",
992 | "# Many lines of code (which you may copy and modify from 3.1)\n",
993 | "\n",
994 | "\n",
995 | "print('Average bias:', bias)\n",
996 | "print('Average variance:', var)"
997 | ]
998 | },
999 | {
1000 | "cell_type": "markdown",
1001 | "metadata": {},
1002 | "source": [
1003 | "Is the average variance higher or lower than the avergage of the decision tree in 3.1? And what about the average bias?"
1004 | ]
1005 | },
1006 | {
1007 | "cell_type": "markdown",
1008 | "metadata": {},
1009 | "source": [
1010 | "!!! TYPE YOUR ANSWER HERE !!!"
1011 | ]
1012 | },
1013 | {
1014 | "cell_type": "markdown",
1015 | "metadata": {},
1016 | "source": [
1017 | "### 3.3 Bias-Variance decomposition of the 0-1 Loss for AdaBoost (10 pts)"
1018 | ]
1019 | },
1020 | {
1021 | "cell_type": "markdown",
1022 | "metadata": {},
1023 | "source": [
1024 | "Use the code from the previous section, 3.1, to compare the decision tree algorithm with a AdaBoostClassifier from scikit-learn.\n",
1025 | "\n",
1026 | "- Report both the average bias and average variance just like before, but use the `AdaboostClassifier` in scikit-learn instead of the `DecisionTreeClassifier`. You can use the default values of `AdaboostClassifier`."
1027 | ]
1028 | },
1029 | {
1030 | "cell_type": "code",
1031 | "execution_count": null,
1032 | "metadata": {},
1033 | "outputs": [],
1034 | "source": [
1035 | "# YOUR SOLUTION\n",
1036 | "# Many lines of code (which you may copy and modify from 3.1)\n",
1037 | "\n",
1038 | "\n",
1039 | "\n",
1040 | "print('Average bias:', bias)\n",
1041 | "print('Average variance:', var)"
1042 | ]
1043 | },
1044 | {
1045 | "cell_type": "markdown",
1046 | "metadata": {},
1047 | "source": [
1048 | "Is the average variance higher or lower than the avergage of the decision tree in 3.1? And what about the average bias?"
1049 | ]
1050 | },
1051 | {
1052 | "cell_type": "markdown",
1053 | "metadata": {},
1054 | "source": [
1055 | "!!! TYPE YOUR ANSWER HERE !!!"
1056 | ]
1057 | },
1058 | {
1059 | "cell_type": "markdown",
1060 | "metadata": {},
1061 | "source": [
1062 | "
\n",
1063 | "
\n",
1064 | "
\n",
1065 | "
\n",
1066 | "
\n",
1067 | "
"
1068 | ]
1069 | },
1070 | {
1071 | "cell_type": "markdown",
1072 | "metadata": {},
1073 | "source": [
1074 | "## Bonus Exercise (10 pts)"
1075 | ]
1076 | },
1077 | {
1078 | "cell_type": "markdown",
1079 | "metadata": {},
1080 | "source": [
1081 | "In this bonus exercise, you will be asked to fit a `RandomForestClassifier` on a small subset (10%) of the MNIST handwritten digits dataset (http://yann.lecun.com/exdb/mnist/). For convenience, the following code loads this small subset via mlxtend:"
1082 | ]
1083 | },
1084 | {
1085 | "cell_type": "code",
1086 | "execution_count": 2,
1087 | "metadata": {},
1088 | "outputs": [
1089 | {
1090 | "name": "stdout",
1091 | "output_type": "stream",
1092 | "text": [
1093 | "Dimensions: 5000 x 784\n",
1094 | "1st row [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1095 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1096 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1097 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1098 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1099 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1100 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1101 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1102 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1103 | " 0. 51. 159. 253. 159. 50. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1104 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1105 | " 48. 238. 252. 252. 252. 237. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1106 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 54.\n",
1107 | " 227. 253. 252. 239. 233. 252. 57. 6. 0. 0. 0. 0. 0. 0.\n",
1108 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 10. 60. 224.\n",
1109 | " 252. 253. 252. 202. 84. 252. 253. 122. 0. 0. 0. 0. 0. 0.\n",
1110 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 163. 252. 252.\n",
1111 | " 252. 253. 252. 252. 96. 189. 253. 167. 0. 0. 0. 0. 0. 0.\n",
1112 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 51. 238. 253. 253.\n",
1113 | " 190. 114. 253. 228. 47. 79. 255. 168. 0. 0. 0. 0. 0. 0.\n",
1114 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 48. 238. 252. 252. 179.\n",
1115 | " 12. 75. 121. 21. 0. 0. 253. 243. 50. 0. 0. 0. 0. 0.\n",
1116 | " 0. 0. 0. 0. 0. 0. 0. 0. 38. 165. 253. 233. 208. 84.\n",
1117 | " 0. 0. 0. 0. 0. 0. 253. 252. 165. 0. 0. 0. 0. 0.\n",
1118 | " 0. 0. 0. 0. 0. 0. 0. 7. 178. 252. 240. 71. 19. 28.\n",
1119 | " 0. 0. 0. 0. 0. 0. 253. 252. 195. 0. 0. 0. 0. 0.\n",
1120 | " 0. 0. 0. 0. 0. 0. 0. 57. 252. 252. 63. 0. 0. 0.\n",
1121 | " 0. 0. 0. 0. 0. 0. 253. 252. 195. 0. 0. 0. 0. 0.\n",
1122 | " 0. 0. 0. 0. 0. 0. 0. 198. 253. 190. 0. 0. 0. 0.\n",
1123 | " 0. 0. 0. 0. 0. 0. 255. 253. 196. 0. 0. 0. 0. 0.\n",
1124 | " 0. 0. 0. 0. 0. 0. 76. 246. 252. 112. 0. 0. 0. 0.\n",
1125 | " 0. 0. 0. 0. 0. 0. 253. 252. 148. 0. 0. 0. 0. 0.\n",
1126 | " 0. 0. 0. 0. 0. 0. 85. 252. 230. 25. 0. 0. 0. 0.\n",
1127 | " 0. 0. 0. 0. 7. 135. 253. 186. 12. 0. 0. 0. 0. 0.\n",
1128 | " 0. 0. 0. 0. 0. 0. 85. 252. 223. 0. 0. 0. 0. 0.\n",
1129 | " 0. 0. 0. 7. 131. 252. 225. 71. 0. 0. 0. 0. 0. 0.\n",
1130 | " 0. 0. 0. 0. 0. 0. 85. 252. 145. 0. 0. 0. 0. 0.\n",
1131 | " 0. 0. 48. 165. 252. 173. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1132 | " 0. 0. 0. 0. 0. 0. 86. 253. 225. 0. 0. 0. 0. 0.\n",
1133 | " 0. 114. 238. 253. 162. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1134 | " 0. 0. 0. 0. 0. 0. 85. 252. 249. 146. 48. 29. 85. 178.\n",
1135 | " 225. 253. 223. 167. 56. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1136 | " 0. 0. 0. 0. 0. 0. 85. 252. 252. 252. 229. 215. 252. 252.\n",
1137 | " 252. 196. 130. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1138 | " 0. 0. 0. 0. 0. 0. 28. 199. 252. 252. 253. 252. 252. 233.\n",
1139 | " 145. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1140 | " 0. 0. 0. 0. 0. 0. 0. 25. 128. 252. 253. 252. 141. 37.\n",
1141 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1142 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1143 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1144 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1145 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1146 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1147 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1148 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
1149 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n"
1150 | ]
1151 | }
1152 | ],
1153 | "source": [
1154 | "from mlxtend.data import mnist_data\n",
1155 | "X, y = mnist_data()\n",
1156 | "\n",
1157 | "print('Dimensions: %s x %s' % (X.shape[0], X.shape[1]))\n",
1158 | "print('1st row', X[0])"
1159 | ]
1160 | },
1161 | {
1162 | "cell_type": "markdown",
1163 | "metadata": {},
1164 | "source": [
1165 | "The next code cell shuffles the dataset and divides it into 4500 training examples and 500 test examples, respectively."
1166 | ]
1167 | },
1168 | {
1169 | "cell_type": "code",
1170 | "execution_count": 3,
1171 | "metadata": {},
1172 | "outputs": [],
1173 | "source": [
1174 | "from mlxtend.preprocessing import shuffle_arrays_unison\n",
1175 | "\n",
1176 | "\n",
1177 | "X, y = shuffle_arrays_unison((X, y), random_seed=1)\n",
1178 | "X_train, y_train = X[:4500], y[:4500]\n",
1179 | "X_test, y_test = X[4500:], y[4500:]"
1180 | ]
1181 | },
1182 | {
1183 | "cell_type": "markdown",
1184 | "metadata": {},
1185 | "source": [
1186 | "Now, your task is to fit a RandomForest classifier on the training set and evaluate it's predictive accuracy on the test set. "
1187 | ]
1188 | },
1189 | {
1190 | "cell_type": "code",
1191 | "execution_count": 5,
1192 | "metadata": {},
1193 | "outputs": [
1194 | {
1195 | "name": "stdout",
1196 | "output_type": "stream",
1197 | "text": [
1198 | "Accuracy 93.6%\n"
1199 | ]
1200 | }
1201 | ],
1202 | "source": [
1203 | "from sklearn.ensemble import RandomForestClassifier\n",
1204 | "\n",
1205 | "model = RandomForestClassifier(n_estimators=100, random_state=123)\n",
1206 | "model.fit(#YOUR CODE)\n",
1207 | "\n",
1208 | "acc = # YOUR CODE\n",
1209 | "print('Accuracy %.1f%%' % acc)"
1210 | ]
1211 | },
1212 | {
1213 | "cell_type": "markdown",
1214 | "metadata": {},
1215 | "source": [
1216 | "Next, your task is to load an image of a digit (some_digit.png) from this directory into a Python array and classify it using the random forest model. The some_digit.png image is displayed below:"
1217 | ]
1218 | },
1219 | {
1220 | "cell_type": "markdown",
1221 | "metadata": {},
1222 | "source": [
1223 | ""
1224 | ]
1225 | },
1226 | {
1227 | "cell_type": "markdown",
1228 | "metadata": {},
1229 | "source": [
1230 | "Note: For loading the image, you need to install the Python imaging library PIL. Actually, Pillow, a more up-to-date fork is recommended. Execute one of the following two if you haven't installed Pillow already.\n",
1231 | " \n",
1232 | "- `conda install Pillow`\n",
1233 | "\n",
1234 | "- `pip install Pillow`"
1235 | ]
1236 | },
1237 | {
1238 | "cell_type": "markdown",
1239 | "metadata": {},
1240 | "source": [
1241 | "Again, I have partially pre-written the code for you."
1242 | ]
1243 | },
1244 | {
1245 | "cell_type": "code",
1246 | "execution_count": null,
1247 | "metadata": {},
1248 | "outputs": [],
1249 | "source": [
1250 | "from PIL import Image\n",
1251 | "import numpy as np\n",
1252 | "\n",
1253 | "def load_image(file_name):\n",
1254 | " img = Image.open(file_name)\n",
1255 | " img.load()\n",
1256 | " data = np.asarray(img, dtype=np.float)\n",
1257 | " return data\n",
1258 | "\n",
1259 | "x_image = # YOUR CODE"
1260 | ]
1261 | },
1262 | {
1263 | "cell_type": "code",
1264 | "execution_count": 5,
1265 | "metadata": {},
1266 | "outputs": [
1267 | {
1268 | "name": "stdout",
1269 | "output_type": "stream",
1270 | "text": [
1271 | "Digit: 5\n"
1272 | ]
1273 | }
1274 | ],
1275 | "source": [
1276 | "# The data needs to be represented as a vector (1 position for each feature)\n",
1277 | "x_transf = # YOUR CODE\n",
1278 | "\n",
1279 | "# Also, scikit-learn expects 2D arrays, so we need to add a dimension\n",
1280 | "x_transf = # YOUR CODE\n",
1281 | "\n",
1282 | "print('Digit:', model.predict(x_transf)[0])"
1283 | ]
1284 | }
1285 | ],
1286 | "metadata": {
1287 | "kernelspec": {
1288 | "display_name": "Python 3",
1289 | "language": "python",
1290 | "name": "python3"
1291 | },
1292 | "language_info": {
1293 | "codemirror_mode": {
1294 | "name": "ipython",
1295 | "version": 3
1296 | },
1297 | "file_extension": ".py",
1298 | "mimetype": "text/x-python",
1299 | "name": "python",
1300 | "nbconvert_exporter": "python",
1301 | "pygments_lexer": "ipython3",
1302 | "version": "3.6.6"
1303 | }
1304 | },
1305 | "nbformat": 4,
1306 | "nbformat_minor": 2
1307 | }
1308 |
--------------------------------------------------------------------------------
/hw_03/hw3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Problem Set 3"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "STAT 479: Machine Learning (Fall 2018) \n",
15 | "Instructor: Sebastian Raschka (sraschka@wisc.edu) \n",
16 | "Course website: http://pages.stat.wisc.edu/~sraschka/teaching/stat479-fs2018/\n",
17 | "\n",
18 | "**Due**: Dec 03 (before 11:59 pm).\n",
19 | "\n",
20 | "**How to submit**\n",
21 | "\n",
22 | "As mentioned in the lecture, you need to submit the `.ipynb` file with your answers plus an `.html` file, which will serve as a backup for us in case the `.ipynb` file cannot be opened on my or the TA's computer. In addition, you may also export the notebook as PDF and upload it as well.\n",
23 | "\n",
24 | "Again, we will be using the Canvas platform, so you need to submit your homework there. You should be able to resubmit the homework as many times as you like before the due date."
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "As usual, you do not write the whole code from scratch, and I provided you with a skeleton of code where you need to add the lines that I indicated. Not, however, that everyone's coding style is different. Where I use only one line of code, you may want to use multiple ones. Also, where you use one line of code, I may use multiple ones."
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "%load_ext watermark\n",
41 | "%watermark -d -u -a '' -v -p numpy,scipy,matplotlib,sklearn,mlxtend"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "\n",
49 | "
\n",
50 | "
\n",
51 | "
\n",
52 | "
\n",
53 | "
\n",
54 | "
\n",
55 | "
"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "## 1. Hyperparameter Tuning and Model Selection"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "### 1.1 [10 pts] Using Grid Search for Hyperparameter Tuning"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "In this exercise, you will be working with the Breast Cancer Wisconsin dataset,\n",
77 | "which contains 569 samples of malignant and benign tumor cells. \n",
78 | "\n",
79 | "The first two columns in the dataset store the unique ID numbers of the samples and the corresponding diagnoses (M = malignant, B = benign), respectively. Columns 3-32 contain 30 real-valued features that have been computed from digitized images of the cell nuclei, which can be used to build a model to predict whether a tumor is benign or malignant. The Breast Cancer Wisconsin dataset has been deposited in the UCI Machine Learning Repository, and more detailed information about this dataset can be found at https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wi sconsin+(Diagnostic).\n",
80 | "\n",
81 | "The next cell loads the datasets and converts the class label M (malignant) to a integer 1 and the label B (benign) to class label 0."
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "# EXECUTE BUT DO NOT MODIFY THIS CELL\n",
91 | "\n",
92 | "import pandas as pd\n",
93 | "\n",
94 | "\n",
95 | "df = pd.read_csv('data/wdbc.data', header=None)\n",
96 | "\n",
97 | "# convert class label \"M\"->1 and label \"B\"->0\n",
98 | "df[1] = df[1].apply(lambda x: 1 if x == 'M' else 0)\n",
99 | "\n",
100 | "\n",
101 | "df.head()"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "# EXECUTE BUT DO NOT MODIFY THIS CELL\n",
111 | "\n",
112 | "\n",
113 | "from sklearn.model_selection import train_test_split\n",
114 | "\n",
115 | "\n",
116 | "y = df[1].values\n",
117 | "X = df.loc[:, 2:].values\n",
118 | "\n",
119 | "X_train, X_test, y_train, y_test = \\\n",
120 | " train_test_split(X, y, test_size=0.3, shuffle=True, random_state=0, stratify=y)"
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "metadata": {},
126 | "source": [
127 | "Now, your task is to use `GridSearchCV` from scikit-learn to find the best parameter for `n_neighbors` of a `KNearestNeighborClassifier`\n",
128 | "\n",
129 | "As hyperparameter values, you only need to consider the number of `n_neighbors` within the range 1-16 (including 16)."
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "# MODIFY THIS CELL\n",
139 | "\n",
140 | "from sklearn.pipeline import make_pipeline\n",
141 | "from sklearn.preprocessing import StandardScaler\n",
142 | "from sklearn.neighbors import KNeighborsClassifier\n",
143 | "from sklearn.model_selection import GridSearchCV\n",
144 | "\n",
145 | "\n",
146 | "pipe = make_pipeline(# YOUR CODE HERE\n",
147 | " # YOUR CODE HERE\n",
148 | ")\n",
149 | "\n",
150 | "param_grid = [{ # YOUR CODE HERE }]\n",
151 | "\n",
152 | "\n",
153 | "gs = GridSearchCV(# YOUR CODE HERE \n",
154 | " # YOUR CODE HERE \n",
155 | " iid=False,\n",
156 | " n_jobs=-1,\n",
157 | " refit=True,\n",
158 | " scoring='accuracy',\n",
159 | " cv=10)\n",
160 | "\n",
161 | "gs.fit(X_train, y_train)\n",
162 | "\n",
163 | "print('Best Accuracy: %.2f%%' % (gs.best_score_*100))"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "Next, print the best parameters obtained from the `GridSearchCV` run and compute the accuracy a `KNearestNeighborClassifier` would achieve with these settings on the test set (`X_test`, `y_test`)."
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "# MODIFY THIS CELL\n",
180 | "\n",
181 | "print('Best Params: %s' % # YOUR CODE HERE)\n",
182 | "print('Test Accuracy: %.2f%%' % # YOUR CODE HERE)"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "\n",
190 | "
\n",
191 | "
\n",
192 | "
\n",
193 | "
\n",
194 | "
\n",
195 | "
\n",
196 | "
"
197 | ]
198 | },
199 | {
200 | "cell_type": "markdown",
201 | "metadata": {},
202 | "source": [
203 | "### 1.2 [10 pts] Estimate the Generalization Performance using the '.632+' Bootstrap"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "In this exercise, you are asked to compute the accuracy of the model from the previous exercise (1.1) on the test set (`X_test`, `y_test`) using the .632+ Bootstrap method. For this you can use the `bootstrap_point632_score` function implemented in MLxtend for this: \n",
211 | "http://rasbt.github.io/mlxtend/user_guide/evaluate/bootstrap_point632_score/"
212 | ]
213 | },
214 | {
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "- use 200 bootstrap rounds\n",
219 | "- set the random seed to 1\n",
220 | "\n",
221 | "The accruacy should be the mean accuracy over the 200 bootstrap values that the `bootstrap_point632_score` method returns."
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "metadata": {},
228 | "outputs": [],
229 | "source": [
230 | "# MODIFY THIS CELL\n",
231 | "\n",
232 | "from mlxtend.evaluate import bootstrap_point632_score\n",
233 | "import numpy as np\n",
234 | "\n",
235 | "\n",
236 | "scores = bootstrap_point632_score(# YOUR CODE HERE)\n",
237 | "\n",
238 | "acc = # YOUR CODE HERE\n",
239 | "print('Accuracy: %.2f%%' % (100*acc))"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "Next, compute the lower and upper bound on the mean accuracy via a 95% confidence interval. For that, you should use the `scores` you computed in the cell above."
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": null,
252 | "metadata": {},
253 | "outputs": [],
254 | "source": [
255 | "# MODIFY THIS CELL\n",
256 | "\n",
257 | "lower = # YOUR CODE\n",
258 | "upper = # YOUR CODE\n",
259 | "\n",
260 | "print('95%% Confidence interval: [%.2f, %.2f]' % (100*lower, 100*upper))"
261 | ]
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "metadata": {},
266 | "source": [
267 | "\n",
268 | "
\n",
269 | "
\n",
270 | "
\n",
271 | "
\n",
272 | "
\n",
273 | "
\n",
274 | "
"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 | "## 2. Confusion Matrices"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "### 2.1 [10 pts] Contructing a Binary Confusion Matrix"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {},
294 | "source": [
295 | "The task of this execise is to construct a binary confusion matrix based of the following form:\n",
296 | "\n",
297 | "\n",
298 | "\n",
299 | "Here, assume that the positive class is the class with label 0, and the negative class is the class with label 1. You are given an array of the actual class labels, `y_true`, as well as an array of the predicted class labels, `y_predicted`. The output should be a numpy array, like shown below\n",
300 | "\n",
301 | "```\n",
302 | "array([[101, 21],\n",
303 | " [41, 121]])\n",
304 | "``` \n",
305 | " \n",
306 | "(Note that these number in the array are not the actual, expected or correct values.)\n",
307 | "\n",
308 | "Using the `plot_confusion_matrix` from the `helper.py` script (which should be in the same directory as this notebook) the example array/confusion matrix is visualized as follows:"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "%matplotlib inline"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": null,
323 | "metadata": {},
324 | "outputs": [],
325 | "source": [
326 | "# EXECUTE BUT DO NOT MODIFY THIS CELL\n",
327 | "\n",
328 | "import numpy as np\n",
329 | "from helper import plot_confusion_matrix\n",
330 | "import matplotlib.pyplot as plt\n",
331 | "\n",
332 | "\n",
333 | "example_cm = np.array([[101, 21],\n",
334 | " [41, 121]])\n",
335 | "\n",
336 | "plot_confusion_matrix(example_cm)\n",
337 | "plt.show()"
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "metadata": {},
343 | "source": [
344 | "Now, your task is to complete the `confusion_matrix_binary` below in order to construct a confusion matrix from 2 label arrays:\n",
345 | "\n",
346 | "- `y_true` (true or actual class labels)\n",
347 | "- `y_predicted` (class labels predicted by a classifier)\n",
348 | "\n",
349 | "To make it easier for you, you only need to replace the `???`'s with the right variable name (`tp`, `fn`, `fp`, or `tn`)."
350 | ]
351 | },
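{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a sanity check for your counts, note that a quantity like `tp` can also be computed with boolean masks; a minimal sketch on toy arrays (remember that label 0 is the positive class here):\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"a = np.array([0, 0, 1, 1])  # toy actual labels\n",
"b = np.array([0, 1, 1, 0])  # toy predicted labels\n",
"\n",
"tp_check = np.sum((a == 0) & (b == 0))  # positive (label 0) in both arrays\n",
"print(tp_check)  # 1\n",
"```"
]
},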
352 | {
353 | "cell_type": "code",
354 | "execution_count": null,
355 | "metadata": {},
356 | "outputs": [],
357 | "source": [
358 | "# MODIFY THIS CELL\n",
359 | "\n",
360 | "\n",
361 | "y_true = np.array([1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0])\n",
362 | "y_predicted = np.array([1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0])\n",
363 | "\n",
364 | "\n",
365 | "def confusion_matrix_binary(y_true, y_predicted):\n",
366 | "\n",
367 | " tp, fn, fp, tn = 0, 0, 0, 0\n",
368 | " \n",
369 | " for i, j in zip(y_true, y_predicted):\n",
370 | " if i == j:\n",
371 | " if i == 0:\n",
372 | " ??? += 1\n",
373 | " else:\n",
374 | " ??? += 1\n",
375 | " else:\n",
376 | " if i == 0:\n",
377 | " ??? += 1\n",
378 | " else:\n",
379 | " ??? += 1\n",
380 | " \n",
381 | " conf_matrix = np.zeros(4).reshape(2, 2).astype(int)\n",
382 | " conf_matrix[0, 0] = ???\n",
383 | " conf_matrix[0, 1] = ???\n",
384 | " conf_matrix[1, 0] = ???\n",
385 | " conf_matrix[1, 1] = ??? \n",
386 | " \n",
387 | " return conf_matrix\n",
388 | "\n",
389 | "result_matrix = confusion_matrix_binary(y_true, y_predicted)"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": null,
395 | "metadata": {},
396 | "outputs": [],
397 | "source": [
398 | "# EXECUTE BUT DO NOT MODIFY THIS CELL\n",
399 | "\n",
400 | "print('Conusion matrix array:\\n', result_matrix)"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": null,
406 | "metadata": {},
407 | "outputs": [],
408 | "source": [
409 | "# EXECUTE BUT DO NOT MODIFY THIS CELL\n",
410 | "\n",
411 | "plot_confusion_matrix(result_matrix)\n",
412 | "plt.show()"
413 | ]
414 | },
415 | {
416 | "cell_type": "markdown",
417 | "metadata": {},
418 | "source": [
419 | "\n",
420 | "
\n",
421 | "
\n",
422 | "
\n",
423 | "
\n",
424 | "
\n",
425 | "
\n",
426 | "
"
427 | ]
428 | },
429 | {
430 | "cell_type": "markdown",
431 | "metadata": {},
432 | "source": [
433 | "### 2.2 [10 pts] Constructing a Multiclass Confusion Matrix"
434 | ]
435 | },
436 | {
437 | "cell_type": "markdown",
438 | "metadata": {},
439 | "source": [
440 | "Next, write a version of this confusion matrix that generalizes to multi-class settings as shown in the figure below:\n",
441 | "\n",
442 | " \n",
443 | "\n",
444 | "\n",
445 | "\n",
446 | "Again, the output should be a 2D NumPy array:\n",
447 | "\n",
448 | "```\n",
449 | "array([[3, 0, 0],\n",
450 | " [7, 50, 12],\n",
451 | " [0, 0, 18]])\n",
452 | "```\n",
453 | " \n",
454 | "(Note that these number in the array are not the actual, expected or correct values for this exercise.)\n",
455 | "\n",
456 | "\n",
457 | "There are many different ways to implement a function to construct a multi-class confusion matrix, and in this exercise, you are given the freedom to implement it however way you prefer. Please note though that you should not import confusion matrix code from other packages but implement it by your self in Python (and NumPy)."
458 | ]
459 | },
460 | {
461 | "cell_type": "markdown",
462 | "metadata": {},
463 | "source": [
464 | "Note that if there are 5 different class labels (0, ..., 4), then the result should be a 5x5 confusion matrix."
465 | ]
466 | },
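{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a starting point, one common approach (a sketch, by no means the only valid one) is to allocate an all-zero matrix and increment the entry indexed by each (actual, predicted) label pair; this sketch assumes consecutive integer labels 0, ..., n-1, as in this exercise:\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"toy_true = [0, 1, 1, 2]\n",
"toy_pred = [0, 1, 2, 2]\n",
"\n",
"n_labels = np.unique(np.concatenate((toy_true, toy_pred))).shape[0]\n",
"mat = np.zeros((n_labels, n_labels), dtype=int)\n",
"for t, p in zip(toy_true, toy_pred):\n",
"    mat[t, p] += 1  # row = actual label, column = predicted label\n",
"print(mat)\n",
"```"
]
},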
467 | {
468 | "cell_type": "code",
469 | "execution_count": null,
470 | "metadata": {},
471 | "outputs": [],
472 | "source": [
473 | "## FOR STUDENTS\n",
474 | "\n",
475 | "\n",
476 | "import numpy as np\n",
477 | "\n",
478 | "\n",
479 | "def confusion_matrix_multiclass(y_true, y_predicted):\n",
480 | "\n",
481 | " # YOUR CODE (As many lines of code as you like)\n",
482 | " \n",
483 | " return matrix\n",
484 | "\n",
485 | "\n",
486 | "y_true = [1, 1, 1, 1, 0, 2, 0, 3, 4, 2, 1, 2, 2, 1, 2, 1, 0, 1, 1, 0]\n",
487 | "y_predicted = [1, 0, 1, 1, 0, 2, 1, 3, 4, 2, 2, 0, 2, 1, 2, 1, 0, 3, 1, 1]\n",
488 | "\n",
489 | "result_matrix = confusion_matrix_multiclass(y_true, y_predicted)\n",
490 | "result_matrix"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": null,
496 | "metadata": {},
497 | "outputs": [],
498 | "source": [
499 | "# EXECUTE BUT DO NOT MODIFY THIS CELL\n",
500 | "\n",
501 | "from helper import plot_confusion_matrix\n",
502 | "\n",
503 | "\n",
504 | "plot_confusion_matrix(result_matrix)\n",
505 | "plt.show()"
506 | ]
507 | },
508 | {
509 | "cell_type": "markdown",
510 | "metadata": {},
511 | "source": [
512 | "\n",
513 | "
\n",
514 | "
\n",
515 | "
\n",
516 | "
\n",
517 | "
\n",
518 | "
\n",
519 | "
"
520 | ]
521 | },
522 | {
523 | "cell_type": "markdown",
524 | "metadata": {},
525 | "source": [
526 | "### 2.3 [10 pts] Binary Confusion Matrices for Multiclass Problems"
527 | ]
528 | },
529 | {
530 | "cell_type": "markdown",
531 | "metadata": {},
532 | "source": [
533 | "In this exercise, you will be building binary confusion matrices for multiclass problems as discussed in class when we talked about computing the balanced accuracy. Here, you can reuse the `confusion_matrix_binary` function you implemented in 2.1. \n",
534 | "\n",
535 | "Remember, if we are given 5 class labels (0, ..., 4) then we can construct 5 binary confusion matrices, where each time one of the 5 classes is assigned the positive class where all other classes will be considered as the negative class. The `positive_label` argument in the `binary_cm_from_multiclass` function below can be used to determine which class label refers to the positive class.\n",
536 | "\n",
537 | "Implementing the function below is actually very easy and should only require you to add 2 lines of code with the help of the `np.where` function. "
538 | ]
539 | },
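{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a reminder of how `np.where` can remap labels, here is a minimal sketch on toy values (label 2 is treated as the positive label purely for illustration); recall from 2.1 that the positive class maps to label 0 and the negative class to label 1:\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"ary = np.array([0, 2, 1, 2, 4])\n",
"print(np.where(ary == 2, 0, 1))  # [1 0 1 0 1]\n",
"```"
]
},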
540 | {
541 | "cell_type": "code",
542 | "execution_count": null,
543 | "metadata": {},
544 | "outputs": [],
545 | "source": [
546 | "# MODIFY THIS CELL\n",
547 | "\n",
548 | "def binary_cm_from_multiclass(y_true, y_predicted, positive_label):\n",
549 | " \n",
550 | " y_true_ary = np.array(y_true)\n",
551 | " y_predicted_ary = np.array(y_predicted)\n",
552 | " \n",
553 | " y_true_mod = np.where( # YOUR CODE\n",
554 | " y_predicted_mod = np.where( # YOUR CODE\n",
555 | " \n",
556 | " cm = confusion_matrix_binary(y_true_mod, y_predicted_mod)\n",
557 | " return cm"
558 | ]
559 | },
560 | {
561 | "cell_type": "markdown",
562 | "metadata": {},
563 | "source": [
564 | "As a hint, the expected output for label 0 as positive label is shown below:"
565 | ]
566 | },
567 | {
568 | "cell_type": "markdown",
569 | "metadata": {},
570 | "source": [
571 | ""
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": null,
577 | "metadata": {},
578 | "outputs": [],
579 | "source": [
580 | "# EXECUTE BUT DO NOT MODIFY THIS CELL\n",
581 | "\n",
582 | "\n",
583 | "y_true = [1, 1, 1, 1, 0, 2, 0, 3, 4, 2, 1, 2, 2, 1, 2, 1, 0, 1, 1, 0]\n",
584 | "y_predicted = [1, 0, 1, 1, 0, 2, 1, 3, 4, 2, 2, 0, 2, 1, 2, 1, 0, 3, 1, 1]\n",
585 | "\n",
586 | "\n",
587 | "mat_pos0 = binary_cm_from_multiclass(y_true, y_predicted, positive_label=0)\n",
588 | "print('Positive Label 0:\\n', mat_pos0)\n",
589 | "\n",
590 | "fig, ax = plot_confusion_matrix(mat_pos0)\n",
591 | "ax.set_xticklabels(['', 'Pos Class (0)', 'Neg Class (Rest)'])\n",
592 | "ax.set_yticklabels(['', 'Pos Class (0)', 'Neg Class (Rest)']);"
593 | ]
594 | },
595 | {
596 | "cell_type": "code",
597 | "execution_count": null,
598 | "metadata": {},
599 | "outputs": [],
600 | "source": [
601 | "# EXECUTE BUT DO NOT MODIFY THIS CELL\n",
602 | "\n",
603 | "mat_pos1 = binary_cm_from_multiclass(y_true, y_predicted, positive_label=1)\n",
604 | "print('\\n\\nPositive Label 1:\\n', mat_pos1)\n",
605 | "\n",
606 | "fig, ax = plot_confusion_matrix(mat_pos1)\n",
607 | "ax.set_xticklabels(['', 'Pos Class (1)', 'Neg Class (Rest)'])\n",
608 | "ax.set_yticklabels(['', 'Pos Class (1)', 'Neg Class (Rest)']);\n",
609 | "\n",
610 | "plt.show()"
611 | ]
612 | },
613 | {
614 | "cell_type": "markdown",
615 | "metadata": {},
616 | "source": [
617 | "\n",
618 | "
\n",
619 | "
\n",
620 | "
\n",
621 | "
\n",
622 | "
\n",
623 | "
\n",
624 | "
"
625 | ]
626 | },
627 | {
628 | "cell_type": "markdown",
629 | "metadata": {},
630 | "source": [
631 | "## 3. [10 pts] Balanced Accuracy"
632 | ]
633 | },
634 | {
635 | "cell_type": "markdown",
636 | "metadata": {},
637 | "source": [
638 | "Based on our discussion in class, implement a function that computes the balanced accuracy. You can implement the accuracy whatever way you like using Python and NumPy. Note that you can also re-use the binary confusion matrix code and the `binary_cm_from_multiclass` code if you like (but you don't have to).\n",
639 | "\n",
640 | "Below is a template that you can use that does not require code from the previous exercises (but you can write the function in a different way if you like as long as it gives the correct results)."
641 | ]
642 | },
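{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick numeric check with made-up numbers: if a classifier gets 9 of 10 examples of class 0 right and 30 of 60 examples of class 1 right, the balanced accuracy is the mean of the per-class accuracies:\n",
"\n",
"```python\n",
"per_class = [9/10, 30/60]  # per-class accuracies: 0.9 and 0.5\n",
"print(sum(per_class) / len(per_class))  # 0.7\n",
"```"
]
},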
643 | {
644 | "cell_type": "code",
645 | "execution_count": null,
646 | "metadata": {},
647 | "outputs": [],
648 | "source": [
649 | "# MODIFY THIS CELL\n",
650 | "\n",
651 | "import numpy as np\n",
652 | "\n",
653 | "\n",
654 | "def balanced_accuracy(y_true, y_predicted):\n",
655 | " \n",
656 | " y_true_ary = np.array(y_true)\n",
657 | " y_predicted_ary = np.array(y_predicted)\n",
658 | " \n",
659 | " unique_labels = np.unique(np.concatenate((y_true_ary, y_predicted_ary)))\n",
660 | " class_accuracies = []\n",
661 | " for l in unique_labels:\n",
662 | " # YOUR CODE HERE\n",
663 | " # YOUR CODE HERE\n",
664 | " # YOUR CODE HERE\n",
665 | " class_accuracies.append(acc)\n",
666 | " return np.mean(class_accuracies)"
667 | ]
668 | },
669 | {
670 | "cell_type": "code",
671 | "execution_count": null,
672 | "metadata": {},
673 | "outputs": [],
674 | "source": [
675 | "# EXECUTE BUT DO NOT MODIFY THIS CELL\n",
676 | "\n",
677 | "y_targ = [1, 1, 2, 1, 1, 2, 0, 3]\n",
678 | "y_pred = [0, 0, 2, 1, 1, 2, 1, 3]\n",
679 | " \n",
680 | "balanced_accuracy(y_targ, y_pred)"
681 | ]
682 | },
683 | {
684 | "cell_type": "markdown",
685 | "metadata": {},
686 | "source": [
687 | "\n",
688 | "
\n",
689 | "
\n",
690 | "
\n",
691 | "
\n",
692 | "
\n",
693 | "
\n",
694 | "
"
695 | ]
696 | },
697 | {
698 | "cell_type": "markdown",
699 | "metadata": {},
700 | "source": [
701 | "## 4. Receiver Operater Characteristic (ROC)"
702 | ]
703 | },
704 | {
705 | "cell_type": "markdown",
706 | "metadata": {},
707 | "source": [
708 | "### 4.1 [10 pts] Plotting a ROC Curve"
709 | ]
710 | },
711 | {
712 | "cell_type": "markdown",
713 | "metadata": {},
714 | "source": [
715 | "In this exercise, you are asked to plot a ROC curve. You are given a 2D array of probability values (`y_probabilities`; see next code cells) where \n",
716 | "- a value in the first column refer to the probability that a given test example (each row is one test example) belongs to class 0\n",
717 | "- a value in the second column refer to the probability that a given test example belongs to class 1"
718 | ]
719 | },
720 | {
721 | "cell_type": "code",
722 | "execution_count": null,
723 | "metadata": {},
724 | "outputs": [],
725 | "source": [
726 | "# EXECUTE BUT DO NOT MODIFY THIS CELL\n",
727 | "\n",
728 | "\n",
729 | "from mlxtend.data import iris_data\n",
730 | "from sklearn.model_selection import train_test_split\n",
731 | "from sklearn.linear_model import LogisticRegression\n",
732 | "\n",
733 | "\n",
734 | "X, y = iris_data()\n",
735 | "X, y = X[:100, [1]], y[:100]\n",
736 | "X_train, X_test, y_train, y_test = \\\n",
737 | " train_test_split(X, y, test_size=0.5, shuffle=True, random_state=0, stratify=y)\n",
738 | "\n",
739 | "model = LogisticRegression(solver='lbfgs', random_state=123)\n",
740 | "model.fit(X_train, y_train)\n",
741 | "\n",
742 | "y_probabilities = model.predict_proba(X_test)\n",
743 | "\n",
744 | "print(y_probabilities)"
745 | ]
746 | },
747 | {
748 | "cell_type": "markdown",
749 | "metadata": {},
750 | "source": [
751 | "For this exercise, these scores are probabilities here, but scores can be obtained from an arbitrary classifier (ROC curves are not limited to logistic regression classifiers). For instance, in k-nearest neighbor classifiers, we can consider the fraction of the majority class labels and number of neighbors as the score. In decision tree classifiers, the score can be calculated as the ratio of the majority class labels and number of data points at a given node.\n",
752 | "\n",
753 | "(In case you are curious, 'lbfgs' stands for Limited-memory BFGS, which is an optimization algorithm in the family of quasi-Newton methods that approximates the Broyden–Fletcher–Goldfarb–Shanno; not important to know here though.) "
754 | ]
755 | },
756 | {
757 | "cell_type": "markdown",
758 | "metadata": {},
759 | "source": [
760 | "**Note: You should only use Python base functions, NumPy, and matplotlib to get full points (do not use other external libraries)**"
761 | ]
762 | },
763 | {
764 | "cell_type": "markdown",
765 | "metadata": {},
766 | "source": [
767 | "The `pos_label` argument is used to specify the positive label and the threshold. For instance, if we are given score\n",
768 | "0.8, this score refers to the \"probability\" of the positive label. Assuming that the positive label is 1, this refers to a 80% probability that the true class label is 1. \n",
769 | "\n",
770 | "- Note that in the `y_probabilities` array, the second column refers to the probabilities of class label 1.\n",
771 | "- The `plot_roc_curve` function should only receive a 1D array for `y_score`. E.g., \n",
772 | "\n",
773 | "if `y_probabilities` is \n",
774 | "\n",
775 | "```\n",
776 | "[[0.44001556 0.55998444]\n",
777 | " [0.69026364 0.30973636]\n",
778 | " [0.31814182 0.68185818]\n",
779 | " [0.56957726 0.43042274]\n",
780 | " [0.86339788 0.13660212]\n",
781 | " [0.56957726 0.43042274]\n",
782 | " [0.86339788 0.13660212]\n",
783 | " [0.44001556 0.55998444]\n",
784 | " [0.08899234 0.91100766]\n",
785 | " [0.50487831 0.49512169]\n",
786 | " [0.74306586 0.25693414]\n",
787 | "```\n",
788 | " \n",
789 | "The `y_score` array is expected to be \n",
790 | "\n",
791 | "a) `y_score = [0.5599..., 0.3097..., 0.6818..., 0.4304..., ...]` for `pos_label=1`\n",
792 | "\n",
793 | "and \n",
794 | "\n",
795 | "b) `y_score = [0.4400..., 0.6902..., 0.3181..., 0.5695..., ...]` for `pos_label=0`"
796 | ]
797 | },
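{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make one iteration of the threshold loop concrete, here is a minimal sketch on made-up values (the variable names are hypothetical). Recall that the true positive rate is the number of true positives divided by the number of actual positives, and the false positive rate is the number of false positives divided by the number of actual negatives:\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"toy_true = np.array([1, 0, 1, 0])\n",
"toy_score = np.array([0.9, 0.6, 0.4, 0.2])  # scores for positive class 1\n",
"\n",
"thr = 0.5\n",
"binarized = np.where(toy_score >= thr, 1, 0)  # [1, 1, 0, 0]\n",
"\n",
"tp = np.sum((binarized == 1) & (toy_true == 1))  # 1\n",
"fp = np.sum((binarized == 1) & (toy_true == 0))  # 1\n",
"print(tp / np.sum(toy_true == 1))  # TPR = 0.5\n",
"print(fp / np.sum(toy_true == 0))  # FPR = 0.5\n",
"```"
]
},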
798 | {
799 | "cell_type": "code",
800 | "execution_count": null,
801 | "metadata": {},
802 | "outputs": [],
803 | "source": [
804 | "# MODIFY THIS CELL\n",
805 | "\n",
806 | "\n",
807 | "import matplotlib.pyplot as plt\n",
808 | "import numpy as np\n",
809 | "\n",
810 | "\n",
811 | "def plot_roc_curve(y_true, y_score, pos_label=1, num_thresholds=100):\n",
812 | "\n",
813 | " y_true_ary = np.array(y_true)\n",
814 | " y_score_ary = np.array(y_score)\n",
815 | " x_axis_values = []\n",
816 | " y_axis_values = []\n",
817 | " thresholds = np.linspace(0., 1., num_thresholds)\n",
818 | "\n",
819 | " num_positives = # YOUR CODE\n",
820 | " num_negatives = # YOUR CODE\n",
821 | "\n",
822 | " for i, thr in enumerate(thresholds):\n",
823 | " \n",
824 | " binarized_scores = np.where(y_score >= thr, pos_label, int(not pos_label))\n",
825 | " \n",
826 | " positive_predictions = # YOUR CODE\n",
827 | " num_true_positives = # YOUR CODE\n",
828 | " num_false_positives = # YOUR CODE\n",
829 | " \n",
830 | " x_axis_values.append(# YOUR CODE)\n",
831 | " y_axis_values.append(# YOUR CODE)\n",
832 | "\n",
833 | " plt.step(x_axis_values, y_axis_values, where='post')\n",
834 | " \n",
835 | " plt.xlim([0., 1.01])\n",
836 | " plt.ylim([0., 1.01])\n",
837 | " plt.ylabel('True Positive Rate')\n",
838 | " plt.xlabel('False Positive Rate')\n",
839 | " \n",
840 | " return None"
841 | ]
842 | },
843 | {
844 | "cell_type": "code",
845 | "execution_count": null,
846 | "metadata": {
847 | "scrolled": true
848 | },
849 | "outputs": [],
850 | "source": [
851 | "# EXECUTE BUT DO NOT MODIFY THIS CELL\n",
852 | "\n",
853 | "plot_roc_curve(y_test, y_probabilities[:, 1], pos_label=1)\n",
854 | "plt.show()"
855 | ]
856 | },
857 | {
858 | "cell_type": "code",
859 | "execution_count": null,
860 | "metadata": {},
861 | "outputs": [],
862 | "source": [
863 | "# EXECUTE BUT DO NOT MODIFY THIS CELL\n",
864 | "\n",
865 | "plot_roc_curve(y_test, y_probabilities[:, 0], pos_label=0)\n",
866 | "plt.show()"
867 | ]
868 | },
869 | {
870 | "cell_type": "markdown",
871 | "metadata": {},
872 | "source": [
873 | "\n",
874 | "
\n",
875 | "
\n",
876 | "
\n",
877 | "
\n",
878 | "
\n",
879 | "
\n",
880 | "
"
881 | ]
882 | },
883 | {
884 | "cell_type": "markdown",
885 | "metadata": {},
886 | "source": [
887 | "### 4.2 [10 pts] Calculating the ROC AUC"
888 | ]
889 | },
890 | {
891 | "cell_type": "markdown",
892 | "metadata": {},
893 | "source": [
894 | "In this exercise, you are asked to modify your previous `plot_roc_curve` function to compute the ROC area under the curve (ROC AUC). To compute the ROC AUC, you can use NumPy's `trapz` function for your convenience (https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.trapz.html).\n",
895 | "\n",
896 | "- As before, you should only use basic Python functions, NumPy, and matplotlib to get full points for this exercise (do not use other external libraries)"
897 | ]
898 | },
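{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a reminder, `np.trapz` integrates a sequence of y-values over the corresponding x-values using the trapezoidal rule; a minimal sketch:\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"# area under the piecewise-linear curve through (0, 0), (0.5, 1), (1, 1)\n",
"print(np.trapz(y=[0., 1., 1.], x=[0., 0.5, 1.]))  # 0.75\n",
"```"
]
},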
899 | {
900 | "cell_type": "code",
901 | "execution_count": null,
902 | "metadata": {},
903 | "outputs": [],
904 | "source": [
905 | "# MODIFY THIS CELL\n",
906 | "\n",
907 | "\n",
908 | "def plot_roc_curve_plus_auc(y_true, y_score, pos_label=1, num_thresholds=100):\n",
909 | "\n",
910 | " # INSERT YOUR CODE FROM THE PREVIOUS EXERCISE HERE\n",
911 | " # BUT MODIFY IT SUCH THAT IT ALSO RETURNS THE\n",
912 | " # ROC Area Under the Curve\n",
913 | " return roc_auc"
914 | ]
915 | },
916 | {
917 | "cell_type": "markdown",
918 | "metadata": {},
919 | "source": [
920 | "1) Calculate the ROC AUC for the positive class label 0"
921 | ]
922 | },
923 | {
924 | "cell_type": "code",
925 | "execution_count": null,
926 | "metadata": {},
927 | "outputs": [],
928 | "source": [
929 | "# DON'T MODIFY BUT EXECUTE THIS CELL TO SHOW YOUR SOLUTION\n",
930 | "\n",
931 | "auc = plot_roc_curve_plus_auc(y_test, y_probabilities[:, 0], pos_label=0)\n",
932 | "print('ROC AUC: %.4f' % auc)"
933 | ]
934 | },
935 | {
936 | "cell_type": "markdown",
937 | "metadata": {},
938 | "source": [
939 | "2) Calculate the ROC AUC for the positive class label 1"
940 | ]
941 | },
942 | {
943 | "cell_type": "code",
944 | "execution_count": null,
945 | "metadata": {},
946 | "outputs": [],
947 | "source": [
948 | "# DON'T MODIFY BUT EXECUTE THIS CELL TO SHOW YOUR SOLUTION\n",
949 | "\n",
950 | "auc = plot_roc_curve_plus_auc(y_test, y_probabilities[:, 1], pos_label=1)\n",
951 | "print('ROC AUC: %.4f' % auc)"
952 | ]
953 | },
954 | {
955 | "cell_type": "markdown",
956 | "metadata": {},
957 | "source": [
958 | "\n",
959 | "
\n",
960 | "
\n",
961 | "
\n",
962 | "
\n",
963 | "
\n",
964 | "
\n",
965 | "
"
966 | ]
967 | },
968 | {
969 | "cell_type": "markdown",
970 | "metadata": {},
971 | "source": [
972 | "## 5. Feature Importance"
973 | ]
974 | },
975 | {
976 | "cell_type": "markdown",
977 | "metadata": {},
978 | "source": [
979 | "### [10 pts] 5.1 Drop-Column Feature Importance"
980 | ]
981 | },
982 | {
983 | "cell_type": "markdown",
984 | "metadata": {},
985 | "source": [
986 | "In this exercise, you are asked to implement the \"drop-column feature importance\" method discussed in class, to measure the importance of individual features present in a dataset.\n",
987 | "\n",
988 | "\n",
989 | "- You will be using regular accuracy measure as performance metric\n",
990 | "- Use 5 fold cross-validation to compute the accuracies\n",
991 | "\n",
992 | "The dataset you will be using for this exercise is the so-called \"Wine\" dataset. \n",
993 | "\n",
994 | "The Wine dataset is another open-source dataset that is available from the UCI machine learning repository (https://archive.ics.uci.edu/ml/datasets/Wine); it consists of 178 wine samples with 13 features describing their different chemical properties.\n",
995 | "\n",
996 | "The 13 different features in the Wine dataset, describing the chemical properties of the 178 wine samples, are listed in the following table that you will see after executing the next code cell.\n"
997 | ]
998 | },
999 | {
1000 | "cell_type": "code",
1001 | "execution_count": null,
1002 | "metadata": {},
1003 | "outputs": [],
1004 | "source": [
1005 | "# EXECUTE BUT DO NOT MODIFY THIS CELL\n",
1006 | "\n",
1007 | "\n",
1008 | "import pandas as pd\n",
1009 | "\n",
1010 | "df_wine = pd.read_csv('data/wine.data',\n",
1011 | " header=None)\n",
1012 | "\n",
1013 | "df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',\n",
1014 | " 'Alcalinity of ash', 'Magnesium', 'Total phenols',\n",
1015 | " 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',\n",
1016 | " 'Color intensity', 'Hue',\n",
1017 | " 'OD280/OD315 of diluted wines', 'Proline']\n",
1018 | "\n",
1019 | "df_wine.head()"
1020 | ]
1021 | },
1022 | {
1023 | "cell_type": "markdown",
1024 | "metadata": {},
1025 | "source": [
1026 | "The samples belong to one of three different classes, 1, 2, and 3, which refer to the three different types of grape grown in the same region in Italy but derived from different wine cultivars, as described in the dataset summary (https://archive. ics.uci.edu/ml/machine-learning-databases/wine/wine.names)."
1027 | ]
1028 | },
1029 | {
1030 | "cell_type": "code",
1031 | "execution_count": null,
1032 | "metadata": {},
1033 | "outputs": [],
1034 | "source": [
1035 | "# EXECUTE BUT DO NOT MODIFY THIS CELL\n",
1036 | "\n",
1037 | "\n",
1038 | "from sklearn.model_selection import train_test_split\n",
1039 | "\n",
1040 | "X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values\n",
1041 | "\n",
1042 | "X_train, X_test, y_train, y_test = \\\n",
1043 | " train_test_split(X, y, test_size=0.3, \n",
1044 | " stratify=y,\n",
1045 | " random_state=0)"
1046 | ]
1047 | },
1048 | {
1049 | "cell_type": "markdown",
1050 | "metadata": {},
1051 | "source": [
1052 | "Now the task is to implement the `feature_importance_dropcolumn` function to compute the feature importance according the Drop-Column method discussed in class. Here, use the `cross_val_score` function from scikit-learn to compute the acccuracy as the average accuracy from 5-fold cross-validation."
1053 | ]
1054 | },
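{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a reminder, `cross_val_score` returns one score per fold, so the average 5-fold accuracy is the mean of the returned array. A minimal sketch of the call, using an unscaled `KNeighborsClassifier` purely for illustration (this is not part of the required solution):\n",
"\n",
"```python\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"knn = KNeighborsClassifier(n_neighbors=5)\n",
"print(cross_val_score(knn, X_train, y_train, cv=5).mean())\n",
"```"
]
},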
1055 | {
1056 | "cell_type": "code",
1057 | "execution_count": null,
1058 | "metadata": {},
1059 | "outputs": [],
1060 | "source": [
1061 | "# MODIFY THIS CELL\n",
1062 | "\n",
1063 | "\n",
1064 | "import numpy as np\n",
1065 | "from sklearn.model_selection import cross_val_score\n",
1066 | "\n",
1067 | "\n",
1068 | "def feature_importance_dropcolumn(estimator, X, y, cv=5):\n",
1069 | "\n",
1070 | " base_accuracy = # YOUR CODE\n",
1071 | " column_indices = np.arange(X.shape[1]).astype(int)\n",
1072 | " drop_accuracies = np.zeros(column_indices.shape[0])\n",
1073 | " \n",
1074 | " for idx in column_indices:\n",
1075 | " mask = np.ones(column_indices.shape[0]).astype(bool)\n",
1076 | " mask[idx] = False\n",
1077 | " drop_accuracy = # YOUR CODE\n",
1078 | " drop_accuracies[idx] = # YOUR CODE\n",
1079 | " \n",
1080 | " return drop_accuracies"
1081 | ]
1082 | },
1083 | {
1084 | "cell_type": "markdown",
1085 | "metadata": {},
1086 | "source": [
1087 | "Next, apply the `feature_importance_dropcolumn` function to the Wine training dataset (`X_train`, `y_train`) on a `KNeighborsClassifier` (you should use the `make_pipeline` function to create an estimator where the features are scaled to z-scores via the `StandardScaler`, since `KNeighborsClassifier` is very sensitive to feature scales).\n",
1088 | "\n",
1089 | "- You should use a `KNeighborsClassifier` with 5 nearest neighbors."
1090 | ]
1091 | },
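{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a reminder of the `make_pipeline` pattern, here is a sketch assembled from the pieces named above (steps execute from left to right):\n",
"\n",
"```python\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"# scale to z-scores first, then classify on the scaled features\n",
"example_pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))\n",
"```"
]
},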
1092 | {
1093 | "cell_type": "code",
1094 | "execution_count": null,
1095 | "metadata": {},
1096 | "outputs": [],
1097 | "source": [
1098 | "# MODIFY THIS CELL\n",
1099 | "\n",
1100 | "from sklearn.pipeline import make_pipeline\n",
1101 | "from sklearn.preprocessing import StandardScaler\n",
1102 | "from sklearn.neighbors import KNeighborsClassifier\n",
1103 | "\n",
1104 | "\n",
1105 | "\n",
1106 | "pipe = make_pipeline(\n",
1107 | " # YOUR CODE\n",
1108 | " # YOUE CODE\n",
1109 | ")\n",
1110 | "\n",
1111 | "\n",
1112 | "feature_importance_dropcolumn(# YOUR CODE)"
1113 | ]
1114 | },
1115 | {
1116 | "cell_type": "markdown",
1117 | "metadata": {},
1118 | "source": [
1119 | "\n",
1120 | "
\n",
1121 | "
\n",
1122 | "
\n",
1123 | "
\n",
1124 | "
\n",
1125 | "
\n",
1126 | "
"
1127 | ]
1128 | },
1129 | {
1130 | "cell_type": "markdown",
1131 | "metadata": {},
1132 | "source": [
1133 | "### [10 pts] 5.2 Random Forest Feature Importance"
1134 | ]
1135 | },
1136 | {
1137 | "cell_type": "markdown",
1138 | "metadata": {},
1139 | "source": [
1140 | "First, use a `RandomForestClassifier` in your `feature_importance_dropcolumn` from the previous exercise, 5.1. Use a random forest \n",
1141 | "\n",
1142 | "- with 200 estimators and \n",
1143 | "- random seed 0. "
1144 | ]
1145 | },
1146 | {
1147 | "cell_type": "code",
1148 | "execution_count": null,
1149 | "metadata": {},
1150 | "outputs": [],
1151 | "source": [
1152 | "# MODIFY THIS CELL\n",
1153 | "\n",
1154 | "\n",
1155 | "from sklearn.ensemble import RandomForestClassifier\n",
1156 | "\n",
1157 | "\n",
1158 | "drop_importances = feature_importance_dropcolumn(\n",
1159 | " # YOUR CODE]\n",
1160 | " X=X_train, \n",
1161 | " y=y_train,\n",
1162 | " cv=5)\n",
1163 | "\n",
1164 | "\n",
1165 | "print('Drop Importance from RF:', drop_importances)"
1166 | ]
1167 | },
1168 | {
1169 | "cell_type": "markdown",
1170 | "metadata": {},
1171 | "source": [
1172 | "Next, compute the ranking among the features as determined by the outputs of the previous code cell, saved under `drop_importances`. You may use `np.argsort` in your computation, to compute the ranking, where the highest number should correspond to the most important feature."
1173 | ]
1174 | },
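{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a reminder, `np.argsort` returns the indices that would sort an array in ascending order; a minimal sketch:\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"vals = np.array([0.2, 0.7, 0.1])\n",
"order = np.argsort(vals)  # [2, 0, 1]: indices from smallest to largest value\n",
"print(order[::-1])        # [1, 0, 2]: indices from largest to smallest value\n",
"```"
]
},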
1175 | {
1176 | "cell_type": "code",
1177 | "execution_count": null,
1178 | "metadata": {},
1179 | "outputs": [],
1180 | "source": [
1181 | "# MODIFY THIS CELL\n",
1182 | "\n",
1183 | "\n",
1184 | "# YOUR CODE"
1185 | ]
1186 | },
1187 | {
1188 | "cell_type": "markdown",
1189 | "metadata": {},
1190 | "source": [
1191 | "Which are the 3 most important features? You can either write the feature indices below that correspond to the most important features or write out the full column names (you can see the column names in the pandas `DataFrame` in 5.1)."
1192 | ]
1193 | },
1194 | {
1195 | "cell_type": "markdown",
1196 | "metadata": {},
1197 | "source": [
1198 | "!!! **EDIT THIS CELL TO ENTER YOUR ANSWER** !!!"
1199 | ]
1200 | },
1201 | {
1202 | "cell_type": "markdown",
1203 | "metadata": {},
1204 | "source": [
1205 | "\n",
1206 | "
\n",
1207 | "
\n",
1208 | "
\n",
1209 | "
\n",
1210 | "
\n",
1211 | "
\n",
1212 | "
"
1213 | ]
1214 | },
1215 | {
1216 | "cell_type": "markdown",
1217 | "metadata": {},
1218 | "source": [
1219 | "Next, obtain the feature importance from the random forest classifier directly and compute the ranking as before."
1220 | ]
1221 | },
1222 | {
1223 | "cell_type": "code",
1224 | "execution_count": null,
1225 | "metadata": {},
1226 | "outputs": [],
1227 | "source": [
1228 | "# MODIFY THIS CELL\n",
1229 | "\n",
1230 | "forest = RandomForestClassifier(n_estimators=100, random_state=0)\n",
1231 | "forest.fit(X_train, y_train)\n",
1232 | "\n",
1233 | "print('Random Forest Feature Importance:\\n', # YOUR CODE)"
1234 | ]
1235 | },
1236 | {
1237 | "cell_type": "code",
1238 | "execution_count": null,
1239 | "metadata": {},
1240 | "outputs": [],
1241 | "source": [
1242 | "# MODIFY THIS CELL\n",
1243 | "\n",
1244 | "\n",
1245 | "# YOUR CODE TO RANK THE FEATURES"
1246 | ]
1247 | },
1248 | {
1249 | "cell_type": "markdown",
1250 | "metadata": {},
1251 | "source": [
1252 | "Which are the 3 most important features now? You can either write the feature indices below that correspond to the most important features or write out the full column names (you can see the column names in the pandas `DataFrame` in 5.1)."
1253 | ]
1254 | },
1255 | {
1256 | "cell_type": "markdown",
1257 | "metadata": {},
1258 | "source": [
1259 | "!!! **EDIT THIS CELL TO ENTER YOUR ANSWER** !!!"
1260 | ]
1261 | },
1262 | {
1263 | "cell_type": "markdown",
1264 | "metadata": {},
1265 | "source": [
1266 | "\n",
1267 | "
\n",
1268 | "
\n",
1269 | "
\n",
1270 | "
\n",
1271 | "
\n",
1272 | "
\n",
1273 | "
"
1274 | ]
1275 | },
1276 | {
1277 | "cell_type": "markdown",
1278 | "metadata": {},
1279 | "source": [
1280 | "Finally, use the `feature_importance_permutation` function from mlxtend (http://rasbt.github.io/mlxtend/user_guide/evaluate/feature_importance_permutation/) to compute the most important features. Inside `the feature_importance_permutation` function,\n",
1281 | "\n",
1282 | "- use a random seed of 0\n",
1283 | "- use 50 permutation rounds\n",
1284 | "\n",
1285 | "then print the importance values."
1286 | ]
1287 | },
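{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the calling pattern looks roughly as follows (a sketch, not the definitive solution; the parameter names follow the documentation linked above, and the data arguments are placeholders, so choose the split according to our class discussion). It assumes a fitted `forest` as in the cell below:\n",
"\n",
"```python\n",
"from mlxtend.evaluate import feature_importance_permutation\n",
"\n",
"imp_vals, imp_all = feature_importance_permutation(\n",
"    predict_method=forest.predict,  # assumes a fitted forest\n",
"    X=X_test,  # placeholder split\n",
"    y=y_test,  # placeholder split\n",
"    metric='accuracy',\n",
"    num_rounds=50,\n",
"    seed=0)\n",
"```"
]
},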
1288 | {
1289 | "cell_type": "code",
1290 | "execution_count": null,
1291 | "metadata": {},
1292 | "outputs": [],
1293 | "source": [
1294 | "# MODIFY THIS CELL\n",
1295 | "\n",
1296 | "\n",
1297 | "from mlxtend.evaluate import feature_importance_permutation\n",
1298 | "\n",
1299 | "\n",
1300 | "forest = RandomForestClassifier(n_estimators=100,\n",
1301 | " random_state=0)\n",
1302 | "\n",
1303 | "forest.fit(X_train, y_train)\n",
1304 | "\n",
1305 | "# YOUR CODE"
1306 | ]
1307 | },
1308 | {
1309 | "cell_type": "code",
1310 | "execution_count": null,
1311 | "metadata": {},
1312 | "outputs": [],
1313 | "source": [
1314 | "# MODIFY THIS CELL\n",
1315 | "\n",
1316 | "\n",
1317 | "# YOUR CODE TO RANK THE FEATURES"
1318 | ]
1319 | },
1320 | {
1321 | "cell_type": "markdown",
1322 | "metadata": {},
1323 | "source": [
1324 | "Which are the 3 most important features now? You can either write the feature indices below that correspond to the most important features or write out the full column names (you can see the column names in the pandas `DataFrame` in 5.1)."
1325 | ]
1326 | },
1327 | {
1328 | "cell_type": "markdown",
1329 | "metadata": {},
1330 | "source": [
1331 | "!!! **EDIT THIS CELL TO ENTER YOUR ANSWER** !!!"
1332 | ]
1333 | },
1334 | {
1335 | "cell_type": "markdown",
1336 | "metadata": {},
1337 | "source": [
1338 | "\n",
1339 | "
\n",
1340 | "
\n",
1341 | "
\n",
1342 | "
\n",
1343 | "
\n",
1344 | "
\n",
1345 | "
"
1346 | ]
1347 | },
1348 | {
1349 | "cell_type": "markdown",
1350 | "metadata": {},
1351 | "source": [
1352 | "### [10 pts] 5.3 Creating your Own Feature Selection Transformer Class"
1353 | ]
1354 | },
1355 | {
1356 | "cell_type": "markdown",
1357 | "metadata": {},
1358 | "source": [
1359 | "This section will help you understand how you can implement your own feature selection method in a way that is compatible with scikit-learn.\n",
1360 | "\n",
1361 | "The following code (`ColumnSelector`) implements a feature selector that works similarly to the feature selctors implemented in scikit-learn. However, this `ColumnSelector` does not do anything automatically."
1362 | ]
1363 | },
1364 | {
1365 | "cell_type": "code",
1366 | "execution_count": null,
1367 | "metadata": {},
1368 | "outputs": [],
1369 | "source": [
1370 | "# EXECUTE BUT DO NOT EDIT THIS CELL\n",
1371 | "\n",
1372 | "from sklearn.base import BaseEstimator\n",
1373 | "import numpy as np\n",
1374 | "\n",
1375 | "\n",
1376 | "class ColumnSelector(BaseEstimator):\n",
1377 | "\n",
1378 | " def __init__(self, cols=None):\n",
1379 | " self.cols = cols\n",
1380 | "\n",
1381 | " def fit_transform(self, X, y=None):\n",
1382 | " return self.transform(X=X, y=y)\n",
1383 | "\n",
1384 | " def transform(self, X, y=None):\n",
1385 | " feature_subset = X[:, self.cols]\n",
1386 | " if len(feature_subset.shape) == 1:\n",
1387 | " feature_subset = feature_subset[:, np.newaxis]\n",
1388 | " return feature_subset\n",
1389 | "\n",
1390 | " def fit(self, X, y=None):\n",
1391 | " return self"
1392 | ]
1393 | },
1394 | {
1395 | "cell_type": "markdown",
1396 | "metadata": {},
1397 | "source": [
1398 | "As the name implies, we `ColumnSelector` selects specific columns that we as the user need to specify. For example, consider the Wine dataset from earlier:"
1399 | ]
1400 | },
1401 | {
1402 | "cell_type": "code",
1403 | "execution_count": null,
1404 | "metadata": {},
1405 | "outputs": [],
1406 | "source": [
1407 | "# EXECUTE BUT DO NOT EDIT THIS CELL\n",
1408 | "\n",
1409 | "import pandas as pd\n",
1410 | "\n",
1411 | "df_wine = pd.read_csv('data/wine.data',\n",
1412 | " header=None)\n",
1413 | "\n",
1414 | "df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',\n",
1415 | " 'Alcalinity of ash', 'Magnesium', 'Total phenols',\n",
1416 | " 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',\n",
1417 | " 'Color intensity', 'Hue',\n",
1418 | " 'OD280/OD315 of diluted wines', 'Proline']\n",
1419 | "\n",
1420 | "df_wine.head()"
1421 | ]
1422 | },
1423 | {
1424 | "cell_type": "code",
1425 | "execution_count": null,
1426 | "metadata": {},
1427 | "outputs": [],
1428 | "source": [
1429 | "# EXECUTE BUT DO NOT EDIT THIS CELL\n",
1430 | "\n",
1431 | "from sklearn.model_selection import train_test_split\n",
1432 | "\n",
1433 | "X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values\n",
1434 | "\n",
1435 | "X_train, X_test, y_train, y_test = \\\n",
1436 | " train_test_split(X, y, test_size=0.3, \n",
1437 | " stratify=y,\n",
1438 | " random_state=0)"
1439 | ]
1440 | },
1441 | {
1442 | "cell_type": "markdown",
1443 | "metadata": {},
1444 | "source": [
1445 | "Via the `ColumnSelector`, we can select select specific columns from the dataset. E.g., to select the 1st, 6th, and 9th column, and 12th column, we can initialize the `ColumnSelector` with the argument `cols=[0, 5, 8, 11]` and use the transform method as shown below:"
1446 | ]
1447 | },
1448 | {
1449 | "cell_type": "code",
1450 | "execution_count": null,
1451 | "metadata": {},
1452 | "outputs": [],
1453 | "source": [
1454 | "# EXECUTE BUT DO NOT EDIT THIS CELL\n",
1455 | "\n",
1456 | "col_sele = ColumnSelector(cols=[0, 5, 8, 11])\n",
1457 | "reduced_subset = col_sele.transform(X_train)\n",
1458 | "\n",
1459 | "print('Original feature set size:', X_train.shape)\n",
1460 | "print('Selected feature set size:', reduced_subset.shape)"
1461 | ]
1462 | },
1463 | {
1464 | "cell_type": "markdown",
1465 | "metadata": {},
1466 | "source": [
1467 | "Your task now is to use the `feature_importances_` attribute from a fitted random forest model inside a custom feature selector. Using this feature selector, you should be able to select features as follows:\n",
1468 | "\n",
1469 | "\n",
1470 | "```python\n",
1471 | "\n",
1472 | "forest = RandomForestClassifier(n_estimators=100, random_state=123)\n",
1473 | "\n",
1474 | "selector = ImportanceSelector(num_features=3, random_forest_estimator=forest)\n",
1475 | "selector.fit(X_train, y_train)\n",
1476 | "reduced_train_features = selector.transform(X_train, y_train)\n",
1477 | "```\n",
1478 | "\n",
1479 | "- If `num_features=3` as shown above, this means that we are interested to select the top 3 most important features from a dataset based on the random forest feature importance values.\n",
1480 | "\n",
1481 | "\n",
1482 | "- Actually, while it might be more interesting to implement a feature selctor based on the column-drop performance (which would then be somewhat related to sequential feature selection), we use the feature importance values from a `RandomForest`'s `feature_importances_` attribute for simplicity here, to allow you to implement this method in case your `feature_importance_dropcolumn` function does not work correctly."
1483 | ]
1484 | },
1485 | {
1486 | "cell_type": "code",
1487 | "execution_count": null,
1488 | "metadata": {},
1489 | "outputs": [],
1490 | "source": [
1491 | "# MODIFY THIS CELL\n",
1492 | "\n",
1493 | "from sklearn.base import BaseEstimator\n",
1494 | "import numpy as np\n",
1495 | "\n",
1496 | "\n",
1497 | "class ImportanceSelector(BaseEstimator):\n",
1498 | "\n",
1499 | " def __init__(self, num_features, random_forest_estimator):\n",
1500 | " self.num_features = num_features\n",
1501 | " self.forest = random_forest_estimator\n",
1502 | "\n",
1503 | " def transform(self, X, y=None):\n",
1504 | " \n",
1505 | " # Feature by increasing feature importance:\n",
1506 | " features_by_importance = # YOUR CODE\n",
1507 | " top_k_feature_indices = # YOUR CODE\n",
1508 | " \n",
1509 | " feature_subset = X[:, top_k_feature_indices]\n",
1510 | " if len(feature_subset.shape) == 1:\n",
1511 | " feature_subset = feature_subset[:, np.newaxis]\n",
1512 | " return feature_subset\n",
1513 | "\n",
1514 | " def fit(self, X, y=None):\n",
1515 | " self.forest.fit(X, y)\n",
1516 | " return self"
1517 | ]
1518 | },
1519 | {
1520 | "cell_type": "markdown",
1521 | "metadata": {},
1522 | "source": [
1523 | "Now, use the `ImportanceSelector` to select the 3 most important features in the dataset:"
1524 | ]
1525 | },
1526 | {
1527 | "cell_type": "code",
1528 | "execution_count": null,
1529 | "metadata": {},
1530 | "outputs": [],
1531 | "source": [
1532 | "# MODIFY THIS CELL\n",
1533 | "\n",
1534 | "from sklearn.ensemble import RandomForestClassifier\n",
1535 | "\n",
1536 | "\n",
1537 | "forest = RandomForestClassifier(n_estimators=100, random_state=123)\n",
1538 | "\n",
1539 | "selector = # YOUR CODE\n",
1540 | "# YOUR CODE\n",
1541 | "reduced_train_features = # YOUR CODE\n",
1542 | "\n",
1543 | "print('Original feature set size:', X_train.shape)\n",
1544 | "print('Selected feature set size:', reduced_train_features.shape)\n",
1545 | "print('First 5 rows:\\n', reduced_train_features[:5])"
1546 | ]
1547 | },
1548 | {
1549 | "cell_type": "markdown",
1550 | "metadata": {},
1551 | "source": [
1552 | "\n",
1553 | "
\n",
1554 | "
\n",
1555 | "
\n",
1556 | "
\n",
1557 | "
\n",
1558 | "
\n",
1559 | "
"
1560 | ]
1561 | },
1562 | {
1563 | "cell_type": "markdown",
1564 | "metadata": {},
1565 | "source": [
1566 | "## (5 pts) Bonus Exercise: Evaluating a KNN Classifier on Different Feature Subsets"
1567 | ]
1568 | },
1569 | {
1570 | "cell_type": "markdown",
1571 | "metadata": {},
1572 | "source": [
1573 | "In this *Bonus Exercise*, your task is to use a scikit-learn pipeline to fit a KNN classifier based on different 2-feature combinations and different values of *k* (number of neighbors) via grid search. More specifically,\n",
1574 | "\n",
1575 | "1. Create a scikit-learn pipeline that consists of a `StandardScaler`, a `ColumnSelector`, and a `KNeighborsClassifeir` (think about the right way to order these elements in the pipeline);\n",
1576 | "2. Using this pipeline, find the best value for `k` in the KNN classifier as well as the best feature combination (restricted to 2-feature subsets for simplicity) using `GridSearchCV`;\n",
1577 | "3. Fit the best model determined via grid search on the whole training set and evaluate the performance on the test set."
1578 | ]
1579 | },
1580 | {
1581 | "cell_type": "code",
1582 | "execution_count": null,
1583 | "metadata": {},
1584 | "outputs": [],
1585 | "source": [
1586 | "# EXECUTE BUT DO NOT EDIT\n",
1587 | "\n",
1588 | "\n",
1589 | "import pandas as pd\n",
1590 | "\n",
1591 | "\n",
1592 | "df_wine = pd.read_csv('data/wine.data',\n",
1593 | " header=None)\n",
1594 | "\n",
1595 | "df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',\n",
1596 | " 'Alcalinity of ash', 'Magnesium', 'Total phenols',\n",
1597 | " 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',\n",
1598 | " 'Color intensity', 'Hue',\n",
1599 | " 'OD280/OD315 of diluted wines', 'Proline']\n",
1600 | "\n",
1601 | "df_wine.head()"
1602 | ]
1603 | },
1604 | {
1605 | "cell_type": "code",
1606 | "execution_count": null,
1607 | "metadata": {},
1608 | "outputs": [],
1609 | "source": [
1610 | "# EXECUTE BUT DO NOT EDIT\n",
1611 | "\n",
1612 | "from sklearn.model_selection import train_test_split\n",
1613 | "\n",
1614 | "\n",
1615 | "X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values\n",
1616 | "\n",
1617 | "X_train, X_test, y_train, y_test = \\\n",
1618 | " train_test_split(X, y, test_size=0.3, \n",
1619 | " stratify=y,\n",
1620 | " random_state=0)"
1621 | ]
1622 | },
1623 | {
1624 | "cell_type": "code",
1625 | "execution_count": null,
1626 | "metadata": {},
1627 | "outputs": [],
1628 | "source": [
1629 | "# EXECUTE BUT DO NOT EDIT THIS CELL\n",
1630 | "\n",
1631 | "from sklearn.base import BaseEstimator\n",
1632 | "import numpy as np\n",
1633 | "\n",
1634 | "\n",
1635 | "class ColumnSelector(BaseEstimator):\n",
1636 | "\n",
1637 | " def __init__(self, cols=None):\n",
1638 | " self.cols = cols\n",
1639 | "\n",
1640 | " def fit_transform(self, X, y=None):\n",
1641 | " return self.transform(X=X, y=y)\n",
1642 | "\n",
1643 | " def transform(self, X, y=None):\n",
1644 | " feature_subset = X[:, self.cols]\n",
1645 | " if len(feature_subset.shape) == 1:\n",
1646 | " feature_subset = feature_subset[:, np.newaxis]\n",
1647 | " return feature_subset\n",
1648 | "\n",
1649 | " def fit(self, X, y=None):\n",
1650 | " return self"
1651 | ]
1652 | },
1653 | {
1654 | "cell_type": "markdown",
1655 | "metadata": {},
1656 | "source": [
1657 | "Modify the following code cell to create a list of all possible 2-feature combinations:"
1658 | ]
1659 | },
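{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a reminder, `itertools.combinations(iterable, r)` yields all length-`r` combinations without repetition, in sorted order; a minimal sketch:\n",
"\n",
"```python\n",
"import itertools\n",
"\n",
"print(list(itertools.combinations(range(4), 2)))\n",
"# [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]\n",
"```"
]
},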
1660 | {
1661 | "cell_type": "code",
1662 | "execution_count": null,
1663 | "metadata": {},
1664 | "outputs": [],
1665 | "source": [
1666 | "# MODIFY THIS CELL\n",
1667 | "\n",
1668 | "import itertools\n",
1669 | "\n",
1670 | "\n",
1671 | "all_combin_2 = list(itertools.combinations( # YOUR CODE)\n",
1672 | "\n",
1673 | "\n",
1674 | "print('Number of all possible 2-feature combinations:', len(all_combin_2))"
1675 | ]
1676 | },
1677 | {
1678 | "cell_type": "markdown",
1679 | "metadata": {},
1680 | "source": [
1681 | "Modify the following code cell to create a `pipeline` (as explained at the beginning of this section), and use the given `param_grid` to fit the `GridSearchCV` to obtain the best parameters settings and a classifier fit to `X_train` and `y_train` based on these best hyperparameter values.\n",
1682 | "\n",
1683 | "(Note that the code may take 10-30 seconds to execute.)"
1684 | ]
1685 | },
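{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that the keys in the given `param_grid` rely on `make_pipeline`'s naming convention: each step is named after its lowercased class name, and `__` separates the step name from the parameter name. A minimal sketch:\n",
"\n",
"```python\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"demo_pipe = make_pipeline(StandardScaler(), KNeighborsClassifier())\n",
"print(demo_pipe.named_steps.keys())\n",
"# dict_keys(['standardscaler', 'kneighborsclassifier'])\n",
"```"
]
},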
1686 | {
1687 | "cell_type": "code",
1688 | "execution_count": null,
1689 | "metadata": {},
1690 | "outputs": [],
1691 | "source": [
1692 | "# MODIFY THIS CELL\n",
1693 | "\n",
1694 | "from sklearn.pipeline import make_pipeline\n",
1695 | "from sklearn.preprocessing import StandardScaler\n",
1696 | "from sklearn.neighbors import KNeighborsClassifier\n",
1697 | "from sklearn.model_selection import GridSearchCV\n",
1698 | "\n",
1699 | "\n",
1700 | "pipe = make_pipeline(\n",
1701 | "# YOUR CODE\n",
1702 | "# YOUR CODE\n",
1703 | "# YOUR CODE\n",
1704 | ")\n",
1705 | "\n",
1706 | "\n",
1707 | "param_grid = {'kneighborsclassifier__n_neighbors': list(range(1, 8)),\n",
1708 | " 'columnselector__cols': all_combin_2}\n",
1709 | "\n",
1710 | "gsearch = GridSearchCV(pipe,\n",
1711 | " param_grid=param_grid,\n",
1712 | " refit=True,\n",
1713 | " iid=False,\n",
1714 | " cv=5)\n",
1715 | "\n",
1716 | "gsearch.fit(X_train, y_train)"
1717 | ]
1718 | },
1719 | {
1720 | "cell_type": "code",
1721 | "execution_count": null,
1722 | "metadata": {},
1723 | "outputs": [],
1724 | "source": [
1725 | "# EXECUTE BUT DO NOT EDIT\n",
1726 | "\n",
1727 | "\n",
1728 | "print(gsearch.best_params_)"
1729 | ]
1730 | },
1731 | {
1732 | "cell_type": "markdown",
1733 | "metadata": {},
1734 | "source": [
1735 | "Based on the best combination of a 2-feature subset and the number of `n_neigbors` your model should be fit the the training dataset now. Use the fitted model and compute its classification accuracy on the test set (`X_test`, `y_test`)."
1736 | ]
1737 | },
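{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since `refit=True` was set above, `gsearch.best_estimator_` is already fit to the full training set; one possible way to obtain the test accuracy (a sketch):\n",
"\n",
"```python\n",
"# score() reports the mean classification accuracy for classifiers\n",
"print(gsearch.best_estimator_.score(X_test, y_test))\n",
"```"
]
},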
1738 | {
1739 | "cell_type": "code",
1740 | "execution_count": null,
1741 | "metadata": {},
1742 | "outputs": [],
1743 | "source": [
1744 | "# MODIFY THIS CELL\n",
1745 | "\n",
1746 | "# YOUR CODE TO COMPUTE THE TEST ACCURACY"
1747 | ]
1748 | }
1749 | ],
1750 | "metadata": {
1751 | "kernelspec": {
1752 | "display_name": "Python 3",
1753 | "language": "python",
1754 | "name": "python3"
1755 | },
1756 | "language_info": {
1757 | "codemirror_mode": {
1758 | "name": "ipython",
1759 | "version": 3
1760 | },
1761 | "file_extension": ".py",
1762 | "mimetype": "text/x-python",
1763 | "name": "python",
1764 | "nbconvert_exporter": "python",
1765 | "pygments_lexer": "ipython3",
1766 | "version": "3.6.5"
1767 | }
1768 | },
1769 | "nbformat": 4,
1770 | "nbformat_minor": 2
1771 | }
1772 |
--------------------------------------------------------------------------------