├── LICENSE
├── README.md
├── assets
│   ├── tut_01_a_vs_b.png
│   ├── tut_01_bayes_rule.png
│   ├── tut_01_cor_1.jpg
│   ├── tut_01_cor_2.jpg
│   ├── tut_01_cor_3.jpg
│   ├── tut_01_intersection.jpg
│   ├── tut_01_param_est.PNG
│   ├── tut_01_transformation.jpg
│   ├── tut_02_benjamin.jpg
│   ├── tut_02_confusion_mat.jpg
│   ├── tut_02_error_types.jpg
│   ├── tut_02_p_val.png
│   ├── tut_02_pop.png
│   ├── tut_02_quote.jpg
│   ├── tut_02_var_param.png
│   ├── tut_02_z_stat.jpg
│   ├── tut_03_gram.gif
│   ├── tut_03_linear_dep.jpg
│   ├── tut_03_norm.jpg
│   ├── tut_03_svd.gif
│   ├── tut_03_svd_1.jpg
│   ├── tut_03_svd_2.jpg
│   ├── tut_03_svd_3.jpg
│   ├── tut_03_svd_vis.png
│   ├── tut_04_dim_red.jpg
│   ├── tut_04_feature_select_1.jpg
│   ├── tut_04_feature_select_2.jpg
│   ├── tut_04_feature_select_3.jpg
│   ├── tut_04_filter_methods.jpg
│   ├── tut_04_outlier.jpeg
│   ├── tut_04_pca_1.jpg
│   ├── tut_04_sbs.jpg
│   ├── tut_04_sfs.jpg
│   ├── tut_04_sfs_example.jpg
│   ├── tut_04_sfs_sbs.jpg
│   ├── tut_04_wrapper_methods_1.jpg
│   ├── tut_05_conf_mat.jpg
│   ├── tut_05_holdout.jpg
│   ├── tut_05_kcv.jpeg
│   ├── tut_05_kfold.jpeg
│   ├── tut_05_overfit.png
│   ├── tut_05_overfitting.png
│   ├── tut_05_test.jpg
│   ├── tut_05_tp_tf.jpg
│   ├── tut_05_train.jpg
│   ├── tut_05_validation.jpg
│   ├── tut_06_convex_1.jpg
│   ├── tut_06_convex_2.jpg
│   ├── tut_06_convex_concave.gif
│   ├── tut_06_deriv.jpg
│   ├── tut_06_deriv_2.jpg
│   ├── tut_06_gd.gif
│   ├── tut_06_gradient.gif
│   ├── tut_06_gradient_2.jpg
│   ├── tut_06_lagrange.jpg
│   ├── tut_06_lls_anim.gif
│   ├── tut_06_lr.png
│   ├── tut_06_mb_sgd.png
│   ├── tut_06_minimum.jpg
│   ├── tut_06_multimodal.jpg
│   ├── tut_06_saddle.jpg
│   ├── tut_06_sgd.png
│   ├── tut_06_step.png
│   ├── tut_06_unimodal.jpg
│   ├── tut_07_akinator.jpg
│   ├── tut_07_boundries.jpg
│   ├── tut_07_entropy.jpg
│   ├── tut_07_example_1.jpg
│   ├── tut_07_example_2.jpg
│   ├── tut_07_gini.jpg
│   ├── tut_07_multitree.jpg
│   ├── tut_07_overfitting.jpg
│   ├── tut_07_random_forest.png
│   ├── tut_07_titan_tree.jpg
│   ├── tut_07_vis.jpg
│   ├── tut_08_adaline.gif
│   ├── tut_08_alpha.jpg
│   ├── tut_08_alpha_2.jpg
│   ├── tut_08_bias_var.jpg
│   ├── tut_08_bias_var_2.jpg
│   ├── tut_08_cos.jpg
│   ├── tut_08_intro.jpg
│   ├── tut_08_lass_ridge.jpg
│   ├── tut_08_lr_1.jpg
│   ├── tut_08_lr_2.jpg
│   ├── tut_08_lr_3.jpg
│   ├── tut_08_perc_vs_ada.png
│   ├── tut_08_poly_1.jpg
│   ├── tut_08_reg.jpg
│   ├── tut_08_res_1.jpg
│   ├── tut_08_res_2.jpg
│   ├── tut_09_bayes.jpg
│   ├── tut_09_lda.jpg
│   ├── tut_09_lin_models.jpg
│   ├── tut_09_lms_perc.jpg
│   ├── tut_09_one_vs_all_1.png
│   ├── tut_09_one_vs_all_2.png
│   ├── tut_09_one_vs_all_3.png
│   ├── tut_09_one_vs_all_4.png
│   ├── tut_09_perceptron.jpg
│   ├── tut_09_qda.jpg
│   ├── tut_10_em_step.jpg
│   ├── tut_10_gmm.jpg
│   ├── tut_10_gmm_2.jpg
│   ├── tut_10_gmm_anim.gif
│   ├── tut_10_kmeans.jpg
│   ├── tut_10_kmeans_anim.gif
│   ├── tut_11_adaboost.gif
│   ├── tut_11_adaboost_1.gif
│   ├── tut_11_bagging_1.jpg
│   ├── tut_11_bagging_2.jpg
│   ├── tut_11_bagging_pasting.png
│   ├── tut_11_boosting_1.jpg
│   ├── tut_11_boosting_2.jpg
│   ├── tut_11_exp_loss.jpg
│   ├── tut_11_geom.jpg
│   ├── tut_11_hinge.jpg
│   ├── tut_11_large_margin.png
│   ├── tut_11_margin.jpg
│   ├── tut_11_rbf_kernel.png
│   ├── tut_11_slack.jpg
│   ├── tut_11_tuning_c.gif
│   ├── tut_11_tuning_c_linear.gif
│   ├── tut_12_backward.jpg
│   ├── tut_12_boolean.jpg
│   ├── tut_12_ex_1.png
│   ├── tut_12_ex_2.png
│   ├── tut_12_forward.jpg
│   ├── tut_12_intro.jpg
│   ├── tut_12_mlp.jpg
│   ├── tut_12_mod_app_1.jpg
│   ├── tut_12_mod_app_2.jpg
│   ├── tut_12_optim_general.jpg
│   ├── tut_13_convex.jpg
│   ├── tut_13_halfspaces.jpg
│   ├── tut_13_intervals.jpg
│   ├── tut_13_linear.jpg
│   ├── tut_13_overfitting.jpg
│   ├── tut_13_pacman.png
│   ├── tut_13_rectangles.jpg
│   ├── tut_13_rectangles_2.jpg
│   └── tut_13_threshold.jpg
├── cs236756_tutorial_01_probability_mle.ipynb
├── cs236756_tutorial_02_statistics.ipynb
├── cs236756_tutorial_03_linear_algebra.ipynb
├── cs236756_tutorial_04_pca_feature_selection.ipynb
├── cs236756_tutorial_05_evaluation_validation.ipynb
├── cs236756_tutorial_06_decision_trees.ipynb
├── cs236756_tutorial_07_optimization.ipynb
├── cs236756_tutorial_08_linear_regression.ipynb
├── cs236756_tutorial_09_linear_models.ipynb
├── cs236756_tutorial_10_expectation_maximization.ipynb
├── cs236756_tutorial_11_boosting_bagging.ipynb
├── cs236756_tutorial_12_svm.ipynb
├── cs236756_tutorial_13_deep_learning_intro_backprop.ipynb
├── cs236756_tutorial_14_pac_vc_dimension.ipynb
├── datasets
│   ├── cancer_dataset.csv
│   ├── heights_dataset.csv
│   └── titanic_dataset.csv
├── environment.yml
└── pdf
    ├── cs236756_tutorial_01_probability_mle.pdf
    ├── cs236756_tutorial_02_statistics.pdf
    ├── cs236756_tutorial_03_linear_algebra.pdf
    ├── cs236756_tutorial_04_pca_feature_selection.pdf
    ├── cs236756_tutorial_05_evaluation_validation.pdf
    ├── cs236756_tutorial_06_decision_trees.pdf
    ├── cs236756_tutorial_07_optimization.pdf
    ├── cs236756_tutorial_08_linear_regression.pdf
    ├── cs236756_tutorial_09_linear_models.pdf
    ├── cs236756_tutorial_10_expectation_maximization.pdf
    ├── cs236756_tutorial_11_boosting_bagging.pdf
    ├── cs236756_tutorial_12_svm.pdf
    ├── cs236756_tutorial_13_deep_learning_intro_backprop.pdf
    └── cs236756_tutorial_14_pac_vc_dimension.pdf

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Tal Daniel

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# cs236756-intro-to-ml

Technion CS 236756 - Introduction to Machine Learning

Tal Daniel

Jupyter Notebook tutorials for the Technion's CS 236756 course "Introduction to Machine Learning"

Open In Colab | Open In NBViewer | Open In Binder

* For the old tutorials, see the `spring19` branch.

- [cs236756-intro-to-ml](#cs236756-intro-to-ml)
  * [Running The Notebooks](#running-the-notebooks)
    + [Running Online](#running-online)
    + [Running Locally](#running-locally)
  * [Agenda](#agenda)
  * [Installation Instructions](#installation-instructions)
    + [Libraries to Install](#libraries-to-install)

## Running The Notebooks
You can view the tutorials online, or download the repository and run them locally.

### Running Online

|Service | Usage |
|-------------|---------|
|Jupyter Nbviewer| Render and view the notebooks (cannot edit) |
|Binder| Render, view and edit the notebooks (limited time) |
|Google Colab| Render, view, edit and save the notebooks to Google Drive (limited time) |

Jupyter Nbviewer:

[![nbviewer](https://raw.githubusercontent.com/taldatech/ee046211-deep-learning/main/assets/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/taldatech/cs236756-intro-to-ml/tree/master/)

Press the "Open in Colab" button below to use Google Colab:

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/taldatech/cs236756-intro-to-ml)

Or press the "launch binder" button below to launch in Binder:

[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/taldatech/cs236756-intro-to-ml/master)

Note: creating the Binder instance takes about 5-10 minutes, so be patient.

### Running Locally
Press "Download ZIP" under the green `Clone or download` button, or use `git` to clone the repository with the following command: `git clone https://github.com/taldatech/cs236756-intro-to-ml.git` (in cmd/PowerShell on Windows or in the terminal on Linux/Mac).

Open the folder in Jupyter Notebook (it is recommended to use Anaconda). Installation instructions can be found at the bottom of this README.
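For example, a full local run might look like the following sketch (assuming Anaconda is already installed; the environment commands are explained under Installation Instructions below):

```bash
# clone the repository and enter it
git clone https://github.com/taldatech/cs236756-intro-to-ml.git
cd cs236756-intro-to-ml

# create and activate the course environment (see Installation Instructions)
conda create --name ml_course
conda activate ml_course

# launch Jupyter and open any cs236756_tutorial_*.ipynb
jupyter notebook
```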

## Agenda

|File | Topics Covered |
|----------------|---------|
|`cs236756_tutorial_01_probability_mle.ipynb\pdf`| Probability basics, random variables, Bayes rule, histograms, correlation, parameter estimation, Maximum Likelihood Estimation (MLE)|
|`cs236756_tutorial_02_statistics.ipynb\pdf`|Statistics definitions, hypothesis testing steps, z-statistic, Central Limit Theorem (CLT), Area Under the Curve (AUC), error types, confusion matrix|
|`cs236756_tutorial_03_linear_algebra.ipynb\pdf`|Linear Algebra basics (vectors, inner/outer product spaces, norms, linear dependency, matrix operations, matrix rank, range and nullspace), least-squares solution, eigenvalues and eigenvectors, Singular Value Decomposition (SVD)|
|`cs236756_tutorial_04_pca_feature_selection.ipynb\pdf`|Dimensionality Reduction, Outliers, PCA, SVD, Breast Cancer dataset, Feature Selection, Filter methods, Wrapper methods, RFE (scikit-learn)|
|`cs236756_tutorial_05_evaluation_validation.ipynb\pdf`|Classifier Evaluation and Validation, metrics, accuracy, precision, recall, FN/TP rate, Confusion Matrix, F1 score, K-Fold Cross-Validation, train-validation-test split, holdout method, stratification, ROC curve|
|`cs236756_tutorial_06_decision_trees.ipynb\pdf`|Decision Trees, The CART algorithm, Pruning, Regularization, Impurity Metrics, Entropy, Gini, Information Gain (IG), SplitInformation, Gain Ratio (GR), The Titanic Dataset, Tree Visualization with Scikit-Learn, Random Forest, Mutual Information (MI)|
|`cs236756_tutorial_07_optimization.ipynb\pdf`|Optimization in ML, Gradient Descent, Batch Gradient Descent, Mini-Batch (MB) Gradient Descent, Stochastic Gradient Descent (SGD), Convexity, Uni/Multi-modal problems, Lagrangian and Lagrange Multipliers, Constrained Optimization|
|`cs236756_tutorial_08_linear_regression.ipynb\pdf`|Classification vs. Regression, NLL (Negative Log-Likelihood), MLE connection to MSE, Residual Analysis, Basis Functions Expansion, Feature Extraction, Linear and Polynomial Regression, Bias-Variance Tradeoff, Irreducible Error, Regularization (L1 + L2), Ridge and LASSO Regression|
|`cs236756_tutorial_09_linear_models.ipynb\pdf`|Discriminative vs. Generative Models, Linear Models, Perceptron, Least Mean Square (LMS) - Adaptive Linear Neuron (ADALINE), MLE with Bernoulli, Logistic Regression, Softmax, Maximum A Posteriori (MAP), Quadratic Discriminant Analysis (QDA), Naive Bayes, Linear Discriminant Analysis (LDA), One-vs-All Classification|
|`cs236756_tutorial_10_expectation_maximization.ipynb\pdf`|Soft Clustering, Hard Clustering, K-Means, Incomplete/Complete Likelihood, Expectation Maximization (EM) Algorithm, Gaussian Mixture Model (GMM), Bernoulli Mixture Model (BMM), Dataset Generation with Scikit-Learn|
|`cs236756_tutorial_11_boosting_bagging.ipynb\pdf`|Ensemble Learning, Voting Classifiers, Hard Voting, Soft Voting, Random Forests, Bagging, Pasting, Bootstrap, Boosting, AdaBoost|
|`cs236756_tutorial_12_svm.ipynb\pdf`| Support Vector Machine (SVM), Linear SVM, Hard/Soft SVM, The Primal Problem, The Dual Problem, The Kernel Trick, Kernel SVM, RBF Kernel, Polynomial Kernel, The Mercer Condition|
|`cs236756_tutorial_13_deep_learning_intro_backprop.ipynb\pdf`| Deep Learning Introduction, The XOR Problem, Multi-Layer Perceptron (MLP), Backpropagation, Activation Functions: Sigmoid, Tanh, ReLU, Forward Pass, Backward Pass, Boston Housing Dataset |
|`cs236756_tutorial_14_pac_vc_dimension.ipynb\pdf`| Probably Approximately Correct (PAC) Learning, Risk, Empirical Risk, Empirical Risk Minimization (ERM), Inductive Bias, VC Dimension, Shattering, Dichotomy, No Free Lunch Theorem |

## Installation Instructions
1. Get Anaconda with Python 3 and follow the instructions for your OS (Windows/Mac/Linux) at: https://www.anaconda.com/distribution/
2. Create a new environment for the course (full guide at https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands): on Windows, open `Anaconda Prompt` from the Start menu; on Mac/Linux, open the terminal. Then run `conda create --name ml_course`
3. To activate the environment, open the terminal (or `Anaconda Prompt` on Windows) and run `conda activate ml_course`
4. Install the required libraries according to the table below (to find a specific library and its corresponding command, you can also search https://anaconda.org/); a batched alternative is sketched after the table

### Libraries to Install
|Library | Command to Run |
|----------------|---------|
|`Jupyter Notebook`| `conda install -c conda-forge notebook`|
|`numpy`| `conda install -c conda-forge numpy`|
|`matplotlib`| `conda install -c conda-forge matplotlib`|
|`pandas`| `conda install -c conda-forge pandas`|
|`scipy`| `conda install -c anaconda scipy`|
|`scikit-learn`| `conda install -c conda-forge scikit-learn`|

5. To open the notebooks, run `jupyter notebook` in the terminal (or `Anaconda Prompt` on Windows) while the `ml_course` environment is activated.
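The per-library commands in the table can also be batched. A sketch, assuming the `ml_course` environment is active; the `environment.yml` route is an assumption that the repository's file (see the tree above) pins the same packages:

```bash
# install everything from the table in two commands
conda install -c conda-forge notebook numpy matplotlib pandas scikit-learn
conda install -c anaconda scipy

# alternative (assumption: environment.yml lists the same dependencies)
conda env create -f environment.yml
conda activate ml_course
```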
--------------------------------------------------------------------------------

/assets/ (binary figures):
--------------------------------------------------------------------------------
The tut_01_* through tut_13_* image files listed under `assets/` in the tree above are binaries; each one is served raw at
https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/assets/<filename>
--------------------------------------------------------------------------------
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/assets/tut_13_threshold.jpg -------------------------------------------------------------------------------- /cs236756_tutorial_03_linear_algebra.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# CS 236756 - Technion - Intro to Machine Learning\n", 12 | "---\n", 13 | "#### Tal Daniel\n", 14 | "\n", 15 | "## Tutorial 03 - Linear Algebra & SVD\n", 16 | "---\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "slideshow": { 23 | "slide_type": "subslide" 24 | } 25 | }, 26 | "source": [ 27 | "### Agenda\n", 28 | "---\n", 29 | "* [Linear Algebra Refresher](#-Linear-Algebra-Refresher)\n", 30 | "* [Eigen Values and Vectors Decomposition](#-Eigenvalues-and-Eigenvectors)\n", 31 | "* [Singular Value Decomposition (SVD)](#-Singular-Value-Decomposition-(SVD))\n", 32 | "* [Recommended Videos](#-Recommended-Videos)\n", 33 | "* [Credits](#-Credits)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "slideshow": { 40 | "slide_type": "subslide" 41 | } 42 | }, 43 | "source": [ 44 | "#### Useful Resource\n", 45 | "---\n", 46 | "
\n", 47 | " The Matrix Cookbook\n", 48 | "
" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 1, 54 | "metadata": { 55 | "slideshow": { 56 | "slide_type": "skip" 57 | } 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "# imports for the tutorial\n", 62 | "import numpy as np\n", 63 | "import pandas as pd\n", 64 | "import matplotlib.pyplot as plt\n", 65 | "%matplotlib notebook" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "slideshow": { 72 | "slide_type": "slide" 73 | } 74 | }, 75 | "source": [ 76 | "## Linear Algebra Refresher\n", 77 | "---\n", 78 | "### Vectors\n", 79 | "---\n", 80 | "* Geometric object that has both a magnitude and direction\n", 81 | " * $ x = \\begin{bmatrix} x_{1} \\\\ x_{2} \\\\ \\vdots \\\\ x_{n} \\end{bmatrix} = (x_1, x_2, ..., x_n)^{T} \\in \\mathcal{R}^n$\n", 82 | "* Magnitude of a vector: $||x|| = \\sqrt{x^{T}x} = \\sqrt{x_1^2 +x_2^2 +... +x_n^2}$\n", 83 | "* **Cardinality** of a vector - the number of non zero elements" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 2, 89 | "metadata": { 90 | "slideshow": { 91 | "slide_type": "subslide" 92 | } 93 | }, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "v:\n", 100 | "[[ 16]\n", 101 | " [ 0]\n", 102 | " [ 19]\n", 103 | " [-16]\n", 104 | " [ -9]\n", 105 | " [ 10]]\n", 106 | "v^T:\n", 107 | "[[ 16 0 19 -16 -9 10]]\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "# let's see some vectors\n", 113 | "v = np.random.randint(low=-20, high=20, size=(6, 1))\n", 114 | "print(\"v:\")\n", 115 | "print(v)\n", 116 | "print(\"v^T:\")\n", 117 | "print(v.T)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 3, 123 | "metadata": { 124 | "slideshow": { 125 | "slide_type": "subslide" 126 | } 127 | }, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "magnitude of v:\n", 134 | "32.46536616149585\n", 135 | "cardinality- non zero elements:\n", 136 | "5\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "print(\"magnitude of v:\")\n", 142 | "print(np.sqrt(np.sum(np.square(v))))\n", 143 | "print(\"cardinality- non zero elements:\")\n", 144 | "print(np.sum(v != 0))" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": { 150 | "slideshow": { 151 | "slide_type": "slide" 152 | } 153 | }, 154 | "source": [ 155 | "### Inner Product Space\n", 156 | "---\n", 157 | "* A mapping $\\langle \\cdot, \\cdot \\rangle : V \\times V \\rightarrow F$ that satisfies:\n", 158 | " * Conjucate Symmetry: $\\langle x, y \\rangle = \\overline{\\langle y, x \\rangle} $\n", 159 | " * Linearity in the First Argument: \n", 160 | " * $\\langle a \\cdot x, y \\rangle = a \\cdot \\langle x, y \\rangle$\n", 161 | " * $\\langle x + z, y \\rangle = \\langle x, y \\rangle + \\langle z, y \\rangle$\n", 162 | " * Positive-definiteness: \n", 163 | " * $\\langle x, x \\rangle \\geq 0$\n", 164 | " * $\\langle x, x \\rangle = 0 \\rightarrow x=0$" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "slideshow": { 171 | "slide_type": "subslide" 172 | } 173 | }, 174 | "source": [ 175 | "\n", 176 | "* Common Inner Products:\n", 177 | " * Real Vector: $\\langle x, y \\rangle = x^{T} y$\n", 178 | " * Real Matrix: $\\langle A, B \\rangle = \\textit{trace}(AB^{T})$\n", 179 | " * Random Variables: $\\langle x, y \\rangle = \\mathbb{E}[x \\cdot y]$\n" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": { 185 | "slideshow": { 186 | "slide_type": 
"subslide" 187 | } 188 | }, 189 | "source": [ 190 | "* Properties of **Dot Product**:\n", 191 | " * Distributiveness: \n", 192 | " * $(a + b)\\cdot c = a \\cdot c + b \\cdot c$\n", 193 | " * $a \\cdot (b+c) = a\\cdot b + a\\cdot c$\n", 194 | " * Linearity: $(\\lambda a)\\cdot b= a \\cdot (\\lambda b) = \\lambda(a \\cdot b)$\n", 195 | " * Symmetry: $a \\cdot b= b\\cdot a$\n", 196 | " * Non-Negativity: $\\forall a \\neq 0, a\\cdot a >0 , a \\cdot a =0 \\iff a=0$" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 4, 202 | "metadata": { 203 | "slideshow": { 204 | "slide_type": "subslide" 205 | } 206 | }, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "a:\n", 213 | "[[1.]\n", 214 | " [1.]\n", 215 | " [1.]\n", 216 | " [1.]\n", 217 | " [1.]]\n", 218 | "b:\n", 219 | "[[ 3]\n", 220 | " [-4]\n", 221 | " [ 5]\n", 222 | " [ 8]\n", 223 | " [-4]]\n", 224 | "a.T.dot(b)=\n", 225 | "[[8.]]\n", 226 | "the same as a.T @ b:\n", 227 | "[[8.]]\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "# let's see some dot products\n", 233 | "a = np.ones((5,1))\n", 234 | "b = np.random.randint(low=-10, high=10, size=(5,1))\n", 235 | "print(\"a:\")\n", 236 | "print(a)\n", 237 | "print(\"b:\")\n", 238 | "print(b)\n", 239 | "print(\"a.T.dot(b)=\")\n", 240 | "print(a.T.dot(b))\n", 241 | "print(\"the same as a.T @ b:\")\n", 242 | "print(a.T @ b)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 5, 248 | "metadata": { 249 | "slideshow": { 250 | "slide_type": "subslide" 251 | } 252 | }, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "a + 0.5=\n", 259 | "[[1.5]\n", 260 | " [1.5]\n", 261 | " [1.5]\n", 262 | " [1.5]\n", 263 | " [1.5]]\n", 264 | "(a + 2 * a).T @ b\n", 265 | "[[24.]]\n", 266 | "the same as a.T @ b + (2 * a).T @ b\n", 267 | "[[24.]]\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "print(\"a + 0.5=\")\n", 273 | "print(a + 0.5)\n", 274 | "print(\"(a + 2 * a).T @ b\")\n", 275 | "print((a + 2 * a).T @ b)\n", 276 | "print(\"the same as a.T @ b + (2 * a).T @ b\")\n", 277 | "print(a.T @ b + (2 * a).T @ b)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": { 283 | "slideshow": { 284 | "slide_type": "slide" 285 | } 286 | }, 287 | "source": [ 288 | "### Outer Product\n", 289 | "---\n", 290 | "* Let:\n", 291 | " * $a = (a_1, a_2, ..., a_n)^{T}$\n", 292 | " * $b = (b_1, b_2, ..., b_n)^{T}$\n", 293 | "* The outer product $ab^{T}$: $$ ab^{T} = \\begin{bmatrix} a_{1} \\\\ a_{2} \\\\ \\vdots \\\\ a_{n} \\end{bmatrix} [b_1, b_2, ..., b_n] = \\begin{pmatrix} a_1 b_1 & a_1 b_2 & \\cdots & a_1 b_n \\\\ a_2 b_1 & a_2 b_2 & \\cdots & a_2 b_n \\\\ \\vdots & \\vdots & \\ddots & \\vdots \\\\ a_n b_1 & a_n b_2 & \\cdots & a_n b_n \\end{pmatrix} $$" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 6, 299 | "metadata": { 300 | "slideshow": { 301 | "slide_type": "subslide" 302 | } 303 | }, 304 | "outputs": [ 305 | { 306 | "name": "stdout", 307 | "output_type": "stream", 308 | "text": [ 309 | "a:\n", 310 | "[[0.68496376]\n", 311 | " [0.51514789]\n", 312 | " [0.97263803]\n", 313 | " [0.47948046]\n", 314 | " [0.97063678]]\n", 315 | "b:\n", 316 | "[[0.16180323]\n", 317 | " [0.64818973]\n", 318 | " [0.00683339]\n", 319 | " [0.5219497 ]\n", 320 | " [0.02569252]]\n" 321 | ] 322 | } 323 | ], 324 | "source": [ 325 | "# outer product\n", 326 | "a = np.random.random(size=(5,1))\n", 327 | "print(\"a:\")\n", 328 | "print(a)\n", 329 | 
"b = np.random.random(size=(5,1))\n", 330 | "print(\"b:\")\n", 331 | "print(b)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 7, 337 | "metadata": { 338 | "slideshow": { 339 | "slide_type": "subslide" 340 | } 341 | }, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "outer product: a @ b.T = \n", 348 | "[[0.11082935 0.44398648 0.00468062 0.35751663 0.01759844]\n", 349 | " [0.08335259 0.33391357 0.00352021 0.26888128 0.01323545]\n", 350 | " [0.15737597 0.63045398 0.00664641 0.50766812 0.02498952]\n", 351 | " [0.07758149 0.31079431 0.00327648 0.25026468 0.01231906]\n", 352 | " [0.15705217 0.62915679 0.00663274 0.50662357 0.0249381 ]]\n" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "ab_t = a @ b.T\n", 358 | "print(\"outer product: a @ b.T = \")\n", 359 | "print(ab_t)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": { 365 | "slideshow": { 366 | "slide_type": "slide" 367 | } 368 | }, 369 | "source": [ 370 | "### Vector Norms\n", 371 | "---\n", 372 | "* A norm on a vector sapce $\\Omega$ is a function $f: \\Omega \\rightarrow \\mathcal{R}$ with the following properties:\n", 373 | " * Positive Scalability: $f(ax) = |a|f(x)$\n", 374 | " * Triangle Inequality: $f(x+y) \\leq f(x) + f(y)$\n", 375 | " * If $f(x) = 0 \\rightarrow x = 0$\n", 376 | "* $l_1$ norm: $||x||_1 = \\sum_{i=1}^n |x_i| $" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": { 382 | "slideshow": { 383 | "slide_type": "subslide" 384 | } 385 | }, 386 | "source": [ 387 | "\n", 388 | "* $l_2$ norm: $||x||_2 = \\sqrt{\\sum_{i=1}^n |x_i|^2} $\n", 389 | " * For **Vectors**: $||x||_2^2 = x^{T}x$\n", 390 | " * $l_2$-distance: $||x -y||_2^2 = (x-y)^{T}(x-y)= ||x||_2^2 -2x^{T}y + ||y||_2^2$\n", 391 | "* $l_p$ norm: $||x||_p = (\\sum_{i=1}^n |x_i|^p)^{\\frac{1}{p}} $\n", 392 | "* $l_{\\infty}$ norm: $||x||_{\\infty} = \\max{(|x_1|, |x_2|, ..., |x_n|)} $" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": { 398 | "slideshow": { 399 | "slide_type": "subslide" 400 | } 401 | }, 402 | "source": [ 403 | "" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 8, 409 | "metadata": { 410 | "slideshow": { 411 | "slide_type": "subslide" 412 | } 413 | }, 414 | "outputs": [ 415 | { 416 | "name": "stdout", 417 | "output_type": "stream", 418 | "text": [ 419 | "a:\n", 420 | "[[0.20110422]\n", 421 | " [0.3103417 ]\n", 422 | " [0.25755954]\n", 423 | " [0.84291866]\n", 424 | " [0.00855558]]\n", 425 | "l-1 norm: \n", 426 | "1.6204796988041368\n", 427 | "l-2 norm: \n", 428 | "0.9558644554276373\n", 429 | "l-infinity norm:\n", 430 | "0.8429186563888088\n" 431 | ] 432 | } 433 | ], 434 | "source": [ 435 | "# norms and distance\n", 436 | "a = np.random.random(size=(5,1))\n", 437 | "print(\"a:\")\n", 438 | "print(a)\n", 439 | "print(\"l-1 norm: \")\n", 440 | "print(np.sum(abs(a)))\n", 441 | "print(\"l-2 norm: \")\n", 442 | "print(np.sqrt(np.sum(np.square(a))))\n", 443 | "print(\"l-infinity norm:\")\n", 444 | "print(np.max(abs(a)))" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 9, 450 | "metadata": { 451 | "slideshow": { 452 | "slide_type": "subslide" 453 | } 454 | }, 455 | "outputs": [ 456 | { 457 | "name": "stdout", 458 | "output_type": "stream", 459 | "text": [ 460 | "b:\n", 461 | "[[0.59011591]\n", 462 | " [0.77681828]\n", 463 | " [0.31464032]\n", 464 | " [0.78600795]\n", 465 | " [0.85952156]]\n", 466 | "l-2 distance between a and b:\n", 467 | 
"[[1.04860414]]\n" 468 | ] 469 | } 470 | ], 471 | "source": [ 472 | "b = np.random.random(size=(5,1))\n", 473 | "print(\"b:\")\n", 474 | "print(b)\n", 475 | "print(\"l-2 distance between a and b:\")\n", 476 | "print(np.sqrt((a - b).T @ (a - b)))" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": { 482 | "slideshow": { 483 | "slide_type": "slide" 484 | } 485 | }, 486 | "source": [ 487 | "### Linear Dependency\n", 488 | "---\n", 489 | "* Given a set of vectors $X =\\{x_1, x_2, ..., x_n \\}$, a **linear combination** of vectors is written as:\n", 490 | "$$ ax = a_1 x_1 + a_2 x_2 + ... +a_n x_n $$\n", 491 | "* $x_i \\in X$ is **linearly dependent** if it can be written as linear combination of $X \\setminus \\{x_i\\}$" 492 | ] 493 | }, 494 | { 495 | "cell_type": "markdown", 496 | "metadata": { 497 | "slideshow": { 498 | "slide_type": "subslide" 499 | } 500 | }, 501 | "source": [ 502 | "" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": { 508 | "slideshow": { 509 | "slide_type": "slide" 510 | } 511 | }, 512 | "source": [ 513 | "### Basis\n", 514 | "---\n", 515 | "* A basis is a **linearly independent** set of vectors that spans the \"whole sapce\"\n", 516 | "* Every vector in the space can be written as a linear combination of vectors in the basis\n", 517 | " * For example, **the standard basis (unit vectors)**: $\\{e_i \\in \\mathcal{R}^n | e_i =(0, 0, ..., 0, 1,0, ..., 0)^{T}\\}$ \n", 518 | " * $x^{T} = (3 ,2 ,5)^{T} = 3(1,0,0)^{T}+2(0,1,0)^{T}+5(0,0,1)^{T} = 3e_1^T +2e_2^T +5e_3^T$" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": { 524 | "slideshow": { 525 | "slide_type": "subslide" 526 | } 527 | }, 528 | "source": [ 529 | "* **Projection** of a vector: $x\\cdot e_i = x^T e_i = e_i^T x$\n", 530 | "* The basis vectors suffice:\n", 531 | " * Orthogonal - $e_i^T e_j = 0$\n", 532 | " * Normalized - $e_i^T e_i = 1$\n", 533 | " * Orthogonal + Normalized = Orthonormal\n", 534 | " * If $A$ is **orthogonal** then:\n", 535 | " * $A$ is a square matrix\n", 536 | " * The columns of $A$ are **orthonormal** vectors\n", 537 | " * $A^TA = AA^T = I \\rightarrow A^T= A^{-1}$" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": { 543 | "slideshow": { 544 | "slide_type": "subslide" 545 | } 546 | }, 547 | "source": [ 548 | "* **Change of Basis** - suppose that we have a basis not necessarily orthonormal $B=\\{b_1, b_2, ..., b_n\\}, b_i \\in \\mathcal{R}^m $\n", 549 | " * Vector in the **new** basis is represented with a **matrix-vector** multiplication\n", 550 | " * The Identity matrix $I$ maps a vector to itself\n", 551 | " * Basis change can be decomposed to: **rotation** matrix and **scale** matrix\n", 552 | " * Using an **orthonormal** basis means only a **rotation** around the origin\n", 553 | " * **Gram-Schmidt Orthonormaliztion Process**: Link\n", 554 | " " 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "metadata": { 560 | "slideshow": { 561 | "slide_type": "subslide" 562 | } 563 | }, 564 | "source": [ 565 | "\n", 566 | " By Lucas V. Barbosa - Own work, Public Domain, Link" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 6, 572 | "metadata": { 573 | "slideshow": { 574 | "slide_type": "skip" 575 | } 576 | }, 577 | "outputs": [ 578 | { 579 | "name": "stdout", 580 | "output_type": "stream", 581 | "text": [ 582 | "V:\n", 583 | "[[3. 2.]\n", 584 | " [1. 
2.]]\n", 585 | "U:\n", 586 | "[[0.9486833 0.70710678]\n", 587 | " [0.31622777 0.70710678]]\n" 588 | ] 589 | } 590 | ], 591 | "source": [ 592 | "# Gram-Schmidt Algorithm\n", 593 | "def gram_schmidt(V):\n", 594 | " \"\"\"\n", 595 | " Implements Gram-Schmidt Orthonormaliztion Process.\n", 596 | " Parameters:\n", 597 | " V - matrix such that each column is a vector in the original basis\n", 598 | " Returns:\n", 599 | " U - matrix with orthonormal vectors as columns\n", 600 | " \"\"\"\n", 601 | " n, k = np.array(V, dtype=np.float).shape # get dimensions\n", 602 | " # initialize U matrix\n", 603 | " U = np.zeros_like(V, dtype=np.float)\n", 604 | " U[:,0] = V[:,0] / np.sqrt(V[:,0].T @ V[:,0])\n", 605 | " for i in range(1, k):\n", 606 | " U[:,i] = V[:,i]\n", 607 | " for j in range(i - 1):\n", 608 | " U[:,i] = U[:,i] - ((U[:,i].T @ U[:,j]) / (U[:,j].T @ U[:,j])) * U[:, j]\n", 609 | " # normalize\n", 610 | " U[:,i] = U[:,i] / np.sqrt(U[:,i].T @ U[:,i])\n", 611 | " return U\n", 612 | "\n", 613 | "v1 = [3.0, 1.0]\n", 614 | "v2 = [2.0, 2.0]\n", 615 | "v = np.stack((v1, v2), axis=1)\n", 616 | "print(\"V:\")\n", 617 | "print(v)\n", 618 | "U = gram_schmidt(v)\n", 619 | "print(\"U:\")\n", 620 | "print(U) " 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": { 626 | "slideshow": { 627 | "slide_type": "slide" 628 | } 629 | }, 630 | "source": [ 631 | "### Matrix Operations\n", 632 | "---\n", 633 | "* Addition\n", 634 | " * Commutative: $A + B = B +A$\n", 635 | " * Associative: $(A+B) + C = A + (B+C)$\n", 636 | "* Multiplication - **PAY ATTENTION TO DIMENSTIONS**\n", 637 | " * Associative: $A(BC) = (AB)C$\n", 638 | " * Distributive: $A(B+C) = AB + AC$\n", 639 | " * Non-comutative (**!**): $AB \\neq BA$" 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "metadata": { 645 | "slideshow": { 646 | "slide_type": "subslide" 647 | } 648 | }, 649 | "source": [ 650 | "* Transpose\n", 651 | " * $(A^{T})_{ij}$\n", 652 | " * $(A^{T})^T = A$\n", 653 | " * $(AB)^{T} = B^{T}A^{T}$\n", 654 | "* Inverse - **MAKE SURE CONDITIONS APPLY**\n", 655 | " * **Positive Semi-definite (PSD)** - Matrix $M$ is called *PSD* if for every non-zero column vector $z$, the scalar $z^T M z \\geq 0$\n", 656 | " * **Every positive definite matrix is invertible** and its inverse is also positive definite\n", 657 | " * $(A^{-1})^{-1} = A$\n", 658 | " * $(AB)^{-1} = B^{-1} A^{-1}$\n", 659 | " * $(A^T)^{-1} = A^{-T}$\n", 660 | " * Inverse of 2x2 matrix: see tutorial 1" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": 10, 666 | "metadata": { 667 | "slideshow": { 668 | "slide_type": "subslide" 669 | } 670 | }, 671 | "outputs": [ 672 | { 673 | "name": "stdout", 674 | "output_type": "stream", 675 | "text": [ 676 | "A:\n", 677 | "[[0.56274722 0.57692677 0.31759767 0.9135175 0.39388189]\n", 678 | " [0.3260898 0.73720574 0.3526661 0.02961814 0.16645483]\n", 679 | " [0.01740472 0.24892669 0.4684225 0.60255541 0.11491183]\n", 680 | " [0.60243149 0.97287256 0.72073364 0.33608398 0.94720029]\n", 681 | " [0.3300669 0.15559865 0.27349031 0.41204091 0.83342534]]\n", 682 | "inverse of A:\n", 683 | "[[ 21.57251296 -108.00106195 -17.70755954 87.22168674 -85.31216784]\n", 684 | " [ -14.53515995 78.79387459 11.54867247 -62.74719293 60.8531958 ]\n", 685 | " [ 14.80752023 -84.10430348 -11.05067623 68.39955569 -66.41395368]\n", 686 | " [ -4.51707378 27.97302751 4.82828719 -23.33701698 22.40506618]\n", 687 | " [ -8.45572468 41.8310235 6.09595381 -33.73598262 34.3423877 ]]\n" 688 | ] 689 | } 690 | ], 691 | "source": 
[ 692 | "# inverse\n", 693 | "A = np.random.rand(5, 5)\n", 694 | "print(\"A:\")\n", 695 | "print(A)\n", 696 | "print(\"inverse of A:\")\n", 697 | "print(np.linalg.inv(A))" 698 | ] 699 | }, 700 | { 701 | "cell_type": "markdown", 702 | "metadata": { 703 | "slideshow": { 704 | "slide_type": "slide" 705 | } 706 | }, 707 | "source": [ 708 | "### Matrix Rank\n", 709 | "---\n", 710 | "* The rank of a matrix is the **maximal number of linearly independent** columns or rows of a matrix\n", 711 | "* $ A \\in \\mathcal{R}^{m \\times n} \\rightarrow \\textit{rank}(A) \\leq \\min(m,n)$\n", 712 | "* $\\textit{rank}(A) = \\textit{rank}(A^T)$\n", 713 | "* $\\textit{rank}(A^T A) = \\textit{rank}(A)$\n", 714 | "* $\\textit{rank}(A + B) \\leq \\textit{rank}(A) + \\textit{rank}(B)$\n", 715 | "* $\\textit{rank}(AB) \\leq \\min(\\textit{rank}(A), \\textit{rank}(B))$\n", 716 | "* A is **full rank** if $\\textit{rank}(A) = \\min(m,n)$\n", 717 | "* **Singular Matrix** - has dependent rows (and at least one zero eigen-value)" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": 11, 723 | "metadata": { 724 | "slideshow": { 725 | "slide_type": "subslide" 726 | } 727 | }, 728 | "outputs": [ 729 | { 730 | "name": "stdout", 731 | "output_type": "stream", 732 | "text": [ 733 | "A:\n", 734 | "[[0 3 3 3 1]\n", 735 | " [1 1 1 3 3]\n", 736 | " [1 1 2 2 0]\n", 737 | " [2 0 3 1 2]\n", 738 | " [3 1 2 1 1]]\n", 739 | "rank(A):\n", 740 | "5\n" 741 | ] 742 | } 743 | ], 744 | "source": [ 745 | "A = np.random.randint(low=0, high=4, size=(5,5))\n", 746 | "print(\"A:\")\n", 747 | "print(A)\n", 748 | "print(\"rank(A):\")\n", 749 | "print(np.linalg.matrix_rank(A))" 750 | ] 751 | }, 752 | { 753 | "cell_type": "markdown", 754 | "metadata": { 755 | "slideshow": { 756 | "slide_type": "slide" 757 | } 758 | }, 759 | "source": [ 760 | "### Range & Nullspace\n", 761 | "---\n", 762 | "* **Range** (of a matrix) - the span of the columns of the matrix, denoted by the set: $$\\mathcal{R}(A) = \\{y|y= Ax\\} $$\n", 763 | "* **Nullspace** (of a matrix) - the set of vectors that when multiplied by the matrix result in 0, given by the set: $$\\mathcal{N}(A) = \\{x|Ax=0\\} $$" 764 | ] 765 | }, 766 | { 767 | "cell_type": "markdown", 768 | "metadata": { 769 | "slideshow": { 770 | "slide_type": "slide" 771 | } 772 | }, 773 | "source": [ 774 | "### Determinant\n", 775 | "---\n", 776 | "Let $A = \\begin{pmatrix}x_1 & y_1 & z_1 \\\\ x_2 & y_2 & z_2 \\\\ x_3 & y_3 & z_3 \\end{pmatrix} $, a **square matrix**, then:\n", 777 | "$$det(A) = |A| = \\begin{vmatrix} x_1 & y_1 & z_1 \\\\ x_2 & y_2 & z_2 \\\\ x_3 & y_3 & z_3 \\end{vmatrix} = x_1 \\begin{vmatrix} y_1 & z_2 \\\\ y_3 & z_3 \\end{vmatrix} -x_2 \\begin{vmatrix} y_1 & z_1 \\\\ y_3 & z_3 \\end{vmatrix} +x_3\\begin{vmatrix} y_1 & z_1\\\\ y_2 & z_2 \\end{vmatrix}$$
$$ = x_1 (y_2z_3 - z_2 y_3) -x_2(y_1z_3 - z_1y_3) +x_3(y_1z_2 - z_1 y_2) $$\n", 778 | "* $det(A) = 0 \\iff A$ is **singular** (at least one eigen-value is zero)\n", 779 | "* If $A$ is diagonal, then $det(A)$ is the prodcut of the diagonal elements (the eigen-values)\n", 780 | "* $det(AB) = det(A)det(B)$\n", 781 | "* $det(A^{-1}) = det(A)^{-1}$\n", 782 | "* $det(\\lambda A) = \\lambda^n det(A)$" 783 | ] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": 12, 788 | "metadata": { 789 | "slideshow": { 790 | "slide_type": "subslide" 791 | } 792 | }, 793 | "outputs": [ 794 | { 795 | "name": "stdout", 796 | "output_type": "stream", 797 | "text": [ 798 | "A:\n", 799 | "[[-0.11682683 -0.60007878 0.20168493 -0.41938087 -1.44710738]\n", 800 | " [-0.77820688 0.97102027 -0.95386608 -0.81321839 0.83334389]\n", 801 | " [-1.44149225 -0.44278972 -0.07846115 0.59192462 0.21563895]\n", 802 | " [-0.75701366 -1.49163516 -0.2865721 -0.46047925 -0.01296227]\n", 803 | " [ 1.250518 1.20554034 -0.14421321 0.44739448 -0.14740781]]\n", 804 | "det(A):\n", 805 | "3.073911389887483\n" 806 | ] 807 | } 808 | ], 809 | "source": [ 810 | "# determinant\n", 811 | "A = np.random.randn(5,5)\n", 812 | "print(\"A:\")\n", 813 | "print(A)\n", 814 | "print(\"det(A):\")\n", 815 | "print(np.linalg.det(A))" 816 | ] 817 | }, 818 | { 819 | "cell_type": "markdown", 820 | "metadata": { 821 | "slideshow": { 822 | "slide_type": "slide" 823 | } 824 | }, 825 | "source": [ 826 | "## Solve Linear Equation Analytically\n", 827 | "---\n", 828 | "* Definitions:\n", 829 | " * $A \\in \\mathcal{R}^{n \\times n}$\n", 830 | " * $x, b \\in \\mathcal{R}^{n \\times 1}$\n", 831 | "* The problem: find the solution of $Ax = b$\n", 832 | "* Solution: if $A$ is PSD (and thus invertible), then $x = A^{-1} b$\n", 833 | "* What if $A \\in \\mathcal{R}^{m \\times n}$, $x \\in \\mathcal{R}^{n \\times 1}$, $b \\in \\mathcal{R}^{m \\times 1}$ ?\n", 834 | " * $A$ is no longer invertible!\n", 835 | "* The problem redefined: find $x$ that minimzes the distance from $Ax$ to $b$, or more formally: $$ \\underset{x}{\\mathrm{argmin}} ||Ax - b ||_2^2$$ (also called **least-squares** solution)" 836 | ] 837 | }, 838 | { 839 | "cell_type": "markdown", 840 | "metadata": { 841 | "slideshow": { 842 | "slide_type": "subslide" 843 | } 844 | }, 845 | "source": [ 846 | "### Reminder (Tutorial 01) - Vector & Matrix Derivatives\n", 847 | "---\n", 848 | "* $\\nabla_x Ax = A^{T}$\n", 849 | "* $\\nabla_x x^{T} A x = (A + A^{T}) x$ \n", 850 | "* $\\frac{\\partial}{\\partial A} \\ln |A| = A^{-T}$\n", 851 | "* $\\frac{\\partial}{\\partial A} Tr[AB] = B^{T}$" 852 | ] 853 | }, 854 | { 855 | "cell_type": "markdown", 856 | "metadata": { 857 | "slideshow": { 858 | "slide_type": "subslide" 859 | } 860 | }, 861 | "source": [ 862 | "### Exercise 1 - Least-Squares Solution\n", 863 | "---\n", 864 | "Given $A \\in \\mathcal{R}^{m \\times n}$, $x \\in \\mathcal{R}^{n \\times 1}$, $b \\in \\mathcal{R}^{m \\times 1}$\n", 865 | "\n", 866 | "Find $x$ that minimizes the distance from $Ax$ to $b$, or more formally: $$ \\underset{x}{\\mathrm{argmin}} ||Ax - b ||_2^2$$" 867 | ] 868 | }, 869 | { 870 | "cell_type": "markdown", 871 | "metadata": { 872 | "slideshow": { 873 | "slide_type": "subslide" 874 | } 875 | }, 876 | "source": [ 877 | "### Solution 1\n", 878 | "---\n", 879 | "$$ ||Ax - b ||_2^2 = (Ax-b)^T (Ax-b) = x^TA^TAx -x^TA^Tb-b^TAx +b^Tb $$\n", 880 | "$$\\frac{\\partial ||Ax - b ||_2^2}{\\partial x} = 2A^TAx-2A^Tb = 0 \\rightarrow x = (A^TA)^{-1}A^Tb $$" 881 | ] 882 | }, 883 | { 884 | 
"cell_type": "code", 885 | "execution_count": 13, 886 | "metadata": { 887 | "slideshow": { 888 | "slide_type": "subslide" 889 | } 890 | }, 891 | "outputs": [ 892 | { 893 | "name": "stdout", 894 | "output_type": "stream", 895 | "text": [ 896 | "A:\n", 897 | "[[ 3 2 8 9]\n", 898 | " [-3 -5 -5 2]\n", 899 | " [ 0 5 7 5]\n", 900 | " [ 1 -3 6 -5]\n", 901 | " [ 1 1 8 6]]\n", 902 | "b:\n", 903 | "[[-2]\n", 904 | " [-7]\n", 905 | " [-3]\n", 906 | " [-3]\n", 907 | " [ 0]]\n" 908 | ] 909 | } 910 | ], 911 | "source": [ 912 | "# Least Squares Solution\n", 913 | "m = 5\n", 914 | "n = 4\n", 915 | "A = np.random.randint(low=-5, high=10, size=(m,n))\n", 916 | "b = np.random.randint(low=-10, high=3, size=(m,1))\n", 917 | "print(\"A:\")\n", 918 | "print(A)\n", 919 | "print(\"b:\")\n", 920 | "print(b)" 921 | ] 922 | }, 923 | { 924 | "cell_type": "code", 925 | "execution_count": 14, 926 | "metadata": { 927 | "slideshow": { 928 | "slide_type": "subslide" 929 | } 930 | }, 931 | "outputs": [ 932 | { 933 | "name": "stdout", 934 | "output_type": "stream", 935 | "text": [ 936 | "Least Squares solution for x:\n", 937 | "[[ 1.54495052]\n", 938 | " [ 0.65381817]\n", 939 | " [-0.47872248]\n", 940 | " [-0.27042109]]\n" 941 | ] 942 | } 943 | ], 944 | "source": [ 945 | "print(\"Least Squares solution for x:\")\n", 946 | "x = np.linalg.inv(A.T @ A) @ A.T @ b\n", 947 | "print(x)" 948 | ] 949 | }, 950 | { 951 | "cell_type": "markdown", 952 | "metadata": { 953 | "slideshow": { 954 | "slide_type": "slide" 955 | } 956 | }, 957 | "source": [ 958 | "## Solve Linear Equation Non-Analytically\n", 959 | "---\n", 960 | "### Eigenvalues and Eigenvectors\n", 961 | "---\n", 962 | "* Definition: Matrix $A$ with **Eigenvalue** $\\lambda \\in \\mathbb{C}$ and **Eigenvector** $x \\in \\mathbb{C}^n$ if $$Ax=\\lambda x, x \\neq 0 $$\n", 963 | "* Finding eigenvalues and eigenvectors\n", 964 | " * Find eigenvalues by finding the roots of the polynomial generated by: $$det(\\lambda I -A) = |\\lambda I -A| =0 $$\n", 965 | " * For each eigenvalue $\\lambda$, find its corresponding eigenvector $x$ by solving: $$ Ax = \\lambda x$$" 966 | ] 967 | }, 968 | { 969 | "cell_type": "markdown", 970 | "metadata": { 971 | "slideshow": { 972 | "slide_type": "subslide" 973 | } 974 | }, 975 | "source": [ 976 | "* Example: $M = \\begin{pmatrix} 2 & 1 \\\\ 1 & 2 \\end{pmatrix} \\rightarrow |\\lambda I -M| = \\begin{vmatrix} 2 - \\lambda & 1 \\\\ 1 & 2 - \\lambda \\end{vmatrix} = 3 - 4 \\lambda + \\lambda^2 \\rightarrow \\lambda_{1,2} = 1, 3 \\rightarrow x_{\\lambda = 1}= \\begin{bmatrix} 1 \\\\ -1 \\end{bmatrix} , x_{\\lambda=3} = \\begin{bmatrix} 1 \\\\ 1 \\end{bmatrix}$\n", 977 | " \n", 978 | "* Eigenvalues Properties\n", 979 | " * $det(\\Lambda) = |\\Lambda| = \\prod_{i=1}^n \\lambda_i$\n", 980 | " * $\\textit{rank}(A) = \\sum_{i=1}^n \\mathbb{1}_{\\lambda_i \\neq 0}$\n", 981 | " * Eigenvalues of a **diagonal** matrix are the diagonal entries\n", 982 | " * A (square) matrix is said to be **diagonalizable** if it can be rewritten as: $A = X \\Lambda X^{-1}$\n", 983 | "* Eigenvalues of **Symmetric Matrices**:\n", 984 | " * Eigenvalues are **real**\n", 985 | " * Eigenvectors of **real symmetric** matrices are orthonormal" 986 | ] 987 | }, 988 | { 989 | "cell_type": "code", 990 | "execution_count": 15, 991 | "metadata": {}, 992 | "outputs": [ 993 | { 994 | "name": "stdout", 995 | "output_type": "stream", 996 | "text": [ 997 | "A:\n", 998 | "[[-4 -9 7 8 1]\n", 999 | " [ 6 -8 -5 -3 -9]\n", 1000 | " [ 0 9 -6 0 3]\n", 1001 | " [ 8 2 -6 0 -6]\n", 1002 | " [ 3 3 2 4 
-1]]\n" 1003 | ] 1004 | } 1005 | ], 1006 | "source": [ 1007 | "# eigenvalues and eigenvectors\n", 1008 | "A = np.random.randint(low=-10, high=10, size=(5,5))\n", 1009 | "eig, vec = np.linalg.eig(A)\n", 1010 | "print(\"A:\")\n", 1011 | "print(A)" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": 16, 1017 | "metadata": { 1018 | "slideshow": { 1019 | "slide_type": "subslide" 1020 | } 1021 | }, 1022 | "outputs": [ 1023 | { 1024 | "name": "stdout", 1025 | "output_type": "stream", 1026 | "text": [ 1027 | "eigenvalues:\n", 1028 | "[-9.29854727+11.14091902j -9.29854727-11.14091902j\n", 1029 | " 3.93378061 +0.j -3.01573245 +0.j\n", 1030 | " -1.32095361 +0.j ]\n", 1031 | "eigenvectors:\n", 1032 | "[[ 0.2627824 -0.45749602j 0.2627824 +0.45749602j -0.68051908+0.j\n", 1033 | " 0.33207203+0.j 0.54461743+0.j ]\n", 1034 | " [-0.57285962+0.j -0.57285962-0.j 0.20089578+0.j\n", 1035 | " -0.39152492+0.j -0.32875482+0.j ]\n", 1036 | " [ 0.13729842+0.38931395j 0.13729842-0.38931395j 0.00207705+0.j\n", 1037 | " -0.45532511+0.j -0.15673315+0.j ]\n", 1038 | " [-0.31676371+0.31808551j -0.31676371-0.31808551j -0.3762196 +0.j\n", 1039 | " -0.09140478+0.j -0.14305208+0.j ]\n", 1040 | " [ 0.12184535+0.08182002j 0.12184535-0.08182002j -0.59580967+0.j\n", 1041 | " 0.72163745+0.j 0.74181059+0.j ]]\n" 1042 | ] 1043 | } 1044 | ], 1045 | "source": [ 1046 | "print(\"eigenvalues:\")\n", 1047 | "print(eig)\n", 1048 | "print(\"eigenvectors:\")\n", 1049 | "print(vec)" 1050 | ] 1051 | }, 1052 | { 1053 | "cell_type": "markdown", 1054 | "metadata": { 1055 | "slideshow": { 1056 | "slide_type": "slide" 1057 | } 1058 | }, 1059 | "source": [ 1060 | "## Eigen Decomposition\n", 1061 | "---\n", 1062 | "* **Eigen-decomposition** (also **spectral decomposition**) - factorization of a matrix into a canonical form, that is, the matrix is represented in terms of its **eigenvalues and eigenvectors**.\n", 1063 | "* **Only** diagonalizable matrices can be factorized\n", 1064 | "* Formally:\n", 1065 | " * Denote $\\Lambda$ as a matrix with eigenvalues on the diagonal\n", 1066 | " * Denote $Q$ as a matrix where the columns are the eigenvectors\n", 1067 | " * Let $A$ be a square $n \\times n$ matrix with $N$ linearly **independent** columns. Then $A$ can factorized as: $$A = Q \\Lambda Q^{-1} $$" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "markdown", 1072 | "metadata": { 1073 | "slideshow": { 1074 | "slide_type": "slide" 1075 | } 1076 | }, 1077 | "source": [ 1078 | "# What If A Is Non-Square?" 1079 | ] 1080 | }, 1081 | { 1082 | "cell_type": "markdown", 1083 | "metadata": { 1084 | "slideshow": { 1085 | "slide_type": "slide" 1086 | } 1087 | }, 1088 | "source": [ 1089 | "## Singular Value Decomposition (SVD)\n", 1090 | "---\n", 1091 | "* In linear algebra, the singular-value decomposition (SVD) is a factorization of a real or complex matrix. It is the generalization of the eigendecomposition of a positive semidefinite normal matrix (for example, a symmetric matrix with positive eigenvalues) to any $ m\\times n$ matrix via an extension of the polar decomposition.\n", 1092 | "* Definition: $$ A_{[m \\times n]} = U_{[m \\times r]} \\Sigma_{[r \\times r]} (V_{[n \\times r]})^T $$" 1093 | ] 1094 | }, 1095 | { 1096 | "cell_type": "markdown", 1097 | "metadata": { 1098 | "slideshow": { 1099 | "slide_type": "subslide" 1100 | } 1101 | }, 1102 | "source": [ 1103 | "* $A$ - Input Data matrix\n", 1104 | " * $m \\times n$ matrix (e.g. 
$m$ documents and $n$ terms that can appear in each document)\n", 1105 | "* $U$ - Left Singular vectors\n", 1106 | " * $m \\times r$ matrix (e.g. $m$ documents and $r$ concepts)\n", 1107 | " * $U = eig(AA^T)$" 1108 | ] 1109 | }, 1110 | { 1111 | "cell_type": "markdown", 1112 | "metadata": { 1113 | "slideshow": { 1114 | "slide_type": "subslide" 1115 | } 1116 | }, 1117 | "source": [ 1118 | "* $\\Sigma$ - Singular values\n", 1119 | " * $r \\times r$ **diagonal** matrix (strength of each 'concept')\n", 1120 | " * $r$ represnts the **rank** of matrix $A$\n", 1121 | " * $\\Sigma = diag\\left(\\sqrt{eigenvalues(A^TA)}\\right)$\n", 1122 | " * **Singular Values** definition: the singular values of a matrix $X \\in \\mathbb{R}^{M \\times N}$ are the *square root* of the **eigenvalues** of the matrix $X^TX \\in \\mathbb{R}^{N \\times N}$. If $X \\in \\mathbb{R}^{N \\times N}$ already, then the singular values are the eigenvalues.\n", 1123 | "* $V$ - Right Singular vectors\n", 1124 | " * $n \\times r$ matrix (e.g. $n$ terms and $r$ concepts)\n", 1125 | " * $V = eig(A^TA)$" 1126 | ] 1127 | }, 1128 | { 1129 | "cell_type": "markdown", 1130 | "metadata": { 1131 | "slideshow": { 1132 | "slide_type": "subslide" 1133 | } 1134 | }, 1135 | "source": [ 1136 | "* Illustration:\n", 1137 | " \n", 1138 | " First, we see the unit disc in blue together with the two canonical unit vectors. We then see the action of M, which distorts the disk to an ellipse. The SVD decomposes M into three simple transformations: an initial rotation $V^{*}$, a scaling $\\Sigma$ along the coordinate axes, and a final rotation $U$. The lengths $\\sigma_1$ and $\\sigma_2$ of the semi-axes of the ellipse are the singular values of $M$, namely $\\Sigma_{1,1}$ and $\\Sigma_{2,2}$.\n", 1139 | " \n", 1140 | "* By Kieff - Own work, Public Domain, Link" 1141 | ] 1142 | }, 1143 | { 1144 | "cell_type": "markdown", 1145 | "metadata": { 1146 | "slideshow": { 1147 | "slide_type": "subslide" 1148 | } 1149 | }, 1150 | "source": [ 1151 | "* Another way to look at SVD: $$ A \\approx U\\Sigma V^T = \\sum_i \\sigma_i u_i \\circ v_i^T $$ \n", 1152 | " " 1153 | ] 1154 | }, 1155 | { 1156 | "cell_type": "markdown", 1157 | "metadata": { 1158 | "slideshow": { 1159 | "slide_type": "subslide" 1160 | } 1161 | }, 1162 | "source": [ 1163 | "* **SVD Properties**\n", 1164 | " * It is **always** possible to decompose a **real** matrix $A$ to $A = U\\Sigma V^T$ where\n", 1165 | " * $U, \\Sigma, V$ are **uniuqe**\n", 1166 | " * $U, V$ are column **orthonormal**\n", 1167 | " * $U^T U = I, V^T V = I$\n", 1168 | " * $\\Sigma$ is **diagonal**\n", 1169 | " * Entries (the singular values) are positive and **sorted** in decreasing order ($\\sigma_1 \\geq \\sigma_2 \\geq ... 
\\geq 0$)\n", 1170 | " * Proof of uniqueness" 1171 | ] 1172 | }, 1173 | { 1174 | "cell_type": "markdown", 1175 | "metadata": { 1176 | "slideshow": { 1177 | "slide_type": "subslide" 1178 | } 1179 | }, 1180 | "source": [ 1181 | "\n", 1182 | "\n", 1183 | "* Image Source" 1184 | ] 1185 | }, 1186 | { 1187 | "cell_type": "markdown", 1188 | "metadata": { 1189 | "slideshow": { 1190 | "slide_type": "slide" 1191 | } 1192 | }, 1193 | "source": [ 1194 | "### SVD Example - Users-to-Movies\n", 1195 | "---\n", 1196 | "We are given a dataset of user's rating (1 to 5) for several movies of 3 genres (concepts) and we wish to use SVD to decompose to the following components:\n", 1197 | "* User-to-Concept - which genres the users prefer: $U$ matrix\n", 1198 | "* Concepts - what is the strength of each genre in the dataset: $\\Sigma$ - strength of each concept (the singular values)\n", 1199 | "* Movie-to-Concept - for each movie, what genres are the most dominant: $V$ matrix" 1200 | ] 1201 | }, 1202 | { 1203 | "cell_type": "code", 1204 | "execution_count": 17, 1205 | "metadata": { 1206 | "slideshow": { 1207 | "slide_type": "subslide" 1208 | } 1209 | }, 1210 | "outputs": [ 1211 | { 1212 | "name": "stdout", 1213 | "output_type": "stream", 1214 | "text": [ 1215 | "User-to-Movies matrix:\n" 1216 | ] 1217 | }, 1218 | { 1219 | "data": { 1220 | "text/html": [ 1221 | "
\n", 1222 | "\n", 1235 | "\n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | "
MatrixAlienSerenityCasablancaAmelie
User 111100
User 233300
User 344400
User 455500
User 502044
User 600055
User 701022
\n", 1305 | "
" 1306 | ], 1307 | "text/plain": [ 1308 | " Matrix Alien Serenity Casablanca Amelie\n", 1309 | "User 1 1 1 1 0 0\n", 1310 | "User 2 3 3 3 0 0\n", 1311 | "User 3 4 4 4 0 0\n", 1312 | "User 4 5 5 5 0 0\n", 1313 | "User 5 0 2 0 4 4\n", 1314 | "User 6 0 0 0 5 5\n", 1315 | "User 7 0 1 0 2 2" 1316 | ] 1317 | }, 1318 | "execution_count": 17, 1319 | "metadata": {}, 1320 | "output_type": "execute_result" 1321 | } 1322 | ], 1323 | "source": [ 1324 | "# load the dataset and create a pandas DataFrame\n", 1325 | "u_t_m = np.array([[1,1,1,0,0], [3,3,3,0,0], [4,4,4,0,0], [5,5,5,0,0], [0,2,0,4,4], [0,0,0,5,5], [0,1,0,2,2]])\n", 1326 | "print(\"User-to-Movies matrix:\")\n", 1327 | "# print(u_t_m)\n", 1328 | "u_t_m_df = pd.DataFrame(u_t_m, columns=['Matrix', 'Alien', 'Serenity', 'Casablanca', 'Amelie'],\n", 1329 | " index=['User 1', 'User 2','User 3', 'User 4', 'User 5', 'User 6', 'User 7'])\n", 1330 | "u_t_m_df" 1331 | ] 1332 | }, 1333 | { 1334 | "cell_type": "code", 1335 | "execution_count": 18, 1336 | "metadata": { 1337 | "slideshow": { 1338 | "slide_type": "subslide" 1339 | } 1340 | }, 1341 | "outputs": [], 1342 | "source": [ 1343 | "# perform SVD for 3 concepts\n", 1344 | "u, s, vh = np.linalg.svd(u_t_m, full_matrices=False)" 1345 | ] 1346 | }, 1347 | { 1348 | "cell_type": "code", 1349 | "execution_count": 19, 1350 | "metadata": { 1351 | "slideshow": { 1352 | "slide_type": "subslide" 1353 | } 1354 | }, 1355 | "outputs": [ 1356 | { 1357 | "name": "stdout", 1358 | "output_type": "stream", 1359 | "text": [ 1360 | "U of size (7, 3) :\n", 1361 | "[[-0.1376 0.0236 0.01081]\n", 1362 | " [-0.4128 0.07086 0.03244]\n", 1363 | " [-0.5503 0.0944 0.04324]\n", 1364 | " [-0.688 0.11804 0.05405]\n", 1365 | " [-0.1528 -0.5913 -0.654 ]\n", 1366 | " [-0.0722 -0.7314 0.678 ]\n", 1367 | " [-0.0764 -0.2957 -0.327 ]]\n" 1368 | ] 1369 | } 1370 | ], 1371 | "source": [ 1372 | "print(\"U of size\", u[:,:3].shape, \":\")\n", 1373 | "print(u[:,:3].astype(np.float16))" 1374 | ] 1375 | }, 1376 | { 1377 | "cell_type": "code", 1378 | "execution_count": 21, 1379 | "metadata": { 1380 | "slideshow": { 1381 | "slide_type": "subslide" 1382 | } 1383 | }, 1384 | "outputs": [ 1385 | { 1386 | "name": "stdout", 1387 | "output_type": "stream", 1388 | "text": [ 1389 | "Singular values:\n", 1390 | "as a matrix:\n", 1391 | "[[12.484 0. 0. ]\n", 1392 | " [ 0. 9.51 0. ]\n", 1393 | " [ 0. 0. 1.346]]\n" 1394 | ] 1395 | } 1396 | ], 1397 | "source": [ 1398 | "print(\"Singular values:\")\n", 1399 | "print(\"as a matrix:\")\n", 1400 | "print(np.diag(s[:3]).astype(np.float16))" 1401 | ] 1402 | }, 1403 | { 1404 | "cell_type": "code", 1405 | "execution_count": 22, 1406 | "metadata": { 1407 | "slideshow": { 1408 | "slide_type": "subslide" 1409 | } 1410 | }, 1411 | "outputs": [ 1412 | { 1413 | "name": "stdout", 1414 | "output_type": "stream", 1415 | "text": [ 1416 | "V of size (3, 5) :\n", 1417 | "[[-0.5625 -0.593 -0.5625 -0.09015 -0.09015]\n", 1418 | " [ 0.1266 -0.02878 0.1266 -0.6953 -0.6953 ]\n", 1419 | " [ 0.4097 -0.8047 0.4097 0.09125 0.09125]]\n" 1420 | ] 1421 | } 1422 | ], 1423 | "source": [ 1424 | "print(\"V of size\", vh[:3,:].shape, \":\")\n", 1425 | "print(vh[:3,:].astype(np.float16))" 1426 | ] 1427 | }, 1428 | { 1429 | "cell_type": "code", 1430 | "execution_count": 23, 1431 | "metadata": {}, 1432 | "outputs": [ 1433 | { 1434 | "name": "stdout", 1435 | "output_type": "stream", 1436 | "text": [ 1437 | "reconstruction of user-to-movie:\n" 1438 | ] 1439 | }, 1440 | { 1441 | "data": { 1442 | "text/html": [ 1443 | "
\n", 1444 | "\n", 1457 | "\n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | "
MatrixAlienSerenityCasablancaAmelie
User 11.01.01.00.00.0
User 23.03.03.0-0.0-0.0
User 34.04.04.00.0-0.0
User 45.05.05.0-0.0-0.0
User 50.02.0-0.04.04.0
User 60.00.0-0.05.05.0
User 70.01.0-0.02.02.0
\n", 1527 | "
" 1528 | ], 1529 | "text/plain": [ 1530 | " Matrix Alien Serenity Casablanca Amelie\n", 1531 | "User 1 1.0 1.0 1.0 0.0 0.0\n", 1532 | "User 2 3.0 3.0 3.0 -0.0 -0.0\n", 1533 | "User 3 4.0 4.0 4.0 0.0 -0.0\n", 1534 | "User 4 5.0 5.0 5.0 -0.0 -0.0\n", 1535 | "User 5 0.0 2.0 -0.0 4.0 4.0\n", 1536 | "User 6 0.0 0.0 -0.0 5.0 5.0\n", 1537 | "User 7 0.0 1.0 -0.0 2.0 2.0" 1538 | ] 1539 | }, 1540 | "execution_count": 23, 1541 | "metadata": {}, 1542 | "output_type": "execute_result" 1543 | } 1544 | ], 1545 | "source": [ 1546 | "# reconstruct the user-to-movie matrix\n", 1547 | "A_aprox = u[:,:3] @ np.diag(s[:3]) @ vh[:3,:]\n", 1548 | "A_aprox_df = pd.DataFrame(A_aprox.astype(np.float16), columns=['Matrix', 'Alien', 'Serenity', 'Casablanca', 'Amelie'],\n", 1549 | " index=['User 1', 'User 2','User 3', 'User 4', 'User 5', 'User 6', 'User 7'])\n", 1550 | "print(\"reconstruction of user-to-movie:\")\n", 1551 | "A_aprox_df" 1552 | ] 1553 | }, 1554 | { 1555 | "cell_type": "markdown", 1556 | "metadata": { 1557 | "slideshow": { 1558 | "slide_type": "subslide" 1559 | } 1560 | }, 1561 | "source": [ 1562 | "" 1563 | ] 1564 | }, 1565 | { 1566 | "cell_type": "markdown", 1567 | "metadata": { 1568 | "slideshow": { 1569 | "slide_type": "slide" 1570 | } 1571 | }, 1572 | "source": [ 1573 | "### Recommended Videos\n", 1574 | "---\n", 1575 | "#### Warning!\n", 1576 | "* These videos do not replace the lectures and tutorials.\n", 1577 | "* Please use these to get a better understanding of the material, and not as an alternative to the written material.\n", 1578 | "\n", 1579 | "#### Video By Subject\n", 1580 | "* Basic Linear Algebra - Mathematics for Machine Learning full Course || Linear Algebra || Part-1\n", 1581 | "* SVD - Lecture 47 — Singular Value Decomposition | Stanford University\n" 1582 | ] 1583 | }, 1584 | { 1585 | "cell_type": "markdown", 1586 | "metadata": { 1587 | "slideshow": { 1588 | "slide_type": "skip" 1589 | } 1590 | }, 1591 | "source": [ 1592 | "## Credits\n", 1593 | "---\n", 1594 | "* Inspired by slides by Elad Osherov and slides from MMDS\n", 1595 | "* Icons from Icon8.com - https://icons8.com\n", 1596 | "* Datasets from Kaggle - https://www.kaggle.com/" 1597 | ] 1598 | } 1599 | ], 1600 | "metadata": { 1601 | "kernelspec": { 1602 | "display_name": "Python 3", 1603 | "language": "python", 1604 | "name": "python3" 1605 | }, 1606 | "language_info": { 1607 | "codemirror_mode": { 1608 | "name": "ipython", 1609 | "version": 3 1610 | }, 1611 | "file_extension": ".py", 1612 | "mimetype": "text/x-python", 1613 | "name": "python", 1614 | "nbconvert_exporter": "python", 1615 | "pygments_lexer": "ipython3", 1616 | "version": "3.6.9" 1617 | } 1618 | }, 1619 | "nbformat": 4, 1620 | "nbformat_minor": 2 1621 | } 1622 | -------------------------------------------------------------------------------- /cs236756_tutorial_11_boosting_bagging.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# CS 236756 - Technion - Intro to Machine Learning\n", 12 | "---\n", 13 | "#### Tal Daniel\n", 14 | "\n", 15 | "## Tutorial 11 - Boosting & Bagging\n", 16 | "---\n", 17 | "\n", 18 | "\n", 19 | "\n", 20 | "Image Source" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": { 26 | "slideshow": { 27 | "slide_type": "slide" 28 | } 29 | }, 30 | "source": [ 31 | "### Agenda\n", 32 | "---\n", 33 | "* [Ensemble 
Learning](#-Ensemble-Learning)\n", 34 | " * [Voting Classifiers](#-Voting-Classifiers)\n", 35 | "* [Bagging (& Pasting)](#-Bagging-(&-Pasting))\n", 36 | " * [Bootstrap](#Bootstrap)\n", 37 | "* [Boosting](#-Boosting)\n", 38 | " * [AdaBoost](#-AdaBoost)\n", 39 | "* [Recommended Videos](#-Recommended-Videos)\n", 40 | "* [Credits](#-Credits)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 15, 46 | "metadata": { 47 | "slideshow": { 48 | "slide_type": "skip" 49 | } 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "# imports for the tutorial\n", 54 | "import numpy as np\n", 55 | "import pandas as pd\n", 56 | "import matplotlib.pyplot as plt\n", 57 | "%matplotlib notebook\n", 58 | "\n", 59 | "from sklearn.metrics import accuracy_score\n", 60 | "from sklearn.ensemble import RandomForestClassifier\n", 61 | "from sklearn.ensemble import VotingClassifier\n", 62 | "from sklearn.ensemble import BaggingClassifier\n", 63 | "from sklearn.ensemble import AdaBoostClassifier\n", 64 | "from sklearn.tree import DecisionTreeClassifier\n", 65 | "from sklearn.tree import DecisionTreeClassifier\n", 66 | "from sklearn.linear_model import LogisticRegression\n", 67 | "from sklearn.svm import SVC\n", 68 | "from sklearn.preprocessing import StandardScaler\n", 69 | "\n", 70 | "import warnings\n", 71 | "warnings.filterwarnings(\"ignore\", category=DeprecationWarning) " 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": { 77 | "slideshow": { 78 | "slide_type": "slide" 79 | } 80 | }, 81 | "source": [ 82 | "## Ensemble Learning\n", 83 | "---\n", 84 | "* **Wisdom of the Crowd** - assembling the predictions of a group of predictors (such as classifiers or regressors) often results in a better prediction than with the best individual predictor.\n", 85 | "* **Ensemble** - a group of predictors. An *Ensemble Learning* algorithm is called an **Ensemble method**.\n", 86 | " * For example: **Random Forest** -train a group of Decision Tree classifiers, each is trained on a random subset of the training set. To make predicitons, we obtain the predicitons of all individual trees, and then predict the class that gets the most votes. This is one of the most powerful ML algorithms available today." 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": { 92 | "slideshow": { 93 | "slide_type": "slide" 94 | } 95 | }, 96 | "source": [ 97 | "### Voting Classifiers\n", 98 | "---\n", 99 | "* **Hard Voting Classifier** - aggregate the predictions of each classifier and predict the class that gets the most votes.\n", 100 | " * In fact, even if each classifier is a *weak learner* (it does only slightly better than random guessing), the ensemble can still be a *strong learner* (achieving high accuracy), provided there are a sufficient number of weak learners and they are sufficiently diverse.\n", 101 | " * **The Law of Large Numbers** - how can the above fact be explained? 
building an ensemble containing 1,000 classifiers that are individually correct only 51% of the time (slighly better than random guessing) and predict the majority voted class, it is possible to reach 75% accuracy if all the classifiers are perfectly independent (which is not really the case since they are trained on the same data).\n", 102 | " * One way to get diverse classifiers is to train them using very different algorithms (increases the chance that they will make very different types of erros and thus improving the ensemble's accuracy).\n", 103 | "* **Soft Voting Classifier** - if all the classifiers are able to estimate class probabilities, then the class probability can be averaged over all the individual classifiers.\n", 104 | " * It often achieves higher performance than *hard voting* because it gives more weight to highly confident votes. " 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 2, 110 | "metadata": { 111 | "slideshow": { 112 | "slide_type": "subslide" 113 | } 114 | }, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "total samples: 569\n", 121 | "total positive sampels (M): 212, total negative samples (B): 357\n" 122 | ] 123 | }, 124 | { 125 | "data": { 126 | "text/html": [ 127 | "
\n", 128 | "\n", 141 | "\n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | "
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_mean...texture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worstUnnamed: 32
237883263M20.4821.46132.501306.00.083550.083480.090420.060220...26.17161.701750.00.122800.231100.315800.144500.22380.07127NaN
159871149B10.9012.9668.69366.80.075150.037180.003090.006588...18.2078.07470.00.117100.082940.018540.039530.27380.07685NaN
44290944601B13.7815.7988.37585.90.088170.067180.010550.009937...17.5097.90706.60.107200.107100.035170.033120.18590.06810NaN
2838912280M16.2418.77108.80805.10.106600.180200.194800.090520...25.09126.901031.00.136500.470600.502600.173200.27700.10630NaN
477911673B13.9016.6288.97599.40.068280.053190.022240.013390...21.80101.20718.90.093840.200600.138400.062220.26790.07698NaN
45857010M18.6517.60123.701076.00.109900.168600.197400.100900...21.32150.601567.00.167900.509000.734500.237800.37990.09185NaN
127866203M19.0018.91123.401138.00.082170.080280.092710.056270...25.73148.201538.00.102100.226400.320700.121800.28410.06541NaN
561925311B11.2029.3770.67386.00.074490.035580.000000.000000...38.3075.19439.60.092670.054940.000000.000000.15660.05905NaN
120865137B11.4110.8273.34403.30.093730.066850.035120.026230...15.9783.74510.50.154800.239000.210200.089580.30160.08523NaN
4449110127M18.0316.85117.50990.00.089470.123200.109000.062540...22.02133.301292.00.126300.266600.429000.153500.28420.08225NaN
\n", 411 | "

10 rows × 33 columns

\n", 412 | "
" 413 | ], 414 | "text/plain": [ 415 | " id diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n", 416 | "237 883263 M 20.48 21.46 132.50 1306.0 \n", 417 | "159 871149 B 10.90 12.96 68.69 366.8 \n", 418 | "442 90944601 B 13.78 15.79 88.37 585.9 \n", 419 | "283 8912280 M 16.24 18.77 108.80 805.1 \n", 420 | "477 911673 B 13.90 16.62 88.97 599.4 \n", 421 | "45 857010 M 18.65 17.60 123.70 1076.0 \n", 422 | "127 866203 M 19.00 18.91 123.40 1138.0 \n", 423 | "561 925311 B 11.20 29.37 70.67 386.0 \n", 424 | "120 865137 B 11.41 10.82 73.34 403.3 \n", 425 | "444 9110127 M 18.03 16.85 117.50 990.0 \n", 426 | "\n", 427 | " smoothness_mean compactness_mean concavity_mean concave points_mean \\\n", 428 | "237 0.08355 0.08348 0.09042 0.060220 \n", 429 | "159 0.07515 0.03718 0.00309 0.006588 \n", 430 | "442 0.08817 0.06718 0.01055 0.009937 \n", 431 | "283 0.10660 0.18020 0.19480 0.090520 \n", 432 | "477 0.06828 0.05319 0.02224 0.013390 \n", 433 | "45 0.10990 0.16860 0.19740 0.100900 \n", 434 | "127 0.08217 0.08028 0.09271 0.056270 \n", 435 | "561 0.07449 0.03558 0.00000 0.000000 \n", 436 | "120 0.09373 0.06685 0.03512 0.026230 \n", 437 | "444 0.08947 0.12320 0.10900 0.062540 \n", 438 | "\n", 439 | " ... texture_worst perimeter_worst area_worst smoothness_worst \\\n", 440 | "237 ... 26.17 161.70 1750.0 0.12280 \n", 441 | "159 ... 18.20 78.07 470.0 0.11710 \n", 442 | "442 ... 17.50 97.90 706.6 0.10720 \n", 443 | "283 ... 25.09 126.90 1031.0 0.13650 \n", 444 | "477 ... 21.80 101.20 718.9 0.09384 \n", 445 | "45 ... 21.32 150.60 1567.0 0.16790 \n", 446 | "127 ... 25.73 148.20 1538.0 0.10210 \n", 447 | "561 ... 38.30 75.19 439.6 0.09267 \n", 448 | "120 ... 15.97 83.74 510.5 0.15480 \n", 449 | "444 ... 22.02 133.30 1292.0 0.12630 \n", 450 | "\n", 451 | " compactness_worst concavity_worst concave points_worst symmetry_worst \\\n", 452 | "237 0.23110 0.31580 0.14450 0.2238 \n", 453 | "159 0.08294 0.01854 0.03953 0.2738 \n", 454 | "442 0.10710 0.03517 0.03312 0.1859 \n", 455 | "283 0.47060 0.50260 0.17320 0.2770 \n", 456 | "477 0.20060 0.13840 0.06222 0.2679 \n", 457 | "45 0.50900 0.73450 0.23780 0.3799 \n", 458 | "127 0.22640 0.32070 0.12180 0.2841 \n", 459 | "561 0.05494 0.00000 0.00000 0.1566 \n", 460 | "120 0.23900 0.21020 0.08958 0.3016 \n", 461 | "444 0.26660 0.42900 0.15350 0.2842 \n", 462 | "\n", 463 | " fractal_dimension_worst Unnamed: 32 \n", 464 | "237 0.07127 NaN \n", 465 | "159 0.07685 NaN \n", 466 | "442 0.06810 NaN \n", 467 | "283 0.10630 NaN \n", 468 | "477 0.07698 NaN \n", 469 | "45 0.09185 NaN \n", 470 | "127 0.06541 NaN \n", 471 | "561 0.05905 NaN \n", 472 | "120 0.08523 NaN \n", 473 | "444 0.08225 NaN \n", 474 | "\n", 475 | "[10 rows x 33 columns]" 476 | ] 477 | }, 478 | "execution_count": 2, 479 | "metadata": {}, 480 | "output_type": "execute_result" 481 | } 482 | ], 483 | "source": [ 484 | "# let's load the cancer dataset, shuffle it and speratre into train and test set\n", 485 | "dataset = pd.read_csv('./datasets/cancer_dataset.csv')\n", 486 | "# print the number of rows in the data set\n", 487 | "number_of_rows = len(dataset)\n", 488 | "print(\"total samples: {}\".format(number_of_rows))\n", 489 | "total_positive_samples = np.sum(dataset['diagnosis'].values == 'M')\n", 490 | "print(\"total positive sampels (M): {}, total negative samples (B): {}\".format(total_positive_samples, number_of_rows - total_positive_samples))\n", 491 | "num_train = int(0.8 * number_of_rows)\n", 492 | "# reminder, the data looks like this\n", 493 | "# dataset.head(10) # the dataset is ordered by the 
diagnosis\n", 494 | "dataset.sample(10)" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 3, 500 | "metadata": { 501 | "slideshow": { 502 | "slide_type": "subslide" 503 | } 504 | }, 505 | "outputs": [], 506 | "source": [ 507 | "# prepare the dataset\n", 508 | "# we will take the first 2 features as our data (X) and the diagnosis as labels (y)\n", 509 | "x = dataset[['radius_mean', 'texture_mean', 'concavity_mean']].values\n", 510 | "y = dataset['diagnosis'].values == 'M' # 1 for Malignat, 0 for Benign\n", 511 | "# shuffle\n", 512 | "rand_gen = np.random.RandomState(0)\n", 513 | "shuffled_indices = rand_gen.permutation(np.arange(len(x)))" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 4, 519 | "metadata": { 520 | "slideshow": { 521 | "slide_type": "subslide" 522 | } 523 | }, 524 | "outputs": [], 525 | "source": [ 526 | "x_train = x[shuffled_indices[:num_train]]\n", 527 | "y_train = y[shuffled_indices[:num_train]]\n", 528 | "x_test = x[shuffled_indices[num_train:]]\n", 529 | "y_test = y[shuffled_indices[num_train:]]" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 5, 535 | "metadata": { 536 | "slideshow": { 537 | "slide_type": "subslide" 538 | } 539 | }, 540 | "outputs": [ 541 | { 542 | "name": "stdout", 543 | "output_type": "stream", 544 | "text": [ 545 | "total training samples: 455, total test samples: 114\n" 546 | ] 547 | } 548 | ], 549 | "source": [ 550 | "# pre-process - standartization\n", 551 | "scaler = StandardScaler()\n", 552 | "scaler.fit(x_train)\n", 553 | "x_train = scaler.transform(x_train)\n", 554 | "x_test = scaler.transform(x_test)\n", 555 | "\n", 556 | "print(\"total training samples: {}, total test samples: {}\".format(num_train, number_of_rows - num_train))" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 9, 562 | "metadata": { 563 | "slideshow": { 564 | "slide_type": "subslide" 565 | } 566 | }, 567 | "outputs": [], 568 | "source": [ 569 | "# hard voting\n", 570 | "random_state = 38\n", 571 | "# create different classifiers\n", 572 | "log_clf = LogisticRegression(random_state=random_state, solver='lbfgs')\n", 573 | "rnd_clf = RandomForestClassifier(random_state=random_state, n_estimators=100)\n", 574 | "svm_clf = SVC(random_state=random_state)\n", 575 | "# create a voting classifier\n", 576 | "voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)], voting='hard')\n", 577 | "# voting_clf.fit(x_train, y_train)" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 10, 583 | "metadata": { 584 | "scrolled": false 585 | }, 586 | "outputs": [ 587 | { 588 | "name": "stdout", 589 | "output_type": "stream", 590 | "text": [ 591 | "LogisticRegression 0.9385964912280702\n", 592 | "RandomForestClassifier 0.9298245614035088\n", 593 | "SVC 0.9473684210526315\n", 594 | "VotingClassifier 0.9473684210526315\n" 595 | ] 596 | } 597 | ], 598 | "source": [ 599 | "# let's look at each classifier's accuracy on the test set\n", 600 | "for clf in (log_clf, rnd_clf, svm_clf, voting_clf):\n", 601 | " clf.fit(x_train, y_train)\n", 602 | " y_pred = clf.predict(x_test)\n", 603 | " print(clf.__class__.__name__, accuracy_score(y_test, y_pred))" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": { 609 | "slideshow": { 610 | "slide_type": "slide" 611 | } 612 | }, 613 | "source": [ 614 | "## Bagging (& Pasting)\n", 615 | "---\n", 616 | "* Another approach to get a diverse set of classifiers is to use the 
**same training algorithm** for every predictor, but to train them on **different random subsets of the training set**.\n", 617 | "* When sampling is performed **with replacement** this method is called **bagging** (which is a short for *bootstrap aggregating*).\n", 618 | " * In sampling **with replacement**, each sample unit of the population can occur one or more times in the sample.\n", 619 | " * In statistics, resampling with replacement is called *bootstrapping*.\n", 620 | "* When sampling is performed **without replacement** this method is called **pasting**.\n", 621 | "* Thus, both bagging and pasting allow training instances to be sampled several times across multiple predictors, but only bagging allows training instances to be sampled several times for the same predictor." 622 | ] 623 | }, 624 | { 625 | "cell_type": "markdown", 626 | "metadata": { 627 | "slideshow": { 628 | "slide_type": "subslide" 629 | } 630 | }, 631 | "source": [ 632 | "* Illustartion: \n", 633 | "\n", 634 | "Image Source" 635 | ] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "metadata": { 640 | "slideshow": { 641 | "slide_type": "subslide" 642 | } 643 | }, 644 | "source": [ 645 | "* Once all predictors are trained, the ensemble can make a prediction for a new instance by collecting all the predictions of all the predictors. It usually decided by *hard voting* or average for regression.\n", 646 | "* Each individual predictor has a higher bias than if it were trained on the original training set, but the aggregation **reduces both bias and variance**.\n", 647 | " * It is common to see that the ensemble has a **similar bias** but a **lower variance** than a single predictor trained on the original training set." 648 | ] 649 | }, 650 | { 651 | "cell_type": "markdown", 652 | "metadata": { 653 | "slideshow": { 654 | "slide_type": "subslide" 655 | } 656 | }, 657 | "source": [ 658 | "#### Bootstrap\n", 659 | "---\n", 660 | "* **Bootstrap Algorithm**:\n", 661 | " * Denote the original sample: $ L_N = (x_1, x_2, ..., x_N) $\n", 662 | " * Repeat $M$ times:\n", 663 | " * Generate a sample $L_k$ of size $k$ from $L_N$ by sampling *with replacement*.\n", 664 | " * Compute $h$ from $L_k$ (that is, train a predictor $h$ using $L_k$).\n", 665 | " * Denote the bootstrap values $H=(h^1, h^2, ..., h^M)$\n", 666 | " * Use these values for calculating all the quantities of interest." 667 | ] 668 | }, 669 | { 670 | "cell_type": "markdown", 671 | "metadata": { 672 | "slideshow": { 673 | "slide_type": "subslide" 674 | } 675 | }, 676 | "source": [ 677 | "* **Bagging**:\n", 678 | " * Train each model with a random training set (bootsrap).\n", 679 | " * Each model in the ensemble has an **equal weight** in the voting.\n", 680 | " * Finally: $$ H(x) = sign(h^1(x) +h^2(x) +... +h^M(x)) $$" 681 | ] 682 | }, 683 | { 684 | "cell_type": "markdown", 685 | "metadata": { 686 | "slideshow": { 687 | "slide_type": "subslide" 688 | } 689 | }, 690 | "source": [ 691 | "* One classifier can be wrong as long as the others are correct (*hard voting*) " 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": {}, 697 | "source": [ 698 | "* Since given equal weight, this may cause problems when there is overlap. 
" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 13, 704 | "metadata": { 705 | "slideshow": { 706 | "slide_type": "subslide" 707 | } 708 | }, 709 | "outputs": [ 710 | { 711 | "name": "stdout", 712 | "output_type": "stream", 713 | "text": [ 714 | "bagging accuracy: 0.939\n" 715 | ] 716 | } 717 | ], 718 | "source": [ 719 | "# bagging\n", 720 | "# note: BaggingClassifiers will automatically perform 'soft voting' instead of 'hard voting'\n", 721 | "# if the base classifier can estimate class probabilities (i.e. if it has a \"predict_proba()\" method).\n", 722 | "\n", 723 | "bag_clf = BaggingClassifier(\n", 724 | " DecisionTreeClassifier(),\n", 725 | " n_estimators=500,\n", 726 | " max_samples=100,\n", 727 | " bootstrap=True,\n", 728 | " n_jobs=1)\n", 729 | "bag_clf.fit(x_train, y_train)\n", 730 | "y_pred = bag_clf.predict(x_test)\n", 731 | "bag_acc = accuracy_score(y_test, y_pred)\n", 732 | "print(\"bagging accuracy: {:.3f}\".format(bag_acc))" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": 14, 738 | "metadata": { 739 | "scrolled": true, 740 | "slideshow": { 741 | "slide_type": "subslide" 742 | } 743 | }, 744 | "outputs": [ 745 | { 746 | "name": "stdout", 747 | "output_type": "stream", 748 | "text": [ 749 | "pasting accuracy: 0.930\n" 750 | ] 751 | } 752 | ], 753 | "source": [ 754 | "# pasting\n", 755 | "pas_clf = BaggingClassifier(\n", 756 | " DecisionTreeClassifier(),\n", 757 | " n_estimators=500,\n", 758 | " max_samples=100,\n", 759 | " bootstrap=False,\n", 760 | " n_jobs=1)\n", 761 | "pas_clf.fit(x_train, y_train)\n", 762 | "y_pred = pas_clf.predict(x_test)\n", 763 | "pas_acc = accuracy_score(y_test, y_pred)\n", 764 | "print(\"pasting accuracy: {:.3f}\".format(pas_acc))" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": { 770 | "slideshow": { 771 | "slide_type": "slide" 772 | } 773 | }, 774 | "source": [ 775 | "## Boosting\n", 776 | "---\n", 777 | "* **Boosting** (also *hypothesis boosting*) - any Ensemble method that can combine several weak learners into a strong learner. In boosting methods, predictors are trained **sequentially**, each trying to correct its predecessor.\n", 778 | " * Weak Learner - as before, the error rate is slighty better than flipping a coin\n", 779 | " * We also define:\n", 780 | " * $h$ is binary classifier such that $h \\in \\{-1, 1\\}$\n", 781 | " * Error rate $Err \\in [0,1]$\n", 782 | "* The principal difference between boosting and the committe methods is that in boosting, the base classifiers are **trained in sequence**." 783 | ] 784 | }, 785 | { 786 | "cell_type": "markdown", 787 | "metadata": { 788 | "slideshow": { 789 | "slide_type": "subslide" 790 | } 791 | }, 792 | "source": [ 793 | "* Each base classifier is trained using a **weighted form of the dataset**, in which the weight coefficient associated with each data point depends on the performance of the previous classifiers.\n", 794 | " * In particular, points that are misclassified by one of the base classifiers are given greater weight when used to train the next classifier in the sequence.\n", 795 | "* Once all the classifiers have been trained, their predictions are then combined through a **weighted majority voting** scheme." 
796 | ] 797 | }, 798 | { 799 | "cell_type": "markdown", 800 | "metadata": { 801 | "slideshow": { 802 | "slide_type": "subslide" 803 | } 804 | }, 805 | "source": [ 806 | "* Visually: " 807 | ] 808 | }, 809 | { 810 | "cell_type": "markdown", 811 | "metadata": { 812 | "slideshow": { 813 | "slide_type": "subslide" 814 | } 815 | }, 816 | "source": [ 817 | "\n", 818 | "\n", 819 | "* There are many boosting methods, but we will examine one of the most popular one called *AdaBoost*." 820 | ] 821 | }, 822 | { 823 | "cell_type": "markdown", 824 | "metadata": { 825 | "slideshow": { 826 | "slide_type": "slide" 827 | } 828 | }, 829 | "source": [ 830 | "### AdaBoost\n", 831 | "---\n", 832 | "* The idea of AdaBoost is to give more attention to training instances that the predecessor underfitted. This leads to a predictor that focuses more and more on the hard cases.\n", 833 | "* The sequential learning in Boosting seems similar to Gradient Descent, only in AdaBoost predictors are added to the ensemble in order to make it better where in GD, a single predictor's paramerters are optimized to minimize an objective function.\n", 834 | "* Once all predictors are trained, the ensemble makes predictions by assigning different weights to each predictor, depending on their **overall accuracy on the weighted training set**." 835 | ] 836 | }, 837 | { 838 | "cell_type": "markdown", 839 | "metadata": { 840 | "slideshow": { 841 | "slide_type": "subslide" 842 | } 843 | }, 844 | "source": [ 845 | "#### Definitions\n", 846 | "---\n", 847 | "* Class labels are $\\{-1, 1\\}$\n", 848 | "* $m$ - number of samples in the training dataset\n", 849 | "* The weighted error rate of the $t^{th}$ predictor: $$ \\epsilon_t =\\sum _{i=1}^m w^{(i)} \\cdot \\mathbb{1}(\\hat{y}_t^{(i)} \\neq y^{(i)})$$ In the more general case where the weights are not normalized to 1: $$ \\epsilon_t =\\frac{\\sum _{i=1}^m w^{(i)} \\cdot \\mathbb{1}(\\hat{y}_t^{(i)} \\neq y^{(i)})}{\\sum _{i=1}^m w^{(i)}} $$\n", 850 | " * $\\hat{y}_t^{(i)}$ is the $t^{th}$ predictor's prediction for the $i^{th}$ instance." 851 | ] 852 | }, 853 | { 854 | "cell_type": "markdown", 855 | "metadata": { 856 | "slideshow": { 857 | "slide_type": "subslide" 858 | } 859 | }, 860 | "source": [ 861 | "* The predictors weight of the $t^{th}$ predictor: $$ \\alpha_t = \\eta \\ln \\frac{1 - \\epsilon_t}{\\epsilon_t} $$\n", 862 | " * $\\eta$ it the learning rate hyperparameter, e.g. $\\frac{1}{2}$ or 1.\n", 863 | " * The more accurate the predictor is, the more weight the predictor will be given.\n", 864 | "* The update rule: for $i = 1,2, ..., m $ $$ w^{(i)} \\leftarrow \\begin{cases} w^{(i)}e^{-\\alpha_t} & \\quad \\text{if } \\hat{y}_t^{(i)} = y^{(i)} \\\\ w^{(i)}e^{\\alpha_t} & \\quad \\text{if } \\hat{y}_t^{(i)} \\neq y^{(i)} \\end{cases} = w^{(i)}e^{-\\alpha_t\\cdot y^{(i)} \\cdot \\hat{y}_t^{(i)}}$$\n", 865 | " * Once all the weights were calculated, they are summed. The sum is denoted $Z_t$. Then, all the weights are normalized by dividing each weight by $Z_t$.\n", 866 | "* **Stopping criteria**:\n", 867 | " * The desired number of predictors is reached.\n", 868 | " * A perfert predictor is found." 
869 | ] 870 | }, 871 | { 872 | "cell_type": "markdown", 873 | "metadata": { 874 | "slideshow": { 875 | "slide_type": "subslide" 876 | } 877 | }, 878 | "source": [ 879 | "\n", 880 | "\n", 881 | "Image Source" 882 | ] 883 | }, 884 | { 885 | "cell_type": "markdown", 886 | "metadata": { 887 | "slideshow": { 888 | "slide_type": "subslide" 889 | } 890 | }, 891 | "source": [ 892 | "* **The AdaBoost Algorithm**:\n", 893 | " * Initialize the data weights coefficients $\\{w^{(i)}\\}_{i=1}^m$: $$ w^{(i)} = \\frac{1}{m}, \\forall i= 1,2,...,m $$\n", 894 | " * For $t = 1,...,T$:\n", 895 | " * Fit a weak classifier $h_t(x)$ (which makes predictions $\\hat{y}_t$) to the weighted training data and calculate the weighted error rate: $$ \\epsilon_t =\\frac{\\sum _{i=1}^m w^{(i)} \\cdot \\mathbb{1}(\\hat{y}_t^{(i)} \\neq y^{(i)})}{\\sum _{i=1}^m w^{(i)}} $$\n", 896 | " * Choose $\\alpha_t$ (default $\\eta=\\frac{1}{2}$): $$ \\alpha_t = \\frac{1}{2} \\ln \\frac{1 - \\epsilon_t}{\\epsilon_t} $$\n", 897 | " * Update the weights: for $i = 1,2, ..., m $ $$ w^{(i)} \\leftarrow \\begin{cases} w^{(i)}e^{-\\alpha_t} & \\quad \\text{if } \\hat{y}_t^{(i)} = y^{(i)} \\\\ w^{(i)}e^{\\alpha_t} & \\quad \\text{if } \\hat{y}_t^{(i)} \\neq y^{(i)} \\end{cases} = w^{(i)}e^{-\\alpha_t\\cdot y^{(i)} \\cdot \\hat{y}_t^{(i)}}$$\n", 898 | " * Normalize the weights: for $i = 1,2, ..., m $ $$ w^{(i)} \\leftarrow \\frac{w^{(i)}}{Z_t} $$\n", 899 | " * $Z_t = \\sum_{i=1}^m w^{(i)}$\n", 900 | " * Use predictions using the final model, which is given by: $$ H(x) = sign(\\sum_{i=1}^T \\alpha_th_t(x)) $$" 901 | ] 902 | }, 903 | { 904 | "cell_type": "markdown", 905 | "metadata": { 906 | "slideshow": { 907 | "slide_type": "slide" 908 | } 909 | }, 910 | "source": [ 911 | "#### Exponential Loss\n", 912 | "---\n", 913 | "* So far, the loss functions we have seen:\n", 914 | " * 0-1 loss\n", 915 | " * Hinge loss\n", 916 | " * Log loss\n", 917 | "* Unlike previously learnt classifiers, AdaBoost minimzes the exponential loss.\n", 918 | "* All lossess upper bound the 0-1 loss and act as differentiable surrogate loss functions.\n", 919 | "* \n" 920 | ] 921 | }, 922 | { 923 | "cell_type": "markdown", 924 | "metadata": { 925 | "slideshow": { 926 | "slide_type": "subslide" 927 | } 928 | }, 929 | "source": [ 930 | "* Optimizing the exponential loss:\n", 931 | " * As shown in class, the training error is upper bounded by $H$: $$ \\frac{1}{m} \\sum_i^m \\mathbb{1}(H(x_i) \\neq y_i) \\leq \\prod_{t=1}^T Z_t $$\n", 932 | " * $Z_t = \\sum_i w_t^{(i)} e^{-\\alpha_t y_i h_t(x_i)} $\n", 933 | " * At each round we minimize $Z_t$ by:\n", 934 | " * Choosing the optimal $h_t$\n", 935 | " * Finding the optimal $\\alpha_t$\n", 936 | " * $$ \\frac{dZ}{d\\alpha} = -\\sum_{i=1}^m w^{(i)} y_ih(x_i) e^{-\\alpha y_ih(x_i)} = 0 $$ $$ -\\sum_{i:y_i=h(x_i)}w^{(i)} e^{-\\alpha} + \\sum_{i: y_i \\neq h(x_i)} w^{(i)} e^{\\alpha} = 0 $$ $$ -e^{-\\alpha} (1-\\epsilon) +e^{\\alpha} \\epsilon = 0 $$ $$ \\rightarrow \\alpha_t = \\frac{1}{2} \\ln \\frac{1 - \\epsilon_t}{\\epsilon_t} $$" 937 | ] 938 | }, 939 | { 940 | "cell_type": "markdown", 941 | "metadata": { 942 | "slideshow": { 943 | "slide_type": "slide" 944 | } 945 | }, 946 | "source": [ 947 | "### Boosting (AdaBoost) Example By Hand\n", 948 | "---\n", 949 | "Moses is a student who wants to avoid hard courses. 
\n", 950 | "\n", 951 | "In order to achieve this he wants to build a classifier that classifies courses as \"easy\" or \"hard\".\n", 952 | "\n", 953 | "He decides to classify courses' hardness by using AdaBoost with decision trees stumps (decision trees with max depth of 1) on the following data:" 954 | ] 955 | }, 956 | { 957 | "cell_type": "markdown", 958 | "metadata": { 959 | "slideshow": { 960 | "slide_type": "subslide" 961 | } 962 | }, 963 | "source": [ 964 | "|
Course ID
|
Hard
|
Final Exam
|
Theoretical
|
Midterm
|
236*
|
Number of HW
\n", 965 | "| --- | --- | --- | --- | --- | --- | --- |\n", 966 | "|
1
|
Y
|
Y
|
N
|
Y
|
N
|
5
|\n", 967 | "|
2
|
Y
|
N
|
Y
|
Y
|
N
|
5
|\n", 968 | "|
3
|
Y
|
N
|
Y
|
N
|
Y
|
1
|\n", 969 | "|
4
|
Y
|
N
|
Y
|
N
|
N
|
3
|\n", 970 | "|
5
|
Y
|
N
|
Y
|
N
|
N
|
5
|\n", 971 | "|
6
|
Y
|
Y
|
N
|
Y
|
N
|
5
|\n", 972 | "|
7
|
Y
|
Y
|
N
|
Y
|
N
|
5
|\n", 973 | "|
8
|
N
|
N
|
N
|
Y
|
Y
|
1
|\n", 974 | "|
9
|
N
|
N
|
Y
|
N
|
N
|
1
|\n", 975 | "|
10
|
Y
|
N
|
N
|
N
|
N
|
5
|" 976 | ] 977 | }, 978 | { 979 | "cell_type": "markdown", 980 | "metadata": { 981 | "slideshow": { 982 | "slide_type": "subslide" 983 | } 984 | }, 985 | "source": [ 986 | "As a first step, he first determined for each possible classifier (including the trivial constant classifier), which of the data points were misclassfied.\n", 987 | "\n", 988 | "For example, for the first classifier which classfies courses as hard if they have a final exam, the classifier is wrong on samples 2,3,4 and 5." 989 | ] 990 | }, 991 | { 992 | "cell_type": "markdown", 993 | "metadata": { 994 | "slideshow": { 995 | "slide_type": "subslide" 996 | } 997 | }, 998 | "source": [ 999 | "|
Classifier
|
Test
|
Value
|
Misclassified
|\n", 1000 | "| --- | --- | --- | --- |\n", 1001 | "|
A
|
Final Exam
|
Y
|
2,3,4,5
|\n", 1002 | "|
B
|
Theoretical
|
Y
|
1,6,7,9
|\n", 1003 | "|
C
|
Midterm
|
Y
|
3,4,5,8
|\n", 1004 | "|
D
|
Undergrduate
|
Y
|
1,2,4,5,6,7,8
|\n", 1005 | "|
E
|
# HW > 2
|
Y
|
3,10
|\n", 1006 | "|
F
|
# HW > 4
|
Y
|
3,4,10
|\n", 1007 | "|
G
|
True (const)
|
|
8,9,10
|\n", 1008 | "|
H
|
Final Exam
|
N
|
1,6,7,8,9,10
|\n", 1009 | "|
I
|
Theoretical
|
N
|
2,3,4,5,8,10
|\n", 1010 | "|
J
|
Midterm
|
N
|
1,2,6,7,9,10
|\n", 1011 | "|
K
|
Undergraduate
|
N
|
3,9,10
|\n", 1012 | "|
L
|
# HW < 2
|
Y
|
1,2,4,5,6,7,8,9
|\n", 1013 | "|
M
|
# HW < 4
|
Y
|
1,2,5,6,7,8,9
|\n", 1014 | "|
N
|
False (const)
|
|
1,2,3,4,5,6,7
|" 1015 | ] 1016 | }, 1017 | { 1018 | "cell_type": "markdown", 1019 | "metadata": { 1020 | "slideshow": { 1021 | "slide_type": "subslide" 1022 | } 1023 | }, 1024 | "source": [ 1025 | "#### Consider only useful classifiers\n", 1026 | "Only 6 classifiers from the table above would ever be used because the other 8 make all the same error as one of the other classifiers and then make additional erros. For example, classifiers I and N do the same mistakes as A and add to that. The 6 useful classifiers are:\n", 1027 | "\n", 1028 | "\n", 1029 | "|
Classifier
|
Test
|
Value
|
Misclassified
|\n", 1030 | "| --- | --- | --- | --- |\n", 1031 | "|
A
|
Final Exam
|
Y
|
2,3,4,5
|\n", 1032 | "|
B
|
Theoretical
|
Y
|
1,6,7,9
|\n", 1033 | "|
C
|
Midterm
|
Y
|
3,4,5,8
|\n", 1034 | "|
D
|
Undergrduate
|
Y
|
1,2,4,5,6,7,8
|\n", 1035 | "|
E
|
# HW > 2
|
Y
|
3,10
|\n", 1036 | "|
G
|
True (const)
|
|
8,9,10
|" 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": "markdown", 1041 | "metadata": { 1042 | "slideshow": { 1043 | "slide_type": "subslide" 1044 | } 1045 | }, 1046 | "source": [ 1047 | "#### AdaBoost\n", 1048 | "* We will now perform AdaBoost by calculating the weights at each iteration.\n", 1049 | "* We will calculate the 10 weights, the classification $h$, the error and $\\alpha$.\n", 1050 | "* If there is a tie, we break it by choosing the classifier that is higher on the list (lexicographical order)\n", 1051 | "* Note: in this example we assume that the weights of the data points do not affect the clasification and are just meant to calculate the final weight of each classifier." 1052 | ] 1053 | }, 1054 | { 1055 | "cell_type": "markdown", 1056 | "metadata": { 1057 | "slideshow": { 1058 | "slide_type": "subslide" 1059 | } 1060 | }, 1061 | "source": [ 1062 | "#### Round 1\n", 1063 | "* Each weight is given the same value: $\\frac{1}{m} = \\frac{1}{10}$\n", 1064 | "* Since classifier $E$ is the most accurate, it will serve as the classifier.\n", 1065 | "* The weight error rate of classifier $E$ is $\\epsilon_E = \\frac{2}{10}$\n", 1066 | "* Thus: $\\alpha_E = \\frac{1}{2}\\ln \\frac{1 - \\epsilon_E}{\\epsilon_E} = \\frac{1}{2} \\ln (4)$" 1067 | ] 1068 | }, 1069 | { 1070 | "cell_type": "markdown", 1071 | "metadata": { 1072 | "slideshow": { 1073 | "slide_type": "subslide" 1074 | } 1075 | }, 1076 | "source": [ 1077 | "|
Parameters        
|
Round 1
|
Round 2
|
Round 3
|\n", 1078 | "| ----- | --- | --- | --- |\n", 1079 | "|
w1
|
$\\frac{1}{10}$
|
|
|\n", 1080 | "|
w2
|
$\\frac{1}{10}$
|
|
|\n", 1081 | "|
w3
|
$\\frac{1}{10}$
|
|
|\n", 1082 | "|
w4
|
$\\frac{1}{10}$
|
|
|\n", 1083 | "|
w5
|
$\\frac{1}{10}$
|
|
|\n", 1084 | "|
w6
|
$\\frac{1}{10}$
|
|
|\n", 1085 | "|
w7
|
$\\frac{1}{10}$
|
|
|\n", 1086 | "|
w8
|
$\\frac{1}{10}$
|
|
|\n", 1087 | "|
w9
|
$\\frac{1}{10}$
|
|
|\n", 1088 | "|
w10
|
$\\frac{1}{10}$
|
|
|\n", 1089 | "|
$h$
|
$E$
|
|
|\n", 1090 | "|
Err - $\\epsilon$
|
$\\frac{2}{10}$
|
|
|\n", 1091 | "|
$$\\alpha = \\frac{1}{2}\\ln \\frac{1 - \\epsilon}{\\epsilon} $$
|
$\\frac{1}{2} \\ln (4)$
|
|
|" 1092 | ] 1093 | }, 1094 | { 1095 | "cell_type": "markdown", 1096 | "metadata": {}, 1097 | "source": [ 1098 | "#### AdaBoost - calculating the new weights\n", 1099 | "* Recall that the un-normalized weights update: $$ \\tilde{w}_{t+1}^{(i)} = w_t^{(i)} e^{-\\alpha_ty_ih_t(x_i)} $$\n", 1100 | "* For the correctly classified data points (8 points): $$ \\tilde{w}_{t+1}^{(i)} = \\frac{1}{10}e^{-\\frac{1}{2}\\ln (4)} = \\frac{1}{10} \\cdot \\frac{1}{2} = \\frac{1}{20} $$\n", 1101 | "* For the incorrectly classified data points (2 points): $$ \\tilde{w}_{t+1}^{(i)} = \\frac{1}{10}e^{\\frac{1}{2}\\ln (4)} = \\frac{1}{10} \\cdot 2 = \\frac{1}{5} $$\n", 1102 | "* Calculate the normalization factor: $$ Z_t = 8 \\cdot \\frac{1}{20} + 2 \\cdot \\frac{1}{5} = \\frac{4}{5} $$\n", 1103 | "* The final weights after normalization:\n", 1104 | " * Correct: $w_{t+1}^{(i)} = \\frac{1}{20} \\cdot \\frac{5}{4} = \\frac{1}{16}$\n", 1105 | " * Incorrect: $w_{t+1}^{(i)} = \\frac{1}{5} \\cdot \\frac{5}{4} = \\frac{1}{4}$" 1106 | ] 1107 | }, 1108 | { 1109 | "cell_type": "markdown", 1110 | "metadata": { 1111 | "slideshow": { 1112 | "slide_type": "subslide" 1113 | } 1114 | }, 1115 | "source": [ 1116 | "Similarly, we fill in the rest of the table:\n", 1117 | "\n", 1118 | "\n", 1119 | "|
Parameters        
|
Round 1
|
Round 2
|
Round 3
|\n", 1120 | "| ----- | --- | --- | --- |\n", 1121 | "|
w1
|
$\\frac{1}{10}$
|
$\\frac{1}{16}$
|
$\\frac{3}{24}$
|\n", 1122 | "|
w2
|
$\\frac{1}{10}$
|
$\\frac{1}{16}$
|
$\\frac{1}{24}$
|\n", 1123 | "|
w3
|
$\\frac{1}{10}$
|
$\\frac{4}{16}$
|
$\\frac{4}{24}$
|\n", 1124 | "|
w4
|
$\\frac{1}{10}$
|
$\\frac{1}{16}$
|
$\\frac{1}{24}$
|\n", 1125 | "|
w5
|
$\\frac{1}{10}$
|
$\\frac{1}{16}$
|
$\\frac{1}{24}$
|\n", 1126 | "|
w6
|
$\\frac{1}{10}$
|
$\\frac{1}{16}$
|
$\\frac{3}{24}$
|\n", 1127 | "|
w7
|
$\\frac{1}{10}$
|
$\\frac{1}{16}$
|
$\\frac{3}{24}$
|\n", 1128 | "|
w8
|
$\\frac{1}{10}$
|
$\\frac{1}{16}$
|
$\\frac{1}{24}$
|\n", 1129 | "|
w9
|
$\\frac{1}{10}$
|
$\\frac{1}{16}$
|
$\\frac{3}{24}$
|\n", 1130 | "|
w10
|
$\\frac{1}{10}$
|
$\\frac{4}{16}$
|
$\\frac{4}{24}$
|\n", 1131 | "|
$h$
|
$E$
|
$B$
|
$A$
|\n", 1132 | "|
Err - $\\epsilon$
|
$\\frac{2}{10}$
|
$\\frac{1}{4}$
|
$\\frac{7}{24}$
|\n", 1133 | "|
$$\\alpha = \\frac{1}{2}\\ln \\frac{1 - \\epsilon}{\\epsilon} $$
|
$\\frac{1}{2} \\ln (4)$
|
$\\frac{1}{2} \\ln (3)$
|
$\\frac{1}{2} \\ln \\frac{17}{7}$
|" 1134 | ] 1135 | }, 1136 | { 1137 | "cell_type": "markdown", 1138 | "metadata": { 1139 | "slideshow": { 1140 | "slide_type": "subslide" 1141 | } 1142 | }, 1143 | "source": [ 1144 | "#### AdaBoost - Putting the classifiers together\n", 1145 | "* The final classifier for 3 rounds of Boosting: $$ H(x) = sign(\\frac{1}{2} \\ln (4) \\cdot h_E(x) + \\frac{1}{2} \\ln (3) \\cdot h_B(x) + \\frac{1}{2} \\ln \\frac{17}{7} \\cdot h_A(x)) $$\n", 1146 | " * $h_c(x)$ returns +1 or -1 for $c=E,B,A$\n", 1147 | "* The data points that the final classifier is correct about them:\n", 1148 | " * Since $\\alpha_E, \\alpha_B > \\alpha_A$ - it is just a *majority vote*\n", 1149 | " * Only one example (3) is misclassified" 1150 | ] 1151 | }, 1152 | { 1153 | "cell_type": "markdown", 1154 | "metadata": { 1155 | "slideshow": { 1156 | "slide_type": "slide" 1157 | } 1158 | }, 1159 | "source": [ 1160 | "### AdaBoost in Scikit-Learn\n", 1161 | "* Scikit-Learn uses a multiclass version of AdaBoost called *SAMME* (Stagewise Additive Modeling using a Multiclass Exponential loss function).\n", 1162 | " * When there are just 2 classes, SAMME is equivalent to AdaBoost.\n", 1163 | " * If the predictors can estimate class probabilities (i.e. they have a `predict_proba()` method), Scikit-Learn can use a variant of SAMME called *SAMMER* (R for \"Real\"), which relies on class probabilities rather than predictions and generally performs better.\n", 1164 | " \n", 1165 | "* The following code trains an AdaBoost classifier on 600 Decision Stumps.\n", 1166 | "* Note: if the AdaBoost classifier is **overfitting** the training set, a good regularization may be reducing the number of estimators or more strongly regularize the base classifier.\n", 1167 | "* An important drawback to sequential learning is that **it cannot be parallelized**, since each predictor can only be trained after the previous predictor has been trained and evaluated. Thus, it does not scale as well as bagging or pasting." 
1168 | ] 1169 | }, 1170 | { 1171 | "cell_type": "code", 1172 | "execution_count": 16, 1173 | "metadata": { 1174 | "slideshow": { 1175 | "slide_type": "subslide" 1176 | } 1177 | }, 1178 | "outputs": [ 1179 | { 1180 | "name": "stdout", 1181 | "output_type": "stream", 1182 | "text": [ 1183 | "adaboost accuracy: 0.930\n" 1184 | ] 1185 | } 1186 | ], 1187 | "source": [ 1188 | "# AdaBoost\n", 1189 | "ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=600, algorithm=\"SAMME.R\", learning_rate=0.5)\n", 1190 | "ada_clf.fit(x_train, y_train)\n", 1191 | "y_pred = ada_clf.predict(x_test)\n", 1192 | "ada_acc = accuracy_score(y_test, y_pred)\n", 1193 | "print(\"adaboost accuracy: {:.3f}\".format(ada_acc))" 1194 | ] 1195 | }, 1196 | { 1197 | "cell_type": "markdown", 1198 | "metadata": { 1199 | "slideshow": { 1200 | "slide_type": "slide" 1201 | } 1202 | }, 1203 | "source": [ 1204 | "### Recommended Videos\n", 1205 | "---\n", 1206 | "#### Warning!\n", 1207 | "* These videos do not replace the lectures and tutorials.\n", 1208 | "* Please use these to get a better understanding of the material, and not as an alternative to the written material.\n", 1209 | "\n", 1210 | "#### Video By Subject\n", 1211 | "\n", 1212 | "* Simple Ensemble, Mixture of Experts - Ensembles (1): Basics\n", 1213 | "* Bagging - Ensembles (2): Bagging\n", 1214 | "* Boosting, AdaBoost - Machine Learning Lecture 34 \"Boosting / Adaboost\" -Cornell CS4780\n", 1215 | " * MIT - 6.034 Artificial Intelligence - Learning: Boosting\n", 1216 | " * Ensembles (4): AdaBoost" 1217 | ] 1218 | }, 1219 | { 1220 | "cell_type": "markdown", 1221 | "metadata": { 1222 | "slideshow": { 1223 | "slide_type": "skip" 1224 | } 1225 | }, 1226 | "source": [ 1227 | "## Credits\n", 1228 | "---\n", 1229 | "* Icons from Icon8.com - https://icons8.com\n", 1230 | "* Datasets from Kaggle - https://www.kaggle.com/\n", 1231 | "* Examples and code snippets were taken from \"Hands-On Machine Learning with Scikit-Learn and TensorFlow\"" 1232 | ] 1233 | } 1234 | ], 1235 | "metadata": { 1236 | "kernelspec": { 1237 | "display_name": "Python 3", 1238 | "language": "python", 1239 | "name": "python3" 1240 | }, 1241 | "language_info": { 1242 | "codemirror_mode": { 1243 | "name": "ipython", 1244 | "version": 3 1245 | }, 1246 | "file_extension": ".py", 1247 | "mimetype": "text/x-python", 1248 | "name": "python", 1249 | "nbconvert_exporter": "python", 1250 | "pygments_lexer": "ipython3", 1251 | "version": "3.6.9" 1252 | } 1253 | }, 1254 | "nbformat": 4, 1255 | "nbformat_minor": 2 1256 | } 1257 | -------------------------------------------------------------------------------- /cs236756_tutorial_14_pac_vc_dimension.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# CS 236756 - Technion - Intro to Machine Learning\n", 12 | "---\n", 13 | "#### Tal Daniel\n", 14 | "\n", 15 | "\n", 16 | "## Tutorial 14 - PAC Learning & VC Dimension\n", 17 | "---\n", 18 | "\n", 19 | "" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": { 25 | "slideshow": { 26 | "slide_type": "slide" 27 | } 28 | }, 29 | "source": [ 30 | "### Agenda\n", 31 | "---\n", 32 | "* [The PAC (**P**robably **A**pproximately **C**orrect) Learning Framework](#-The-PAC-Learning-Framework)\n", 33 | " * [Empirical Risk Minimization (ERM)](#-Empirical-Risk-Minimization-(ERM))\n", 34 | " * [The Fundamental 
Theorem of Statistical Learning](#-The-Fundamental-Theorem-of-Statistical-Learning)\n", 35 | "* [The VC Dimension](#-VC-Dimension)\n", 36 | " * [Theory](#-VC-Dimension---Formal-Definition)\n", 37 | " * [Examples](#-VC-Dimension---Examples)\n", 38 | "* [Recommended Videos](#-Recommended-Videos)\n", 39 | "* [Credits](#-Credits)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "slideshow": { 46 | "slide_type": "slide" 47 | } 48 | }, 49 | "source": [ 50 | "## The PAC Learning Framework\n", 51 | "---\n", 52 | "PAC stands for \"probably approximately correct\", which is a framework and set of assumptions under which numerous results on learning theory were proven." 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": { 58 | "slideshow": { 59 | "slide_type": "slide" 60 | } 61 | }, 62 | "source": [ 63 | "### Classification Learning Problem\n", 64 | "---\n", 65 | "* The learner's *input*:\n", 66 | " * **Domain Set - $\\mathcal{X}$**: the set of objects we wish to label.\n", 67 | " * **Label Set - $\\mathcal{Y}$**: possible outcomes of an experiment.\n", 68 | " * **Training Data - $S=\\{(x^{(i)}, y^{(i)}); i=1,...,m\\}$**: a finite sequence of pairs in $\\mathcal{X} \\times \\mathcal{Y}$ \n", 69 | " * Drawn iid from some probability distribution $\\mathcal{D}$\n", 70 | "* The learner's *output*:\n", 71 | " * **Prediction Rule - hypothesis** - $h: \\mathcal{X} \\to \\mathcal{Y}$: a function that must predict a label for new domain points.\n", 72 | " * The function is also called: predictor, hypothesis or classifier.\n", 73 | "* Sample generating model\n", 74 | " * We assume the instances are generated by an **unknown** probability distribution over $\\mathcal{X}$ denoted $\\mathcal{D}$.\n", 75 | " * **i.i.d.**: each $x^{(i)}$ is sampled independently from $\\mathcal{D}$.\n", 76 | " * **Realizability**: we also assume: $\\exists f, f: \\mathcal{X} \\to \\mathcal{Y}$ such that $y^{(i)} = f(x^{(i)}), \\forall i$." 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": { 82 | "slideshow": { 83 | "slide_type": "subslide" 84 | } 85 | }, 86 | "source": [ 87 | "* Measures of success\n", 88 | " * **Training Error** (also called the **empirical risk** or **empirical error**): $$ \\hat{\\epsilon}(h) = \\hat{L}(h) = \\frac{1}{m} \\sum_{i=1}^m \\mathbb{1} \\{h(x^{(i)}) \\neq y^{(i)} \\}$$\n", 89 | " * **Classifier Error** (also called the **generalization error**, the **risk** or the **true error**): the error of $h$ is the probability to draw a random sample $(x, y) \\sim \\mathcal{D}$ such that $h(x) \\neq y$: $$ \\epsilon(h) = L(h) = P_{(x,y) \\sim \\mathcal{D}}(h(x) \\neq y)$$\n", 90 | " * This is the probability that, if we now draw a new example $(x,y)$ from $\\mathcal{D}$, $h$ will misclassify it.\n", 91 | " * We assume that the training data was drawn from the *same* distribution $\\mathcal{D}$ with which we are going to evaluate our hypothesis (the assumption of training and testing on the same distribution is part of the **PAC assumptions**)." 
92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": { 97 | "slideshow": { 98 | "slide_type": "slide" 99 | } 100 | }, 101 | "source": [ 102 | "#### Classifier Error Example\n", 103 | "---\n", 104 | "\n", 105 | "* Assume binary features of *papayas* (the fruit...)\n", 106 | "\n", 107 | "| Softness | Color | $Pr(x) $ (Probability)| $h(x)$ | $f(x)$ |\n", 108 | "|------|------|------|------|------|\n", 109 | "| Soft | Green | 0.1 | Tasty | Not-Tasty|\n", 110 | "| Hard | Green | 0.1 | Not-Tasty | Not-Tasty|\n", 111 | "| Soft | Orange | 0.7 | Tasty | Tasty|\n", 112 | "| Hard | Orange | 0.1 | Tasty | Not-Tasty|\n", 113 | "\n", 114 | "* $\\hat{L}(h) = \\hat{\\epsilon}(h) = 0.5$\n", 115 | "* $L(h) = \\epsilon(h) = 0.2$" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "slideshow": { 122 | "slide_type": "subslide" 123 | } 124 | }, 125 | "source": [ 126 | "* What is $L_D(h)$?\n", 127 | " * We can only approximate it with some probability.\n", 128 | "* Why can it only be **approximately** correct?\n", 129 | " * **Claim**: we can't hope to find $h \\in \\mathcal{H}, \\text{s.t. } L_{D,f}(h) = 0$\n", 130 | " * **Proof**:\n", 131 | " * For every $\\epsilon \\in (0,1)$ take $X = \\{x_1, x_2\\}, P(x_1) = 1 - \\epsilon, P(x_2) = \\epsilon$\n", 132 | " * The probability not to see $x_2$ at all among $m$ i.i.d. examples is $(1-\\epsilon)^m \\approx e^{-\\epsilon m}$\n", 133 | " * So, if $\\epsilon << \\frac{1}{m}$ we are likely not to see $x_2$ at all, but then we can't know its label!\n", 134 | " * **Relaxation**: we would be happy with $L_{D,f}(h) \\leq \\epsilon$" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "slideshow": { 141 | "slide_type": "subslide" 142 | } 143 | }, 144 | "source": [ 145 | "* Why can it only be **probably** correct?\n", 146 | " * Recall that the input to the learner is *randomly generated*.\n", 147 | " * There is always a (very small) chance to see the same example again and again.\n", 148 | " * **Claim**: no algorithm can guarantee $L_{D,f}(h) \\leq \\epsilon$ for sure, that is, with absolute certainty ($P=1$)\n", 149 | " * **Relaxation**: we would allow the algorithm to fail with probability $\\delta$ where $\\delta \\in (0,1)$ is *user-specified*." 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": { 155 | "slideshow": { 156 | "slide_type": "slide" 157 | } 158 | }, 159 | "source": [ 160 | "### Probably Approximately Correct (PAC) Learning\n", 161 | "---\n", 162 | "* The learner doesn't know $\\mathcal{D}$ and $f$.\n", 163 | "* The learner receives 2 parameters:\n", 164 | " 1. $\\epsilon$ - *accuracy* parameter.\n", 165 | " 2. 
$\\delta$ - *confidence* parameter.\n", 166 | "* The learner can ask for training data, $S$ containing $m(\\epsilon, \\delta)$ examples.\n", 167 | "* The learner should output a hypothesis $h$ such that with probability of **at least** $1-\\delta$ it holds that $L_{D,f} \\leq \\epsilon$.\n", 168 | " * That is, the learner should be **P**robably (with probability at least $1-\\delta$) **A**pproximately (up to accuracy $\\epsilon$) **C**orrect.\n" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": { 174 | "slideshow": { 175 | "slide_type": "slide" 176 | } 177 | }, 178 | "source": [ 179 | "### Empirical Risk Minimization (ERM)\n", 180 | "---\n", 181 | "* Consider the setting of *linear classification* and let $h_{\\theta}(x) = \\mathbb{1}\\{\\theta^Tx \\geq 0\\}$.\n", 182 | "* Algorithm goal:\n", 183 | " * Find a hypothesis $h_s$ that minimizes the error (risk) with respect to $\\mathcal{D}$ and $f$.\n", 184 | " * But $\\mathcal{D}$ and $f$ are **unknown**!" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": { 190 | "slideshow": { 191 | "slide_type": "subslide" 192 | } 193 | }, 194 | "source": [ 195 | "* An alternative goal and a reasonable way to fit the parameters $\\theta$ would be to try and minimize the training error: $$ \\hat{L}(h) = L_s(h) = \\frac{|\\{ i \\in [m]: h(x^{(i)}) \\neq y^{(i)} \\}|}{m}, [m]=\\{1,...,m\\} $$ and pick $$ \\hat{\\theta} = \\underset{\\theta}{\\mathrm{argmin}} \\hat{\\epsilon}(h_{\\theta}) = \\underset{\\theta}{\\mathrm{argmin}} \\hat{L}(h_{\\theta}) $$\n", 196 | " * This process is called **empirical risk minimization** (ERM).\n", 197 | " * The resulting hypothesis output by the algorithm is $\\hat{h} = h_{\\hat{\\theta}}$.\n", 198 | " * ERM can be thought of as the most basic learning algorithm.\n", 199 | " * Algorithms like Logistic Regression can also be viewed as approximations to ERM." 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": { 205 | "slideshow": { 206 | "slide_type": "subslide" 207 | } 208 | }, 209 | "source": [ 210 | "* We will leave out the specific parameterization of the hypothesis $\\theta$ and will define the **hypothesis class** $\\mathcal{H}$ used by the learning algorithm to be the set of all classifiers considered by it.\n", 211 | "* ERM can now be thought of as a **minimization over the class of functions** $\\mathcal{H}$, in which the learning algorithm picks the hypothesis: $$ \\hat{h} = \\underset{h \\in \\mathcal{H}}{\\mathrm{argmin}} \\hat{\\epsilon}(h) $$" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": { 217 | "slideshow": { 218 | "slide_type": "subslide" 219 | } 220 | }, 221 | "source": [ 222 | "* **Overfitting**:\n", 223 | " * ERM may result in overfitting for the obvious reasons.\n", 224 | " * Assuming the following distribution: \n", 225 | " * We may build a trivial estimator with 0 (empirical) error: $$ h_s(x) = \\begin{cases}y^{(i)}, \\text{if } \\exists i \\in [m] \\text{ s.t. } x^{(i)} = x \\\\ 0, \\text{ otherwise} \\end{cases} $$\n", 226 | " * In order to avoid overfitting, we induce bias." 
227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": { 232 | "slideshow": { 233 | "slide_type": "subslide" 234 | } 235 | }, 236 | "source": [ 237 | "* **ERM with Inductive Bias**:\n", 238 | " * A common solution to overfitting is to restrict the hypothesis search space.\n", 239 | " * The learner chooses in advance a set of predictors (the hypothesis class $\\mathcal{H}$).\n", 240 | " * The choice of $\\mathcal{H}$ imposes an *inductive* bias (prior knowledge).\n", 241 | " * In the following we will assume **realizability**: $$ \\exists h^{*} \\in \\mathcal{H}, \\text{ s.t. } L_{D,f}(h^{*})=\\epsilon(h^{*}) = 0$$" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": { 247 | "slideshow": { 248 | "slide_type": "slide" 249 | } 250 | }, 251 | "source": [ 252 | "### The Fundamental Theorem of Statistical Learning\n", 253 | "---\n", 254 | "* Let $\\mathcal{H}$ denote a hypothesis class of binary classifiers.\n", 255 | "* Then, there are absolute **constants** $C_1, C_2$ such that the *sample complexity* (how many samples to draw, roughly) of PAC learning $\\mathcal{H}$ is: $$ C_1 \\frac{d(\\mathcal{H}) + \\log(\\frac{1}{\\delta})}{\\epsilon} \\leq m_{\\mathcal{H}}(\\epsilon, \\delta) \\leq C_2 \\frac{d(\\mathcal{H})\\log(\\frac{1}{\\epsilon}) + \\log(\\frac{1}{\\delta})}{\\epsilon} $$\n", 256 | " * $d(\\mathcal{H})$ - the *VC Dimension* (which will be introduced shortly) of hypotheses class $\\mathcal{H}$.\n", 257 | "* Furthermore, this sample complexity is achieved by the ERM learning rule" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": { 263 | "slideshow": { 264 | "slide_type": "slide" 265 | } 266 | }, 267 | "source": [ 268 | "### What Is Learnable and How to Learn?\n", 269 | "---\n", 270 | "* From the fundamental theorem of statistical learning:\n", 271 | " * The sample complexity is characterized by the **VC Dimension**.\n", 272 | " * The ERM learning rule is generic (near) optimal learner." 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": { 278 | "slideshow": { 279 | "slide_type": "slide" 280 | } 281 | }, 282 | "source": [ 283 | "## VC Dimension\n", 284 | "---\n", 285 | "\n", 286 | "### Motivation\n", 287 | "---\n", 288 | "* **Complexity of a learner** - representational power, the ability to generalize.\n", 289 | " * The usual **trade-off**:\n", 290 | " * More power - represent more complex systems $\\to$ may lead to **overfitting**.\n", 291 | " * Less power - won't overfit, but may not find the \"best\" learner.\n", 292 | " * How to quantify the representational power? Not easily...\n", 293 | " * One solution is the **VC Dimension**" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": { 299 | "slideshow": { 300 | "slide_type": "subslide" 301 | } 302 | }, 303 | "source": [ 304 | "* **No Free Lunch**\n", 305 | " * Suppose that $|\\mathcal{X}| = \\infty$\n", 306 | " * For any finite subset $\\mathcal{C} \\subset \\mathcal{X}$ take $\\mathcal{D}$ to be *uniform* distribution over $\\mathcal{C}$\n", 307 | " * If the number of training examples is $m \\leq \\frac{\\mathcal{C}}{2}$, then the learner has no knowledge on at least half the elements in $\\mathcal{C}$\n", 308 | " * Formally: **No Free Lunch Theorem**\n", 309 | " * Fix $\\delta \\in (0,1), \\epsilon < \\frac{1}{2}$. 
For every learner $\\mathcal{A}$ and training set size $m$, there exists $\\mathcal{D}, f$ such that with probability of at least $\\delta$ over the generation of training data $S$ of $m$ examples, it holds that $$ L_{\\mathcal{D}, f}(A(S)) \\geq \\epsilon $$\n", 310 | " * For a *random guess*, $ L_{\\mathcal{D}, f} = \\frac{1}{2}$, so the theorem states that you can't be better than a random guess." 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": { 316 | "slideshow": { 317 | "slide_type": "subslide" 318 | } 319 | }, 320 | "source": [ 321 | "* Suppose we got a **training** set $S=\\{(x^{(1)}, y^{(1)}), ..., (x^{(m)}, y^{(m)})\\}$, and we choose classifiers or hypotheses from a hypotheses class $\\mathcal{H}$.\n", 322 | " * We try to explain the labels using a hypothesis from $\\mathcal{H}$\n", 323 | " * It turned out that the labels we received were *incorrect* and now we get the same instances with different labels: $S' = \\{(x^{(1)}, y'^{(1)}), ..., (x^{(m)}, y'^{(m)})\\}$\n", 324 | " * We try again to explain the labels using a hypothesis from $\\mathcal{H}$\n", 325 | " * If we succeed in doing so (that is, find a hypothesis that explains these labels), then something is fishy...\n", 326 | " * Conclusion: if the classifier is able to explain everything, then it is useless...\n", 327 | " * Formally, if $\\mathcal{H}$ allows all functions over some set $\\mathcal{C}$ of size $m$, then based on the **No Free Lunch** theorem, we can't learn from a subset of size $\\frac{m}{2}$, for example." 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": { 333 | "slideshow": { 334 | "slide_type": "slide" 335 | } 336 | }, 337 | "source": [ 338 | "### VC Dimension - Formal Definition\n", 339 | "---\n", 340 | "* Let $\\mathcal{C} = \\{x_1, ..., x_{|C|} \\} \\subset \\mathcal{X}$\n", 341 | "* Let $\\mathcal{H}_C$ be the restriction of $\\mathcal{H}$ to $\\mathcal{C}$, namely, $\\mathcal{H}_C = \\{h_C: h \\in \\mathcal{H} \\}$ where $h_C: \\mathcal{C} \\to \\{0,1\\}$ or $\\{-1,+1\\} $ is s.t. $h_C(x_i) = h(x_i)$ for every $x_i \\in C$\n", 342 | "* Observation: we can represent each $h_c$ as the vector: $$ \\begin{bmatrix} h(x_1) \\\\ \\vdots \\\\ h(x_{|C|}) \\end{bmatrix} \\in \\{ \\pm 1\\}^{|C|} $$\n", 343 | "* Therfore: $\\mathcal{H}_C \\leq 2^{|C|}$" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": { 349 | "slideshow": { 350 | "slide_type": "subslide" 351 | } 352 | }, 353 | "source": [ 354 | "* We say that $\\mathcal{H}$ **shatters** $\\mathcal{C}$ if $|\\mathcal{H}_C| = 2^{|C|}$\n", 355 | " * That is, $\\mathcal{H}$ can realize any labeling on $\\mathcal{C}$, i.e., if for *any* set of labels $\\{y^{(1)}, ..., y^{(m)} \\}$ there exists some $h \\in \\mathcal{H}$ so that $h(x^{(i)}) = y^{(i)}$ for **all** $i = 1,..., m$ \n", 356 | "* $VCdim(\\mathcal{H})= sup\\{|C| : \\mathcal{H} \\text{ shatters } \\mathcal{C} \\}$" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": { 362 | "slideshow": { 363 | "slide_type": "subslide" 364 | } 365 | }, 366 | "source": [ 367 | "* The VC dimension is the maximal size of a set $\\mathcal{C}$ such that $\\mathcal{H}$ gives no prior knowledge w.r.t. 
$\\mathcal{C}$, or, the size of the largest set that is shattered by $\\mathcal{H}$.\n", 368 | "* In other words, the VC dimension is the maximum number of points that can be arranged such that $h \\in \\mathcal{H}$ can shatter them.\n", 369 | "* **Dichotomy**: a possible seperation of the sample space into sub-samples.\n", 370 | " * For example: $\\{(x_1, 1), (x_2, 0), (x_3, 1)\\}$ is a dichotomy, and also $\\{(x_1, 0), (x_2, 0), (x_3, 1)\\}$ (a total of $2^3$ for this example)." 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": { 376 | "slideshow": { 377 | "slide_type": "subslide" 378 | } 379 | }, 380 | "source": [ 381 | "* **Theorem**: Let $\\mathcal{H}$ be given, and let $d = VCdim(\\mathcal{H})$. Then with probability at least $1-\\delta$, we have that for all $h \\in \\mathcal{H}$: $$ |\\epsilon(h) - \\hat{\\epsilon}(h)| \\leq O(\\sqrt{\\frac{d}{m}\\log\\frac{m}{d} + \\frac{1}{m}\\log\\frac{1}{\\delta}}) $$\n", 382 | "Thus, with probability at least $1-\\delta$ we also have that: $$ \\epsilon(\\hat{h}) \\leq \\epsilon(h^{*}) + O(\\sqrt{\\frac{d}{m}\\log\\frac{m}{d} + \\frac{1}{m}\\log\\frac{1}{\\delta}}) $$\n", 383 | " * $\\epsilon(h)$ is the real (test) error and $\\hat{\\epsilon}(h)$ is the training error (empirical risk).\n", 384 | " * In other words, if a hypothesis class has finite VC dimension, then uniform convergence occurs as $m$ becomes large.\n", 385 | " * **This is a very strong result because we can make a statement on data we have not seen!**" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": { 391 | "slideshow": { 392 | "slide_type": "slide" 393 | } 394 | }, 395 | "source": [ 396 | "### Finding VC Dimension\n", 397 | "---\n", 398 | "* To show that $VCdim(\\mathcal{H}) = d$ we need to show that:\n", 399 | " 1. There **exists** a set $\\mathcal{C}$ of size $d$ which is shattered by $\\mathcal{H}$\n", 400 | " * That is, show that for some ordering of the points, **any** kind of labeling can be attained by hypothesis from $\\mathcal{H}$\n", 401 | " 2. **Every** set $\\mathcal{C}$ of size $d + 1$ is not shattered by $\\mathcal{H}$" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": { 407 | "slideshow": { 408 | "slide_type": "subslide" 409 | } 410 | }, 411 | "source": [ 412 | "* Can be thought of as a **2-player game**:\n", 413 | " * Fix the definition of $h_{\\theta} = f(x;\\theta)$ (the hypotheses class, e.g. linear classifiers)\n", 414 | " * **Player 1**: choose locations $x^{(1)},...,x^{(d)}$\n", 415 | " * *Player 2*: choose target labels $y^{(1)},...,y^{(d)}$\n", 416 | " * **Player 1**: choose a hypothesis $h \\in \\mathcal{H}$, e.g., choose $\\theta$ in the linear classifier\n", 417 | " * If $f(x;\\theta)$ can reproduce the target labeles, **Player 1** wins.\n", 418 | " * $\\exists \\{ x^{(1)}, ..., x^{(d)}\\} \\text{ s.t. } \\forall \\{ y^{(1)}, ..., y^{(d)}\\} \\exists \\theta \\text{ s.t. } \\forall i, f(x^{(i)}) = y^{(i)}$\n", 419 | " * The VC dimension would be the value $d$ if *Player 2* covered all the possibles labels and **Player 1** won every game." 
420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": { 425 | "slideshow": { 426 | "slide_type": "slide" 427 | } 428 | }, 429 | "source": [ 430 | "### VC Dimension - Examples\n", 431 | "---\n", 432 | "#### Example 1 - Toy Example\n", 433 | "---\n", 434 | "Consider 9 samples, and 8 hypotheses as follows:\n", 435 | "\n", 436 | "| | $x_1$ |$x_2$| $x_3$ | $x_4$ |$x_5$ |$x_6$ | $x_7$ | $x_8$ |$x_9$ |\n", 437 | "|------|------|------|------|------|------|------|------|------|------|\n", 438 | "| $h_1$ | 0 | 0 | 1 | 0|0|0|1|0|0|\n", 439 | "| $h_2$ | 0 | 1 | 0 | 0|0|1|0|0|0|\n", 440 | "| $h_3$ | 1 | 0 | 0 | 0|1|1|0|0|0|\n", 441 | "| $h_4$ | 0 | 0 | 0 | 1|1|0|0|0|1|\n", 442 | "| $h_5$ | 0 | 0 | 1 | 0|0|0|0|1|0|\n", 443 | "| $h_6$ | 0 | 1 | 0 | 0|0|0|1|0|0|\n", 444 | "| $h_7$ | 1 | 0 | 0 | 0|0|1|0|0|0|\n", 445 | "| $h_8$ | 0 | 0 | 0 | 0|0|0|0|0|0|\n", 446 | "\n", 447 | "* The first thing to notice is that the whole sample set (1-9) cannot be shattered as we don't have enough hypotheses. In order to shatter the whole set we would need at least $2^9$ hypotheses." 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": { 453 | "slideshow": { 454 | "slide_type": "subslide" 455 | } 456 | }, 457 | "source": [ 458 | "* **Excercise**: Are the following sets shattered?\n", 459 | " * $\\{x_1\\}$\n", 460 | " * $\\{x_5, x_6\\}$\n", 461 | " * $\\{x_1, x_2\\}$\n", 462 | " * $\\{x_5, x_6, x_7\\}$" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": { 468 | "slideshow": { 469 | "slide_type": "subslide" 470 | } 471 | }, 472 | "source": [ 473 | "* **Solution**:\n", 474 | " * $\\{x_1\\}$ - **yes**, by $\\{h_2, h_3\\}$\n", 475 | " * $\\{x_5, x_6\\}$ - **yes**, by $\\{h_1, h_2, h_3, h_4\\}$\n", 476 | " * $\\{x_1, x_2\\}$ - **no**, can't get the classification: $x_1 = 1$ and $x_2 = 1$\n", 477 | " * $\\{x_5, x_6, x_7\\}$ - **no**, can't get the classification: $x_5=x_6=x_7=1$" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": { 483 | "slideshow": { 484 | "slide_type": "subslide" 485 | } 486 | }, 487 | "source": [ 488 | "* **Excercise**: What is the VC dimension of $\\mathcal{X}$?" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": { 494 | "slideshow": { 495 | "slide_type": "subslide" 496 | } 497 | }, 498 | "source": [ 499 | "* **Solution**:\n", 500 | " * The only 3 points with the dichotomy $\\{1, 1, 1\\}$ are $\\{x_1, x_5, x_6 \\}$\n", 501 | " * But the dichotomy $\\{1,0,0\\}$ isn't achievable.\n", 502 | " * $\\to$ No 3 points can be shattered\n", 503 | " * $\\to VCdim(\\mathcal{H}) = 2 $" 504 | ] 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "metadata": { 509 | "slideshow": { 510 | "slide_type": "slide" 511 | } 512 | }, 513 | "source": [ 514 | "#### Example 2 -Threshold Functions\n", 515 | "---\n", 516 | "* Threshold functions - $f \\in \\mathcal{H}$ is a single-parametric threshold classifier on real numbers, i.e., for a certain threshold $\\theta$, the classifier $f_{\\theta}$ returns 1 if the input number is larger than $\\theta$ and 0 otherwise. Formally: $$ \\mathcal{X} = \\mathbb{R}, \\mathcal{H} = \\{ x \\to sign(x-\\theta): \\theta \\in \\mathbb{R} \\} $$" 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": { 522 | "slideshow": { 523 | "slide_type": "subslide" 524 | } 525 | }, 526 | "source": [ 527 | "\n", 528 | "* Let's \"prove\" that $VCdim(\\mathcal{H}) = 1$:\n", 529 | " 1. 
One ($n=1$) point can be shattered because for every point $x$, a classifier $f_{\\theta}(x)$ labels it as 0 if $\\theta > x$ and 1 if $\\theta < x$. For example, for $(x=0, label=0), \\theta= 1$ and for $(x=0, label=1), \\theta= -1$.\n", 530 | " 2. No two ($n+1=2$) points can be shattered - because for every set of 2 points, if the smaller is labeled 1, then the larger must also be labeled 1, so not all labelings are possible.\n", 531 | " \n", 532 | "\n", 533 | " \n", 534 | " Image Source (CalTech's free machine Learning online course by Yaser Abu-Mostafa)" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": { 540 | "slideshow": { 541 | "slide_type": "slide" 542 | } 543 | }, 544 | "source": [ 545 | "#### Example 3 - Intervals Functions\n", 546 | "---\n", 547 | "* Intervals functions - $f \\in \\mathcal{H}$ is a single-parametric interval classifier on real numbers, i.e, for a certain parameter $\\theta$, the classifier $f_{\\theta}$ returns 1 if the input number is in the interval $[\\theta, \\theta+4]$ and 0 otherwise." 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": { 553 | "slideshow": { 554 | "slide_type": "subslide" 555 | } 556 | }, 557 | "source": [ 558 | "* Let's \"prove\" that $VCdim(\\mathcal{H}) = 2$:\n", 559 | " 1. Two ($n=2$) points can be shattered because for every set $\\{x, x+2\\}$, a classifier $f_{\\theta}(x)$ labels it as:\n", 560 | " * $(0,0)$ - if $\\theta < x - 4$ or if $\\theta > x + 2$.\n", 561 | " * $(1,0)$ - if $\\theta \\in [x-4, x-2)$.\n", 562 | " * $(1,1)$ - if $\\theta \\in [x-2, x]$.\n", 563 | " * $(0,1)$ - if $\\theta \\in (x, x+2]$.\n", 564 | " 2. No three ($n+1=3$) points can be shattered - because for every set of three numbers, if the smallest and the largest are labeled 1, then the middle one must also be labeled 1, so not all labelings are possible." 
565 | ] 566 | }, 567 | { 568 | "cell_type": "markdown", 569 | "metadata": { 570 | "slideshow": { 571 | "slide_type": "subslide" 572 | } 573 | }, 574 | "source": [ 575 | "* This result can be generalized for a two-parametric interval classifier $h_{a,b}$: $$ \\mathcal{X} = \\mathbb{R}, \\mathcal{H} = \\{ h_{a,b}: a < b \\in \\mathbb{R} \\} $$ where $$ h_{a,b}(x) = 1 \\iff x \\in [a,b] $$\n", 576 | "\n", 577 | "\n", 578 | "\n", 579 | "Image Source (CalTech's free machine Learning online course by Yaser Abu-Mostafa)" 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": { 585 | "slideshow": { 586 | "slide_type": "slide" 587 | } 588 | }, 589 | "source": [ 590 | "#### Example 4 - Axis Aligned Rectangles\n", 591 | "---\n", 592 | "* Axis aligned rectangles: $$ \\mathcal{X} = \\mathbb{R}^2, \\mathcal{H} = \\{ h_{a_1,a_2,b_1, b_2}: a_1 < a_2 \\text{ and } b_1 < b_2 \\} $$, where $$ h_{a_1,a_2,b_1, b_2}(x_1, x_2) = 1 \\iff x_1 \\in [a_1, a_2] \\text{ and } x_2 \\in [b_1, b_2] $$" 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "metadata": { 598 | "slideshow": { 599 | "slide_type": "subslide" 600 | } 601 | }, 602 | "source": [ 603 | "* Let's \"prove\" that $VCdim(\\mathcal{H}) = 4$:\n", 604 | " \n", 605 | "1.Four ($n=4$) points can be shattered as seen in the following arrangement: \n", 606 | "\n", 607 | "Image from Princeton's COS 511: Theoretical Machine Learning, Lecture on VC-Dimension" 608 | ] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "metadata": { 613 | "slideshow": { 614 | "slide_type": "subslide" 615 | } 616 | }, 617 | "source": [ 618 | "2.No five ($n+1=5$) can be shattered - for any 5-point set, we can construct a data assignment in this way: pick the topmost, bottommost, leftmost and rightmost points and give them the label “+”. Because there are 5 points, there must be at least one point left to which we assign “−”. Any rectangle that contains all the “+” points must contain the “−” point, which is a case where shattering is not possible.\n", 619 | "\n", 620 | "" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": { 626 | "slideshow": { 627 | "slide_type": "slide" 628 | } 629 | }, 630 | "source": [ 631 | "#### Example 5 - Halfspaces\n", 632 | "---\n", 633 | "* Halfspaces (linear classifiers): $$ \\mathcal{X} = \\mathbb{R}^2, \\mathcal{H} = \\{ x \\to sign(\\langle w, x \\rangle) \\}: w \\in \\mathbb{R}^2 $$\n", 634 | " * For example: $h(x) = \\mathbb{1}\\{ \\theta_1 x_1 + \\theta_2 x_2 \\geq 0\\}$" 635 | ] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "metadata": { 640 | "slideshow": { 641 | "slide_type": "subslide" 642 | } 643 | }, 644 | "source": [ 645 | "* Let's \"prove\" that $VCdim(\\mathcal{H}) = 3$:\n", 646 | " \n", 647 | "1.Three ($n=3$) points can be shattered as seen in the following arrangement: " 648 | ] 649 | }, 650 | { 651 | "cell_type": "markdown", 652 | "metadata": { 653 | "slideshow": { 654 | "slide_type": "subslide" 655 | } 656 | }, 657 | "source": [ 658 | "2.No four ($n+1=4$) can be shattered - We consider two cases:\n", 659 | " 1. The four points form a convex region, i.e., lie on the convex hull defined by the 4 points. \n", 660 | " 2. Three of the 4 points define the convex hull and the 4th point is internal. \n", 661 | " \n", 662 | "* In the first case, the labeling which is positive for one diagonal pair and negative to the other pair cannot be realized by a separating line. 
\n", 663 | "* In the second case, a labeling which is positive for the three hull points and negative for the interior point cannot be realized.\n", 664 | " \n", 665 | "\n", 666 | "\n", 667 | "* The results is generalized for hyperplanes: VC dimension of hyperplanes in $\\mathbb{R}^d$ is $d+1$." 668 | ] 669 | }, 670 | { 671 | "cell_type": "markdown", 672 | "metadata": { 673 | "slideshow": { 674 | "slide_type": "slide" 675 | } 676 | }, 677 | "source": [ 678 | "### VC Dimension - Special Cases\n", 679 | "---\n", 680 | "* $VCdim(\\mathcal{H}) = 0$ - When is the VC dimension equals to zero? Assume $\\mathcal{X} = \\mathbb{R}^2$. Let $\\mathcal{H}$ contain a **single** hypothesis $h_1$. Thus, the VC dimension of $\\mathcal{H}$ is **always** 0! A single hypothesis can impose only one classification, can only assign one labeling to a set of points.\n" 681 | ] 682 | }, 683 | { 684 | "cell_type": "markdown", 685 | "metadata": { 686 | "slideshow": { 687 | "slide_type": "subslide" 688 | } 689 | }, 690 | "source": [ 691 | "* $VCdim(\\mathcal{H}) = \\infty$ - When does the VC dimension go to infinity? Assume $\\mathcal{X} = \\mathbb{R}^2$. Let $\\mathcal{A}$ be the **set of all convex polygons** in $\\mathcal{X}$. Define $\\mathcal{H}$ as the class of all hypotheses $h_p(x), p \\in \\mathcal{A}$: $$ h_p(x) = \\begin{cases} 1, \\text{ if } x \\text{ is contained within polygon } p \\\\ 0, \\text{ otherwise} \\end{cases} $$" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": { 697 | "slideshow": { 698 | "slide_type": "subslide" 699 | } 700 | }, 701 | "source": [ 702 | "Let's see why $VCdim(\\mathcal{H}) = \\infty$: for any positive integer $n$, take $n$ points from $\\mathcal{X}$. Place the $n$ points **uniformly spaced** on the **unit circle**. For each $2^n$ subset of this data, there is a convex polygon with vertices at these $n$ points. For each subset, the convex polygon contains the set and excludes its complement.\n", 703 | "\n", 704 | "Image from Learnability and VC Dimension at LMU Munchen" 705 | ] 706 | }, 707 | { 708 | "cell_type": "markdown", 709 | "metadata": { 710 | "slideshow": { 711 | "slide_type": "slide" 712 | } 713 | }, 714 | "source": [ 715 | "### Recommended Videos\n", 716 | "---\n", 717 | "#### Warning!\n", 718 | "* These videos do not replace the lectures and tutorials.\n", 719 | "* Please use these to get a better understanding of the material, and not as an alternative to the written material.\n", 720 | "\n", 721 | "#### Video By Subject\n", 722 | "\n", 723 | "* VC Dimension - VC Dimension - Alexander Ihler\n", 724 | "* Learning Theory by Andrew Ng (Stanford)\n", 725 | " * Lecture 9 | Machine Learning (Stanford)\n", 726 | " * Lecture 10 | Machine Learning (Stanford)\n", 727 | "* Learning Theory Lectures By Shai Ben-David\n", 728 | " * Lecture 2\n", 729 | " * Lecture 3" 730 | ] 731 | }, 732 | { 733 | "cell_type": "markdown", 734 | "metadata": { 735 | "slideshow": { 736 | "slide_type": "skip" 737 | } 738 | }, 739 | "source": [ 740 | "## Credits\n", 741 | "---\n", 742 | "* Based on slides by Shai Shalev-Schwarz\n", 743 | "* Great (!) 
707 | { 708 | "cell_type": "markdown", 709 | "metadata": { 710 | "slideshow": { 711 | "slide_type": "slide" 712 | } 713 | }, 714 | "source": [ 715 | "### Recommended Videos\n", 716 | "---\n", 717 | "#### Warning!\n", 718 | "* These videos do not replace the lectures and tutorials.\n", 719 | "* Please use these to get a better understanding of the material, and not as an alternative to the written material.\n", 720 | "\n", 721 | "#### Video By Subject\n", 722 | "\n", 723 | "* VC Dimension - VC Dimension - Alexander Ihler\n", 724 | "* Learning Theory by Andrew Ng (Stanford)\n", 725 | " * Lecture 9 | Machine Learning (Stanford)\n", 726 | " * Lecture 10 | Machine Learning (Stanford)\n", 727 | "* Learning Theory Lectures By Shai Ben-David\n", 728 | " * Lecture 2\n", 729 | " * Lecture 3" 730 | ] 731 | }, 732 | { 733 | "cell_type": "markdown", 734 | "metadata": { 735 | "slideshow": { 736 | "slide_type": "skip" 737 | } 738 | }, 739 | "source": [ 740 | "## Credits\n", 741 | "---\n", 742 | "* Based on slides by Shai Shalev-Shwartz\n", 743 | "* Great (!) Reading Resource - CS229 - Stanford - Machine Learning - Learning Theory\n", 744 | " * It covers everything and goes into much more detail\n", 745 | "* Icons from Icons8.com - https://icons8.com" 746 | ] 747 | } 748 | ], 749 | "metadata": { 750 | "kernelspec": { 751 | "display_name": "Python 3", 752 | "language": "python", 753 | "name": "python3" 754 | }, 755 | "language_info": { 756 | "codemirror_mode": { 757 | "name": "ipython", 758 | "version": 3 759 | }, 760 | "file_extension": ".py", 761 | "mimetype": "text/x-python", 762 | "name": "python", 763 | "nbconvert_exporter": "python", 764 | "pygments_lexer": "ipython3", 765 | "version": "3.6.9" 766 | } 767 | }, 768 | "nbformat": 4, 769 | "nbformat_minor": 2 770 | } 771 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: ml-course 2 | channels: 3 | - conda-forge 4 | - anaconda 5 | dependencies: 6 | - python 7 | - numpy 8 | - scikit-learn 9 | - scipy 10 | - pip: 11 | - pandas 12 | - matplotlib -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_01_probability_mle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_01_probability_mle.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_02_statistics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_02_statistics.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_03_linear_algebra.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_03_linear_algebra.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_04_pca_feature_selection.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_04_pca_feature_selection.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_05_evaluation_validation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_05_evaluation_validation.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_06_decision_trees.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_06_decision_trees.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_07_optimization.pdf: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_07_optimization.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_08_linear_regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_08_linear_regression.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_09_linear_models.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_09_linear_models.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_10_expectation_maximization.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_10_expectation_maximization.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_11_boosting_bagging.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_11_boosting_bagging.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_12_svm.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_12_svm.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_13_deep_learning_intro_backprop.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_13_deep_learning_intro_backprop.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_14_pac_vc_dimension.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_14_pac_vc_dimension.pdf --------------------------------------------------------------------------------