├── LICENSE
├── README.md
├── assets
│   ├── tut_01_a_vs_b.png
│   ├── tut_01_bayes_rule.png
│   ├── tut_01_cor_1.jpg
│   ├── tut_01_cor_2.jpg
│   ├── tut_01_cor_3.jpg
│   ├── tut_01_intersection.jpg
│   ├── tut_01_param_est.PNG
│   ├── tut_01_transformation.jpg
│   ├── tut_02_benjamin.jpg
│   ├── tut_02_confusion_mat.jpg
│   ├── tut_02_error_types.jpg
│   ├── tut_02_p_val.png
│   ├── tut_02_pop.png
│   ├── tut_02_quote.jpg
│   ├── tut_02_var_param.png
│   ├── tut_02_z_stat.jpg
│   ├── tut_03_gram.gif
│   ├── tut_03_linear_dep.jpg
│   ├── tut_03_norm.jpg
│   ├── tut_03_svd.gif
│   ├── tut_03_svd_1.jpg
│   ├── tut_03_svd_2.jpg
│   ├── tut_03_svd_3.jpg
│   ├── tut_03_svd_vis.png
│   ├── tut_04_dim_red.jpg
│   ├── tut_04_feature_select_1.jpg
│   ├── tut_04_feature_select_2.jpg
│   ├── tut_04_feature_select_3.jpg
│   ├── tut_04_filter_methods.jpg
│   ├── tut_04_outlier.jpeg
│   ├── tut_04_pca_1.jpg
│   ├── tut_04_sbs.jpg
│   ├── tut_04_sfs.jpg
│   ├── tut_04_sfs_example.jpg
│   ├── tut_04_sfs_sbs.jpg
│   ├── tut_04_wrapper_methods_1.jpg
│   ├── tut_05_conf_mat.jpg
│   ├── tut_05_holdout.jpg
│   ├── tut_05_kcv.jpeg
│   ├── tut_05_kfold.jpeg
│   ├── tut_05_overfit.png
│   ├── tut_05_overfitting.png
│   ├── tut_05_test.jpg
│   ├── tut_05_tp_tf.jpg
│   ├── tut_05_train.jpg
│   ├── tut_05_validation.jpg
│   ├── tut_06_convex_1.jpg
│   ├── tut_06_convex_2.jpg
│   ├── tut_06_convex_concave.gif
│   ├── tut_06_deriv.jpg
│   ├── tut_06_deriv_2.jpg
│   ├── tut_06_gd.gif
│   ├── tut_06_gradient.gif
│   ├── tut_06_gradient_2.jpg
│   ├── tut_06_lagrange.jpg
│   ├── tut_06_lls_anim.gif
│   ├── tut_06_lr.png
│   ├── tut_06_mb_sgd.png
│   ├── tut_06_minimum.jpg
│   ├── tut_06_multimodal.jpg
│   ├── tut_06_saddle.jpg
│   ├── tut_06_sgd.png
│   ├── tut_06_step.png
│   ├── tut_06_unimodal.jpg
│   ├── tut_07_akinator.jpg
│   ├── tut_07_boundries.jpg
│   ├── tut_07_entropy.jpg
│   ├── tut_07_example_1.jpg
│   ├── tut_07_example_2.jpg
│   ├── tut_07_gini.jpg
│   ├── tut_07_multitree.jpg
│   ├── tut_07_overfitting.jpg
│   ├── tut_07_random_forest.png
│   ├── tut_07_titan_tree.jpg
│   ├── tut_07_vis.jpg
│   ├── tut_08_adaline.gif
│   ├── tut_08_alpha.jpg
│   ├── tut_08_alpha_2.jpg
│   ├── tut_08_bias_var.jpg
│   ├── tut_08_bias_var_2.jpg
│   ├── tut_08_cos.jpg
│   ├── tut_08_intro.jpg
│   ├── tut_08_lass_ridge.jpg
│   ├── tut_08_lr_1.jpg
│   ├── tut_08_lr_2.jpg
│   ├── tut_08_lr_3.jpg
│   ├── tut_08_perc_vs_ada.png
│   ├── tut_08_poly_1.jpg
│   ├── tut_08_reg.jpg
│   ├── tut_08_res_1.jpg
│   ├── tut_08_res_2.jpg
│   ├── tut_09_bayes.jpg
│   ├── tut_09_lda.jpg
│   ├── tut_09_lin_models.jpg
│   ├── tut_09_lms_perc.jpg
│   ├── tut_09_one_vs_all_1.png
│   ├── tut_09_one_vs_all_2.png
│   ├── tut_09_one_vs_all_3.png
│   ├── tut_09_one_vs_all_4.png
│   ├── tut_09_perceptron.jpg
│   ├── tut_09_qda.jpg
│   ├── tut_10_em_step.jpg
│   ├── tut_10_gmm.jpg
│   ├── tut_10_gmm_2.jpg
│   ├── tut_10_gmm_anim.gif
│   ├── tut_10_kmeans.jpg
│   ├── tut_10_kmeans_anim.gif
│   ├── tut_11_adaboost.gif
│   ├── tut_11_adaboost_1.gif
│   ├── tut_11_bagging_1.jpg
│   ├── tut_11_bagging_2.jpg
│   ├── tut_11_bagging_pasting.png
│   ├── tut_11_boosting_1.jpg
│   ├── tut_11_boosting_2.jpg
│   ├── tut_11_exp_loss.jpg
│   ├── tut_11_geom.jpg
│   ├── tut_11_hinge.jpg
│   ├── tut_11_large_margin.png
│   ├── tut_11_margin.jpg
│   ├── tut_11_rbf_kernel.png
│   ├── tut_11_slack.jpg
│   ├── tut_11_tuning_c.gif
│   ├── tut_11_tuning_c_linear.gif
│   ├── tut_12_backward.jpg
│   ├── tut_12_boolean.jpg
│   ├── tut_12_ex_1.png
│   ├── tut_12_ex_2.png
│   ├── tut_12_forward.jpg
│   ├── tut_12_intro.jpg
│   ├── tut_12_mlp.jpg
│   ├── tut_12_mod_app_1.jpg
│   ├── tut_12_mod_app_2.jpg
│   ├── tut_12_optim_general.jpg
│   ├── tut_13_convex.jpg
│   ├── tut_13_halfspaces.jpg
│   ├── tut_13_intervals.jpg
│   ├── tut_13_linear.jpg
│   ├── tut_13_overfitting.jpg
│   ├── tut_13_pacman.png
│   ├── tut_13_rectangles.jpg
│   ├── tut_13_rectangles_2.jpg
│   └── tut_13_threshold.jpg
├── cs236756_tutorial_01_probability_mle.ipynb
├── cs236756_tutorial_02_statistics.ipynb
├── cs236756_tutorial_03_linear_algebra.ipynb
├── cs236756_tutorial_04_pca_feature_selection.ipynb
├── cs236756_tutorial_05_evaluation_validation.ipynb
├── cs236756_tutorial_06_decision_trees.ipynb
├── cs236756_tutorial_07_optimization.ipynb
├── cs236756_tutorial_08_linear_regression.ipynb
├── cs236756_tutorial_09_linear_models.ipynb
├── cs236756_tutorial_10_expectation_maximization.ipynb
├── cs236756_tutorial_11_boosting_bagging.ipynb
├── cs236756_tutorial_12_svm.ipynb
├── cs236756_tutorial_13_deep_learning_intro_backprop.ipynb
├── cs236756_tutorial_14_pac_vc_dimension.ipynb
├── datasets
│   ├── cancer_dataset.csv
│   ├── heights_dataset.csv
│   └── titanic_dataset.csv
├── environment.yml
└── pdf
    ├── cs236756_tutorial_01_probability_mle.pdf
    ├── cs236756_tutorial_02_statistics.pdf
    ├── cs236756_tutorial_03_linear_algebra.pdf
    ├── cs236756_tutorial_04_pca_feature_selection.pdf
    ├── cs236756_tutorial_05_evaluation_validation.pdf
    ├── cs236756_tutorial_06_decision_trees.pdf
    ├── cs236756_tutorial_07_optimization.pdf
    ├── cs236756_tutorial_08_linear_regression.pdf
    ├── cs236756_tutorial_09_linear_models.pdf
    ├── cs236756_tutorial_10_expectation_maximization.pdf
    ├── cs236756_tutorial_11_boosting_bagging.pdf
    ├── cs236756_tutorial_12_svm.pdf
    ├── cs236756_tutorial_13_deep_learning_intro_backprop.pdf
    └── cs236756_tutorial_14_pac_vc_dimension.pdf

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Tal Daniel

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# cs236756-intro-to-ml

Technion CS 236756 - Introduction to Machine Learning

Tal Daniel

Jupyter Notebook tutorials for the Technion's CS 236756 course "Introduction to Machine Learning"

Open In Colab | Open In NBViewer | Open In Binder

* For the old tutorials, see the `spring19` branch.

- [cs236756-intro-to-ml](#cs236756-intro-to-ml)
  * [Running The Notebooks](#running-the-notebooks)
    + [Running Online](#running-online)
    + [Running Locally](#running-locally)
  * [Agenda](#agenda)
  * [Installation Instructions](#installation-instructions)
    + [Libraries to Install](#libraries-to-install)

## Running The Notebooks
You can view the tutorials online, or download the repository and run them locally.

### Running Online

|Service | Usage |
|-------------|---------|
|Jupyter Nbviewer| Render and view the notebooks (cannot edit) |
|Binder| Render, view and edit the notebooks (limited time) |
|Google Colab| Render, view, edit and save the notebooks to Google Drive (limited time) |

Jupyter Nbviewer:

[![nbviewer](https://raw.githubusercontent.com/taldatech/ee046211-deep-learning/main/assets/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/taldatech/cs236756-intro-to-ml/tree/master/)

Press the "Open in Colab" button below to use Google Colab:

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/taldatech/cs236756-intro-to-ml)

Or press the "launch binder" button below to launch in Binder:

[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/taldatech/cs236756-intro-to-ml/master)

Note: creating the Binder instance takes about 5-10 minutes, so be patient.

### Running Locally
Press "Download ZIP" under the green `Clone or download` button, or use `git` to clone the repository with the following command: `git clone https://github.com/taldatech/cs236756-intro-to-ml.git` (in cmd/PowerShell on Windows or in the terminal on Linux/Mac).

Open the folder in Jupyter Notebook (it is recommended to use Anaconda). Installation instructions can be found at the bottom of this README.
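For example, a full local run might look like the following sketch (assuming Anaconda is already installed; the environment commands are explained under Installation Instructions below):

```bash
# clone the repository and enter it
git clone https://github.com/taldatech/cs236756-intro-to-ml.git
cd cs236756-intro-to-ml

# create and activate the course environment (see Installation Instructions)
conda create --name ml_course
conda activate ml_course

# launch Jupyter and open any cs236756_tutorial_*.ipynb
jupyter notebook
```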

## Agenda

|File | Topics Covered |
|----------------|---------|
|`cs236756_tutorial_01_probability_mle.ipynb\pdf`| Probability basics, random variables, Bayes rule, histograms, correlation, parameter estimation, Maximum Likelihood Estimation (MLE)|
|`cs236756_tutorial_02_statistics.ipynb\pdf`|Statistics definitions, hypothesis testing steps, z-statistic, Central Limit Theorem (CLT), Area Under the Curve (AUC), error types, confusion matrix|
|`cs236756_tutorial_03_linear_algebra.ipynb\pdf`|Linear Algebra basics (vectors, inner/outer product spaces, norms, linear dependency, matrix operations, matrix rank, range and nullspace), least-squares solution, eigenvalues and eigenvectors, Singular Value Decomposition (SVD)|
|`cs236756_tutorial_04_pca_feature_selection.ipynb\pdf`|Dimensionality Reduction, Outliers, PCA, SVD, Breast Cancer dataset, Feature Selection, Filter methods, Wrapper methods, RFE (scikit-learn)|
|`cs236756_tutorial_05_evaluation_validation.ipynb\pdf`|Classifier Evaluation and Validation, metrics, accuracy, precision, recall, FN/TP rate, Confusion Matrix, F1 score, K-Fold Cross-Validation, train-validation-test split, holdout method, stratification, ROC curve|
|`cs236756_tutorial_06_decision_trees.ipynb\pdf`|Decision Trees, The CART algorithm, Pruning, Regularization, Impurity Metrics, Entropy, Gini, Information Gain (IG), SplitInformation, Gain Ratio (GR), The Titanic Dataset, Tree Visualization with Scikit-Learn, Random Forest, Mutual Information (MI)|
|`cs236756_tutorial_07_optimization.ipynb\pdf`|Optimization in ML, Gradient Descent, Batch Gradient Descent, Mini-Batch (MB) Gradient Descent, Stochastic Gradient Descent (SGD), Convexity, Uni/Multi-modal problems, Lagrangian and Lagrange Multipliers, Constrained Optimization|
|`cs236756_tutorial_08_linear_regression.ipynb\pdf`|Classification vs. Regression, NLL (Negative Log-Likelihood), MLE connection to MSE, Residual Analysis, Basis Functions Expansion, Feature Extraction, Linear and Polynomial Regression, Bias-Variance Tradeoff, Irreducible Error, Regularization (L1 + L2), Ridge and LASSO Regression|
|`cs236756_tutorial_09_linear_models.ipynb\pdf`|Discriminative vs. Generative Models, Linear Models, Perceptron, Least Mean Square (LMS) - Adaptive Linear Neuron (ADALINE), MLE with Bernoulli, Logistic Regression, Softmax, Maximum A Posteriori (MAP), Quadratic Discriminant Analysis (QDA), Naive Bayes, Linear Discriminant Analysis (LDA), One-vs-All Classification|
|`cs236756_tutorial_10_expectation_maximization.ipynb\pdf`|Soft Clustering, Hard Clustering, K-Means, Incomplete/Complete Likelihood, Expectation Maximization (EM) Algorithm, Gaussian Mixture Model (GMM), Bernoulli Mixture Model (BMM), Dataset Generation with Scikit-Learn|
|`cs236756_tutorial_11_boosting_bagging.ipynb\pdf`|Ensemble Learning, Voting Classifiers, Hard Voting, Soft Voting, Random Forests, Bagging, Pasting, Bootstrap, Boosting, AdaBoost|
|`cs236756_tutorial_12_svm.ipynb\pdf`| Support Vector Machine (SVM), Linear SVM, Hard/Soft SVM, The Primal Problem, The Dual Problem, The Kernel Trick, Kernel SVM, RBF Kernel, Polynomial Kernel, The Mercer Condition|
|`cs236756_tutorial_13_deep_learning_intro_backprop.ipynb\pdf`| Deep Learning Introduction, The XOR Problem, Multi-Layer Perceptron (MLP), Backpropagation, Activation Functions: Sigmoid, Tanh, ReLU, Forward Pass, Backward Pass, Boston Housing Dataset |
|`cs236756_tutorial_14_pac_vc_dimension.ipynb\pdf`| Probably Approximately Correct (PAC) Learning, Risk, Empirical Risk, Empirical Risk Minimization (ERM), Inductive Bias, VC Dimension, Shattering, Dichotomy, No Free Lunch Theorem |

## Installation Instructions
1. Get Anaconda with Python 3 and follow the instructions for your OS (Windows/Mac/Linux) at: https://www.anaconda.com/distribution/
2. Create a new environment for the course (full guide at https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands): on Windows, open `Anaconda Prompt` from the Start menu; on Mac/Linux, open the terminal. Then run `conda create --name ml_course`
3. To activate the environment, open the terminal (or `Anaconda Prompt` on Windows) and run `conda activate ml_course`
4. Install the required libraries according to the table below (to find a specific library and its corresponding command, you can also search https://anaconda.org/); a batched alternative is sketched after the table

### Libraries to Install
|Library | Command to Run |
|----------------|---------|
|`Jupyter Notebook`| `conda install -c conda-forge notebook`|
|`numpy`| `conda install -c conda-forge numpy`|
|`matplotlib`| `conda install -c conda-forge matplotlib`|
|`pandas`| `conda install -c conda-forge pandas`|
|`scipy`| `conda install -c anaconda scipy`|
|`scikit-learn`| `conda install -c conda-forge scikit-learn`|

5. To open the notebooks, run `jupyter notebook` in the terminal (or `Anaconda Prompt` on Windows) while the `ml_course` environment is activated.
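The per-library commands in the table can also be batched. A sketch, assuming the `ml_course` environment is active; the `environment.yml` route is an assumption that the repository's file (see the tree above) pins the same packages:

```bash
# install everything from the table in two commands
conda install -c conda-forge notebook numpy matplotlib pandas scikit-learn
conda install -c anaconda scipy

# alternative (assumption: environment.yml lists the same dependencies)
conda env create -f environment.yml
conda activate ml_course
```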
--------------------------------------------------------------------------------

/assets/ (binary figures):
--------------------------------------------------------------------------------
The tut_01_* through tut_13_* image files listed under `assets/` in the tree above are binaries; each one is served raw at
https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/assets/<filename>
--------------------------------------------------------------------------------
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/assets/tut_13_threshold.jpg -------------------------------------------------------------------------------- /cs236756_tutorial_03_linear_algebra.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# CS 236756 - Technion - Intro to Machine Learning\n", 12 | "---\n", 13 | "#### Tal Daniel\n", 14 | "\n", 15 | "## Tutorial 03 - Linear Algebra & SVD\n", 16 | "---\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "slideshow": { 23 | "slide_type": "subslide" 24 | } 25 | }, 26 | "source": [ 27 | "### Agenda\n", 28 | "---\n", 29 | "* [Linear Algebra Refresher](#-Linear-Algebra-Refresher)\n", 30 | "* [Eigen Values and Vectors Decomposition](#-Eigenvalues-and-Eigenvectors)\n", 31 | "* [Singular Value Decomposition (SVD)](#-Singular-Value-Decomposition-(SVD))\n", 32 | "* [Recommended Videos](#-Recommended-Videos)\n", 33 | "* [Credits](#-Credits)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "slideshow": { 40 | "slide_type": "subslide" 41 | } 42 | }, 43 | "source": [ 44 | "#### Useful Resource\n", 45 | "---\n", 46 | "
\n", 47 | " The Matrix Cookbook\n", 48 | "
" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 1, 54 | "metadata": { 55 | "slideshow": { 56 | "slide_type": "skip" 57 | } 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "# imports for the tutorial\n", 62 | "import numpy as np\n", 63 | "import pandas as pd\n", 64 | "import matplotlib.pyplot as plt\n", 65 | "%matplotlib notebook" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "slideshow": { 72 | "slide_type": "slide" 73 | } 74 | }, 75 | "source": [ 76 | "## Linear Algebra Refresher\n", 77 | "---\n", 78 | "### Vectors\n", 79 | "---\n", 80 | "* Geometric object that has both a magnitude and direction\n", 81 | " * $ x = \\begin{bmatrix} x_{1} \\\\ x_{2} \\\\ \\vdots \\\\ x_{n} \\end{bmatrix} = (x_1, x_2, ..., x_n)^{T} \\in \\mathcal{R}^n$\n", 82 | "* Magnitude of a vector: $||x|| = \\sqrt{x^{T}x} = \\sqrt{x_1^2 +x_2^2 +... +x_n^2}$\n", 83 | "* **Cardinality** of a vector - the number of non zero elements" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 2, 89 | "metadata": { 90 | "slideshow": { 91 | "slide_type": "subslide" 92 | } 93 | }, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "v:\n", 100 | "[[ 16]\n", 101 | " [ 0]\n", 102 | " [ 19]\n", 103 | " [-16]\n", 104 | " [ -9]\n", 105 | " [ 10]]\n", 106 | "v^T:\n", 107 | "[[ 16 0 19 -16 -9 10]]\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "# let's see some vectors\n", 113 | "v = np.random.randint(low=-20, high=20, size=(6, 1))\n", 114 | "print(\"v:\")\n", 115 | "print(v)\n", 116 | "print(\"v^T:\")\n", 117 | "print(v.T)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 3, 123 | "metadata": { 124 | "slideshow": { 125 | "slide_type": "subslide" 126 | } 127 | }, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "magnitude of v:\n", 134 | "32.46536616149585\n", 135 | "cardinality- non zero elements:\n", 136 | "5\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "print(\"magnitude of v:\")\n", 142 | "print(np.sqrt(np.sum(np.square(v))))\n", 143 | "print(\"cardinality- non zero elements:\")\n", 144 | "print(np.sum(v != 0))" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": { 150 | "slideshow": { 151 | "slide_type": "slide" 152 | } 153 | }, 154 | "source": [ 155 | "### Inner Product Space\n", 156 | "---\n", 157 | "* A mapping $\\langle \\cdot, \\cdot \\rangle : V \\times V \\rightarrow F$ that satisfies:\n", 158 | " * Conjucate Symmetry: $\\langle x, y \\rangle = \\overline{\\langle y, x \\rangle} $\n", 159 | " * Linearity in the First Argument: \n", 160 | " * $\\langle a \\cdot x, y \\rangle = a \\cdot \\langle x, y \\rangle$\n", 161 | " * $\\langle x + z, y \\rangle = \\langle x, y \\rangle + \\langle z, y \\rangle$\n", 162 | " * Positive-definiteness: \n", 163 | " * $\\langle x, x \\rangle \\geq 0$\n", 164 | " * $\\langle x, x \\rangle = 0 \\rightarrow x=0$" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "slideshow": { 171 | "slide_type": "subslide" 172 | } 173 | }, 174 | "source": [ 175 | "\n", 176 | "* Common Inner Products:\n", 177 | " * Real Vector: $\\langle x, y \\rangle = x^{T} y$\n", 178 | " * Real Matrix: $\\langle A, B \\rangle = \\textit{trace}(AB^{T})$\n", 179 | " * Random Variables: $\\langle x, y \\rangle = \\mathbb{E}[x \\cdot y]$\n" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": { 185 | "slideshow": { 186 | "slide_type": 
"subslide" 187 | } 188 | }, 189 | "source": [ 190 | "* Properties of **Dot Product**:\n", 191 | " * Distributiveness: \n", 192 | " * $(a + b)\\cdot c = a \\cdot c + b \\cdot c$\n", 193 | " * $a \\cdot (b+c) = a\\cdot b + a\\cdot c$\n", 194 | " * Linearity: $(\\lambda a)\\cdot b= a \\cdot (\\lambda b) = \\lambda(a \\cdot b)$\n", 195 | " * Symmetry: $a \\cdot b= b\\cdot a$\n", 196 | " * Non-Negativity: $\\forall a \\neq 0, a\\cdot a >0 , a \\cdot a =0 \\iff a=0$" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 4, 202 | "metadata": { 203 | "slideshow": { 204 | "slide_type": "subslide" 205 | } 206 | }, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "a:\n", 213 | "[[1.]\n", 214 | " [1.]\n", 215 | " [1.]\n", 216 | " [1.]\n", 217 | " [1.]]\n", 218 | "b:\n", 219 | "[[ 3]\n", 220 | " [-4]\n", 221 | " [ 5]\n", 222 | " [ 8]\n", 223 | " [-4]]\n", 224 | "a.T.dot(b)=\n", 225 | "[[8.]]\n", 226 | "the same as a.T @ b:\n", 227 | "[[8.]]\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "# let's see some dot products\n", 233 | "a = np.ones((5,1))\n", 234 | "b = np.random.randint(low=-10, high=10, size=(5,1))\n", 235 | "print(\"a:\")\n", 236 | "print(a)\n", 237 | "print(\"b:\")\n", 238 | "print(b)\n", 239 | "print(\"a.T.dot(b)=\")\n", 240 | "print(a.T.dot(b))\n", 241 | "print(\"the same as a.T @ b:\")\n", 242 | "print(a.T @ b)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 5, 248 | "metadata": { 249 | "slideshow": { 250 | "slide_type": "subslide" 251 | } 252 | }, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "a + 0.5=\n", 259 | "[[1.5]\n", 260 | " [1.5]\n", 261 | " [1.5]\n", 262 | " [1.5]\n", 263 | " [1.5]]\n", 264 | "(a + 2 * a).T @ b\n", 265 | "[[24.]]\n", 266 | "the same as a.T @ b + (2 * a).T @ b\n", 267 | "[[24.]]\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "print(\"a + 0.5=\")\n", 273 | "print(a + 0.5)\n", 274 | "print(\"(a + 2 * a).T @ b\")\n", 275 | "print((a + 2 * a).T @ b)\n", 276 | "print(\"the same as a.T @ b + (2 * a).T @ b\")\n", 277 | "print(a.T @ b + (2 * a).T @ b)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": { 283 | "slideshow": { 284 | "slide_type": "slide" 285 | } 286 | }, 287 | "source": [ 288 | "### Outer Product\n", 289 | "---\n", 290 | "* Let:\n", 291 | " * $a = (a_1, a_2, ..., a_n)^{T}$\n", 292 | " * $b = (b_1, b_2, ..., b_n)^{T}$\n", 293 | "* The outer product $ab^{T}$: $$ ab^{T} = \\begin{bmatrix} a_{1} \\\\ a_{2} \\\\ \\vdots \\\\ a_{n} \\end{bmatrix} [b_1, b_2, ..., b_n] = \\begin{pmatrix} a_1 b_1 & a_1 b_2 & \\cdots & a_1 b_n \\\\ a_2 b_1 & a_2 b_2 & \\cdots & a_2 b_n \\\\ \\vdots & \\vdots & \\ddots & \\vdots \\\\ a_n b_1 & a_n b_2 & \\cdots & a_n b_n \\end{pmatrix} $$" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 6, 299 | "metadata": { 300 | "slideshow": { 301 | "slide_type": "subslide" 302 | } 303 | }, 304 | "outputs": [ 305 | { 306 | "name": "stdout", 307 | "output_type": "stream", 308 | "text": [ 309 | "a:\n", 310 | "[[0.68496376]\n", 311 | " [0.51514789]\n", 312 | " [0.97263803]\n", 313 | " [0.47948046]\n", 314 | " [0.97063678]]\n", 315 | "b:\n", 316 | "[[0.16180323]\n", 317 | " [0.64818973]\n", 318 | " [0.00683339]\n", 319 | " [0.5219497 ]\n", 320 | " [0.02569252]]\n" 321 | ] 322 | } 323 | ], 324 | "source": [ 325 | "# outer product\n", 326 | "a = np.random.random(size=(5,1))\n", 327 | "print(\"a:\")\n", 328 | "print(a)\n", 329 | 
"b = np.random.random(size=(5,1))\n", 330 | "print(\"b:\")\n", 331 | "print(b)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 7, 337 | "metadata": { 338 | "slideshow": { 339 | "slide_type": "subslide" 340 | } 341 | }, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "outer product: a @ b.T = \n", 348 | "[[0.11082935 0.44398648 0.00468062 0.35751663 0.01759844]\n", 349 | " [0.08335259 0.33391357 0.00352021 0.26888128 0.01323545]\n", 350 | " [0.15737597 0.63045398 0.00664641 0.50766812 0.02498952]\n", 351 | " [0.07758149 0.31079431 0.00327648 0.25026468 0.01231906]\n", 352 | " [0.15705217 0.62915679 0.00663274 0.50662357 0.0249381 ]]\n" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "ab_t = a @ b.T\n", 358 | "print(\"outer product: a @ b.T = \")\n", 359 | "print(ab_t)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": { 365 | "slideshow": { 366 | "slide_type": "slide" 367 | } 368 | }, 369 | "source": [ 370 | "### Vector Norms\n", 371 | "---\n", 372 | "* A norm on a vector sapce $\\Omega$ is a function $f: \\Omega \\rightarrow \\mathcal{R}$ with the following properties:\n", 373 | " * Positive Scalability: $f(ax) = |a|f(x)$\n", 374 | " * Triangle Inequality: $f(x+y) \\leq f(x) + f(y)$\n", 375 | " * If $f(x) = 0 \\rightarrow x = 0$\n", 376 | "* $l_1$ norm: $||x||_1 = \\sum_{i=1}^n |x_i| $" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": { 382 | "slideshow": { 383 | "slide_type": "subslide" 384 | } 385 | }, 386 | "source": [ 387 | "\n", 388 | "* $l_2$ norm: $||x||_2 = \\sqrt{\\sum_{i=1}^n |x_i|^2} $\n", 389 | " * For **Vectors**: $||x||_2^2 = x^{T}x$\n", 390 | " * $l_2$-distance: $||x -y||_2^2 = (x-y)^{T}(x-y)= ||x||_2^2 -2x^{T}y + ||y||_2^2$\n", 391 | "* $l_p$ norm: $||x||_p = (\\sum_{i=1}^n |x_i|^p)^{\\frac{1}{p}} $\n", 392 | "* $l_{\\infty}$ norm: $||x||_{\\infty} = \\max{(|x_1|, |x_2|, ..., |x_n|)} $" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": { 398 | "slideshow": { 399 | "slide_type": "subslide" 400 | } 401 | }, 402 | "source": [ 403 | "" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 8, 409 | "metadata": { 410 | "slideshow": { 411 | "slide_type": "subslide" 412 | } 413 | }, 414 | "outputs": [ 415 | { 416 | "name": "stdout", 417 | "output_type": "stream", 418 | "text": [ 419 | "a:\n", 420 | "[[0.20110422]\n", 421 | " [0.3103417 ]\n", 422 | " [0.25755954]\n", 423 | " [0.84291866]\n", 424 | " [0.00855558]]\n", 425 | "l-1 norm: \n", 426 | "1.6204796988041368\n", 427 | "l-2 norm: \n", 428 | "0.9558644554276373\n", 429 | "l-infinity norm:\n", 430 | "0.8429186563888088\n" 431 | ] 432 | } 433 | ], 434 | "source": [ 435 | "# norms and distance\n", 436 | "a = np.random.random(size=(5,1))\n", 437 | "print(\"a:\")\n", 438 | "print(a)\n", 439 | "print(\"l-1 norm: \")\n", 440 | "print(np.sum(abs(a)))\n", 441 | "print(\"l-2 norm: \")\n", 442 | "print(np.sqrt(np.sum(np.square(a))))\n", 443 | "print(\"l-infinity norm:\")\n", 444 | "print(np.max(abs(a)))" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 9, 450 | "metadata": { 451 | "slideshow": { 452 | "slide_type": "subslide" 453 | } 454 | }, 455 | "outputs": [ 456 | { 457 | "name": "stdout", 458 | "output_type": "stream", 459 | "text": [ 460 | "b:\n", 461 | "[[0.59011591]\n", 462 | " [0.77681828]\n", 463 | " [0.31464032]\n", 464 | " [0.78600795]\n", 465 | " [0.85952156]]\n", 466 | "l-2 distance between a and b:\n", 467 | 
"[[1.04860414]]\n" 468 | ] 469 | } 470 | ], 471 | "source": [ 472 | "b = np.random.random(size=(5,1))\n", 473 | "print(\"b:\")\n", 474 | "print(b)\n", 475 | "print(\"l-2 distance between a and b:\")\n", 476 | "print(np.sqrt((a - b).T @ (a - b)))" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": { 482 | "slideshow": { 483 | "slide_type": "slide" 484 | } 485 | }, 486 | "source": [ 487 | "### Linear Dependency\n", 488 | "---\n", 489 | "* Given a set of vectors $X =\\{x_1, x_2, ..., x_n \\}$, a **linear combination** of vectors is written as:\n", 490 | "$$ ax = a_1 x_1 + a_2 x_2 + ... +a_n x_n $$\n", 491 | "* $x_i \\in X$ is **linearly dependent** if it can be written as linear combination of $X \\setminus \\{x_i\\}$" 492 | ] 493 | }, 494 | { 495 | "cell_type": "markdown", 496 | "metadata": { 497 | "slideshow": { 498 | "slide_type": "subslide" 499 | } 500 | }, 501 | "source": [ 502 | "" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": { 508 | "slideshow": { 509 | "slide_type": "slide" 510 | } 511 | }, 512 | "source": [ 513 | "### Basis\n", 514 | "---\n", 515 | "* A basis is a **linearly independent** set of vectors that spans the \"whole sapce\"\n", 516 | "* Every vector in the space can be written as a linear combination of vectors in the basis\n", 517 | " * For example, **the standard basis (unit vectors)**: $\\{e_i \\in \\mathcal{R}^n | e_i =(0, 0, ..., 0, 1,0, ..., 0)^{T}\\}$ \n", 518 | " * $x^{T} = (3 ,2 ,5)^{T} = 3(1,0,0)^{T}+2(0,1,0)^{T}+5(0,0,1)^{T} = 3e_1^T +2e_2^T +5e_3^T$" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": { 524 | "slideshow": { 525 | "slide_type": "subslide" 526 | } 527 | }, 528 | "source": [ 529 | "* **Projection** of a vector: $x\\cdot e_i = x^T e_i = e_i^T x$\n", 530 | "* The basis vectors suffice:\n", 531 | " * Orthogonal - $e_i^T e_j = 0$\n", 532 | " * Normalized - $e_i^T e_i = 1$\n", 533 | " * Orthogonal + Normalized = Orthonormal\n", 534 | " * If $A$ is **orthogonal** then:\n", 535 | " * $A$ is a square matrix\n", 536 | " * The columns of $A$ are **orthonormal** vectors\n", 537 | " * $A^TA = AA^T = I \\rightarrow A^T= A^{-1}$" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": { 543 | "slideshow": { 544 | "slide_type": "subslide" 545 | } 546 | }, 547 | "source": [ 548 | "* **Change of Basis** - suppose that we have a basis not necessarily orthonormal $B=\\{b_1, b_2, ..., b_n\\}, b_i \\in \\mathcal{R}^m $\n", 549 | " * Vector in the **new** basis is represented with a **matrix-vector** multiplication\n", 550 | " * The Identity matrix $I$ maps a vector to itself\n", 551 | " * Basis change can be decomposed to: **rotation** matrix and **scale** matrix\n", 552 | " * Using an **orthonormal** basis means only a **rotation** around the origin\n", 553 | " * **Gram-Schmidt Orthonormaliztion Process**: Link\n", 554 | " " 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "metadata": { 560 | "slideshow": { 561 | "slide_type": "subslide" 562 | } 563 | }, 564 | "source": [ 565 | "\n", 566 | " By Lucas V. Barbosa - Own work, Public Domain, Link" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 6, 572 | "metadata": { 573 | "slideshow": { 574 | "slide_type": "skip" 575 | } 576 | }, 577 | "outputs": [ 578 | { 579 | "name": "stdout", 580 | "output_type": "stream", 581 | "text": [ 582 | "V:\n", 583 | "[[3. 2.]\n", 584 | " [1. 
2.]]\n", 585 | "U:\n", 586 | "[[0.9486833 0.70710678]\n", 587 | " [0.31622777 0.70710678]]\n" 588 | ] 589 | } 590 | ], 591 | "source": [ 592 | "# Gram-Schmidt Algorithm\n", 593 | "def gram_schmidt(V):\n", 594 | " \"\"\"\n", 595 | " Implements Gram-Schmidt Orthonormaliztion Process.\n", 596 | " Parameters:\n", 597 | " V - matrix such that each column is a vector in the original basis\n", 598 | " Returns:\n", 599 | " U - matrix with orthonormal vectors as columns\n", 600 | " \"\"\"\n", 601 | " n, k = np.array(V, dtype=np.float).shape # get dimensions\n", 602 | " # initialize U matrix\n", 603 | " U = np.zeros_like(V, dtype=np.float)\n", 604 | " U[:,0] = V[:,0] / np.sqrt(V[:,0].T @ V[:,0])\n", 605 | " for i in range(1, k):\n", 606 | " U[:,i] = V[:,i]\n", 607 | " for j in range(i - 1):\n", 608 | " U[:,i] = U[:,i] - ((U[:,i].T @ U[:,j]) / (U[:,j].T @ U[:,j])) * U[:, j]\n", 609 | " # normalize\n", 610 | " U[:,i] = U[:,i] / np.sqrt(U[:,i].T @ U[:,i])\n", 611 | " return U\n", 612 | "\n", 613 | "v1 = [3.0, 1.0]\n", 614 | "v2 = [2.0, 2.0]\n", 615 | "v = np.stack((v1, v2), axis=1)\n", 616 | "print(\"V:\")\n", 617 | "print(v)\n", 618 | "U = gram_schmidt(v)\n", 619 | "print(\"U:\")\n", 620 | "print(U) " 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": { 626 | "slideshow": { 627 | "slide_type": "slide" 628 | } 629 | }, 630 | "source": [ 631 | "### Matrix Operations\n", 632 | "---\n", 633 | "* Addition\n", 634 | " * Commutative: $A + B = B +A$\n", 635 | " * Associative: $(A+B) + C = A + (B+C)$\n", 636 | "* Multiplication - **PAY ATTENTION TO DIMENSTIONS**\n", 637 | " * Associative: $A(BC) = (AB)C$\n", 638 | " * Distributive: $A(B+C) = AB + AC$\n", 639 | " * Non-comutative (**!**): $AB \\neq BA$" 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "metadata": { 645 | "slideshow": { 646 | "slide_type": "subslide" 647 | } 648 | }, 649 | "source": [ 650 | "* Transpose\n", 651 | " * $(A^{T})_{ij}$\n", 652 | " * $(A^{T})^T = A$\n", 653 | " * $(AB)^{T} = B^{T}A^{T}$\n", 654 | "* Inverse - **MAKE SURE CONDITIONS APPLY**\n", 655 | " * **Positive Semi-definite (PSD)** - Matrix $M$ is called *PSD* if for every non-zero column vector $z$, the scalar $z^T M z \\geq 0$\n", 656 | " * **Every positive definite matrix is invertible** and its inverse is also positive definite\n", 657 | " * $(A^{-1})^{-1} = A$\n", 658 | " * $(AB)^{-1} = B^{-1} A^{-1}$\n", 659 | " * $(A^T)^{-1} = A^{-T}$\n", 660 | " * Inverse of 2x2 matrix: see tutorial 1" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": 10, 666 | "metadata": { 667 | "slideshow": { 668 | "slide_type": "subslide" 669 | } 670 | }, 671 | "outputs": [ 672 | { 673 | "name": "stdout", 674 | "output_type": "stream", 675 | "text": [ 676 | "A:\n", 677 | "[[0.56274722 0.57692677 0.31759767 0.9135175 0.39388189]\n", 678 | " [0.3260898 0.73720574 0.3526661 0.02961814 0.16645483]\n", 679 | " [0.01740472 0.24892669 0.4684225 0.60255541 0.11491183]\n", 680 | " [0.60243149 0.97287256 0.72073364 0.33608398 0.94720029]\n", 681 | " [0.3300669 0.15559865 0.27349031 0.41204091 0.83342534]]\n", 682 | "inverse of A:\n", 683 | "[[ 21.57251296 -108.00106195 -17.70755954 87.22168674 -85.31216784]\n", 684 | " [ -14.53515995 78.79387459 11.54867247 -62.74719293 60.8531958 ]\n", 685 | " [ 14.80752023 -84.10430348 -11.05067623 68.39955569 -66.41395368]\n", 686 | " [ -4.51707378 27.97302751 4.82828719 -23.33701698 22.40506618]\n", 687 | " [ -8.45572468 41.8310235 6.09595381 -33.73598262 34.3423877 ]]\n" 688 | ] 689 | } 690 | ], 691 | "source": 
[ 692 | "# inverse\n", 693 | "A = np.random.rand(5, 5)\n", 694 | "print(\"A:\")\n", 695 | "print(A)\n", 696 | "print(\"inverse of A:\")\n", 697 | "print(np.linalg.inv(A))" 698 | ] 699 | }, 700 | { 701 | "cell_type": "markdown", 702 | "metadata": { 703 | "slideshow": { 704 | "slide_type": "slide" 705 | } 706 | }, 707 | "source": [ 708 | "### Matrix Rank\n", 709 | "---\n", 710 | "* The rank of a matrix is the **maximal number of linearly independent** columns or rows of a matrix\n", 711 | "* $ A \\in \\mathcal{R}^{m \\times n} \\rightarrow \\textit{rank}(A) \\leq \\min(m,n)$\n", 712 | "* $\\textit{rank}(A) = \\textit{rank}(A^T)$\n", 713 | "* $\\textit{rank}(A^T A) = \\textit{rank}(A)$\n", 714 | "* $\\textit{rank}(A + B) \\leq \\textit{rank}(A) + \\textit{rank}(B)$\n", 715 | "* $\\textit{rank}(AB) \\leq \\min(\\textit{rank}(A), \\textit{rank}(B))$\n", 716 | "* A is **full rank** if $\\textit{rank}(A) = \\min(m,n)$\n", 717 | "* **Singular Matrix** - has dependent rows (and at least one zero eigen-value)" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": 11, 723 | "metadata": { 724 | "slideshow": { 725 | "slide_type": "subslide" 726 | } 727 | }, 728 | "outputs": [ 729 | { 730 | "name": "stdout", 731 | "output_type": "stream", 732 | "text": [ 733 | "A:\n", 734 | "[[0 3 3 3 1]\n", 735 | " [1 1 1 3 3]\n", 736 | " [1 1 2 2 0]\n", 737 | " [2 0 3 1 2]\n", 738 | " [3 1 2 1 1]]\n", 739 | "rank(A):\n", 740 | "5\n" 741 | ] 742 | } 743 | ], 744 | "source": [ 745 | "A = np.random.randint(low=0, high=4, size=(5,5))\n", 746 | "print(\"A:\")\n", 747 | "print(A)\n", 748 | "print(\"rank(A):\")\n", 749 | "print(np.linalg.matrix_rank(A))" 750 | ] 751 | }, 752 | { 753 | "cell_type": "markdown", 754 | "metadata": { 755 | "slideshow": { 756 | "slide_type": "slide" 757 | } 758 | }, 759 | "source": [ 760 | "### Range & Nullspace\n", 761 | "---\n", 762 | "* **Range** (of a matrix) - the span of the columns of the matrix, denoted by the set: $$\\mathcal{R}(A) = \\{y|y= Ax\\} $$\n", 763 | "* **Nullspace** (of a matrix) - the set of vectors that when multiplied by the matrix result in 0, given by the set: $$\\mathcal{N}(A) = \\{x|Ax=0\\} $$" 764 | ] 765 | }, 766 | { 767 | "cell_type": "markdown", 768 | "metadata": { 769 | "slideshow": { 770 | "slide_type": "slide" 771 | } 772 | }, 773 | "source": [ 774 | "### Determinant\n", 775 | "---\n", 776 | "Let $A = \\begin{pmatrix}x_1 & y_1 & z_1 \\\\ x_2 & y_2 & z_2 \\\\ x_3 & y_3 & z_3 \\end{pmatrix} $, a **square matrix**, then:\n", 777 | "$$det(A) = |A| = \\begin{vmatrix} x_1 & y_1 & z_1 \\\\ x_2 & y_2 & z_2 \\\\ x_3 & y_3 & z_3 \\end{vmatrix} = x_1 \\begin{vmatrix} y_1 & z_2 \\\\ y_3 & z_3 \\end{vmatrix} -x_2 \\begin{vmatrix} y_1 & z_1 \\\\ y_3 & z_3 \\end{vmatrix} +x_3\\begin{vmatrix} y_1 & z_1\\\\ y_2 & z_2 \\end{vmatrix}$$
$$ = x_1 (y_2z_3 - z_2 y_3) -x_2(y_1z_3 - z_1y_3) +x_3(y_1z_2 - z_1 y_2) $$\n", 778 | "* $det(A) = 0 \\iff A$ is **singular** (at least one eigen-value is zero)\n", 779 | "* If $A$ is diagonal, then $det(A)$ is the prodcut of the diagonal elements (the eigen-values)\n", 780 | "* $det(AB) = det(A)det(B)$\n", 781 | "* $det(A^{-1}) = det(A)^{-1}$\n", 782 | "* $det(\\lambda A) = \\lambda^n det(A)$" 783 | ] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": 12, 788 | "metadata": { 789 | "slideshow": { 790 | "slide_type": "subslide" 791 | } 792 | }, 793 | "outputs": [ 794 | { 795 | "name": "stdout", 796 | "output_type": "stream", 797 | "text": [ 798 | "A:\n", 799 | "[[-0.11682683 -0.60007878 0.20168493 -0.41938087 -1.44710738]\n", 800 | " [-0.77820688 0.97102027 -0.95386608 -0.81321839 0.83334389]\n", 801 | " [-1.44149225 -0.44278972 -0.07846115 0.59192462 0.21563895]\n", 802 | " [-0.75701366 -1.49163516 -0.2865721 -0.46047925 -0.01296227]\n", 803 | " [ 1.250518 1.20554034 -0.14421321 0.44739448 -0.14740781]]\n", 804 | "det(A):\n", 805 | "3.073911389887483\n" 806 | ] 807 | } 808 | ], 809 | "source": [ 810 | "# determinant\n", 811 | "A = np.random.randn(5,5)\n", 812 | "print(\"A:\")\n", 813 | "print(A)\n", 814 | "print(\"det(A):\")\n", 815 | "print(np.linalg.det(A))" 816 | ] 817 | }, 818 | { 819 | "cell_type": "markdown", 820 | "metadata": { 821 | "slideshow": { 822 | "slide_type": "slide" 823 | } 824 | }, 825 | "source": [ 826 | "## Solve Linear Equation Analytically\n", 827 | "---\n", 828 | "* Definitions:\n", 829 | " * $A \\in \\mathcal{R}^{n \\times n}$\n", 830 | " * $x, b \\in \\mathcal{R}^{n \\times 1}$\n", 831 | "* The problem: find the solution of $Ax = b$\n", 832 | "* Solution: if $A$ is PSD (and thus invertible), then $x = A^{-1} b$\n", 833 | "* What if $A \\in \\mathcal{R}^{m \\times n}$, $x \\in \\mathcal{R}^{n \\times 1}$, $b \\in \\mathcal{R}^{m \\times 1}$ ?\n", 834 | " * $A$ is no longer invertible!\n", 835 | "* The problem redefined: find $x$ that minimzes the distance from $Ax$ to $b$, or more formally: $$ \\underset{x}{\\mathrm{argmin}} ||Ax - b ||_2^2$$ (also called **least-squares** solution)" 836 | ] 837 | }, 838 | { 839 | "cell_type": "markdown", 840 | "metadata": { 841 | "slideshow": { 842 | "slide_type": "subslide" 843 | } 844 | }, 845 | "source": [ 846 | "### Reminder (Tutorial 01) - Vector & Matrix Derivatives\n", 847 | "---\n", 848 | "* $\\nabla_x Ax = A^{T}$\n", 849 | "* $\\nabla_x x^{T} A x = (A + A^{T}) x$ \n", 850 | "* $\\frac{\\partial}{\\partial A} \\ln |A| = A^{-T}$\n", 851 | "* $\\frac{\\partial}{\\partial A} Tr[AB] = B^{T}$" 852 | ] 853 | }, 854 | { 855 | "cell_type": "markdown", 856 | "metadata": { 857 | "slideshow": { 858 | "slide_type": "subslide" 859 | } 860 | }, 861 | "source": [ 862 | "### Exercise 1 - Least-Squares Solution\n", 863 | "---\n", 864 | "Given $A \\in \\mathcal{R}^{m \\times n}$, $x \\in \\mathcal{R}^{n \\times 1}$, $b \\in \\mathcal{R}^{m \\times 1}$\n", 865 | "\n", 866 | "Find $x$ that minimizes the distance from $Ax$ to $b$, or more formally: $$ \\underset{x}{\\mathrm{argmin}} ||Ax - b ||_2^2$$" 867 | ] 868 | }, 869 | { 870 | "cell_type": "markdown", 871 | "metadata": { 872 | "slideshow": { 873 | "slide_type": "subslide" 874 | } 875 | }, 876 | "source": [ 877 | "### Solution 1\n", 878 | "---\n", 879 | "$$ ||Ax - b ||_2^2 = (Ax-b)^T (Ax-b) = x^TA^TAx -x^TA^Tb-b^TAx +b^Tb $$\n", 880 | "$$\\frac{\\partial ||Ax - b ||_2^2}{\\partial x} = 2A^TAx-2A^Tb = 0 \\rightarrow x = (A^TA)^{-1}A^Tb $$" 881 | ] 882 | }, 883 | { 884 | 
"cell_type": "code", 885 | "execution_count": 13, 886 | "metadata": { 887 | "slideshow": { 888 | "slide_type": "subslide" 889 | } 890 | }, 891 | "outputs": [ 892 | { 893 | "name": "stdout", 894 | "output_type": "stream", 895 | "text": [ 896 | "A:\n", 897 | "[[ 3 2 8 9]\n", 898 | " [-3 -5 -5 2]\n", 899 | " [ 0 5 7 5]\n", 900 | " [ 1 -3 6 -5]\n", 901 | " [ 1 1 8 6]]\n", 902 | "b:\n", 903 | "[[-2]\n", 904 | " [-7]\n", 905 | " [-3]\n", 906 | " [-3]\n", 907 | " [ 0]]\n" 908 | ] 909 | } 910 | ], 911 | "source": [ 912 | "# Least Squares Solution\n", 913 | "m = 5\n", 914 | "n = 4\n", 915 | "A = np.random.randint(low=-5, high=10, size=(m,n))\n", 916 | "b = np.random.randint(low=-10, high=3, size=(m,1))\n", 917 | "print(\"A:\")\n", 918 | "print(A)\n", 919 | "print(\"b:\")\n", 920 | "print(b)" 921 | ] 922 | }, 923 | { 924 | "cell_type": "code", 925 | "execution_count": 14, 926 | "metadata": { 927 | "slideshow": { 928 | "slide_type": "subslide" 929 | } 930 | }, 931 | "outputs": [ 932 | { 933 | "name": "stdout", 934 | "output_type": "stream", 935 | "text": [ 936 | "Least Squares solution for x:\n", 937 | "[[ 1.54495052]\n", 938 | " [ 0.65381817]\n", 939 | " [-0.47872248]\n", 940 | " [-0.27042109]]\n" 941 | ] 942 | } 943 | ], 944 | "source": [ 945 | "print(\"Least Squares solution for x:\")\n", 946 | "x = np.linalg.inv(A.T @ A) @ A.T @ b\n", 947 | "print(x)" 948 | ] 949 | }, 950 | { 951 | "cell_type": "markdown", 952 | "metadata": { 953 | "slideshow": { 954 | "slide_type": "slide" 955 | } 956 | }, 957 | "source": [ 958 | "## Solve Linear Equation Non-Analytically\n", 959 | "---\n", 960 | "### Eigenvalues and Eigenvectors\n", 961 | "---\n", 962 | "* Definition: Matrix $A$ with **Eigenvalue** $\\lambda \\in \\mathbb{C}$ and **Eigenvector** $x \\in \\mathbb{C}^n$ if $$Ax=\\lambda x, x \\neq 0 $$\n", 963 | "* Finding eigenvalues and eigenvectors\n", 964 | " * Find eigenvalues by finding the roots of the polynomial generated by: $$det(\\lambda I -A) = |\\lambda I -A| =0 $$\n", 965 | " * For each eigenvalue $\\lambda$, find its corresponding eigenvector $x$ by solving: $$ Ax = \\lambda x$$" 966 | ] 967 | }, 968 | { 969 | "cell_type": "markdown", 970 | "metadata": { 971 | "slideshow": { 972 | "slide_type": "subslide" 973 | } 974 | }, 975 | "source": [ 976 | "* Example: $M = \\begin{pmatrix} 2 & 1 \\\\ 1 & 2 \\end{pmatrix} \\rightarrow |\\lambda I -M| = \\begin{vmatrix} 2 - \\lambda & 1 \\\\ 1 & 2 - \\lambda \\end{vmatrix} = 3 - 4 \\lambda + \\lambda^2 \\rightarrow \\lambda_{1,2} = 1, 3 \\rightarrow x_{\\lambda = 1}= \\begin{bmatrix} 1 \\\\ -1 \\end{bmatrix} , x_{\\lambda=3} = \\begin{bmatrix} 1 \\\\ 1 \\end{bmatrix}$\n", 977 | " \n", 978 | "* Eigenvalues Properties\n", 979 | " * $det(\\Lambda) = |\\Lambda| = \\prod_{i=1}^n \\lambda_i$\n", 980 | " * $\\textit{rank}(A) = \\sum_{i=1}^n \\mathbb{1}_{\\lambda_i \\neq 0}$\n", 981 | " * Eigenvalues of a **diagonal** matrix are the diagonal entries\n", 982 | " * A (square) matrix is said to be **diagonalizable** if it can be rewritten as: $A = X \\Lambda X^{-1}$\n", 983 | "* Eigenvalues of **Symmetric Matrices**:\n", 984 | " * Eigenvalues are **real**\n", 985 | " * Eigenvectors of **real symmetric** matrices are orthonormal" 986 | ] 987 | }, 988 | { 989 | "cell_type": "code", 990 | "execution_count": 15, 991 | "metadata": {}, 992 | "outputs": [ 993 | { 994 | "name": "stdout", 995 | "output_type": "stream", 996 | "text": [ 997 | "A:\n", 998 | "[[-4 -9 7 8 1]\n", 999 | " [ 6 -8 -5 -3 -9]\n", 1000 | " [ 0 9 -6 0 3]\n", 1001 | " [ 8 2 -6 0 -6]\n", 1002 | " [ 3 3 2 4 
-1]]\n" 1003 | ] 1004 | } 1005 | ], 1006 | "source": [ 1007 | "# eigenvalues and eigenvectors\n", 1008 | "A = np.random.randint(low=-10, high=10, size=(5,5))\n", 1009 | "eig, vec = np.linalg.eig(A)\n", 1010 | "print(\"A:\")\n", 1011 | "print(A)" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": 16, 1017 | "metadata": { 1018 | "slideshow": { 1019 | "slide_type": "subslide" 1020 | } 1021 | }, 1022 | "outputs": [ 1023 | { 1024 | "name": "stdout", 1025 | "output_type": "stream", 1026 | "text": [ 1027 | "eigenvalues:\n", 1028 | "[-9.29854727+11.14091902j -9.29854727-11.14091902j\n", 1029 | " 3.93378061 +0.j -3.01573245 +0.j\n", 1030 | " -1.32095361 +0.j ]\n", 1031 | "eigenvectors:\n", 1032 | "[[ 0.2627824 -0.45749602j 0.2627824 +0.45749602j -0.68051908+0.j\n", 1033 | " 0.33207203+0.j 0.54461743+0.j ]\n", 1034 | " [-0.57285962+0.j -0.57285962-0.j 0.20089578+0.j\n", 1035 | " -0.39152492+0.j -0.32875482+0.j ]\n", 1036 | " [ 0.13729842+0.38931395j 0.13729842-0.38931395j 0.00207705+0.j\n", 1037 | " -0.45532511+0.j -0.15673315+0.j ]\n", 1038 | " [-0.31676371+0.31808551j -0.31676371-0.31808551j -0.3762196 +0.j\n", 1039 | " -0.09140478+0.j -0.14305208+0.j ]\n", 1040 | " [ 0.12184535+0.08182002j 0.12184535-0.08182002j -0.59580967+0.j\n", 1041 | " 0.72163745+0.j 0.74181059+0.j ]]\n" 1042 | ] 1043 | } 1044 | ], 1045 | "source": [ 1046 | "print(\"eigenvalues:\")\n", 1047 | "print(eig)\n", 1048 | "print(\"eigenvectors:\")\n", 1049 | "print(vec)" 1050 | ] 1051 | }, 1052 | { 1053 | "cell_type": "markdown", 1054 | "metadata": { 1055 | "slideshow": { 1056 | "slide_type": "slide" 1057 | } 1058 | }, 1059 | "source": [ 1060 | "## Eigen Decomposition\n", 1061 | "---\n", 1062 | "* **Eigen-decomposition** (also **spectral decomposition**) - factorization of a matrix into a canonical form, that is, the matrix is represented in terms of its **eigenvalues and eigenvectors**.\n", 1063 | "* **Only** diagonalizable matrices can be factorized\n", 1064 | "* Formally:\n", 1065 | " * Denote $\\Lambda$ as a matrix with eigenvalues on the diagonal\n", 1066 | " * Denote $Q$ as a matrix where the columns are the eigenvectors\n", 1067 | " * Let $A$ be a square $n \\times n$ matrix with $N$ linearly **independent** columns. Then $A$ can factorized as: $$A = Q \\Lambda Q^{-1} $$" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "markdown", 1072 | "metadata": { 1073 | "slideshow": { 1074 | "slide_type": "slide" 1075 | } 1076 | }, 1077 | "source": [ 1078 | "# What If A Is Non-Square?" 1079 | ] 1080 | }, 1081 | { 1082 | "cell_type": "markdown", 1083 | "metadata": { 1084 | "slideshow": { 1085 | "slide_type": "slide" 1086 | } 1087 | }, 1088 | "source": [ 1089 | "## Singular Value Decomposition (SVD)\n", 1090 | "---\n", 1091 | "* In linear algebra, the singular-value decomposition (SVD) is a factorization of a real or complex matrix. It is the generalization of the eigendecomposition of a positive semidefinite normal matrix (for example, a symmetric matrix with positive eigenvalues) to any $ m\\times n$ matrix via an extension of the polar decomposition.\n", 1092 | "* Definition: $$ A_{[m \\times n]} = U_{[m \\times r]} \\Sigma_{[r \\times r]} (V_{[n \\times r]})^T $$" 1093 | ] 1094 | }, 1095 | { 1096 | "cell_type": "markdown", 1097 | "metadata": { 1098 | "slideshow": { 1099 | "slide_type": "subslide" 1100 | } 1101 | }, 1102 | "source": [ 1103 | "* $A$ - Input Data matrix\n", 1104 | " * $m \\times n$ matrix (e.g. 
$m$ documents and $n$ terms that can appear in each document)\n", 1105 | "* $U$ - Left Singular vectors\n", 1106 | " * $m \\times r$ matrix (e.g. $m$ documents and $r$ concepts)\n", 1107 | " * $U = eig(AA^T)$" 1108 | ] 1109 | }, 1110 | { 1111 | "cell_type": "markdown", 1112 | "metadata": { 1113 | "slideshow": { 1114 | "slide_type": "subslide" 1115 | } 1116 | }, 1117 | "source": [ 1118 | "* $\\Sigma$ - Singular values\n", 1119 | " * $r \\times r$ **diagonal** matrix (strength of each 'concept')\n", 1120 | " * $r$ represnts the **rank** of matrix $A$\n", 1121 | " * $\\Sigma = diag\\left(\\sqrt{eigenvalues(A^TA)}\\right)$\n", 1122 | " * **Singular Values** definition: the singular values of a matrix $X \\in \\mathbb{R}^{M \\times N}$ are the *square root* of the **eigenvalues** of the matrix $X^TX \\in \\mathbb{R}^{N \\times N}$. If $X \\in \\mathbb{R}^{N \\times N}$ already, then the singular values are the eigenvalues.\n", 1123 | "* $V$ - Right Singular vectors\n", 1124 | " * $n \\times r$ matrix (e.g. $n$ terms and $r$ concepts)\n", 1125 | " * $V = eig(A^TA)$" 1126 | ] 1127 | }, 1128 | { 1129 | "cell_type": "markdown", 1130 | "metadata": { 1131 | "slideshow": { 1132 | "slide_type": "subslide" 1133 | } 1134 | }, 1135 | "source": [ 1136 | "* Illustration:\n", 1137 | " \n", 1138 | " First, we see the unit disc in blue together with the two canonical unit vectors. We then see the action of M, which distorts the disk to an ellipse. The SVD decomposes M into three simple transformations: an initial rotation $V^{*}$, a scaling $\\Sigma$ along the coordinate axes, and a final rotation $U$. The lengths $\\sigma_1$ and $\\sigma_2$ of the semi-axes of the ellipse are the singular values of $M$, namely $\\Sigma_{1,1}$ and $\\Sigma_{2,2}$.\n", 1139 | " \n", 1140 | "* By Kieff - Own work, Public Domain, Link" 1141 | ] 1142 | }, 1143 | { 1144 | "cell_type": "markdown", 1145 | "metadata": { 1146 | "slideshow": { 1147 | "slide_type": "subslide" 1148 | } 1149 | }, 1150 | "source": [ 1151 | "* Another way to look at SVD: $$ A \\approx U\\Sigma V^T = \\sum_i \\sigma_i u_i \\circ v_i^T $$ \n", 1152 | " " 1153 | ] 1154 | }, 1155 | { 1156 | "cell_type": "markdown", 1157 | "metadata": { 1158 | "slideshow": { 1159 | "slide_type": "subslide" 1160 | } 1161 | }, 1162 | "source": [ 1163 | "* **SVD Properties**\n", 1164 | " * It is **always** possible to decompose a **real** matrix $A$ to $A = U\\Sigma V^T$ where\n", 1165 | " * $U, \\Sigma, V$ are **uniuqe**\n", 1166 | " * $U, V$ are column **orthonormal**\n", 1167 | " * $U^T U = I, V^T V = I$\n", 1168 | " * $\\Sigma$ is **diagonal**\n", 1169 | " * Entries (the singular values) are positive and **sorted** in decreasing order ($\\sigma_1 \\geq \\sigma_2 \\geq ... 
\\geq 0$)\n", 1170 | " * Proof of uniqueness" 1171 | ] 1172 | }, 1173 | { 1174 | "cell_type": "markdown", 1175 | "metadata": { 1176 | "slideshow": { 1177 | "slide_type": "subslide" 1178 | } 1179 | }, 1180 | "source": [ 1181 | "\n", 1182 | "\n", 1183 | "* Image Source" 1184 | ] 1185 | }, 1186 | { 1187 | "cell_type": "markdown", 1188 | "metadata": { 1189 | "slideshow": { 1190 | "slide_type": "slide" 1191 | } 1192 | }, 1193 | "source": [ 1194 | "### SVD Example - Users-to-Movies\n", 1195 | "---\n", 1196 | "We are given a dataset of user's rating (1 to 5) for several movies of 3 genres (concepts) and we wish to use SVD to decompose to the following components:\n", 1197 | "* User-to-Concept - which genres the users prefer: $U$ matrix\n", 1198 | "* Concepts - what is the strength of each genre in the dataset: $\\Sigma$ - strength of each concept (the singular values)\n", 1199 | "* Movie-to-Concept - for each movie, what genres are the most dominant: $V$ matrix" 1200 | ] 1201 | }, 1202 | { 1203 | "cell_type": "code", 1204 | "execution_count": 17, 1205 | "metadata": { 1206 | "slideshow": { 1207 | "slide_type": "subslide" 1208 | } 1209 | }, 1210 | "outputs": [ 1211 | { 1212 | "name": "stdout", 1213 | "output_type": "stream", 1214 | "text": [ 1215 | "User-to-Movies matrix:\n" 1216 | ] 1217 | }, 1218 | { 1219 | "data": { 1220 | "text/html": [ 1221 | "
\n", 1222 | "\n", 1235 | "\n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | "
MatrixAlienSerenityCasablancaAmelie
User 111100
User 233300
User 344400
User 455500
User 502044
User 600055
User 701022
\n", 1305 | "
" 1306 | ], 1307 | "text/plain": [ 1308 | " Matrix Alien Serenity Casablanca Amelie\n", 1309 | "User 1 1 1 1 0 0\n", 1310 | "User 2 3 3 3 0 0\n", 1311 | "User 3 4 4 4 0 0\n", 1312 | "User 4 5 5 5 0 0\n", 1313 | "User 5 0 2 0 4 4\n", 1314 | "User 6 0 0 0 5 5\n", 1315 | "User 7 0 1 0 2 2" 1316 | ] 1317 | }, 1318 | "execution_count": 17, 1319 | "metadata": {}, 1320 | "output_type": "execute_result" 1321 | } 1322 | ], 1323 | "source": [ 1324 | "# load the dataset and create a pandas DataFrame\n", 1325 | "u_t_m = np.array([[1,1,1,0,0], [3,3,3,0,0], [4,4,4,0,0], [5,5,5,0,0], [0,2,0,4,4], [0,0,0,5,5], [0,1,0,2,2]])\n", 1326 | "print(\"User-to-Movies matrix:\")\n", 1327 | "# print(u_t_m)\n", 1328 | "u_t_m_df = pd.DataFrame(u_t_m, columns=['Matrix', 'Alien', 'Serenity', 'Casablanca', 'Amelie'],\n", 1329 | " index=['User 1', 'User 2','User 3', 'User 4', 'User 5', 'User 6', 'User 7'])\n", 1330 | "u_t_m_df" 1331 | ] 1332 | }, 1333 | { 1334 | "cell_type": "code", 1335 | "execution_count": 18, 1336 | "metadata": { 1337 | "slideshow": { 1338 | "slide_type": "subslide" 1339 | } 1340 | }, 1341 | "outputs": [], 1342 | "source": [ 1343 | "# perform SVD for 3 concepts\n", 1344 | "u, s, vh = np.linalg.svd(u_t_m, full_matrices=False)" 1345 | ] 1346 | }, 1347 | { 1348 | "cell_type": "code", 1349 | "execution_count": 19, 1350 | "metadata": { 1351 | "slideshow": { 1352 | "slide_type": "subslide" 1353 | } 1354 | }, 1355 | "outputs": [ 1356 | { 1357 | "name": "stdout", 1358 | "output_type": "stream", 1359 | "text": [ 1360 | "U of size (7, 3) :\n", 1361 | "[[-0.1376 0.0236 0.01081]\n", 1362 | " [-0.4128 0.07086 0.03244]\n", 1363 | " [-0.5503 0.0944 0.04324]\n", 1364 | " [-0.688 0.11804 0.05405]\n", 1365 | " [-0.1528 -0.5913 -0.654 ]\n", 1366 | " [-0.0722 -0.7314 0.678 ]\n", 1367 | " [-0.0764 -0.2957 -0.327 ]]\n" 1368 | ] 1369 | } 1370 | ], 1371 | "source": [ 1372 | "print(\"U of size\", u[:,:3].shape, \":\")\n", 1373 | "print(u[:,:3].astype(np.float16))" 1374 | ] 1375 | }, 1376 | { 1377 | "cell_type": "code", 1378 | "execution_count": 21, 1379 | "metadata": { 1380 | "slideshow": { 1381 | "slide_type": "subslide" 1382 | } 1383 | }, 1384 | "outputs": [ 1385 | { 1386 | "name": "stdout", 1387 | "output_type": "stream", 1388 | "text": [ 1389 | "Singular values:\n", 1390 | "as a matrix:\n", 1391 | "[[12.484 0. 0. ]\n", 1392 | " [ 0. 9.51 0. ]\n", 1393 | " [ 0. 0. 1.346]]\n" 1394 | ] 1395 | } 1396 | ], 1397 | "source": [ 1398 | "print(\"Singular values:\")\n", 1399 | "print(\"as a matrix:\")\n", 1400 | "print(np.diag(s[:3]).astype(np.float16))" 1401 | ] 1402 | }, 1403 | { 1404 | "cell_type": "code", 1405 | "execution_count": 22, 1406 | "metadata": { 1407 | "slideshow": { 1408 | "slide_type": "subslide" 1409 | } 1410 | }, 1411 | "outputs": [ 1412 | { 1413 | "name": "stdout", 1414 | "output_type": "stream", 1415 | "text": [ 1416 | "V of size (3, 5) :\n", 1417 | "[[-0.5625 -0.593 -0.5625 -0.09015 -0.09015]\n", 1418 | " [ 0.1266 -0.02878 0.1266 -0.6953 -0.6953 ]\n", 1419 | " [ 0.4097 -0.8047 0.4097 0.09125 0.09125]]\n" 1420 | ] 1421 | } 1422 | ], 1423 | "source": [ 1424 | "print(\"V of size\", vh[:3,:].shape, \":\")\n", 1425 | "print(vh[:3,:].astype(np.float16))" 1426 | ] 1427 | }, 1428 | { 1429 | "cell_type": "code", 1430 | "execution_count": 23, 1431 | "metadata": {}, 1432 | "outputs": [ 1433 | { 1434 | "name": "stdout", 1435 | "output_type": "stream", 1436 | "text": [ 1437 | "reconstruction of user-to-movie:\n" 1438 | ] 1439 | }, 1440 | { 1441 | "data": { 1442 | "text/html": [ 1443 | "
\n", 1444 | "\n", 1457 | "\n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | "
MatrixAlienSerenityCasablancaAmelie
User 11.01.01.00.00.0
User 23.03.03.0-0.0-0.0
User 34.04.04.00.0-0.0
User 45.05.05.0-0.0-0.0
User 50.02.0-0.04.04.0
User 60.00.0-0.05.05.0
User 70.01.0-0.02.02.0
\n", 1527 | "
" 1528 | ], 1529 | "text/plain": [ 1530 | " Matrix Alien Serenity Casablanca Amelie\n", 1531 | "User 1 1.0 1.0 1.0 0.0 0.0\n", 1532 | "User 2 3.0 3.0 3.0 -0.0 -0.0\n", 1533 | "User 3 4.0 4.0 4.0 0.0 -0.0\n", 1534 | "User 4 5.0 5.0 5.0 -0.0 -0.0\n", 1535 | "User 5 0.0 2.0 -0.0 4.0 4.0\n", 1536 | "User 6 0.0 0.0 -0.0 5.0 5.0\n", 1537 | "User 7 0.0 1.0 -0.0 2.0 2.0" 1538 | ] 1539 | }, 1540 | "execution_count": 23, 1541 | "metadata": {}, 1542 | "output_type": "execute_result" 1543 | } 1544 | ], 1545 | "source": [ 1546 | "# reconstruct the user-to-movie matrix\n", 1547 | "A_aprox = u[:,:3] @ np.diag(s[:3]) @ vh[:3,:]\n", 1548 | "A_aprox_df = pd.DataFrame(A_aprox.astype(np.float16), columns=['Matrix', 'Alien', 'Serenity', 'Casablanca', 'Amelie'],\n", 1549 | " index=['User 1', 'User 2','User 3', 'User 4', 'User 5', 'User 6', 'User 7'])\n", 1550 | "print(\"reconstruction of user-to-movie:\")\n", 1551 | "A_aprox_df" 1552 | ] 1553 | }, 1554 | { 1555 | "cell_type": "markdown", 1556 | "metadata": { 1557 | "slideshow": { 1558 | "slide_type": "subslide" 1559 | } 1560 | }, 1561 | "source": [ 1562 | "" 1563 | ] 1564 | }, 1565 | { 1566 | "cell_type": "markdown", 1567 | "metadata": { 1568 | "slideshow": { 1569 | "slide_type": "slide" 1570 | } 1571 | }, 1572 | "source": [ 1573 | "### Recommended Videos\n", 1574 | "---\n", 1575 | "#### Warning!\n", 1576 | "* These videos do not replace the lectures and tutorials.\n", 1577 | "* Please use these to get a better understanding of the material, and not as an alternative to the written material.\n", 1578 | "\n", 1579 | "#### Video By Subject\n", 1580 | "* Basic Linear Algebra - Mathematics for Machine Learning full Course || Linear Algebra || Part-1\n", 1581 | "* SVD - Lecture 47 — Singular Value Decomposition | Stanford University\n" 1582 | ] 1583 | }, 1584 | { 1585 | "cell_type": "markdown", 1586 | "metadata": { 1587 | "slideshow": { 1588 | "slide_type": "skip" 1589 | } 1590 | }, 1591 | "source": [ 1592 | "## Credits\n", 1593 | "---\n", 1594 | "* Inspired by slides by Elad Osherov and slides from MMDS\n", 1595 | "* Icons from Icon8.com - https://icons8.com\n", 1596 | "* Datasets from Kaggle - https://www.kaggle.com/" 1597 | ] 1598 | } 1599 | ], 1600 | "metadata": { 1601 | "kernelspec": { 1602 | "display_name": "Python 3", 1603 | "language": "python", 1604 | "name": "python3" 1605 | }, 1606 | "language_info": { 1607 | "codemirror_mode": { 1608 | "name": "ipython", 1609 | "version": 3 1610 | }, 1611 | "file_extension": ".py", 1612 | "mimetype": "text/x-python", 1613 | "name": "python", 1614 | "nbconvert_exporter": "python", 1615 | "pygments_lexer": "ipython3", 1616 | "version": "3.6.9" 1617 | } 1618 | }, 1619 | "nbformat": 4, 1620 | "nbformat_minor": 2 1621 | } 1622 | -------------------------------------------------------------------------------- /cs236756_tutorial_11_boosting_bagging.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# CS 236756 - Technion - Intro to Machine Learning\n", 12 | "---\n", 13 | "#### Tal Daniel\n", 14 | "\n", 15 | "## Tutorial 11 - Boosting & Bagging\n", 16 | "---\n", 17 | "\n", 18 | "\n", 19 | "\n", 20 | "Image Source" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": { 26 | "slideshow": { 27 | "slide_type": "slide" 28 | } 29 | }, 30 | "source": [ 31 | "### Agenda\n", 32 | "---\n", 33 | "* [Ensemble 
Learning](#-Ensemble-Learning)\n", 34 | " * [Voting Classifiers](#-Voting-Classifiers)\n", 35 | "* [Bagging (& Pasting)](#-Bagging-(&-Pasting))\n", 36 | " * [Bootstrap](#Bootstrap)\n", 37 | "* [Boosting](#-Boosting)\n", 38 | " * [AdaBoost](#-AdaBoost)\n", 39 | "* [Recommended Videos](#-Recommended-Videos)\n", 40 | "* [Credits](#-Credits)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 15, 46 | "metadata": { 47 | "slideshow": { 48 | "slide_type": "skip" 49 | } 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "# imports for the tutorial\n", 54 | "import numpy as np\n", 55 | "import pandas as pd\n", 56 | "import matplotlib.pyplot as plt\n", 57 | "%matplotlib notebook\n", 58 | "\n", 59 | "from sklearn.metrics import accuracy_score\n", 60 | "from sklearn.ensemble import RandomForestClassifier\n", 61 | "from sklearn.ensemble import VotingClassifier\n", 62 | "from sklearn.ensemble import BaggingClassifier\n", 63 | "from sklearn.ensemble import AdaBoostClassifier\n", 64 | "from sklearn.tree import DecisionTreeClassifier\n", 65 | "from sklearn.tree import DecisionTreeClassifier\n", 66 | "from sklearn.linear_model import LogisticRegression\n", 67 | "from sklearn.svm import SVC\n", 68 | "from sklearn.preprocessing import StandardScaler\n", 69 | "\n", 70 | "import warnings\n", 71 | "warnings.filterwarnings(\"ignore\", category=DeprecationWarning) " 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": { 77 | "slideshow": { 78 | "slide_type": "slide" 79 | } 80 | }, 81 | "source": [ 82 | "## Ensemble Learning\n", 83 | "---\n", 84 | "* **Wisdom of the Crowd** - assembling the predictions of a group of predictors (such as classifiers or regressors) often results in a better prediction than with the best individual predictor.\n", 85 | "* **Ensemble** - a group of predictors. An *Ensemble Learning* algorithm is called an **Ensemble method**.\n", 86 | " * For example: **Random Forest** -train a group of Decision Tree classifiers, each is trained on a random subset of the training set. To make predicitons, we obtain the predicitons of all individual trees, and then predict the class that gets the most votes. This is one of the most powerful ML algorithms available today." 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": { 92 | "slideshow": { 93 | "slide_type": "slide" 94 | } 95 | }, 96 | "source": [ 97 | "### Voting Classifiers\n", 98 | "---\n", 99 | "* **Hard Voting Classifier** - aggregate the predictions of each classifier and predict the class that gets the most votes.\n", 100 | " * In fact, even if each classifier is a *weak learner* (it does only slightly better than random guessing), the ensemble can still be a *strong learner* (achieving high accuracy), provided there are a sufficient number of weak learners and they are sufficiently diverse.\n", 101 | " * **The Law of Large Numbers** - how can the above fact be explained? 
building an ensemble containing 1,000 classifiers that are individually correct only 51% of the time (slighly better than random guessing) and predict the majority voted class, it is possible to reach 75% accuracy if all the classifiers are perfectly independent (which is not really the case since they are trained on the same data).\n", 102 | " * One way to get diverse classifiers is to train them using very different algorithms (increases the chance that they will make very different types of erros and thus improving the ensemble's accuracy).\n", 103 | "* **Soft Voting Classifier** - if all the classifiers are able to estimate class probabilities, then the class probability can be averaged over all the individual classifiers.\n", 104 | " * It often achieves higher performance than *hard voting* because it gives more weight to highly confident votes. " 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 2, 110 | "metadata": { 111 | "slideshow": { 112 | "slide_type": "subslide" 113 | } 114 | }, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "total samples: 569\n", 121 | "total positive sampels (M): 212, total negative samples (B): 357\n" 122 | ] 123 | }, 124 | { 125 | "data": { 126 | "text/html": [ 127 | "
\n", 128 | "\n", 141 | "\n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | "
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_mean...texture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worstUnnamed: 32
237883263M20.4821.46132.501306.00.083550.083480.090420.060220...26.17161.701750.00.122800.231100.315800.144500.22380.07127NaN
159871149B10.9012.9668.69366.80.075150.037180.003090.006588...18.2078.07470.00.117100.082940.018540.039530.27380.07685NaN
44290944601B13.7815.7988.37585.90.088170.067180.010550.009937...17.5097.90706.60.107200.107100.035170.033120.18590.06810NaN
2838912280M16.2418.77108.80805.10.106600.180200.194800.090520...25.09126.901031.00.136500.470600.502600.173200.27700.10630NaN
477911673B13.9016.6288.97599.40.068280.053190.022240.013390...21.80101.20718.90.093840.200600.138400.062220.26790.07698NaN
45857010M18.6517.60123.701076.00.109900.168600.197400.100900...21.32150.601567.00.167900.509000.734500.237800.37990.09185NaN
127866203M19.0018.91123.401138.00.082170.080280.092710.056270...25.73148.201538.00.102100.226400.320700.121800.28410.06541NaN
561925311B11.2029.3770.67386.00.074490.035580.000000.000000...38.3075.19439.60.092670.054940.000000.000000.15660.05905NaN
120865137B11.4110.8273.34403.30.093730.066850.035120.026230...15.9783.74510.50.154800.239000.210200.089580.30160.08523NaN
4449110127M18.0316.85117.50990.00.089470.123200.109000.062540...22.02133.301292.00.126300.266600.429000.153500.28420.08225NaN
\n", 411 | "

10 rows × 33 columns

\n", 412 | "
" 413 | ], 414 | "text/plain": [ 415 | " id diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n", 416 | "237 883263 M 20.48 21.46 132.50 1306.0 \n", 417 | "159 871149 B 10.90 12.96 68.69 366.8 \n", 418 | "442 90944601 B 13.78 15.79 88.37 585.9 \n", 419 | "283 8912280 M 16.24 18.77 108.80 805.1 \n", 420 | "477 911673 B 13.90 16.62 88.97 599.4 \n", 421 | "45 857010 M 18.65 17.60 123.70 1076.0 \n", 422 | "127 866203 M 19.00 18.91 123.40 1138.0 \n", 423 | "561 925311 B 11.20 29.37 70.67 386.0 \n", 424 | "120 865137 B 11.41 10.82 73.34 403.3 \n", 425 | "444 9110127 M 18.03 16.85 117.50 990.0 \n", 426 | "\n", 427 | " smoothness_mean compactness_mean concavity_mean concave points_mean \\\n", 428 | "237 0.08355 0.08348 0.09042 0.060220 \n", 429 | "159 0.07515 0.03718 0.00309 0.006588 \n", 430 | "442 0.08817 0.06718 0.01055 0.009937 \n", 431 | "283 0.10660 0.18020 0.19480 0.090520 \n", 432 | "477 0.06828 0.05319 0.02224 0.013390 \n", 433 | "45 0.10990 0.16860 0.19740 0.100900 \n", 434 | "127 0.08217 0.08028 0.09271 0.056270 \n", 435 | "561 0.07449 0.03558 0.00000 0.000000 \n", 436 | "120 0.09373 0.06685 0.03512 0.026230 \n", 437 | "444 0.08947 0.12320 0.10900 0.062540 \n", 438 | "\n", 439 | " ... texture_worst perimeter_worst area_worst smoothness_worst \\\n", 440 | "237 ... 26.17 161.70 1750.0 0.12280 \n", 441 | "159 ... 18.20 78.07 470.0 0.11710 \n", 442 | "442 ... 17.50 97.90 706.6 0.10720 \n", 443 | "283 ... 25.09 126.90 1031.0 0.13650 \n", 444 | "477 ... 21.80 101.20 718.9 0.09384 \n", 445 | "45 ... 21.32 150.60 1567.0 0.16790 \n", 446 | "127 ... 25.73 148.20 1538.0 0.10210 \n", 447 | "561 ... 38.30 75.19 439.6 0.09267 \n", 448 | "120 ... 15.97 83.74 510.5 0.15480 \n", 449 | "444 ... 22.02 133.30 1292.0 0.12630 \n", 450 | "\n", 451 | " compactness_worst concavity_worst concave points_worst symmetry_worst \\\n", 452 | "237 0.23110 0.31580 0.14450 0.2238 \n", 453 | "159 0.08294 0.01854 0.03953 0.2738 \n", 454 | "442 0.10710 0.03517 0.03312 0.1859 \n", 455 | "283 0.47060 0.50260 0.17320 0.2770 \n", 456 | "477 0.20060 0.13840 0.06222 0.2679 \n", 457 | "45 0.50900 0.73450 0.23780 0.3799 \n", 458 | "127 0.22640 0.32070 0.12180 0.2841 \n", 459 | "561 0.05494 0.00000 0.00000 0.1566 \n", 460 | "120 0.23900 0.21020 0.08958 0.3016 \n", 461 | "444 0.26660 0.42900 0.15350 0.2842 \n", 462 | "\n", 463 | " fractal_dimension_worst Unnamed: 32 \n", 464 | "237 0.07127 NaN \n", 465 | "159 0.07685 NaN \n", 466 | "442 0.06810 NaN \n", 467 | "283 0.10630 NaN \n", 468 | "477 0.07698 NaN \n", 469 | "45 0.09185 NaN \n", 470 | "127 0.06541 NaN \n", 471 | "561 0.05905 NaN \n", 472 | "120 0.08523 NaN \n", 473 | "444 0.08225 NaN \n", 474 | "\n", 475 | "[10 rows x 33 columns]" 476 | ] 477 | }, 478 | "execution_count": 2, 479 | "metadata": {}, 480 | "output_type": "execute_result" 481 | } 482 | ], 483 | "source": [ 484 | "# let's load the cancer dataset, shuffle it and speratre into train and test set\n", 485 | "dataset = pd.read_csv('./datasets/cancer_dataset.csv')\n", 486 | "# print the number of rows in the data set\n", 487 | "number_of_rows = len(dataset)\n", 488 | "print(\"total samples: {}\".format(number_of_rows))\n", 489 | "total_positive_samples = np.sum(dataset['diagnosis'].values == 'M')\n", 490 | "print(\"total positive sampels (M): {}, total negative samples (B): {}\".format(total_positive_samples, number_of_rows - total_positive_samples))\n", 491 | "num_train = int(0.8 * number_of_rows)\n", 492 | "# reminder, the data looks like this\n", 493 | "# dataset.head(10) # the dataset is ordered by the 
diagnosis\n", 494 | "dataset.sample(10)" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 3, 500 | "metadata": { 501 | "slideshow": { 502 | "slide_type": "subslide" 503 | } 504 | }, 505 | "outputs": [], 506 | "source": [ 507 | "# prepare the dataset\n", 508 | "# we will take the first 2 features as our data (X) and the diagnosis as labels (y)\n", 509 | "x = dataset[['radius_mean', 'texture_mean', 'concavity_mean']].values\n", 510 | "y = dataset['diagnosis'].values == 'M' # 1 for Malignat, 0 for Benign\n", 511 | "# shuffle\n", 512 | "rand_gen = np.random.RandomState(0)\n", 513 | "shuffled_indices = rand_gen.permutation(np.arange(len(x)))" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 4, 519 | "metadata": { 520 | "slideshow": { 521 | "slide_type": "subslide" 522 | } 523 | }, 524 | "outputs": [], 525 | "source": [ 526 | "x_train = x[shuffled_indices[:num_train]]\n", 527 | "y_train = y[shuffled_indices[:num_train]]\n", 528 | "x_test = x[shuffled_indices[num_train:]]\n", 529 | "y_test = y[shuffled_indices[num_train:]]" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 5, 535 | "metadata": { 536 | "slideshow": { 537 | "slide_type": "subslide" 538 | } 539 | }, 540 | "outputs": [ 541 | { 542 | "name": "stdout", 543 | "output_type": "stream", 544 | "text": [ 545 | "total training samples: 455, total test samples: 114\n" 546 | ] 547 | } 548 | ], 549 | "source": [ 550 | "# pre-process - standartization\n", 551 | "scaler = StandardScaler()\n", 552 | "scaler.fit(x_train)\n", 553 | "x_train = scaler.transform(x_train)\n", 554 | "x_test = scaler.transform(x_test)\n", 555 | "\n", 556 | "print(\"total training samples: {}, total test samples: {}\".format(num_train, number_of_rows - num_train))" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 9, 562 | "metadata": { 563 | "slideshow": { 564 | "slide_type": "subslide" 565 | } 566 | }, 567 | "outputs": [], 568 | "source": [ 569 | "# hard voting\n", 570 | "random_state = 38\n", 571 | "# create different classifiers\n", 572 | "log_clf = LogisticRegression(random_state=random_state, solver='lbfgs')\n", 573 | "rnd_clf = RandomForestClassifier(random_state=random_state, n_estimators=100)\n", 574 | "svm_clf = SVC(random_state=random_state)\n", 575 | "# create a voting classifier\n", 576 | "voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)], voting='hard')\n", 577 | "# voting_clf.fit(x_train, y_train)" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 10, 583 | "metadata": { 584 | "scrolled": false 585 | }, 586 | "outputs": [ 587 | { 588 | "name": "stdout", 589 | "output_type": "stream", 590 | "text": [ 591 | "LogisticRegression 0.9385964912280702\n", 592 | "RandomForestClassifier 0.9298245614035088\n", 593 | "SVC 0.9473684210526315\n", 594 | "VotingClassifier 0.9473684210526315\n" 595 | ] 596 | } 597 | ], 598 | "source": [ 599 | "# let's look at each classifier's accuracy on the test set\n", 600 | "for clf in (log_clf, rnd_clf, svm_clf, voting_clf):\n", 601 | " clf.fit(x_train, y_train)\n", 602 | " y_pred = clf.predict(x_test)\n", 603 | " print(clf.__class__.__name__, accuracy_score(y_test, y_pred))" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": { 609 | "slideshow": { 610 | "slide_type": "slide" 611 | } 612 | }, 613 | "source": [ 614 | "## Bagging (& Pasting)\n", 615 | "---\n", 616 | "* Another approach to get a diverse set of classifiers is to use the 
**same training algorithm** for every predictor, but to train them on **different random subsets of the training set**.\n", 617 | "* When sampling is performed **with replacement** this method is called **bagging** (which is a short for *bootstrap aggregating*).\n", 618 | " * In sampling **with replacement**, each sample unit of the population can occur one or more times in the sample.\n", 619 | " * In statistics, resampling with replacement is called *bootstrapping*.\n", 620 | "* When sampling is performed **without replacement** this method is called **pasting**.\n", 621 | "* Thus, both bagging and pasting allow training instances to be sampled several times across multiple predictors, but only bagging allows training instances to be sampled several times for the same predictor." 622 | ] 623 | }, 624 | { 625 | "cell_type": "markdown", 626 | "metadata": { 627 | "slideshow": { 628 | "slide_type": "subslide" 629 | } 630 | }, 631 | "source": [ 632 | "* Illustartion: \n", 633 | "\n", 634 | "Image Source" 635 | ] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "metadata": { 640 | "slideshow": { 641 | "slide_type": "subslide" 642 | } 643 | }, 644 | "source": [ 645 | "* Once all predictors are trained, the ensemble can make a prediction for a new instance by collecting all the predictions of all the predictors. It usually decided by *hard voting* or average for regression.\n", 646 | "* Each individual predictor has a higher bias than if it were trained on the original training set, but the aggregation **reduces both bias and variance**.\n", 647 | " * It is common to see that the ensemble has a **similar bias** but a **lower variance** than a single predictor trained on the original training set." 648 | ] 649 | }, 650 | { 651 | "cell_type": "markdown", 652 | "metadata": { 653 | "slideshow": { 654 | "slide_type": "subslide" 655 | } 656 | }, 657 | "source": [ 658 | "#### Bootstrap\n", 659 | "---\n", 660 | "* **Bootstrap Algorithm**:\n", 661 | " * Denote the original sample: $ L_N = (x_1, x_2, ..., x_N) $\n", 662 | " * Repeat $M$ times:\n", 663 | " * Generate a sample $L_k$ of size $k$ from $L_N$ by sampling *with replacement*.\n", 664 | " * Compute $h$ from $L_k$ (that is, train a predictor $h$ using $L_k$).\n", 665 | " * Denote the bootstrap values $H=(h^1, h^2, ..., h^M)$\n", 666 | " * Use these values for calculating all the quantities of interest." 667 | ] 668 | }, 669 | { 670 | "cell_type": "markdown", 671 | "metadata": { 672 | "slideshow": { 673 | "slide_type": "subslide" 674 | } 675 | }, 676 | "source": [ 677 | "* **Bagging**:\n", 678 | " * Train each model with a random training set (bootsrap).\n", 679 | " * Each model in the ensemble has an **equal weight** in the voting.\n", 680 | " * Finally: $$ H(x) = sign(h^1(x) +h^2(x) +... +h^M(x)) $$" 681 | ] 682 | }, 683 | { 684 | "cell_type": "markdown", 685 | "metadata": { 686 | "slideshow": { 687 | "slide_type": "subslide" 688 | } 689 | }, 690 | "source": [ 691 | "* One classifier can be wrong as long as the others are correct (*hard voting*) " 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": {}, 697 | "source": [ 698 | "* Since given equal weight, this may cause problems when there is overlap. 
" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 13, 704 | "metadata": { 705 | "slideshow": { 706 | "slide_type": "subslide" 707 | } 708 | }, 709 | "outputs": [ 710 | { 711 | "name": "stdout", 712 | "output_type": "stream", 713 | "text": [ 714 | "bagging accuracy: 0.939\n" 715 | ] 716 | } 717 | ], 718 | "source": [ 719 | "# bagging\n", 720 | "# note: BaggingClassifiers will automatically perform 'soft voting' instead of 'hard voting'\n", 721 | "# if the base classifier can estimate class probabilities (i.e. if it has a \"predict_proba()\" method).\n", 722 | "\n", 723 | "bag_clf = BaggingClassifier(\n", 724 | " DecisionTreeClassifier(),\n", 725 | " n_estimators=500,\n", 726 | " max_samples=100,\n", 727 | " bootstrap=True,\n", 728 | " n_jobs=1)\n", 729 | "bag_clf.fit(x_train, y_train)\n", 730 | "y_pred = bag_clf.predict(x_test)\n", 731 | "bag_acc = accuracy_score(y_test, y_pred)\n", 732 | "print(\"bagging accuracy: {:.3f}\".format(bag_acc))" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": 14, 738 | "metadata": { 739 | "scrolled": true, 740 | "slideshow": { 741 | "slide_type": "subslide" 742 | } 743 | }, 744 | "outputs": [ 745 | { 746 | "name": "stdout", 747 | "output_type": "stream", 748 | "text": [ 749 | "pasting accuracy: 0.930\n" 750 | ] 751 | } 752 | ], 753 | "source": [ 754 | "# pasting\n", 755 | "pas_clf = BaggingClassifier(\n", 756 | " DecisionTreeClassifier(),\n", 757 | " n_estimators=500,\n", 758 | " max_samples=100,\n", 759 | " bootstrap=False,\n", 760 | " n_jobs=1)\n", 761 | "pas_clf.fit(x_train, y_train)\n", 762 | "y_pred = pas_clf.predict(x_test)\n", 763 | "pas_acc = accuracy_score(y_test, y_pred)\n", 764 | "print(\"pasting accuracy: {:.3f}\".format(pas_acc))" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": { 770 | "slideshow": { 771 | "slide_type": "slide" 772 | } 773 | }, 774 | "source": [ 775 | "## Boosting\n", 776 | "---\n", 777 | "* **Boosting** (also *hypothesis boosting*) - any Ensemble method that can combine several weak learners into a strong learner. In boosting methods, predictors are trained **sequentially**, each trying to correct its predecessor.\n", 778 | " * Weak Learner - as before, the error rate is slighty better than flipping a coin\n", 779 | " * We also define:\n", 780 | " * $h$ is binary classifier such that $h \\in \\{-1, 1\\}$\n", 781 | " * Error rate $Err \\in [0,1]$\n", 782 | "* The principal difference between boosting and the committe methods is that in boosting, the base classifiers are **trained in sequence**." 783 | ] 784 | }, 785 | { 786 | "cell_type": "markdown", 787 | "metadata": { 788 | "slideshow": { 789 | "slide_type": "subslide" 790 | } 791 | }, 792 | "source": [ 793 | "* Each base classifier is trained using a **weighted form of the dataset**, in which the weight coefficient associated with each data point depends on the performance of the previous classifiers.\n", 794 | " * In particular, points that are misclassified by one of the base classifiers are given greater weight when used to train the next classifier in the sequence.\n", 795 | "* Once all the classifiers have been trained, their predictions are then combined through a **weighted majority voting** scheme." 
796 | ] 797 | }, 798 | { 799 | "cell_type": "markdown", 800 | "metadata": { 801 | "slideshow": { 802 | "slide_type": "subslide" 803 | } 804 | }, 805 | "source": [ 806 | "* Visually: " 807 | ] 808 | }, 809 | { 810 | "cell_type": "markdown", 811 | "metadata": { 812 | "slideshow": { 813 | "slide_type": "subslide" 814 | } 815 | }, 816 | "source": [ 817 | "\n", 818 | "\n", 819 | "* There are many boosting methods, but we will examine one of the most popular one called *AdaBoost*." 820 | ] 821 | }, 822 | { 823 | "cell_type": "markdown", 824 | "metadata": { 825 | "slideshow": { 826 | "slide_type": "slide" 827 | } 828 | }, 829 | "source": [ 830 | "### AdaBoost\n", 831 | "---\n", 832 | "* The idea of AdaBoost is to give more attention to training instances that the predecessor underfitted. This leads to a predictor that focuses more and more on the hard cases.\n", 833 | "* The sequential learning in Boosting seems similar to Gradient Descent, only in AdaBoost predictors are added to the ensemble in order to make it better where in GD, a single predictor's paramerters are optimized to minimize an objective function.\n", 834 | "* Once all predictors are trained, the ensemble makes predictions by assigning different weights to each predictor, depending on their **overall accuracy on the weighted training set**." 835 | ] 836 | }, 837 | { 838 | "cell_type": "markdown", 839 | "metadata": { 840 | "slideshow": { 841 | "slide_type": "subslide" 842 | } 843 | }, 844 | "source": [ 845 | "#### Definitions\n", 846 | "---\n", 847 | "* Class labels are $\\{-1, 1\\}$\n", 848 | "* $m$ - number of samples in the training dataset\n", 849 | "* The weighted error rate of the $t^{th}$ predictor: $$ \\epsilon_t =\\sum _{i=1}^m w^{(i)} \\cdot \\mathbb{1}(\\hat{y}_t^{(i)} \\neq y^{(i)})$$ In the more general case where the weights are not normalized to 1: $$ \\epsilon_t =\\frac{\\sum _{i=1}^m w^{(i)} \\cdot \\mathbb{1}(\\hat{y}_t^{(i)} \\neq y^{(i)})}{\\sum _{i=1}^m w^{(i)}} $$\n", 850 | " * $\\hat{y}_t^{(i)}$ is the $t^{th}$ predictor's prediction for the $i^{th}$ instance." 851 | ] 852 | }, 853 | { 854 | "cell_type": "markdown", 855 | "metadata": { 856 | "slideshow": { 857 | "slide_type": "subslide" 858 | } 859 | }, 860 | "source": [ 861 | "* The predictors weight of the $t^{th}$ predictor: $$ \\alpha_t = \\eta \\ln \\frac{1 - \\epsilon_t}{\\epsilon_t} $$\n", 862 | " * $\\eta$ it the learning rate hyperparameter, e.g. $\\frac{1}{2}$ or 1.\n", 863 | " * The more accurate the predictor is, the more weight the predictor will be given.\n", 864 | "* The update rule: for $i = 1,2, ..., m $ $$ w^{(i)} \\leftarrow \\begin{cases} w^{(i)}e^{-\\alpha_t} & \\quad \\text{if } \\hat{y}_t^{(i)} = y^{(i)} \\\\ w^{(i)}e^{\\alpha_t} & \\quad \\text{if } \\hat{y}_t^{(i)} \\neq y^{(i)} \\end{cases} = w^{(i)}e^{-\\alpha_t\\cdot y^{(i)} \\cdot \\hat{y}_t^{(i)}}$$\n", 865 | " * Once all the weights were calculated, they are summed. The sum is denoted $Z_t$. Then, all the weights are normalized by dividing each weight by $Z_t$.\n", 866 | "* **Stopping criteria**:\n", 867 | " * The desired number of predictors is reached.\n", 868 | " * A perfert predictor is found." 
869 | ] 870 | }, 871 | { 872 | "cell_type": "markdown", 873 | "metadata": { 874 | "slideshow": { 875 | "slide_type": "subslide" 876 | } 877 | }, 878 | "source": [ 879 | "\n", 880 | "\n", 881 | "Image Source" 882 | ] 883 | }, 884 | { 885 | "cell_type": "markdown", 886 | "metadata": { 887 | "slideshow": { 888 | "slide_type": "subslide" 889 | } 890 | }, 891 | "source": [ 892 | "* **The AdaBoost Algorithm**:\n", 893 | " * Initialize the data weights coefficients $\\{w^{(i)}\\}_{i=1}^m$: $$ w^{(i)} = \\frac{1}{m}, \\forall i= 1,2,...,m $$\n", 894 | " * For $t = 1,...,T$:\n", 895 | " * Fit a weak classifier $h_t(x)$ (which makes predictions $\\hat{y}_t$) to the weighted training data and calculate the weighted error rate: $$ \\epsilon_t =\\frac{\\sum _{i=1}^m w^{(i)} \\cdot \\mathbb{1}(\\hat{y}_t^{(i)} \\neq y^{(i)})}{\\sum _{i=1}^m w^{(i)}} $$\n", 896 | " * Choose $\\alpha_t$ (default $\\eta=\\frac{1}{2}$): $$ \\alpha_t = \\frac{1}{2} \\ln \\frac{1 - \\epsilon_t}{\\epsilon_t} $$\n", 897 | " * Update the weights: for $i = 1,2, ..., m $ $$ w^{(i)} \\leftarrow \\begin{cases} w^{(i)}e^{-\\alpha_t} & \\quad \\text{if } \\hat{y}_t^{(i)} = y^{(i)} \\\\ w^{(i)}e^{\\alpha_t} & \\quad \\text{if } \\hat{y}_t^{(i)} \\neq y^{(i)} \\end{cases} = w^{(i)}e^{-\\alpha_t\\cdot y^{(i)} \\cdot \\hat{y}_t^{(i)}}$$\n", 898 | " * Normalize the weights: for $i = 1,2, ..., m $ $$ w^{(i)} \\leftarrow \\frac{w^{(i)}}{Z_t} $$\n", 899 | " * $Z_t = \\sum_{i=1}^m w^{(i)}$\n", 900 | " * Use predictions using the final model, which is given by: $$ H(x) = sign(\\sum_{i=1}^T \\alpha_th_t(x)) $$" 901 | ] 902 | }, 903 | { 904 | "cell_type": "markdown", 905 | "metadata": { 906 | "slideshow": { 907 | "slide_type": "slide" 908 | } 909 | }, 910 | "source": [ 911 | "#### Exponential Loss\n", 912 | "---\n", 913 | "* So far, the loss functions we have seen:\n", 914 | " * 0-1 loss\n", 915 | " * Hinge loss\n", 916 | " * Log loss\n", 917 | "* Unlike previously learnt classifiers, AdaBoost minimzes the exponential loss.\n", 918 | "* All lossess upper bound the 0-1 loss and act as differentiable surrogate loss functions.\n", 919 | "* \n" 920 | ] 921 | }, 922 | { 923 | "cell_type": "markdown", 924 | "metadata": { 925 | "slideshow": { 926 | "slide_type": "subslide" 927 | } 928 | }, 929 | "source": [ 930 | "* Optimizing the exponential loss:\n", 931 | " * As shown in class, the training error is upper bounded by $H$: $$ \\frac{1}{m} \\sum_i^m \\mathbb{1}(H(x_i) \\neq y_i) \\leq \\prod_{t=1}^T Z_t $$\n", 932 | " * $Z_t = \\sum_i w_t^{(i)} e^{-\\alpha_t y_i h_t(x_i)} $\n", 933 | " * At each round we minimize $Z_t$ by:\n", 934 | " * Choosing the optimal $h_t$\n", 935 | " * Finding the optimal $\\alpha_t$\n", 936 | " * $$ \\frac{dZ}{d\\alpha} = -\\sum_{i=1}^m w^{(i)} y_ih(x_i) e^{-\\alpha y_ih(x_i)} = 0 $$ $$ -\\sum_{i:y_i=h(x_i)}w^{(i)} e^{-\\alpha} + \\sum_{i: y_i \\neq h(x_i)} w^{(i)} e^{\\alpha} = 0 $$ $$ -e^{-\\alpha} (1-\\epsilon) +e^{\\alpha} \\epsilon = 0 $$ $$ \\rightarrow \\alpha_t = \\frac{1}{2} \\ln \\frac{1 - \\epsilon_t}{\\epsilon_t} $$" 937 | ] 938 | }, 939 | { 940 | "cell_type": "markdown", 941 | "metadata": { 942 | "slideshow": { 943 | "slide_type": "slide" 944 | } 945 | }, 946 | "source": [ 947 | "### Boosting (AdaBoost) Example By Hand\n", 948 | "---\n", 949 | "Moses is a student who wants to avoid hard courses. 
\n", 950 | "\n", 951 | "In order to achieve this he wants to build a classifier that classifies courses as \"easy\" or \"hard\".\n", 952 | "\n", 953 | "He decides to classify courses' hardness by using AdaBoost with decision trees stumps (decision trees with max depth of 1) on the following data:" 954 | ] 955 | }, 956 | { 957 | "cell_type": "markdown", 958 | "metadata": { 959 | "slideshow": { 960 | "slide_type": "subslide" 961 | } 962 | }, 963 | "source": [ 964 | "|
Course ID
|
Hard
|
Final Exam
|
Theoretical
|
Midterm
|
236*
|
Number of HW
\n", 965 | "| --- | --- | --- | --- | --- | --- | --- |\n", 966 | "|
1
|
Y
|
Y
|
N
|
Y
|
N
|
5
|\n", 967 | "|
2
|
Y
|
N
|
Y
|
Y
|
N
|
5
|\n", 968 | "|
3
|
Y
|
N
|
Y
|
N
|
Y
|
1
|\n", 969 | "|
4
|
Y
|
N
|
Y
|
N
|
N
|
3
|\n", 970 | "|
5
|
Y
|
N
|
Y
|
N
|
N
|
5
|\n", 971 | "|
6
|
Y
|
Y
|
N
|
Y
|
N
|
5
|\n", 972 | "|
7
|
Y
|
Y
|
N
|
Y
|
N
|
5
|\n", 973 | "|
8
|
N
|
N
|
N
|
Y
|
Y
|
1
|\n", 974 | "|
9
|
N
|
N
|
Y
|
N
|
N
|
1
|\n", 975 | "|
10
|
Y
|
N
|
N
|
N
|
N
|
5
|" 976 | ] 977 | }, 978 | { 979 | "cell_type": "markdown", 980 | "metadata": { 981 | "slideshow": { 982 | "slide_type": "subslide" 983 | } 984 | }, 985 | "source": [ 986 | "As a first step, he first determined for each possible classifier (including the trivial constant classifier), which of the data points were misclassfied.\n", 987 | "\n", 988 | "For example, for the first classifier which classfies courses as hard if they have a final exam, the classifier is wrong on samples 2,3,4 and 5." 989 | ] 990 | }, 991 | { 992 | "cell_type": "markdown", 993 | "metadata": { 994 | "slideshow": { 995 | "slide_type": "subslide" 996 | } 997 | }, 998 | "source": [ 999 | "|
Classifier
|
Test
|
Value
|
Misclassified
|\n", 1000 | "| --- | --- | --- | --- |\n", 1001 | "|
A
|
Final Exam
|
Y
|
2,3,4,5
|\n", 1002 | "|
B
|
Theoretical
|
Y
|
1,6,7,9
|\n", 1003 | "|
C
|
Midterm
|
Y
|
3,4,5,8
|\n", 1004 | "|
D
|
Undergrduate
|
Y
|
1,2,4,5,6,7,8
|\n", 1005 | "|
E
|
# HW > 2
|
Y
|
3,10
|\n", 1006 | "|
F
|
# HW > 4
|
Y
|
3,4,10
|\n", 1007 | "|
G
|
True (const)
|
|
8,9,10
|\n", 1008 | "|
H
|
Final Exam
|
N
|
1,6,7,8,9,10
|\n", 1009 | "|
I
|
Theoretical
|
N
|
2,3,4,5,8,10
|\n", 1010 | "|
J
|
Midterm
|
N
|
1,2,6,7,9,10
|\n", 1011 | "|
K
|
Undergraduate
|
N
|
3,9,10
|\n", 1012 | "|
L
|
# HW < 2
|
Y
|
1,2,4,5,6,7,8,9
|\n", 1013 | "|
M
|
# HW < 4
|
Y
|
1,2,5,6,7,8,9
|\n", 1014 | "|
N
|
False (const)
|
|
1,2,3,4,5,6,7
|" 1015 | ] 1016 | }, 1017 | { 1018 | "cell_type": "markdown", 1019 | "metadata": { 1020 | "slideshow": { 1021 | "slide_type": "subslide" 1022 | } 1023 | }, 1024 | "source": [ 1025 | "#### Consider only useful classifiers\n", 1026 | "Only 6 classifiers from the table above would ever be used because the other 8 make all the same error as one of the other classifiers and then make additional erros. For example, classifiers I and N do the same mistakes as A and add to that. The 6 useful classifiers are:\n", 1027 | "\n", 1028 | "\n", 1029 | "|
Classifier
|
Test
|
Value
|
Misclassified
|\n", 1030 | "| --- | --- | --- | --- |\n", 1031 | "|
A
|
Final Exam
|
Y
|
2,3,4,5
|\n", 1032 | "|
B
|
Theoretical
|
Y
|
1,6,7,9
|\n", 1033 | "|
C
|
Midterm
|
Y
|
3,4,5,8
|\n", 1034 | "|
D
|
Undergrduate
|
Y
|
1,2,4,5,6,7,8
|\n", 1035 | "|
E
|
# HW > 2
|
Y
|
3,10
|\n", 1036 | "|
G
|
True (const)
|
|
8,9,10
|" 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": "markdown", 1041 | "metadata": { 1042 | "slideshow": { 1043 | "slide_type": "subslide" 1044 | } 1045 | }, 1046 | "source": [ 1047 | "#### AdaBoost\n", 1048 | "* We will now perform AdaBoost by calculating the weights at each iteration.\n", 1049 | "* We will calculate the 10 weights, the classification $h$, the error and $\\alpha$.\n", 1050 | "* If there is a tie, we break it by choosing the classifier that is higher on the list (lexicographical order)\n", 1051 | "* Note: in this example we assume that the weights of the data points do not affect the clasification and are just meant to calculate the final weight of each classifier." 1052 | ] 1053 | }, 1054 | { 1055 | "cell_type": "markdown", 1056 | "metadata": { 1057 | "slideshow": { 1058 | "slide_type": "subslide" 1059 | } 1060 | }, 1061 | "source": [ 1062 | "#### Round 1\n", 1063 | "* Each weight is given the same value: $\\frac{1}{m} = \\frac{1}{10}$\n", 1064 | "* Since classifier $E$ is the most accurate, it will serve as the classifier.\n", 1065 | "* The weight error rate of classifier $E$ is $\\epsilon_E = \\frac{2}{10}$\n", 1066 | "* Thus: $\\alpha_E = \\frac{1}{2}\\ln \\frac{1 - \\epsilon_E}{\\epsilon_E} = \\frac{1}{2} \\ln (4)$" 1067 | ] 1068 | }, 1069 | { 1070 | "cell_type": "markdown", 1071 | "metadata": { 1072 | "slideshow": { 1073 | "slide_type": "subslide" 1074 | } 1075 | }, 1076 | "source": [ 1077 | "|
Parameters        
|
Round 1
|
Round 2
|
Round 3
|\n", 1078 | "| ----- | --- | --- | --- |\n", 1079 | "|
w1
|
$\\frac{1}{10}$
|
|
|\n", 1080 | "|
w2
|
$\\frac{1}{10}$
|
|
|\n", 1081 | "|
w3
|
$\\frac{1}{10}$
|
|
|\n", 1082 | "|
w4
|
$\\frac{1}{10}$
|
|
|\n", 1083 | "|
w5
|
$\\frac{1}{10}$
|
|
|\n", 1084 | "|
w6
|
$\\frac{1}{10}$
|
|
|\n", 1085 | "|
w7
|
$\\frac{1}{10}$
|
|
|\n", 1086 | "|
w8
|
$\\frac{1}{10}$
|
|
|\n", 1087 | "|
w9
|
$\\frac{1}{10}$
|
|
|\n", 1088 | "|
w10
|
$\\frac{1}{10}$
|
|
|\n", 1089 | "|
$h$
|
$E$
|
|
|\n", 1090 | "|
Err - $\\epsilon$
|
$\\frac{2}{10}$
|
|
|\n", 1091 | "|
$$\\alpha = \\frac{1}{2}\\ln \\frac{1 - \\epsilon}{\\epsilon} $$
|
$\\frac{1}{2} \\ln (4)$
|
|
|" 1092 | ] 1093 | }, 1094 | { 1095 | "cell_type": "markdown", 1096 | "metadata": {}, 1097 | "source": [ 1098 | "#### AdaBoost - calculating the new weights\n", 1099 | "* Recall that the un-normalized weights update: $$ \\tilde{w}_{t+1}^{(i)} = w_t^{(i)} e^{-\\alpha_ty_ih_t(x_i)} $$\n", 1100 | "* For the correctly classified data points (8 points): $$ \\tilde{w}_{t+1}^{(i)} = \\frac{1}{10}e^{-\\frac{1}{2}\\ln (4)} = \\frac{1}{10} \\cdot \\frac{1}{2} = \\frac{1}{20} $$\n", 1101 | "* For the incorrectly classified data points (2 points): $$ \\tilde{w}_{t+1}^{(i)} = \\frac{1}{10}e^{\\frac{1}{2}\\ln (4)} = \\frac{1}{10} \\cdot 2 = \\frac{1}{5} $$\n", 1102 | "* Calculate the normalization factor: $$ Z_t = 8 \\cdot \\frac{1}{20} + 2 \\cdot \\frac{1}{5} = \\frac{4}{5} $$\n", 1103 | "* The final weights after normalization:\n", 1104 | " * Correct: $w_{t+1}^{(i)} = \\frac{1}{20} \\cdot \\frac{5}{4} = \\frac{1}{16}$\n", 1105 | " * Incorrect: $w_{t+1}^{(i)} = \\frac{1}{5} \\cdot \\frac{5}{4} = \\frac{1}{4}$" 1106 | ] 1107 | }, 1108 | { 1109 | "cell_type": "markdown", 1110 | "metadata": { 1111 | "slideshow": { 1112 | "slide_type": "subslide" 1113 | } 1114 | }, 1115 | "source": [ 1116 | "Similarly, we fill in the rest of the table:\n", 1117 | "\n", 1118 | "\n", 1119 | "|
Parameters        
|
Round 1
|
Round 2
|
Round 3
|\n", 1120 | "| ----- | --- | --- | --- |\n", 1121 | "|
w1
|
$\\frac{1}{10}$
|
$\\frac{1}{16}$
|
$\\frac{3}{24}$
|\n", 1122 | "|
w2
|
$\\frac{1}{10}$
|
$\\frac{1}{16}$
|
$\\frac{1}{24}$
|\n", 1123 | "|
w3
|
$\\frac{1}{10}$
|
$\\frac{4}{16}$
|
$\\frac{4}{24}$
|\n", 1124 | "|
w4
|
$\\frac{1}{10}$
|
$\\frac{1}{16}$
|
$\\frac{1}{24}$
|\n", 1125 | "|
w5
|
$\\frac{1}{10}$
|
$\\frac{1}{16}$
|
$\\frac{1}{24}$
|\n", 1126 | "|
w6
|
$\\frac{1}{10}$
|
$\\frac{1}{16}$
|
$\\frac{3}{24}$
|\n", 1127 | "|
w7
|
$\\frac{1}{10}$
|
$\\frac{1}{16}$
|
$\\frac{3}{24}$
|\n", 1128 | "|
w8
|
$\\frac{1}{10}$
|
$\\frac{1}{16}$
|
$\\frac{1}{24}$
|\n", 1129 | "|
w9
|
$\\frac{1}{10}$
|
$\\frac{1}{16}$
|
$\\frac{3}{24}$
|\n", 1130 | "|
w10
|
$\\frac{1}{10}$
|
$\\frac{4}{16}$
|
$\\frac{4}{24}$
|\n", 1131 | "|
$h$
|
$E$
|
$B$
|
$A$
|\n", 1132 | "|
Err - $\\epsilon$
|
$\\frac{2}{10}$
|
$\\frac{1}{4}$
|
$\\frac{7}{24}$
|\n", 1133 | "|
$$\\alpha = \\frac{1}{2}\\ln \\frac{1 - \\epsilon}{\\epsilon} $$
|
$\\frac{1}{2} \\ln (4)$
|
$\\frac{1}{2} \\ln (3)$
|
$\\frac{1}{2} \\ln \\frac{17}{7}$
|" 1134 | ] 1135 | }, 1136 | { 1137 | "cell_type": "markdown", 1138 | "metadata": { 1139 | "slideshow": { 1140 | "slide_type": "subslide" 1141 | } 1142 | }, 1143 | "source": [ 1144 | "#### AdaBoost - Putting the classifiers together\n", 1145 | "* The final classifier for 3 rounds of Boosting: $$ H(x) = sign(\\frac{1}{2} \\ln (4) \\cdot h_E(x) + \\frac{1}{2} \\ln (3) \\cdot h_B(x) + \\frac{1}{2} \\ln \\frac{17}{7} \\cdot h_A(x)) $$\n", 1146 | " * $h_c(x)$ returns +1 or -1 for $c=E,B,A$\n", 1147 | "* The data points that the final classifier is correct about them:\n", 1148 | " * Since $\\alpha_E, \\alpha_B > \\alpha_A$ - it is just a *majority vote*\n", 1149 | " * Only one example (3) is misclassified" 1150 | ] 1151 | }, 1152 | { 1153 | "cell_type": "markdown", 1154 | "metadata": { 1155 | "slideshow": { 1156 | "slide_type": "slide" 1157 | } 1158 | }, 1159 | "source": [ 1160 | "### AdaBoost in Scikit-Learn\n", 1161 | "* Scikit-Learn uses a multiclass version of AdaBoost called *SAMME* (Stagewise Additive Modeling using a Multiclass Exponential loss function).\n", 1162 | " * When there are just 2 classes, SAMME is equivalent to AdaBoost.\n", 1163 | " * If the predictors can estimate class probabilities (i.e. they have a `predict_proba()` method), Scikit-Learn can use a variant of SAMME called *SAMMER* (R for \"Real\"), which relies on class probabilities rather than predictions and generally performs better.\n", 1164 | " \n", 1165 | "* The following code trains an AdaBoost classifier on 600 Decision Stumps.\n", 1166 | "* Note: if the AdaBoost classifier is **overfitting** the training set, a good regularization may be reducing the number of estimators or more strongly regularize the base classifier.\n", 1167 | "* An important drawback to sequential learning is that **it cannot be parallelized**, since each predictor can only be trained after the previous predictor has been trained and evaluated. Thus, it does not scale as well as bagging or pasting." 
1168 | ] 1169 | }, 1170 | { 1171 | "cell_type": "code", 1172 | "execution_count": 16, 1173 | "metadata": { 1174 | "slideshow": { 1175 | "slide_type": "subslide" 1176 | } 1177 | }, 1178 | "outputs": [ 1179 | { 1180 | "name": "stdout", 1181 | "output_type": "stream", 1182 | "text": [ 1183 | "adaboost accuracy: 0.930\n" 1184 | ] 1185 | } 1186 | ], 1187 | "source": [ 1188 | "# AdaBoost\n", 1189 | "ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=600, algorithm=\"SAMME.R\", learning_rate=0.5)\n", 1190 | "ada_clf.fit(x_train, y_train)\n", 1191 | "y_pred = ada_clf.predict(x_test)\n", 1192 | "ada_acc = accuracy_score(y_test, y_pred)\n", 1193 | "print(\"adaboost accuracy: {:.3f}\".format(ada_acc))" 1194 | ] 1195 | }, 1196 | { 1197 | "cell_type": "markdown", 1198 | "metadata": { 1199 | "slideshow": { 1200 | "slide_type": "slide" 1201 | } 1202 | }, 1203 | "source": [ 1204 | "### Recommended Videos\n", 1205 | "---\n", 1206 | "#### Warning!\n", 1207 | "* These videos do not replace the lectures and tutorials.\n", 1208 | "* Please use these to get a better understanding of the material, and not as an alternative to the written material.\n", 1209 | "\n", 1210 | "#### Video By Subject\n", 1211 | "\n", 1212 | "* Simple Ensemble, Mixture of Experts - Ensembles (1): Basics\n", 1213 | "* Bagging - Ensembles (2): Bagging\n", 1214 | "* Boosting, AdaBoost - Machine Learning Lecture 34 \"Boosting / Adaboost\" -Cornell CS4780\n", 1215 | " * MIT - 6.034 Artificial Intelligence - Learning: Boosting\n", 1216 | " * Ensembles (4): AdaBoost" 1217 | ] 1218 | }, 1219 | { 1220 | "cell_type": "markdown", 1221 | "metadata": { 1222 | "slideshow": { 1223 | "slide_type": "skip" 1224 | } 1225 | }, 1226 | "source": [ 1227 | "## Credits\n", 1228 | "---\n", 1229 | "* Icons from Icon8.com - https://icons8.com\n", 1230 | "* Datasets from Kaggle - https://www.kaggle.com/\n", 1231 | "* Examples and code snippets were taken from \"Hands-On Machine Learning with Scikit-Learn and TensorFlow\"" 1232 | ] 1233 | } 1234 | ], 1235 | "metadata": { 1236 | "kernelspec": { 1237 | "display_name": "Python 3", 1238 | "language": "python", 1239 | "name": "python3" 1240 | }, 1241 | "language_info": { 1242 | "codemirror_mode": { 1243 | "name": "ipython", 1244 | "version": 3 1245 | }, 1246 | "file_extension": ".py", 1247 | "mimetype": "text/x-python", 1248 | "name": "python", 1249 | "nbconvert_exporter": "python", 1250 | "pygments_lexer": "ipython3", 1251 | "version": "3.6.9" 1252 | } 1253 | }, 1254 | "nbformat": 4, 1255 | "nbformat_minor": 2 1256 | } 1257 | -------------------------------------------------------------------------------- /cs236756_tutorial_14_pac_vc_dimension.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# CS 236756 - Technion - Intro to Machine Learning\n", 12 | "---\n", 13 | "#### Tal Daniel\n", 14 | "\n", 15 | "\n", 16 | "## Tutorial 14 - PAC Learning & VC Dimension\n", 17 | "---\n", 18 | "\n", 19 | "" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": { 25 | "slideshow": { 26 | "slide_type": "slide" 27 | } 28 | }, 29 | "source": [ 30 | "### Agenda\n", 31 | "---\n", 32 | "* [The PAC (**P**robably **A**pproximately **C**orrect) Learning Framework](#-The-PAC-Learning-Framework)\n", 33 | " * [Empirical Risk Minimization (ERM)](#-Empirical-Risk-Minimization-(ERM))\n", 34 | " * [The Fundamental 
Theorem of Statistical Learning](#-The-Fundamental-Theorem-of-Statistical-Learning)\n", 35 | "* [The VC Dimension](#-VC-Dimension)\n", 36 | " * [Theory](#-VC-Dimension---Formal-Definition)\n", 37 | " * [Examples](#-VC-Dimension---Examples)\n", 38 | "* [Recommended Videos](#-Recommended-Videos)\n", 39 | "* [Credits](#-Credits)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "slideshow": { 46 | "slide_type": "slide" 47 | } 48 | }, 49 | "source": [ 50 | "## The PAC Learning Framework\n", 51 | "---\n", 52 | "PAC stands for \"probably approximately correct\", which is a framework and set of assumptions under which numerous results on learning theory were proven." 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": { 58 | "slideshow": { 59 | "slide_type": "slide" 60 | } 61 | }, 62 | "source": [ 63 | "### Classification Learning Problem\n", 64 | "---\n", 65 | "* The learner's *input*:\n", 66 | " * **Domain Set - $\\mathcal{X}$**: the set of objects we wish to label.\n", 67 | " * **Label Set - $\\mathcal{Y}$**: possible outcomes of an experiment.\n", 68 | " * **Training Data - $S=\\{(x^{(i)}, y^{(i)}); i=1,...,m\\}$**: a finite sequence of pairs in $\\mathcal{X} \\times \\mathcal{Y}$ \n", 69 | " * Drawn iid from some probability distribution $\\mathcal{D}$\n", 70 | "* The learner's *output*:\n", 71 | " * **Prediction Rule - hypothesis** - $h: \\mathcal{X} \\to \\mathcal{Y}$: a function that must predict a label for new domain points.\n", 72 | " * The function is also called: predictor, hypothesis or classifier.\n", 73 | "* Sample generating model\n", 74 | " * We assume the instances are generated by an **unknown** probability distribution over $\\mathcal{X}$ denoted $\\mathcal{D}$.\n", 75 | " * **i.i.d.**: each $x^{(i)}$ is sampled independently from $\\mathcal{D}$.\n", 76 | " * **Realizability**: we also assume: $\\exists f, f: \\mathcal{X} \\to \\mathcal{Y}$ such that $y^{(i)} = f(x^{(i)}), \\forall i$." 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": { 82 | "slideshow": { 83 | "slide_type": "subslide" 84 | } 85 | }, 86 | "source": [ 87 | "* Measures of success\n", 88 | " * **Training Error** (also called the **empirical risk** or **empirical error**): $$ \\hat{\\epsilon}(h) = \\hat{L}(h) = \\frac{1}{m} \\sum_{i=1}^m \\mathbb{1} \\{h(x^{(i)}) \\neq y^{(i)} \\}$$\n", 89 | " * **Classifier Error** (also called the **generalization error**, the **risk** or the **true error**): the error of $h$ is the probability to draw a random sample $(x, y) \\sim \\mathcal{D}$ such that $h(x) \\neq y$: $$ \\epsilon(h) = L(h) = P_{(x,y) \\sim \\mathcal{D}}(h(x) \\neq y)$$\n", 90 | " * This is the probability that, if we now draw a new example $(x,y)$ from $\\mathcal{D}$, $h$ will misclassify it.\n", 91 | " * We assume that the training data was drawn from the *same* distribution $\\mathcal{D}$ with which we are going to evaluate our hypothesis (the assumption of training and testing on the same distribution is part of the **PAC assumptions**)." 
92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": { 97 | "slideshow": { 98 | "slide_type": "slide" 99 | } 100 | }, 101 | "source": [ 102 | "#### Classifier Error Example\n", 103 | "---\n", 104 | "\n", 105 | "* Assume binary features of *papayas* (the fruit...)\n", 106 | "\n", 107 | "| Softness | Color | $Pr(x) $ (Probability)| $h(x)$ | $f(x)$ |\n", 108 | "|------|------|------|------|------|\n", 109 | "| Soft | Green | 0.1 | Tasty | Not-Tasty|\n", 110 | "| Hard | Green | 0.1 | Not-Tasty | Not-Tasty|\n", 111 | "| Soft | Orange | 0.7 | Tasty | Tasty|\n", 112 | "| Hard | Orange | 0.1 | Tasty | Not-Tasty|\n", 113 | "\n", 114 | "* $\\hat{L}(h) = \\hat{\\epsilon}(h) = 0.5$\n", 115 | "* $L(h) = \\epsilon(h) = 0.2$" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "slideshow": { 122 | "slide_type": "subslide" 123 | } 124 | }, 125 | "source": [ 126 | "* What is $L_D(h)$?\n", 127 | " * We can only approximate it with some probability.\n", 128 | "* Why can it only be **approximately** correct?\n", 129 | " * **Claim**: we can't hope to find $h \\in \\mathcal{H}, \\text{s.t. } L_{D,f}(h) = 0$\n", 130 | " * **Proof**:\n", 131 | " * For every $\\epsilon \\in (0,1)$ take $X = \\{x_1, x_2\\}, P(x_1) = 1 - \\epsilon, P(x_2) = \\epsilon$\n", 132 | " * The probability not to see $x_2$ at all among $m$ i.i.d. examples is $(1-\\epsilon)^m \\approx e^{-\\epsilon m}$\n", 133 | " * So, if $\\epsilon << \\frac{1}{m}$ we are likely not to see $x_2$ at all, but then we can't know its label!\n", 134 | " * **Relaxation**: we would be happy with $L_{D,f}(h) \\leq \\epsilon$" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "slideshow": { 141 | "slide_type": "subslide" 142 | } 143 | }, 144 | "source": [ 145 | "* Why can it only be **probably** correct?\n", 146 | " * Recall that the input to the learner is *randomly generated*.\n", 147 | " * There is always a (very small) chance to see the same example again and again.\n", 148 | " * **Claim**: no algorithm can guarantee $L_{D,f}(h) \\leq \\epsilon$ for sure, that is, with absolute certainty ($P=1$)\n", 149 | " * **Relaxation**: we would allow the algorithm to fail with probability $\\delta$ where $\\delta \\in (0,1)$ is *user-specified*." 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": { 155 | "slideshow": { 156 | "slide_type": "slide" 157 | } 158 | }, 159 | "source": [ 160 | "### Probably Approximately Correct (PAC) Learning\n", 161 | "---\n", 162 | "* The learner doesn't know $\\mathcal{D}$ and $f$.\n", 163 | "* The learner receives 2 parameters:\n", 164 | " 1. $\\epsilon$ - *accuracy* parameter.\n", 165 | " 2. 
$\\delta$ - *confidence* parameter.\n", 166 | "* The learner can ask for training data, $S$ containing $m(\\epsilon, \\delta)$ examples.\n", 167 | "* The learner should output a hypothesis $h$ such that with probability of **at least** $1-\\delta$ it holds that $L_{D,f} \\leq \\epsilon$.\n", 168 | " * That is, the learner should be **P**robably (with probability at least $1-\\delta$) **A**pproximately (up to accuracy $\\epsilon$) **C**orrect.\n" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": { 174 | "slideshow": { 175 | "slide_type": "slide" 176 | } 177 | }, 178 | "source": [ 179 | "### Empirical Risk Minimization (ERM)\n", 180 | "---\n", 181 | "* Consider the setting of *linear classification* and let $h_{\\theta}(x) = \\mathbb{1}\\{\\theta^Tx \\geq 0\\}$.\n", 182 | "* Algorithm goal:\n", 183 | " * Find a hypothesis $h_s$ that minimizes the error (risk) with respect to $\\mathcal{D}$ and $f$.\n", 184 | " * But $\\mathcal{D}$ and $f$ are **unknown**!" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": { 190 | "slideshow": { 191 | "slide_type": "subslide" 192 | } 193 | }, 194 | "source": [ 195 | "* An alternative goal and a reasonable way to fit the parameters $\\theta$ would be to try and minimize the training error: $$ \\hat{L}(h) = L_s(h) = \\frac{|\\{ i \\in [m]: h(x^{(i)}) \\neq y^{(i)} \\}|}{m}, [m]=\\{1,...,m\\} $$ and pick $$ \\hat{\\theta} = \\underset{\\theta}{\\mathrm{argmin}} \\hat{\\epsilon}(h_{\\theta}) = \\underset{\\theta}{\\mathrm{argmin}} \\hat{L}(h_{\\theta}) $$\n", 196 | " * This process is called **empirical risk minimization** (ERM).\n", 197 | " * The resulting hypothesis output by the algorithm is $\\hat{h} = h_{\\hat{\\theta}}$.\n", 198 | " * ERM can be thought of as the most basic learning algorithm.\n", 199 | " * Algorithms like Logistic Regression can also be viewed as approximations to ERM." 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": { 205 | "slideshow": { 206 | "slide_type": "subslide" 207 | } 208 | }, 209 | "source": [ 210 | "* We will leave out the specific parameterization of the hypothesis $\\theta$ and will define the **hypothesis class** $\\mathcal{H}$ used by the learning algorithm to be the set of all classifiers considered by it.\n", 211 | "* ERM can now be thought of as a **minimization over the class of functions** $\\mathcal{H}$, in which the learning algorithm picks the hypothesis: $$ \\hat{h} = \\underset{h \\in \\mathcal{H}}{\\mathrm{argmin}} \\hat{\\epsilon}(h) $$" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": { 217 | "slideshow": { 218 | "slide_type": "subslide" 219 | } 220 | }, 221 | "source": [ 222 | "* **Overfitting**:\n", 223 | " * ERM may result in overfitting for the obvious reasons.\n", 224 | " * Assuming the following distribution: \n", 225 | " * We may build a trivial estimator with 0 (empirical) error: $$ h_s(x) = \\begin{cases}y^{(i)}, \\text{if } \\exists i \\in [m] \\text{ s.t. } x^{(i)} = x \\\\ 0, \\text{ otherwise} \\end{cases} $$\n", 226 | " * In order to avoid overfitting, we induce bias." 
227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": { 232 | "slideshow": { 233 | "slide_type": "subslide" 234 | } 235 | }, 236 | "source": [ 237 | "* **ERM with Inductive Bias**:\n", 238 | " * A common solution to overfitting is to restrict the hypothesis search space.\n", 239 | " * The learner chooses in advance a set of predictors (the hypothesis class $\\mathcal{H}$).\n", 240 | " * The choice of $\\mathcal{H}$ imposes an *inductive* bias (prior knowledge).\n", 241 | " * In the following we will assume **realizability**: $$ \\exists h^{*} \\in \\mathcal{H}, \\text{ s.t. } L_{D,f}(h^{*})=\\epsilon(h^{*}) = 0$$" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": { 247 | "slideshow": { 248 | "slide_type": "slide" 249 | } 250 | }, 251 | "source": [ 252 | "### The Fundamental Theorem of Statistical Learning\n", 253 | "---\n", 254 | "* Let $\\mathcal{H}$ denote a hypothesis class of binary classifiers.\n", 255 | "* Then, there are absolute **constants** $C_1, C_2$ such that the *sample complexity* (how many samples to draw, roughly) of PAC learning $\\mathcal{H}$ is: $$ C_1 \\frac{d(\\mathcal{H}) + \\log(\\frac{1}{\\delta})}{\\epsilon} \\leq m_{\\mathcal{H}}(\\epsilon, \\delta) \\leq C_2 \\frac{d(\\mathcal{H})\\log(\\frac{1}{\\epsilon}) + \\log(\\frac{1}{\\delta})}{\\epsilon} $$\n", 256 | " * $d(\\mathcal{H})$ - the *VC Dimension* (which will be introduced shortly) of hypotheses class $\\mathcal{H}$.\n", 257 | "* Furthermore, this sample complexity is achieved by the ERM learning rule" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": { 263 | "slideshow": { 264 | "slide_type": "slide" 265 | } 266 | }, 267 | "source": [ 268 | "### What Is Learnable and How to Learn?\n", 269 | "---\n", 270 | "* From the fundamental theorem of statistical learning:\n", 271 | " * The sample complexity is characterized by the **VC Dimension**.\n", 272 | " * The ERM learning rule is generic (near) optimal learner." 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": { 278 | "slideshow": { 279 | "slide_type": "slide" 280 | } 281 | }, 282 | "source": [ 283 | "## VC Dimension\n", 284 | "---\n", 285 | "\n", 286 | "### Motivation\n", 287 | "---\n", 288 | "* **Complexity of a learner** - representational power, the ability to generalize.\n", 289 | " * The usual **trade-off**:\n", 290 | " * More power - represent more complex systems $\\to$ may lead to **overfitting**.\n", 291 | " * Less power - won't overfit, but may not find the \"best\" learner.\n", 292 | " * How to quantify the representational power? Not easily...\n", 293 | " * One solution is the **VC Dimension**" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": { 299 | "slideshow": { 300 | "slide_type": "subslide" 301 | } 302 | }, 303 | "source": [ 304 | "* **No Free Lunch**\n", 305 | " * Suppose that $|\\mathcal{X}| = \\infty$\n", 306 | " * For any finite subset $\\mathcal{C} \\subset \\mathcal{X}$ take $\\mathcal{D}$ to be *uniform* distribution over $\\mathcal{C}$\n", 307 | " * If the number of training examples is $m \\leq \\frac{\\mathcal{C}}{2}$, then the learner has no knowledge on at least half the elements in $\\mathcal{C}$\n", 308 | " * Formally: **No Free Lunch Theorem**\n", 309 | " * Fix $\\delta \\in (0,1), \\epsilon < \\frac{1}{2}$. 
For every learner $\\mathcal{A}$ and training set size $m$, there exists $\\mathcal{D}, f$ such that with probability of at least $\\delta$ over the generation of training data $S$ of $m$ examples, it holds that $$ L_{\\mathcal{D}, f}(A(S)) \\geq \\epsilon $$\n", 310 | " * For a *random guess*, $ L_{\\mathcal{D}, f} = \\frac{1}{2}$, so the theorem states that you can't be better than a random guess." 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": { 316 | "slideshow": { 317 | "slide_type": "subslide" 318 | } 319 | }, 320 | "source": [ 321 | "* Suppose we got a **training** set $S=\\{(x^{(1)}, y^{(1)}), ..., (x^{(m)}, y^{(m)})\\}$, and we choose classifiers or hypotheses from a hypotheses class $\\mathcal{H}$.\n", 322 | " * We try to explain the labels using a hypothesis from $\\mathcal{H}$\n", 323 | " * It turned out that the labels we received were *incorrect* and now we get the same instances with different labels: $S' = \\{(x^{(1)}, y'^{(1)}), ..., (x^{(m)}, y'^{(m)})\\}$\n", 324 | " * We try again to explain the labels using a hypothesis from $\\mathcal{H}$\n", 325 | " * If we succeed in doing so (that is, find a hypothesis that explains these labels), then something is fishy...\n", 326 | " * Conclusion: if the classifier is able to explain everything, then it is useless...\n", 327 | " * Formally, if $\\mathcal{H}$ allows all functions over some set $\\mathcal{C}$ of size $m$, then based on the **No Free Lunch** theorem, we can't learn from a subset of size $\\frac{m}{2}$, for example." 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": { 333 | "slideshow": { 334 | "slide_type": "slide" 335 | } 336 | }, 337 | "source": [ 338 | "### VC Dimension - Formal Definition\n", 339 | "---\n", 340 | "* Let $\\mathcal{C} = \\{x_1, ..., x_{|C|} \\} \\subset \\mathcal{X}$\n", 341 | "* Let $\\mathcal{H}_C$ be the restriction of $\\mathcal{H}$ to $\\mathcal{C}$, namely, $\\mathcal{H}_C = \\{h_C: h \\in \\mathcal{H} \\}$ where $h_C: \\mathcal{C} \\to \\{0,1\\}$ or $\\{-1,+1\\} $ is s.t. $h_C(x_i) = h(x_i)$ for every $x_i \\in C$\n", 342 | "* Observation: we can represent each $h_c$ as the vector: $$ \\begin{bmatrix} h(x_1) \\\\ \\vdots \\\\ h(x_{|C|}) \\end{bmatrix} \\in \\{ \\pm 1\\}^{|C|} $$\n", 343 | "* Therfore: $\\mathcal{H}_C \\leq 2^{|C|}$" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": { 349 | "slideshow": { 350 | "slide_type": "subslide" 351 | } 352 | }, 353 | "source": [ 354 | "* We say that $\\mathcal{H}$ **shatters** $\\mathcal{C}$ if $|\\mathcal{H}_C| = 2^{|C|}$\n", 355 | " * That is, $\\mathcal{H}$ can realize any labeling on $\\mathcal{C}$, i.e., if for *any* set of labels $\\{y^{(1)}, ..., y^{(m)} \\}$ there exists some $h \\in \\mathcal{H}$ so that $h(x^{(i)}) = y^{(i)}$ for **all** $i = 1,..., m$ \n", 356 | "* $VCdim(\\mathcal{H})= sup\\{|C| : \\mathcal{H} \\text{ shatters } \\mathcal{C} \\}$" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": { 362 | "slideshow": { 363 | "slide_type": "subslide" 364 | } 365 | }, 366 | "source": [ 367 | "* The VC dimension is the maximal size of a set $\\mathcal{C}$ such that $\\mathcal{H}$ gives no prior knowledge w.r.t. 
$\\mathcal{C}$, or, the size of the largest set that is shattered by $\\mathcal{H}$.\n", 368 | "* In other words, the VC dimension is the maximum number of points that can be arranged such that $h \\in \\mathcal{H}$ can shatter them.\n", 369 | "* **Dichotomy**: a possible seperation of the sample space into sub-samples.\n", 370 | " * For example: $\\{(x_1, 1), (x_2, 0), (x_3, 1)\\}$ is a dichotomy, and also $\\{(x_1, 0), (x_2, 0), (x_3, 1)\\}$ (a total of $2^3$ for this example)." 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": { 376 | "slideshow": { 377 | "slide_type": "subslide" 378 | } 379 | }, 380 | "source": [ 381 | "* **Theorem**: Let $\\mathcal{H}$ be given, and let $d = VCdim(\\mathcal{H})$. Then with probability at least $1-\\delta$, we have that for all $h \\in \\mathcal{H}$: $$ |\\epsilon(h) - \\hat{\\epsilon}(h)| \\leq O(\\sqrt{\\frac{d}{m}\\log\\frac{m}{d} + \\frac{1}{m}\\log\\frac{1}{\\delta}}) $$\n", 382 | "Thus, with probability at least $1-\\delta$ we also have that: $$ \\epsilon(\\hat{h}) \\leq \\epsilon(h^{*}) + O(\\sqrt{\\frac{d}{m}\\log\\frac{m}{d} + \\frac{1}{m}\\log\\frac{1}{\\delta}}) $$\n", 383 | " * $\\epsilon(h)$ is the real (test) error and $\\hat{\\epsilon}(h)$ is the training error (empirical risk).\n", 384 | " * In other words, if a hypothesis class has finite VC dimension, then uniform convergence occurs as $m$ becomes large.\n", 385 | " * **This is a very strong result because we can make a statement on data we have not seen!**" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": { 391 | "slideshow": { 392 | "slide_type": "slide" 393 | } 394 | }, 395 | "source": [ 396 | "### Finding VC Dimension\n", 397 | "---\n", 398 | "* To show that $VCdim(\\mathcal{H}) = d$ we need to show that:\n", 399 | " 1. There **exists** a set $\\mathcal{C}$ of size $d$ which is shattered by $\\mathcal{H}$\n", 400 | " * That is, show that for some ordering of the points, **any** kind of labeling can be attained by hypothesis from $\\mathcal{H}$\n", 401 | " 2. **Every** set $\\mathcal{C}$ of size $d + 1$ is not shattered by $\\mathcal{H}$" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": { 407 | "slideshow": { 408 | "slide_type": "subslide" 409 | } 410 | }, 411 | "source": [ 412 | "* Can be thought of as a **2-player game**:\n", 413 | " * Fix the definition of $h_{\\theta} = f(x;\\theta)$ (the hypotheses class, e.g. linear classifiers)\n", 414 | " * **Player 1**: choose locations $x^{(1)},...,x^{(d)}$\n", 415 | " * *Player 2*: choose target labels $y^{(1)},...,y^{(d)}$\n", 416 | " * **Player 1**: choose a hypothesis $h \\in \\mathcal{H}$, e.g., choose $\\theta$ in the linear classifier\n", 417 | " * If $f(x;\\theta)$ can reproduce the target labeles, **Player 1** wins.\n", 418 | " * $\\exists \\{ x^{(1)}, ..., x^{(d)}\\} \\text{ s.t. } \\forall \\{ y^{(1)}, ..., y^{(d)}\\} \\exists \\theta \\text{ s.t. } \\forall i, f(x^{(i)}) = y^{(i)}$\n", 419 | " * The VC dimension would be the value $d$ if *Player 2* covered all the possibles labels and **Player 1** won every game." 
420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": { 425 | "slideshow": { 426 | "slide_type": "slide" 427 | } 428 | }, 429 | "source": [ 430 | "### VC Dimension - Examples\n", 431 | "---\n", 432 | "#### Example 1 - Toy Example\n", 433 | "---\n", 434 | "Consider 9 samples, and 8 hypotheses as follows:\n", 435 | "\n", 436 | "| | $x_1$ |$x_2$| $x_3$ | $x_4$ |$x_5$ |$x_6$ | $x_7$ | $x_8$ |$x_9$ |\n", 437 | "|------|------|------|------|------|------|------|------|------|------|\n", 438 | "| $h_1$ | 0 | 0 | 1 | 0|0|0|1|0|0|\n", 439 | "| $h_2$ | 0 | 1 | 0 | 0|0|1|0|0|0|\n", 440 | "| $h_3$ | 1 | 0 | 0 | 0|1|1|0|0|0|\n", 441 | "| $h_4$ | 0 | 0 | 0 | 1|1|0|0|0|1|\n", 442 | "| $h_5$ | 0 | 0 | 1 | 0|0|0|0|1|0|\n", 443 | "| $h_6$ | 0 | 1 | 0 | 0|0|0|1|0|0|\n", 444 | "| $h_7$ | 1 | 0 | 0 | 0|0|1|0|0|0|\n", 445 | "| $h_8$ | 0 | 0 | 0 | 0|0|0|0|0|0|\n", 446 | "\n", 447 | "* The first thing to notice is that the whole sample set (1-9) cannot be shattered as we don't have enough hypotheses. In order to shatter the whole set we would need at least $2^9$ hypotheses." 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": { 453 | "slideshow": { 454 | "slide_type": "subslide" 455 | } 456 | }, 457 | "source": [ 458 | "* **Excercise**: Are the following sets shattered?\n", 459 | " * $\\{x_1\\}$\n", 460 | " * $\\{x_5, x_6\\}$\n", 461 | " * $\\{x_1, x_2\\}$\n", 462 | " * $\\{x_5, x_6, x_7\\}$" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": { 468 | "slideshow": { 469 | "slide_type": "subslide" 470 | } 471 | }, 472 | "source": [ 473 | "* **Solution**:\n", 474 | " * $\\{x_1\\}$ - **yes**, by $\\{h_2, h_3\\}$\n", 475 | " * $\\{x_5, x_6\\}$ - **yes**, by $\\{h_1, h_2, h_3, h_4\\}$\n", 476 | " * $\\{x_1, x_2\\}$ - **no**, can't get the classification: $x_1 = 1$ and $x_2 = 1$\n", 477 | " * $\\{x_5, x_6, x_7\\}$ - **no**, can't get the classification: $x_5=x_6=x_7=1$" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": { 483 | "slideshow": { 484 | "slide_type": "subslide" 485 | } 486 | }, 487 | "source": [ 488 | "* **Excercise**: What is the VC dimension of $\\mathcal{X}$?" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": { 494 | "slideshow": { 495 | "slide_type": "subslide" 496 | } 497 | }, 498 | "source": [ 499 | "* **Solution**:\n", 500 | " * The only 3 points with the dichotomy $\\{1, 1, 1\\}$ are $\\{x_1, x_5, x_6 \\}$\n", 501 | " * But the dichotomy $\\{1,0,0\\}$ isn't achievable.\n", 502 | " * $\\to$ No 3 points can be shattered\n", 503 | " * $\\to VCdim(\\mathcal{H}) = 2 $" 504 | ] 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "metadata": { 509 | "slideshow": { 510 | "slide_type": "slide" 511 | } 512 | }, 513 | "source": [ 514 | "#### Example 2 -Threshold Functions\n", 515 | "---\n", 516 | "* Threshold functions - $f \\in \\mathcal{H}$ is a single-parametric threshold classifier on real numbers, i.e., for a certain threshold $\\theta$, the classifier $f_{\\theta}$ returns 1 if the input number is larger than $\\theta$ and 0 otherwise. Formally: $$ \\mathcal{X} = \\mathbb{R}, \\mathcal{H} = \\{ x \\to sign(x-\\theta): \\theta \\in \\mathbb{R} \\} $$" 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": { 522 | "slideshow": { 523 | "slide_type": "subslide" 524 | } 525 | }, 526 | "source": [ 527 | "\n", 528 | "* Let's \"prove\" that $VCdim(\\mathcal{H}) = 1$:\n", 529 | " 1. 
One ($n=1$) point can be shattered because for every point $x$, a classifier $f_{\\theta}(x)$ labels it as 0 if $\\theta > x$ and 1 if $\\theta < x$. For example, for $(x=0, label=0), \\theta= 1$ and for $(x=0, label=1), \\theta= -1$.\n", 530 | " 2. No two ($n+1=2$) points can be shattered - because for every set of 2 points, if the smaller is labeled 1, then the larger must also be labeled 1, so not all labelings are possible.\n", 531 | " \n", 532 | "\n", 533 | " \n", 534 | " Image Source (CalTech's free machine Learning online course by Yaser Abu-Mostafa)" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": { 540 | "slideshow": { 541 | "slide_type": "slide" 542 | } 543 | }, 544 | "source": [ 545 | "#### Example 3 - Intervals Functions\n", 546 | "---\n", 547 | "* Intervals functions - $f \\in \\mathcal{H}$ is a single-parametric interval classifier on real numbers, i.e, for a certain parameter $\\theta$, the classifier $f_{\\theta}$ returns 1 if the input number is in the interval $[\\theta, \\theta+4]$ and 0 otherwise." 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": { 553 | "slideshow": { 554 | "slide_type": "subslide" 555 | } 556 | }, 557 | "source": [ 558 | "* Let's \"prove\" that $VCdim(\\mathcal{H}) = 2$:\n", 559 | " 1. Two ($n=2$) points can be shattered because for every set $\\{x, x+2\\}$, a classifier $f_{\\theta}(x)$ labels it as:\n", 560 | " * $(0,0)$ - if $\\theta < x - 4$ or if $\\theta > x + 2$.\n", 561 | " * $(1,0)$ - if $\\theta \\in [x-4, x-2)$.\n", 562 | " * $(1,1)$ - if $\\theta \\in [x-2, x]$.\n", 563 | " * $(0,1)$ - if $\\theta \\in (x, x+2]$.\n", 564 | " 2. No three ($n+1=3$) points can be shattered - because for every set of three numbers, if the smallest and the largest are labeled 1, then the middle one must also be labeled 1, so not all labelings are possible." 
565 | ] 566 | }, 567 | { 568 | "cell_type": "markdown", 569 | "metadata": { 570 | "slideshow": { 571 | "slide_type": "subslide" 572 | } 573 | }, 574 | "source": [ 575 | "* This result can be generalized for a two-parametric interval classifier $h_{a,b}$: $$ \\mathcal{X} = \\mathbb{R}, \\mathcal{H} = \\{ h_{a,b}: a < b \\in \\mathbb{R} \\} $$ where $$ h_{a,b}(x) = 1 \\iff x \\in [a,b] $$\n", 576 | "\n", 577 | "\n", 578 | "\n", 579 | "Image Source (CalTech's free machine Learning online course by Yaser Abu-Mostafa)" 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": { 585 | "slideshow": { 586 | "slide_type": "slide" 587 | } 588 | }, 589 | "source": [ 590 | "#### Example 4 - Axis Aligned Rectangles\n", 591 | "---\n", 592 | "* Axis aligned rectangles: $$ \\mathcal{X} = \\mathbb{R}^2, \\mathcal{H} = \\{ h_{a_1,a_2,b_1, b_2}: a_1 < a_2 \\text{ and } b_1 < b_2 \\} $$, where $$ h_{a_1,a_2,b_1, b_2}(x_1, x_2) = 1 \\iff x_1 \\in [a_1, a_2] \\text{ and } x_2 \\in [b_1, b_2] $$" 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "metadata": { 598 | "slideshow": { 599 | "slide_type": "subslide" 600 | } 601 | }, 602 | "source": [ 603 | "* Let's \"prove\" that $VCdim(\\mathcal{H}) = 4$:\n", 604 | " \n", 605 | "1.Four ($n=4$) points can be shattered as seen in the following arrangement: \n", 606 | "\n", 607 | "Image from Princeton's COS 511: Theoretical Machine Learning, Lecture on VC-Dimension" 608 | ] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "metadata": { 613 | "slideshow": { 614 | "slide_type": "subslide" 615 | } 616 | }, 617 | "source": [ 618 | "2.No five ($n+1=5$) can be shattered - for any 5-point set, we can construct a data assignment in this way: pick the topmost, bottommost, leftmost and rightmost points and give them the label “+”. Because there are 5 points, there must be at least one point left to which we assign “−”. Any rectangle that contains all the “+” points must contain the “−” point, which is a case where shattering is not possible.\n", 619 | "\n", 620 | "" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": { 626 | "slideshow": { 627 | "slide_type": "slide" 628 | } 629 | }, 630 | "source": [ 631 | "#### Example 5 - Halfspaces\n", 632 | "---\n", 633 | "* Halfspaces (linear classifiers): $$ \\mathcal{X} = \\mathbb{R}^2, \\mathcal{H} = \\{ x \\to sign(\\langle w, x \\rangle) \\}: w \\in \\mathbb{R}^2 $$\n", 634 | " * For example: $h(x) = \\mathbb{1}\\{ \\theta_1 x_1 + \\theta_2 x_2 \\geq 0\\}$" 635 | ] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "metadata": { 640 | "slideshow": { 641 | "slide_type": "subslide" 642 | } 643 | }, 644 | "source": [ 645 | "* Let's \"prove\" that $VCdim(\\mathcal{H}) = 3$:\n", 646 | " \n", 647 | "1.Three ($n=3$) points can be shattered as seen in the following arrangement: " 648 | ] 649 | }, 650 | { 651 | "cell_type": "markdown", 652 | "metadata": { 653 | "slideshow": { 654 | "slide_type": "subslide" 655 | } 656 | }, 657 | "source": [ 658 | "2.No four ($n+1=4$) can be shattered - We consider two cases:\n", 659 | " 1. The four points form a convex region, i.e., lie on the convex hull defined by the 4 points. \n", 660 | " 2. Three of the 4 points define the convex hull and the 4th point is internal. \n", 661 | " \n", 662 | "* In the first case, the labeling which is positive for one diagonal pair and negative to the other pair cannot be realized by a separating line. 
\n", 663 | "* In the second case, a labeling which is positive for the three hull points and negative for the interior point cannot be realized.\n", 664 | " \n", 665 | "\n", 666 | "\n", 667 | "* The results is generalized for hyperplanes: VC dimension of hyperplanes in $\\mathbb{R}^d$ is $d+1$." 668 | ] 669 | }, 670 | { 671 | "cell_type": "markdown", 672 | "metadata": { 673 | "slideshow": { 674 | "slide_type": "slide" 675 | } 676 | }, 677 | "source": [ 678 | "### VC Dimension - Special Cases\n", 679 | "---\n", 680 | "* $VCdim(\\mathcal{H}) = 0$ - When is the VC dimension equals to zero? Assume $\\mathcal{X} = \\mathbb{R}^2$. Let $\\mathcal{H}$ contain a **single** hypothesis $h_1$. Thus, the VC dimension of $\\mathcal{H}$ is **always** 0! A single hypothesis can impose only one classification, can only assign one labeling to a set of points.\n" 681 | ] 682 | }, 683 | { 684 | "cell_type": "markdown", 685 | "metadata": { 686 | "slideshow": { 687 | "slide_type": "subslide" 688 | } 689 | }, 690 | "source": [ 691 | "* $VCdim(\\mathcal{H}) = \\infty$ - When does the VC dimension go to infinity? Assume $\\mathcal{X} = \\mathbb{R}^2$. Let $\\mathcal{A}$ be the **set of all convex polygons** in $\\mathcal{X}$. Define $\\mathcal{H}$ as the class of all hypotheses $h_p(x), p \\in \\mathcal{A}$: $$ h_p(x) = \\begin{cases} 1, \\text{ if } x \\text{ is contained within polygon } p \\\\ 0, \\text{ otherwise} \\end{cases} $$" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": { 697 | "slideshow": { 698 | "slide_type": "subslide" 699 | } 700 | }, 701 | "source": [ 702 | "Let's see why $VCdim(\\mathcal{H}) = \\infty$: for any positive integer $n$, take $n$ points from $\\mathcal{X}$. Place the $n$ points **uniformly spaced** on the **unit circle**. For each $2^n$ subset of this data, there is a convex polygon with vertices at these $n$ points. For each subset, the convex polygon contains the set and excludes its complement.\n", 703 | "\n", 704 | "Image from Learnability and VC Dimension at LMU Munchen" 705 | ] 706 | }, 707 | { 708 | "cell_type": "markdown", 709 | "metadata": { 710 | "slideshow": { 711 | "slide_type": "slide" 712 | } 713 | }, 714 | "source": [ 715 | "### Recommended Videos\n", 716 | "---\n", 717 | "#### Warning!\n", 718 | "* These videos do not replace the lectures and tutorials.\n", 719 | "* Please use these to get a better understanding of the material, and not as an alternative to the written material.\n", 720 | "\n", 721 | "#### Video By Subject\n", 722 | "\n", 723 | "* VC Dimension - VC Dimension - Alexander Ihler\n", 724 | "* Learning Theory by Andrew Ng (Stanford)\n", 725 | " * Lecture 9 | Machine Learning (Stanford)\n", 726 | " * Lecture 10 | Machine Learning (Stanford)\n", 727 | "* Learning Theory Lectures By Shai Ben-David\n", 728 | " * Lecture 2\n", 729 | " * Lecture 3" 730 | ] 731 | }, 732 | { 733 | "cell_type": "markdown", 734 | "metadata": { 735 | "slideshow": { 736 | "slide_type": "skip" 737 | } 738 | }, 739 | "source": [ 740 | "## Credits\n", 741 | "---\n", 742 | "* Based on slides by Shai Shalev-Schwarz\n", 743 | "* Great (!) 
707 | { 708 | "cell_type": "markdown", 709 | "metadata": { 710 | "slideshow": { 711 | "slide_type": "slide" 712 | } 713 | }, 714 | "source": [ 715 | "### Recommended Videos\n", 716 | "---\n", 717 | "#### Warning!\n", 718 | "* These videos do not replace the lectures and tutorials.\n", 719 | "* Please use these to get a better understanding of the material, and not as an alternative to the written material.\n", 720 | "\n", 721 | "#### Video By Subject\n", 722 | "\n", 723 | "* VC Dimension - VC Dimension - Alexander Ihler\n", 724 | "* Learning Theory by Andrew Ng (Stanford)\n", 725 | " * Lecture 9 | Machine Learning (Stanford)\n", 726 | " * Lecture 10 | Machine Learning (Stanford)\n", 727 | "* Learning Theory Lectures By Shai Ben-David\n", 728 | " * Lecture 2\n", 729 | " * Lecture 3" 730 | ] 731 | }, 732 | { 733 | "cell_type": "markdown", 734 | "metadata": { 735 | "slideshow": { 736 | "slide_type": "skip" 737 | } 738 | }, 739 | "source": [ 740 | "## Credits\n", 741 | "---\n", 742 | "* Based on slides by Shai Shalev-Shwartz\n", 743 | "* Great (!) Reading Resource - CS229 - Stanford - Machine Learning - Learning Theory\n", 744 | " * It covers everything and goes into much more detail\n", 745 | "* Icons from Icons8.com - https://icons8.com" 746 | ] 747 | } 748 | ], 749 | "metadata": { 750 | "kernelspec": { 751 | "display_name": "Python 3", 752 | "language": "python", 753 | "name": "python3" 754 | }, 755 | "language_info": { 756 | "codemirror_mode": { 757 | "name": "ipython", 758 | "version": 3 759 | }, 760 | "file_extension": ".py", 761 | "mimetype": "text/x-python", 762 | "name": "python", 763 | "nbconvert_exporter": "python", 764 | "pygments_lexer": "ipython3", 765 | "version": "3.6.9" 766 | } 767 | }, 768 | "nbformat": 4, 769 | "nbformat_minor": 2 770 | } 771 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: ml-course 2 | channels: 3 | - conda-forge 4 | - anaconda 5 | dependencies: 6 | - python 7 | - numpy 8 | - scikit-learn 9 | - scipy 10 | - pip: 11 | - pandas 12 | - matplotlib -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_01_probability_mle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_01_probability_mle.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_02_statistics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_02_statistics.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_03_linear_algebra.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_03_linear_algebra.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_04_pca_feature_selection.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_04_pca_feature_selection.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_05_evaluation_validation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_05_evaluation_validation.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_06_decision_trees.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_06_decision_trees.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_07_optimization.pdf: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_07_optimization.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_08_linear_regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_08_linear_regression.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_09_linear_models.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_09_linear_models.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_10_expectation_maximization.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_10_expectation_maximization.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_11_boosting_bagging.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_11_boosting_bagging.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_12_svm.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_12_svm.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_13_deep_learning_intro_backprop.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_13_deep_learning_intro_backprop.pdf -------------------------------------------------------------------------------- /pdf/cs236756_tutorial_14_pac_vc_dimension.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/cs236756-intro-to-ml/693faaa17a04b3649fa6eecf65d79adbbca9af80/pdf/cs236756_tutorial_14_pac_vc_dimension.pdf --------------------------------------------------------------------------------