├── 3. PyTorch ├── 3.1. Tensors │ └── gitkeep ├── 3.3. Devices │ ├── gitkeep │ └── device.md ├── 3.4. Modules │ ├── gitkeep │ └── module.md ├── 3.5. Datasets │ ├── gitkeep │ └── Datasets.md ├── 3.7. Losses │ └── gitkeep ├── 3.6. Dataloader │ └── gitkeep ├── 3.8. Optimizers │ └── gitkeep └── 3.2. Autograd │ ├── gd.png │ └── autograd.md ├── 5. Transformers ├── 5.2. Inference │ └── gitkeep ├── 5.5. BERT, T5, GPT │ └── gitkeep ├── 5.4. Batch Processing │ └── gitkeep ├── 5.3. Training, Pre-Training, Fine-Tuning │ └── gitkeep └── 5.1. Self-Attention, Cross-Attention, Masked Self-Attention, Layer Normalization, Word Embedding, Positional Encoding │ └── gitkeep ├── 7. OpenCV & Generative AI ├── 7.3. UNet │ └── gitkeep ├── 7.1. Object Detection │ └── gitkeep ├── 7.4. Autoencoder, Variational Autoencoder │ └── gitkeep ├── 7.5. Generative Adversarial Network, Adversarial Attack │ └── gitkeep └── 7.8. Stable Diffusion, Denoising Diffusion Probabilistic Methods │ └── gitkeep ├── 2. Machine Learning Generics ├── 2.7. Boosting │ └── gitkeep ├── 2.0. Machine Learning Terminology │ └── gitkeep ├── 2.5. Decision Trees, Random Forests │ └── gitkeep ├── 2.1. Linear Regression & Logistic Regression │ ├── gitkeep │ └── Linear & Logistic Regression.md ├── 2.3. Regularization, Bias-Variance Trade-Off, Kernel Methods, Cross Validation │ ├── KernelTrick.png │ ├── Cross Validation.md │ └── Regularization, Bias–Variance Trade-off, Kernel Methods.md ├── 2.4. Principal Component Analysis, Dimensionality Reduction │ ├── PCA.md │ └── Pizza.csv ├── 2.1. Support Vector Machines │ └── SVM.ipynb └── 2.5. K-Nearest Neighbors, Clustering K-Means │ └── KNN, Kmeans.ipynb ├── 4. Deep Learning & Computer Vision ├── 4.6. VGG │ └── gitkeep ├── 4.7. ResNet │ ├── gitkeep │ ├── resnet1.png │ ├── resnet2.png │ ├── resnet3.png │ └── resnet.ipynb ├── 4.8. GoogLeNet │ └── gitkeep ├── 4.9. Transfer Learning │ └── gitkeep ├── 4.5. Image Data Augmentation │ └── gitkeep ├── 4.1. Forward Propagation, Activation Functions, Linear Layer │ ├── gitkeep │ └── 4.1 forward propagation + Activation functions + Linear Layer.md ├── 4.3. Parameter Initialization, Batch Normalization, Dropout │ └── gitkeep ├── 4.4. Convolutional Layers, Pooling Layers, Convolutional Neural Network │ ├── CNN.md │ └── CNN.ipynb └── 4.2. Backpropagation, Gradient Descent, Adaptive Moment Estimation │ └── Backpropagation.md ├── 0. Prerequisites ├── 0.2. Python For AI │ ├── 0.2.2. NumPy │ │ └── gitkeep │ ├── 0.2.3. Pandas │ │ └── gitkeep │ ├── 0.2.5. Seaborn │ │ └── gitkeep │ ├── 0.2.4. Matplotlib │ │ └── gitkeep │ └── 0.2.1. Advanced Python Techniques For AI │ │ └── gitkeep └── 0.1. Basic Environment For Python │ └── 0.1 Setup.md ├── 1. Mathematical Methods For AI ├── 1.2. Calculus │ ├── 1.2.3. Chain Rule │ │ └── gitkeep │ ├── 1.2.1. Single-Variable Derivatives │ │ └── gitkeep │ └── 1.2.2. Multi-Variable Derivatives & Gradients │ │ └── gitkeep ├── 1.4. Convex Optimization │ ├── 1.4.3. Duality │ │ └── gitkeep │ ├── 1.4.1. Convexity │ │ └── gitkeep │ └── 1.4.2. Gradient Descent │ │ └── gitkeep ├── 1.3. Probability & Statistics │ ├── 1.3.3. Mean │ │ └── gitkeep │ ├── 1.3.5. Bayes' Rule │ │ └── gitkeep │ ├── 1.3.1. Discrete Distributions │ │ └── gitkeep │ ├── 1.3.4. Variance, Covariance │ │ └── gitkeep │ └── 1.3.2. Continuous Distributions │ │ └── gitkeep └── 1.1. Linear Algebra │ └── Linear Algebra.md ├── 6. Natural Language Processing & Graph Neural Networks ├── 6.7. Vision Transformers │ └── gitkeep ├── 6.2. Word Embedding Methods │ └── gitkeep ├── 6.6. 
Graph Convolutional Networks │ └── gitkeep ├── 6.5. Message-Passing Neural Networks │ └── gitkeep ├── 6.1. Character, Subword & Word Tokenization │ └── gitkeep ├── 6.4. Encoder-Only & Decoder-Only Transformers │ └── gitkeep └── 6.3. Skip-Gram, Continuous Word Bag, Global Vectors │ └── gitkeep ├── LICENSE └── README.md /3. PyTorch/3.1. Tensors/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3. PyTorch/3.3. Devices/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3. PyTorch/3.4. Modules/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3. PyTorch/3.5. Datasets/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3. PyTorch/3.7. Losses/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3. PyTorch/3.6. Dataloader/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3. PyTorch/3.8. Optimizers/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /5. Transformers/5.2. Inference/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /5. Transformers/5.5. BERT, T5, GPT/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /7. OpenCV & Generative AI/7.3. UNet/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /5. Transformers/5.4. Batch Processing/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.7. Boosting/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.6. VGG/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /0. Prerequisites/0.2. Python For AI/0.2.2. NumPy/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /0. Prerequisites/0.2. Python For AI/0.2.3. Pandas/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /0. Prerequisites/0.2. Python For AI/0.2.5. 
Seaborn/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.7. ResNet/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.8. GoogLeNet/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /7. OpenCV & Generative AI/7.1. Object Detection/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /0. Prerequisites/0.2. Python For AI/0.2.4. Matplotlib/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.9. Transfer Learning/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /5. Transformers/5.3. Training, Pre-Training, Fine-Tuning/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.2. Calculus/1.2.3. Chain Rule/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.0. Machine Learning Terminology/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.5. Decision Trees, Random Forests/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.5. Image Data Augmentation/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.4. Convex Optimization/1.4.3. Duality/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /7. OpenCV & Generative AI/7.4. Autoencoder, Variational Autoencoder/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.3. Probability & Statistics/1.3.3. Mean/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.4. Convex Optimization/1.4.1. Convexity/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.1. 
Linear Regression & Logistic Regression/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /0. Prerequisites/0.2. Python For AI/0.2.1. Advanced Python Techniques For AI/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.2. Calculus/1.2.1. Single-Variable Derivatives/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.3. Probability & Statistics/1.3.5. Bayes' Rule/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.4. Convex Optimization/1.4.2. Gradient Descent/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /6. Natural Language Processing & Graph Neural Networks/6.7. Vision Transformers/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /6. Natural Language Processing & Graph Neural Networks/6.2. Word Embedding Methods/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /7. OpenCV & Generative AI/7.5. Generative Adversarial Network, Adversarial Attack/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.2. Calculus/1.2.2. Multi-Variable Derivatives & Gradients/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.3. Probability & Statistics/1.3.1. Discrete Distributions/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.3. Probability & Statistics/1.3.4. Variance, Covariance/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /6. Natural Language Processing & Graph Neural Networks/6.6. Graph Convolutional Networks/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /7. OpenCV & Generative AI/7.8. Stable Diffusion, Denoising Diffusion Probabilistic Methods/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.3. Probability & Statistics/1.3.2. 
Continuous Distributions/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.1. Forward Propagation, Activation Functions, Linear Layer/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.3. Parameter Initialization, Batch Normalization, Dropout/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /6. Natural Language Processing & Graph Neural Networks/6.5. Message-Passing Neural Networks/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /6. Natural Language Processing & Graph Neural Networks/6.1. Character, Subword & Word Tokenization/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /6. Natural Language Processing & Graph Neural Networks/6.4. Encoder-Only & Decoder-Only Transformers/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /6. Natural Language Processing & Graph Neural Networks/6.3. Skip-Gram, Continuous Word Bag, Global Vectors/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3. PyTorch/3.2. Autograd/gd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHSID-Data-science-Club/Highschool_ML_Course/HEAD/3. PyTorch/3.2. Autograd/gd.png -------------------------------------------------------------------------------- /5. Transformers/5.1. Self-Attention, Cross-Attention, Masked Self-Attention, Layer Normalization, Word Embedding, Positional Encoding/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.7. ResNet/resnet1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHSID-Data-science-Club/Highschool_ML_Course/HEAD/4. Deep Learning & Computer Vision/4.7. ResNet/resnet1.png -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.7. ResNet/resnet2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHSID-Data-science-Club/Highschool_ML_Course/HEAD/4. Deep Learning & Computer Vision/4.7. ResNet/resnet2.png -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.7. ResNet/resnet3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHSID-Data-science-Club/Highschool_ML_Course/HEAD/4. Deep Learning & Computer Vision/4.7. 
ResNet/resnet3.png -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.3. Regularization, Bias-Variance Trade-Off, Kernel Methods, Cross Validation/KernelTrick.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHSID-Data-science-Club/Highschool_ML_Course/HEAD/2. Machine Learning Generics/2.3. Regularization, Bias-Variance Trade-Off, Kernel Methods, Cross Validation/KernelTrick.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 SHSID Data science Club 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /3. PyTorch/3.2. Autograd/autograd.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # Autograd 4 | ##### **Jerry Zhang** | SHSID Data Science Club 5 | 6 |
7 | 8 | ## Hmm? 9 | Recall from the previous chapters that a key concept in Machine Learning is Backpropagation: calculating derivatives of individual elements in tensors. Backpropagation is essential for calculating the gradients used in gradient descent. 10 | 11 |
12 | 13 |
14 | 15 | Calculating the gradients by hand is inefficient and unnecessary, which is where Autograd comes in. 16 | 17 | *Note that a gradient is a value derived from the derivatives; it's how you USE the derivatives. Changing how you calculate the gradient results in different optimizing behavior; more on this in the Optimizers chapter* 18 | 19 | ## Usage 20 | Autograd is PyTorch's auto-differentiation engine; that is, it traces a tensor's contribution to some result and calculates the gradient accordingly. 21 | 22 | ```python 23 | # 'requires_grad' turns on Autograd 24 | x = torch.tensor([1., 2., 3.], requires_grad=True) 25 | 26 | y = x**2 + 3*x + 5 27 | 28 | # Backpropagation 29 | # The tensor passed in is the gradient tensor 30 | # Since each individual element in the tensor can contribute to many output values, a weighted sum is taken 31 | # The gradient tensor holds the weights of the weighted sum 32 | y.backward(torch.tensor([1., 1., 1.])) 33 | 34 | # The resulting gradients are stored in their respective tensors 35 | x.grad # -> torch.tensor([5., 7., 9.]) 36 | 37 | # Resetting the gradients 38 | x.grad.zero_() 39 | # Usually though, you would not call grad.zero_() yourself; more on this in the optimizer section. 40 | ``` 41 | 42 | Sometimes, for example when you are simply evaluating your model, you don't want to track the gradients, as tracking affects performance. 43 | There are two ways to achieve this. 44 | ```python 45 | x = torch.tensor([1., 2., 3.], requires_grad=True) 46 | 47 | x_detached = x.detach() # Not recommended; returns a new tensor detached from the graph 48 | 49 | with torch.no_grad(): 50 |     # Recommended 51 |     # Only the code within this block won't be tracked 52 |     # In other words, it's a temporary detach 53 |     y = x**10 + 114 54 | ``` 55 | 56 | ## True Usage 57 | 58 | In most cases though, you will be using an optimizer to do the backpropagation; more on this in the Optimizers chapter. In that case, your code would resemble something like this. 59 | 60 | ```python 61 | # Run the model and compute the loss (here the model is assumed to return the loss directly; usually you would apply a loss function to its output) 62 | loss = model(input) 63 | # Backpropagating the contributions 64 | loss.backward() 65 | # Calculating the gradients and updating the weights 66 | optimizer.step() 67 | # Clearing the gradients 68 | optimizer.zero_grad() 69 | ``` 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SHSID Data Science Club 2 | 3 |
4 | 5 | International Olympiad of Artificial Intelligence Guide 6 | ===================== 7 | created by the SHSID Data Science Club 8 | 9 | --- 10 | 11 | A machine learning course designed for high school students, inspired by the USAAIO course outline provided by Beaver-Edge. 12 | 13 | **For the best experience and visualization, download this repository and run the `ipynb`!** 14 | 15 | --- 16 | 17 | # Table of Contents 18 | 19 | ### 0. Prerequisites 20 | * 0.1. Basic Environment For Python 21 | * **0.2. Python For AI** 22 | * 0.2.1. Advanced Python Techniques For AI 23 | * 0.2.2. NumPy 24 | * 0.2.3. Pandas 25 | * 0.2.4. Matplotlib 26 | * 0.2.5. Seaborn 27 | 28 | ### 1. Mathematical Methods For AI 29 | * 1.1. Linear Algebra 30 | * **1.2. Calculus** 31 | * 1.2.1. Single-Variable Derivatives 32 | * 1.2.2. Multi-Variable Derivatives & Gradients 33 | * 1.2.3. Chain Rule 34 | * **1.3. Probability & Statistics** 35 | * 1.3.1. Discrete Distributions 36 | * 1.3.2. Continuous Distributions 37 | * 1.3.3. Mean 38 | * 1.3.4. Variance, Covariance 39 | * 1.3.5. Bayes' Rule 40 | * **1.4. Convex Optimization** 41 | * 1.4.1. Convexity 42 | * 1.4.2. Gradient Descent 43 | * 1.4.3. Duality 44 | 45 | ### 2. Machine Learning Generics 46 | * 2.0. Machine Learning Terminology 47 | * 2.1. Linear Regression & Logistic Regression 48 | * 2.1. Support Vector Machines 49 | * 2.3. Regularization, Bias-Variance Trade-Off, Kernel Methods, Cross Validation 50 | * 2.4. Principal Component Analysis, Dimensionality Reduction 51 | * 2.5. Decision Trees, Random Forests 52 | * 2.5. K-Nearest Neighbors, Clustering K-Means 53 | * 2.7. Bagging & Boosting (XGBoost) 54 | 55 | ### 3. PyTorch 56 | * 3.1. Tensors 57 | * 3.2. Autograd 58 | * 3.3. Devices 59 | * 3.4. Modules 60 | * 3.5. Datasets 61 | * 3.6. Dataloader 62 | * 3.7. Losses 63 | * 3.8. Optimizers 64 | 65 | ### 4. Deep Learning & Computer Vision 66 | * 4.1. Forward Propagation, Activation Functions, Linear Layer 67 | * 4.2. Backpropagation, Gradient Descent, Adaptive Moment Estimation 68 | * 4.3. Parameter Initialization, Batch Normalization, Dropout 69 | * 4.4. Convolutional Layers, Pooling Layers, Convolutional Neural Network 70 | * 4.5. Image Data Augmentation 71 | * 4.6. VGG 72 | * 4.7. ResNet 73 | * 4.8. GoogLeNet 74 | * 4.9. Transfer Learning 75 | * 4.10. Recurrent Neural Networks 76 | * 4.11. Reinforcement Learning 77 | 78 | ### 5. Transformers 79 | * **5.1. Attention** 80 | * 5.1.1. Self-Attention 81 | * 5.1.2. Cross-Attention, Masked Self-Attention, Multi-Head Attention 82 | * 5.1.3. Layer Normalization, Word Embedding, Positional Encoding 83 | * 5.2. Inference 84 | * 5.3. Training, Pre-Training, Fine-Tuning 85 | * 5.4. Batch Processing 86 | * 5.5. BERT, T5, GPT 87 | * 5.6. RL with Human Feedback for LLMs 88 | * 5.7. Modern LLM Optimizers, Mixture-of-Experts, Retrieval Augmented Generation (DeepSeek Case Study) 89 | 90 | ### 6. Natural Language Processing & Graph Neural Networks 91 | * 6.1. Character, Subword & Word Tokenization 92 | * 6.2. Word Embedding Methods 93 | * 6.3. Skip-Gram, Continuous Word Bag, Global Vectors 94 | * 6.4. Encoder-Only & Decoder-Only Transformers 95 | * 6.5. Message-Passing Neural Networks 96 | * 6.6. Graph Convolutional Networks 97 | * 6.7. Vision Transformers 98 | 99 | ### 7. OpenCV & Generative AI 100 | * 7.1. Object Detection 101 | * 7.2. UNet 102 | * 7.3. Autoencoder, Variational Autoencoder 103 | * 7.4. Generative Adversarial Network, Adversarial Attack 104 | * 7.5.
Stable Diffusion, Denoising Diffusion Probabilistic Methods 105 | * 7.6. State Space Models, Selective State Spaces (Mamba Case Study) 106 | 107 | -------------------------------------------------------------------------------- /3. PyTorch/3.5. Datasets/Datasets.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | 3 | Datasets are undoubtedly the fuel of machine learning: regardless of what algorithm you have, you learn from data. 4 | However, it's not just a simple drag-and-drop task. 5 | Data needs to be preprocessed: converted into the right format, normalized, split into train and validation, etc. 6 | To facilitate this, PyTorch provides many functions and modules, for example the `Dataset` class. 7 | 8 | ```python 9 | from torch.utils.data import Dataset, IterableDataset 10 | ``` 11 | 12 | `Dataset` is a protocol that behaves much like an array. 13 | There are two types: map-style and iterable-style. 14 | 15 | ## Initializing 16 | 17 | ### Map-style 18 | 19 | For random access: 20 | ```python 21 | class Custom_DS(Dataset): 22 |     def __init__(self, ...): ... 23 |     def __len__(self) -> int: ... 24 |     def __getitem__(self, idx): ... 25 | ``` 26 | 27 | ### Iterable-style 28 | 29 | For serial access (note that iterable-style datasets subclass `IterableDataset` rather than `Dataset`): 30 | ```python 31 | class Custom_DS(IterableDataset): 32 |     def __init__(self, ...): ... 33 |     def __len__(self) -> int: ... 34 |     def __iter__(self): ... 35 | ``` 36 | 37 | However, you often won't have to define all of that manually. 38 | For example, if you load a dataset from *Hugging Face*, it will give you a ready-to-use dataset instance. 39 | 40 | There are also some predefined Datasets, the common ones being: 41 | 42 | ```python 43 | from torch.utils.data import TensorDataset, ConcatDataset, Subset, ChainDataset 44 | ``` 45 | 46 | ### TensorDataset 47 | 48 | ```python 49 | features = torch.randn(1000, 10) # 1000 samples of 10 features each 50 | labels = torch.randint(0, 2, (1000,)) # 1000 binary labels (0 or 1) 51 | 52 | dataset = TensorDataset(features, labels) 53 | 54 | x, y = dataset[0] # features and label of the first sample 55 | x2, y2 = dataset[1] # features and label of the second sample 56 | ``` 57 | 58 | ### ConcatDataset 59 | 60 | ```python 61 | # combines map-style datasets 62 | combined = ConcatDataset([dataset1, dataset2, dataset3]) 63 | ``` 64 | 65 | ### ChainDataset 66 | 67 | ```python 68 | # combines iterable-style datasets 69 | combined = ChainDataset([dataset1, dataset2, dataset3]) 70 | ``` 71 | 72 | ### Subset 73 | 74 | ```python 75 | # makes a subset of dataset with samples of index ∈ idxs 76 | idxs = [1, 2, 3, 5, 8, 13] 77 | dataset_subset = Subset(dataset, idxs) 78 | ``` 79 | 80 | ## Dataset wrapper 81 | 82 | If you want to apply transformations to your dataset dynamically, you might write a wrapper for an existing dataset: 83 | 84 | ```python 85 | class Custom_DS(Dataset): 86 |     def __init__(self, dataset, transformation): 87 |         self._dataset = dataset 88 |         self._transformation = transformation 89 |     def __len__(self): 90 |         return len(self._dataset) 91 |     def __getitem__(self, idx): 92 |         item = self._dataset[idx] 93 |         x, y = item["image"], item["label"] 94 | 95 |         x_transformed = self._transformation(x) 96 | 97 |         return x_transformed, y 98 | ``` 99 | 100 | ## Utilities 101 | 102 | ### Random_split 103 | 104 | Datasets might not come presplit into training, validation, and test sets. By using the `random_split` function you can split one dataset randomly into subsets. 105 | 106 | ```python 107 | from torch.utils.data import random_split 108 | 109 | dataset = ...
110 | 111 | train_size = int(0.7 * len(dataset)) # 70% train 112 | val_size = int(0.15 * len(dataset)) # 15% validation 113 | test_size = len(dataset) - train_size - val_size # the rest is the test set 114 | 115 | train_dataset, val_dataset, test_dataset = random_split( 116 |     dataset, [train_size, val_size, test_size] 117 | ) 118 | ``` 119 | 120 | For reproducibility, you can fix the seed. (Computers cannot generate truly random numbers; they produce seemingly random numbers that follow a certain distribution (usually uniform). The seed is the parameter that creates variance between different "random" generation attempts; in other words, by fixing the seed you will get a reproducible sequence of "random" numbers.) 121 | 122 | ```python 123 | generator = torch.Generator().manual_seed(42) # seed=42 124 | train_dataset, val_dataset = random_split( 125 |     dataset, [0.8, 0.2], generator=generator 126 | ) 127 | ``` 128 | 129 | The seed is set to 42, because 42 is the answer to the universe. 130 | -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.4. Principal Component Analysis, Dimensionality Reduction/PCA.md: -------------------------------------------------------------------------------- 1 | 2 | # Principal Component Analysis (PCA) 3 | PCA is an algorithm that reduces the dimensionality of data using linear algebra. It requires some linear algebra knowledge to fully grasp. 4 | 5 | ## Process 6 | Consider $\text{X}_0 \in \mathbb{R}^{n \times p}$ to be the original data. We select $k$ as the number of components we want. 7 | 8 | 0. Data Preprocessing: 9 | 10 | We first center the data: 11 | 12 | $$ 13 | \text{X} = 14 | \begin{pmatrix} 15 | x_{11}-\bar x_1 & x_{12}-\bar x_2 & \cdots & x_{1p}-\bar x_p \\ 16 | x_{21}-\bar x_1 & x_{22}-\bar x_2 & \cdots & x_{2p}-\bar x_p \\ 17 | \vdots & \vdots & \ddots & \vdots \\ 18 | x_{n1}-\bar x_1 & x_{n2}-\bar x_2 & \cdots & x_{np}-\bar x_p 19 | \end{pmatrix} 20 | \quad \text{with} \quad 21 | \bar x_j=\frac{1}{n}\sum_{i=1}^{n}x_{ij}. 22 | $$ 23 | 24 | 1. Covariance Matrix 25 | 26 | Covariance is defined as $\text{cov}(\text{X},\text{Y}) = \frac{\sum^n_{i=1}({x}_i-\bar {x})({y}_i-\bar {y})}{n-1}$. It is a statistical value that describes how two variables change together. The covariance matrix, $\text{S}$, generalizes this idea to many variables at once. After $\text{X}$ is centered, $S=\frac{1}{n-1}\text{X}^T \text{X} \in \mathbb{R}^{p \times p}$. 27 | 28 | This matrix has these properties: 29 | 30 | a. Symmetry: $\text{S}=\text{S}^T$. 31 | 32 | b. Positive-semidefinite: all eigenvalues $\lambda_i \geq 0$. 33 | 34 | c. Its eigenvectors give the principal axes. 35 | 36 | 2. Eigen-decomposition of $\text{S}$ 37 | 38 | The next step is to find the eigenvectors of $\text{S}$, which is essentially solving this equation: 39 | 40 | $$ \text{S}v = \lambda v $$ 41 | 42 | This is equivalent to solving $\text{det}(\text{S}-\lambda \text{I}) = 0$. 43 | 44 | 3. Projection to $k$ principal axes 45 | 46 | We construct the projection matrix $\text{W}$ by selecting the most important principal components. This importance is measured by variance ratios, where $\text{Variance Ratio}_i = \frac{\lambda_i}{\sum_{j=1}^p \lambda_j}$. This step is essentially selecting the first $k$ eigenvectors with the greatest eigenvalues.
47 | 48 | $$ \text{W} = \begin{bmatrix} v_1 & v_2 & \cdots & v_k \end{bmatrix} \in \mathbb{R}^{p \times k} $$ 49 | 50 | Lastly, multiply $\text{X}$ by $\text{W}$ 51 | 52 | $$ \text{X}_k = \text{X} \text{W} \in \mathbb{R}^{n \times k} $$ 53 | 54 | The transformed coordinates of the original data is also called the PCA score. 55 | 56 | 57 | ## Disadvantages 58 | 59 | Although PCA is very fast (linear to the number of samples), it relies on linear projection, meaning that it only works with linearly separable data. Kernel PCA utilizes a kernel function to project data into a higher-dimensional feature space, where the data becomes linearly separable, and then applies PCA. 60 | 61 | There are other algorithms that perform dimension reduction, for example: t-Distributed Stochastic Neighbor Embedding (t-SNE), Uniform Manifold Approximation and Projection (UMAP), and Isometric Mapping (Isomap). We might cover these in the future. 62 | 63 | ```python 64 | # Implementation on a pizza dataset, reducing 9 dimensions to 2. 65 | 66 | import pandas as pd 67 | from sklearn.preprocessing import StandardScaler 68 | from sklearn.decomposition import PCA 69 | import matplotlib.pyplot as plt 70 | 71 | # load data 72 | data_path = 'pizza.csv' # your data.csv 73 | df = pd.read_csv(data_path) 74 | 75 | 76 | # There are non-numerical values so we need to do encoding: 77 | from sklearn import preprocessing 78 | encoder = preprocessing.LabelEncoder() 79 | print(df.head()) #original data 80 | df['brand'] = encoder.fit_transform(df['brand']) 81 | print(df.head()) #encoded version 82 | X = df.values 83 | # centering data 84 | scaler = StandardScaler() # normalize data 85 | X_centered = scaler.fit_transform(X) 86 | 87 | # 4. Fit PCA and transform to k 88 | k = 2 #transforming to 2 dimensions 89 | pca = PCA(n_components=k) # keep 2 principal components 90 | Z = pca.fit_transform(X_centered) 91 | 92 | print("Explained variance ratio:", pca.explained_variance_ratio_) 93 | print("First 5 rows of PCA scores (Z):\n", Z[:5]) 94 | 95 | # 6. Quick plot (optional) ------------------------ 96 | plt.scatter(Z[:, 0], Z[:, 1], alpha=0.7) 97 | plt.xlabel('PC1') 98 | plt.ylabel('PC2') 99 | plt.title('PCA projection (2-D)') 100 | plt.show() 101 | ``` -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.7. ResNet/resnet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "12e4e3b2", 6 | "metadata": {}, 7 | "source": [ 8 | "# Resnet\n", 9 | "\n", 10 | "Original Paper: Deep Residual Learning for Image Recognition\n", 11 | "\n", 12 | "## Problem Statement\n", 13 | "Before actually introducing resnet, we want to know the problem it intends to solve. Lets consider two networks, a shallow one and its deeper counterpart that adds more layers onto it. We should expect the deeper one to have better results because in the worst case the extra layers of the deeper counterpart are all identity mappings and the results would be equivalent to that of the shallow network.\n", 14 | "\n", 15 | "
\n", 16 | "\n", 17 | "
\n", 18 | "\n", 19 | "This shows that the intuitive conclusion that adding layers = better performance doesn't typically hold true. The problem then becomes how to make the network learn identity functions, which is hard in traditional networks.\n", 20 | "\n", 21 | "## Approach\n", 22 | "There are essentially two pieces of innovation that Resnet comes up with.\n", 23 | "\n", 24 | "---\n", 25 | "\n", 26 | "Instead of learning the direct transformation, Resnet learns a residual between the output and the input, that is, $\\mathcal{F}: x_{l-1} \\to x_l-x_{l-1}$ where $f$ is the function the model tries to fit, $y$ is the output, and $x$ is the input.\n", 27 | "\n", 28 | "For a residual block, the output $x_l$ is defined as: \n", 29 | "\\begin{equation} x_l = x_{l-1} + \\mathcal{F}(x_{l-1}) \\end{equation}\n", 30 | "
\n", 31 | "\n", 32 | "
 \n", 33 | "\n", 34 | "The added $x_{l-1}$ comes through a skip connection; as shown in the figure, we add it to the end of the block so that the output is still $x_l$ but the training process of the block changes. If $x_l$ and the residual have different dimensions or channel counts, the skip connection goes through a $1\\times 1$ convolution to project $x_{l-1}$ to the correct dimension.\n", 35 | "\n", 36 | "## Why\n", 37 | "\n", 38 | "### Easier To Fit Identity Function\n", 39 | "The essential problem is that traditional networks struggle to fit the identity function. Fitting an $n$-layer block of a neural network to the identity function would require:\n", 40 | "\\begin{equation}\n", 41 | " \\sigma(W_n\\sigma(W_{n-1}\\cdots(W_1x))) = x\n", 42 | "\\end{equation}\n", 43 | "This is very hard due to non-linear activation functions. Take ReLU $\\sigma(z) = \\text{max}(0,z)$ as an example: it directly 'zeros' all negative values. This means that fitting the block to the identity function would require it to accurately 'recreate' any $x < 0$ out of no information, as $x$ would have been set to zero by ReLU. \n", 44 | "\n", 45 | "
\n", 46 | "\n", 47 | "
\n", 48 | "\n", 49 | "For a residual block to fit the identity function, it would require:\n", 50 | "\\begin{equation}\n", 51 | "\\mathcal{F} + x = x\n", 52 | "\\end{equation}\n", 53 | "This can easily be simplified to be $\\mathcal{F} = 0$, which is far easier than the previously shown requirements as it can simply be done by setting all weights to $0$. \n", 54 | "\n", 55 | "### Prevent Gradient Vanishing\n", 56 | "During backpropagation, for a loss $\\mathcal{L}$, the gradient of $\\mathcal{L}$ with respect to $x_{l-1}$ is:\n", 57 | "\\begin{equation} \\frac{\\partial \\mathcal{L}}{\\partial x_{l-1}} = \\frac{\\partial \\mathcal{L}}{\\partial x_l} \\cdot (1 + \\frac{\\partial \\mathcal{F}}{\\partial x_{l-1}}) \\end{equation}\n", 58 | "\n", 59 | "The chain rule here applies as: $\\frac{\\partial \\mathcal{L}}{\\partial x_{l-1}} = \\frac{\\partial \\mathcal{L}}{\\partial x_l} \\cdot (\\frac{\\partial}{\\partial x_{l-1}}(x_{l-1} + \\mathcal{F}))$ as $x_l = x_{l-1} + \\mathcal{F}$\n", 60 | "\n", 61 | "The \"$+1$\" in the equation ensures that the gradient is able to flow directly from $x_l$ to $x_{l-1}$. In traditional networks ($f$), it looks like:\n", 62 | "\\begin{equation}\n", 63 | "\\frac{\\partial \\mathcal{L}}{\\partial x_{l-1}} = \\frac{\\partial \\mathcal{L}}{\\partial x_l} \\cdot \\frac{\\partial f}{\\partial x_{l-1}}\n", 64 | "\\end{equation}\n", 65 | "\n", 66 | "The chain rule in this one is more explicit, $f = x_l$. The problem with this is that since $\\frac{\\partial f}{\\partial x_{l-1}}$ can be similarly decomposed to a product of multiple gradients, it approaches to $0$ exponentially if all gradients $<1$. Resnet's \"$+1$\" ensures that even if the gradient vanishes, there is still something that is passed back.\n", 67 | "\n", 68 | "\n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "b41df3db", 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "Downloading: \"https://github.com/pytorch/vision/zipball/v0.10.0\" to C:\\Users\\1111/.cache\\torch\\hub\\v0.10.0.zip\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "import torch\n", 87 | "# Load ResNet models of different depths (with pre-trained weights)\n", 88 | "def get_resnet(depth, pretrained=True):\n", 89 | " \"\"\"\n", 90 | " Load ResNet model of specified depth using torch.hub.\n", 91 | " \n", 92 | " Args:\n", 93 | " depth (int): ResNet depth (18, 34, 50, 101, 152)\n", 94 | " pretrained (bool): Whether to load pre-trained weights\n", 95 | " \n", 96 | " Returns:\n", 97 | " nn.Module: ResNet model\n", 98 | " \"\"\"\n", 99 | " model_name = f\"resnet{depth}\"\n", 100 | " # PyTorch's vision repo supports these ResNet variants\n", 101 | " supported_depths = {18, 34, 50, 101, 152}\n", 102 | " if depth not in supported_depths:\n", 103 | " raise ValueError(f\"Unsupported ResNet depth: {depth}. 
Supported depths: {supported_depths}\")\n", 104 | " \n", 105 | " model = torch.hub.load(\n", 106 | " 'pytorch/vision:v0.10.0', # Repo and version\n", 107 | " model_name, # Model name (e.g., \"resnet18\")\n", 108 | " pretrained=pretrained\n", 109 | " )\n", 110 | " model.eval() # Set to evaluation mode\n", 111 | " return model\n", 112 | "\n", 113 | "\n", 114 | "# Example usage\n", 115 | "if __name__ == \"__main__\":\n", 116 | " # Load ResNet-18\n", 117 | " resnet18 = get_resnet(18, pretrained=True)\n", 118 | " print(\"ResNet-18 loaded.\")\n", 119 | "\n", 120 | " # Load ResNet-50\n", 121 | " resnet50 = get_resnet(50, pretrained=True)\n", 122 | " print(\"ResNet-50 loaded.\")\n", 123 | "\n", 124 | " # Load ResNet-152\n", 125 | " resnet152 = get_resnet(152, pretrained=True)\n", 126 | " print(\"ResNet-152 loaded.\")\n", 127 | "\n", 128 | " # Test with a sample input\n", 129 | " x = torch.randn(1, 3, 224, 224) # Batch of 1, 3-channel, 224x224 image\n", 130 | " with torch.no_grad(): # Disable gradient computation for inference\n", 131 | " output = resnet18(x)\n", 132 | " print(f\"ResNet-18 output shape: {output.shape}\") # Should be (1, 1000)" 133 | ] 134 | } 135 | ], 136 | "metadata": { 137 | "kernelspec": { 138 | "display_name": "base", 139 | "language": "python", 140 | "name": "python3" 141 | }, 142 | "language_info": { 143 | "codemirror_mode": { 144 | "name": "ipython", 145 | "version": 3 146 | }, 147 | "file_extension": ".py", 148 | "mimetype": "text/x-python", 149 | "name": "python", 150 | "nbconvert_exporter": "python", 151 | "pygments_lexer": "ipython3", 152 | "version": "3.12.7" 153 | } 154 | }, 155 | "nbformat": 4, 156 | "nbformat_minor": 5 157 | } 158 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.1. Forward Propagation, Activation Functions, Linear Layer/4.1 forward propagation + Activation functions + Linear Layer.md: -------------------------------------------------------------------------------- 1 | 2 | ### **Lesson 4.1: The Data's Grand Tour (Forward Propagation)** 3 | SHSID DATA SCIENCE CLUB | Gordon.H 4 | 5 | 6 | Alright, welcome back! 7 | 8 | Ever wonder what *actually* happens inside a neural network when you give it data? It's not just a black box of magic. Today, we're going to pull back the curtain and follow a single piece of data on its journey from input to final prediction. 9 | 10 | This one-way trip is called **Forward Propagation**, and it's the heartbeat of every neural network. 11 | 12 | ### What We're Unlocking Today 13 | 14 | By the end of this chat, you'll be able to: 15 | * See the **Linear Layer** as the simple "calculator" at the heart of a neuron. 16 | * Understand how an **Activation Function** acts as the neuron's "decision-maker." 17 | * Walk a piece of data through a mini-network, step-by-step, to see how it all comes together. 18 | 19 | *** 20 | 21 | ## What's Inside a Single Neuron? 22 | 23 | Before we can understand a whole stadium, let's get to know one player. A neuron's job is surprisingly straightforward and happens in two steps: 24 | 25 | 1. **Calculate:** It gathers all the information it's given, weighs it all, and crunches it into a single number. Think of this as the *brawn*. 26 | 2. **Decide:** It looks at that number and decides how important it is. Should I get excited about this? Should I ignore it? This is the *brains*. 27 | 28 | Let's meet the two parts that handle this. 29 | 30 | ## 1. 
The Calculator: The Linear Layer 31 | 32 | The first step is just some simple math. This might sound fancy, but it's based on a formula you definitely learned in school: `y = mx + b`. 33 | 34 | Seriously, that's it! We just use slightly different words in machine learning. 35 | 36 | | School Algebra (`y = mx + b`) | Machine Learning (`output = weight * input + bias`) | 37 | | :--- | :--- | 38 | | `x` (your input number) | `input` (our piece of data) | 39 | | `m` (the slope of the line) | `weight` (how *important* that input is) | 40 | | `b` (where the line starts) | `bias` (a little "nudge" to get it just right) | 41 | 42 | The **`weight`** is the key. It tells the neuron how much to care about a piece of information. A big weight means "pay close attention to this!" The **`bias`** is just an offset, like a little head-start that can make it easier or harder for the neuron to get excited. 43 | 44 | Let's make this real. Imagine a neuron whose only job is to guess if a song will be a "hit." One piece of info it gets is the song's tempo. 45 | 46 | * `input` = `120` (a classic, danceable tempo) 47 | * Let's say the network has learned a `weight` for tempo of `0.02`. 48 | * And it has a starting `bias` of `-1.5`. 49 | 50 | The Linear Layer just does the math: 51 | ``` 52 | # The formula: (input * weight) + bias 53 | calculation = (120 * 0.02) + (-1.5) 54 | calculation = 2.4 - 1.5 55 | calculation = 0.9 56 | ``` 57 | Okay, the neuron has calculated a score: `0.9`. But... so what? Is `0.9` good? Bad? That's where the decision-maker comes in. 58 | 59 | ## 2. The Decision-Maker: The Activation Function 60 | 61 | That score of `0.9` doesn't mean anything on its own. The neuron needs a rulebook to translate that score into a clear signal to pass along. This rulebook is the **Activation Function**. 62 | 63 | > **Analogy:** Think of it like a dimmer switch for a light. The linear calculation (`0.9`) is how hard you're pressing on the switch. The activation function decides how bright the light should be. Maybe a gentle press does nothing, but a firm press turns it on full blast. 64 | 65 | Without this "decide" step, our network would just be a long, boring chain of `y=mx+b`. It could only learn simple, straight-line patterns. Activation functions add the "spark"—the twists and turns that let the network learn incredibly complex things, like telling a cat from a dog. 66 | 67 | ### Our Favorite Decision-Maker: ReLU 68 | 69 | The most popular activation function is called **ReLU (Rectified Linear Unit)**. Its rule is laughably simple: 70 | * If a number is positive, keep it. 71 | * If a number is negative, just make it `0`. 72 | 73 | That's it! We can write it as `ReLU(x) = max(0, x)`. 74 | 75 | Let's see it in action: 76 | * `ReLU(0.9)` → `0.9` (Our "hit song" neuron decides this is a strong signal and passes it on!) 77 | * `ReLU(52.7)` → `52.7` 78 | * `ReLU(-3.1)` → `0` (The neuron decides this signal isn't worth bothering with and silences it.) 79 | 80 | So, the full journey through one neuron looks like this: 81 | **Inputs → Linear Layer (Calculate) → Activation Function (Decide) → Output** 82 | 83 | ## Putting It All Together: The Grand Tour 84 | 85 | So, what is **Forward Propagation**? It's simply the process of letting our data complete this journey through the *entire* network, one layer of neurons at a time. 86 | 87 | It's like a relay race. The outputs from the first layer of neurons become the inputs for the second layer. 
They do their little "calculate-and-decide" dance and pass their results to the third layer, and so on, until the data crosses the final finish line, which gives us the network's final prediction. 88 | 89 | ## Let's Be the Computer! 90 | 91 | Alright, time to roll up our sleeves. We have a tiny network for our favorite houseplant. Its job is to decide if the plant needs water. It looks at two things: 92 | 1. `days_since_last_water` 93 | 2. `is_sunny` (1 for sunny, 0 for not sunny) 94 | 95 | Our network has one "thinking" neuron in the middle (`Neuron H`) and one final "decision" neuron (`Neuron O`). We'll use our friend **ReLU** for all the decisions. 96 | 97 | --- 98 | 99 | ### **Step 1: See What the Hidden Neuron (H) Thinks** 100 | 101 | Neuron H looks at both our original inputs. Its formula is: `(input1 * weight1) + (input2 * weight2) + biasH` 102 | 103 | **Here are its settings (the network already "learned" these):** 104 | * `weight1` (for days) = `0.4` 105 | * `weight2` (for sun) = `0.2` 106 | * `biasH` = `-0.5` 107 | 108 | **Our Scenario:** It's been **3 days** since we watered, and it **is sunny** today. 109 | * `input1` = `3` 110 | * `input2` = `1` 111 | 112 | **A. Do the Math (Linear Layer):** 113 | ```python 114 | # Let's calculate the neuron's initial score 115 | linear_result_H = (3 * 0.4) + (1 * 0.2) + (-0.5) 116 | linear_result_H = (1.2) + (0.2) - 0.5 117 | linear_result_H = _______________ # Fill this in! 118 | ``` 119 | 120 | **B. Make a Decision (Activation Function):** 121 | ```python 122 | # Now, apply the ReLU rule to your result 123 | output_H = ReLU(linear_result_H) 124 | output_H = _______________ # What does it decide? 125 | ``` 126 | 127 | --- 128 | 129 | ### **Step 2: Get the Final Verdict from the Output Neuron (O)** 130 | 131 | Neuron O is simpler. Its *only* input is the signal it got from Neuron H (`output_H`). Its formula is: `(output_H * weightO) + biasO` 132 | 133 | **Its settings are:** 134 | * `weightO` = `1.5` 135 | * `biasO` = `0.1` 136 | 137 | **A. Do the Final Calculation:** 138 | ```python 139 | # Use your value for output_H from the last step! 140 | linear_result_O = (output_H * 1.5) + 0.1 141 | linear_result_O = _______________ 142 | ``` 143 | 144 | **B. Make the Final Decision!** 145 | ```python 146 | # One last ReLU! A positive output means "water the plant." 147 | final_prediction = ReLU(linear_result_O) 148 | final_prediction = _______________ 149 | ``` 150 | 151 | > **Check Your Work!** 152 | > In Step 1, you should get `linear_result_H = 0.9`. After applying ReLU, `output_H` is also `0.9`. 153 | > In Step 2, you'd calculate `(0.9 * 1.5) + 0.1 = 1.35 + 0.1 = 1.45`. 154 | > The `final_prediction` is `ReLU(1.45)`, which is `1.45`. 155 | > Since the number is positive, our network is shouting: **"Yes, water the plant!"** 156 | 157 | ## You Did It! 158 | 159 | And... that's it. You just manually performed forward propagation. You acted as the brain of a neural network, taking inputs, pushing them through a "calculator" and a "decision-maker," and getting a final answer. This is *exactly* what every deep learning model does, just on a much, much bigger scale. 160 | 161 | But wait... where did those weights and biases (`0.4`, `0.2`, `-0.5`...) come from? This feels a bit like magic, right? How did the network *know* the right values to make good predictions? 162 | 163 | That's the real secret sauce: **training**. And it's exactly what we're diving into next. 
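If you want to double-check the arithmetic with real code, here is a minimal sketch of the same two-neuron plant-watering network in plain Python; the weights, biases, and inputs are exactly the ones from the exercise above.

```python
def relu(x):
    # ReLU: keep positive numbers, turn negatives into 0
    return max(0.0, x)

# the scenario: 3 days since watering, and it is sunny
days_since_last_water = 3
is_sunny = 1

# hidden neuron H: calculate, then decide
linear_result_H = (days_since_last_water * 0.4) + (is_sunny * 0.2) + (-0.5)
output_H = relu(linear_result_H)            # 0.9

# output neuron O: calculate, then decide
linear_result_O = (output_H * 1.5) + 0.1
final_prediction = relu(linear_result_O)    # 1.45 -> positive, so "water the plant!"

print(output_H, final_prediction)
```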
164 | 165 | -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.1. Linear Regression & Logistic Regression/Linear & Logistic Regression.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # All About **Linear & Logistic Regression** 4 | ###### Will Chen | SHSID Data Science Group 5 | 6 | ## Key Questions 7 | 8 |
9 | 10 | Linear and logistic regression are key techniques to understand and apply in supervised learning. By the end of the lesson, you will be able to answer the following **key questions**: 11 | 1. What are linear and logistic regression, and what kind of problems do they solve? 12 | 2. What's the difference between linear and logistic regression? 13 | 3. In the age of deep learning, why do we still rely on these foundational techniques? 14 | 15 |
16 | 17 | ## Key Terms 18 |
19 | 20 | In order to understand this lesson, you should have a grasp of the following key concepts and terms: 21 | 22 | - **Features**: The input variables used to make a prediction. They are what goes into the model. When the prediction is `f(x)`, the feature would be `x` (the input). 23 | - **Target**: The variable we are trying to predict. It is the result of a feature going through the neural network. Using the same example as above, the target is `f(x)`. 24 | - **Regression**: A technique used in supervised learning where the goal of the model is to predict a *continuous numerical value*. These can be predictions such as tomorrow's temperature. It's not usually used for anything else. 25 | - **Classification**: A technique used in supervised learning where the goal of the model is to predict a *discrete category*. Instead of an exact number, it classifies the features into defined targets. Specifically, it outputs a *probability list of how well an input fits into each category*. These can be predictions such as marking an image as more likely to be a cat or a dog. 26 | - **Weights**: aka Coefficients. A weight is a piece of data stored in a neuron that determines the feature's influence on the prediction. The influence is usually multiplicative, so multiplying by a small number means a smaller influence, and vice versa. 27 | - **Biases**: aka Intercepts. Also stored in a neuron, a bias is a constant value that shifts the baseline prediction. Weights change something based on what's given; biases change a constant amount regardless. 28 | - **Loss**: aka Cost. It's the difference between an erroneous output and the expected output, useful in training models. The goal of training is to minimize this function by changing the weights and biases of each neuron in each layer. 29 | - **Gradient descent**: An optimization method that tells us by what magnitude we need to change the information within our neurons to minimize the loss (a short numerical sketch follows after this list). You'll learn more about this later on in Chapter 5. Through iteration, it adjusts the parameters in the opposite direction of the gradient. 30 | - **Sigmoid**: A special function that compresses all input values into a range between 0 and 1. The key differentiator between linear and logistic regression. 31 | 32 | Linear vs Logistic 33 | 34 | Classification vs Regression 35 | 36 | Gradient descent summary 37 | 38 |
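To make the gradient descent definition above concrete, here is a small illustrative sketch (not part of the original lesson): it learns a single weight `m` for the line `y = mx` from toy data generated by `y = 2x`, by repeatedly stepping against the gradient of the squared error.

```python
# toy data that follows y = 2x
xs = [1.0, 2.0, 3.0, 4.0]
ys = [2.0, 4.0, 6.0, 8.0]

m = 0.0     # the weight we are learning
lr = 0.01   # learning rate: how large each adjustment step is

for step in range(200):
    # gradient of the mean squared error with respect to m
    grad = sum(2 * (m * x - y) * x for x, y in zip(xs, ys)) / len(xs)
    # move in the opposite direction of the gradient
    m -= lr * grad

print(m)  # ends up very close to 2.0
```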
39 | 40 | ## Introduction to predictive modeling 41 |
42 | 43 | ### What are Linear and Logistic Regression? 44 | 45 | **Linear Regression** and **Logistic Regression** are two of the most fundamental and universal algorithms in ML. They are both supervised learning methods, but they are used to solve different kinds of problems. 46 | 47 | - **Linear regression** is used for **regression** tasks, that is, to predict a continuous value. 48 | - Example: Based on past years' data, predict tomorrow's temperature. 49 | - Key concept: Drawing the best-fit line for a plot of points. 50 | 51 | - **Logistic regression** is used for **classification** tasks. Don't let the name fool you: it isn't used for regression tasks even though "regression" is in its name; the name refers to the underlying technique, not the application. 52 | - Example: Predict whether a given image is a cat or a dog. 53 | - Key concept: Drawing a separation line that defines the boundary between different groups of points. 54 | 55 | While they solve different problems, they share the same underlying mathematical foundation. Understanding linear regression is the basis for understanding logistic regression. 56 | 57 | ### Part 1: Linear regression 58 | 59 | As humans, we can draw a best-fit line pretty easily. Just look at the set of points on the graph and you'll have a rough estimate of which line fits best. This is because our brains are kind of built for fuzzy pattern matching like this. However, that might not be the mathematically most accurate way to draw a best-fit line, nor would we want to draw the lines ourselves, regardless of method. 60 | 61 | So how do we teach this to a computer, and make it do it accurately? 62 | 63 | Well, you might remember this slope-intercept form from math: 64 | 65 | $$ f(x) = mx + b $$ 66 | 67 | In machine learning, this is also a core concept and you can see it in a lot of models: 68 | - f(x) is the target output. 69 | - x is the feature input. 70 | - m is the weight. 71 | - b is the bias. 72 | 73 | This slope-intercept form represents a line. If you want to shape the line in such a way that it fits a specific group of points, you adjust the values m and b. These are exactly the values that models adjust during training. They adjust the weights and biases for each neuron until we get a line that closely fits the "expected" point group. 74 | 75 | ### Part 2: Logistic regression 76 | 77 | What if our problem isn't predicting a price, but predicting a "yes" or "no" answer? The target is now a category (Cat=1, Dog=0), not a continuous number. 78 | 79 | #### The sigmoid function 80 | 81 | A straight line doesn't really fit our needs. If all we want is for the model to tell us what it thinks the picture is, we really just need it to give us a number between 0 and 1. For example, closer to 0 means dog, and closer to 1 means cat. 82 | 83 | So, we use a trick called the sigmoid function. It takes the output of a linear equation (`mx + b`) and feeds it into the sigmoid equation. 84 | 85 | The sigmoid function, $\sigma(z) = \frac{1}{1 + e^{-z}}$, has an "S" shape. No matter what number you put into it (from negative infinity to positive infinity), it will always output a value **between 0 and 1**. 86 | 87 | So, the logistic regression model looks like this: 88 | 89 | $$ Probability(Cat) = Sigmoid(mx + b) $$ 90 | 91 | This output can be interpreted as the probability of the positive class. For instance, if the model outputs 0.8, it is 80% confident that the animal is a cat.
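As a quick numerical sketch (the weight, bias, and feature value below are made up purely for illustration), this is all the sigmoid step does:

```python
import math

def sigmoid(z):
    # squashes any real number into the range (0, 1)
    return 1 / (1 + math.exp(-z))

# hypothetical learned parameters and one feature value
m, b = 0.9, -3.0
x = 4.2

probability_cat = sigmoid(m * x + b)
print(probability_cat)  # about 0.69, i.e. roughly 69% confident it's a cat
```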
#### Decision boundaries

For our model to give us a decision instead of just a number, we have to set up a decision boundary. **It is the threshold that separates one predicted class from another.** For a binary decision like cat vs. dog, it's fairly simple: the boundary is commonly placed at 0.5. If the output is less than 0.5, we are more confident it is a dog than a cat, and vice versa.

#### Binary cross entropy

Because our predictions are now probabilities, the traditional loss function for many models, Mean Squared Error, is no longer the best choice. Instead, logistic regression uses a loss function called **binary cross-entropy** (or log loss). It heavily "punishes" the model when it makes a confident but incorrect prediction. For example, if the model predicts a 99% chance of a cat for an image that is actually a dog, the loss will be very high and the model will be severely "punished".

Apart from this, the rest works in basically the same way as linear regression. Both use concepts such as the chain rule and gradient descent to determine which parameters to change, and by how much and in which direction (you'll learn about this in detail later on; these are just the high-level differences).
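To see this "punishment" in numbers, here is a minimal sketch of binary cross-entropy; the predicted probabilities below are made-up values for illustration.

```python
import numpy as np

def binary_cross_entropy(y_true, y_pred):
    # y_true is the real label (1 = cat, 0 = dog); y_pred is the model's P(cat)
    return -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

# The image is actually a dog (label 0)
print(binary_cross_entropy(0, 0.10))  # confident and correct -> ~0.11 (small loss)
print(binary_cross_entropy(0, 0.99))  # confident but wrong   -> ~4.61 (huge loss)
```

Notice how a single confident mistake costs roughly forty times more than a confident correct answer.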
104 | 105 | ### Conclusion 106 |
107 | 108 | Linear and Logistic Regression are the foundational pillars of predictive modeling. They demonstrate the core process of machine learning: defining a model, measuring its error with a **loss function**, and iteratively improving it using an optimizer like **gradient descent**. 109 | 110 | - **Linear Regression** fits a line to data to predict **continuous values**. 111 | - **Logistic Regression** adapts this line with a **sigmoid function** to predict a probability for **classification tasks**. 112 | 113 | Although simpler than deep neural networks and more complex topics, their importance is still monumental. They are fast, interpretable, and serve as the starting point for beginners. In fact, a single neuron in a neural network performing a classification task is essentially a logistic regression unit. 114 | -------------------------------------------------------------------------------- /3. PyTorch/3.4. Modules/module.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # Modules 4 | ##### **Jerry Zhang** | SHSID Data Science Club 5 | 6 |
## What?

PyTorch follows Python's object-oriented conventions to let you construct neural networks: specifically, you inherit from a prebuilt class, `torch.nn.Module`, which is essentially the skeleton of a neural network.

The benefit of inheriting from a base class rather than assembling functions yourself is that it unifies code and hides unnecessary complexity, while still giving you the freedom to modify things to any degree.

## Usage

`torch.nn` contains most functions / objects needed to construct a neural network:
```python
import torch.nn as nn
```

The architecture:
```python
class Your_nn(nn.Module):
    def __init__(self, ...):
        # super() refers to the parent class, in this case nn.Module
        # __init__ is Module's init; calling it initializes Module's features
        # older sources may tell you to write super(Your_nn, self).__init__(),
        # but in Python 3 that is equivalent to super().__init__()
        super().__init__()

        # your layers and functions
        ...

    def forward(self, x):
        """
        This is the forward pass.
        You define the structure of your network using the components defined in __init__.
        x is the input tensor; return the output.
        """
        ...
```

A sample to better illustrate this:
```python
import torch.nn.functional as F  # needed for F.max_pool2d and F.relu below

class MNIST_nn(nn.Module):
    def __init__(self):
        super().__init__()

        # these will be explained in section 4

        # the convolution layers
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        # the fc layers
        self.fc1 = nn.Linear(64 * 7 * 7, 512)
        self.fc2 = nn.Linear(512, 10)
        # dropout
        self.dropout = nn.Dropout(0.25)
        # normalization
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        # activation functions
        self.relu = nn.ReLU()

    def forward(self, x):
        # one layer
        x = self.conv1(x)
        x = self.relu(x)
        x = self.bn1(x)
        x = F.max_pool2d(x, 2)

        # you can also write it in a more compact way
        x = self.bn2(self.relu(self.conv2(x)))
        x = F.max_pool2d(x, 2)

        # flattening
        x = x.view(-1, 64 * 7 * 7)

        # here is another way to call relu
        # F is torch.nn.functional
        # this relu is a function while nn.ReLU() is an object
        # they are basically equivalent in terms of computation
        # F.relu is a bit simpler
        # nn.ReLU() is more organized and can be integrated into nn.Sequential
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)

        return x
```

A sketch of the main training loop:
```python
if __name__ == "__main__":
    model = Your_nn(...)

    for epoch in range(epochs):
        output = model(input)             # forward pass
        loss = criterion(output, target)  # criterion is your loss function, e.g. nn.CrossEntropyLoss()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
```

## Side facts about the hidden complexity

You might wonder how backpropagation is wired up.

### Auto param registration

There is a hidden attribute called `self._parameters`, which is an `OrderedDict`. Whenever a learnable parameter or submodule is assigned, it is automatically added to the appropriate registry.
This is done via the `__setattr__` dunder method.

```python
# rough idea
# whenever an attribute is added to the object this is invoked
def __setattr__(self, name, value):
    if isinstance(value, nn.Parameter):
        # learnable parameters are instances of nn.Parameter,
        # so they get registered under self._parameters
        self.register_parameter(name, value)
    elif isinstance(value, nn.Module):
        # this is the case when, for example, nn.Sequential is added
        # allowing for the formation of an organized tree architecture
        self._modules[name] = value
    ...
    # finally, the original purpose of self.a = b must be fulfilled
    object.__setattr__(self, name, value)
```

For backpropagation and optimization, the parameters are exposed like this:
```python
class Module:
    # these two methods expose the parameters to the optimizer
    def parameters(self, recurse=True):
        for name, param in self.named_parameters(recurse=recurse):
            yield param

    def named_parameters(self, prefix='', recurse=True):
        # Recursively yields all parameters with names
        # Enables optimizer access: optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        ...
```

## Hooks

Hooks are functions that give you access points to data throughout the entire training process: the inputs, outputs, and gradients of all modules.

### types of hooks

```python
# Forward pre-hook: called before forward()
def forward_pre_hook(module, input):
    # input is a tuple of inputs to the module
    print(f"Module {module.__class__.__name__} received input: {[i.shape for i in input]}")
    # You can modify the input here
    return input  # Or modified input

# Forward hook: called after forward()
def forward_hook(module, input, output):
    # output is the result of forward()
    print(f"Module {module.__class__.__name__} produced output: {output.shape}")
    # You can modify the output here
    return output  # Or modified output

# called during backprop
def backward_hook(module, grad_input, grad_output):
    # grad_input: gradients flowing INTO the module
    # grad_output: gradients flowing OUT OF the module
    print(f"Gradients flowing out: {[g.shape for g in grad_output if g is not None]}")
    # Can be used for gradient clipping or monitoring
    return grad_input
```

### registering hooks

```python
module.register_forward_pre_hook(hook)
module.register_forward_hook(hook)
module.register_full_backward_hook(hook)

# you can manually assign
# this is adding a hook to the initial input and final output
model.register_forward_pre_hook(...)
model.register_forward_hook(...)

# or add hooks en masse
for name, module in model.named_modules():
    # for all linear modules within model
    if isinstance(module, nn.Linear):
        handle = module.register_forward_hook(...)
```

With these you can monitor values, clip gradients, etc.
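A practical detail: every `register_*` call returns a handle object, and calling `.remove()` on it detaches the hook once you no longer need it. A minimal sketch, reusing the `forward_hook` defined above:

```python
handles = []
for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        # keep the handle returned by registration
        handles.append(module.register_forward_hook(forward_hook))

# ... run a few batches and inspect the printed shapes ...

for handle in handles:
    handle.remove()  # detach the hooks so they stop firing
```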
192 | 193 | ### common hooks 194 | 195 | ```python 196 | # Gradient clipping using backward hook 197 | def gradient_clipping_hook(module, grad_input, grad_output, max_norm=1.0): 198 | # Clip gradients to prevent explosion 199 | total_norm = 0 200 | for g in grad_output: 201 | if g is not None: 202 | param_norm = g.data.norm(2) 203 | total_norm += param_norm.item() ** 2 204 | total_norm = total_norm ** 0.5 205 | 206 | clip_coef = max_norm / (total_norm + 1e-6) 207 | if clip_coef < 1: 208 | for g in grad_output: 209 | if g is not None: 210 | g.data.mul_(clip_coef) 211 | 212 | return grad_input 213 | 214 | # Register to specific layers 215 | for module in model.modules(): 216 | if isinstance(module, nn.Linear): 217 | module.register_full_backward_hook(gradient_clipping_hook) 218 | ``` 219 | 220 | ```python 221 | # Monitor activation statistics during training 222 | activation_stats = {} 223 | 224 | def activation_stats_hook(name): 225 | def hook(module, input, output): 226 | if name not in activation_stats: 227 | activation_stats[name] = { 228 | 'mean': [], 'std': [], 'min': [], 'max': [] 229 | } 230 | 231 | activation_stats[name]['mean'].append(output.mean().item()) 232 | activation_stats[name]['std'].append(output.std().item()) 233 | activation_stats[name]['min'].append(output.min().item()) 234 | activation_stats[name]['max'].append(output.max().item()) 235 | return hook 236 | 237 | # Register to all convolutional layers 238 | for name, module in model.named_modules(): 239 | if isinstance(module, nn.Conv2d): 240 | module.register_forward_hook(activation_stats_hook(name)) 241 | ``` 242 | 243 | ```python 244 | # Identify dead ReLU units 245 | dead_relus = {} 246 | 247 | def relu_monitor_hook(name): 248 | def hook(module, input, output): 249 | # Count how many outputs are exactly zero 250 | dead_ratio = (output == 0).float().mean().item() 251 | if name not in dead_relus: 252 | dead_relus[name] = [] 253 | dead_relus[name].append(dead_ratio) 254 | return hook 255 | 256 | # Monitor all ReLU layers 257 | for name, module in model.named_modules(): 258 | if isinstance(module, nn.ReLU): 259 | module.register_forward_hook(relu_monitor_hook(name)) 260 | ``` 261 | -------------------------------------------------------------------------------- /3. PyTorch/3.3. Devices/device.md: -------------------------------------------------------------------------------- 1 |
# Devices
##### **Jerry Zhang** | SHSID Data Science Club
## The characteristics of different devices

As you may know, a computer has many different components that can all perform computation, most notably the **CPU** and the **GPU**. This leads to the question: "which device is best for running neural networks?"
To answer this, we first need to understand **precision**, **parallelization**, and the characteristics of each device.

### Precision

Our weight matrices consist of **Floating Point** (**FP**) values, values with a non-integer component, for example $0.175$ or $0.982$.
To store **FP**s, computers use a scientific-notation-like system. The name of each format is **FP** followed by its bit count.

##### FP32

| 31 | 30-23 | 22-0 |
| --- | --- | --- |
| Sign (S) | Exponent (E) | Mantissa (M) |
| 1 bit | 8 bits | 23 bits |
The value is $(-1)^S \times (1 + M) \times 2^{(E - 127)}$

The more bits in an **FP** format, the larger the range of values it can represent.

| Format | Range |
| -------- | -------------------------------------------------------- |
| **FP16** | $\pm6.10 \times 10^{-5} \dots \pm6.55 \times 10^{4}$ |
| **FP32** | $\pm1.18 \times 10^{-38} \dots \pm3.40 \times 10^{38}$ |
| **FP64** | $\pm2.23 \times 10^{-308} \dots \pm1.80 \times 10^{308}$ |

The *pros* of a smaller number of bits include:
- it reduces the required amount of memory
  - this allows large models to be deployed on more modest devices
  - and allows larger batch sizes in training
- it's faster to compute
- lower power consumption (mainly a concern for deployment on mobile devices)

The *cons*:
- lower precision
  - casting a high-precision value down to a lower-precision format introduces an error, which can accumulate into a drop in performance
  - very small numbers in lower precision can be rounded down to zero, leading to vanishing gradients
  - very large numbers may exceed the limit and become NaN, ruining the model's performance and the gradients

### Parallelization

Most computation units in a computer are packed with many different circuits; however, when processing a single instruction, only one circuit is used. This is a waste of resources, as a task most likely consists of many independent instructions.
#### Basic jargon
##### Clock Cycle
It takes time for electricity to flow through circuits, and there is no guarantee of when a signal is the final result. Therefore the **clock cycle**, a fixed frequency, decrees when a circuit's output counts as final. In other words, a **clock cycle** is one cycle of computation.
##### Register
To perform a computation on, for example, two numbers, they first need to be input and then processed. Most processors don't allow direct input-to-output: values are first stored in a small cache called a **register**, and in the next **clock cycle** they are processed and output.
#### Parallel processing
##### Multi-ported Registers
To support multiprocessing, the first issue is being able to take multiple inputs. A **multi-ported register**, as its name suggests, allows many simultaneous reads and writes.
##### Bypass networks
To "parallelize" calculations that rely on each other, outputs need to be mapped to inputs without passing through the register, as the register is only accessed once per **clock cycle**. The bypass network connects the outputs of circuits to the inputs of others.
##### The Scheduler (Reservation Station)
The Scheduler facilitates the bypass network. It checks that all inputs are ready and the circuit is free before sending an instruction.
#### An example
```mermaid

flowchart TD

    subgraph Cycle1[Scheduler Actions - Cycle 1]

        direction TB

        C1_Scheduler[Scheduler analyzes instruction queue]

        C1_Dispatch[Dispatches to available execution units]

        C1_Load["Load/Store Unit: LOAD R1, [0x100]"]

        C1_ALU1[ALU 1: MUL R4, R5, R6]

        C1_ALU2[ALU 2: SUB R7, R8, R9]

        C1_RS["Reservation Station: ADD R2, R1, R3
STORE [0x200], R2
BRANCH if R4 > 0"] 95 | 96 |     end 97 | 98 | 99 | 100 |     subgraph Cycle2[Scheduler Actions - Cycle 2] 101 | 102 |         direction TB 103 | 104 |         C2_Scheduler[Scheduler monitors execution progress] 105 | 106 |         C2_Load[Load/Store: LOAD completing
Result available next cycle] 107 | 108 |         C2_ALU1[ALU 1: MUL continues
Multi-cycle operation] 109 | 110 |         C2_ALU2[ALU 2: SUB completes] 111 | 112 |         C2_RS[Reservation Station: ADD ready for dispatch
STORE waiting for R2
BRANCH waiting for R4] 113 | 114 |     end 115 | 116 | 117 | 118 |     subgraph Cycle3[Scheduler Actions - Cycle 3] 119 | 120 |         direction TB 121 | 122 |         C3_Scheduler[Scheduler dispatches ready instructions] 123 | 124 |         C3_Load["Load/Store: STORE [0x200], R2"] 125 | 126 |         C3_ALU1[ALU 1: ADD R2, R1, R3
R1 forwarded via bypass] 127 | 128 |         C3_ALU2[ALU 2: Idle] 129 | 130 |         C3_Branch[Branch Unit: BRANCH if R4 > 0
R4 forwarded from MUL]

        C3_RS[Reservation Station: Empty]

    end


    %% Dependencies

    C1_Load -.->|Produces R1| C1_RS

    C1_ALU1 -.->|Produces R4| C1_RS

    C1_ALU2 -.->|Independent| C1_RS

    C2_Load -.->|Makes R1 available| C2_RS

    C2_ALU1 -.->|Still producing R4| C2_RS

    C2_ALU2 -.->|Independent complete| C2_RS

    C3_Load -.->|Uses R2 from ADD| C3_ALU1

    C3_ALU1 -.->|Produces R2| C3_Load

    C3_ALU1 -.->|Uses R1 from LOAD| C3_Load

    C3_Branch -.->|Uses R4 from MUL| C3_ALU1


    %% Cycle connections

    Cycle1 --> Cycle2

    Cycle2 --> Cycle3


    classDef executing fill:#9f9,stroke:#333,stroke-width:2px;

    classDef waiting fill:#f9f9b8,stroke:#333,stroke-width:1px;

    classDef completed fill:#aaf,stroke:#333,stroke-width:2px;

    classDef scheduler fill:#faa,stroke:#333,stroke-width:2px;

    class C1_Load,C1_ALU1,C1_ALU2 executing;

    class C1_RS waiting;

    class C1_Scheduler,C2_Scheduler,C3_Scheduler scheduler;

    class C2_ALU2 completed;

    class C3_Load,C3_ALU1,C3_Branch executing;

    class C3_RS completed;
```

### The characteristics of different devices

- The **CPU**, the *central processing unit*, is the hub of a computer: it can perform all general calculations, manage memory, etc. However, it is far too slow for the immense number of parallel operations a neural network requires.
- **GPU**s, *graphics processing units*, were originally designed to speed up graphical computation, which involves many parallel computations, as suggested by the millions of pixels on your screen. **CUDA**, Compute Unified Device Architecture, is a parallel computing API proprietary to NVIDIA; it enables GPUs to parallelize general computations, rather than just the graphical ones from the OpenGL or Vulkan APIs.
- **MPS** (Metal Performance Shaders) is Apple's counterpart to **CUDA** for its SoCs. As of 2025, it is less mature than **CUDA**.
- **XPU** refers to Intel's GPUs, whose support is less mature than both Apple's and NVIDIA's.

### Code

#### Basic Grammar

##### Setting the device
```python
torch.device(device)
```
Replace `device` with `'cuda'`, `'cpu'`, `'mps'`, or `'xpu'` (the device strings are lowercase).

##### Moving objects to your device
```python
x = torch.randn(3, 3).to(device)  # tensor to device
model = model.to(device)          # model to device
```
If you're using a **CPU**, this is not necessary.

##### Multiple GPUs
```python
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model, device_ids=[0, 1])  # Use GPUs 0 and 1
```

##### Clear GPU memory
```python
torch.cuda.empty_cache()
```

#### Techniques

```python
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
```
This is a piece of code most PyTorch scripts include.
If your system has CUDA-capable hardware, it will be used; if not, the **CPU** will be used.

```python
loader = DataLoader(dataset, pin_memory=True)  # Faster GPU transfer
```
If you're using a GPU, this speeds up the transfer of data from CPU RAM to GPU memory (VRAM).
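Before reaching for mixed precision (shown next), here is a tiny sketch of why precision matters in practice: casting to FP16 overflows large values to `inf` and flushes very small values to zero, exactly the failure modes listed in the Precision section above. The numbers are arbitrary examples.

```python
import torch

x = torch.tensor([1e5, 7e4, 1e-8], dtype=torch.float32)
print(x.half())  # -> inf, inf, 0.  (overflow and underflow in float16)
```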
238 | 239 | ```python 240 | from torch.cuda.amp import autocast, GradScaler 241 | 242 | scaler = GradScaler() 243 | for x, y in loader: 244 | optimizer.zero_grad() 245 | with autocast(): 246 | pred = model(x) 247 | loss = criterion(pred, y) 248 | scaler.scale(loss).backward() 249 | scaler.step(optimizer) 250 | scaler.update() 251 | ``` 252 | Mixed precision. A technique NVIDIA developed that uses mixed precision to speed up performance and reduce memory consumption. 253 | However, this technique has some caveats which may lead to decreased performance or even a completely failed training session. 254 | -------------------------------------------------------------------------------- /0. Prerequisites/0.1. Basic Environment For Python/0.1 Setup.md: -------------------------------------------------------------------------------- 1 | 2 | ### **Lesson 0.1: Environment Setup** 3 | 4 | SHSID DATA SCIENCE CLUB - Gordon.H 5 | 6 | **Objective:** By the end of this lesson, you will have a robust, isolated, and powerful development environment on your computer. This setup is the professional standard for data science, machine learning, and deep learning work. It will save you countless hours of troubleshooting in the future. 7 | 8 | --- 9 | 10 | ### **Introduction: Why Is This So Important?** 11 | 12 | Imagine you are a chef. Before you can cook, you need to set up your kitchen: your knives must be sharp, your cutting boards clean, and your ingredients organized. A development environment is your digital kitchen. A clean, organized setup allows you to focus on the "cooking" (writing code and building models) instead of worrying about whether your tools work together. 13 | 14 | We will install four key components: 15 | 16 | 1. **Anaconda:** The manager of our entire kitchen. It handles Python and all the specialized "appliances" (libraries) we need, keeping them organized in separate drawers (environments). 17 | 2. **Python:** The core programming language we will be using. Anaconda will install this for us. 18 | 3. **VS Code:** Our state-of-the-art workbench and recipe book. It's a modern, powerful, and highly customizable code editor where we will write our code. 19 | 4. **NVIDIA CUDA & cuDNN (Optional but Recommended for Deep Learning):** A special high-performance oven (your GPU) and the instructions for how to use it. **This is only for users with an NVIDIA graphics card.** If you don't have one, don't worry! You can skip this section and still complete 95% of data science and machine learning tasks. 20 | 21 | Let's begin. 22 | 23 | --- 24 | 25 | ### **Step 1: Install Anaconda (The Foundation)** 26 | 27 | Anaconda is a distribution of Python that comes pre-packaged with many of the most common data science libraries. More importantly, it includes **Conda**, a powerful package and environment manager. 28 | 29 | 1. **Download:** Go to the [Anaconda Distribution download page](https://www.anaconda.com/products/distribution). 30 | 2. **Select Your OS:** Download the installer for your operating system (Windows, macOS, or Linux). 31 | 3. **Choose Python 3.x:** Download the version for the latest stable Python 3 release (e.g., Python 3.9 or higher). 32 | 4. **Run the Installer:** 33 | * Launch the installer you downloaded. 34 | * Click "Next" through the initial prompts. 35 | * **On Windows:** When you reach the "Advanced Installation Options" screen, it is **recommended to leave "Add Anaconda3 to my PATH environment variable" unchecked**. 
While checking it seems convenient, it can interfere with other software. We will use the dedicated **Anaconda Prompt** instead. 36 | * Proceed with the default settings for the rest of the installation. 37 | 38 | 5. **Verify the Installation:** 39 | * **Windows:** Open the **Anaconda Prompt** from your Start Menu. 40 | * **macOS/Linux:** Open your regular Terminal. 41 | * In the terminal window, type the following command and press Enter: 42 | ```bash 43 | conda --version 44 | ``` 45 | * You should see an output like `conda 23.7.4`. If you see this, Anaconda is installed correctly! 46 | 47 | ### **Step 2: Create an Isolated Conda Environment (A Critical Best Practice)** 48 | 49 | You should **never** install packages directly into your base Anaconda installation. Instead, you create isolated environments for each project. This prevents package conflicts (e.g., Project A needs version 1.0 of a library, but Project B needs version 2.0). 50 | 51 | 1. **Open your terminal** (Anaconda Prompt on Windows, Terminal on macOS/Linux). 52 | 53 | 2. **Create a new environment.** We will call it `datasci` and install Python 3.9 in it. You can choose a different name or Python version if you prefer. 54 | ```bash 55 | conda create --name datasci python=3.9 56 | ``` 57 | Conda will show you a list of packages to be installed and ask you to proceed (`y/n`). Type `y` and press Enter. 58 | 59 | 3. **Activate the environment.** To use an environment, you must "activate" it. 60 | ```bash 61 | conda activate datasci 62 | ``` 63 | You will notice that your command prompt's prefix changes from `(base)` to `(datasci)`. This tells you that your new environment is active. Any package you install now will be placed inside `datasci`, leaving your `base` environment clean. 64 | 65 | > **Pro-Tip:** To leave an environment, simply type `conda deactivate`. You will return to the `(base)` environment. You can also use `conda activate base`. 66 | 67 | ### **Step 3: Install Visual Studio Code (The Code Editor)** 68 | 69 | VS Code is the most popular code editor in the world for a reason. It's free, fast, and has a massive ecosystem of extensions that can tailor it to your exact needs. 70 | 71 | 1. **Download:** Go to the [VS Code download page](https://code.visualstudio.com/download) and get the installer for your OS. 72 | 2. **Install:** Run the installer, accepting the default options. On Windows, ensure the "Add to PATH" option is checked during installation, as this is very useful. 73 | 3. **Install the Essential Extension:** 74 | * Open VS Code. 75 | * On the left-hand side, click the "Extensions" icon (it looks like four squares, with one flying off). 76 | * In the search bar, type `Python`. 77 | * Install the one published by **Microsoft**. It is the official extension and provides rich language support, debugging, and more. 78 | 79 | 4. **Connect VS Code to your Conda Environment:** 80 | * This is the most important integration step. 81 | * Open VS Code. 82 | * Press `Ctrl+Shift+P` (or `Cmd+Shift+P` on Mac) to open the Command Palette. 83 | * Type `Python: Select Interpreter`. 84 | * A list of available Python interpreters will appear. Find and select the one that includes your environment name, e.g., **`('datasci': conda)`**. It will point to the Python executable inside your `datasci` environment folder. 85 | 86 | Now, when you open a terminal inside VS Code (`Ctrl+`` or `View > Terminal`), it should automatically activate your `(datasci)` environment! 
87 | 88 | --- 89 | 90 | ### **Step 4: NVIDIA GPU Setup (CUDA & cuDNN)** 91 | 92 | **⚠️ Important:** Only perform this step if you have an **NVIDIA GPU**. If you have an AMD GPU or integrated graphics, please skip to Step 5. 93 | 94 | Deep learning frameworks like TensorFlow and PyTorch can use the massively parallel processing power of NVIDIA GPUs to train models orders of magnitude faster. This requires three components: the driver, the CUDA Toolkit, and the cuDNN library. 95 | 96 | 1. **Check Your GPU:** Open your terminal and run: 97 | ```bash 98 | nvidia-smi 99 | ``` 100 | If this command works, it will show you your GPU name and, crucially, the highest **CUDA Version** your driver supports in the top-right corner. **Note this version.** If the command fails, you need to install or update your NVIDIA drivers from the [NVIDIA website](https://www.nvidia.com/Download/index.aspx) first. 101 | 102 | 2. **Install CUDA Toolkit:** 103 | * The CUDA Toolkit version you install must be **less than or equal to** the version supported by your driver. It also needs to be compatible with the deep learning library you plan to use (PyTorch/TensorFlow). A good, safe choice is often **CUDA 11.8**. 104 | * Go to the [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive). 105 | * Find your desired version (e.g., 11.8.0), select your OS and installer type, and download. 106 | * Run the installer. When prompted, choose the **Custom (Advanced)** installation and **deselect everything except the CUDA components**. Specifically, do **not** let it install a graphics driver, as you already have a newer one. 107 | 108 | 3. **Install cuDNN:** 109 | * cuDNN is a library that provides highly optimized routines for deep learning operations. 110 | * Go to the [cuDNN Archive](https://developer.nvidia.com/rdp/cudnn-archive). You will need to sign up for a free NVIDIA Developer account. 111 | * Find the cuDNN version that corresponds to your CUDA Toolkit version (e.g., "cuDNN v8.9.5 for CUDA 11.x"). Download the zip file. 112 | * **This is a manual installation:** 113 | a. Unzip the downloaded file. You will see three folders: `bin`, `include`, and `lib`. 114 | b. Navigate to your CUDA Toolkit installation directory. By default, this is: `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8` 115 | c. Copy the contents of the unzipped folders into the corresponding folders in your CUDA installation directory. 116 | * Copy `cudnn-*.h` from `include` to the CUDA `include` folder. 117 | * Copy `cudnn-*.lib` from `lib` to the CUDA `lib` folder. 118 | * Copy `cudnn-*.dll` from `bin` to the CUDA `bin` folder. 119 | 120 | --- 121 | 122 | ### **Step 5: Install Core Data Science Libraries** 123 | 124 | Let's install the essential packages into our `datasci` environment. 125 | 126 | 1. Make sure your `datasci` environment is active in your terminal (`(datasci)` should be visible). 127 | 2. Install the core stack using conda: 128 | ```bash 129 | conda install numpy pandas matplotlib scikit-learn jupyter 130 | ``` 131 | 3. **(Optional) Install a Deep Learning Library:** 132 | * **PyTorch (Recommended):** Go to the [PyTorch website's get-started local page](https://pytorch.org/get-started/locally/). Select the stable build, your OS, Conda, Python, and your CUDA version (e.g., 11.8). It will generate a command for you. Copy and run it. 
It will look something like this: 133 | ```bash 134 | conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia 135 | ``` 136 | * **TensorFlow:** Installation is usually done with `pip`. 137 | ```bash 138 | pip install tensorflow 139 | ``` 140 | 141 | ### **Final Verification** 142 | 143 | Let's make sure everything works together. 144 | 145 | 1. In VS Code, create a new file named `test_env.py`. 146 | 2. Make sure your `datasci` interpreter is selected in the bottom-right corner. 147 | 3. Paste this code into the file: 148 | 149 | ```python 150 | import numpy as np 151 | import pandas as pd 152 | import sklearn 153 | 154 | # Optional: PyTorch GPU check 155 | try: 156 | import torch 157 | print(f"PyTorch version: {torch.__version__}") 158 | gpu_available = torch.cuda.is_available() 159 | print(f"Is GPU available? {gpu_available}") 160 | if gpu_available: 161 | print(f"GPU Name: {torch.cuda.get_device_name(0)}") 162 | except ImportError: 163 | print("PyTorch not installed.") 164 | 165 | print("\nEnvironment setup is successful!") 166 | print(f"NumPy version: {np.__version__}") 167 | print(f"Pandas version: {pd.__version__}") 168 | print(f"Scikit-learn version: {sklearn.__version__}") 169 | ``` 170 | 171 | 4. Right-click in the editor and select "Run Python File in Terminal". 172 | 173 | If the script runs without errors and you see the version numbers and a "successful" message, **congratulations!** You have successfully built a professional-grade data science environment. If you installed the GPU components, you should see `Is GPU available? True`. 174 | 175 | You are now ready to tackle any data science or machine learning project. -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.3. Regularization, Bias-Variance Trade-Off, Kernel Methods, Cross Validation/Cross Validation.md: -------------------------------------------------------------------------------- 1 | # 🚀 Cross-Validation 2 | --- 3 | Gordon.H | SHSID Data Science Club 4 | --- 5 | 6 | Hello again! We have just learned how to spot underfitting (high bias) and overfitting (high variance). We've even learned how to *tame* overfitting with regularization. 7 | 8 | But this raises a crucial question: **How do we get a reliable score for our model?** How can we be sure that a model is genuinely good, and not just "lucky" on the one-time test set we gave it? 9 | 10 | Today, we'll learn the gold-standard technique for model evaluation: **Cross-Validation**. 11 | 12 | ### Our Learning Journey Today 13 | 14 | We'll learn the professional's method for evaluating models and tuning their "dials" (hyperparameters). 15 | 16 | ```mermaid 17 | graph LR 18 | A[📍 We are here
We know about train/test split] --> B(Part 1: The Problem with a Single Split
The "Lucky" Test Set); 19 | B --> C(Part 2: K-Fold Cross-Validation
The Fair Solution); 20 | C --> D(Part 3: The Killer App: Hyperparameter Tuning
Finding the Best Settings); 21 | D --> E(🏆 We will be here
You can confidently evaluate and tune any model); 22 | 23 | style A fill:#f9f,stroke:#333,stroke-width:2px 24 | style E fill:#9f9,stroke:#333,stroke-width:2px 25 | ``` 26 | 27 | --- 28 | 29 | ## Part 1: The Problem with a Single Train/Test Split 30 | 31 | So far, you've probably seen this workflow: 32 | 1. Take all your data. 33 | 2. Split it once into a training set and a testing set (e.g., 80% train, 20% test). 34 | 3. Train your model on the training set. 35 | 4. Evaluate its performance on the testing set. 36 | 37 | ```mermaid 38 | graph LR 39 | subgraph Single Split Method 40 | Data[Full Dataset] --> Split{80/20 Split} 41 | Split --> Train[Training Set (80%)] 42 | Split --> Test[Test Set (20%)] 43 | Train --> Model[Train Model] 44 | Model --> Evaluate 45 | Test --> Evaluate{Evaluate Model} 46 | Evaluate --> Score[Get ONE Score] 47 | end 48 | ``` 49 | 50 | **The Problem:** The final score depends heavily on *which* 20% of the data ended up in the test set. 51 | * What if, just by random chance, the test set contained all the "easy" examples? You'd get a great score and think your model is a genius! 52 | * What if the test set happened to get all the "hard" or "weird" examples? You'd get a terrible score and might discard a perfectly good model. 53 | 54 | This is **high variance in your evaluation**. The score you get is not stable or reliable. We need a better, more robust way. 55 | 56 | --- 57 | 58 | ## Part 2: K-Fold Cross-Validation - A More Robust Referee 59 | 60 | Instead of a single split, Cross-Validation says: **"Let's do this multiple times and average the results!"** 61 | 62 | The most common method is **K-Fold Cross-Validation**. 63 | 64 | **The Main Idea:** 65 | 1. Split the *entire* dataset into `K` equal-sized "folds" (or groups). A common choice for `K` is 5 or 10. 66 | 2. Then, we run `K` experiments. In each experiment: 67 | * We pick **one** fold to be our test set. 68 | * We use the **remaining `K-1` folds** as our training set. 69 | 3. We train the model, evaluate it on the test fold, and record the score. 70 | 4. After running `K` times (with each fold getting a turn to be the test set), we average the `K` scores to get a final, more reliable performance estimate. 71 | 72 | ### Visualizing 5-Fold Cross-Validation (K=5) 73 | 74 | ```mermaid 75 | graph LR 76 | Data[Full Dataset] --> S[Split into 5 Folds] 77 | S --> F1[Fold 1] & F2[Fold 2] & F3[Fold 3] & F4[Fold 4] & F5[Fold 5] 78 | 79 | subgraph Iteration 1 80 | direction LR 81 | Train1[Train on F2,F3,F4,F5] --> Test1(Test on F1) --> Score1[Score 1] 82 | end 83 | 84 | subgraph Iteration 2 85 | direction LR 86 | Train2[Train on F1,F3,F4,F5] --> Test2(Test on F2) --> Score2[Score 2] 87 | end 88 | 89 | subgraph Iteration 3 90 | direction LR 91 | Train3[Train on F1,F2,F4,F5] --> Test3(Test on F3) --> Score3[Score 3] 92 | end 93 | 94 | subgraph Iteration 4 95 | direction LR 96 | Train4[Train on F1,F2,F3,F5] --> Test4(Test on F4) --> Score4[Score 4] 97 | end 98 | 99 | subgraph Iteration 5 100 | direction LR 101 | Train5[Train on F1,F2,F3,F4] --> Test5(Test on F5) --> Score5[Score 5] 102 | end 103 | 104 | Score1 & Score2 & Score3 & Score4 & Score5 --> Final{Average the Scores} 105 | Final --> FinalScore[Final CV Score
± a measure of variance] 106 | 107 | style Test1 fill:#f99 108 | style Test2 fill:#f99 109 | style Test3 fill:#f99 110 | style Test4 fill:#f99 111 | style Test5 fill:#f99 112 | ``` 113 | 114 | ### The Math: Simple and Sweet 115 | 116 | The final Cross-Validation score is just the average of the scores from each fold. 117 | 118 | $$ \text{CV}_{\text{score}} = \frac{1}{K} \sum_{i=1}^{K} \text{score}_i = \frac{\text{score}_1 + \text{score}_2 + ... + \text{score}_K}{K} $$ 119 | 120 | We also look at the **standard deviation** of the scores. A low standard deviation tells us the model's performance is stable and consistent across different subsets of the data. A high standard deviation means the performance is erratic. 121 | 122 | ### 🐍 Python Example: Evaluating Our Bias-Variance Models 123 | 124 | Let's use 5-fold CV to evaluate the three polynomial models from our last lesson. Which one will CV tell us is the best? 125 | 126 | ```python 127 | import numpy as np 128 | from sklearn.model_selection import cross_val_score 129 | from sklearn.pipeline import make_pipeline 130 | from sklearn.linear_model import LinearRegression 131 | from sklearn.preprocessing import PolynomialFeatures 132 | 133 | # 1. Generate the same sample data 134 | np.random.seed(0) 135 | X = np.linspace(0, 10, 100).reshape(-1, 1) # More data points for CV 136 | y = np.sin(X).ravel() + np.random.normal(0, 0.5, 100) 137 | 138 | # 2. Define our three models from the bias-variance lesson 139 | underfit_model = make_pipeline(PolynomialFeatures(degree=1), LinearRegression()) 140 | just_right_model = make_pipeline(PolynomialFeatures(degree=4), LinearRegression()) 141 | overfit_model = make_pipeline(PolynomialFeatures(degree=15), LinearRegression()) 142 | 143 | # 3. Use 5-fold cross-validation (cv=5) to evaluate each model 144 | # 'neg_mean_squared_error' is used because scikit-learn likes to maximize scores. 145 | # We'll just flip the sign back to positive to interpret it as error. 146 | scores_underfit = -cross_val_score(underfit_model, X, y, cv=5, scoring='neg_mean_squared_error') 147 | scores_just_right = -cross_val_score(just_right_model, X, y, cv=5, scoring='neg_mean_squared_error') 148 | scores_overfit = -cross_val_score(overfit_model, X, y, cv=5, scoring='neg_mean_squared_error') 149 | 150 | # 4. Print the results 151 | print("--- Underfit Model (Degree 1) ---") 152 | print("Individual Fold Errors:", scores_underfit.round(2)) 153 | print(f"Average CV Error: {scores_underfit.mean():.2f} (+/- {scores_underfit.std():.2f})\n") 154 | 155 | print("--- Just Right Model (Degree 4) ---") 156 | print("Individual Fold Errors:", scores_just_right.round(2)) 157 | print(f"Average CV Error: {scores_just_right.mean():.2f} (+/- {scores_just_right.std():.2f})\n") 158 | 159 | print("--- Overfit Model (Degree 15) ---") 160 | print("Individual Fold Errors:", scores_overfit.round(2)) 161 | print(f"Average CV Error: {scores_overfit.mean():.2f} (+/- {scores_overfit.std():.2f})\n") 162 | ``` 163 | 164 | **Expected Output & Analysis:** 165 | * **Underfit Model:** Will have a consistently high error (e.g., avg error ~0.6). The standard deviation will be relatively low because it's consistently bad. 166 | * **Just Right Model:** Will have the **lowest average error** (e.g., avg error ~0.25). This is our winner! 167 | * **Overfit Model:** Will have a very high average error, and likely a **huge standard deviation**. This is because its performance is highly dependent on the specific data in each fold—it does well on some folds and terribly on others. 
168 | 169 | Cross-validation correctly and reliably identified the "Just Right" model as the best! 170 | 171 | --- 172 | 173 | ## Part 3: The Killer Application: Hyperparameter Tuning 174 | 175 | Models have "dials" you can turn called **hyperparameters**. Examples include: 176 | * The `alpha` ($\lambda$) in Ridge and Lasso regression. 177 | * The `C` and `gamma` ($\gamma$) in Support Vector Machines. 178 | * The `degree` of the polynomial we just used. 179 | 180 | How do we find the *best* setting for these dials? 181 | 182 | **The Golden Rule of Machine Learning:** The test set is a final exam. You only use it **ONCE**, at the very end, to report your final score. You cannot use it to tune your hyperparameters. Using the test set to pick the best `alpha` is a form of "cheating" or **data leakage**. 183 | 184 | So, how do we do it? **We use Cross-Validation on the training set!** 185 | 186 | ### The Correct Workflow 187 | 188 | ```mermaid 189 | graph LR 190 | Data[Full Dataset] --> Split1{Step 1: The Great Split
Split into Train & Final Test sets} 191 | Split1 --> FinalTest[Final Test Set (20%)
LOCK THIS AWAY! 🔒] 192 | Split1 --> TrainVal[Training + Validation Set (80%)] 193 | 194 | subgraph Step 2: Hyperparameter Tuning using CV 195 | direction LR 196 | TrainVal --> CV[Perform K-Fold CV
within this 80% data] 197 | CV -- "for each alpha value" --> Scores[Get avg CV score for alpha] 198 | end 199 | 200 | Scores --> Best[Find alpha with best CV score] 201 | 202 | Best --> Step3["Step 3: Final Training
Train a NEW model on ALL of the
Training+Validation data (80%),
using the best alpha."] 203 | 204 | Step3 --> FinalEval["Step 4: Final Evaluation
Unlock the Test Set! 🔓
Evaluate your final model ONCE."] 205 | FinalTest --> FinalEval 206 | FinalEval --> Report[Report Final Score] 207 | 208 | style FinalTest fill:#f99 209 | ``` 210 | 211 | This seems complicated, but thankfully, `scikit-learn` has a tool that does all of Step 2 for us automatically: `GridSearchCV`. 212 | 213 | ### 🐍 Python Example: Finding the Best `alpha` for Ridge Regression 214 | 215 | `GridSearchCV` will test a "grid" of hyperparameter values using cross-validation and tell us which one was the best. 216 | 217 | ```python 218 | from sklearn.model_selection import GridSearchCV 219 | from sklearn.linear_model import Ridge 220 | 221 | # 1. We're still using our same X and y data. 222 | # Imagine this is our "Training + Validation Set (80%)" 223 | 224 | # 2. Define the model we want to tune 225 | model_to_tune = Ridge() 226 | 227 | # 3. Set up the "grid" of hyperparameters to test. 228 | # We'll give it a list of different alpha values to try. 229 | param_grid = { 230 | 'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000] # These are the λ values 231 | } 232 | 233 | # 4. Set up GridSearchCV 234 | # It will try each alpha value using 5-fold CV. 235 | # n_jobs=-1 uses all your computer's cores to speed it up! 236 | grid_search = GridSearchCV(model_to_tune, param_grid, cv=5, scoring='neg_mean_squared_error') 237 | 238 | # 5. Run the search! This does all the work from the diagram's "Step 2". 239 | grid_search.fit(X, y) 240 | 241 | # 6. Print the best results 242 | print("GridSearchCV found the best settings!") 243 | print("Best alpha (λ):", grid_search.best_params_) 244 | print("Best CV Score (MSE):", -grid_search.best_score_) 245 | ``` 246 | **Analysis:** 247 | `GridSearchCV` will automatically iterate through all the `alpha` values, perform 5-fold cross-validation for each one, and store the results. The `.best_params_` attribute will show you which `alpha` gave the lowest average error across the folds. You would then take this `alpha` to train your final model. 248 | 249 | --- 250 | 251 | ## A Quick Note on Other CV Methods 252 | 253 | * **Stratified K-Fold:** When you have a classification problem with imbalanced classes (e.g., 99% not-fraud, 1% fraud), you need to make sure each fold has the same percentage of each class. `StratifiedKFold` does this automatically. 254 | * **Leave-One-Out (LOOCV):** This is K-Fold where `K` is equal to the number of data points. In each step, you train on all data points except one, and test on that one. It's very thorough but extremely slow and usually only used for very small datasets. 255 | 256 | ## Conclusion: Your Professional ML Workflow 257 | 258 | You now have the complete, professional workflow for building a machine learning model. 259 | 260 | > **Your Problem-Solving Guide:** 261 | > 262 | > 1. **Split Data:** Perform the one-time split into `train_set` and `test_set`. Lock the `test_set` away. 263 | > 2. **Choose a Model:** Pick a model appropriate for your problem (e.g., Ridge for regression, SVM for classification). 264 | > 3. **Tune Hyperparameters:** Use **GridSearchCV** on the `train_set` to find the best hyperparameters for your model. 265 | > 4. **Train Final Model:** Train your chosen model with the best hyperparameters on the *entire* `train_set`. 266 | > 5. **Final Evaluation:** Unleash the `test_set` and evaluate your final model once to get its true performance score. 267 | 268 | Cross-validation is the bridge between theory and practice. 
It's how we move from worrying about bias and variance to confidently building models that work well in the real world. Happy (and robust) modeling!
--------------------------------------------------------------------------------
/4. Deep Learning & Computer Vision/4.4. Convolutional Layers, Pooling Layers, Convolutional Neural Network/CNN.md:
--------------------------------------------------------------------------------

# Seeing the world - Convolutional Neural Networks
* Gordon.H | SHSID Data Science Group

*Welcome back to the course, Junior ML Engineers!*

Today we will be learning about the ultimate solution for image processing: **Convolutional Neural Networks**.

---
### Requirements
* Understanding of the fundamentals of Machine Learning
* Basic knowledge of Neural Networks
* Basic Python and NumPy library usage

---
### 1. Problem with Images

Say you have a small grayscale image of size 28x28 pixels:
* Size = 28 x 28 = 784 pixels
* To feed it into a dense layer, we flatten it into a vector of **784** numbers.
* If the first layer has 128 neurons, we will need 784 x 128 = **100,352** weights.

This is a huge problem because:
* Training this many parameters is inefficient
* Spatial information is lost when we flatten the image

CNNs are designed to solve this problem with a smarter approach:

```mermaid
graph TD
    subgraph Dense Network Approach
    A[Input Image
28x28x1] --> B{Flatten}; 32 | B --> C[1D Vector
784 neurons]; 33 | C --> D[Dense Layer]; 34 | style A fill:#f9f,stroke:#333,stroke-width:2px 35 | end 36 | 37 | subgraph CNN Approach 38 | E[Input Image
28x28x1] --> F{Convolutional Layer}; 39 | F --> G[Feature Map
e.g., 26x26x32]; 40 | style E fill:#9cf,stroke:#333,stroke-width:2px 41 | end 42 | 43 | A -- "Loses spatial structure" --> C 44 | E -- "Preserves spatial structure" --> G 45 | ``` 46 | As you see, CNN keeps the image's 2D structure, allowing it to learn from pixel neighborhoods. 47 | 48 | --- 49 | 50 | ### 2. The core of CNN : Convolutional Layer 51 | 52 | Instead of looking at a large image at once, CNN looks at it in small chunks. 53 | 54 | A **filter** is a small matrix of weights that the network learns. The process of sliding the filter and computing the output is called a **convolution**. 55 | 56 | Here's a mini-example of a 2x2 filter sliding over a 3x3 image to produce a 2x2 feature map. 57 | 58 | ``` 59 | Input Image (I) Filter (K) Feature Map (O) 60 | +---+---+---+ +---+---+ +---+---+ 61 | | 1 | 5 | 2 | | 1 | 0 | | 9 | ? | 62 | +---+---+---+ +---+---+ +---+---+ 63 | | 8 | 1 | 6 | | 1 | 0 | | ? | ? | 64 | +---+---+---+ +---+---+ +---+---+ 65 | | 3 | 4 | 7 | 66 | +---+---+---+ 67 | ``` 68 | To calculate the top-left value of the output: `(1*1) + (5*0) + (8*1) + (1*0) = 9`. 69 | 70 | #### The Mathematical Logic 71 | 72 | The mathematical formula for such operation, **cross-correlation**, looks like this: 73 | $$ 74 | O_{i,j} = b + \sum_{u=0}^{F-1} \sum_{v=0}^{F-1} I_{i+u, j+v} \cdot K_{u,v} 75 | $$ 76 | 77 | Looks complicated right? Lets break it down: 78 | 79 | * $O_{i,j}$: The output value at row `i`, column `j` in the feature map. 80 | 81 | * $b$: A learnable **bias** term, which helps the filter make better predictions. 82 | 83 | * $\sum$: The "sum" symbol. We sum over the filter's dimensions (`u` and `v`). 84 | 85 | * $I_{i+u, j+v}$: A pixel value from the **Input** image patch. 86 | 87 | * $K_{u,v}$: A weight from our **Kernel** (aka **the filter**). 88 | 89 | This formula is a precise mathematical formula for cross correlation in Machine Learning, in mathematics convolution is a bit different, it involves flipping the filter (both horizontally and vertically) before sliding it over the image. The reason for such difference is due to the special nature of neural networks, the values in the filter are learned during training, the network can simply learn the flipped version of the filter if it needs to. The cross correlation is easier to implement. 90 | 91 | #### Hyperparameters and Output Size 92 | The size of our output feature map depends on the hyperparameters we choose. The output width ($W_{out}$) and height ($H_{out}$) can be calculated with this formula: 93 | 94 | $$ 95 | W_{out} = \frac{W_{in} - F + 2P}{S} + 1 96 | $$ 97 | $$ 98 | H_{out} = \frac{H_{in} - F + 2P}{S} + 1 99 | $$ 100 | 101 | Where: 102 | * $W_{in}, H_{in}$: Input width and height. 103 | * $F$: Filter size. 104 | * $P$: Padding (number of pixels added to the border). 105 | * $S$: Stride (how many pixels the filter slides at a time). 106 | 107 | #### Example Code 108 | *Note: You can run the following code locally to try out convolutional layers!* 109 | 110 | ```Python 111 | # Remember to use pip to install numpy and matplotlib! 112 | import numpy as np 113 | import matplotlib.pyplot as plt 114 | 115 | # 1. Define the Input and Filter 116 | # A simple 6x6 grayscale image. 117 | # It has a sharp vertical edge down the middle. 
118 | # (Low values = dark, high values = light) 119 | input_image = np.array([ 120 | [10, 10, 10, 100, 100, 100], 121 | [10, 10, 10, 100, 100, 100], 122 | [10, 10, 10, 100, 100, 100], 123 | [10, 10, 10, 100, 100, 100], 124 | [10, 10, 10, 100, 100, 100], 125 | [10, 10, 10, 100, 100, 100] 126 | ]) 127 | 128 | # A 3x3 filter designed to detect vertical edges. 129 | # The positive values on the left and negative on the right 130 | # will give a high response when we move from dark to light. 131 | vertical_edge_filter = np.array([ 132 | [ 1, 0, -1], 133 | [ 2, 0, -2], # This is a "Sobel" filter, common in image processing 134 | [ 1, 0, -1] 135 | ]) 136 | 137 | # 2. The Convolution Operation 138 | # Get dimensions (assuming no padding, stride=1) 139 | img_h, img_w = input_image.shape 140 | filter_h, filter_w = vertical_edge_filter.shape 141 | out_h = (img_h - filter_h) + 1 142 | out_w = (img_w - filter_w) + 1 143 | 144 | # Create an empty feature map to store the output 145 | output_feature_map = np.zeros((out_h, out_w)) 146 | 147 | # Slide filter over the image 148 | for y in range(out_h): 149 | for x in range(out_w): 150 | # Get current patch of the image 151 | image_patch = input_image[y : y + filter_h, x : x + filter_w] 152 | 153 | # Perform element-wise multiplication and sum the result 154 | # This is the core of the convolution! 155 | convolution_sum = np.sum(image_patch * vertical_edge_filter) 156 | 157 | # Store result in the map 158 | output_feature_map[y, x] = convolution_sum 159 | # 3.Display Results 160 | print("--- Manual NumPy Convolution ---\n") 161 | print("Input Image:\n", input_image) 162 | print("\nVertical Edge Filter:\n", vertical_edge_filter) 163 | print("\nOutput Feature Map:\n", output_feature_map) 164 | print("\nNotice the high values in the output where the vertical edge was detected!") 165 | # Visualize the images 166 | fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4)) 167 | ax1.imshow(input_image, cmap='gray') 168 | ax1.set_title("Original Image") 169 | ax2.imshow(output_feature_map, cmap='gray') 170 | ax2.set_title("Feature Map (Edges)") 171 | plt.show() 172 | ``` 173 | 174 | --- 175 | ### 3. Making it Robust: The Pooling layer 176 | 177 | A Pooling layer shrinks the feature map to make the network faster and robust. The most common type of pooling is **Max Pooling**. 178 | 179 | #### Visualizing Max Pooling 180 | 181 | Imagine a 2x2 Max Pooling operation on a 4x4 feature map. 182 | 183 | ``` 184 | Feature Map Pooled Output 185 | +---+---+---+---+ +---+---+ 186 | |12 |20 | 30| 0 | max(12,20,8,12)--> |20 | 187 | +---+---+---+---+ +---+---+ 188 | | 8 |12 | 2 | 0 | max(30,0,2,0)--> |30 | 189 | +---+---+---+---+ +---+---+ 190 | |34 |70 | 37| 4 | max(34,70,112,100)-->|112| 191 | +---+---+---+---+ +---+---+ 192 | |112|100| 25| 12| max(37,4,25,12)--> |37 | 193 | +---+---+---+---+ +---+---+ 194 | ``` 195 | This keeps only the strongest signal, reducing the map size from 4x4 to 2x2. 196 | 197 | #### The Math Behind Pooling 198 | 199 | Here is the formula for Max Pooling: 200 | 201 | $$ 202 | P_{i,j} = \max_{0 \le u < F_p, 0 \le v < F_p} A_{i \cdot S_p + u, j \cdot S_p + v} 203 | $$ 204 | 205 | This formally states: "The output $P_{i,j}$ is the `max` value from the input feature map `A` within the pooling window." 
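To mirror the manual convolution code above, here is a minimal NumPy sketch of 2x2 max pooling with stride 2, applied to the 4x4 feature map from the diagram.

```python
import numpy as np

feature_map = np.array([
    [ 12,  20,  30,   0],
    [  8,  12,   2,   0],
    [ 34,  70,  37,   4],
    [112, 100,  25,  12]
])

pool_size, stride = 2, 2
out_h = feature_map.shape[0] // stride
out_w = feature_map.shape[1] // stride
pooled = np.zeros((out_h, out_w))

for y in range(out_h):
    for x in range(out_w):
        # keep only the strongest signal in each 2x2 window
        window = feature_map[y*stride : y*stride + pool_size,
                             x*stride : x*stride + pool_size]
        pooled[y, x] = window.max()

print(pooled)
# [[ 20.  30.]
#  [112.  37.]]
```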
206 | 
207 | ---
208 | ### 4. Putting It All Together: A Full CNN Architecture
209 | 
210 | A real-world CNN stacks these layers together:
211 | 
212 | ```mermaid
213 | graph LR
214 |     A["Input Image (28x28x1)"] --> B["Conv2D Layer\n32 filters, 3x3\nOutput: 26x26x32"]
215 |     B --> C["MaxPooling2D\n2x2 window\nOutput: 13x13x32"]
216 |     C --> D["Conv2D Layer\n64 filters, 3x3\nOutput: 11x11x64"]
217 |     D --> E["MaxPooling2D\n2x2 window\nOutput: 5x5x64"]
218 |     E --> F["Flatten Layer\nOutput: 1600 nodes"]
219 |     F --> G["Dense Layer\n128 nodes"]
220 |     G --> H["Output Layer\n10 nodes (Softmax)"]
221 | 
222 |     subgraph Feature Extractor
223 |     B; C; D; E;
224 |     end
225 | 
226 |     subgraph Classifier
227 |     F; G; H;
228 |     end
229 | ```
230 | The final layer uses a **Softmax** activation function to convert the network's scores into a probability distribution.
231 | 
232 | The **Softmax** function for a specific output class `i` is:
233 | 
234 | $$
235 | \sigma(\mathbf{z})_i = \frac{e^{z_i}}{\sum_{j=1}^{C} e^{z_j}}
236 | $$
237 | 
238 | The formula guarantees that every output value lies between 0 and 1 and that the outputs sum to 1, so we can treat them as the model's confidence for each class. For example, raw scores $(2, 1, 0)$ become roughly $(0.67, 0.24, 0.09)$.
239 | 
240 | ---
241 | 
242 | ### 5. Coding Example: A Fully Functional CNN Architecture
243 | 
244 | The following example uses PyTorch and Matplotlib to build, train, and evaluate a small CNN on MNIST.
245 | 
246 | ```python
247 | import torch
248 | import torch.nn as nn
249 | import torch.optim as optim
250 | from torchvision import datasets, transforms
251 | from torch.utils.data import DataLoader
252 | import matplotlib.pyplot as plt
253 | 
254 | # Define the CNN architecture
255 | class MNIST_CNN(nn.Module):
256 |     def __init__(self):
257 |         super(MNIST_CNN, self).__init__()
258 |         # Feature extractor
259 |         self.features = nn.Sequential(
260 |             nn.Conv2d(1, 32, kernel_size=3),   # 28x28x1 -> 26x26x32
261 |             nn.ReLU(),
262 |             nn.MaxPool2d(2),                   # 26x26x32 -> 13x13x32
263 |             nn.Conv2d(32, 64, kernel_size=3),  # 13x13x32 -> 11x11x64
264 |             nn.ReLU(),
265 |             nn.MaxPool2d(2)                    # 11x11x64 -> 5x5x64
266 |         )
267 | 
268 |         # Classifier
269 |         self.classifier = nn.Sequential(
270 |             nn.Flatten(),             # 5x5x64 -> 1600
271 |             nn.Linear(5*5*64, 128),   # 1600 -> 128
272 |             nn.ReLU(),
273 |             nn.Linear(128, 10)        # 128 -> 10
274 |         )
275 | 
276 |     def forward(self, x):
277 |         x = self.features(x)
278 |         x = self.classifier(x)
279 |         return x
280 | 
281 | # Initialize model, loss function, and optimizer
282 | model = MNIST_CNN()
283 | criterion = nn.CrossEntropyLoss()
284 | optimizer = optim.Adam(model.parameters(), lr=0.001)
285 | 
286 | # Load MNIST data
287 | transform = transforms.Compose([
288 |     transforms.ToTensor(),
289 |     transforms.Normalize((0.1307,), (0.3081,))
290 | ])
291 | 
292 | train_data = datasets.MNIST('./data', train=True, download=True, transform=transform)
293 | test_data = datasets.MNIST('./data', train=False, transform=transform)
294 | 
295 | train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
296 | test_loader = DataLoader(test_data, batch_size=1000)
297 | 
298 | # Training function
299 | def train(model, device, train_loader, optimizer, epoch):
300 |     model.train()
301 |     for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)  # move the batch to the same device as the model
302 |         optimizer.zero_grad()
303 |         output = model(data)
304 |         loss = criterion(output, target)
305 |         loss.backward()
306 |         optimizer.step()
307 | 
308 |         if batch_idx % 100 == 0:
309 |             print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} '
310 |                   f'({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')
311 | 
312 | # Test function
313 | def test(model, device, test_loader):
314 |     model.eval()
315 |     test_loss = 0
316 |     correct = 0
317 |     with torch.no_grad():
318 |         for data, target in test_loader:
            data, target = data.to(device), target.to(device)  # evaluate on the same device as the model
319 |             output = model(data)
320 |             test_loss += criterion(output, target).item()
321 |             pred = output.argmax(dim=1, keepdim=True)
322 |             correct += pred.eq(target.view_as(pred)).sum().item()
323 | 
324 |     test_loss /= len(test_loader)
325 |     accuracy = 100. * correct / len(test_loader.dataset)
326 |     print(f'\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} '
327 |           f'({accuracy:.0f}%)\n')
328 |     return accuracy
329 | 
330 | # Training loop
331 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
332 | model.to(device)
333 | 
334 | accuracies = []
335 | for epoch in range(1, 6):  # 5 epochs
336 |     train(model, device, train_loader, optimizer, epoch)
337 |     acc = test(model, device, test_loader)
338 |     accuracies.append(acc)
339 | 
340 | # Plot accuracy
341 | plt.plot(range(1, 6), accuracies)
342 | plt.title('Model Accuracy')
343 | plt.xlabel('Epoch')
344 | plt.ylabel('Accuracy (%)')
345 | plt.show()
346 | 
347 | # Save model
348 | torch.save(model.state_dict(), 'mnist_cnn.pth')
349 | ```
350 | 
351 | ### Summary & Conclusion
352 | 
353 | **Congratulations!** You have just completed your lesson on Convolutional Neural Networks!
354 | 
355 | Throughout this lesson you have learned:
356 | 
357 | * How **Convolutional Layers** use filters to find features, along with the formal math behind the process.
358 | * How **Pooling Layers** make the network more robust and efficient.
359 | * How a complete **CNN** architecture is assembled, and how to build and train one in PyTorch.
360 | 
361 | In the next lesson, we will learn about image data augmentation.
362 | 
-------------------------------------------------------------------------------- /2. Machine Learning Generics/2.3. Regularization, Bias-Variance Trade-Off, Kernel Methods, Cross Validation/Regularization, Bias–Variance Trade-off, Kernel Methods.md: --------------------------------------------------------------------------------
1 | # Lesson 3.4 | Regularization, Bias-Variance Trade-off, Kernel Methods 🚀
2 | ---
3 | * Gordon.H | SHSID ML Club
4 | ---
5 | Hi, junior ML engineer, welcome back to the course. Today, we will explore some of the most powerful concepts in machine learning.
6 | So far, you have learned about building basic models like linear regression.
7 | But have you ever wondered...
8 | 
9 | * "Why is my model so bad at predicting things it has not seen before?"
10 | * "How do I stop my model from just memorizing the answers?"
11 | * "How can a computer find patterns that aren't just straight lines?"
12 | 
13 | Today we will learn about three foundational concepts that are at the heart of modern machine learning.
14 | 
15 | Here is an overview of this lesson:
16 | 
17 | ```mermaid
18 | graph LR
19 |     A[📍 You are here
You can build a basic model] --> B(Part 1: The Bias-Variance Tradeoff
The #1 challenge in ML); 20 | B --> C(Part 2: Regularization
The cure for 'memorizing'); 21 | C --> D(Part 3: The Kernel Method
The magic trick for complex patterns); 22 | D --> E(🏆 You will be here
You can build robust, flexible models); 23 | 24 | style A fill:#f9f,stroke:#333,stroke-width:2px 25 | style E fill:#9f9,stroke:#333,stroke-width:2px 26 | ``` 27 | 28 | --- 29 | 30 | ## Part 1: Bias Variance Trade-Off 🐻 31 | 32 | Every machine learning model you build faces a fundamental tug-of-war. It's a balance between being too simple and being too complex. Getting this balance right is the key to a good model. This is the **Bias-Variance Tradeoff**. 33 | 34 | ## The Bullseye Analogy 🎯 35 | 36 | Imagine an archer trying to hit the bullseye, the true underlying pattern of the data. 37 | 38 | ```mermaid 39 | graph TD 40 | subgraph Legend 41 | direction LR 42 | Bullseye(Bullseye = True Pattern) 43 | Hits(Blue Dots = Model's Predictions) 44 | end 45 | 46 | subgraph Four Archers: A Comparison 47 | direction LR 48 | A[Low Bias, Low Variance

🎯

Accurate & Consistent
This is our GOAL!] 49 | B[Low Bias, High Variance

🎯

Accurate on average, but
predictions are all over the place.] 50 | C[High Bias, Low Variance

🎯

Consistently misses in the
same spot. Very predictable, but wrong.] 51 | D[High Bias, High Variance

🎯

Inaccurate & Inconsistent.
The worst of both worlds!] 52 | end 53 | 54 | style A fill:#9f9 55 | ``` 56 | 57 | * **Bias (Underfitting):** High bias is like a misaligned scope on a rifle. You are consistently wrong in the same way. Your model is too simple and has a fundamental "bias" that prevents it from capturing the truth. 58 | * **Variance (Overfitting):** High variance is like an archer with a shaky hand. Your shots are inconsistent. Your model is too complex and is distracted by random noise in the data, causing its predictions to vary wildly with new data. 59 | 60 | ### The Error Curve: The Fix 61 | This is one of the most important graphs in machine learning. It shows how error changes as we make a model more complex. Our goal is to find the lowest point of the "Total Error" curve. 62 | 63 | ```mermaid 64 | xychart-beta 65 | title "The Bias-Variance Tradeoff vs. Model Complexity" 66 | x-axis "Model Complexity (More features, higher polynomial degree) -->" 67 | y-axis "Prediction Error" 68 | 69 | line "Bias²" [9, 7, 5, 3.5, 2.5, 1.5, 1, 0.8, 0.6, 0.5, 0.4] 70 | line "Variance" [0.2, 0.3, 0.5, 0.8, 1.5, 2.5, 4, 5.5, 7, 8, 9] 71 | line "Total Error" [9.2, 7.3, 5.5, 4.3, 4.0, 4.0, 5, 6.3, 7.6, 8.5, 9.4] 72 | ``` 73 | 74 | ### Python Example - Visualizing the Tradeoff: 75 | We'll try to fit models of different complexities to some curvy data. 76 | 77 | ```python 78 | import numpy as np 79 | import matplotlib.pyplot as plt 80 | from sklearn.pipeline import make_pipeline 81 | from sklearn.linear_model import LinearRegression 82 | from sklearn.preprocessing import PolynomialFeatures 83 | 84 | # 1. Generate some sample data (a sine wave with noise) 85 | np.random.seed(0) 86 | X = np.linspace(0, 10, 30).reshape(-1, 1) 87 | y = np.sin(X).ravel() + np.random.normal(0, 0.5, 30) 88 | 89 | # 2. Define models with different complexities (polynomial degree) 90 | underfit_model = make_pipeline(PolynomialFeatures(degree=1), LinearRegression()) 91 | just_right_model = make_pipeline(PolynomialFeatures(degree=4), LinearRegression()) 92 | overfit_model = make_pipeline(PolynomialFeatures(degree=15), LinearRegression()) 93 | 94 | # 3. Fit the models 95 | underfit_model.fit(X, y) 96 | just_right_model.fit(X, y) 97 | overfit_model.fit(X, y) 98 | 99 | # 4. Plot everything 100 | X_plot = np.linspace(0, 10, 100).reshape(-1, 1) 101 | plt.figure(figsize=(15, 8)) 102 | plt.scatter(X, y, label='Original Data Points', color='black', zorder=5) 103 | 104 | # Plot Underfit Model 105 | plt.plot(X_plot, underfit_model.predict(X_plot), label='Underfit (Degree 1) - High Bias', color='red', linewidth=2) 106 | # Plot Just Right Model 107 | plt.plot(X_plot, just_right_model.predict(X_plot), label='Just Right (Degree 4) - Sweet Spot!', color='green', linewidth=4) 108 | # Plot Overfit Model 109 | plt.plot(X_plot, overfit_model.predict(X_plot), label='Overfit (Degree 15) - High Variance', color='orange', linewidth=2, linestyle='--') 110 | 111 | plt.title('Visualizing the Bias-Variance Tradeoff', fontsize=16) 112 | plt.xlabel('Feature', fontsize=12) 113 | plt.ylabel('Target', fontsize=12) 114 | plt.legend(fontsize=12) 115 | plt.ylim(-3, 4) 116 | plt.grid(True, linestyle='--', alpha=0.6) 117 | plt.show() 118 | ``` 119 | **Analysis of the Plot:** 120 | * **🔴 Red Line (High Bias):** Too simple. It misses the curve entirely. 121 | * **🟠 Orange Line (High Variance):** Too complex. It wiggles frantically to "memorize" the noisy data. 122 | * **🟢 Green Line (Sweet Spot):** Just right. It captures the true trend while ignoring the noise. 
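As a quick sanity check on the analysis above, you can also diagnose the tradeoff numerically by comparing training error with error on held-out points. The sketch below is an extra illustration reusing the same synthetic sine data and polynomial degrees; the exact numbers depend on the random split, but the pattern is the point: the degree-1 model scores poorly on both sets (high bias), while the degree-15 model is near-perfect on the training split and much worse on the held-out split (high variance).

```python
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Same noisy sine-wave data as in the plot above
np.random.seed(0)
X = np.linspace(0, 10, 30).reshape(-1, 1)
y = np.sin(X).ravel() + np.random.normal(0, 0.5, 30)

# Hold out some points so overfitting has somewhere to show up
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

for degree in [1, 4, 15]:
    model = make_pipeline(PolynomialFeatures(degree=degree), LinearRegression())
    model.fit(X_train, y_train)
    train_mse = mean_squared_error(y_train, model.predict(X_train))
    val_mse = mean_squared_error(y_val, model.predict(X_val))
    print(f"degree {degree:2d} | train MSE: {train_mse:.3f} | validation MSE: {val_mse:.3f}")
```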
123 | 
124 | ---
125 | 
126 | ## Part 2: Regularization - The Overfitting Tamer
127 | 
128 | We have seen how serious a problem overfitting is. **Regularization** is our primary technique for taming it.
129 | 
130 | **The Main Idea**
131 | We penalize a model for being too complex. We change its goal from *only* minimizing prediction error to minimizing error *AND* keeping its internal parameters (coefficients) small.
132 | 
133 | ### The "Leash" Analogy
134 | Regularization acts like a leash: it pulls the model's coefficients back towards zero to prevent them from growing too large.
135 | 
136 | ```mermaid
137 | graph LR
138 |     subgraph The Forces on Model Coefficients
139 |         Origin((Origin
Coefficients = 0)) -- "Penalty Term (Leash)
pulls coefficients
towards zero" --> Model[Model's Optimal
Coefficients] 140 | 141 | Model -- "Data pulls the model
to fit it perfectly" --> DataPoint1(Data Point 1) 142 | Model -- "Data pulls the model
to fit it perfectly" --> DataPoint2(Data Point 2)
143 |     end
144 | 
145 |     style Origin fill:#ccc,stroke:#333
146 | ```
147 | 
148 | The regularization strength (`λ`, lambda) is the length of the leash. A bigger `λ` means a shorter and stronger leash, a simpler model, and less overfitting.
149 | 
150 | ### The Two Flavors of Regularization: Lasso vs. Ridge
151 | 
152 | 1. **L2 Regularization (Ridge):** Uses a squared penalty: $\lambda \sum (\text{coefficient})^2$.
153 |    * **Effect:** Shrinks all coefficients, making the model more stable.
154 | 
155 | 2. **L1 Regularization (Lasso):** Uses an absolute value penalty: $\lambda \sum |\text{coefficient}|$.
156 |    * **Effect:** Shrinks some coefficients *all the way to zero*. This is powerful because it performs **automatic feature selection**, telling you which features are most important.
157 | 
158 | ### Python: Visualizing How Regularization Works
159 | 
160 | Let's see the "shrinking" effect. We'll create a dataset with 2 important features and 8 useless "noise" features.
161 | 
162 | ```python
163 | import pandas as pd
164 | from sklearn.linear_model import LinearRegression, Ridge, Lasso
import numpy as np                # needed for the data generation below
import matplotlib.pyplot as plt   # needed for the bar chart
165 | 
166 | # 1. Create a dataset with 2 important features and 8 useless "noise" features
167 | np.random.seed(42)
168 | n_samples = 50
169 | n_features = 10
170 | X = np.random.randn(n_samples, n_features)
171 | # Create a target y where only the first two features matter
172 | y = 2 * X[:, 0] + 3 * X[:, 1] + np.random.normal(0, 1, n_samples)
173 | 
174 | # 2. Train three models
175 | lr = LinearRegression()
176 | lr.fit(X, y)
177 | ridge = Ridge(alpha=10)  # alpha is lambda (λ) in scikit-learn
178 | ridge.fit(X, y)
179 | lasso = Lasso(alpha=0.1)
180 | lasso.fit(X, y)
181 | 
182 | # 3. Create a bar chart of the coefficients
183 | models = {'Linear Regression': lr, 'Ridge (L2)': ridge, 'Lasso (L1)': lasso}
184 | df_coeffs = pd.DataFrame({name: model.coef_ for name, model in models.items()})
185 | 
186 | df_coeffs.plot(kind='bar', figsize=(15, 7))
187 | plt.title('Comparing Coefficients of Different Models', fontsize=16)
188 | plt.ylabel('Coefficient Value', fontsize=12)
189 | plt.xlabel('Feature Index', fontsize=12)
190 | plt.axhline(0, color='black', linewidth=0.5)
191 | plt.xticks(rotation=0)
192 | plt.grid(axis='y', linestyle='--', alpha=0.7)
193 | plt.show()
194 | ```
195 | **Analysis of the Chart:**
196 | * **🔵 Linear Regression:** Coefficients are large and noisy. It's trying to use useless features.
197 | * **🟠 Ridge (L2):** All coefficients are shrunk towards zero, making the model more stable.
198 | * **🟢 Lasso (L1):** The magic! It has set the coefficients for most of the useless features to **exactly zero**, correctly identifying the two important features.
199 | 
200 | ---
201 | ## Part 3: The Kernel Method - The Dimension Trick
202 | 
203 | What if your data looks like this? A simple line can't separate the classes. This is a **non-linear problem**.
204 | 
205 | ```mermaid
206 | graph LR
207 |     Problem["Data in 2D
(( ))   (( ))
(( ))   (( ))"] -- "Can't be solved with a line" --> Diagnosis{"Non-Linear Pattern"} 208 | 209 | style Problem text-align:center 210 | ``` 211 | 212 | **The Big Idea:** Let's project the data into a higher dimension where it *is* linearly separable. 213 | 214 | ### Visualizing the Transformation 215 | 216 | The Kernel Method allows us to find a separating boundary in a high-dimensional space without ever actually calculating the new dimensions. It's a "trick" to handle non-linearity. 217 | 218 | ```mermaid 219 | graph LR 220 | A["Problem in 2D
Dots are not linearly separable"] --> B["The Transformation 'Trick'
Imagine a function that 'lifts'
the inner dots into a 3rd dimension"]; 221 | B --> C["Solution in High Dimension
Now a simple plane can
easily separate the classes!"]; 222 | ``` 223 | 224 | ### The Kernel "Trick" 225 | 226 | A **kernel** is a function that efficiently calculates the similarity between two points *as if* they were in that higher-dimensional space. The most powerful model using this is the **Support Vector Machine (SVM)**. The most popular kernel is the **RBF (Radial Basis Function) Kernel**, which is excellent for complex patterns. 227 | 228 | ### Python Ex: Solving Non-Linear with SVM 229 | 230 | Let's use `scikit-learn` to create "moon" data and watch a Kernel SVM solve it effortlessly. 231 | 232 | ```python 233 | from sklearn.datasets import make_moons 234 | from sklearn.svm import SVC 235 | import matplotlib.pyplot as plt 236 | import numpy as np 237 | # 1. Create non-linear "moon" data 238 | X, y = make_moons(n_samples=200, noise=0.15, random_state=42) 239 | 240 | # 2. Create two SVM models 241 | linear_svm = SVC(kernel='linear', C=1.0) 242 | rbf_svm = SVC(kernel='rbf', C=1.0, gamma='auto') # RBF Kernel is the magic! 243 | 244 | # 3. Fit the models and plot results 245 | # (Code for plotting is the same as the previous version) 246 | 247 | # Helper function to visualize the decision boundary 248 | def plot_decision_boundary(model, X, y, ax, title): 249 | h = .02 250 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 251 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 252 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 253 | Z = model.predict(np.c_[xx.ravel(), yy.ravel()]) 254 | Z = Z.reshape(xx.shape) 255 | 256 | ax.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu, alpha=0.3) 257 | ax.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.RdYlBu, edgecolors='k') 258 | ax.set_title(title, fontsize=14) 259 | ax.set_xlabel('Feature 1') 260 | ax.set_ylabel('Feature 2') 261 | 262 | fig, axes = plt.subplots(1, 2, figsize=(16, 7)) 263 | linear_svm.fit(X, y) 264 | plot_decision_boundary(linear_svm, X, y, axes[0], 'Linear Kernel SVM (Fails)') 265 | rbf_svm.fit(X, y) 266 | plot_decision_boundary(rbf_svm, X, y, axes[1], 'RBF Kernel SVM (Succeeds!)') 267 | 268 | plt.suptitle('The Power of the Kernel Trick', fontsize=20) 269 | plt.show() 270 | ``` 271 | **Analysis of the Plot:** 272 | * **Left (Linear Kernel):** Fails because it can only draw a straight line. 273 | * **Right (RBF Kernel):** Succeeds by creating a complex, non-linear boundary, effectively "seeing" the pattern in a higher dimension. 274 | 275 | 276 | --- 277 | ## Conclusion & Your Expanded ML Toolkit 🧰 278 | 279 | Congratulations! You now have a mental flowchart for diagnosing and fixing common machine learning problems. 
280 | ```mermaid 281 | graph LR 282 | Start("Start with your ML Problem") --> Diag{"Diagnose the Model's Behavior"}; 283 | 284 | Diag -->|"Model is too simple\n(Bad on train & test data)"| Bias("High Bias / Underfitting"); 285 | Bias --> Sol1["Solution: Use a more\ncomplex model, add features"]; 286 | 287 | Diag -->|"Model 'memorized' the data\n(Good on train, bad on test)"| Variance("High Variance / Overfitting"); 288 | Variance --> Sol2["Solution: Use Regularization\n(Lasso/Ridge) or add more data"]; 289 | 290 | Diag -->|"Pattern is not a straight line\n(Linear model fails)"| NonLinear("Non-Linear Problem"); 291 | NonLinear --> Sol3["Solution: Use the Kernel Trick\n(e.g., SVM with RBF Kernel)"]; 292 | 293 | Sol1 --> End((✅ A Better Model)); 294 | Sol2 --> End((✅ A Better Model)); 295 | Sol3 --> End((✅ A Better Model)); 296 | 297 | style End fill:#9f9,stroke:#333,stroke-width:2px 298 | ``` 299 | 300 | You are now equipped with the knowledge to build models that are not just predictive, but also robust, generalizable, and intelligent. Happy modeling -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.4. Principal Component Analysis, Dimensionality Reduction/Pizza.csv: -------------------------------------------------------------------------------- 1 | brand,id,mois,prot,fat,ash,sodium,carb,cal 2 | A,14069,27.82,21.43,44.87,5.11,1.77,0.77,4.93 3 | A,14053,28.49,21.26,43.89,5.34,1.79,1.02,4.84 4 | A,14025,28.35,19.99,45.78,5.08,1.63,0.8,4.95 5 | A,14016,30.55,20.15,43.13,4.79,1.61,1.38,4.74 6 | A,14005,30.49,21.28,41.65,4.82,1.64,1.76,4.67 7 | A,14075,31.14,20.23,42.31,4.92,1.65,1.4,4.67 8 | A,14082,31.21,20.97,41.34,4.71,1.58,1.77,4.63 9 | A,14097,28.76,21.41,41.6,5.28,1.75,2.95,4.72 10 | A,14117,28.22,20.48,45.1,5.02,1.71,1.18,4.93 11 | A,14133,27.72,21.19,45.29,5.16,1.66,0.64,4.95 12 | A,14101,27.35,21.2,45.59,4.94,1.65,0.92,4.98 13 | A,14108,26.98,21.2,45.03,5.15,1.67,1.64,4.97 14 | A,14164,28.7,20,45.12,4.93,1.56,1.25,4.91 15 | A,14154,30.91,19.65,42.45,4.81,1.65,2.81,4.72 16 | A,24005,30.91,20.77,42.03,4.9,1.61,1.39,4.67 17 | A,24026,30.83,17.88,44.33,5.26,1.76,1.7,4.77 18 | A,24094,32.73,20.06,39.74,5.24,1.69,2.23,4.47 19 | A,24108,34.58,17.53,40.87,5.05,1.61,1.97,4.46 20 | A,24102,31.8,20.35,40.44,5.43,1.61,1.98,4.53 21 | A,24082,31.02,19.05,42.29,5.27,1.71,2.37,4.66 22 | A,34017,27.02,19.56,47.2,4.95,1.65,1.27,5.08 23 | A,34020,27.78,20.01,45.59,4.97,1.7,1.65,4.97 24 | A,24136,30.88,20.58,42.26,4.96,1.63,1.32,4.68 25 | A,24122,32.2,19.25,43.42,4.62,1.5,0.51,4.7 26 | A,24115,33.19,18.05,41.88,5.22,1.7,1.66,4.56 27 | A,34012,30.43,19.78,44.2,4.8,1.61,0.79,4.8 28 | A,34006,28.93,19.99,45.2,4.78,1.62,1.1,4.91 29 | A,24146,30.41,18.71,43.99,4.86,1.62,2.03,4.79 30 | A,24138,29.62,21.1,43.37,5.05,1.69,0.86,4.78 31 | B,14015,49.57,13.7,29.07,3.62,1.1,4.04,3.33 32 | B,14006,52.68,14.38,25.72,3.26,0.93,3.96,3.05 33 | B,14024,48.53,13.14,30.38,3.55,0.99,4.4,3.44 34 | B,14052,50.19,13.78,28.39,3.56,1,4.08,3.27 35 | B,14062,50.67,13.21,27.66,3.64,1,4.82,3.21 36 | B,14047,49.99,13.35,29.2,3.52,1.05,3.94,3.32 37 | B,14074,50.72,12.93,29.88,3.6,1.03,2.87,3.32 38 | B,14083,50.81,12.56,29.95,2.99,0.81,3.69,3.35 39 | B,14094,54.08,13.28,25.25,3.1,0.8,4.29,2.98 40 | B,14124,51.9,14.27,24.92,3.85,1.06,5.06,3.02 41 | B,24019,50.33,13.96,29.25,3.42,0.96,3.04,3.31 42 | B,24012,49.69,13.63,29.59,3.41,0.98,3.68,3.36 43 | B,14132,51.12,14.02,27.37,3.71,1.11,3.78,3.18 44 | B,14146,49.77,13.24,28.91,3.59,1.06,4.49,3.31 45 | 
B,14149,54.96,14.26,22.99,3.19,0.9,4.6,2.82 46 | B,14161,55.11,14.87,21.9,3.29,0.86,4.83,2.76 47 | B,14113,54.12,14.06,24.95,3.14,0.82,3.73,2.96 48 | B,24008,49.34,13.79,29.57,3.52,0.95,3.78,3.36 49 | B,24029,50.65,13.14,28.79,3.73,1.09,3.69,3.26 50 | B,24045,52.46,14.18,24.6,3.57,1.12,5.19,2.99 51 | B,24040,49.96,13.4,28.84,3.48,0.98,4.32,3.3 52 | B,24093,49.57,13.17,29.39,3.59,1.06,4.28,3.35 53 | B,24083,49.34,13.06,29.46,3.51,1.04,4.63,3.36 54 | B,24049,50.87,13.85,27.64,3.71,1.1,3.93,3.2 55 | B,24101,51.03,13.9,27.56,3.73,1.08,3.78,3.19 56 | B,34011,53.98,14.05,24.73,3.32,0.92,3.92,2.94 57 | B,24125,52.23,13.64,27.04,3.57,0.98,3.52,3.12 58 | B,24145,51.74,13.95,27.75,3.6,1.04,2.96,3.17 59 | B,24151,51.52,13.72,28.28,3.62,1.05,2.86,3.21 60 | B,34005,51.86,13.13,28.82,2.94,0.8,3.25,3.25 61 | B,24118,51.75,13.18,28.38,3.04,0.86,3.65,3.23 62 | C,14058,48.4,26.05,21.4,3.44,0.5,0.71,3 63 | C,14022,48.69,28.48,17.37,3.53,0.43,1.93,2.78 64 | C,14036,48.88,25.23,20.89,3.22,0.47,1.78,2.96 65 | C,14093,48.97,25.63,19.28,3.48,0.55,2.64,2.87 66 | C,14080,48.84,23.98,23.37,3,0.52,0.81,3.09 67 | C,14029,49.73,25.65,19.98,2.51,0.52,2.13,2.91 68 | C,14106,50.18,28.3,15.79,3.47,0.45,2.26,2.64 69 | C,14014,48.15,27.98,18.69,3.58,0.48,1.7,2.87 70 | C,14073,49.72,27.31,16.89,3.08,0.25,3,2.73 71 | C,14151,51.59,26.24,16.41,3.61,0.6,2.15,2.61 72 | C,14162,52.26,26.31,14.77,3.51,0.53,3.15,2.51 73 | C,14139,49.35,25.23,20.03,3.02,0.44,2.37,2.91 74 | C,14115,47.91,26.03,21.54,3.71,0.6,0.81,3.01 75 | C,24006,47.83,25.82,20.79,3.33,0.54,2.23,2.99 76 | C,24020,47.9,25.55,21.1,3.04,0.43,2.41,3.02 77 | C,24031,49.1,24.53,21.08,2.84,0.34,2.45,2.98 78 | C,24038,50.04,24.13,19.75,3.21,0.52,2.87,2.86 79 | C,24043,52.19,26,16.64,4.17,0.61,1,2.58 80 | C,24152,47.11,26.17,21.29,3.36,0.48,2.07,3.05 81 | C,24144,48.48,26.76,19.99,3.64,0.46,1.13,2.91 82 | C,24135,52.22,26.25,16.45,3.92,0.38,1.16,2.58 83 | C,24124,49.57,26.91,18,2.21,0.41,3.31,2.83 84 | C,24113,51.71,24.98,17.2,3.01,0.34,3.1,2.67 85 | C,24058,49.27,27.42,17.42,3.05,0.33,2.84,2.78 86 | C,24091,47.25,23.95,24.24,3.47,0.57,1.09,3.18 87 | C,34010,50.98,26.34,16.47,3.2,0.43,3.01,2.66 88 | C,34003,49.57,25.46,20.79,3.04,0.37,1.14,2.94 89 | D,14092,46.64,21.38,24.96,4.6,0.77,2.42,3.2 90 | D,14081,45.93,21.6,25.87,4.51,0.73,2.09,3.28 91 | D,14072,47.6,22.07,21.13,4.07,0.72,5.13,2.99 92 | D,14116,47.61,22.44,19.61,4.06,0.6,6.28,2.91 93 | D,14128,46.91,21.79,21.17,4.06,0.76,6.07,3.02 94 | D,14107,46.88,21.71,23.6,4.59,0.75,3.22,3.12 95 | D,14050,47.49,21.75,20.83,4.01,0.67,5.92,2.98 96 | D,14013,48.03,21.96,20.88,4.02,0.7,5.11,2.96 97 | D,14004,49.16,27.99,17.49,3.29,0.39,2.07,2.78 98 | D,14003,47.17,22.29,21.3,4.08,0.74,5.16,3.02 99 | D,14037,47.29,21.48,21.69,4.03,0.67,5.51,3.03 100 | D,14023,47.53,21.11,21.54,4.02,0.7,5.8,3.02 101 | D,14059,47.86,22.25,19.53,4.04,0.68,6.32,2.9 102 | D,14163,48.09,22.65,21.59,5.22,0.93,2.45,2.95 103 | D,14150,47.73,22.38,21.39,5.21,0.99,3.29,2.95 104 | D,14140,48.44,22.73,21.05,5.22,0.98,2.56,2.91 105 | D,24007,47.43,22.13,21.01,4.09,0.72,5.34,2.99 106 | D,24021,47.68,21.84,20.45,4.06,0.71,5.97,2.95 107 | D,24011,48.05,22.05,20.57,4.04,0.7,5.29,2.94 108 | D,24030,48.01,21.31,21.05,4.01,0.73,5.62,2.97 109 | D,34009,47.45,22.37,20.97,4.06,0.7,5.15,2.99 110 | D,24039,47.8,22.36,20.39,4.02,0.7,5.43,2.95 111 | D,24044,48.31,22.49,19.53,4.18,0.62,5.49,2.88 112 | D,34004,46.19,21.19,25.18,4.66,0.8,2.78,3.23 113 | D,24153,48.81,22.43,18.68,4.1,0.72,5.98,2.82 114 | D,24143,48.89,22.95,21.93,5.26,0.85,0.97,2.93 115 | 
D,24134,47.91,22.22,20.4,4.07,0.56,5.4,2.94 116 | D,24123,46.28,21.51,25.44,4.58,0.6,2.19,3.24 117 | D,24114,46.29,21.43,26,4.71,0.69,1.57,3.26 118 | D,24084,47.03,20.84,25.68,4.52,0.69,1.93,3.22 119 | D,24092,46.8,20.7,25.1,4.55,0.7,2.85,3.2 120 | D,24043,52.19,26,16.64,4.17,0.61,1,2.58 121 | E,14089,34.58,7.44,16.24,1.31,0.39,40.43,3.38 122 | E,14056,36.84,7.77,17.07,1.37,0.4,36.95,3.33 123 | E,14099,35.14,8.05,15.77,1.38,0.41,39.66,3.33 124 | E,14033,39.25,8.67,4.44,1.54,0.51,46.1,2.59 125 | E,14063,34.51,7.75,14.87,1.42,0.42,41.45,3.31 126 | E,14029,39.59,8.36,4.39,1.52,0.48,46.14,2.58 127 | E,14039,34.94,7.81,13.67,1.36,0.4,42.22,3.23 128 | E,14142,39.36,8.1,16.44,1.45,0.44,34.65,3.19 129 | E,14122,36.04,7.74,15.49,1.45,0.45,39.28,3.27 130 | E,14078,36.54,7.75,15.67,1.43,0.44,38.61,3.26 131 | E,14126,37.78,8.3,13.05,1.64,0.49,39.23,3.08 132 | E,14109,35.3,7.92,13.85,1.46,0.4,41.47,3.22 133 | E,14155,34.47,7.62,19.07,1.44,0.44,37.4,3.52 134 | E,14160,33.24,7.54,19.56,1.32,0.43,38.34,3.6 135 | E,24110,37.34,7.33,19.61,1.6,0.45,34.12,3.42 136 | E,24088,37.59,7.93,13.58,1.43,0.45,39.47,3.12 137 | E,24053,36.5,7.52,12.46,1.51,0.47,42.01,3.1 138 | E,24110,37.34,7.33,19.61,1.6,0.45,34.12,3.42 139 | E,24035,34.31,7.98,14.54,1.46,0.49,41.71,3.3 140 | E,24099,36.69,7.8,14.77,1.46,0.46,39.28,3.21 141 | E,24140,34.23,7.75,17.94,1.61,0.44,38.47,3.46 142 | E,34031,35.54,7.47,17.67,1.44,0.47,37.88,3.4 143 | E,34030,35.21,6.98,20.02,1.35,0.46,36.44,3.54 144 | E,34032,33.65,7.11,19.5,1.48,0.45,38.26,3.57 145 | E,24142,34.77,7.26,18.8,1.58,0.43,37.59,3.49 146 | E,34033,37.32,7.4,8.18,1.63,0.53,45.47,2.85 147 | E,14126,37.78,8.3,13.05,1.64,0.49,39.23,3.08 148 | E,24104,34.48,7.54,13.93,1.45,0.44,42.6,3.26 149 | F,34037,28.03,7.65,18.39,1.53,0.49,44.4,3.74 150 | F,14054,30.09,7.99,15.16,1.46,0.48,45.3,3.5 151 | F,14118,29.79,8.17,14.35,1.49,0.46,46.2,3.46 152 | F,14110,30.07,8.02,20.39,1.45,0.45,40.07,3.76 153 | F,14096,28.46,7.7,18.88,1.4,0.43,43.56,3.75 154 | F,14079,30.29,8.09,14.82,1.52,0.51,45.28,3.47 155 | F,14065,28.66,7.67,16.12,1.41,0.43,46.14,3.6 156 | F,14084,30.96,8.31,13.42,1.49,0.43,45.82,3.37 157 | F,14018,29.78,8.2,14.51,1.5,0.5,46.01,3.47 158 | F,14038,30.28,7.76,16.04,1.4,0.42,44.52,3.53 159 | F,14152,29.92,8.11,19.23,1.51,0.48,41.23,3.7 160 | F,14143,29.54,7.79,15.08,1.41,0.45,46.18,3.52 161 | F,14130,28.93,8.18,19.35,1.39,0.58,42.15,3.75 162 | F,14165,29.89,7.95,15.08,1.45,0.47,45.63,3.5 163 | F,24035,27.65,7.78,17.3,1.29,0.4,46.25,3.72 164 | F,24049,28.33,7.82,17.96,1.41,0.45,44.48,3.71 165 | F,24042,29.1,8.07,20.05,1.45,0.45,41.33,3.78 166 | F,24046,29.59,8.05,14.07,1.44,0.45,46.22,3.49 167 | F,24055,27.93,7.88,17.49,1.44,0.47,45.26,3.96 168 | F,24086,30.53,8.02,14.17,1.49,0.47,45.79,3.43 169 | F,24097,30.68,8.11,12.92,1.56,0.47,46.73,3.36 170 | F,24103,30.15,8.06,12.23,1.5,0.47,48.06,3.35 171 | F,24109,29.3,8.02,16.34,1.43,0.45,44.91,3.59 172 | F,24150,29.69,7.63,15.71,1.63,0.46,45.34,3.53 173 | F,24157,29,7.51,17.78,1.58,0.43,44.13,3.67 174 | F,34035,28.98,7.7,16.67,1.48,0.47,45.17,3.62 175 | F,24106,28.84,7.88,17.21,1.42,0.46,44.65,3.65 176 | F,34036,28.34,7.6,18.51,1.45,0.46,44.1,3.73 177 | F,34034,28.36,7.62,19.29,1.45,0.47,43.28,3.77 178 | F,24137,30.97,7.6,14.22,1.78,0.45,45.43,3.4 179 | G,14145,28.15,8.23,15.45,1.41,0.45,46.76,3.59 180 | G,24015,28.35,8.19,14.9,1.4,0.43,47.16,3.56 181 | G,24004,30.85,8.03,13.67,1.41,0.42,46.04,3.39 182 | G,14095,28.21,8.3,15,1.41,0.43,47.08,3.57 183 | G,14153,28.83,8.26,20.1,1.37,0.42,41.44,3.8 184 | 
G,14017,28.29,8.05,16.72,1.31,0.43,45.63,3.65 185 | G,14010,28.68,8.3,16.07,1.41,0.45,45.54,3.6 186 | G,14055,27.71,8.28,15.62,1.53,0.5,46.86,3.61 187 | G,14040,28.03,8.27,16.4,1.37,0.41,45.93,3.64 188 | G,14026,28.13,8.34,16.19,1.42,0.45,45.92,3.63 189 | G,14119,28.09,8.42,14.06,1.47,0.45,47.96,3.52 190 | G,14131,28.19,8.57,14.16,1.76,0.42,47.32,3.51 191 | G,14112,28.63,8.42,15.24,1.43,0.43,46.28,3.56 192 | G,14077,33.09,7.87,12.07,1.37,0.44,45.6,3.23 193 | G,14066,26.33,8.03,19.98,1.43,0.45,44.23,3.89 194 | G,14085,27.28,8.55,15.18,1.51,0.46,47.48,3.61 195 | G,14166,27.56,8.25,14.65,1.45,0.46,48.09,3.57 196 | G,24022,27.72,8.06,15.34,1.35,0.42,47.53,3.6 197 | G,24041,30.63,8.21,17.33,1.39,0.42,42.44,3.59 198 | G,24047,29.06,8.46,14.12,1.47,0.47,46.89,3.48 199 | G,24033,28.55,8.25,17.3,1.48,0.47,44.42,3.66 200 | G,24119,27.16,8.27,14.68,1.79,0.46,48.1,3.58 201 | G,24121,28.33,8.17,13.64,1.45,0.47,48.41,3.49 202 | G,24156,26.19,7.99,17.53,1.42,0.44,46.87,3.77 203 | G,34008,26.45,7.89,17.97,1.3,0.39,46.39,3.79 204 | G,34018,27.72,8.24,15.16,1.46,0.45,47.42,3.59 205 | G,24059,25,8.49,16.87,1.45,0.47,48.19,3.79 206 | G,34013,29.14,8.46,12.25,1.51,0.46,48.64,3.39 207 | G,24149,28.64,8.01,16.02,1.43,0.45,45.9,3.6 208 | H,14127,36.12,8.33,11.9,1.42,0.39,42.23,3.09 209 | H,24001,35.1,8.04,19.07,1.36,0.41,36.43,3.5 210 | H,14144,35.48,7.66,15.39,1.37,0.46,40.1,3.3 211 | H,14123,35.66,9.04,6.24,1.5,0.42,47.56,2.83 212 | H,14111,35.61,8.07,16.15,1.41,0.4,38.81,3.33 213 | H,14088,35.68,8.01,16.36,1.3,0.39,38.65,3.34 214 | H,14028,40.72,8.34,4.96,1.42,0.43,44.56,2.56 215 | H,14009,35.55,7.32,16.4,1.76,0.36,38.97,3.33 216 | H,14007,33.05,7.34,15.78,1.34,0.42,42.49,3.41 217 | H,14057,33.75,8.07,14.93,1.31,0.4,41.94,3.34 218 | H,14032,38.84,8.55,4.38,1.41,0.43,46.82,2.61 219 | H,14100,33.44,7.45,18.49,1.39,0.4,39.23,3.53 220 | H,14041,36.43,8.67,15.05,1.35,0.38,38.5,3.24 221 | H,14064,35.84,7.9,14.09,1.25,0.42,40.92,3.22 222 | H,14076,36.63,8.38,12.59,1.36,0.41,41.04,3.11 223 | H,14159,35.5,8.11,13.45,1.34,0.42,41.59,3.2 224 | H,14156,36.87,7.82,14.58,1.32,0.4,39.41,3.2 225 | H,24009,35.5,7.51,15.63,1.42,0.42,39.94,3.3 226 | H,24016,33.9,7.76,15.84,1.3,0.42,41.2,3.38 227 | H,24024,35.8,7.82,14.08,1.43,0.42,40.87,3.21 228 | H,24089,35.74,7.86,14.27,1.4,0.44,40.73,3.23 229 | H,24054,35.74,7.47,19.14,1.34,0.4,36.31,3.47 230 | H,24037,35.99,8.23,16.6,1.4,0.45,37.78,3.33 231 | H,24034,35.5,7.9,13.83,1.41,0.46,41.36,3.22 232 | H,24120,35.91,7.41,16.41,1.75,0.43,38.52,3.31 233 | H,34028,36.01,7.7,15.6,1.37,0.43,39.32,3.28 234 | H,34007,36.73,7.42,18.16,1.17,0.39,36.52,3.39 235 | H,24139,35.86,7.47,14.72,1.67,0.4,40.28,3.23 236 | H,34029,36.32,8.06,12.54,1.35,0.43,41.73,3.12 237 | H,24100,34.61,7.17,17.88,1.29,0.4,39.05,3.46 238 | H,24132,36.67,7.81,9.34,1.64,0.44,44.54,2.93 239 | H,24112,35.94,7.97,13.5,1.45,0.42,41.14,3.18 240 | H,24089,35.74,7.86,14.27,1.4,0.44,40.73,3.23 241 | I,14134,54.64,10.36,12.89,2.21,0.53,19.9,2.37 242 | I,14046,54.52,9.85,13.55,2.04,0.47,20.05,2.42 243 | I,14019,53.84,10.22,13.05,2.07,0.48,20.82,2.42 244 | I,24107,54.32,10.66,14.04,2.03,0.46,18.95,2.45 245 | I,24111,52.9,10.19,14.35,2.02,0.49,20.54,2.52 246 | I,24128,55,11.11,11.3,2.05,0.47,20.54,2.28 247 | I,24052,56.24,9.06,11.49,1.93,0.48,21.28,2.25 248 | I,14068,56.25,9.96,13.22,2.05,0.46,18.52,2.33 249 | I,14157,54.43,9.61,12.18,2.16,0.51,21.62,2.35 250 | I,14158,53.69,11,15.23,2.25,0.55,17.83,2.52 251 | I,14102,54.69,10.32,13.77,2.15,0.48,19.07,2.41 252 | I,14103,53.16,11.17,13.83,2.18,0.47,19.66,2.48 253 | 
I,14031,55.15,10.92,12.51,2.2,0.51,19.22,2.33 254 | I,14067,54.87,10.78,13.65,2.08,0.5,18.62,2.4 255 | I,14030,55.43,10.46,12.37,2.17,0.51,19.57,2.31 256 | I,24023,55.6,9.69,13.89,1.99,0.48,18.83,2.39 257 | I,14114,54.51,9.93,12.66,2.07,0.53,20.83,2.37 258 | I,14098,54.74,10.18,12.6,2.06,0.48,20.42,2.36 259 | I,14125,54.17,10.67,12.18,2.08,0.49,20.9,2.36 260 | I,24048,52.75,10.78,12.98,2.12,0.49,21.37,2.45 261 | I,34023,54.36,11.43,13.09,2.24,0.48,18.88,2.39 262 | I,34022,54.28,10.75,13.87,2.13,0.46,18.97,2.4 263 | I,34021,54.54,10.4,13.22,2.1,0.47,19.74,2.4 264 | I,24141,54.06,10.68,13.53,2.04,0.46,19.69,2.43 265 | I,34019,57.22,9.66,10.95,2.04,0.47,20.13,2.18 266 | I,34027,54.8,10.57,13.42,2.08,0.5,19.13,2.4 267 | I,34026,54.17,10.13,13.25,2.07,0.46,20.38,2.41 268 | I,34025,53.57,10.73,12.78,2.16,0.48,20.76,2.41 269 | I,34024,55.29,9.84,12.91,2.08,0.51,19.88,2.35 270 | J,34042,46.16,10.34,16.14,2.47,0.67,24.89,2.86 271 | J,34045,43.8,10.97,16.49,2.46,0.65,26.28,2.97 272 | J,14044,47.6,10.43,15.18,2.32,0.56,24.47,2.76 273 | J,14045,46.84,9.91,15.5,2.27,0.57,25.48,2.81 274 | J,14042,46.1,9.87,15.97,2.19,0.53,25.87,2.87 275 | J,14043,47.84,10.16,14.56,2.27,0.54,25.17,2.72 276 | J,24065,45.86,10.5,17.07,2.33,0.61,24.24,2.93 277 | J,24064,46.55,10.75,16.72,2.24,0.61,23.74,2.88 278 | J,24063,46.13,10.71,17.24,2.36,0.61,23.56,2.92 279 | J,24062,48.58,9.76,16.01,2.3,0.6,23.35,2.77 280 | J,24074,44.76,12.91,15.56,2.34,0.61,24.43,2.89 281 | J,24073,46.13,10.84,13.99,2.38,0.64,26.66,2.76 282 | J,24072,47.43,10.45,16.5,2.35,0.61,23.27,2.83 283 | J,24071,46.22,11.26,15.93,2.47,0.63,24.12,2.85 284 | J,24068,45.2,10.67,16.38,2.44,0.6,25.31,2.91 285 | J,24078,47.05,10.46,15.16,2.34,0.64,24.99,2.78 286 | J,24056,43.45,10.81,19.49,2.51,0.68,23.74,3.14 287 | J,34038,47.74,10.23,16.31,2.5,0.67,23.22,2.81 288 | J,34041,45.17,10.42,17.2,2.34,0.61,24.87,2.96 289 | J,34040,44.77,10.52,16.12,2.44,0.66,26.15,2.92 290 | J,24067,47.91,10.75,16,2.4,0.64,22.94,2.79 291 | J,24066,45.69,10.23,16.5,2.3,0.59,25.28,2.91 292 | J,24076,46.12,10.3,16.38,2.35,0.61,24.85,2.88 293 | J,24077,47.35,10.31,15.45,2.34,0.62,24.55,2.78 294 | J,24070,45.21,9.39,16.23,2.14,0.55,27.03,2.92 295 | J,24075,46.34,10,17.73,2.32,0.59,23.61,2.94 296 | J,34043,44.07,10.96,18.39,2.56,0.66,24.02,3.05 297 | J,34044,44.91,11.07,17,2.49,0.66,25.36,2.91 298 | J,24069,43.15,11.79,18.46,2.43,0.67,24.17,3.1 299 | J,34039,44.55,11.01,16.03,2.43,0.64,25.98,2.92 300 | J,14044,47.6,10.43,15.18,2.32,0.56,24.47,2.76 301 | J,14045,46.84,9.91,15.5,2.27,0.57,25.48,2.81 302 | -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.1. Support Vector Machines/SVM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1e0b81b2", 6 | "metadata": {}, 7 | "source": [ 8 | "# Support Vector Machines (SVM)\n", 9 | "\n", 10 | "_Welcome back! 
Today we’ll master Support Vector Machines — powerful, margin‑based classifiers that shine in high‑dimensional spaces._\n", 11 | "\n", 12 | "- **Linear SVM** — maximum‑margin hyperplanes\n", 13 | "- **Soft margins & hinge loss** — robust to overlap and noise\n", 14 | "- **Dual form & kernels** — non‑linear decision boundaries via the kernel trick\n", 15 | "- **Practical tuning** — scaling, `C`, `gamma`, and when to use `LinearSVC` vs `SVC`.\n", 16 | "\n", 17 | "---\n", 18 | "\n", 19 | "## What you’ll learn\n", 20 | "\n", 21 | "- The margin idea and why “wider is better.”\n", 22 | "- Hard-margin vs. soft-margin SVMs.\n", 23 | "- Primal, hinge-loss view; dual and kernels.\n", 24 | "- How to tune `C`, `gamma`, and choose a kernel.\n", 25 | "- Minimal scikit-learn code for linear and kernel SVMs.\n", 26 | "\n", 27 | "---\n", 28 | "\n", 29 | "## 1) The SVM idea — separate with the **widest margin**\n", 30 | "\n", 31 | "Think of a line (or plane) that splits the classes. Among all lines that correctly split them, prefer the one that leaves the **largest gap** to the nearest points of either class. That gap is the **margin**. A larger margin usually means a simpler, more robust boundary that generalizes better.\n", 32 | "\n", 33 | "Given labeled points $(x_i, y_i)$ with $y_i \\in \\{-1,+1\\}$, SVM finds a hyperplane\n", 34 | "\n", 35 | "$$\n", 36 | "f(x) = w^\\top x + b\n", 37 | "$$\n", 38 | "\n", 39 | "that separates the classes while **maximizing the margin**\n", 40 | "\n", 41 | "- **Geometric margin** of a point: $\\displaystyle \\gamma_i = \\frac{y_i (w^\\top x_i + b)}{\\lVert w\\rVert}$\n", 42 | "- **Margin width** between class boundaries: $\\displaystyle \\frac{2}{\\lVert w\\rVert}$ \n", 43 | " → Max margin $\\Longleftrightarrow$ minimize $\\lVert w\\rVert$ (subject to correct classification).\n", 44 | "\n", 45 | "There are two types of SVMs:\n", 46 | "\n", 47 | "- **Support Vector Classification** (SVC): in scikit-learn, $SVC$ is the kernel SVM classifier, while LinearSVC is the fast linear-only solver. It is for classification tasks\n", 48 | "- **Support Vector Regression** (SVR): for regression tasks\n", 49 | "\n", 50 | "---\n", 51 | "\n", 52 | "## 2) Hard‑margin SVM (separable case)\n", 53 | "\n", 54 | "Here, every point must be on the correct side with **room to spare** (at least distance 1 in the scaled units). Minimizing $\\tfrac12\\lVert w\\rVert^2$ is equivalent to **maximizing the margin**. 
This version only works when data are perfectly separable; a single mislabeled or noisy point can break feasibility.\n", 55 | "\n", 56 | "**Optimization:**\n", 57 | "\n", 58 | "$$\n", 59 | "\\begin{aligned}\n", 60 | "\\min_{w,b}\\quad & \\tfrac12 \\lVert w\\rVert^2 \\\\\n", 61 | "\\text{s.t.}\\quad & y_i (w^\\top x_i + b) \\ge 1,\\quad i=1,\\dots,n\n", 62 | "\\end{aligned}\n", 63 | "$$\n", 64 | "\n", 65 | "- Constraints enforce that every point sits **outside** the margin band.\n", 66 | "- Works only when data are perfectly separable.\n", 67 | "\n", 68 | "---\n", 69 | "\n", 70 | "## 3) Soft‑margin SVM (realistic case)\n", 71 | "\n", 72 | "Real data must overlap.\n", 73 | "\n", 74 | "We introduce **slack** variables $\\xi_i \\ge 0$ that measures how much a point breaks the margin rule:\n", 75 | "\n", 76 | "- $\\xi_i=0$ means safely outside\n", 77 | "- $0<\\xi_i<1$ means inside the margin but on the correct side\n", 78 | "- $\\xi_i>1$ means misclassified.\n", 79 | "\n", 80 | "$$\n", 81 | "\\begin{aligned}\n", 82 | "\\min_{w,b,\\xi}\\quad & \\tfrac12 \\lVert w\\rVert^2 + C \\sum_{i=1}^n \\xi_i \\\\\n", 83 | "\\text{s.t.}\\quad & y_i (w^\\top x_i + b) \\ge 1 - \\xi_i,\\quad \\xi_i \\ge 0\n", 84 | "\\end{aligned}\n", 85 | "$$\n", 86 | "\n", 87 | "- The constant **C>0** balances two desires: keep the margin wide (small $\\lVert w\\rVert$), yet don’t allow too many/too large violations (small $\\sum\\xi_i$):\n", 88 | " - large $C$ → penalize violations heavily (lower bias, higher variance, risking overfit)\n", 89 | " - small $C$ → wider margin, more violations allowed (higher bias, lower variance, smoother, possibly underfit)\n", 90 | "\n", 91 | "**Hinge‑loss view (equivalent):**\n", 92 | "\n", 93 | "$$\n", 94 | "\\min_{w,b}\\quad \\frac{\\lambda}{2}\\lVert w\\rVert^2 + \\frac{1}{n}\\sum_{i=1}^n \\max\\!\\big(0, 1 - y_i(w^\\top x_i + b)\\big),\n", 95 | "$$\n", 96 | "\n", 97 | "with $\\lambda$ inversely related to $C$ (roughly, $\\lambda \\approx 1/(nC)$ in many libraries).\n", 98 | "\n", 99 | "---\n", 100 | "\n", 101 | "## 4) Support vectors & the decision function\n", 102 | "\n", 103 | "Only points that lie **on or inside** the margin influence the final classifier; these are the **support vectors**. Points far from the boundary have zero hinge loss and do not change $w,b$. This is why SVM solutions are often sparse: the model depends on a subset of the training data.\n", 104 | "\n", 105 | "Points with zero loss **away from the margin** don’t affect the solution. The model depends only on a subset — the **support vectors** — that lie **on or inside** the margin band.\n", 106 | "\n", 107 | "Prediction:\n", 108 | "\n", 109 | "$$\n", 110 | "\\hat y = \\mathrm{sign}(w^\\top x + b).\n", 111 | "$$\n", 112 | "\n", 113 | "---\n", 114 | "\n", 115 | "## 5) Dual problem & the kernel trick (non‑linear SVM)\n", 116 | "\n", 117 | "The dual re-expresses the problem in terms of **similarities between pairs of points** via a kernel $K(x_i,x_j)$. Replacing dot-products with kernels lets the classifier act as if data were mapped into a higher-dimensional space **without computing that mapping explicitly** (the “kernel trick”). 
The prediction becomes a weighted sum over support vectors: only those with $\\alpha_i>0$ matter.\n", 118 | "\n", 119 | "The Lagrange dual of the soft‑margin problem (for kernel $K$) is:\n", 120 | "\n", 121 | "$$\n", 122 | "\\begin{aligned}\n", 123 | "\\max_{\\alpha}\\quad & \\sum_{i=1}^n \\alpha_i - \\frac12 \\sum_{i=1}^n \\sum_{j=1}^n \\alpha_i \\alpha_j y_i y_j\\, K(x_i,x_j) \\\\\n", 124 | "\\text{s.t.}\\quad & 0 \\le \\alpha_i \\le C,\\quad \\sum_{i=1}^n \\alpha_i y_i = 0\n", 125 | "\\end{aligned}\n", 126 | "$$\n", 127 | "\n", 128 | "The decision function becomes:\n", 129 | "\n", 130 | "$$\n", 131 | "f(x) = \\sum_{i=1}^n \\alpha_i y_i\\, K(x_i, x) + b.\n", 132 | "$$\n", 133 | "\n", 134 | "**Common kernels $K(x,z)$**\n", 135 | "\n", 136 | "- **Linear:** $x^\\top z$ (useful for very high‑dimensional sparse features; scalable with `LinearSVC`).\n", 137 | "- **RBF (Gaussian):** $\\exp(-\\gamma \\lVert x - z\\rVert^2)$ with $\\gamma > 0$\n", 138 | "- **Polynomial:** $(\\gamma\\, x^\\top z + r)^d$ (degree $d$)\n", 139 | "- **Sigmoid:** $\\tanh(\\gamma\\, x^\\top z + r)$ (less common)\n", 140 | "\n", 141 | "**Hyperparameters**\n", 142 | "\n", 143 | "- `C` — regularization (as above)\n", 144 | "- `gamma` — RBF/poly scale; large `gamma` → tighter, more wiggly boundaries; small `gamma` → smoother\n", 145 | "\n", 146 | "---\n", 147 | "\n", 148 | "## 6) Best practices\n", 149 | "\n", 150 | "- **Scale features** (standardize) — SVMs are distance‑based.\n", 151 | "- Start with **linear SVM** for many features/large $n$ (`LinearSVC` or `SGDClassifier(loss=\"hinge\")`).\n", 152 | "- Use **RBF SVC** for moderate $n$ when nonlinearity helps.\n", 153 | "- For **imbalanced** data, set `class_weight=\"balanced\"` or provide weights.\n", 154 | "- Enable probability estimates (Platt scaling) via `probability=True` in `SVC` (costs extra fitting).\n", 155 | "\n", 156 | "---\n", 157 | "\n", 158 | "## 7) Minimal code — linear and RBF SVM (scikit‑learn)\n", 159 | "\n", 160 | "The flow below is: make data → train/test split → **pipeline** with `StandardScaler` → fit a **LinearSVC** (fast for large/high‑dimensional sets). Then try an **RBF SVC** and use a tiny **grid search** to pick `C` and `gamma`. `LinearSVC(dual=\"auto\")` chooses an efficient solver depending on the feature/sample ratio." 
161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "264c5c42", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "import numpy as np\n", 171 | "from sklearn.datasets import make_classification\n", 172 | "from sklearn.model_selection import train_test_split, GridSearchCV\n", 173 | "from sklearn.preprocessing import StandardScaler\n", 174 | "from sklearn.pipeline import make_pipeline\n", 175 | "from sklearn.svm import LinearSVC, SVC\n", 176 | "from sklearn.metrics import accuracy_score, classification_report\n", 177 | "\n", 178 | "# Data\n", 179 | "X, y = make_classification(n_samples=2000, n_features=50, n_informative=10,\n", 180 | " class_sep=1.5, random_state=0)\n", 181 | "Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.25, random_state=0, stratify=y)\n", 182 | "\n", 183 | "# 1) Linear SVM (good for large & high-dim data)\n", 184 | "lin_clf = make_pipeline(StandardScaler(), LinearSVC(class_weight=\"balanced\"))\n", 185 | "lin_clf.fit(Xtr, ytr)\n", 186 | "print(\"LinearSVC acc:\", accuracy_score(yte, lin_clf.predict(Xte)))\n", 187 | "\n", 188 | "# 2) RBF-kernel SVM with small grid search (for moderate-sized data)\n", 189 | "rbf_pipe = make_pipeline(StandardScaler(), SVC(kernel=\"rbf\"))\n", 190 | "param_grid = {\"svc__C\": [0.1, 1, 10], \"svc__gamma\": [\"scale\", 0.01, 0.1]}\n", 191 | "grid = GridSearchCV(rbf_pipe, param_grid, cv=3, n_jobs=-1)\n", 192 | "grid.fit(Xtr, ytr)\n", 193 | "print(\"Best RBF params:\", grid.best_params_)\n", 194 | "print(\"RBF acc:\", accuracy_score(yte, grid.predict(Xte)))\n", 195 | "print(classification_report(yte, grid.predict(Xte)))" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "id": "fa0aad12", 201 | "metadata": {}, 202 | "source": [ 203 | "---\n", 204 | "\n", 205 | "## 8) (Optional) Hinge‑loss SGD — from scratch (toy)\n", 206 | "\n", 207 | "This toy optimizer does **stochastic subgradient descent** on the hinge loss plus $\\ell_2$ penalty. If a sample is correctly classified with margin $\\ge 1$, we only apply weight decay. If it violates the margin, we also step in the direction that reduces the hinge loss. This mirrors what large‑scale linear SVM libraries do under the hood." 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "id": "652686aa", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "import numpy as np\n", 218 | "\n", 219 | "def sgd_linear_svm(X, y, lr=0.1, lam=1e-3, epochs=10):\n", 220 | " # y in {-1, +1}\n", 221 | " n, d = X.shape\n", 222 | " w = np.zeros(d); b = 0.0\n", 223 | " for _ in range(epochs):\n", 224 | " idx = np.random.permutation(n)\n", 225 | " for i in idx:\n", 226 | " margin = y[i]*(X[i] @ w + b)\n", 227 | " if margin < 1:\n", 228 | " # subgradient of hinge + L2\n", 229 | " w = (1 - lr*lam)*w + lr*y[i]*X[i]\n", 230 | " b = b + lr*y[i]\n", 231 | " else:\n", 232 | " w = (1 - lr*lam)*w\n", 233 | " return w, b" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "72a3aaf5", 239 | "metadata": {}, 240 | "source": [ 241 | "---\n", 242 | "\n", 243 | "## 9) How to choose `C`, `gamma`, and the kernel\n", 244 | "\n", 245 | "1. **Start simple:** linear vs RBF; pick by validation.\n", 246 | "2. **Grid search / log‑scale**: Use a small **log‑grid** `C ∈ {0.01, 0.1, 1, 10, 100}`, `gamma ∈ {\"scale\", 0.001, 0.01, 0.1, 1}`.\n", 247 | "3. **Watch for overfit:** very high training acc + lower validation acc → reduce `C` or `gamma`.\n", 248 | "4. 
**Speed:** If training is slow with kernel SVMs, reduce the grid size first and/or switch to `LinearSVC`.\n", 249 | "\n", 250 | "---\n", 251 | "\n", 252 | "## 10) Multiclass strategies\n", 253 | "\n", 254 | "SVMs are inherently binary. For $K$ classes:\n", 255 | "\n", 256 | "- **One‑vs‑One (OvO):** train $K(K-1)/2$ binary classifiers (default in `SVC`). It compares every pair of classes and tends to work well when classes are balanced\n", 257 | "- **One‑vs‑Rest (OvR):** train $K$ classifiers vs the rest (default in `LinearSVC`). It is simpler and pairs well with linear models on high‑dimensional data.\n", 258 | "\n", 259 | "---\n", 260 | "\n", 261 | "## 11) FAQs & Gotchas\n", 262 | "\n", 263 | "Probability outputs from `SVC` come from an extra calibration step (Platt scaling); they can look conservative on small datasets. For speed problems, try fewer features, subsampling, or a linear model. Always re‑check that features are standardized.\n", 264 | "\n", 265 | "- **“My SVM is slow.”** → Too many samples with kernel SVC; try `LinearSVC` or sub-sample + RBF.\n", 266 | "- **“Predicted probabilities look odd.”** → They’re calibrated via Platt scaling; try `CalibratedClassifierCV`.\n", 267 | "- **“Decision boundary is jagged.”** → Likely large `gamma` (RBF) or very large `C`; reduce them and re‑scale features.\n", 268 | "- **“Imbalanced classes.”** → Use `class_weight=\"balanced\"` and evaluate with F1/ROC‑AUC, not just accuracy.\n", 269 | "\n", 270 | "---\n", 271 | "\n", 272 | "## 12) Quick cheat sheet\n", 273 | "\n", 274 | "Linear SVMs shine with many features (e.g., text). Kernel SVMs capture curvature at the cost of speed and tuning more hyperparameters. Both require scaling; both are sensitive to `C`, and kernels add `gamma` (and `degree` for polynomial).\n", 275 | "\n", 276 | "| Aspect | Linear SVM | Kernel SVM (RBF/Poly) |\n", 277 | "| ------------------- | ------------------------------- | ------------------------------------- |\n", 278 | "| Nonlinearity | No | Yes (via kernels) |\n", 279 | "| Scale with #samples | Great (use `LinearSVC`) | Moderate/Slow for large $n$ |\n", 280 | "| Hyperparams | `C` | `C`, `gamma`, (degree for poly) |\n", 281 | "| Feature scaling | **Required** | **Required** |\n", 282 | "| Typical use | Text, high‑dim sparse, big data | Moderate data with nonlinear boundary |\n", 283 | "\n", 284 | "---\n", 285 | "\n", 286 | "## 13) Practice\n", 287 | "\n", 288 | "Try plotting the margin band (the two lines where $y(w^\\top x + b)=1$) on a 2‑D toy set to see which points become support vectors. Then vary `C` and watch how the number of support vectors and the margin width change.\n", 289 | "\n", 290 | "1. Standardize features, then compare `LinearSVC` vs `SVC(RBF)` on your dataset.\n", 291 | "2. Grid‑search `C` and `gamma`; report validation curves and best test score.\n", 292 | "3. Inspect support vectors: how many are there, and which points become SVs?\n", 293 | "4. 
Try class imbalance: set `class_weight=\"balanced\"` and compare metrics.\n", 294 | "\n", 295 | "---\n", 296 | "\n", 297 | "## Summary\n", 298 | "\n", 299 | "- SVMs maximize margin → robust decision boundaries.\n", 300 | "- Soft margins + hinge loss handle overlap and noise.\n", 301 | "- Dual form enables **kernels** for powerful nonlinear separation.\n", 302 | "- In practice: **scale**, pick kernel by validation, tune `C`/`gamma`, and use `LinearSVC` for large datasets.\n", 303 | "\n", 304 | "**Next:** Principal component analysis + Dimensionality reduction" 305 | ] 306 | } 307 | ], 308 | "metadata": { 309 | "jupytext": { 310 | "cell_metadata_filter": "-all", 311 | "main_language": "python", 312 | "notebook_metadata_filter": "-all" 313 | }, 314 | "language_info": { 315 | "name": "python" 316 | } 317 | }, 318 | "nbformat": 4, 319 | "nbformat_minor": 5 320 | } 321 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.1. Linear Algebra/Linear Algebra.md: -------------------------------------------------------------------------------- 1 | # Linear Algebra 2 | ## Notation 3 | - `∀` : "for any" 4 | - `∃` : "there exists" 5 | 6 | --- 7 | 8 | ## Vector Operations 9 | ### Basic Operations 10 | - **Addition/Subtraction**: 11 | $(u_1,u_2) + (v_1,v_2) = (u_1+v_1,u_2+v_2)$ 12 | - **Scalar Multiplication**: 13 | $k \cdot (v_1,v_2) = (k\cdot v_1, k\cdot v_2)$ 14 | 15 | ### Dot Product (Inner Product) 16 | $$\vec{u} \cdot \vec{v} = \sum_{i=1}^{n} u_i v_i$$ 17 | - **Geometric Interpretation**: 18 | $\vec{u} \cdot \vec{v} = \|\vec{u}\|\|\vec{v}\| \cos{\theta}$ 19 | ($\theta$ = angle between vectors) 20 | - **Orthogonality Condition**: 21 | $\vec{u} \cdot \vec{v} = 0 \iff \vec{u} \perp \vec{v}$ 22 | 23 | ### Cross Product (3D Only) 24 | $$\vec{u} \times \vec{v} = (u_2v_3 - u_3v_2,\ u_3v_1 - u_1v_3,\ u_1v_2 - u_2v_1)$$ 25 | - **Properties**: 26 | - Result is perpendicular to input vectors (right-hand rule determines direction) 27 | - $\|\vec{u} \times \vec{v}\| = \|\vec{u}\|\|\vec{v}\| \sin{\theta}$ = area of parallelogram spanned by $\vec{u},\vec{v}$ 28 | 29 | ### Norms 30 | $$ \|\vec{x}\|_p = \left( \sum_{i=1}^{n} |x_i|^p \right)^{1/p} \quad (p \geq 1) $$ 31 | - **Special Cases**: 32 | - Euclidean ($L^2$): $\|\vec{x}\|_2 = \sqrt{\sum x_i^2}$ 33 | - Manhattan ($L^1$): $\|\vec{x}\|_1 = \sum |x_i|$ 34 | - Infinity ($L^\infty$): $\|\vec{x}\|_\infty = \max |x_i|$ 35 | - **Convention**: $|\vec{v}|$ denotes Euclidean norm 36 | 37 | ### Linear Combinations & Span 38 | - **Linear Combination**: 39 | $\vec{w} = \sum_{i=1}^{n} c_i \vec{v}_i$ 40 | (Scalars $c_i$, vectors $\vec{v}_i$) 41 | - **Span**: Set of all linear combinations of $\{\vec{v}_1,\dots,\vec{v}_n\}$ 42 | Example: $\text{span}\left(\begin{bmatrix}1\\0\\0\end{bmatrix},\begin{bmatrix}0\\1\\0\end{bmatrix}\right)$ = $xy$-plane in $\mathbb{R}^3$ 43 | 44 | ### Orthonormal Vectors 45 | Set $\{\vec{v}_1,\dots,\vec{v}_n\}$ satisfies: 46 | 1. **Orthogonal**: $\vec{v}_i \cdot \vec{v}_j = 0 \quad (i \neq j)$ 47 | 2. **Unit Length**: $ \forall i \quad \|\vec{v}_i\| = 1$ 48 | 49 | --- 50 | 51 | ## Matrix Operations 52 | 53 | ### Notations 54 | - $A \in \mathbb{R}^{n\times m}$ Represents a matrix $A$ is a matrix of dimensions $n \times n$ whose entries are real numbers. 55 | - $I$ is the identity matrix. for $I_3$, or the three dimension one, it would be $\begin{pmatrix} 1 & 0 & 0 \\ 0 & 1 & 0 \\ 0 & 0 & 1 \end{pmatrix}$. 56 | - $A_{i,j}$ means the $i$ th row and $j$ th column of matrix $A$. 
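If you want to play with this notation in code, here is a tiny NumPy sketch of the same objects; keep in mind that NumPy indexes rows and columns from 0, while the $A_{i,j}$ notation above counts from 1.

```python
import numpy as np

I3 = np.eye(3)                 # the 3x3 identity matrix I_3
A = np.array([[1, 2, 3],
              [4, 5, 6]])      # A is in R^{2x3}: 2 rows (n), 3 columns (m)

print(A.shape)   # (2, 3)
print(A[0, 2])   # 3 -> this is the entry written A_{1,3} in the notation above
print(I3)
```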
57 | 
58 | ### Basic Operations
59 | - **Addition and Subtraction**:
60 | Only matrices of the same dimensions can be added or subtracted. $(A+B)_{i,j} = A_{i,j} + B_{i,j}$. $(A-B)_{i,j} = A_{i,j} - B_{i,j}$.
61 | - **Scalar Multiplication**:
62 | Let $k$ be a scalar. $(kA)_{i,j} = k \times A_{i,j}$.
63 | 
64 | ### Transpose
65 | This is essentially flipping a matrix over its main diagonal (the upper-left to lower-right diagonal). Let $A = \begin{pmatrix} a & b \\ c & d \end{pmatrix}$; then its transpose is $A^T = \begin{pmatrix} a & c \\ b & d \end{pmatrix}$.
66 | 
67 | ### Determinant
68 | The determinant of a matrix, denoted as $\text{det}(A)$, provides important information about a matrix.
69 | - A matrix $A$ is invertible if and only if $\text{det}(A) \neq 0$. We call a matrix without an inverse singular.
70 | - The determinant represents the scaling factor of the volume change when the matrix is applied as a linear transformation. In $\mathbb{R}^2$, the determinant of a $\mathbb{R}^{2\times2}$ matrix represents the area scaling factor of the parallelogram formed by the column vectors of the matrix.
71 | 
72 | **Properties**:
73 | - Cramer's rule: The solution components of $A\vec{x}=\vec{b}$ are given by $x_i = \frac{\text{det}(A_i)}{\text{det}(A)}$, where $A_i$ is the matrix formed by replacing the $i$th column of $A$ with the vector $\vec{b}$.
74 | - Row operations:
75 |   - Swapping two rows changes the sign of the determinant
76 |   - Multiplying a row by a scalar $k$ multiplies the determinant by $k$
77 |   - Adding a multiple of one row to another row doesn't change the determinant
78 | - $\text{det}(AB) = \text{det}(A)\cdot \text{det}(B)$
79 | - $\text{det}(A^T)=\text{det}(A)$
80 | - $\text{det}(0) = 0 \quad \text{det}(I) = 1$ (at any dimension where the identity matrix is defined, its determinant is 1)
81 | - Diagonal matrix: a matrix whose only non-zero entries lie on its main diagonal. For a diagonal (or triangular) matrix, the determinant is the product of its diagonal entries.
82 | 
83 | **Calculation**
84 | - For a $1 \times 1$ matrix $A=\begin{pmatrix} a \end{pmatrix}$, $\text{det}(A) = a$.
85 | - For a $2 \times 2$ matrix $A=\begin{pmatrix} a & b \\ c & d \end{pmatrix}$, $\text{det}(A) = ad - bc$.
86 | - For an $n \times n$ matrix, use Gaussian elimination (covered below) to reduce the matrix to triangular form and take the product of the diagonal entries. Remember to account for the row operations you used: every row swap flips the sign, and scaling a row by $k$ scales the determinant by $k$, so undo these factors at the end.
87 | 
88 | ### Inverse
89 | **Only square matrices that are non-singular (non-zero determinant) have inverses.** The inverse of $A$, denoted by $A^{-1}$, is the matrix such that $A \times A^{-1} = A^{-1} \times A = I$. To find the inverse, you can use Gaussian elimination on the augmented matrix $A|I$: when the left half has been reduced to $I$, the right half is $A^{-1}$. For a $2\times2$ matrix $\begin{pmatrix} a & b \\ c & d \end{pmatrix}$ with $ad-bc \neq 0$, the inverse is $\frac{1}{ad-bc}\begin{pmatrix} d & -b \\ -c & a \end{pmatrix}$.
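As a quick numerical check of the transpose, determinant, and inverse formulas above, here is a small NumPy sketch; it verifies on a concrete $2 \times 2$ matrix that $\text{det}(A) = ad - bc$ and that $A A^{-1} = I$.

```python
import numpy as np

A = np.array([[4., 7.],
              [2., 6.]])

print(A.T)                 # transpose: [[4. 2.], [7. 6.]]
print(np.linalg.det(A))    # ad - bc = 4*6 - 7*2 = 10 (up to floating-point error)
A_inv = np.linalg.inv(A)   # (1/10) * [[6, -7], [-2, 4]]
print(A_inv)               # [[ 0.6 -0.7]
                           #  [-0.2  0.4]]
print(np.allclose(A @ A_inv, np.eye(2)))   # True: A times its inverse is the identity
```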
90 | 91 | ### Matrix Multiplication 92 | For $A \in \mathbb{R}^{m \times p}$, $B \in \mathbb{R}^{p \times n}$: 93 | $$C = AB \quad \text{where} \quad c_{ij} = \sum_{k=1}^{p} a_{ik}b_{kj}$$ 94 | - $c_{ij}$ = dot product of $i$-th row of $A$ and $j$-th column of $B$ 95 | 96 | ![image.png](attachment:image.png) 97 | 98 | --- 99 | ## Linear Transformations and Eigen Theory 100 | ### Linear Maps 101 | A linear map (linear transformation) is a function $T:V \to W$ between vector spaces $V$ and $W$ over field $\mathbb{F}$ satisfying $\forall \vec{u},\vec{v} \in V, k\in \mathbb{F}$: 102 | 1. $T(\vec{u} + \vec{v}) = T(\vec{u}) + T(\vec{v})$ 103 | 2. $T(k\vec{v}) = kT(\vec{v})$ 104 | 105 | ### Matrix Representation 106 | Matrix multiplication represents linear maps. For $A \in \mathbb{F}^{m \times n}$, define $T_A : \mathbb{F}^n \to \mathbb{F}^m$ by: 107 | $$T_A(\vec{x}) = A\vec{x}$$ 108 | This satisfies linearity: 109 | - $T_A(\vec{x} + \vec{y}) = A(\vec{x} + \vec{y}) = A\vec{x} + A\vec{y} = T_A(\vec{x}) + T_A(\vec{y})$ 110 | - $T_A(k\vec{x}) = A(k\vec{x}) = k(A\vec{x}) = kT_A(\vec{x})$ 111 | 112 | ### Example: Rotation in $\mathbb{R}^2$ 113 | The rotation matrix for counter-clockwise rotation by $\theta$: 114 | $$R_\theta = \begin{bmatrix} 115 | \cos\theta & -\sin\theta \\ 116 | \sin\theta & \cos\theta 117 | \end{bmatrix}$$ 118 | defines linear map $T_\theta(\vec{x}) = R_\theta \vec{x}$. 119 | 120 | For $\theta = 60^\circ = \frac{\pi}{3}$: 121 | $$R_{60^\circ} = \begin{bmatrix} 122 | \frac{1}{2} & -\frac{\sqrt{3}}{2} \\ 123 | \frac{\sqrt{3}}{2} & \frac{1}{2} 124 | \end{bmatrix}$$ 125 | 126 | #### Transforming Standard Basis: 127 | $$\mathbf{e}_1 = \begin{bmatrix}1\\0\end{bmatrix},\quad \mathbf{e}_2 = \begin{bmatrix}0\\1\end{bmatrix}$$ 128 | 129 | $$T_{60^\circ}(\mathbf{e}_1) = \begin{bmatrix}\frac{1}{2} \\ \frac{\sqrt{3}}{2}\end{bmatrix},\quad T_{60^\circ}(\mathbf{e}_2) = \begin{bmatrix}-\frac{\sqrt{3}}{2} \\ \frac{1}{2}\end{bmatrix}$$ 130 | 131 | Geometrically: All vectors rotated counter-clockwise by $60^\circ$ about origin, preserving lengths and angles. 132 | 133 | ### Eigenvectors and Eigenvalues 134 | An eigenvector $\vec{v} \neq \vec{0}$ of linear transformation $T$ is a direction invariant under $T$, scaled by eigenvalue $\lambda$: 135 | $$T(\vec{v}) = \lambda \vec{v}$$ 136 | 137 | - $\lambda > 0$: Direction preserved 138 | - $\lambda < 0$: Direction reversed 139 | 140 | #### Finding Eigenvalues/Eigenvectors 141 | For matrix $A$ representing $T$: 142 | 1. Solve characteristic equation: $\det(A - \lambda I) = 0$ 143 | 2. 
For each $\lambda_i$, solve $(A - \lambda_i I)\vec{v} = \vec{0}$ 144 | 145 | ##### Example 146 | For $A = \begin{pmatrix} 3 & 1 \\ 0 & 2 \end{pmatrix}$: 147 | Characteristic equation: 148 | $$\det\begin{pmatrix} 3-\lambda & 1 \\ 0 & 2-\lambda \end{pmatrix} = (3-\lambda)(2-\lambda) = 0$$ 149 | Eigenvalues: $\lambda_1 = 3, \lambda_2 = 2$ 150 | 151 | - For $\lambda_1=3$: 152 | $$\begin{pmatrix}0 & 1 \\ 0 & -1\end{pmatrix}\begin{pmatrix}v_1\\v_2\end{pmatrix} = \begin{pmatrix}0\\0\end{pmatrix} \implies v_2=0$$ 153 | Eigenvectors: $\begin{pmatrix}v_1\\0\end{pmatrix} = v_1\begin{pmatrix}1\\0\end{pmatrix} \ (v_1 \neq 0)$ 154 | 155 | - For $\lambda_2=2$: 156 | $$\begin{pmatrix}1 & 1 \\ 0 & 0\end{pmatrix}\begin{pmatrix}v_1\\v_2\end{pmatrix} = \begin{pmatrix}0\\0\end{pmatrix} \implies v_1 = -v_2$$ 157 | Eigenvectors: $\begin{pmatrix}v_1\\-v_1\end{pmatrix} = v_1\begin{pmatrix}1\\-1\end{pmatrix} \ (v_1 \neq 0)$ 158 | 159 | *Note that the determinant of a matrix is equal to the product of its eigenvalues. 160 | 161 | --- 162 | 163 | ## Matrix Decomposition 164 | ### Gaussian Elimination 165 | 166 | This is an algorithm for solving a linear equation system using linear algebra methods. The idea is to rewrite the matrix into an upper triangular matrix that represents the equation system. For example, for the linear equation: 167 | $$\begin{cases} x + 2y - 4z = 5 \\ 2x + y - 6z = 8 \\ 4x - y - 12z = 13 \end{cases}$$ 168 | We have a coefficient matrix and an augmented matrix 169 | $$ A_{\text{Coefficient}} = \begin{bmatrix} 1 & 2 & {-4} \\ 2 & 1 & {-6} \\ 4 & {-1} & {-12} \end{bmatrix}$$ 170 | $$ 171 | A_{\text{Augmented}} = \left[ 172 | \begin{array}{rrr|r} 173 | 1 & 2 & -4 & 5 \\ 174 | 2 & 1 & -6 & 8 \\ 175 | 4 & -1 & -12 & 13 176 | \end{array} 177 | \right] 178 | $$ 179 | 180 | The corresponding upper triangular matrix would be 181 | $$ 182 | A_{\text{Augmented}} = \left[ 183 | \begin{array}{rrr|r} 184 | 1 & 2 & -4 & 5 \\ 185 | 0 & -3 & 2 & -2 \\ 186 | 0 & 0 & -2 & -1 187 | \end{array} 188 | \right] 189 | $$ 190 | The equivalent linear system is 191 | $$\begin{cases} x + 2y - 4z = 5 \\ -3y + 2z = -2 \\ -2z = -1 \end{cases}$$ 192 | 193 | How is this done? This is an example to show the process. We name the $i$th row as $R_i$, the $i$th column as $C_i$, and $M_{i,j}$ the value in the $i$th row and the $j$th column. we start with $i=1$ and end at $i=n$, which is the number of rows the matrix has. Define constant $k\in \mathbb{R}$. 194 | 195 | $$ 196 | \left[ 197 | \begin{array}{rrr|r} 198 | 2 & 2 & 6 & 4 \\ 199 | 2 & 1 & 7 & 6 \\ 200 | -2 & -6 & -7 & -1 201 | \end{array} 202 | \right] 203 | $$ 204 | 205 | We first scale $R_i$ such that $M_{i,i}=1$. 206 | 207 | $$ 208 | \left[ 209 | \begin{array}{rrr|r} 210 | 1 & 1 & 3 & 2 \\ 211 | 2 & 1 & 7 & 6 \\ 212 | -2 & -6 & -7 & -1 213 | \end{array} 214 | \right] 215 | $$ 216 | 217 | We add $k \cdot R_i$ to all $R_j$ with $j>i$ such that $M_{j,i} = 0$. 218 | 219 | $$ 220 | \left[ 221 | \begin{array}{rrr|r} 222 | 1 & 1 & 3 & 2 \\ 223 | 0 & -1 & 1 & 2 \\ 224 | 0 & -4 & -1 & 3 225 | \end{array} 226 | \right] 227 | $$ 228 | 229 | We repeat the process on the next $i$. Here $i=2$, so we scale the second row such that $M_{i,i}=1$. 230 | 231 | $$ 232 | \left[ 233 | \begin{array}{rrr|r} 234 | 1 & 1 & 3 & 2 \\ 235 | 0 & 1 & -1 & -2 \\ 236 | 0 & -4 & -1 & 3 237 | \end{array} 238 | \right] 239 | $$ 240 | 241 | We then add a multiple of $R_2$ to all rows under it such that the second element of each such row becomes $0$. Here, we add $4\cdot R_2$ to $R_3$. 
242 | 243 | $$ 244 | \left[ 245 | \begin{array}{rrr|r} 246 | 1 & 1 & 3 & 2 \\ 247 | 0 & 1 & -1 & -2 \\ 248 | 0 & 0 & -5 & -5 249 | \end{array} 250 | \right] 251 | $$ 252 | 253 | We then repeat the process for the last row. 254 | 255 | $$ 256 | \left[ 257 | \begin{array}{rrr|r} 258 | 1 & 1 & 3 & 2 \\ 259 | 0 & 1 & -1 & -2 \\ 260 | 0 & 0 & 1 & 1 261 | \end{array} 262 | \right] 263 | $$ 264 | 265 | We now have an upper triangular matrix that can easily be used to find the solutions of the initial linear equation system. To solve the system, we reverse the process: starting with $i = n$ and working upward, we take $k \cdot R_i$ and add it to all $R_j$ with $j < i$ such that $M_{j,i} = 0$ (back substitution). Once only the diagonal entries $M_{i,i} = 1$ remain on the left, the augmented column contains the solution; for the example above this gives $x = 0$, $y = -1$, $z = 1$. To summarize, for each $i$ from $1$ to $n$ we: 1. Scale $R_i$ such that $M_{i,i} = 1$ 294 | 2. Add $k \cdot R_i$ to all $R_j$ with $j > i$ such that $M_{j,i} = 0$. 295 | 3. Move to the next $i$ by adding $1$ 296 | 297 | After this has been done to all rows: 298 | 1. Add $k \cdot R_i$ to all $R_j$ with $j < i$ such that $M_{j,i} = 0$, starting from $i = n$ and moving back to $i = 1$. 2. Read the solution off the augmented column. The full procedure, written as pseudocode: $$ \begin{aligned} &\textbf{Begin} \\ &\text{1. Forward Elimination:} \\ &\text{for } i = 1 \text{ to } n \text{ do} \hspace{1em} \\ &\hspace{2em} \text{(a) Scale } R_i \text{ such that } M_{i,i} = 1 \\ &\hspace{2em} \text{(b) Add } k \cdot R_i \text{ to all } R_j \text{ with } j > i \text{ such that } M_{j,i} = 0: \\ 317 | &\hspace{3em} \text{for } j = i+1 \text{ to } n \text{ do} \hspace{1em} \\ 318 | &\hspace{4em} \text{if } M_{j,i} \neq 0 \text{ then} \\ 319 | &\hspace{5em} k = -M_{j,i} \\ 320 | &\hspace{5em} R_j = R_j + k \cdot R_i \\ 321 | &\hspace{4em} \text{end if} \\ 322 | &\hspace{3em} \text{end for} \\ 323 | &\text{end for} \\ 324 | & \\ 325 | &\text{2. Backward Elimination:} \\ 326 | &\text{for } i = n \text{ down to } 2 \text{ do} \hspace{1em} \\ 327 | &\hspace{2em} \text{(a) Add } k \cdot R_i \text{ to all } R_j \text{ with } j < i \text{ such that } M_{j,i} = 0: \\ 328 | &\hspace{3em} \text{for } j = i-1 \text{ down to } 1 \text{ do} \hspace{1em} \\ 329 | &\hspace{4em} \text{if } M_{j,i} \neq 0 \text{ then} \\ 330 | &\hspace{5em} k = -M_{j,i} \\ 331 | &\hspace{5em} R_j = R_j + k \cdot R_i \\ 332 | &\hspace{4em} \text{end if} \\ 333 | &\hspace{3em} \text{end for} \\ 334 | &\text{end for} \\ 335 | & \\ 336 | &\textbf{End} 337 | \end{aligned} 338 | $$ 339 | 340 | --- 341 | 342 | ## Algebraic Structures 343 | ### Fields ($\mathbb{F}$) 344 | Set of scalars with two operations (`+`, `·`) satisfying: 345 | 1. **Closure** under addition and multiplication 346 | 2. **Commutativity**: $a+b=b+a$, $a·b=b·a$ 347 | 3. **Associativity**: $(a+b)+c=a+(b+c)$, $(a·b)·c=a·(b·c)$ 348 | 4. **Identities**: 349 | - Additive: $a+0=a$ 350 | - Multiplicative: $a·1=a$ $(1 \neq 0)$ 351 | 5. **Inverses**: 352 | - Additive: $a + (-a) = 0$ 353 | - Multiplicative: $a·a^{-1}=1$ $(a \neq 0)$ 354 | 6. **Distributivity**: $a·(b+c) = a·b + a·c$ 355 | 356 | *Example*: $\mathbb{R}$ (real numbers) 357 | 358 | ### Vector Spaces ($V$ over $\mathbb{F}$) 359 | Set of vectors with two operations (`+`, scalar multiplication) satisfying: 360 | 1. **Closure** under vector addition and scalar multiplication 361 | 2. **Commutativity/Associativity** of addition 362 | 3. **Zero vector**: $\vec{v} + \vec{0} = \vec{v}$ 363 | 4. **Additive inverse**: $\vec{v} + (-\vec{v}) = \vec{0}$ 364 | 5. **Distributivity**: 365 | - $k(\vec{u} + \vec{v}) = k\vec{u} + k\vec{v}$ 366 | - $(α+β)\vec{v} = α\vec{v} + β\vec{v}$ 367 | 6. **Associativity of scalars**: $α(β\vec{v}) = (αβ)\vec{v}$ 368 | 7. **Multiplicative identity**: $1·\vec{v} = \vec{v}$ 369 | 370 | *Examples*: $\mathbb{R}^n$, polynomials of degree $\leq 4$, $2\times 3$ matrices 371 | 372 | ### Subspaces ($W \subseteq V$) 373 | Subset satisfying: 374 | 1. $k \in \mathbb{F}, \vec{w} \in W \implies k\vec{w} \in W$ 375 | 2. 
$\vec{u},\vec{v} \in W \implies \vec{u} + \vec{v} \in W$ 376 | 377 | *Equivalent to*: $W$ is itself a vector space 378 | *Example*: Plane $x+y+z=0$ in $\mathbb{R}^3$ 379 | 380 | ### Basis 381 | Set $B = \{\vec{b}_1,\dots,\vec{b}_n\}$ that is: 382 | 1. **Linearly Independent**: 383 | $\sum c_i\vec{b}_i = \vec{0} \implies c_i = 0 \ \forall i$ 384 | 2. **Spanning**: $\forall \vec{v} \in V,\ \exists c_i : \vec{v} = \sum c_i\vec{b}_i$ 385 | - **Dimension** ($\dim V$): Number of basis vectors ($n$) 386 | - *Example*: Standard basis $\{(1,0,0),(0,1,0),(0,0,1)\}$ for $\mathbb{R}^3$ 387 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.2. Backpropagation, Gradient Descent, Adaptive Moment Estimation/Backpropagation.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # All About **Backpropagation** 4 | ###### **Will Chen** | SHSID Data Science Group 5 | 6 | ## Key Questions 7 | 8 |
9 | 10 | Backpropagation is an important step in supervised learning procedures. By the end of the lesson, you will be able to answer the following **key questions**: 11 | 1. What is backpropagation and how is it used? 12 | 2. What are the similarities and differences between forward propagation and backpropagation? 13 | 3. How does backpropagation specifically function? 14 | 4. Why is it such an important process and why is it used so much? 15 | 16 |
17 | 18 | ## Key Terms 19 |
20 | In order to understand this lesson, you should have a grasp of the following key concepts and terms: 21 | 22 | - **Layers**: Each network has many layers. The first layer takes in the input vector and the last layer produces the output. In the diagram below, each layer is one column. 23 | 24 | - **Vector**: The list of numbers held by each layer of a neural network. 25 | 26 | - **Neural Network**: _Input vectors_ (blue) are passed through the _hidden layers_ (green) to form an _output vector_ (yellow). 27 | 28 | - **Neuron**: A node in the network, each containing special numbers called weights. In the diagram, it is each circle (🟢). When running the model, each neuron multiplies its weights with the previous layer’s vector and adds the products together, forming its result, which is then ready to be used by the next layer’s neurons. 29 | 30 | - **Hidden Layer**: Contains a set of neurons whose weights multiply the results of the previous layer, producing its output values. 31 | 32 | - **Matrix Multiplication**: The method used to combine vectors with weights. Each layer’s vector is multiplied by a weight matrix and passed on to the next layer. 33 | 34 | - **Epoch**: One training generation. One round of backpropagation (explained later) is used per epoch. 35 | 36 | - **Loss**: During supervised training, the model’s output is continuously compared with the real outcome. The purpose of the model is to form results as close to the real outcome as possible, in other words, to minimize the loss. The difference between the output vector and the real outcome vector is known as the loss. 37 | 38 | - **Supervised training**: A type of training method where the expected input and result vectors are labeled for the model to train with. 39 | 40 | 41 | *(diagrams: a neural network with the input layer in blue, hidden layers in green, and the output layer in yellow)* 46 | 47 |
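To make these key terms concrete, here is a minimal NumPy sketch of one forward pass through a tiny network (the layer sizes, random weights, and ReLU activation are assumptions chosen for illustration, not values from this lesson):

```python
import numpy as np

# A tiny network: 3 inputs -> 4 hidden neurons -> 2 outputs.
rng = np.random.default_rng(0)

x = rng.normal(size=3)          # input vector (the "blue" layer)
W1 = rng.normal(size=(4, 3))    # each row holds one hidden neuron's weights
b1 = np.zeros(4)                # biases of the hidden layer
W2 = rng.normal(size=(2, 4))    # weights of the output layer
b2 = np.zeros(2)

h = np.maximum(0, W1 @ x + b1)  # hidden layer: matrix multiplication + ReLU
y = W2 @ h + b2                 # output vector (the "yellow" layer)

print("hidden layer vector:", h)
print("output vector:", y)
```

Each `@` here is exactly the matrix multiplication described above: every neuron’s weights are multiplied with the previous layer’s vector and summed to form that neuron’s result.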
48 | 49 | ## Introduction to Backpropagation 50 |
51 | 52 | ### What is backpropagation? 53 | 54 | **Backpropagation** is a crucial step in neural networks that improves the accuracy of a model by traversing a network in reverse to find out how much each weight contributed to the model’s inaccuracy. 55 | 56 | 57 | 58 | As we know from **forward propagation**, multilayer perceptron networks work by moving through layers, from the input vector layer to the output. Backpropagation, as the name suggests, moves in reverse: It **starts with the very last layer** (output) and **propagates back to the previous layers**. Where forward propagation is useful for running the model, backpropagation is useful for training it. 59 | 60 | **Think of it like solving a math formula**: You know you made a mistake along the way because your answer is different from the answer key. So, you want to look through your steps in reverse to adjust them. Similarly, when a model returns something we don’t expect, we can use backpropagation to **adjust each neuron until it returns the things we like**. 61 | 62 | ### What happens in training? 63 | 64 | Training a model is a **very repetitive process**. From earlier on, you might recall that **epochs** are the individual rounds of training, and that the goal of training a model is to **minimize the errors it makes**. During each epoch, several things happen: 65 | 66 | 1. We run the model through its neurons (aka Forward Propagation). 67 | 2. The model gives us its predictions (in the final output vector layer). 68 | 3. We compare the prediction with the real, expected result (given by our training dataset). 69 | 4. **We use backpropagation as a tool to know which neurons went wrong, and to what extent**. 70 | 5. We adjust the neurons and move on to the next epoch, and so on until we’re happy. 71 | 72 | 73 | 74 | **But what happens before any epoch**? We must set up the model. Usually, models begin with “blank” neurons, or neurons that don’t contain any special information about their weights and biases. When we run this blank model for the first time with our training features, it probably won’t return something we’d like. 75 | 76 | When we compare the model’s predictions with the labels, we get a difference, which is known as loss. This difference is crucial because it tells us if our predictions are close enough or not. But it doesn’t tell us what exactly went wrong. How do we know that? **The answer? We do it through backpropagation**. 77 | 78 | ### How does backpropagation really work? 79 | 80 | Think of it like **taking blame**. We start off knowing a difference (or loss) between the expected and predicted results, which we don’t like. Of course, the neurons at the very end of the network are **most directly responsible for the loss**, since they produce the output directly, so they take all the blame first. Then, using a special math property called the **chain rule**, the blame is further **passed down along the line from back to front**, and **distributed across the entire network**. 81 | 82 | #### Understanding loss calculation 83 | 84 | First, we need to understand how exactly the loss is calculated. As we know, the model produces predictions that can be compared with the expected results. For each forward propagation, the loss is taken as a single number. Since there can be a lot of output neurons, we need to take the mean of the errors. 85 | 86 | But it’s not just any mean. 
Because we’re dealing with some calculus (later on), scientists realized that taking the **MSE (Mean Squared Error)** makes the calculations a lot easier than taking the plain mean of absolute differences, the MAE (Mean Absolute Error). **MSE is defined as the average squared difference between the predictions and the expected results**. 87 | 88 | #### Dealing with gradients 89 | 90 | A key optimizer within backpropagation is **gradient descent**. It’s an iterative optimization algorithm; in other words, it works toward the solution step by step rather than solving the problem in one go. Its use in backpropagation is that it tells us **how much we need to change each parameter in order to minimize the loss function**. 91 | 92 | *(figure: a warped 3D error surface used to visualize gradient descent)* 93 | 94 | Reviewing some calculus terminology: 95 | 96 | - The **gradient** of a function is a vector of its _partial derivatives_. 97 | - A **partial derivative** is the rate of change of the function along one input direction (for example, the x direction) while the other inputs are held fixed. 98 | - The **derivative** of the function measures how the output changes with respect to a change in the input. Its sign tells us the _direction in which the function increases_. 99 | - The **gradient** of the function, then, tells us the direction of steepest increase and its magnitude, in other words _how much our parameters need to change_. 100 | - **E_tot** is our _“surface”_ for gradient descent minimization, similar to a _“warped 3D surface”_ like the figure above. The picture has 3 dimensions because there are 2 weights to work with, and the third axis is the error value itself. 101 | 102 | With this knowledge in mind, our aims in backpropagation become clear. We want to: 103 | 104 | - Get the _gradient_ of our error (the magnitude by which to adjust our parameters). 105 | - The _opposite_ of our gradient is then the _direction for us to descend_ on E_tot. 106 | - Using the same picture as above, gradient descent is similar to **rolling a ball down the hill**. 107 | - When the ball reaches the very bottom, the slope of the surface is 0, and the loss is as small as that path can make it. 108 | 109 | In summary, the concept of Gradient Descent follows the derivatives to essentially “roll” down the slope until it finds its way to the minimum. 110 | 111 | #### Using the Chain Rule 112 | 113 | So, we know we need to find the gradient (the slope of our error surface) to roll the ball downhill towards the minimum loss. But a neural network has many layers and many weights. How do we figure out the specific gradient for a weight buried deep inside the network? This is where a fundamental concept from calculus, the **Chain Rule**, becomes the key to our lesson. 114 | 115 | The Chain Rule allows us to calculate how one variable affects another indirectly, through a chain of intermediate variables. The final loss is directly affected by the output of the last hidden layer. The output of that last hidden layer is affected by its weights and the output of the layer before it, and so on. **The Chain Rule gives us a precise mathematical way to quantify and pass this blame backward through the network**. 116 | 117 | For any given weight in the network, we want to calculate its "blame", or more formally, the partial derivative of the total error (E_tot) with respect to that weight (w). This tells us: "_If I change this specific weight just a tiny bit, how much will the total error of the network change_?" 118 | 119 | Using the Chain Rule, we can break this down into **three manageable pieces** for a weight connected to an output neuron: 120 | 121 | 1. 
How much did the total **error** change with respect to the neuron's **final output**? 122 | 2. How much did the neuron's **final output** change with respect to **its pre-activated input**? 123 | 3. How much did the neuron's **pre-activated input** change with respect to the **weight**? 124 | 125 | By multiplying these three rates of change together, **the Chain Rule gives us the overall gradient for that one weight**. For weights in earlier hidden layers, the chain just gets longer, but the principle is exactly the same. The error is propagated backward, layer by layer, **with each layer using the error signal calculated from the layer in front of it**. 126 | 127 | #### Making adjustments with the Update Rule 128 | 129 | Once backpropagation has used the Chain Rule to calculate the gradient for every single weight in the network, we know **two things** for each weight: 130 | 131 | 1. The **direction** of the steepest increase in error _(the gradient itself)_. 132 | 2. The **magnitude** of that slope _(how much the error will increase)_. 133 | 134 | Since our goal is to decrease the error, we simply go in the opposite direction of the gradient. This brings us to the **Update Rule**. For each weight, _we perform the following calculation_: 135 | 136 | $$ W_{new} = W_{old} - (\text{LR} \times \text{Gradient}) $$ 137 | 138 | Let's break this down: 139 | 140 | - **Old Weight** (W_old): The current value of the weight before the update. 141 | - **Gradient**: The value we just calculated through backpropagation for this specific weight. It tells us which way is "uphill." 142 | - **Learning Rate** (LR): This is a small number (e.g., 0.01) that we choose before training starts. It's a crucial parameter that controls _how big of a step we take downhill_. 143 | > If the learning rate is _too large_, we might overshoot the bottom of the valley and _end up on the other side_, bouncing back and forth or even diverging. 144 | > If it's _too small_, training will _take an incredibly long time_, like taking tiny baby steps down a huge mountain, and it may stall in a shallow dip before reaching a good minimum. 145 | 146 | This update process is performed for _every weight and bias_ in the entire network during each training epoch. 147 | 148 | #### Linear regression 149 | 150 | If this still seems abstract, let's think about the simplest possible "network": a **Linear Regression model**. A linear regression model tries to fit a _straight line_ (y = mx + b) to a set of data points. 151 | 152 | In the context of this equation, 153 | 154 | $$ y = mx + b $$ 155 | 156 | - x is our input. 157 | - m (the slope) and b (the y-intercept) are our "weights" or parameters. 158 | - y is our prediction. 159 | 160 | The "loss" is typically the **Mean Squared Error (MSE)** between our predicted y values and the actual y values from the data. To find the best line, we need to find the values of m and b that minimize this loss. How do we do that? With gradient descent! We would: 161 | 162 | - Calculate the partial derivative of the MSE **with respect to m**. 163 | - Calculate the partial derivative of the MSE **with respect to b**. 164 | - Use these gradients in the update rule to slowly _adjust m and b until the loss is at a minimum_. 165 | 166 | Backpropagation is simply a **generalization of this exact process** for a much more complex model with many layers and non-linear activation functions. It's a clever, systematic way of applying gradient descent to millions of parameters simultaneously. 
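To see the update rule and the linear-regression example in action, here is a short from-scratch sketch of gradient descent on $y = mx + b$ with an MSE loss (the synthetic data, learning rate, and number of epochs are made-up values for illustration):

```python
import numpy as np

# Toy data that roughly follows y = 2x + 1, plus a little noise.
rng = np.random.default_rng(42)
x = rng.uniform(-1, 1, size=100)
y = 2.0 * x + 1.0 + rng.normal(scale=0.1, size=100)

m, b = 0.0, 0.0   # start with "blank" parameters
lr = 0.1          # learning rate

for epoch in range(200):
    y_pred = m * x + b
    error = y_pred - y
    loss = np.mean(error ** 2)       # MSE

    # Partial derivatives of the MSE with respect to m and b
    grad_m = np.mean(2 * error * x)
    grad_b = np.mean(2 * error)

    # Update rule: W_new = W_old - LR * Gradient
    m -= lr * grad_m
    b -= lr * grad_b

print(f"m = {m:.3f}, b = {b:.3f}, final MSE = {loss:.4f}")
```

After a few hundred epochs the parameters settle close to the values used to generate the data, which is the same "roll downhill until the slope is flat" behaviour that backpropagation produces for every weight in a deep network.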
167 | 168 | #### Other key aspects of backpropagation 169 | 170 | ##### Activation functions 171 | 172 | Remember that each neuron applies an **activation function** (like Sigmoid or ReLU) to its input. When we use the Chain Rule, we need to calculate the _derivative of this activation function_. This is a critical reason why activation functions **must be differentiable** (i.e., we can find their slope at essentially any point). _A function with a "clean" and easy-to-calculate derivative makes the math of backpropagation much more efficient_. 173 | 174 | ##### Different versions of gradient descent 175 | 176 | We don't have to calculate the loss over the entire dataset before making one single update. This would be very slow. Instead, we use different strategies: 177 | 178 | - **Batch Gradient Descent**: _The "classic" approach_. We run all training examples through the network, average their gradients, and then update the weights once. It's stable but _memory-intensive and slow for large datasets_. 179 | - **Stochastic Gradient Descent** (SGD): _The opposite extreme_. We update the weights after every single training example. It's much faster and can escape shallow local minima, but the _updates can be very "noisy" and erratic_. 180 | - **Mini-Batch Gradient Descent**: The _best of both worlds_ and the most common method. We divide the training data into small batches (e.g., 32, 64, or 128 examples), and we update the weights once per batch. This provides _a good balance between the stability of Batch GD and the speed of SGD_. 181 | 182 | ##### Vanishing and exploding gradients 183 | 184 | Backpropagation isn't without its challenges. In very deep networks (networks with many layers), _the error signal can run into problems as it's propagated backward_. 185 | 186 | - **Vanishing Gradients**: As the gradient is passed back, it can be multiplied by numbers less than one over and over. This can cause the gradient to become incredibly small, effectively "vanishing" by the time it reaches the early layers. When this happens, the weights in the early layers stop updating, and the network stops learning. 187 | - **Exploding Gradients**: The opposite can also occur. The gradient can be repeatedly multiplied by numbers greater than one, causing it to become astronomically large. This leads to massive weight updates and makes the model unstable and unable to learn. 188 | 189 | Modern deep learning has developed solutions to these problems, such as using specific activation functions like ReLU (whose derivative is a constant 1 for all positive inputs), better weight initialization methods, and normalization techniques, all designed to keep the backpropagated signal healthy and effective. 190 | 191 |
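Before moving to the conclusion, here is a minimal PyTorch sketch of a single mini-batch update that ties these pieces together: `loss.backward()` runs backpropagation and `optimizer.step()` applies the update rule (the network shape, random data, and learning rate are invented for illustration):

```python
import torch
import torch.nn as nn

torch.manual_seed(0)

# A small model: 10 features -> 32 hidden units (ReLU) -> 1 output
model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 1))
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# One mini-batch of 64 examples with 10 features each
inputs = torch.randn(64, 10)
targets = torch.randn(64, 1)

optimizer.zero_grad()                     # clear gradients from the previous batch
loss = criterion(model(inputs), targets)  # forward pass + MSE loss
loss.backward()                           # backpropagation: chain rule fills in .grad for every weight
optimizer.step()                          # update rule: w <- w - lr * gradient

first_layer = model[0]
print("loss:", loss.item())
print("gradient shape of first-layer weights:", first_layer.weight.grad.shape)
```

Changing how many examples go into `inputs` per step is exactly the difference between stochastic, mini-batch, and full-batch gradient descent described above.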
192 | 193 | ### Conclusion 194 |
195 | 196 | That’s all for our lesson on backpropagation! A lot of it relies on complex math, but it’s the heart of the model training process. This clever algorithm is what allows a neural network to truly learn from its mistakes by translating the final error into actionable feedback for every single weight. 197 | 198 | By using the **chain rule** to propagate this error signal backward, it calculates the precise **gradient** needed to guide the model's optimization. These gradients ensure each **weight** is nudged in the exact direction that will **minimize the overall loss**. Repeated over thousands of **epochs**, this **iterative refinement** transforms a randomly initialized **network** into a powerful and accurate **predictive tool**. 199 | 200 | Ultimately, backpropagation is the engine that drives intelligence in most modern ML systems, turning abstract data into concrete knowledge. 201 | 202 | -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.5. K-Nearest Neighbors, Clustering K-Means/KNN, Kmeans.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "222d8505", 6 | "metadata": {}, 7 | "source": [ 8 | "# k-Nearest Neighbors & K-Means\n", 9 | "\n", 10 | "_Welcome back! Today we’ll build two core ML tools that you’ll use again and again._\n", 11 | "\n", 12 | "- **k-Nearest Neighbors (kNN)** — a simple, powerful **supervised** method\n", 13 | "- **K-Means** — a fast, practical **unsupervised** clustering method\n", 14 | "\n", 15 | "---\n", 16 | "\n", 17 | "## What you'll learn\n", 18 | "\n", 19 | "- kNN intuition and how prediction works\n", 20 | "- Distances, feature scaling, and choosing **k**\n", 21 | "- K-Means objective and the Lloyd’s algorithm loop\n", 22 | "- How to pick **K** (elbow & silhouette)\n", 23 | "- Minimal, copy-pasteable code for both\n", 24 | "\n", 25 | "---\n", 26 | "\n", 27 | "## Requirements\n", 28 | "\n", 29 | "- Vectors & basic distance (e.g., Euclidean)\n", 30 | "- Train/validation/test split\n", 31 | "- Python + NumPy/Matplotlib (optionally scikit-learn)\n", 32 | "\n", 33 | "---\n", 34 | "\n", 35 | "## 1) kNN — the core idea (intuition)\n", 36 | "\n", 37 | "It works on the idea that nearby points tend to share similar labels or values. The only real knob is **k**: small k focuses on very local patterns (can overfit to noise), while larger k smooths decisions (can miss fine detail). For classification, the class with the most support among the k neighbors is chosen; for regression, the neighbors’ values are averaged. Distance-weighting lets nearer neighbors count more than farther ones.\n", 38 | "\n", 39 | "> **“Tell me who your neighbors are, and I’ll tell you who you are.”**\n", 40 | "\n", 41 | "For a new point $x$:\n", 42 | "\n", 43 | "1. Find the **k closest** training points.\n", 44 | "2. **Classification:** majority vote of their labels.\n", 45 | "3. **Regression:** average their target values.\n", 46 | "\n", 47 | "```\n", 48 | " ● ● Class A\n", 49 | " x ? 
→ ▲ Query\n", 50 | " ○ ○ Class B\n", 51 | "```\n", 52 | "\n", 53 | "`x` takes the class of whichever neighbors are more common (or closer, if weighted).\n", 54 | "\n", 55 | "---\n", 56 | "\n", 57 | "## 2) Distance & scaling (the make-or-break)\n", 58 | "\n", 59 | "Common distances for vectors $x, y$:\n", 60 | "\n", 61 | "- **Euclidean:** $\\lVert x - y\\rVert_2$ (default for geometry)\n", 62 | "- **Manhattan:** $\\lVert x - y\\rVert_1$ (robust to outliers)\n", 63 | "- **Cosine distance:** $1 - \\dfrac{x^\\top y}{\\lVert x\\rVert\\,\\lVert y\\rVert}$ — “angle” difference (great for text/high-dim sparse)\n", 64 | "\n", 65 | "**Euclidean** distance treats differences along each feature dimension equally, so features with larger scales can dominate if you don’t standardize. **Manhattan** distance is less sensitive to outliers and can work better with high-variance features. **Cosine** similarity ignores magnitude and compares only direction, which is useful for sparse/high‑dimensional data (e.g., text). Standardizing features (mean 0, std 1) before kNN keeps distances meaningful. If two classes are tied by count, distance‑weighted voting often resolves the tie sensibly.\n", 66 | "\n", 67 | "> ⚠️ So **Always scale features** (e.g., standardize to mean 0, std 1). \n", 68 | "> Otherwise the feature with the largest units dominates distance.\n", 69 | "\n", 70 | "**Distance-weighted vote (optional):** \n", 71 | "$w_i=\\dfrac{1}{\\operatorname{dist}(x,x_i)+\\epsilon}$ so closer neighbors count more.\n", 72 | "\n", 73 | "---\n", 74 | "\n", 75 | "## 3) Choosing **k** & common pitfalls\n", 76 | "\n", 77 | "Pick k by validation or cross‑validation: try several values and choose the one with the best validation score.\n", 78 | "\n", 79 | "- **Small k** (1–3): low bias, high variance → can be noisy.\n", 80 | "- **Larger k** (5–21+): smoother, higher bias → may underfit.\n", 81 | "- Pick **k** via validation or cross-validation.\n", 82 | "\n", 83 | "**Pitfalls**\n", 84 | "\n", 85 | "- Not scaling features.\n", 86 | "- Using an unfitting distance (e.g., Euclidean on sparse text).\n", 87 | "- Class imbalance: consider distance-weighted vote or class-weighted strategies.\n", 88 | "\n", 89 | "---\n", 90 | "\n", 91 | "## 4) kNN — minimal code (easy to try)\n", 92 | "\n", 93 | "Before running the code below, here’s the flow you’ll see:\n", 94 | "\n", 95 | "- Make a small, separable dataset and split into train/test.\n", 96 | "- Fit the scaler **on training data only**, then transform both train and test.\n", 97 | "- Loop over a few k values; each model just stores the scaled training set.\n", 98 | "- Predict by finding the k nearest training points for each test point and aggregating their labels.\n", 99 | "\n", 100 | "> Tip: run locally. 
Install: `pip install numpy scikit-learn`" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "1b8b2ade", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "import numpy as np\n", 111 | "from sklearn.datasets import make_classification\n", 112 | "from sklearn.model_selection import train_test_split\n", 113 | "from sklearn.preprocessing import StandardScaler\n", 114 | "from sklearn.neighbors import KNeighborsClassifier\n", 115 | "from sklearn.metrics import accuracy_score\n", 116 | "\n", 117 | "# 1) Toy 2D dataset\n", 118 | "X, y = make_classification(n_samples=400, n_features=2, n_redundant=0,\n", 119 | " n_informative=2, n_clusters_per_class=1, random_state=7)\n", 120 | "Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.25, random_state=7)\n", 121 | "\n", 122 | "# 2) Scale for distance-based methods\n", 123 | "scaler = StandardScaler().fit(Xtr)\n", 124 | "Xtr = scaler.transform(Xtr)\n", 125 | "Xte = scaler.transform(Xte)\n", 126 | "\n", 127 | "# 3) Try a few k values (distance-weighted)\n", 128 | "for k in [1, 3, 5, 11, 21]:\n", 129 | " clf = KNeighborsClassifier(n_neighbors=k, weights=\"distance\", metric=\"euclidean\")\n", 130 | " clf.fit(Xtr, ytr)\n", 131 | " acc = accuracy_score(yte, clf.predict(Xte))\n", 132 | " print(f\"k={k:>2} acc={acc:.3f}\")" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "47cefdd1", 138 | "metadata": {}, 139 | "source": [ 140 | "**From-scratch (core idea):**" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "dc7cb8af", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "import numpy as np\n", 151 | "\n", 152 | "def knn_predict(Xtr, ytr, x, k=5):\n", 153 | " d = ((Xtr - x)**2).sum(axis=1)**0.5 # Euclidean\n", 154 | " idx = np.argpartition(d, k)[:k] # top-k neighbors (unordered)\n", 155 | " votes = np.bincount(ytr[idx])\n", 156 | " return votes.argmax()" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "id": "6164d341", 162 | "metadata": {}, 163 | "source": [ 164 | "---\n", 165 | "\n", 166 | "## 5) K-Means — the core idea (intuition)\n", 167 | "\n", 168 | "Choose **K** cluster centers (centroids) so that points are as close as possible to the centroid of the cluster they’re assigned to. The loss being minimized is the sum of **squared** distances to the nearest centroid, which naturally makes the mean the best representative for each cluster and favors compact, roughly spherical clusters.\n", 169 | "\n", 170 | "> **Goal:** group similar points into **K** clusters. \n", 171 | "> Each cluster has a center (the **mean**), and points belong to their nearest center.\n", 172 | "\n", 173 | "**Plain-English objective:** place $K $ centers so points are as close as possible (on average, squared) to their assigned center.\n", 174 | "\n", 175 | "**Mathematically, with clusters $C_k$ and centroids $\\mu_k$:**\n", 176 | "\n", 177 | "$$\n", 178 | "\\min_{\\{C_k\\},\\{\\mu_k\\}} \\sum_{k=1}^{K} \\sum_{x\\in C_k} \\lVert x-\\mu_k\\rVert_2^2\n", 179 | "$$\n", 180 | "\n", 181 | "and the centroid update\n", 182 | "\n", 183 | "$$\n", 184 | "\\mu_k \\;=\\; \\frac{1}{\\lvert C_k\\rvert}\\sum_{x\\in C_k} x\\;.\n", 185 | "$$\n", 186 | "\n", 187 | "---\n", 188 | "\n", 189 | "## 6) Lloyd’s algorithm (the standard K-Means loop)\n", 190 | "\n", 191 | "Two repeated steps drive the method:\n", 192 | "\n", 193 | "1. **assignment** — give each point to its nearest centroid.\n", 194 | "2. 
**update** — move each centroid to the mean of its assigned points. Each step never increases the objective, so the loop converges to a local optimum. Initialization matters; using **k‑means++** spreads starting centroids apart and improves results. If a cluster becomes empty, re‑seed that centroid (e.g., pick a faraway point).\n", 195 | "\n", 196 | "Repeat until nothing changes.\n", 197 | "\n", 198 | "Each iteration never increases the objective → **converges** to a local optimum.\n", 199 | "\n", 200 | "**Good practice**\n", 201 | "\n", 202 | "- **k-means++** initialization (smart starting centers)\n", 203 | "- **Multiple restarts** (keep the best run)\n", 204 | "- **Scale features** before clustering\n", 205 | "- Handle empty clusters by re-seeding (e.g., to a farthest point)\n", 206 | "\n", 207 | "---\n", 208 | "\n", 209 | "## 7) Picking **K** (how many clusters?)\n", 210 | "\n", 211 | "- **Elbow method:** plot $K$ vs **inertia** (sum of squared distances to centers). \n", 212 | " Look for the “bend” where adding clusters gives diminishing returns.\n", 213 | "- **Silhouette score** ($-1 \\ldots 1$): higher is better. Rough guide: \n", 214 | " $0.5+$ good separation, $\\sim 0.25$ weak structure.\n", 215 | "\n", 216 | "---\n", 217 | "\n", 218 | "## 8) K-Means — minimal code (easy to try)\n", 219 | "\n", 220 | "Scan a few K values, compute inertia and silhouette for each, and then pick a reasonable K. Use `init=\"k-means++\"` and multiple `n_init` runs to avoid poor local optima. After selecting K, fit once more and evaluate." 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "id": "28510354", 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "import numpy as np\n", 231 | "from sklearn.datasets import make_blobs\n", 232 | "from sklearn.preprocessing import StandardScaler\n", 233 | "from sklearn.cluster import KMeans\n", 234 | "from sklearn.metrics import silhouette_score\n", 235 | "\n", 236 | "# 1) Synthetic data with 4 clusters\n", 237 | "X, _, _ = make_blobs(n_samples=600, centers=4, cluster_std=1.2, random_state=42)\n", 238 | "X = StandardScaler().fit_transform(X)\n", 239 | "\n", 240 | "# 2) Try several K to compute inertia and silhouette\n", 241 | "Ks = range(2, 9)\n", 242 | "inertias, sils = [], []\n", 243 | "for K in Ks:\n", 244 | " km = KMeans(n_clusters=K, n_init=10, init=\"k-means++\", random_state=42).fit(X)\n", 245 | " inertias.append(km.inertia_)\n", 246 | " sils.append(silhouette_score(X, km.labels_))\n", 247 | "\n", 248 | "print(\"K:\", list(Ks))\n", 249 | "print(\"Inertia:\", [round(v, 1) for v in inertias])\n", 250 | "print(\"Silhouette:\", [round(v, 3) for v in sils])\n", 251 | "\n", 252 | "# 3) Pick K (say best-looking), then fit once more\n", 253 | "bestK = 4\n", 254 | "km = KMeans(n_clusters=bestK, n_init=10, init=\"k-means++\", random_state=42).fit(X)\n", 255 | "print(\"Final silhouette at K=4:\", round(silhouette_score(X, km.labels_), 3))" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "id": "5c3088e7", 261 | "metadata": {}, 262 | "source": [ 263 | "**From-scratch (one iteration):**" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "id": "36aa578a", 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "import numpy as np\n", 274 | "\n", 275 | "def kmeans_step(X, centers):\n", 276 | " # Assign\n", 277 | " d = np.stack([np.linalg.norm(X - c, axis=1) for c in centers], axis=1)\n", 278 | " labels = d.argmin(axis=1)\n", 279 | " # Update\n", 280 | " 
new_centers = np.array([X[labels == k].mean(axis=0) for k in range(centers.shape[0])])\n", 281 | " return labels, new_centers" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "id": "14451fe4", 287 | "metadata": {}, 288 | "source": [ 289 | "---\n", 290 | "\n", 291 | "## 9) kNN vs K-Means — quick cheat sheet\n", 292 | "\n", 293 | "Use **kNN** when you have labels and want predictions; use **K‑Means** when you don’t have labels and want segments. kNN has almost no training cost but can be slow at prediction time; K‑Means costs more to fit but predicts fast by checking the nearest centroid. Scaling is important for both.\n", 294 | "\n", 295 | "| Aspect | kNN (Supervised) | K-Means (Unsupervised) |\n", 296 | "| ------------ | -------------------------------------------- | ------------------------------------------- |\n", 297 | "| Purpose | Predict labels/values from neighbors | Discover groups without labels |\n", 298 | "| “Training” | Just store data | Learn K centers |\n", 299 | "| Compute cost | Cheap train, slower predict (need neighbors) | Fast predict, cost during fitting |\n", 300 | "| Key choices | **k**, distance, weighting, scaling | **K**, init (k-means++), scaling, restarts |\n", 301 | "| When to use | Strong local patterns, simple baseline | Quick segmentation, preprocessing, insights |\n", 302 | "\n", 303 | "---\n", 304 | "\n", 305 | "## 10) Checklists, FAQs, and practice\n", 306 | "\n", 307 | "kNN checklist: scale features; tune **k** with validation; consider distance‑weighted voting; watch for class imbalance.\n", 308 | "\n", 309 | "K‑Means checklist: scale features; use k‑means++; try several K and compare inertia and silhouette; handle empty clusters robustly.\n", 310 | "\n", 311 | "Practice ideas: try kNN on Iris or a small MNIST subset with various k; for K‑Means, sweep K=2…10 and plot inertia and silhouette; as a speed‑up experiment, cluster first, then run kNN within each cluster and compare runtime/accuracy to vanilla kNN.\n", 312 | "\n", 313 | "**kNN checklist**\n", 314 | "\n", 315 | "- [ ] Scale features\n", 316 | "- [ ] Pick **k** via validation\n", 317 | "- [ ] Consider distance-weighted vote\n", 318 | "- [ ] Beware class imbalance\n", 319 | "\n", 320 | "**K-Means checklist**\n", 321 | "\n", 322 | "- [ ] Scale features\n", 323 | "- [ ] Use **k-means++** + multiple **n_init**\n", 324 | "- [ ] Pick **K** via elbow/silhouette\n", 325 | "- [ ] Watch for empty clusters (reseed)\n", 326 | "\n", 327 | "**FAQs**\n", 328 | "\n", 329 | "- _“kNN is slow at prediction.”_ → Use KD-Tree/Ball-Tree/ANN, or pre-cluster and search within cluster.\n", 330 | "- _“My clusters look weird.”_ → Scale features; try different K; check for non-spherical structure (K-Means likes spherical blobs).\n", 331 | "- _“Do I need labels for K-Means?”_ → No — it’s unsupervised.\n", 332 | "\n", 333 | "**Practice**\n", 334 | "\n", 335 | "1. On a real dataset (e.g., Iris or a MNIST subset), tune **k** for kNN and report accuracy.\n", 336 | "2. Run K-Means with $K = 2,\\dots,10$; plot inertia and silhouette; choose $K$ and visualize clusters.\n", 337 | "3. 
Hybrid: cluster first (K-Means), then run kNN **within each cluster** — compare speed/accuracy.\n", 338 | "\n", 339 | "---\n", 340 | "\n", 341 | "## Summary\n", 342 | "\n", 343 | "kNN: predict using the answers from your **k nearest labeled neighbors**.\n", 344 | "\n", 345 | "K‑Means: place **K centroids** and alternate between assigning points to the nearest centroid and recomputing centroids until stable.\n", 346 | "\n", 347 | "You learned:\n", 348 | "\n", 349 | "- **kNN**: how neighbors + distances make predictions; how **k** and scaling affect results.\n", 350 | "- **K-Means**: how Lloyd’s algorithm works; how to pick **K** and get stable clusters.\n", 351 | "- Minimal code to try both methods today.\n", 352 | "\n", 353 | "**Next:** Support vector machines" 354 | ] 355 | } 356 | ], 357 | "metadata": { 358 | "jupytext": { 359 | "cell_metadata_filter": "-all", 360 | "main_language": "python", 361 | "notebook_metadata_filter": "-all" 362 | }, 363 | "language_info": { 364 | "name": "python" 365 | } 366 | }, 367 | "nbformat": 4, 368 | "nbformat_minor": 5 369 | } 370 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.4. Convolutional Layers, Pooling Layers, Convolutional Neural Network/CNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3fc7e788-61f8-475b-87d6-78f2601e4b45", 6 | "metadata": {}, 7 | "source": [ 8 | "# Seeing the world - Convolutional Neural Networks\n", 9 | "\n", 10 | "* Gordon.H | SHSID Data Science Group \n" , 11 | "\n", 12 | "*Welcome back to the course, Junior ML Engineers !*\n", 13 | "\n", 14 | "Today we will be learning about the ultimate solution for image processing, **Convulutional Neural Netorks**\n", 15 | "\n", 16 | "---\n", 17 | "### Requirements \n", 18 | "* Understanding of the fundamentals of Machine learning\n", 19 | "* Basic Knowledge of Neural Networks\n", 20 | "* Basic Python and Numpy Library Usage\n", 21 | "\n", 22 | "---\n", 23 | "### 1. Problem with Images\n", 24 | "\n", 25 | "You have a small gray scale image of size 28*28 pixels\n", 26 | "* Size = 28*28 = 784\n", 27 | "* To feed it into a dense layer, we flatten into a vector of **784** numbers.\n", 28 | "* If first layer has 128 neurons we will need **100,352** weights\n", 29 | "\n", 30 | "This is a huge problem because:\n", 31 | "* It is inefficient with such large parameters to train\n", 32 | "* Spatial information is lost when we flatten the image\n", 33 | "\n", 34 | "Now, CNN's are designed to solve the problem with a smart approach\n", 35 | "\n", 36 | "```mermaid\n", 37 | "graph TD\n", 38 | " subgraph Dense Network Approach\n", 39 | " A[Input Image
28x28x1] --> B{Flatten};\n", 40 | " B --> C[1D Vector
784 neurons];\n", 41 | " C --> D[Dense Layer];\n", 42 | " style A fill:#f9f,stroke:#333,stroke-width:2px\n", 43 | " end\n", 44 | " \n", 45 | " subgraph CNN Approach\n", 46 | " E[Input Image
28x28x1] --> F{Convolutional Layer};\n", 47 | " F --> G[Feature Map
e.g., 26x26x32];\n", 48 | " style E fill:#9cf,stroke:#333,stroke-width:2px\n", 49 | " end\n", 50 | "\n", 51 | " A -- \"Loses spatial structure\" --> C\n", 52 | " E -- \"Preserves spatial structure\" --> G\n", 53 | "```\n", 54 | "As you see, CNN keeps the image's 2D structure, allowing it to learn from pixel neighborhoods.\n", 55 | "\n", 56 | "---\n", 57 | "\n", 58 | "### 2. The core of CNN : Convolutional Layer\n", 59 | "\n", 60 | "Instead of looking at a large image at once, CNN looks at it in small chunks. \n", 61 | "\n", 62 | "A **filter** is a small matrix of weights that the network learns. The process of sliding the filter and computing the output is called a **convolution**.\n", 63 | "\n", 64 | "Here's a mini-example of a 2x2 filter sliding over a 3x3 image to produce a 2x2 feature map.\n", 65 | "\n", 66 | "```\n", 67 | "Input Image (I) Filter (K) Feature Map (O)\n", 68 | "+---+---+---+ +---+---+ +---+---+\n", 69 | "| 1 | 5 | 2 | | 1 | 0 | | 9 | ? |\n", 70 | "+---+---+---+ +---+---+ +---+---+\n", 71 | "| 8 | 1 | 6 | | 1 | 0 | | ? | ? |\n", 72 | "+---+---+---+ +---+---+ +---+---+\n", 73 | "| 3 | 4 | 7 |\n", 74 | "+---+---+---+\n", 75 | "```\n", 76 | "To calculate the top-left value of the output: `(1*1) + (5*0) + (8*1) + (1*0) = 9`.\n", 77 | "\n", 78 | "#### The Mathematical Logic\n", 79 | "\n", 80 | "The mathematical formula for such operation, **cross-correlation**, looks like this:\n", 81 | "$$\n", 82 | "O_{i,j} = b + \\sum_{u=0}^{F-1} \\sum_{v=0}^{F-1} I_{i+u, j+v} \\cdot K_{u,v}\n", 83 | "$$\n", 84 | "\n", 85 | "Looks complicated right? Lets break it down:\n", 86 | "\n", 87 | "* $O_{i,j}$: The output value at row `i`, column `j` in the feature map.\n", 88 | "\n", 89 | "* $b$: A learnable **bias** term, which helps the filter make better predictions.\n", 90 | "\n", 91 | "* $\\sum$: The \"sum\" symbol. We sum over the filter's dimensions (`u` and `v`).\n", 92 | "\n", 93 | "* $I_{i+u, j+v}$: A pixel value from the **Input** image patch.\n", 94 | "\n", 95 | "* $K_{u,v}$: A weight from our **Kernel** (aka **the filter**).\n", 96 | "\n", 97 | "This formula is a precise mathematical formula for cross correlation in Machine Learning, in mathematics convolution is a bit different, it involves flipping the filter (both horizontally and vertically) before sliding it over the image. The reason for such difference is due to the special nature of neural networks, the values in the filter are learned during training, the network can simply learn the flipped version of the filter if it needs to. The cross correlation is easier to implement.\n", 98 | "\n", 99 | "#### Hyperparameters and Output Size\n", 100 | "The size of our output feature map depends on the hyperparameters we choose. 
The output width ($W_{out}$) and height ($H_{out}$) can be calculated with this formula:\n", 101 | "\n", 102 | "$$\n", 103 | "W_{out} = \\frac{W_{in} - F + 2P}{S} + 1\n", 104 | "$$\n", 105 | "$$\n", 106 | "H_{out} = \\frac{H_{in} - F + 2P}{S} + 1\n", 107 | "$$\n", 108 | "\n", 109 | "Where:\n", 110 | "* $W_{in}, H_{in}$: Input width and height.\n", 111 | "* $F$: Filter size.\n", 112 | "* $P$: Padding (number of pixels added to the border).\n", 113 | "* $S$: Stride (how many pixels the filter slides at a time).\n", 114 | "\n", 115 | "#### Example Code\n", 116 | " *Note: You can run the following code locally to try out convolutional layers!*\n", 117 | "\n", 118 | "\n", 119 | "\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "id": "73bcea1c-0c71-4fbe-9001-e63093a3fa01", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "# Remember to use pip to install numpy and matplotlib!\n", 130 | "import numpy as np\n", 131 | "import matplotlib.pyplot as plt\n", 132 | "\n", 133 | "# 1. Define the Input and Filter\n", 134 | "# A simple 6x6 grayscale image. \n", 135 | "# It has a sharp vertical edge down the middle.\n", 136 | "# (Low values = dark, high values = light)\n", 137 | "input_image = np.array([\n", 138 | " [10, 10, 10, 100, 100, 100],\n", 139 | " [10, 10, 10, 100, 100, 100],\n", 140 | " [10, 10, 10, 100, 100, 100],\n", 141 | " [10, 10, 10, 100, 100, 100],\n", 142 | " [10, 10, 10, 100, 100, 100],\n", 143 | " [10, 10, 10, 100, 100, 100]\n", 144 | "])\n", 145 | "\n", 146 | "# A 3x3 filter designed to detect vertical edges.\n", 147 | "# The positive values on the left and negative on the right\n", 148 | "# will give a high response when we move from dark to light.\n", 149 | "vertical_edge_filter = np.array([\n", 150 | " [ 1, 0, -1],\n", 151 | " [ 2, 0, -2], # This is a \"Sobel\" filter, common in image processing\n", 152 | " [ 1, 0, -1]\n", 153 | "])\n", 154 | "\n", 155 | "# 2. 
The Convolution Operation\n", 156 | "# Get dimensions (assuming no padding, stride=1)\n", 157 | "img_h, img_w = input_image.shape\n", 158 | "filter_h, filter_w = vertical_edge_filter.shape\n", 159 | "out_h = (img_h - filter_h) + 1\n", 160 | "out_w = (img_w - filter_w) + 1\n", 161 | "\n", 162 | "# Create an empty feature map to store the output\n", 163 | "output_feature_map = np.zeros((out_h, out_w))\n", 164 | "\n", 165 | "# Slide filter over the image\n", 166 | "for y in range(out_h):\n", 167 | " for x in range(out_w):\n", 168 | " # Get current patch of the image\n", 169 | " image_patch = input_image[y : y + filter_h, x : x + filter_w]\n", 170 | " \n", 171 | " # Perform element-wise multiplication and sum the result\n", 172 | " # This is the core of the convolution!\n", 173 | " convolution_sum = np.sum(image_patch * vertical_edge_filter)\n", 174 | " \n", 175 | " # Store result in the map\n", 176 | " output_feature_map[y, x] = convolution_sum \n", 177 | "# 3.Display Results\n", 178 | "print(\"--- Manual NumPy Convolution ---\\n\")\n", 179 | "print(\"Input Image:\\n\", input_image)\n", 180 | "print(\"\\nVertical Edge Filter:\\n\", vertical_edge_filter)\n", 181 | "print(\"\\nOutput Feature Map:\\n\", output_feature_map)\n", 182 | "print(\"\\nNotice the high values in the output where the vertical edge was detected!\")\n", 183 | "# Visualize the images\n", 184 | "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))\n", 185 | "ax1.imshow(input_image, cmap='gray')\n", 186 | "ax1.set_title(\"Original Image\")\n", 187 | "ax2.imshow(output_feature_map, cmap='gray')\n", 188 | "ax2.set_title(\"Feature Map (Edges)\")\n", 189 | "plt.show()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "id": "e70522ea-6596-413f-aadf-e37042a51b87", 195 | "metadata": {}, 196 | "source": [ 197 | "---\n", 198 | "### 3. Making it Robust: The Pooling layer\n", 199 | "\n", 200 | "A Pooling layer shrinks the feature map to make the network faster and robust. 
The most common type of pooling is **Max Pooling**.\n", 201 | "\n", 202 | "#### Visualizing Max Pooling\n", 203 | "\n", 204 | "Imagine a 2x2 Max Pooling operation on a 4x4 feature map.\n", 205 | "\n", 206 | "```\n", 207 | "Feature Map Pooled Output\n", 208 | "+---+---+---+---+ +---+---+\n", 209 | "|12 |20 | 30| 0 | max(12,20,8,12)--> |20 |\n", 210 | "+---+---+---+---+ +---+---+\n", 211 | "| 8 |12 | 2 | 0 | max(30,0,2,0)--> |30 |\n", 212 | "+---+---+---+---+ +---+---+\n", 213 | "|34 |70 | 37| 4 | max(34,70,112,100)-->|112|\n", 214 | "+---+---+---+---+ +---+---+\n", 215 | "|112|100| 25| 12| max(37,4,25,12)--> |37 |\n", 216 | "+---+---+---+---+ +---+---+\n", 217 | "```\n", 218 | "This keeps only the strongest signal, reducing the map size from 4x4 to 2x2.\n", 219 | "\n", 220 | "#### The Math Behind Pooling\n", 221 | "\n", 222 | "Here is the formula for Max Pooling:\n", 223 | "\n", 224 | "$$\n", 225 | "P_{i,j} = \\max_{0 \\le u < F_p, 0 \\le v < F_p} A_{i \\cdot S_p + u, j \\cdot S_p + v}\n", 226 | "$$\n", 227 | "\n", 228 | "This formally states: \"The output $P_{i,j}$ is the `max` value from the input feature map `A` within the pooling window.\"\n", 229 | "\n", 230 | "---\n", 231 | "### Putting it all together: A full CNN Architecture\n", 232 | "\n", 233 | "A real world CNN stacks up all these layers\n", 234 | "\n", 235 | "``` mermaid\n", 236 | "graph LR\n", 237 | " A[\"Input Image (28x28x1)\"] --> B[\"Conv2D Layer\\n32 filters, 3x3\\nOutput: 26x26x32\"]\n", 238 | " B --> C[\"MaxPooling2D\\n2x2 window\\nOutput: 13x13x32\"]\n", 239 | " C --> D[\"Conv2D Layer\\n64 filters, 3x3\\nOutput: 11x11x64\"]\n", 240 | " D --> E[\"MaxPooling2D\\n2x2 window\\nOutput: 5x5x64\"]\n", 241 | " E --> F[\"Flatten Layer\\nOutput: 1600 nodes\"]\n", 242 | " F --> G[\"Dense Layer\\n128 nodes\"]\n", 243 | " G --> H[\"Output Layer\\n10 nodes (Softmax)\"]\n", 244 | "\n", 245 | " subgraph Feature Extractor\n", 246 | " B; C; D; E;\n", 247 | " end\n", 248 | "\n", 249 | " subgraph Classifier\n", 250 | " F; G; H;\n", 251 | " end\n", 252 | "```\n", 253 | "The final layer uses a **Softmax** activation function to convert the network's scores into a probability distribution.\n", 254 | "\n", 255 | "The **Softmax** function for a specific output class `i` is:\n", 256 | "\n", 257 | "$$\n", 258 | "\\sigma(\\mathbf{z})_i = \\frac{e^{z_i}}{\\sum_{j=1}^{C} e^{z_j}}\n", 259 | "$$\n", 260 | "\n", 261 | "The formula gurantees that all output values are between 0 to 1 and sums up to be 1. This allows us to treat them as the model's confidence for each class.\n", 262 | "\n", 263 | "---\n", 264 | "\n", 265 | "### 5. Coding Example: Full Functional CNN Architecture\n", 266 | "\n", 267 | "The following example uses Pytorch and Matplotlib to create an example CNN architecture." 
268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "id": "a426d453-c908-4108-8af4-b896b9a8e95a", 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "import torch\n", 278 | "import torch.nn as nn\n", 279 | "import torch.optim as optim\n", 280 | "from torchvision import datasets, transforms\n", 281 | "from torch.utils.data import DataLoader\n", 282 | "import matplotlib.pyplot as plt\n", 283 | "\n", 284 | "# Define the CNN architecture\n", 285 | "class MNIST_CNN(nn.Module):\n", 286 | " def __init__(self):\n", 287 | " super(MNIST_CNN, self).__init__()\n", 288 | " # Feature extractor\n", 289 | " self.features = nn.Sequential(\n", 290 | " nn.Conv2d(1, 32, kernel_size=3), # 28x28x1 -> 26x26x32\n", 291 | " nn.ReLU(),\n", 292 | " nn.MaxPool2d(2), # 26x26x32 -> 13x13x32\n", 293 | " nn.Conv2d(32, 64, kernel_size=3), # 13x13x32 -> 11x11x64\n", 294 | " nn.ReLU(),\n", 295 | " nn.MaxPool2d(2) # 11x11x64 -> 5x5x64\n", 296 | " )\n", 297 | " \n", 298 | " # Classifier\n", 299 | " self.classifier = nn.Sequential(\n", 300 | " nn.Flatten(), # 5x5x64 -> 1600\n", 301 | " nn.Linear(5*5*64, 128), # 1600 -> 128\n", 302 | " nn.ReLU(),\n", 303 | " nn.Linear(128, 10) # 128 -> 10\n", 304 | " )\n", 305 | " \n", 306 | " def forward(self, x):\n", 307 | " x = self.features(x)\n", 308 | " x = self.classifier(x)\n", 309 | " return x\n", 310 | "\n", 311 | "# Initialize model, loss function, and optimizer\n", 312 | "model = MNIST_CNN()\n", 313 | "criterion = nn.CrossEntropyLoss()\n", 314 | "optimizer = optim.Adam(model.parameters(), lr=0.001)\n", 315 | "\n", 316 | "# Load MNIST data\n", 317 | "transform = transforms.Compose([\n", 318 | " transforms.ToTensor(),\n", 319 | " transforms.Normalize((0.1307,), (0.3081,))\n", 320 | "])\n", 321 | "\n", 322 | "train_data = datasets.MNIST('./data', train=True, download=True, transform=transform)\n", 323 | "test_data = datasets.MNIST('./data', train=False, transform=transform)\n", 324 | "\n", 325 | "train_loader = DataLoader(train_data, batch_size=64, shuffle=True)\n", 326 | "test_loader = DataLoader(test_data, batch_size=1000)\n", 327 | "\n", 328 | "# Training function\n", 329 | "def train(model, device, train_loader, optimizer, epoch):\n", 330 | " model.train()\n", 331 | " for batch_idx, (data, target) in enumerate(train_loader):\n", 332 | " optimizer.zero_grad()\n", 333 | " output = model(data)\n", 334 | " loss = criterion(output, target)\n", 335 | " loss.backward()\n", 336 | " optimizer.step()\n", 337 | " \n", 338 | " if batch_idx % 100 == 0:\n", 339 | " print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} '\n", 340 | " f'({100. * batch_idx / len(train_loader):.0f}%)]\\tLoss: {loss.item():.6f}')\n", 341 | "\n", 342 | "# Test function\n", 343 | "def test(model, device, test_loader):\n", 344 | " model.eval()\n", 345 | " test_loss = 0\n", 346 | " correct = 0\n", 347 | " with torch.no_grad():\n", 348 | " for data, target in test_loader:\n", 349 | " output = model(data)\n", 350 | " test_loss += criterion(output, target).item()\n", 351 | " pred = output.argmax(dim=1, keepdim=True)\n", 352 | " correct += pred.eq(target.view_as(pred)).sum().item()\n", 353 | " \n", 354 | " test_loss /= len(test_loader)\n", 355 | " accuracy = 100. 
* correct / len(test_loader.dataset)\n", 356 | " print(f'\\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} '\n", 357 | " f'({accuracy:.0f}%)\\n')\n", 358 | " return accuracy\n", 359 | "\n", 360 | "# Training loop\n", 361 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 362 | "model.to(device)\n", 363 | "\n", 364 | "accuracies = []\n", 365 | "for epoch in range(1, 6): # 5 epochs\n", 366 | " train(model, device, train_loader, optimizer, epoch)\n", 367 | " acc = test(model, device, test_loader)\n", 368 | " accuracies.append(acc)\n", 369 | "\n", 370 | "# Plot accuracy\n", 371 | "plt.plot(range(1, 6), accuracies)\n", 372 | "plt.title('Model Accuracy')\n", 373 | "plt.xlabel('Epoch')\n", 374 | "plt.ylabel('Accuracy (%)')\n", 375 | "plt.show()\n", 376 | "\n", 377 | "# Save model\n", 378 | "torch.save(model.state_dict(), 'mnist_cnn.pth')" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "id": "12cd21d0-27ed-41c3-81a0-6fb694aeab29", 384 | "metadata": {}, 385 | "source": [ 386 | "### Summary & Conclusion\n", 387 | "\n", 388 | "**Congratulations!** You have just completed your lesson on Convolutional Neural Networks!\n", 389 | "\n", 390 | "Throughout this lesson you have learned:\n", 391 | "\n", 392 | "* How **Convolutional Layers** use filters to find features, and you've seen the formal math behind the process.\n", 393 | "* How **Pooling Layers** make the network robust and efficient.\n", 394 | "* Understanded the **CNN** architecture and has saw the code to build it.\n", 395 | "\n", 396 | "In the next lesson, we will learn about video data augmentation." 397 | ] 398 | } 399 | ], 400 | "metadata": { 401 | "kernelspec": { 402 | "display_name": "Python 3 (ipykernel)", 403 | "language": "python", 404 | "name": "python3" 405 | }, 406 | "language_info": { 407 | "codemirror_mode": { 408 | "name": "ipython", 409 | "version": 3 410 | }, 411 | "file_extension": ".py", 412 | "mimetype": "text/x-python", 413 | "name": "python", 414 | "nbconvert_exporter": "python", 415 | "pygments_lexer": "ipython3", 416 | "version": "3.12.7" 417 | } 418 | }, 419 | "nbformat": 4, 420 | "nbformat_minor": 5 421 | } 422 | --------------------------------------------------------------------------------