├── 3. PyTorch ├── 3.1. Tensors │ └── gitkeep ├── 3.3. Devices │ ├── gitkeep │ └── device.md ├── 3.4. Modules │ ├── gitkeep │ └── module.md ├── 3.5. Datasets │ ├── gitkeep │ └── Datasets.md ├── 3.7. Losses │ └── gitkeep ├── 3.6. Dataloader │ └── gitkeep ├── 3.8. Optimizers │ └── gitkeep └── 3.2. Autograd │ ├── gd.png │ └── autograd.md ├── 5. Transformers ├── 5.2. Inference │ └── gitkeep ├── 5.5. BERT, T5, GPT │ └── gitkeep ├── 5.4. Batch Processing │ └── gitkeep ├── 5.3. Training, Pre-Training, Fine-Tuning │ └── gitkeep └── 5.1. Self-Attention, Cross-Attention, Masked Self-Attention, Layer Normalization, Word Embedding, Positional Encoding │ └── gitkeep ├── 7. OpenCV & Generative AI ├── 7.3. UNet │ └── gitkeep ├── 7.1. Object Detection │ └── gitkeep ├── 7.4. Autoencoder, Variational Autoencoder │ └── gitkeep ├── 7.5. Generative Adversarial Network, Adversarial Attack │ └── gitkeep └── 7.8. Stable Diffusion, Denoising Diffusion Probabilistic Methods │ └── gitkeep ├── 2. Machine Learning Generics ├── 2.7. Boosting │ └── gitkeep ├── 2.0. Machine Learning Terminology │ └── gitkeep ├── 2.5. Decision Trees, Random Forests │ └── gitkeep ├── 2.1. Linear Regression & Logistic Regression │ ├── gitkeep │ └── Linear & Logistic Regression.md ├── 2.3. Regularization, Bias-Variance Trade-Off, Kernel Methods, Cross Validation │ ├── KernelTrick.png │ ├── Cross Validation.md │ └── Regularization, Bias–Variance Trade-off, Kernel Methods.md ├── 2.4. Principal Component Analysis, Dimensionality Reduction │ ├── PCA.md │ └── Pizza.csv ├── 2.1. Support Vector Machines │ └── SVM.ipynb └── 2.5. K-Nearest Neighbors, Clustering K-Means │ └── KNN, Kmeans.ipynb ├── 4. Deep Learning & Computer Vision ├── 4.6. VGG │ └── gitkeep ├── 4.7. ResNet │ ├── gitkeep │ ├── resnet1.png │ ├── resnet2.png │ ├── resnet3.png │ └── resnet.ipynb ├── 4.8. GoogLeNet │ └── gitkeep ├── 4.9. Transfer Learning │ └── gitkeep ├── 4.5. Image Data Augmentation │ └── gitkeep ├── 4.1. Forward Propagation, Activation Functions, Linear Layer │ ├── gitkeep │ └── 4.1 forward propagation + Activation functions + Linear Layer.md ├── 4.3. Parameter Initialization, Batch Normalization, Dropout │ └── gitkeep ├── 4.4. Convolutional Layers, Pooling Layers, Convolutional Neural Network │ ├── CNN.md │ └── CNN.ipynb └── 4.2. Backpropagation, Gradient Descent, Adaptive Moment Estimation │ └── Backpropagation.md ├── 0. Prerequisites ├── 0.2. Python For AI │ ├── 0.2.2. NumPy │ │ └── gitkeep │ ├── 0.2.3. Pandas │ │ └── gitkeep │ ├── 0.2.5. Seaborn │ │ └── gitkeep │ ├── 0.2.4. Matplotlib │ │ └── gitkeep │ └── 0.2.1. Advanced Python Techniques For AI │ │ └── gitkeep └── 0.1. Basic Environment For Python │ └── 0.1 Setup.md ├── 1. Mathematical Methods For AI ├── 1.2. Calculus │ ├── 1.2.3. Chain Rule │ │ └── gitkeep │ ├── 1.2.1. Single-Variable Derivatives │ │ └── gitkeep │ └── 1.2.2. Multi-Variable Derivatives & Gradients │ │ └── gitkeep ├── 1.4. Convex Optimization │ ├── 1.4.3. Duality │ │ └── gitkeep │ ├── 1.4.1. Convexity │ │ └── gitkeep │ └── 1.4.2. Gradient Descent │ │ └── gitkeep ├── 1.3. Probability & Statistics │ ├── 1.3.3. Mean │ │ └── gitkeep │ ├── 1.3.5. Bayes' Rule │ │ └── gitkeep │ ├── 1.3.1. Discrete Distributions │ │ └── gitkeep │ ├── 1.3.4. Variance, Covariance │ │ └── gitkeep │ └── 1.3.2. Continuous Distributions │ │ └── gitkeep └── 1.1. Linear Algebra │ └── Linear Algebra.md ├── 6. Natural Language Processing & Graph Neural Networks ├── 6.7. Vision Transformers │ └── gitkeep ├── 6.2. Word Embedding Methods │ └── gitkeep ├── 6.6. 
Graph Convolutional Networks │ └── gitkeep ├── 6.5. Message-Passing Neural Networks │ └── gitkeep ├── 6.1. Character, Subword & Word Tokenization │ └── gitkeep ├── 6.4. Encoder-Only & Decoder-Only Transformers │ └── gitkeep └── 6.3. Skip-Gram, Continuous Word Bag, Global Vectors │ └── gitkeep ├── LICENSE └── README.md /3. PyTorch/3.1. Tensors/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3. PyTorch/3.3. Devices/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3. PyTorch/3.4. Modules/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3. PyTorch/3.5. Datasets/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3. PyTorch/3.7. Losses/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3. PyTorch/3.6. Dataloader/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3. PyTorch/3.8. Optimizers/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /5. Transformers/5.2. Inference/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /5. Transformers/5.5. BERT, T5, GPT/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /7. OpenCV & Generative AI/7.3. UNet/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /5. Transformers/5.4. Batch Processing/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.7. Boosting/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.6. VGG/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /0. Prerequisites/0.2. Python For AI/0.2.2. NumPy/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /0. Prerequisites/0.2. Python For AI/0.2.3. Pandas/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /0. Prerequisites/0.2. Python For AI/0.2.5. 
Seaborn/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.7. ResNet/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.8. GoogLeNet/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /7. OpenCV & Generative AI/7.1. Object Detection/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /0. Prerequisites/0.2. Python For AI/0.2.4. Matplotlib/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.9. Transfer Learning/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /5. Transformers/5.3. Training, Pre-Training, Fine-Tuning/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.2. Calculus/1.2.3. Chain Rule/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.0. Machine Learning Terminology/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.5. Decision Trees, Random Forests/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.5. Image Data Augmentation/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.4. Convex Optimization/1.4.3. Duality/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /7. OpenCV & Generative AI/7.4. Autoencoder, Variational Autoencoder/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.3. Probability & Statistics/1.3.3. Mean/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.4. Convex Optimization/1.4.1. Convexity/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.1. 
Linear Regression & Logistic Regression/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /0. Prerequisites/0.2. Python For AI/0.2.1. Advanced Python Techniques For AI/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.2. Calculus/1.2.1. Single-Variable Derivatives/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.3. Probability & Statistics/1.3.5. Bayes' Rule/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.4. Convex Optimization/1.4.2. Gradient Descent/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /6. Natural Language Processing & Graph Neural Networks/6.7. Vision Transformers/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /6. Natural Language Processing & Graph Neural Networks/6.2. Word Embedding Methods/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /7. OpenCV & Generative AI/7.5. Generative Adversarial Network, Adversarial Attack/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.2. Calculus/1.2.2. Multi-Variable Derivatives & Gradients/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.3. Probability & Statistics/1.3.1. Discrete Distributions/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.3. Probability & Statistics/1.3.4. Variance, Covariance/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /6. Natural Language Processing & Graph Neural Networks/6.6. Graph Convolutional Networks/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /7. OpenCV & Generative AI/7.8. Stable Diffusion, Denoising Diffusion Probabilistic Methods/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.3. Probability & Statistics/1.3.2. 
Continuous Distributions/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.1. Forward Propagation, Activation Functions, Linear Layer/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.3. Parameter Initialization, Batch Normalization, Dropout/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /6. Natural Language Processing & Graph Neural Networks/6.5. Message-Passing Neural Networks/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /6. Natural Language Processing & Graph Neural Networks/6.1. Character, Subword & Word Tokenization/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /6. Natural Language Processing & Graph Neural Networks/6.4. Encoder-Only & Decoder-Only Transformers/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /6. Natural Language Processing & Graph Neural Networks/6.3. Skip-Gram, Continuous Word Bag, Global Vectors/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3. PyTorch/3.2. Autograd/gd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHSID-Data-science-Club/Highschool_ML_Course/HEAD/3. PyTorch/3.2. Autograd/gd.png -------------------------------------------------------------------------------- /5. Transformers/5.1. Self-Attention, Cross-Attention, Masked Self-Attention, Layer Normalization, Word Embedding, Positional Encoding/gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.7. ResNet/resnet1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHSID-Data-science-Club/Highschool_ML_Course/HEAD/4. Deep Learning & Computer Vision/4.7. ResNet/resnet1.png -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.7. ResNet/resnet2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHSID-Data-science-Club/Highschool_ML_Course/HEAD/4. Deep Learning & Computer Vision/4.7. ResNet/resnet2.png -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.7. ResNet/resnet3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHSID-Data-science-Club/Highschool_ML_Course/HEAD/4. Deep Learning & Computer Vision/4.7. 
ResNet/resnet3.png -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.3. Regularization, Bias-Variance Trade-Off, Kernel Methods, Cross Validation/KernelTrick.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SHSID-Data-science-Club/Highschool_ML_Course/HEAD/2. Machine Learning Generics/2.3. Regularization, Bias-Variance Trade-Off, Kernel Methods, Cross Validation/KernelTrick.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 SHSID Data science Club 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /3. PyTorch/3.2. Autograd/autograd.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # Autograd 4 | ##### **Jerry Zhang** | SHSID Data Science Club 5 | 6 |
7 | 8 | ## Hmm? 9 | Recall from the previous chapters that a key concept in Machine Learning is Backpropagation: calculating derivatives of individual elements in tensors. Backpropagation is essential for calculating the gradients used in gradient descent. 10 | 11 |
12 | 13 |
14 | 15 | Calculating the gradients by hand is inefficient and unnecessary, which is where Autograd comes in. 16 | 17 | *Note that a gradient is a value derived from the derivatives; it's how you USE the derivatives. Changing how you calculate the gradient results in different optimizing behavior; more on this in the Optimizers chapter* 18 | 19 | ## Usage 20 | Autograd is PyTorch's auto-differentiation engine; that is, it traces a tensor's contribution to some result and calculates the gradient accordingly. 21 | 22 | ```python 23 | # 'requires_grad' turns on Autograd 24 | x = torch.tensor([1., 2., 3.], requires_grad=True) 25 | 26 | y = x**2 + 3*x + 5 27 | 28 | # Backpropagation 29 | # The tensor passed in is the gradient tensor 30 | # Since each individual element in the tensor can contribute to many output values, a weighted sum is taken 31 | # The gradient tensor holds the weights of the weighted sum 32 | y.backward(torch.tensor([1., 1., 1.])) 33 | 34 | # The resulting gradients are stored in their respective tensors 35 | x.grad # -> torch.tensor([5., 7., 9.]) 36 | 37 | # Resetting the gradients 38 | x.grad.zero_() 39 | # Usually though, you would not call grad.zero_() yourself; more on this in the optimizer section. 40 | ``` 41 | 42 | Sometimes, for example when you are simply evaluating your model, you don't want to track the gradients, as tracking affects performance. 43 | There are two ways to achieve this. 44 | ```python 45 | x = torch.tensor([1., 2., 3.], requires_grad=True) 46 | 47 | x_detached = x.detach() # Not recommended; returns a new tensor detached from the graph 48 | 49 | with torch.no_grad(): 50 |     # Recommended 51 |     # Only the code within this block won't be tracked 52 |     # In other words, it's a temporary detach 53 |     y = x**10 + 114 54 | ``` 55 | 56 | ## True Usage 57 | 58 | In most cases though, you will be using an optimizer to do the backpropagation; more on this in the Optimizers chapter. In that case, your code would resemble something like this. 59 | 60 | ```python 61 | # Run the model and compute the loss (here the model is assumed to return the loss directly; usually you would apply a loss function to its output) 62 | loss = model(input) 63 | # Backpropagating the contributions 64 | loss.backward() 65 | # Calculating the gradients and updating the weights 66 | optimizer.step() 67 | # Clearing the gradients 68 | optimizer.zero_grad() 69 | ``` 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SHSID Data Science Club 2 | 3 |
4 | 5 | International Olympiad of Artificial Intelligence Guide 6 | ===================== 7 | created by the SHSID Data Science Club 8 | 9 | --- 10 | 11 | A machine learning course designed for high school students, inspired by the USAAIO course outline provided by Beaver-Edge. 12 | 13 | **For the best experience and visualization, download this repository and run the `ipynb`!** 14 | 15 | --- 16 | 17 | # Table of Contents 18 | 19 | ### 0. Prerequisites 20 | * 0.1. Basic Environment For Python 21 | * **0.2. Python For AI** 22 | * 0.2.1. Advanced Python Techniques For AI 23 | * 0.2.2. NumPy 24 | * 0.2.3. Pandas 25 | * 0.2.4. Matplotlib 26 | * 0.2.5. Seaborn 27 | 28 | ### 1. Mathematical Methods For AI 29 | * 1.1. Linear Algebra 30 | * **1.2. Calculus** 31 | * 1.2.1. Single-Variable Derivatives 32 | * 1.2.2. Multi-Variable Derivatives & Gradients 33 | * 1.2.3. Chain Rule 34 | * **1.3. Probability & Statistics** 35 | * 1.3.1. Discrete Distributions 36 | * 1.3.2. Continuous Distributions 37 | * 1.3.3. Mean 38 | * 1.3.4. Variance, Covariance 39 | * 1.3.5. Bayes' Rule 40 | * **1.4. Convex Optimization** 41 | * 1.4.1. Convexity 42 | * 1.4.2. Gradient Descent 43 | * 1.4.3. Duality 44 | 45 | ### 2. Machine Learning Generics 46 | * 2.0. Machine Learning Terminology 47 | * 2.1. Linear Regression & Logistic Regression 48 | * 2.1. Support Vector Machines 49 | * 2.3. Regularization, Bias-Variance Trade-Off, Kernel Methods, Cross Validation 50 | * 2.4. Principal Component Analysis, Dimensionality Reduction 51 | * 2.5. Decision Trees, Random Forests 52 | * 2.5. K-Nearest Neighbors, Clustering K-Means 53 | * 2.7. Bagging & Boosting (XGBoost) 54 | 55 | ### 3. PyTorch 56 | * 3.1. Tensors 57 | * 3.2. Autograd 58 | * 3.3. Devices 59 | * 3.4. Modules 60 | * 3.5. Datasets 61 | * 3.6. Dataloader 62 | * 3.7. Losses 63 | * 3.8. Optimizers 64 | 65 | ### 4. Deep Learning & Computer Vision 66 | * 4.1. Forward Propagation, Activation Functions, Linear Layer 67 | * 4.2. Backpropagation, Gradient Descent, Adaptive Moment Estimation 68 | * 4.3. Parameter Initialization, Batch Normalization, Dropout 69 | * 4.4. Convolutional Layers, Pooling Layers, Convolutional Neural Network 70 | * 4.5. Image Data Augmentation 71 | * 4.6. VGG 72 | * 4.7. ResNet 73 | * 4.8. GoogLeNet 74 | * 4.9. Transfer Learning 75 | * 4.10. Recurrent Neural Networks 76 | * 4.11. Reinforcement Learning 77 | 78 | ### 5. Transformers 79 | * **5.1. Attention** 80 | * 5.1.1. Self-Attention 81 | * 5.1.2. Cross-Attention, Masked Self-Attention, Multi-Head Attention 82 | * 5.1.3. Layer Normalization, Word Embedding, Positional Encoding 83 | * 5.2. Inference 84 | * 5.3. Training, Pre-Training, Fine-Tuning 85 | * 5.4. Batch Processing 86 | * 5.5. BERT, T5, GPT 87 | * 5.6. RL with Human Feedback for LLMs 88 | * 5.7. Modern LLM Optimizers, Mixture-of-Experts, Retrieval Augmented Generation (DeepSeek Case Study) 89 | 90 | ### 6. Natural Language Processing & Graph Neural Networks 91 | * 6.1. Character, Subword & Word Tokenization 92 | * 6.2. Word Embedding Methods 93 | * 6.3. Skip-Gram, Continuous Word Bag, Global Vectors 94 | * 6.4. Encoder-Only & Decoder-Only Transformers 95 | * 6.5. Message-Passing Neural Networks 96 | * 6.6. Graph Convolutional Networks 97 | * 6.7. Vision Transformers 98 | 99 | ### 7. OpenCV & Generative AI 100 | * 7.1. Object Detection 101 | * 7.2. UNet 102 | * 7.3. Autoencoder, Variational Autoencoder 103 | * 7.4. Generative Adversarial Network, Adversarial Attack 104 | * 7.5.
Stable Diffusion, Denoising Diffusion Probabilistic Methods 105 | * 7.6. State Space Models, Selective State Spaces (Mamba Case Study) 106 | 107 | -------------------------------------------------------------------------------- /3. PyTorch/3.5. Datasets/Datasets.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | 3 | Datasets are undoubtedly the fuel of machine learning: regardless of what algorithm you have, you learn from data. 4 | However, it's not just a simple drag-and-drop task. 5 | Data needs to be preprocessed: converted into the right format, normalized, split into train and validation, etc. 6 | To facilitate this, PyTorch provides many functions and modules, for example the `Dataset` class. 7 | 8 | ```python 9 | from torch.utils.data import Dataset, IterableDataset 10 | ``` 11 | 12 | `Dataset` is a protocol that behaves much like an array. 13 | There are two types: map-style and iterable-style. 14 | 15 | ## Initializing 16 | 17 | ### Map-style 18 | 19 | For random access: 20 | ```python 21 | class Custom_DS(Dataset): 22 |     def __init__(self, ...): ... 23 |     def __len__(self) -> int: ... 24 |     def __getitem__(self, idx): ... 25 | ``` 26 | 27 | ### Iterable-style 28 | 29 | For serial access (note that iterable-style datasets subclass `IterableDataset` rather than `Dataset`): 30 | ```python 31 | class Custom_DS(IterableDataset): 32 |     def __init__(self, ...): ... 33 |     def __len__(self) -> int: ... 34 |     def __iter__(self): ... 35 | ``` 36 | 37 | However, you often won't have to define all of that manually. 38 | For example, if you load a dataset from *Hugging Face*, it will give you a ready-to-use dataset instance. 39 | 40 | There are also some predefined Datasets, the common ones being: 41 | 42 | ```python 43 | from torch.utils.data import TensorDataset, ConcatDataset, Subset, ChainDataset 44 | ``` 45 | 46 | ### TensorDataset 47 | 48 | ```python 49 | features = torch.randn(1000, 10) # 1000 samples of 10 features each 50 | labels = torch.randint(0, 2, (1000,)) # 1000 binary labels (0 or 1) 51 | 52 | dataset = TensorDataset(features, labels) 53 | 54 | x, y = dataset[0] # features and label of the first sample 55 | x2, y2 = dataset[1] # features and label of the second sample 56 | ``` 57 | 58 | ### ConcatDataset 59 | 60 | ```python 61 | # combines map-style datasets 62 | combined = ConcatDataset([dataset1, dataset2, dataset3]) 63 | ``` 64 | 65 | ### ChainDataset 66 | 67 | ```python 68 | # combines iterable-style datasets 69 | combined = ChainDataset([dataset1, dataset2, dataset3]) 70 | ``` 71 | 72 | ### Subset 73 | 74 | ```python 75 | # makes a subset of dataset with samples of index ∈ idxs 76 | idxs = [1, 2, 3, 5, 8, 13] 77 | dataset_subset = Subset(dataset, idxs) 78 | ``` 79 | 80 | ## Dataset wrapper 81 | 82 | If you want to apply transformations to your dataset dynamically, you might write a wrapper for an existing dataset: 83 | 84 | ```python 85 | class Custom_DS(Dataset): 86 |     def __init__(self, dataset, transformation): 87 |         self._dataset = dataset 88 |         self._transformation = transformation 89 |     def __len__(self): 90 |         return len(self._dataset) 91 |     def __getitem__(self, idx): 92 |         item = self._dataset[idx] 93 |         x, y = item["image"], item["label"] 94 | 95 |         x_transformed = self._transformation(x) 96 | 97 |         return x_transformed, y 98 | ``` 99 | 100 | ## Utilities 101 | 102 | ### Random_split 103 | 104 | Datasets might not come presplit into training, validation, and test sets. By using the `random_split` function you can split one dataset randomly into subsets. 105 | 106 | ```python 107 | from torch.utils.data import random_split 108 | 109 | dataset = ...
110 | 111 | train_size = int(0.7 * len(dataset)) # 70% train 112 | val_size = int(0.15 * len(dataset)) # 15% validation 113 | test_size = len(dataset) - train_size - val_size # the rest is the test set 114 | 115 | train_dataset, val_dataset, test_dataset = random_split( 116 |     dataset, [train_size, val_size, test_size] 117 | ) 118 | ``` 119 | 120 | For reproducibility, you can fix the seed. (Computers cannot generate truly random numbers; they produce seemingly random numbers that follow a certain distribution (usually uniform). The seed is the parameter that creates variance between different "random" generation attempts; in other words, by fixing the seed you will get a reproducible sequence of "random" numbers.) 121 | 122 | ```python 123 | generator = torch.Generator().manual_seed(42) # seed=42 124 | train_dataset, val_dataset = random_split( 125 |     dataset, [0.8, 0.2], generator=generator 126 | ) 127 | ``` 128 | 129 | The seed is set to 42, because 42 is the answer to the universe. 130 | -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.4. Principal Component Analysis, Dimensionality Reduction/PCA.md: -------------------------------------------------------------------------------- 1 | 2 | # Principal Component Analysis (PCA) 3 | PCA is an algorithm that reduces the dimensionality of data using linear algebra. It requires some linear algebra knowledge to fully grasp. 4 | 5 | ## Process 6 | Consider $\text{X}_0 \in \mathbb{R}^{n \times p}$ to be the original data. We select $k$ as the number of components we want. 7 | 8 | 0. Data Preprocessing: 9 | 10 | We first center the data: 11 | 12 | $$ 13 | \text{X} = 14 | \begin{pmatrix} 15 | x_{11}-\bar x_1 & x_{12}-\bar x_2 & \cdots & x_{1p}-\bar x_p \\ 16 | x_{21}-\bar x_1 & x_{22}-\bar x_2 & \cdots & x_{2p}-\bar x_p \\ 17 | \vdots & \vdots & \ddots & \vdots \\ 18 | x_{n1}-\bar x_1 & x_{n2}-\bar x_2 & \cdots & x_{np}-\bar x_p 19 | \end{pmatrix} 20 | \quad \text{with} \quad 21 | \bar x_j=\frac{1}{n}\sum_{i=1}^{n}x_{ij}. 22 | $$ 23 | 24 | 1. Covariance Matrix 25 | 26 | Covariance is defined as $\text{cov}(\text{X},\text{Y}) = \frac{\sum^n_{i=1}({x}_i-\bar {x})({y}_i-\bar {y})}{n-1}$. It is a statistical value that describes how two variables change together. The covariance matrix, $\text{S}$, generalizes this idea to many variables at once. After $\text{X}$ is centered, $S=\frac{1}{n-1}\text{X}^T \text{X} \in \mathbb{R}^{p \times p}$. 27 | 28 | This matrix has these properties: 29 | 30 | a. Symmetry: $\text{S}=\text{S}^T$. 31 | 32 | b. Positive-semidefinite: all eigenvalues $\lambda_i \geq 0$. 33 | 34 | c. Its eigenvectors give the principal axes. 35 | 36 | 2. Eigen-decomposition of $\text{S}$ 37 | 38 | The next step is to find the eigenvectors of $\text{S}$, which is essentially solving this equation: 39 | 40 | $$ \text{S}v = \lambda v $$ 41 | 42 | This is equivalent to solving $\text{det}(\text{S}-\lambda \text{I}) = 0$. 43 | 44 | 3. Projection to $k$ principal axes 45 | 46 | We construct the projection matrix $\text{W}$ by selecting the most important principal components. This importance is measured by variance ratios, where $\text{Variance Ratio}_i = \frac{\lambda_i}{\sum_{j=1}^p \lambda_j}$. This step is essentially selecting the first $k$ eigenvectors with the greatest eigenvalues.
47 | 48 | $$ \text{W} = \begin{bmatrix} v_1 & v_2 & \cdots & v_k \end{bmatrix} \in \mathbb{R}^{p \times k} $$ 49 | 50 | Lastly, multiply $\text{X}$ by $\text{W}$ 51 | 52 | $$ \text{X}_k = \text{X} \text{W} \in \mathbb{R}^{n \times k} $$ 53 | 54 | The transformed coordinates of the original data is also called the PCA score. 55 | 56 | 57 | ## Disadvantages 58 | 59 | Although PCA is very fast (linear to the number of samples), it relies on linear projection, meaning that it only works with linearly separable data. Kernel PCA utilizes a kernel function to project data into a higher-dimensional feature space, where the data becomes linearly separable, and then applies PCA. 60 | 61 | There are other algorithms that perform dimension reduction, for example: t-Distributed Stochastic Neighbor Embedding (t-SNE), Uniform Manifold Approximation and Projection (UMAP), and Isometric Mapping (Isomap). We might cover these in the future. 62 | 63 | ```python 64 | # Implementation on a pizza dataset, reducing 9 dimensions to 2. 65 | 66 | import pandas as pd 67 | from sklearn.preprocessing import StandardScaler 68 | from sklearn.decomposition import PCA 69 | import matplotlib.pyplot as plt 70 | 71 | # load data 72 | data_path = 'pizza.csv' # your data.csv 73 | df = pd.read_csv(data_path) 74 | 75 | 76 | # There are non-numerical values so we need to do encoding: 77 | from sklearn import preprocessing 78 | encoder = preprocessing.LabelEncoder() 79 | print(df.head()) #original data 80 | df['brand'] = encoder.fit_transform(df['brand']) 81 | print(df.head()) #encoded version 82 | X = df.values 83 | # centering data 84 | scaler = StandardScaler() # normalize data 85 | X_centered = scaler.fit_transform(X) 86 | 87 | # 4. Fit PCA and transform to k 88 | k = 2 #transforming to 2 dimensions 89 | pca = PCA(n_components=k) # keep 2 principal components 90 | Z = pca.fit_transform(X_centered) 91 | 92 | print("Explained variance ratio:", pca.explained_variance_ratio_) 93 | print("First 5 rows of PCA scores (Z):\n", Z[:5]) 94 | 95 | # 6. Quick plot (optional) ------------------------ 96 | plt.scatter(Z[:, 0], Z[:, 1], alpha=0.7) 97 | plt.xlabel('PC1') 98 | plt.ylabel('PC2') 99 | plt.title('PCA projection (2-D)') 100 | plt.show() 101 | ``` -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.7. ResNet/resnet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "12e4e3b2", 6 | "metadata": {}, 7 | "source": [ 8 | "# Resnet\n", 9 | "\n", 10 | "Original Paper: Deep Residual Learning for Image Recognition\n", 11 | "\n", 12 | "## Problem Statement\n", 13 | "Before actually introducing resnet, we want to know the problem it intends to solve. Lets consider two networks, a shallow one and its deeper counterpart that adds more layers onto it. We should expect the deeper one to have better results because in the worst case the extra layers of the deeper counterpart are all identity mappings and the results would be equivalent to that of the shallow network.\n", 14 | "\n", 15 | "
\n", 16 | "\n", 17 | "
\n", 18 | "\n", 19 | "This shows that the intuitive conclusion that adding layers = better performance doesn't typically hold true. The problem then becomes how to make the network learn identity functions, which is hard in traditional networks.\n", 20 | "\n", 21 | "## Approach\n", 22 | "There are essentially two pieces of innovation that Resnet comes up with.\n", 23 | "\n", 24 | "---\n", 25 | "\n", 26 | "Instead of learning the direct transformation, Resnet learns a residual between the output and the input, that is, $\\mathcal{F}: x_{l-1} \\to x_l-x_{l-1}$ where $f$ is the function the model tries to fit, $y$ is the output, and $x$ is the input.\n", 27 | "\n", 28 | "For a residual block, the output $x_l$ is defined as: \n", 29 | "\\begin{equation} x_l = x_{l-1} + \\mathcal{F}(x_{l-1}) \\end{equation}\n", 30 | "
\n", 31 | "\n", 32 | "
 \n", 33 | "\n", 34 | "The added $x_{l-1}$ comes through a skip connection; as shown in the figure, we add it to the end of the block so that the output is still $x_l$ but the training process of the block changes. If $x_l$ and the residual have different dimensions or channel counts, the skip connection goes through a $1\\times 1$ convolution to project $x_{l-1}$ to the correct dimension.\n", 35 | "\n", 36 | "## Why\n", 37 | "\n", 38 | "### Easier To Fit Identity Function\n", 39 | "The essential problem is that traditional networks struggle to fit the identity function. Fitting an $n$-layer block of a neural network to the identity function would require:\n", 40 | "\\begin{equation}\n", 41 | " \\sigma(W_n\\sigma(W_{n-1}\\cdots(W_1x))) = x\n", 42 | "\\end{equation}\n", 43 | "This is very hard due to non-linear activation functions. Take ReLU $\\sigma(z) = \\text{max}(0,z)$ as an example: it directly 'zeros' all negative values. This means that fitting the block to the identity function would require it to accurately 'recreate' any $x < 0$ out of no information, as $x$ would have been set to zero by ReLU. \n", 44 | "\n", 45 | "
\n", 46 | "\n", 47 | "
\n", 48 | "\n", 49 | "For a residual block to fit the identity function, it would require:\n", 50 | "\\begin{equation}\n", 51 | "\\mathcal{F} + x = x\n", 52 | "\\end{equation}\n", 53 | "This can easily be simplified to be $\\mathcal{F} = 0$, which is far easier than the previously shown requirements as it can simply be done by setting all weights to $0$. \n", 54 | "\n", 55 | "### Prevent Gradient Vanishing\n", 56 | "During backpropagation, for a loss $\\mathcal{L}$, the gradient of $\\mathcal{L}$ with respect to $x_{l-1}$ is:\n", 57 | "\\begin{equation} \\frac{\\partial \\mathcal{L}}{\\partial x_{l-1}} = \\frac{\\partial \\mathcal{L}}{\\partial x_l} \\cdot (1 + \\frac{\\partial \\mathcal{F}}{\\partial x_{l-1}}) \\end{equation}\n", 58 | "\n", 59 | "The chain rule here applies as: $\\frac{\\partial \\mathcal{L}}{\\partial x_{l-1}} = \\frac{\\partial \\mathcal{L}}{\\partial x_l} \\cdot (\\frac{\\partial}{\\partial x_{l-1}}(x_{l-1} + \\mathcal{F}))$ as $x_l = x_{l-1} + \\mathcal{F}$\n", 60 | "\n", 61 | "The \"$+1$\" in the equation ensures that the gradient is able to flow directly from $x_l$ to $x_{l-1}$. In traditional networks ($f$), it looks like:\n", 62 | "\\begin{equation}\n", 63 | "\\frac{\\partial \\mathcal{L}}{\\partial x_{l-1}} = \\frac{\\partial \\mathcal{L}}{\\partial x_l} \\cdot \\frac{\\partial f}{\\partial x_{l-1}}\n", 64 | "\\end{equation}\n", 65 | "\n", 66 | "The chain rule in this one is more explicit, $f = x_l$. The problem with this is that since $\\frac{\\partial f}{\\partial x_{l-1}}$ can be similarly decomposed to a product of multiple gradients, it approaches to $0$ exponentially if all gradients $<1$. Resnet's \"$+1$\" ensures that even if the gradient vanishes, there is still something that is passed back.\n", 67 | "\n", 68 | "\n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "b41df3db", 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "Downloading: \"https://github.com/pytorch/vision/zipball/v0.10.0\" to C:\\Users\\1111/.cache\\torch\\hub\\v0.10.0.zip\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "import torch\n", 87 | "# Load ResNet models of different depths (with pre-trained weights)\n", 88 | "def get_resnet(depth, pretrained=True):\n", 89 | " \"\"\"\n", 90 | " Load ResNet model of specified depth using torch.hub.\n", 91 | " \n", 92 | " Args:\n", 93 | " depth (int): ResNet depth (18, 34, 50, 101, 152)\n", 94 | " pretrained (bool): Whether to load pre-trained weights\n", 95 | " \n", 96 | " Returns:\n", 97 | " nn.Module: ResNet model\n", 98 | " \"\"\"\n", 99 | " model_name = f\"resnet{depth}\"\n", 100 | " # PyTorch's vision repo supports these ResNet variants\n", 101 | " supported_depths = {18, 34, 50, 101, 152}\n", 102 | " if depth not in supported_depths:\n", 103 | " raise ValueError(f\"Unsupported ResNet depth: {depth}. 
Supported depths: {supported_depths}\")\n", 104 | " \n", 105 | " model = torch.hub.load(\n", 106 | " 'pytorch/vision:v0.10.0', # Repo and version\n", 107 | " model_name, # Model name (e.g., \"resnet18\")\n", 108 | " pretrained=pretrained\n", 109 | " )\n", 110 | " model.eval() # Set to evaluation mode\n", 111 | " return model\n", 112 | "\n", 113 | "\n", 114 | "# Example usage\n", 115 | "if __name__ == \"__main__\":\n", 116 | " # Load ResNet-18\n", 117 | " resnet18 = get_resnet(18, pretrained=True)\n", 118 | " print(\"ResNet-18 loaded.\")\n", 119 | "\n", 120 | " # Load ResNet-50\n", 121 | " resnet50 = get_resnet(50, pretrained=True)\n", 122 | " print(\"ResNet-50 loaded.\")\n", 123 | "\n", 124 | " # Load ResNet-152\n", 125 | " resnet152 = get_resnet(152, pretrained=True)\n", 126 | " print(\"ResNet-152 loaded.\")\n", 127 | "\n", 128 | " # Test with a sample input\n", 129 | " x = torch.randn(1, 3, 224, 224) # Batch of 1, 3-channel, 224x224 image\n", 130 | " with torch.no_grad(): # Disable gradient computation for inference\n", 131 | " output = resnet18(x)\n", 132 | " print(f\"ResNet-18 output shape: {output.shape}\") # Should be (1, 1000)" 133 | ] 134 | } 135 | ], 136 | "metadata": { 137 | "kernelspec": { 138 | "display_name": "base", 139 | "language": "python", 140 | "name": "python3" 141 | }, 142 | "language_info": { 143 | "codemirror_mode": { 144 | "name": "ipython", 145 | "version": 3 146 | }, 147 | "file_extension": ".py", 148 | "mimetype": "text/x-python", 149 | "name": "python", 150 | "nbconvert_exporter": "python", 151 | "pygments_lexer": "ipython3", 152 | "version": "3.12.7" 153 | } 154 | }, 155 | "nbformat": 4, 156 | "nbformat_minor": 5 157 | } 158 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.1. Forward Propagation, Activation Functions, Linear Layer/4.1 forward propagation + Activation functions + Linear Layer.md: -------------------------------------------------------------------------------- 1 | 2 | ### **Lesson 4.1: The Data's Grand Tour (Forward Propagation)** 3 | SHSID DATA SCIENCE CLUB | Gordon.H 4 | 5 | 6 | Alright, welcome back! 7 | 8 | Ever wonder what *actually* happens inside a neural network when you give it data? It's not just a black box of magic. Today, we're going to pull back the curtain and follow a single piece of data on its journey from input to final prediction. 9 | 10 | This one-way trip is called **Forward Propagation**, and it's the heartbeat of every neural network. 11 | 12 | ### What We're Unlocking Today 13 | 14 | By the end of this chat, you'll be able to: 15 | * See the **Linear Layer** as the simple "calculator" at the heart of a neuron. 16 | * Understand how an **Activation Function** acts as the neuron's "decision-maker." 17 | * Walk a piece of data through a mini-network, step-by-step, to see how it all comes together. 18 | 19 | *** 20 | 21 | ## What's Inside a Single Neuron? 22 | 23 | Before we can understand a whole stadium, let's get to know one player. A neuron's job is surprisingly straightforward and happens in two steps: 24 | 25 | 1. **Calculate:** It gathers all the information it's given, weighs it all, and crunches it into a single number. Think of this as the *brawn*. 26 | 2. **Decide:** It looks at that number and decides how important it is. Should I get excited about this? Should I ignore it? This is the *brains*. 27 | 28 | Let's meet the two parts that handle this. 29 | 30 | ## 1. 
The Calculator: The Linear Layer 31 | 32 | The first step is just some simple math. This might sound fancy, but it's based on a formula you definitely learned in school: `y = mx + b`. 33 | 34 | Seriously, that's it! We just use slightly different words in machine learning. 35 | 36 | | School Algebra (`y = mx + b`) | Machine Learning (`output = weight * input + bias`) | 37 | | :--- | :--- | 38 | | `x` (your input number) | `input` (our piece of data) | 39 | | `m` (the slope of the line) | `weight` (how *important* that input is) | 40 | | `b` (where the line starts) | `bias` (a little "nudge" to get it just right) | 41 | 42 | The **`weight`** is the key. It tells the neuron how much to care about a piece of information. A big weight means "pay close attention to this!" The **`bias`** is just an offset, like a little head-start that can make it easier or harder for the neuron to get excited. 43 | 44 | Let's make this real. Imagine a neuron whose only job is to guess if a song will be a "hit." One piece of info it gets is the song's tempo. 45 | 46 | * `input` = `120` (a classic, danceable tempo) 47 | * Let's say the network has learned a `weight` for tempo of `0.02`. 48 | * And it has a starting `bias` of `-1.5`. 49 | 50 | The Linear Layer just does the math: 51 | ``` 52 | # The formula: (input * weight) + bias 53 | calculation = (120 * 0.02) + (-1.5) 54 | calculation = 2.4 - 1.5 55 | calculation = 0.9 56 | ``` 57 | Okay, the neuron has calculated a score: `0.9`. But... so what? Is `0.9` good? Bad? That's where the decision-maker comes in. 58 | 59 | ## 2. The Decision-Maker: The Activation Function 60 | 61 | That score of `0.9` doesn't mean anything on its own. The neuron needs a rulebook to translate that score into a clear signal to pass along. This rulebook is the **Activation Function**. 62 | 63 | > **Analogy:** Think of it like a dimmer switch for a light. The linear calculation (`0.9`) is how hard you're pressing on the switch. The activation function decides how bright the light should be. Maybe a gentle press does nothing, but a firm press turns it on full blast. 64 | 65 | Without this "decide" step, our network would just be a long, boring chain of `y=mx+b`. It could only learn simple, straight-line patterns. Activation functions add the "spark"—the twists and turns that let the network learn incredibly complex things, like telling a cat from a dog. 66 | 67 | ### Our Favorite Decision-Maker: ReLU 68 | 69 | The most popular activation function is called **ReLU (Rectified Linear Unit)**. Its rule is laughably simple: 70 | * If a number is positive, keep it. 71 | * If a number is negative, just make it `0`. 72 | 73 | That's it! We can write it as `ReLU(x) = max(0, x)`. 74 | 75 | Let's see it in action: 76 | * `ReLU(0.9)` → `0.9` (Our "hit song" neuron decides this is a strong signal and passes it on!) 77 | * `ReLU(52.7)` → `52.7` 78 | * `ReLU(-3.1)` → `0` (The neuron decides this signal isn't worth bothering with and silences it.) 79 | 80 | So, the full journey through one neuron looks like this: 81 | **Inputs → Linear Layer (Calculate) → Activation Function (Decide) → Output** 82 | 83 | ## Putting It All Together: The Grand Tour 84 | 85 | So, what is **Forward Propagation**? It's simply the process of letting our data complete this journey through the *entire* network, one layer of neurons at a time. 86 | 87 | It's like a relay race. The outputs from the first layer of neurons become the inputs for the second layer. 
They do their little "calculate-and-decide" dance and pass their results to the third layer, and so on, until the data crosses the final finish line, which gives us the network's final prediction. 88 | 89 | ## Let's Be the Computer! 90 | 91 | Alright, time to roll up our sleeves. We have a tiny network for our favorite houseplant. Its job is to decide if the plant needs water. It looks at two things: 92 | 1. `days_since_last_water` 93 | 2. `is_sunny` (1 for sunny, 0 for not sunny) 94 | 95 | Our network has one "thinking" neuron in the middle (`Neuron H`) and one final "decision" neuron (`Neuron O`). We'll use our friend **ReLU** for all the decisions. 96 | 97 | --- 98 | 99 | ### **Step 1: See What the Hidden Neuron (H) Thinks** 100 | 101 | Neuron H looks at both our original inputs. Its formula is: `(input1 * weight1) + (input2 * weight2) + biasH` 102 | 103 | **Here are its settings (the network already "learned" these):** 104 | * `weight1` (for days) = `0.4` 105 | * `weight2` (for sun) = `0.2` 106 | * `biasH` = `-0.5` 107 | 108 | **Our Scenario:** It's been **3 days** since we watered, and it **is sunny** today. 109 | * `input1` = `3` 110 | * `input2` = `1` 111 | 112 | **A. Do the Math (Linear Layer):** 113 | ```python 114 | # Let's calculate the neuron's initial score 115 | linear_result_H = (3 * 0.4) + (1 * 0.2) + (-0.5) 116 | linear_result_H = (1.2) + (0.2) - 0.5 117 | linear_result_H = _______________ # Fill this in! 118 | ``` 119 | 120 | **B. Make a Decision (Activation Function):** 121 | ```python 122 | # Now, apply the ReLU rule to your result 123 | output_H = ReLU(linear_result_H) 124 | output_H = _______________ # What does it decide? 125 | ``` 126 | 127 | --- 128 | 129 | ### **Step 2: Get the Final Verdict from the Output Neuron (O)** 130 | 131 | Neuron O is simpler. Its *only* input is the signal it got from Neuron H (`output_H`). Its formula is: `(output_H * weightO) + biasO` 132 | 133 | **Its settings are:** 134 | * `weightO` = `1.5` 135 | * `biasO` = `0.1` 136 | 137 | **A. Do the Final Calculation:** 138 | ```python 139 | # Use your value for output_H from the last step! 140 | linear_result_O = (output_H * 1.5) + 0.1 141 | linear_result_O = _______________ 142 | ``` 143 | 144 | **B. Make the Final Decision!** 145 | ```python 146 | # One last ReLU! A positive output means "water the plant." 147 | final_prediction = ReLU(linear_result_O) 148 | final_prediction = _______________ 149 | ``` 150 | 151 | > **Check Your Work!** 152 | > In Step 1, you should get `linear_result_H = 0.9`. After applying ReLU, `output_H` is also `0.9`. 153 | > In Step 2, you'd calculate `(0.9 * 1.5) + 0.1 = 1.35 + 0.1 = 1.45`. 154 | > The `final_prediction` is `ReLU(1.45)`, which is `1.45`. 155 | > Since the number is positive, our network is shouting: **"Yes, water the plant!"** 156 | 157 | ## You Did It! 158 | 159 | And... that's it. You just manually performed forward propagation. You acted as the brain of a neural network, taking inputs, pushing them through a "calculator" and a "decision-maker," and getting a final answer. This is *exactly* what every deep learning model does, just on a much, much bigger scale. 160 | 161 | But wait... where did those weights and biases (`0.4`, `0.2`, `-0.5`...) come from? This feels a bit like magic, right? How did the network *know* the right values to make good predictions? 162 | 163 | That's the real secret sauce: **training**. And it's exactly what we're diving into next. 
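If you want to double-check the arithmetic with real code, here is a minimal sketch of the same two-neuron plant-watering network in plain Python; the weights, biases, and inputs are exactly the ones from the exercise above.

```python
def relu(x):
    # ReLU: keep positive numbers, turn negatives into 0
    return max(0.0, x)

# the scenario: 3 days since watering, and it is sunny
days_since_last_water = 3
is_sunny = 1

# hidden neuron H: calculate, then decide
linear_result_H = (days_since_last_water * 0.4) + (is_sunny * 0.2) + (-0.5)
output_H = relu(linear_result_H)            # 0.9

# output neuron O: calculate, then decide
linear_result_O = (output_H * 1.5) + 0.1
final_prediction = relu(linear_result_O)    # 1.45 -> positive, so "water the plant!"

print(output_H, final_prediction)
```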
164 | 165 | -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.1. Linear Regression & Logistic Regression/Linear & Logistic Regression.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # All About **Linear & Logistic Regression** 4 | ###### Will Chen | SHSID Data Science Group 5 | 6 | ## Key Questions 7 | 8 |
9 | 10 | Linear and logistic regression are key techniques to understand and apply in supervised learning. By the end of the lesson, you will be able to answer the following **key questions**: 11 | 1. What are linear and logistic regression, and what kind of problems do they solve? 12 | 2. What's the difference between linear and logistic regression? 13 | 3. In the age of deep learning, why do we still rely on these foundational techniques? 14 | 15 |
16 | 17 | ## Key Terms 18 |
19 | 20 | In order to understand this lesson, you should have a grasp of the following key concepts and terms: 21 | 22 | - **Features**: The input variables used to make a prediction. They are what goes into the model. When the prediction is `f(x)`, the feature would be `x` (the input). 23 | - **Target**: The variable we are trying to predict. It is the result of a feature going through the neural network. Using the same example as above, the target is `f(x)`. 24 | - **Regression**: A technique used in supervised learning where the goal of the model is to predict a *continuous numerical value*. These can be predictions such as tomorrow's temperature. It's not usually used for anything else. 25 | - **Classification**: A technique used in supervised learning where the goal of the model is to predict a *discrete category*. Instead of an exact number, it classifies the features into defined targets. Specifically, it outputs a *probability list of how well an input fits into each category*. These can be predictions such as marking an image as more likely to be a cat or a dog. 26 | - **Weights**: aka Coefficients. A weight is a piece of data stored in a neuron that determines the feature's influence on the prediction. The influence is usually multiplicative, so multiplying by a small number means a smaller influence, and vice versa. 27 | - **Biases**: aka Intercepts. Also stored in a neuron, a bias is a constant value that shifts the baseline prediction. Weights change something based on what's given; biases change a constant amount regardless. 28 | - **Loss**: aka Cost. It's the difference between an erroneous output and the expected output, useful in training models. The goal of training is to minimize this function by changing the weights and biases of each neuron in each layer. 29 | - **Gradient descent**: An optimization method that tells us by what magnitude we need to change the information within our neurons to minimize the loss (a short numerical sketch follows after this list). You'll learn more about this later on in Chapter 5. Through iteration, it adjusts the parameters in the opposite direction of the gradient. 30 | - **Sigmoid**: A special function that compresses all input values into a range between 0 and 1. The key differentiator between linear and logistic regression. 31 | 32 | Linear vs Logistic 33 | 34 | Classification vs Regression 35 | 36 | Gradient descent summary 37 | 38 |
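To make the gradient descent definition above concrete, here is a small illustrative sketch (not part of the original lesson): it learns a single weight `m` for the line `y = mx` from toy data generated by `y = 2x`, by repeatedly stepping against the gradient of the squared error.

```python
# toy data that follows y = 2x
xs = [1.0, 2.0, 3.0, 4.0]
ys = [2.0, 4.0, 6.0, 8.0]

m = 0.0     # the weight we are learning
lr = 0.01   # learning rate: how large each adjustment step is

for step in range(200):
    # gradient of the mean squared error with respect to m
    grad = sum(2 * (m * x - y) * x for x, y in zip(xs, ys)) / len(xs)
    # move in the opposite direction of the gradient
    m -= lr * grad

print(m)  # ends up very close to 2.0
```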
39 | 40 | ## Introduction to predictive modeling 41 |
42 | 43 | ### What are Linear and Logistic Regression? 44 | 45 | **Linear Regression** and **Logistic Regression** are two of the most fundamental and universal algorithms in ML. They are both supervised learning methods, but they are used to solve different kinds of problems. 46 | 47 | - **Linear regression** is used for **regression** tasks, that is, to predict a continuous value. 48 | - Example: Based on past years' data, predict tomorrow's temperature. 49 | - Key concept: Drawing the best-fit line for a plot of points. 50 | 51 | - **Logistic regression** is used for **classification** tasks. Don't let the name fool you: it isn't used for regression tasks even though "regression" is in its name; the name refers to the underlying technique, not the application. 52 | - Example: Predict whether a given image is a cat or a dog. 53 | - Key concept: Drawing a separation line that defines the boundary between different groups of points. 54 | 55 | While they solve different problems, they share the same underlying mathematical foundation. Understanding linear regression is the basis for understanding logistic regression. 56 | 57 | ### Part 1: Linear regression 58 | 59 | As humans, we can draw a best-fit line pretty easily. Just look at the set of points on the graph and you'll have a rough estimate of which line fits best. This is because our brains are kind of built for fuzzy pattern matching like this. However, that might not be the mathematically most accurate way to draw a best-fit line, nor would we want to draw the lines ourselves, regardless of method. 60 | 61 | So how do we teach this to a computer, and make it do it accurately? 62 | 63 | Well, you might remember this slope-intercept form from math: 64 | 65 | $$ f(x) = mx + b $$ 66 | 67 | In machine learning, this is also a core concept and you can see it in a lot of models: 68 | - f(x) is the target output. 69 | - x is the feature input. 70 | - m is the weight. 71 | - b is the bias. 72 | 73 | This slope-intercept form represents a line. If you want to shape the line in such a way that it fits a specific group of points, you adjust the values m and b. These are exactly the values that models adjust during training. They adjust the weights and biases for each neuron until we get a line that closely fits the "expected" point group. 74 | 75 | ### Part 2: Logistic regression 76 | 77 | What if our problem isn't predicting a price, but predicting a "yes" or "no" answer? The target is now a category (Cat=1, Dog=0), not a continuous number. 78 | 79 | #### The sigmoid function 80 | 81 | A straight line doesn't really fit our needs. If all we want is for the model to tell us what it thinks the picture is, we really just need it to give us a number between 0 and 1. For example, closer to 0 means dog, and closer to 1 means cat. 82 | 83 | So, we use a trick called the sigmoid function. It takes the output of a linear equation (`mx + b`) and feeds it into the sigmoid equation. 84 | 85 | The sigmoid function, $\sigma(z) = \frac{1}{1 + e^{-z}}$, has an "S" shape. No matter what number you put into it (from negative infinity to positive infinity), it will always output a value **between 0 and 1**. 86 | 87 | So, the logistic regression model looks like this: 88 | 89 | $$ Probability(Cat) = Sigmoid(mx + b) $$ 90 | 91 | This output can be interpreted as the probability of the positive class. For instance, if the model outputs 0.8, it is 80% confident that the animal is a cat.
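As a quick numerical sketch (the weight, bias, and feature value below are made up purely for illustration), this is all the sigmoid step does:

```python
import math

def sigmoid(z):
    # squashes any real number into the range (0, 1)
    return 1 / (1 + math.exp(-z))

# hypothetical learned parameters and one feature value
m, b = 0.9, -3.0
x = 4.2

probability_cat = sigmoid(m * x + b)
print(probability_cat)  # about 0.69, i.e. roughly 69% confident it's a cat
```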
#### Decision boundaries

For our model to give us a decision instead of just a number, we have to set up a decision boundary. **It is the threshold that separates one predicted class from another.** For a binary decision like cat vs. dog, it's fairly simple: the boundary is commonly placed at 0.5. If the output is less than 0.5, we are more confident it is a dog than a cat, and vice versa.

#### Binary cross entropy

Because our predictions are now probabilities, the traditional loss function for many models, Mean Squared Error, is no longer the best choice. Instead, logistic regression uses a loss function called **binary cross-entropy** (or log loss). It heavily "punishes" the model when it makes a confident but incorrect prediction. For example, if the model predicts a 99% chance of a cat for an image that is actually a dog, the loss will be very high and the model will be severely "punished".

Apart from this, the rest works in basically the same way as linear regression. Both use concepts such as the chain rule and gradient descent to determine which parameters to change, and by how much and in which direction (you'll learn about this in detail later on; these are just the high-level differences).
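To see this "punishment" in numbers, here is a minimal sketch of binary cross-entropy; the predicted probabilities below are made-up values for illustration.

```python
import numpy as np

def binary_cross_entropy(y_true, y_pred):
    # y_true is the real label (1 = cat, 0 = dog); y_pred is the model's P(cat)
    return -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

# The image is actually a dog (label 0)
print(binary_cross_entropy(0, 0.10))  # confident and correct -> ~0.11 (small loss)
print(binary_cross_entropy(0, 0.99))  # confident but wrong   -> ~4.61 (huge loss)
```

Notice how a single confident mistake costs roughly forty times more than a confident correct answer.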
104 | 105 | ### Conclusion 106 |
107 | 108 | Linear and Logistic Regression are the foundational pillars of predictive modeling. They demonstrate the core process of machine learning: defining a model, measuring its error with a **loss function**, and iteratively improving it using an optimizer like **gradient descent**. 109 | 110 | - **Linear Regression** fits a line to data to predict **continuous values**. 111 | - **Logistic Regression** adapts this line with a **sigmoid function** to predict a probability for **classification tasks**. 112 | 113 | Although simpler than deep neural networks and more complex topics, their importance is still monumental. They are fast, interpretable, and serve as the starting point for beginners. In fact, a single neuron in a neural network performing a classification task is essentially a logistic regression unit. 114 | -------------------------------------------------------------------------------- /3. PyTorch/3.4. Modules/module.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # Modules 4 | ##### **Jerry Zhang** | SHSID Data Science Club 5 | 6 |
## What?

PyTorch follows Python's object-oriented conventions to let you construct neural networks: specifically, you inherit from a prebuilt class, `torch.nn.Module`, which is essentially the skeleton of a neural network.

The benefit of inheriting from a base class rather than assembling functions yourself is that it unifies code and hides unnecessary complexity, while still giving you the freedom to modify things to any degree.

## Usage

`torch.nn` contains most functions / objects needed to construct a neural network:
```python
import torch.nn as nn
```

The architecture:
```python
class Your_nn(nn.Module):
    def __init__(self, ...):
        # super() refers to the parent class, in this case nn.Module
        # __init__ is Module's init; calling it initializes Module's features
        # older sources may tell you to write super(Your_nn, self).__init__(),
        # but in Python 3 that is equivalent to super().__init__()
        super().__init__()

        # your layers and functions
        ...

    def forward(self, x):
        """
        This is the forward pass.
        You define the structure of your network using the components defined in __init__.
        x is the input tensor; return the output.
        """
        ...
```

A sample to better illustrate this:
```python
import torch.nn.functional as F  # needed for F.max_pool2d and F.relu below

class MNIST_nn(nn.Module):
    def __init__(self):
        super().__init__()

        # these will be explained in section 4

        # the convolution layers
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        # the fc layers
        self.fc1 = nn.Linear(64 * 7 * 7, 512)
        self.fc2 = nn.Linear(512, 10)
        # dropout
        self.dropout = nn.Dropout(0.25)
        # normalization
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        # activation functions
        self.relu = nn.ReLU()

    def forward(self, x):
        # one layer
        x = self.conv1(x)
        x = self.relu(x)
        x = self.bn1(x)
        x = F.max_pool2d(x, 2)

        # you can also write it in a more compact way
        x = self.bn2(self.relu(self.conv2(x)))
        x = F.max_pool2d(x, 2)

        # flattening
        x = x.view(-1, 64 * 7 * 7)

        # here is another way to call relu
        # F is torch.nn.functional
        # this relu is a function while nn.ReLU() is an object
        # they are basically equivalent in terms of computation
        # F.relu is a bit simpler
        # nn.ReLU() is more organized and can be integrated into nn.Sequential
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)

        return x
```

A sketch of the main training loop:
```python
if __name__ == "__main__":
    model = Your_nn(...)

    for epoch in range(epochs):
        output = model(input)             # forward pass
        loss = criterion(output, target)  # criterion is your loss function, e.g. nn.CrossEntropyLoss()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
```

## Side facts about the hidden complexity

You might wonder how backpropagation is wired up.

### Auto param registration

There is a hidden attribute called `self._parameters`, which is an `OrderedDict`. Whenever a learnable parameter or submodule is assigned, it is automatically added to the appropriate registry.
This is done via the `__setattr__` dunder method.

```python
# rough idea
# whenever an attribute is added to the object this is invoked
def __setattr__(self, name, value):
    if isinstance(value, nn.Parameter):
        # learnable parameters are instances of nn.Parameter,
        # so they get registered under self._parameters
        self.register_parameter(name, value)
    elif isinstance(value, nn.Module):
        # this is the case when, for example, nn.Sequential is added
        # allowing for the formation of an organized tree architecture
        self._modules[name] = value
    ...
    # finally, the original purpose of self.a = b must be fulfilled
    object.__setattr__(self, name, value)
```

For backpropagation and optimization, the parameters are exposed like this:
```python
class Module:
    # these two methods expose the parameters to the optimizer
    def parameters(self, recurse=True):
        for name, param in self.named_parameters(recurse=recurse):
            yield param

    def named_parameters(self, prefix='', recurse=True):
        # Recursively yields all parameters with names
        # Enables optimizer access: optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        ...
```

## Hooks

Hooks are functions that give you access points to data throughout the entire training process: the inputs, outputs, and gradients of all modules.

### types of hooks

```python
# Forward pre-hook: called before forward()
def forward_pre_hook(module, input):
    # input is a tuple of inputs to the module
    print(f"Module {module.__class__.__name__} received input: {[i.shape for i in input]}")
    # You can modify the input here
    return input  # Or modified input

# Forward hook: called after forward()
def forward_hook(module, input, output):
    # output is the result of forward()
    print(f"Module {module.__class__.__name__} produced output: {output.shape}")
    # You can modify the output here
    return output  # Or modified output

# called during backprop
def backward_hook(module, grad_input, grad_output):
    # grad_input: gradients flowing INTO the module
    # grad_output: gradients flowing OUT OF the module
    print(f"Gradients flowing out: {[g.shape for g in grad_output if g is not None]}")
    # Can be used for gradient clipping or monitoring
    return grad_input
```

### registering hooks

```python
module.register_forward_pre_hook(hook)
module.register_forward_hook(hook)
module.register_full_backward_hook(hook)

# you can manually assign
# this is adding a hook to the initial input and final output
model.register_forward_pre_hook(...)
model.register_forward_hook(...)

# or add hooks en masse
for name, module in model.named_modules():
    # for all linear modules within model
    if isinstance(module, nn.Linear):
        handle = module.register_forward_hook(...)
```

With these you can monitor values, clip gradients, etc.
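A practical detail: every `register_*` call returns a handle object, and calling `.remove()` on it detaches the hook once you no longer need it. A minimal sketch, reusing the `forward_hook` defined above:

```python
handles = []
for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        # keep the handle returned by registration
        handles.append(module.register_forward_hook(forward_hook))

# ... run a few batches and inspect the printed shapes ...

for handle in handles:
    handle.remove()  # detach the hooks so they stop firing
```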
192 | 193 | ### common hooks 194 | 195 | ```python 196 | # Gradient clipping using backward hook 197 | def gradient_clipping_hook(module, grad_input, grad_output, max_norm=1.0): 198 | # Clip gradients to prevent explosion 199 | total_norm = 0 200 | for g in grad_output: 201 | if g is not None: 202 | param_norm = g.data.norm(2) 203 | total_norm += param_norm.item() ** 2 204 | total_norm = total_norm ** 0.5 205 | 206 | clip_coef = max_norm / (total_norm + 1e-6) 207 | if clip_coef < 1: 208 | for g in grad_output: 209 | if g is not None: 210 | g.data.mul_(clip_coef) 211 | 212 | return grad_input 213 | 214 | # Register to specific layers 215 | for module in model.modules(): 216 | if isinstance(module, nn.Linear): 217 | module.register_full_backward_hook(gradient_clipping_hook) 218 | ``` 219 | 220 | ```python 221 | # Monitor activation statistics during training 222 | activation_stats = {} 223 | 224 | def activation_stats_hook(name): 225 | def hook(module, input, output): 226 | if name not in activation_stats: 227 | activation_stats[name] = { 228 | 'mean': [], 'std': [], 'min': [], 'max': [] 229 | } 230 | 231 | activation_stats[name]['mean'].append(output.mean().item()) 232 | activation_stats[name]['std'].append(output.std().item()) 233 | activation_stats[name]['min'].append(output.min().item()) 234 | activation_stats[name]['max'].append(output.max().item()) 235 | return hook 236 | 237 | # Register to all convolutional layers 238 | for name, module in model.named_modules(): 239 | if isinstance(module, nn.Conv2d): 240 | module.register_forward_hook(activation_stats_hook(name)) 241 | ``` 242 | 243 | ```python 244 | # Identify dead ReLU units 245 | dead_relus = {} 246 | 247 | def relu_monitor_hook(name): 248 | def hook(module, input, output): 249 | # Count how many outputs are exactly zero 250 | dead_ratio = (output == 0).float().mean().item() 251 | if name not in dead_relus: 252 | dead_relus[name] = [] 253 | dead_relus[name].append(dead_ratio) 254 | return hook 255 | 256 | # Monitor all ReLU layers 257 | for name, module in model.named_modules(): 258 | if isinstance(module, nn.ReLU): 259 | module.register_forward_hook(relu_monitor_hook(name)) 260 | ``` 261 | -------------------------------------------------------------------------------- /3. PyTorch/3.3. Devices/device.md: -------------------------------------------------------------------------------- 1 |
# Devices
##### **Jerry Zhang** | SHSID Data Science Club
## The characteristics of different devices

As you may know, a computer has many different components that can all perform computation, most notably the **CPU** and the **GPU**. This leads to the question: "which device is best for running neural networks?"
To answer this, we first need to understand **precision**, **parallelization**, and the characteristics of each device.

### Precision

Our weight matrices consist of **Floating Point** (**FP**) values, values with a non-integer component, for example $0.175$ or $0.982$.
To store **FP**s, computers use a scientific-notation-like system. The name of each format is **FP** followed by its bit count.

##### FP32

| 31 | 30-23 | 22-0 |
| --- | --- | --- |
| Sign (S) | Exponent (E) | Mantissa (M) |
| 1 bit | 8 bits | 23 bits |
The value is $(-1)^S \times (1 + M) \times 2^{(E - 127)}$

The more bits in an **FP** format, the larger the range of values it can represent.

| Format | Range |
| -------- | -------------------------------------------------------- |
| **FP16** | $\pm6.10 \times 10^{-5} \dots \pm6.55 \times 10^{4}$ |
| **FP32** | $\pm1.18 \times 10^{-38} \dots \pm3.40 \times 10^{38}$ |
| **FP64** | $\pm2.23 \times 10^{-308} \dots \pm1.80 \times 10^{308}$ |

The *pros* of a smaller number of bits include:
- it reduces the required amount of memory
  - this allows large models to be deployed on more modest devices
  - and allows larger batch sizes in training
- it's faster to compute
- lower power consumption (mainly a concern for deployment on mobile devices)

The *cons*:
- lower precision
  - casting a high-precision value down to a lower-precision format introduces an error, which can accumulate into a drop in performance
  - very small numbers in lower precision can be rounded down to zero, leading to vanishing gradients
  - very large numbers may exceed the limit and become NaN, ruining the model's performance and the gradients

### Parallelization

Most computation units in a computer are packed with many different circuits; however, when processing a single instruction, only one circuit is used. This is a waste of resources, as a task most likely consists of many independent instructions.
#### Basic jargon
##### Clock Cycle
It takes time for electricity to flow through circuits, and there is no guarantee of when a signal is the final result. Therefore the **clock cycle**, a fixed frequency, decrees when a circuit's output counts as final. In other words, a **clock cycle** is one cycle of computation.
##### Register
To perform a computation on, for example, two numbers, they first need to be input and then processed. Most processors don't allow direct input-to-output: values are first stored in a small cache called a **register**, and in the next **clock cycle** they are processed and output.
#### Parallel processing
##### Multi-ported Registers
To support multiprocessing, the first issue is being able to take multiple inputs. A **multi-ported register**, as its name suggests, allows many simultaneous reads and writes.
##### Bypass networks
To "parallelize" calculations that rely on each other, outputs need to be mapped to inputs without passing through the register, as the register is only accessed once per **clock cycle**. The bypass network connects the outputs of circuits to the inputs of others.
##### The Scheduler (Reservation Station)
The Scheduler facilitates the bypass network. It checks that all inputs are ready and the circuit is free before sending an instruction.
#### An example
```mermaid

flowchart TD

    subgraph Cycle1[Scheduler Actions - Cycle 1]

        direction TB

        C1_Scheduler[Scheduler analyzes instruction queue]

        C1_Dispatch[Dispatches to available execution units]

        C1_Load["Load/Store Unit: LOAD R1, [0x100]"]

        C1_ALU1[ALU 1: MUL R4, R5, R6]

        C1_ALU2[ALU 2: SUB R7, R8, R9]

        C1_RS["Reservation Station: ADD R2, R1, R3
STORE [0x200], R2
BRANCH if R4 > 0"] 95 | 96 |     end 97 | 98 | 99 | 100 |     subgraph Cycle2[Scheduler Actions - Cycle 2] 101 | 102 |         direction TB 103 | 104 |         C2_Scheduler[Scheduler monitors execution progress] 105 | 106 |         C2_Load[Load/Store: LOAD completing
Result available next cycle] 107 | 108 |         C2_ALU1[ALU 1: MUL continues
Multi-cycle operation] 109 | 110 |         C2_ALU2[ALU 2: SUB completes] 111 | 112 |         C2_RS[Reservation Station: ADD ready for dispatch
STORE waiting for R2
BRANCH waiting for R4] 113 | 114 |     end 115 | 116 | 117 | 118 |     subgraph Cycle3[Scheduler Actions - Cycle 3] 119 | 120 |         direction TB 121 | 122 |         C3_Scheduler[Scheduler dispatches ready instructions] 123 | 124 |         C3_Load["Load/Store: STORE [0x200], R2"] 125 | 126 |         C3_ALU1[ALU 1: ADD R2, R1, R3
R1 forwarded via bypass] 127 | 128 |         C3_ALU2[ALU 2: Idle] 129 | 130 |         C3_Branch[Branch Unit: BRANCH if R4 > 0
R4 forwarded from MUL]

        C3_RS[Reservation Station: Empty]

    end


    %% Dependencies

    C1_Load -.->|Produces R1| C1_RS

    C1_ALU1 -.->|Produces R4| C1_RS

    C1_ALU2 -.->|Independent| C1_RS

    C2_Load -.->|Makes R1 available| C2_RS

    C2_ALU1 -.->|Still producing R4| C2_RS

    C2_ALU2 -.->|Independent complete| C2_RS

    C3_Load -.->|Uses R2 from ADD| C3_ALU1

    C3_ALU1 -.->|Produces R2| C3_Load

    C3_ALU1 -.->|Uses R1 from LOAD| C3_Load

    C3_Branch -.->|Uses R4 from MUL| C3_ALU1


    %% Cycle connections

    Cycle1 --> Cycle2

    Cycle2 --> Cycle3


    classDef executing fill:#9f9,stroke:#333,stroke-width:2px;

    classDef waiting fill:#f9f9b8,stroke:#333,stroke-width:1px;

    classDef completed fill:#aaf,stroke:#333,stroke-width:2px;

    classDef scheduler fill:#faa,stroke:#333,stroke-width:2px;

    class C1_Load,C1_ALU1,C1_ALU2 executing;

    class C1_RS waiting;

    class C1_Scheduler,C2_Scheduler,C3_Scheduler scheduler;

    class C2_ALU2 completed;

    class C3_Load,C3_ALU1,C3_Branch executing;

    class C3_RS completed;
```

### The characteristics of different devices

- The **CPU**, the *central processing unit*, is the hub of a computer: it can perform all general calculations, manage memory, etc. However, it is far too slow for the immense number of parallel operations a neural network requires.
- **GPU**s, *graphics processing units*, were originally designed to speed up graphical computation, which involves many parallel computations, as suggested by the millions of pixels on your screen. **CUDA**, Compute Unified Device Architecture, is a parallel computing API proprietary to NVIDIA; it enables GPUs to parallelize general computations, rather than just the graphical ones from the OpenGL or Vulkan APIs.
- **MPS** (Metal Performance Shaders) is Apple's counterpart to **CUDA** for its SoCs. As of 2025, it is less mature than **CUDA**.
- **XPU** refers to Intel's GPUs, whose support is less mature than both Apple's and NVIDIA's.

### Code

#### Basic Grammar

##### Setting the device
```python
torch.device(device)
```
Replace `device` with `'cuda'`, `'cpu'`, `'mps'`, or `'xpu'` (the device strings are lowercase).

##### Moving objects to your device
```python
x = torch.randn(3, 3).to(device)  # tensor to device
model = model.to(device)          # model to device
```
If you're using a **CPU**, this is not necessary.

##### Multiple GPUs
```python
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model, device_ids=[0, 1])  # Use GPUs 0 and 1
```

##### Clear GPU memory
```python
torch.cuda.empty_cache()
```

#### Techniques

```python
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
```
This is a piece of code most PyTorch scripts include.
If your system has CUDA-capable hardware, it will be used; if not, the **CPU** will be used.

```python
loader = DataLoader(dataset, pin_memory=True)  # Faster GPU transfer
```
If you're using a GPU, this speeds up the transfer of data from CPU RAM to GPU memory (VRAM).
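Before reaching for mixed precision (shown next), here is a tiny sketch of why precision matters in practice: casting to FP16 overflows large values to `inf` and flushes very small values to zero, exactly the failure modes listed in the Precision section above. The numbers are arbitrary examples.

```python
import torch

x = torch.tensor([1e5, 7e4, 1e-8], dtype=torch.float32)
print(x.half())  # -> inf, inf, 0.  (overflow and underflow in float16)
```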
238 | 239 | ```python 240 | from torch.cuda.amp import autocast, GradScaler 241 | 242 | scaler = GradScaler() 243 | for x, y in loader: 244 | optimizer.zero_grad() 245 | with autocast(): 246 | pred = model(x) 247 | loss = criterion(pred, y) 248 | scaler.scale(loss).backward() 249 | scaler.step(optimizer) 250 | scaler.update() 251 | ``` 252 | Mixed precision. A technique NVIDIA developed that uses mixed precision to speed up performance and reduce memory consumption. 253 | However, this technique has some caveats which may lead to decreased performance or even a completely failed training session. 254 | -------------------------------------------------------------------------------- /0. Prerequisites/0.1. Basic Environment For Python/0.1 Setup.md: -------------------------------------------------------------------------------- 1 | 2 | ### **Lesson 0.1: Environment Setup** 3 | 4 | SHSID DATA SCIENCE CLUB - Gordon.H 5 | 6 | **Objective:** By the end of this lesson, you will have a robust, isolated, and powerful development environment on your computer. This setup is the professional standard for data science, machine learning, and deep learning work. It will save you countless hours of troubleshooting in the future. 7 | 8 | --- 9 | 10 | ### **Introduction: Why Is This So Important?** 11 | 12 | Imagine you are a chef. Before you can cook, you need to set up your kitchen: your knives must be sharp, your cutting boards clean, and your ingredients organized. A development environment is your digital kitchen. A clean, organized setup allows you to focus on the "cooking" (writing code and building models) instead of worrying about whether your tools work together. 13 | 14 | We will install four key components: 15 | 16 | 1. **Anaconda:** The manager of our entire kitchen. It handles Python and all the specialized "appliances" (libraries) we need, keeping them organized in separate drawers (environments). 17 | 2. **Python:** The core programming language we will be using. Anaconda will install this for us. 18 | 3. **VS Code:** Our state-of-the-art workbench and recipe book. It's a modern, powerful, and highly customizable code editor where we will write our code. 19 | 4. **NVIDIA CUDA & cuDNN (Optional but Recommended for Deep Learning):** A special high-performance oven (your GPU) and the instructions for how to use it. **This is only for users with an NVIDIA graphics card.** If you don't have one, don't worry! You can skip this section and still complete 95% of data science and machine learning tasks. 20 | 21 | Let's begin. 22 | 23 | --- 24 | 25 | ### **Step 1: Install Anaconda (The Foundation)** 26 | 27 | Anaconda is a distribution of Python that comes pre-packaged with many of the most common data science libraries. More importantly, it includes **Conda**, a powerful package and environment manager. 28 | 29 | 1. **Download:** Go to the [Anaconda Distribution download page](https://www.anaconda.com/products/distribution). 30 | 2. **Select Your OS:** Download the installer for your operating system (Windows, macOS, or Linux). 31 | 3. **Choose Python 3.x:** Download the version for the latest stable Python 3 release (e.g., Python 3.9 or higher). 32 | 4. **Run the Installer:** 33 | * Launch the installer you downloaded. 34 | * Click "Next" through the initial prompts. 35 | * **On Windows:** When you reach the "Advanced Installation Options" screen, it is **recommended to leave "Add Anaconda3 to my PATH environment variable" unchecked**. 
While checking it seems convenient, it can interfere with other software. We will use the dedicated **Anaconda Prompt** instead. 36 | * Proceed with the default settings for the rest of the installation. 37 | 38 | 5. **Verify the Installation:** 39 | * **Windows:** Open the **Anaconda Prompt** from your Start Menu. 40 | * **macOS/Linux:** Open your regular Terminal. 41 | * In the terminal window, type the following command and press Enter: 42 | ```bash 43 | conda --version 44 | ``` 45 | * You should see an output like `conda 23.7.4`. If you see this, Anaconda is installed correctly! 46 | 47 | ### **Step 2: Create an Isolated Conda Environment (A Critical Best Practice)** 48 | 49 | You should **never** install packages directly into your base Anaconda installation. Instead, you create isolated environments for each project. This prevents package conflicts (e.g., Project A needs version 1.0 of a library, but Project B needs version 2.0). 50 | 51 | 1. **Open your terminal** (Anaconda Prompt on Windows, Terminal on macOS/Linux). 52 | 53 | 2. **Create a new environment.** We will call it `datasci` and install Python 3.9 in it. You can choose a different name or Python version if you prefer. 54 | ```bash 55 | conda create --name datasci python=3.9 56 | ``` 57 | Conda will show you a list of packages to be installed and ask you to proceed (`y/n`). Type `y` and press Enter. 58 | 59 | 3. **Activate the environment.** To use an environment, you must "activate" it. 60 | ```bash 61 | conda activate datasci 62 | ``` 63 | You will notice that your command prompt's prefix changes from `(base)` to `(datasci)`. This tells you that your new environment is active. Any package you install now will be placed inside `datasci`, leaving your `base` environment clean. 64 | 65 | > **Pro-Tip:** To leave an environment, simply type `conda deactivate`. You will return to the `(base)` environment. You can also use `conda activate base`. 66 | 67 | ### **Step 3: Install Visual Studio Code (The Code Editor)** 68 | 69 | VS Code is the most popular code editor in the world for a reason. It's free, fast, and has a massive ecosystem of extensions that can tailor it to your exact needs. 70 | 71 | 1. **Download:** Go to the [VS Code download page](https://code.visualstudio.com/download) and get the installer for your OS. 72 | 2. **Install:** Run the installer, accepting the default options. On Windows, ensure the "Add to PATH" option is checked during installation, as this is very useful. 73 | 3. **Install the Essential Extension:** 74 | * Open VS Code. 75 | * On the left-hand side, click the "Extensions" icon (it looks like four squares, with one flying off). 76 | * In the search bar, type `Python`. 77 | * Install the one published by **Microsoft**. It is the official extension and provides rich language support, debugging, and more. 78 | 79 | 4. **Connect VS Code to your Conda Environment:** 80 | * This is the most important integration step. 81 | * Open VS Code. 82 | * Press `Ctrl+Shift+P` (or `Cmd+Shift+P` on Mac) to open the Command Palette. 83 | * Type `Python: Select Interpreter`. 84 | * A list of available Python interpreters will appear. Find and select the one that includes your environment name, e.g., **`('datasci': conda)`**. It will point to the Python executable inside your `datasci` environment folder. 85 | 86 | Now, when you open a terminal inside VS Code (`Ctrl+`` or `View > Terminal`), it should automatically activate your `(datasci)` environment! 
87 | 88 | --- 89 | 90 | ### **Step 4: NVIDIA GPU Setup (CUDA & cuDNN)** 91 | 92 | **⚠️ Important:** Only perform this step if you have an **NVIDIA GPU**. If you have an AMD GPU or integrated graphics, please skip to Step 5. 93 | 94 | Deep learning frameworks like TensorFlow and PyTorch can use the massively parallel processing power of NVIDIA GPUs to train models orders of magnitude faster. This requires three components: the driver, the CUDA Toolkit, and the cuDNN library. 95 | 96 | 1. **Check Your GPU:** Open your terminal and run: 97 | ```bash 98 | nvidia-smi 99 | ``` 100 | If this command works, it will show you your GPU name and, crucially, the highest **CUDA Version** your driver supports in the top-right corner. **Note this version.** If the command fails, you need to install or update your NVIDIA drivers from the [NVIDIA website](https://www.nvidia.com/Download/index.aspx) first. 101 | 102 | 2. **Install CUDA Toolkit:** 103 | * The CUDA Toolkit version you install must be **less than or equal to** the version supported by your driver. It also needs to be compatible with the deep learning library you plan to use (PyTorch/TensorFlow). A good, safe choice is often **CUDA 11.8**. 104 | * Go to the [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive). 105 | * Find your desired version (e.g., 11.8.0), select your OS and installer type, and download. 106 | * Run the installer. When prompted, choose the **Custom (Advanced)** installation and **deselect everything except the CUDA components**. Specifically, do **not** let it install a graphics driver, as you already have a newer one. 107 | 108 | 3. **Install cuDNN:** 109 | * cuDNN is a library that provides highly optimized routines for deep learning operations. 110 | * Go to the [cuDNN Archive](https://developer.nvidia.com/rdp/cudnn-archive). You will need to sign up for a free NVIDIA Developer account. 111 | * Find the cuDNN version that corresponds to your CUDA Toolkit version (e.g., "cuDNN v8.9.5 for CUDA 11.x"). Download the zip file. 112 | * **This is a manual installation:** 113 | a. Unzip the downloaded file. You will see three folders: `bin`, `include`, and `lib`. 114 | b. Navigate to your CUDA Toolkit installation directory. By default, this is: `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8` 115 | c. Copy the contents of the unzipped folders into the corresponding folders in your CUDA installation directory. 116 | * Copy `cudnn-*.h` from `include` to the CUDA `include` folder. 117 | * Copy `cudnn-*.lib` from `lib` to the CUDA `lib` folder. 118 | * Copy `cudnn-*.dll` from `bin` to the CUDA `bin` folder. 119 | 120 | --- 121 | 122 | ### **Step 5: Install Core Data Science Libraries** 123 | 124 | Let's install the essential packages into our `datasci` environment. 125 | 126 | 1. Make sure your `datasci` environment is active in your terminal (`(datasci)` should be visible). 127 | 2. Install the core stack using conda: 128 | ```bash 129 | conda install numpy pandas matplotlib scikit-learn jupyter 130 | ``` 131 | 3. **(Optional) Install a Deep Learning Library:** 132 | * **PyTorch (Recommended):** Go to the [PyTorch website's get-started local page](https://pytorch.org/get-started/locally/). Select the stable build, your OS, Conda, Python, and your CUDA version (e.g., 11.8). It will generate a command for you. Copy and run it. 
It will look something like this: 133 | ```bash 134 | conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia 135 | ``` 136 | * **TensorFlow:** Installation is usually done with `pip`. 137 | ```bash 138 | pip install tensorflow 139 | ``` 140 | 141 | ### **Final Verification** 142 | 143 | Let's make sure everything works together. 144 | 145 | 1. In VS Code, create a new file named `test_env.py`. 146 | 2. Make sure your `datasci` interpreter is selected in the bottom-right corner. 147 | 3. Paste this code into the file: 148 | 149 | ```python 150 | import numpy as np 151 | import pandas as pd 152 | import sklearn 153 | 154 | # Optional: PyTorch GPU check 155 | try: 156 | import torch 157 | print(f"PyTorch version: {torch.__version__}") 158 | gpu_available = torch.cuda.is_available() 159 | print(f"Is GPU available? {gpu_available}") 160 | if gpu_available: 161 | print(f"GPU Name: {torch.cuda.get_device_name(0)}") 162 | except ImportError: 163 | print("PyTorch not installed.") 164 | 165 | print("\nEnvironment setup is successful!") 166 | print(f"NumPy version: {np.__version__}") 167 | print(f"Pandas version: {pd.__version__}") 168 | print(f"Scikit-learn version: {sklearn.__version__}") 169 | ``` 170 | 171 | 4. Right-click in the editor and select "Run Python File in Terminal". 172 | 173 | If the script runs without errors and you see the version numbers and a "successful" message, **congratulations!** You have successfully built a professional-grade data science environment. If you installed the GPU components, you should see `Is GPU available? True`. 174 | 175 | You are now ready to tackle any data science or machine learning project. -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.3. Regularization, Bias-Variance Trade-Off, Kernel Methods, Cross Validation/Cross Validation.md: -------------------------------------------------------------------------------- 1 | # 🚀 Cross-Validation 2 | --- 3 | Gordon.H | SHSID Data Science Club 4 | --- 5 | 6 | Hello again! We have just learned how to spot underfitting (high bias) and overfitting (high variance). We've even learned how to *tame* overfitting with regularization. 7 | 8 | But this raises a crucial question: **How do we get a reliable score for our model?** How can we be sure that a model is genuinely good, and not just "lucky" on the one-time test set we gave it? 9 | 10 | Today, we'll learn the gold-standard technique for model evaluation: **Cross-Validation**. 11 | 12 | ### Our Learning Journey Today 13 | 14 | We'll learn the professional's method for evaluating models and tuning their "dials" (hyperparameters). 15 | 16 | ```mermaid 17 | graph LR 18 | A[📍 We are here
We know about train/test split] --> B(Part 1: The Problem with a Single Split
The "Lucky" Test Set); 19 | B --> C(Part 2: K-Fold Cross-Validation
The Fair Solution); 20 | C --> D(Part 3: The Killer App: Hyperparameter Tuning
Finding the Best Settings); 21 | D --> E(🏆 We will be here
You can confidently evaluate and tune any model); 22 | 23 | style A fill:#f9f,stroke:#333,stroke-width:2px 24 | style E fill:#9f9,stroke:#333,stroke-width:2px 25 | ``` 26 | 27 | --- 28 | 29 | ## Part 1: The Problem with a Single Train/Test Split 30 | 31 | So far, you've probably seen this workflow: 32 | 1. Take all your data. 33 | 2. Split it once into a training set and a testing set (e.g., 80% train, 20% test). 34 | 3. Train your model on the training set. 35 | 4. Evaluate its performance on the testing set. 36 | 37 | ```mermaid 38 | graph LR 39 | subgraph Single Split Method 40 | Data[Full Dataset] --> Split{80/20 Split} 41 | Split --> Train[Training Set (80%)] 42 | Split --> Test[Test Set (20%)] 43 | Train --> Model[Train Model] 44 | Model --> Evaluate 45 | Test --> Evaluate{Evaluate Model} 46 | Evaluate --> Score[Get ONE Score] 47 | end 48 | ``` 49 | 50 | **The Problem:** The final score depends heavily on *which* 20% of the data ended up in the test set. 51 | * What if, just by random chance, the test set contained all the "easy" examples? You'd get a great score and think your model is a genius! 52 | * What if the test set happened to get all the "hard" or "weird" examples? You'd get a terrible score and might discard a perfectly good model. 53 | 54 | This is **high variance in your evaluation**. The score you get is not stable or reliable. We need a better, more robust way. 55 | 56 | --- 57 | 58 | ## Part 2: K-Fold Cross-Validation - A More Robust Referee 59 | 60 | Instead of a single split, Cross-Validation says: **"Let's do this multiple times and average the results!"** 61 | 62 | The most common method is **K-Fold Cross-Validation**. 63 | 64 | **The Main Idea:** 65 | 1. Split the *entire* dataset into `K` equal-sized "folds" (or groups). A common choice for `K` is 5 or 10. 66 | 2. Then, we run `K` experiments. In each experiment: 67 | * We pick **one** fold to be our test set. 68 | * We use the **remaining `K-1` folds** as our training set. 69 | 3. We train the model, evaluate it on the test fold, and record the score. 70 | 4. After running `K` times (with each fold getting a turn to be the test set), we average the `K` scores to get a final, more reliable performance estimate. 71 | 72 | ### Visualizing 5-Fold Cross-Validation (K=5) 73 | 74 | ```mermaid 75 | graph LR 76 | Data[Full Dataset] --> S[Split into 5 Folds] 77 | S --> F1[Fold 1] & F2[Fold 2] & F3[Fold 3] & F4[Fold 4] & F5[Fold 5] 78 | 79 | subgraph Iteration 1 80 | direction LR 81 | Train1[Train on F2,F3,F4,F5] --> Test1(Test on F1) --> Score1[Score 1] 82 | end 83 | 84 | subgraph Iteration 2 85 | direction LR 86 | Train2[Train on F1,F3,F4,F5] --> Test2(Test on F2) --> Score2[Score 2] 87 | end 88 | 89 | subgraph Iteration 3 90 | direction LR 91 | Train3[Train on F1,F2,F4,F5] --> Test3(Test on F3) --> Score3[Score 3] 92 | end 93 | 94 | subgraph Iteration 4 95 | direction LR 96 | Train4[Train on F1,F2,F3,F5] --> Test4(Test on F4) --> Score4[Score 4] 97 | end 98 | 99 | subgraph Iteration 5 100 | direction LR 101 | Train5[Train on F1,F2,F3,F4] --> Test5(Test on F5) --> Score5[Score 5] 102 | end 103 | 104 | Score1 & Score2 & Score3 & Score4 & Score5 --> Final{Average the Scores} 105 | Final --> FinalScore[Final CV Score
± a measure of variance] 106 | 107 | style Test1 fill:#f99 108 | style Test2 fill:#f99 109 | style Test3 fill:#f99 110 | style Test4 fill:#f99 111 | style Test5 fill:#f99 112 | ``` 113 | 114 | ### The Math: Simple and Sweet 115 | 116 | The final Cross-Validation score is just the average of the scores from each fold. 117 | 118 | $$ \text{CV}_{\text{score}} = \frac{1}{K} \sum_{i=1}^{K} \text{score}_i = \frac{\text{score}_1 + \text{score}_2 + ... + \text{score}_K}{K} $$ 119 | 120 | We also look at the **standard deviation** of the scores. A low standard deviation tells us the model's performance is stable and consistent across different subsets of the data. A high standard deviation means the performance is erratic. 121 | 122 | ### 🐍 Python Example: Evaluating Our Bias-Variance Models 123 | 124 | Let's use 5-fold CV to evaluate the three polynomial models from our last lesson. Which one will CV tell us is the best? 125 | 126 | ```python 127 | import numpy as np 128 | from sklearn.model_selection import cross_val_score 129 | from sklearn.pipeline import make_pipeline 130 | from sklearn.linear_model import LinearRegression 131 | from sklearn.preprocessing import PolynomialFeatures 132 | 133 | # 1. Generate the same sample data 134 | np.random.seed(0) 135 | X = np.linspace(0, 10, 100).reshape(-1, 1) # More data points for CV 136 | y = np.sin(X).ravel() + np.random.normal(0, 0.5, 100) 137 | 138 | # 2. Define our three models from the bias-variance lesson 139 | underfit_model = make_pipeline(PolynomialFeatures(degree=1), LinearRegression()) 140 | just_right_model = make_pipeline(PolynomialFeatures(degree=4), LinearRegression()) 141 | overfit_model = make_pipeline(PolynomialFeatures(degree=15), LinearRegression()) 142 | 143 | # 3. Use 5-fold cross-validation (cv=5) to evaluate each model 144 | # 'neg_mean_squared_error' is used because scikit-learn likes to maximize scores. 145 | # We'll just flip the sign back to positive to interpret it as error. 146 | scores_underfit = -cross_val_score(underfit_model, X, y, cv=5, scoring='neg_mean_squared_error') 147 | scores_just_right = -cross_val_score(just_right_model, X, y, cv=5, scoring='neg_mean_squared_error') 148 | scores_overfit = -cross_val_score(overfit_model, X, y, cv=5, scoring='neg_mean_squared_error') 149 | 150 | # 4. Print the results 151 | print("--- Underfit Model (Degree 1) ---") 152 | print("Individual Fold Errors:", scores_underfit.round(2)) 153 | print(f"Average CV Error: {scores_underfit.mean():.2f} (+/- {scores_underfit.std():.2f})\n") 154 | 155 | print("--- Just Right Model (Degree 4) ---") 156 | print("Individual Fold Errors:", scores_just_right.round(2)) 157 | print(f"Average CV Error: {scores_just_right.mean():.2f} (+/- {scores_just_right.std():.2f})\n") 158 | 159 | print("--- Overfit Model (Degree 15) ---") 160 | print("Individual Fold Errors:", scores_overfit.round(2)) 161 | print(f"Average CV Error: {scores_overfit.mean():.2f} (+/- {scores_overfit.std():.2f})\n") 162 | ``` 163 | 164 | **Expected Output & Analysis:** 165 | * **Underfit Model:** Will have a consistently high error (e.g., avg error ~0.6). The standard deviation will be relatively low because it's consistently bad. 166 | * **Just Right Model:** Will have the **lowest average error** (e.g., avg error ~0.25). This is our winner! 167 | * **Overfit Model:** Will have a very high average error, and likely a **huge standard deviation**. This is because its performance is highly dependent on the specific data in each fold—it does well on some folds and terribly on others. 
168 | 169 | Cross-validation correctly and reliably identified the "Just Right" model as the best! 170 | 171 | --- 172 | 173 | ## Part 3: The Killer Application: Hyperparameter Tuning 174 | 175 | Models have "dials" you can turn called **hyperparameters**. Examples include: 176 | * The `alpha` ($\lambda$) in Ridge and Lasso regression. 177 | * The `C` and `gamma` ($\gamma$) in Support Vector Machines. 178 | * The `degree` of the polynomial we just used. 179 | 180 | How do we find the *best* setting for these dials? 181 | 182 | **The Golden Rule of Machine Learning:** The test set is a final exam. You only use it **ONCE**, at the very end, to report your final score. You cannot use it to tune your hyperparameters. Using the test set to pick the best `alpha` is a form of "cheating" or **data leakage**. 183 | 184 | So, how do we do it? **We use Cross-Validation on the training set!** 185 | 186 | ### The Correct Workflow 187 | 188 | ```mermaid 189 | graph LR 190 | Data[Full Dataset] --> Split1{Step 1: The Great Split
Split into Train & Final Test sets} 191 | Split1 --> FinalTest[Final Test Set (20%)
LOCK THIS AWAY! 🔒] 192 | Split1 --> TrainVal[Training + Validation Set (80%)] 193 | 194 | subgraph Step 2: Hyperparameter Tuning using CV 195 | direction LR 196 | TrainVal --> CV[Perform K-Fold CV
within this 80% data] 197 | CV -- "for each alpha value" --> Scores[Get avg CV score for alpha] 198 | end 199 | 200 | Scores --> Best[Find alpha with best CV score] 201 | 202 | Best --> Step3["Step 3: Final Training
Train a NEW model on ALL of the
Training+Validation data (80%),
using the best alpha."] 203 | 204 | Step3 --> FinalEval["Step 4: Final Evaluation
Unlock the Test Set! 🔓
Evaluate your final model ONCE."] 205 | FinalTest --> FinalEval 206 | FinalEval --> Report[Report Final Score] 207 | 208 | style FinalTest fill:#f99 209 | ``` 210 | 211 | This seems complicated, but thankfully, `scikit-learn` has a tool that does all of Step 2 for us automatically: `GridSearchCV`. 212 | 213 | ### 🐍 Python Example: Finding the Best `alpha` for Ridge Regression 214 | 215 | `GridSearchCV` will test a "grid" of hyperparameter values using cross-validation and tell us which one was the best. 216 | 217 | ```python 218 | from sklearn.model_selection import GridSearchCV 219 | from sklearn.linear_model import Ridge 220 | 221 | # 1. We're still using our same X and y data. 222 | # Imagine this is our "Training + Validation Set (80%)" 223 | 224 | # 2. Define the model we want to tune 225 | model_to_tune = Ridge() 226 | 227 | # 3. Set up the "grid" of hyperparameters to test. 228 | # We'll give it a list of different alpha values to try. 229 | param_grid = { 230 | 'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000] # These are the λ values 231 | } 232 | 233 | # 4. Set up GridSearchCV 234 | # It will try each alpha value using 5-fold CV. 235 | # n_jobs=-1 uses all your computer's cores to speed it up! 236 | grid_search = GridSearchCV(model_to_tune, param_grid, cv=5, scoring='neg_mean_squared_error') 237 | 238 | # 5. Run the search! This does all the work from the diagram's "Step 2". 239 | grid_search.fit(X, y) 240 | 241 | # 6. Print the best results 242 | print("GridSearchCV found the best settings!") 243 | print("Best alpha (λ):", grid_search.best_params_) 244 | print("Best CV Score (MSE):", -grid_search.best_score_) 245 | ``` 246 | **Analysis:** 247 | `GridSearchCV` will automatically iterate through all the `alpha` values, perform 5-fold cross-validation for each one, and store the results. The `.best_params_` attribute will show you which `alpha` gave the lowest average error across the folds. You would then take this `alpha` to train your final model. 248 | 249 | --- 250 | 251 | ## A Quick Note on Other CV Methods 252 | 253 | * **Stratified K-Fold:** When you have a classification problem with imbalanced classes (e.g., 99% not-fraud, 1% fraud), you need to make sure each fold has the same percentage of each class. `StratifiedKFold` does this automatically. 254 | * **Leave-One-Out (LOOCV):** This is K-Fold where `K` is equal to the number of data points. In each step, you train on all data points except one, and test on that one. It's very thorough but extremely slow and usually only used for very small datasets. 255 | 256 | ## Conclusion: Your Professional ML Workflow 257 | 258 | You now have the complete, professional workflow for building a machine learning model. 259 | 260 | > **Your Problem-Solving Guide:** 261 | > 262 | > 1. **Split Data:** Perform the one-time split into `train_set` and `test_set`. Lock the `test_set` away. 263 | > 2. **Choose a Model:** Pick a model appropriate for your problem (e.g., Ridge for regression, SVM for classification). 264 | > 3. **Tune Hyperparameters:** Use **GridSearchCV** on the `train_set` to find the best hyperparameters for your model. 265 | > 4. **Train Final Model:** Train your chosen model with the best hyperparameters on the *entire* `train_set`. 266 | > 5. **Final Evaluation:** Unleash the `test_set` and evaluate your final model once to get its true performance score. 267 | 268 | Cross-validation is the bridge between theory and practice. 
It's how we move from worrying about bias and variance to confidently building models that work well in the real world. Happy (and robust) modeling!
--------------------------------------------------------------------------------
/4. Deep Learning & Computer Vision/4.4. Convolutional Layers, Pooling Layers, Convolutional Neural Network/CNN.md:
--------------------------------------------------------------------------------

# Seeing the world - Convolutional Neural Networks
* Gordon.H | SHSID Data Science Group

*Welcome back to the course, Junior ML Engineers!*

Today we will be learning about the ultimate solution for image processing: **Convolutional Neural Networks**.

---
### Requirements
* Understanding of the fundamentals of Machine Learning
* Basic knowledge of Neural Networks
* Basic Python and NumPy library usage

---
### 1. Problem with Images

Say you have a small grayscale image of size 28x28 pixels:
* Size = 28 x 28 = 784 pixels
* To feed it into a dense layer, we flatten it into a vector of **784** numbers.
* If the first layer has 128 neurons, we will need 784 x 128 = **100,352** weights.

This is a huge problem because:
* Training this many parameters is inefficient
* Spatial information is lost when we flatten the image

CNNs are designed to solve this problem with a smarter approach:

```mermaid
graph TD
    subgraph Dense Network Approach
    A[Input Image
28x28x1] --> B{Flatten}; 32 | B --> C[1D Vector
784 neurons]; 33 | C --> D[Dense Layer]; 34 | style A fill:#f9f,stroke:#333,stroke-width:2px 35 | end 36 | 37 | subgraph CNN Approach 38 | E[Input Image
28x28x1] --> F{Convolutional Layer}; 39 | F --> G[Feature Map
e.g., 26x26x32]; 40 | style E fill:#9cf,stroke:#333,stroke-width:2px 41 | end 42 | 43 | A -- "Loses spatial structure" --> C 44 | E -- "Preserves spatial structure" --> G 45 | ``` 46 | As you see, CNN keeps the image's 2D structure, allowing it to learn from pixel neighborhoods. 47 | 48 | --- 49 | 50 | ### 2. The core of CNN : Convolutional Layer 51 | 52 | Instead of looking at a large image at once, CNN looks at it in small chunks. 53 | 54 | A **filter** is a small matrix of weights that the network learns. The process of sliding the filter and computing the output is called a **convolution**. 55 | 56 | Here's a mini-example of a 2x2 filter sliding over a 3x3 image to produce a 2x2 feature map. 57 | 58 | ``` 59 | Input Image (I) Filter (K) Feature Map (O) 60 | +---+---+---+ +---+---+ +---+---+ 61 | | 1 | 5 | 2 | | 1 | 0 | | 9 | ? | 62 | +---+---+---+ +---+---+ +---+---+ 63 | | 8 | 1 | 6 | | 1 | 0 | | ? | ? | 64 | +---+---+---+ +---+---+ +---+---+ 65 | | 3 | 4 | 7 | 66 | +---+---+---+ 67 | ``` 68 | To calculate the top-left value of the output: `(1*1) + (5*0) + (8*1) + (1*0) = 9`. 69 | 70 | #### The Mathematical Logic 71 | 72 | The mathematical formula for such operation, **cross-correlation**, looks like this: 73 | $$ 74 | O_{i,j} = b + \sum_{u=0}^{F-1} \sum_{v=0}^{F-1} I_{i+u, j+v} \cdot K_{u,v} 75 | $$ 76 | 77 | Looks complicated right? Lets break it down: 78 | 79 | * $O_{i,j}$: The output value at row `i`, column `j` in the feature map. 80 | 81 | * $b$: A learnable **bias** term, which helps the filter make better predictions. 82 | 83 | * $\sum$: The "sum" symbol. We sum over the filter's dimensions (`u` and `v`). 84 | 85 | * $I_{i+u, j+v}$: A pixel value from the **Input** image patch. 86 | 87 | * $K_{u,v}$: A weight from our **Kernel** (aka **the filter**). 88 | 89 | This formula is a precise mathematical formula for cross correlation in Machine Learning, in mathematics convolution is a bit different, it involves flipping the filter (both horizontally and vertically) before sliding it over the image. The reason for such difference is due to the special nature of neural networks, the values in the filter are learned during training, the network can simply learn the flipped version of the filter if it needs to. The cross correlation is easier to implement. 90 | 91 | #### Hyperparameters and Output Size 92 | The size of our output feature map depends on the hyperparameters we choose. The output width ($W_{out}$) and height ($H_{out}$) can be calculated with this formula: 93 | 94 | $$ 95 | W_{out} = \frac{W_{in} - F + 2P}{S} + 1 96 | $$ 97 | $$ 98 | H_{out} = \frac{H_{in} - F + 2P}{S} + 1 99 | $$ 100 | 101 | Where: 102 | * $W_{in}, H_{in}$: Input width and height. 103 | * $F$: Filter size. 104 | * $P$: Padding (number of pixels added to the border). 105 | * $S$: Stride (how many pixels the filter slides at a time). 106 | 107 | #### Example Code 108 | *Note: You can run the following code locally to try out convolutional layers!* 109 | 110 | ```Python 111 | # Remember to use pip to install numpy and matplotlib! 112 | import numpy as np 113 | import matplotlib.pyplot as plt 114 | 115 | # 1. Define the Input and Filter 116 | # A simple 6x6 grayscale image. 117 | # It has a sharp vertical edge down the middle. 
118 | # (Low values = dark, high values = light) 119 | input_image = np.array([ 120 | [10, 10, 10, 100, 100, 100], 121 | [10, 10, 10, 100, 100, 100], 122 | [10, 10, 10, 100, 100, 100], 123 | [10, 10, 10, 100, 100, 100], 124 | [10, 10, 10, 100, 100, 100], 125 | [10, 10, 10, 100, 100, 100] 126 | ]) 127 | 128 | # A 3x3 filter designed to detect vertical edges. 129 | # The positive values on the left and negative on the right 130 | # will give a high response when we move from dark to light. 131 | vertical_edge_filter = np.array([ 132 | [ 1, 0, -1], 133 | [ 2, 0, -2], # This is a "Sobel" filter, common in image processing 134 | [ 1, 0, -1] 135 | ]) 136 | 137 | # 2. The Convolution Operation 138 | # Get dimensions (assuming no padding, stride=1) 139 | img_h, img_w = input_image.shape 140 | filter_h, filter_w = vertical_edge_filter.shape 141 | out_h = (img_h - filter_h) + 1 142 | out_w = (img_w - filter_w) + 1 143 | 144 | # Create an empty feature map to store the output 145 | output_feature_map = np.zeros((out_h, out_w)) 146 | 147 | # Slide filter over the image 148 | for y in range(out_h): 149 | for x in range(out_w): 150 | # Get current patch of the image 151 | image_patch = input_image[y : y + filter_h, x : x + filter_w] 152 | 153 | # Perform element-wise multiplication and sum the result 154 | # This is the core of the convolution! 155 | convolution_sum = np.sum(image_patch * vertical_edge_filter) 156 | 157 | # Store result in the map 158 | output_feature_map[y, x] = convolution_sum 159 | # 3.Display Results 160 | print("--- Manual NumPy Convolution ---\n") 161 | print("Input Image:\n", input_image) 162 | print("\nVertical Edge Filter:\n", vertical_edge_filter) 163 | print("\nOutput Feature Map:\n", output_feature_map) 164 | print("\nNotice the high values in the output where the vertical edge was detected!") 165 | # Visualize the images 166 | fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4)) 167 | ax1.imshow(input_image, cmap='gray') 168 | ax1.set_title("Original Image") 169 | ax2.imshow(output_feature_map, cmap='gray') 170 | ax2.set_title("Feature Map (Edges)") 171 | plt.show() 172 | ``` 173 | 174 | --- 175 | ### 3. Making it Robust: The Pooling layer 176 | 177 | A Pooling layer shrinks the feature map to make the network faster and robust. The most common type of pooling is **Max Pooling**. 178 | 179 | #### Visualizing Max Pooling 180 | 181 | Imagine a 2x2 Max Pooling operation on a 4x4 feature map. 182 | 183 | ``` 184 | Feature Map Pooled Output 185 | +---+---+---+---+ +---+---+ 186 | |12 |20 | 30| 0 | max(12,20,8,12)--> |20 | 187 | +---+---+---+---+ +---+---+ 188 | | 8 |12 | 2 | 0 | max(30,0,2,0)--> |30 | 189 | +---+---+---+---+ +---+---+ 190 | |34 |70 | 37| 4 | max(34,70,112,100)-->|112| 191 | +---+---+---+---+ +---+---+ 192 | |112|100| 25| 12| max(37,4,25,12)--> |37 | 193 | +---+---+---+---+ +---+---+ 194 | ``` 195 | This keeps only the strongest signal, reducing the map size from 4x4 to 2x2. 196 | 197 | #### The Math Behind Pooling 198 | 199 | Here is the formula for Max Pooling: 200 | 201 | $$ 202 | P_{i,j} = \max_{0 \le u < F_p, 0 \le v < F_p} A_{i \cdot S_p + u, j \cdot S_p + v} 203 | $$ 204 | 205 | This formally states: "The output $P_{i,j}$ is the `max` value from the input feature map `A` within the pooling window." 
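To mirror the manual convolution code above, here is a minimal NumPy sketch of 2x2 max pooling with stride 2, applied to the 4x4 feature map from the diagram.

```python
import numpy as np

feature_map = np.array([
    [ 12,  20,  30,   0],
    [  8,  12,   2,   0],
    [ 34,  70,  37,   4],
    [112, 100,  25,  12]
])

pool_size, stride = 2, 2
out_h = feature_map.shape[0] // stride
out_w = feature_map.shape[1] // stride
pooled = np.zeros((out_h, out_w))

for y in range(out_h):
    for x in range(out_w):
        # keep only the strongest signal in each 2x2 window
        window = feature_map[y*stride : y*stride + pool_size,
                             x*stride : x*stride + pool_size]
        pooled[y, x] = window.max()

print(pooled)
# [[ 20.  30.]
#  [112.  37.]]
```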
206 | 
207 | ---
208 | ### 4. Putting It All Together: A Full CNN Architecture
209 | 
210 | A real-world CNN stacks these layers together:
211 | 
212 | ```mermaid
213 | graph LR
214 |     A["Input Image (28x28x1)"] --> B["Conv2D Layer\n32 filters, 3x3\nOutput: 26x26x32"]
215 |     B --> C["MaxPooling2D\n2x2 window\nOutput: 13x13x32"]
216 |     C --> D["Conv2D Layer\n64 filters, 3x3\nOutput: 11x11x64"]
217 |     D --> E["MaxPooling2D\n2x2 window\nOutput: 5x5x64"]
218 |     E --> F["Flatten Layer\nOutput: 1600 nodes"]
219 |     F --> G["Dense Layer\n128 nodes"]
220 |     G --> H["Output Layer\n10 nodes (Softmax)"]
221 | 
222 |     subgraph Feature Extractor
223 |     B; C; D; E;
224 |     end
225 | 
226 |     subgraph Classifier
227 |     F; G; H;
228 |     end
229 | ```
230 | The final layer uses a **Softmax** activation function to convert the network's scores into a probability distribution.
231 | 
232 | The **Softmax** function for a specific output class `i` is:
233 | 
234 | $$
235 | \sigma(\mathbf{z})_i = \frac{e^{z_i}}{\sum_{j=1}^{C} e^{z_j}}
236 | $$
237 | 
238 | The formula guarantees that every output value lies between 0 and 1 and that the outputs sum to 1, so we can treat them as the model's confidence for each class. For example, raw scores $(2, 1, 0)$ become roughly $(0.67, 0.24, 0.09)$.
239 | 
240 | ---
241 | 
242 | ### 5. Coding Example: A Fully Functional CNN Architecture
243 | 
244 | The following example uses PyTorch and Matplotlib to build, train, and evaluate a small CNN on MNIST.
245 | 
246 | ```python
247 | import torch
248 | import torch.nn as nn
249 | import torch.optim as optim
250 | from torchvision import datasets, transforms
251 | from torch.utils.data import DataLoader
252 | import matplotlib.pyplot as plt
253 | 
254 | # Define the CNN architecture
255 | class MNIST_CNN(nn.Module):
256 |     def __init__(self):
257 |         super(MNIST_CNN, self).__init__()
258 |         # Feature extractor
259 |         self.features = nn.Sequential(
260 |             nn.Conv2d(1, 32, kernel_size=3),   # 28x28x1 -> 26x26x32
261 |             nn.ReLU(),
262 |             nn.MaxPool2d(2),                   # 26x26x32 -> 13x13x32
263 |             nn.Conv2d(32, 64, kernel_size=3),  # 13x13x32 -> 11x11x64
264 |             nn.ReLU(),
265 |             nn.MaxPool2d(2)                    # 11x11x64 -> 5x5x64
266 |         )
267 | 
268 |         # Classifier
269 |         self.classifier = nn.Sequential(
270 |             nn.Flatten(),             # 5x5x64 -> 1600
271 |             nn.Linear(5*5*64, 128),   # 1600 -> 128
272 |             nn.ReLU(),
273 |             nn.Linear(128, 10)        # 128 -> 10
274 |         )
275 | 
276 |     def forward(self, x):
277 |         x = self.features(x)
278 |         x = self.classifier(x)
279 |         return x
280 | 
281 | # Initialize model, loss function, and optimizer
282 | model = MNIST_CNN()
283 | criterion = nn.CrossEntropyLoss()
284 | optimizer = optim.Adam(model.parameters(), lr=0.001)
285 | 
286 | # Load MNIST data
287 | transform = transforms.Compose([
288 |     transforms.ToTensor(),
289 |     transforms.Normalize((0.1307,), (0.3081,))
290 | ])
291 | 
292 | train_data = datasets.MNIST('./data', train=True, download=True, transform=transform)
293 | test_data = datasets.MNIST('./data', train=False, transform=transform)
294 | 
295 | train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
296 | test_loader = DataLoader(test_data, batch_size=1000)
297 | 
298 | # Training function
299 | def train(model, device, train_loader, optimizer, epoch):
300 |     model.train()
301 |     for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)  # move the batch to the same device as the model
302 |         optimizer.zero_grad()
303 |         output = model(data)
304 |         loss = criterion(output, target)
305 |         loss.backward()
306 |         optimizer.step()
307 | 
308 |         if batch_idx % 100 == 0:
309 |             print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} '
310 |                   f'({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')
311 | 
312 | # Test function
313 | def test(model, device, test_loader):
314 |     model.eval()
315 |     test_loss = 0
316 |     correct = 0
317 |     with torch.no_grad():
318 |         for data, target in test_loader:
            data, target = data.to(device), target.to(device)  # evaluate on the same device as the model
319 |             output = model(data)
320 |             test_loss += criterion(output, target).item()
321 |             pred = output.argmax(dim=1, keepdim=True)
322 |             correct += pred.eq(target.view_as(pred)).sum().item()
323 | 
324 |     test_loss /= len(test_loader)
325 |     accuracy = 100. * correct / len(test_loader.dataset)
326 |     print(f'\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} '
327 |           f'({accuracy:.0f}%)\n')
328 |     return accuracy
329 | 
330 | # Training loop
331 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
332 | model.to(device)
333 | 
334 | accuracies = []
335 | for epoch in range(1, 6):  # 5 epochs
336 |     train(model, device, train_loader, optimizer, epoch)
337 |     acc = test(model, device, test_loader)
338 |     accuracies.append(acc)
339 | 
340 | # Plot accuracy
341 | plt.plot(range(1, 6), accuracies)
342 | plt.title('Model Accuracy')
343 | plt.xlabel('Epoch')
344 | plt.ylabel('Accuracy (%)')
345 | plt.show()
346 | 
347 | # Save model
348 | torch.save(model.state_dict(), 'mnist_cnn.pth')
349 | ```
350 | 
351 | ### Summary & Conclusion
352 | 
353 | **Congratulations!** You have just completed your lesson on Convolutional Neural Networks!
354 | 
355 | Throughout this lesson you have learned:
356 | 
357 | * How **Convolutional Layers** use filters to find features, along with the formal math behind the process.
358 | * How **Pooling Layers** make the network more robust and efficient.
359 | * How a complete **CNN** architecture is assembled, and how to build and train one in PyTorch.
360 | 
361 | In the next lesson, we will learn about image data augmentation.
362 | 
-------------------------------------------------------------------------------- /2. Machine Learning Generics/2.3. Regularization, Bias-Variance Trade-Off, Kernel Methods, Cross Validation/Regularization, Bias–Variance Trade-off, Kernel Methods.md: --------------------------------------------------------------------------------
1 | # Lesson 3.4 | Regularization, Bias-Variance Trade-off, Kernel Methods 🚀
2 | ---
3 | * Gordon.H | SHSID ML Club
4 | ---
5 | Hi, junior ML engineer, welcome back to the course. Today, we will explore some of the most powerful concepts in machine learning.
6 | So far, you have learned about building basic models like linear regression.
7 | But have you ever wondered...
8 | 
9 | * "Why is my model so bad at predicting things it has not seen before?"
10 | * "How do I stop my model from just memorizing the answers?"
11 | * "How can a computer find patterns that aren't just straight lines?"
12 | 
13 | Today we will learn about three foundational concepts that are at the heart of modern machine learning.
14 | 
15 | Here is an overview of this lesson:
16 | 
17 | ```mermaid
18 | graph LR
19 |     A[📍 You are here
You can build a basic model] --> B(Part 1: The Bias-Variance Tradeoff
The #1 challenge in ML); 20 | B --> C(Part 2: Regularization
The cure for 'memorizing'); 21 | C --> D(Part 3: The Kernel Method
The magic trick for complex patterns); 22 | D --> E(🏆 You will be here
You can build robust, flexible models); 23 | 24 | style A fill:#f9f,stroke:#333,stroke-width:2px 25 | style E fill:#9f9,stroke:#333,stroke-width:2px 26 | ``` 27 | 28 | --- 29 | 30 | ## Part 1: Bias Variance Trade-Off 🐻 31 | 32 | Every machine learning model you build faces a fundamental tug-of-war. It's a balance between being too simple and being too complex. Getting this balance right is the key to a good model. This is the **Bias-Variance Tradeoff**. 33 | 34 | ## The Bullseye Analogy 🎯 35 | 36 | Imagine an archer trying to hit the bullseye, the true underlying pattern of the data. 37 | 38 | ```mermaid 39 | graph TD 40 | subgraph Legend 41 | direction LR 42 | Bullseye(Bullseye = True Pattern) 43 | Hits(Blue Dots = Model's Predictions) 44 | end 45 | 46 | subgraph Four Archers: A Comparison 47 | direction LR 48 | A[Low Bias, Low Variance

🎯

Accurate & Consistent
This is our GOAL!] 49 | B[Low Bias, High Variance

🎯

Accurate on average, but
predictions are all over the place.] 50 | C[High Bias, Low Variance

🎯

Consistently misses in the
same spot. Very predictable, but wrong.] 51 | D[High Bias, High Variance

🎯

Inaccurate & Inconsistent.
The worst of both worlds!] 52 | end 53 | 54 | style A fill:#9f9 55 | ``` 56 | 57 | * **Bias (Underfitting):** High bias is like a misaligned scope on a rifle. You are consistently wrong in the same way. Your model is too simple and has a fundamental "bias" that prevents it from capturing the truth. 58 | * **Variance (Overfitting):** High variance is like an archer with a shaky hand. Your shots are inconsistent. Your model is too complex and is distracted by random noise in the data, causing its predictions to vary wildly with new data. 59 | 60 | ### The Error Curve: The Fix 61 | This is one of the most important graphs in machine learning. It shows how error changes as we make a model more complex. Our goal is to find the lowest point of the "Total Error" curve. 62 | 63 | ```mermaid 64 | xychart-beta 65 | title "The Bias-Variance Tradeoff vs. Model Complexity" 66 | x-axis "Model Complexity (More features, higher polynomial degree) -->" 67 | y-axis "Prediction Error" 68 | 69 | line "Bias²" [9, 7, 5, 3.5, 2.5, 1.5, 1, 0.8, 0.6, 0.5, 0.4] 70 | line "Variance" [0.2, 0.3, 0.5, 0.8, 1.5, 2.5, 4, 5.5, 7, 8, 9] 71 | line "Total Error" [9.2, 7.3, 5.5, 4.3, 4.0, 4.0, 5, 6.3, 7.6, 8.5, 9.4] 72 | ``` 73 | 74 | ### Python Example - Visualizing the Tradeoff: 75 | We'll try to fit models of different complexities to some curvy data. 76 | 77 | ```python 78 | import numpy as np 79 | import matplotlib.pyplot as plt 80 | from sklearn.pipeline import make_pipeline 81 | from sklearn.linear_model import LinearRegression 82 | from sklearn.preprocessing import PolynomialFeatures 83 | 84 | # 1. Generate some sample data (a sine wave with noise) 85 | np.random.seed(0) 86 | X = np.linspace(0, 10, 30).reshape(-1, 1) 87 | y = np.sin(X).ravel() + np.random.normal(0, 0.5, 30) 88 | 89 | # 2. Define models with different complexities (polynomial degree) 90 | underfit_model = make_pipeline(PolynomialFeatures(degree=1), LinearRegression()) 91 | just_right_model = make_pipeline(PolynomialFeatures(degree=4), LinearRegression()) 92 | overfit_model = make_pipeline(PolynomialFeatures(degree=15), LinearRegression()) 93 | 94 | # 3. Fit the models 95 | underfit_model.fit(X, y) 96 | just_right_model.fit(X, y) 97 | overfit_model.fit(X, y) 98 | 99 | # 4. Plot everything 100 | X_plot = np.linspace(0, 10, 100).reshape(-1, 1) 101 | plt.figure(figsize=(15, 8)) 102 | plt.scatter(X, y, label='Original Data Points', color='black', zorder=5) 103 | 104 | # Plot Underfit Model 105 | plt.plot(X_plot, underfit_model.predict(X_plot), label='Underfit (Degree 1) - High Bias', color='red', linewidth=2) 106 | # Plot Just Right Model 107 | plt.plot(X_plot, just_right_model.predict(X_plot), label='Just Right (Degree 4) - Sweet Spot!', color='green', linewidth=4) 108 | # Plot Overfit Model 109 | plt.plot(X_plot, overfit_model.predict(X_plot), label='Overfit (Degree 15) - High Variance', color='orange', linewidth=2, linestyle='--') 110 | 111 | plt.title('Visualizing the Bias-Variance Tradeoff', fontsize=16) 112 | plt.xlabel('Feature', fontsize=12) 113 | plt.ylabel('Target', fontsize=12) 114 | plt.legend(fontsize=12) 115 | plt.ylim(-3, 4) 116 | plt.grid(True, linestyle='--', alpha=0.6) 117 | plt.show() 118 | ``` 119 | **Analysis of the Plot:** 120 | * **🔴 Red Line (High Bias):** Too simple. It misses the curve entirely. 121 | * **🟠 Orange Line (High Variance):** Too complex. It wiggles frantically to "memorize" the noisy data. 122 | * **🟢 Green Line (Sweet Spot):** Just right. It captures the true trend while ignoring the noise. 
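As a quick sanity check on the analysis above, you can also diagnose the tradeoff numerically by comparing training error with error on held-out points. The sketch below is an extra illustration reusing the same synthetic sine data and polynomial degrees; the exact numbers depend on the random split, but the pattern is the point: the degree-1 model scores poorly on both sets (high bias), while the degree-15 model is near-perfect on the training split and much worse on the held-out split (high variance).

```python
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Same noisy sine-wave data as in the plot above
np.random.seed(0)
X = np.linspace(0, 10, 30).reshape(-1, 1)
y = np.sin(X).ravel() + np.random.normal(0, 0.5, 30)

# Hold out some points so overfitting has somewhere to show up
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

for degree in [1, 4, 15]:
    model = make_pipeline(PolynomialFeatures(degree=degree), LinearRegression())
    model.fit(X_train, y_train)
    train_mse = mean_squared_error(y_train, model.predict(X_train))
    val_mse = mean_squared_error(y_val, model.predict(X_val))
    print(f"degree {degree:2d} | train MSE: {train_mse:.3f} | validation MSE: {val_mse:.3f}")
```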
123 | 
124 | ---
125 | 
126 | ## Part 2: Regularization - The Overfitting Tamer
127 | 
128 | We have seen how serious a problem overfitting is. **Regularization** is our primary technique for taming it.
129 | 
130 | **The Main Idea**
131 | We penalize a model for being too complex. We change its goal from *only* minimizing prediction error to minimizing error *AND* keeping its internal parameters (coefficients) small.
132 | 
133 | ### The "Leash" Analogy
134 | Regularization acts like a leash: it pulls the model's coefficients back towards zero to prevent them from growing too large.
135 | 
136 | ```mermaid
137 | graph LR
138 |     subgraph The Forces on Model Coefficients
139 |         Origin((Origin
Coefficients = 0)) -- "Penalty Term (Leash)
pulls coefficients
towards zero" --> Model[Model's Optimal
Coefficients] 140 | 141 | Model -- "Data pulls the model
to fit it perfectly" --> DataPoint1(Data Point 1) 142 | Model -- "Data pulls the model
to fit it perfectly" --> DataPoint2(Data Point 2)
143 |     end
144 | 
145 |     style Origin fill:#ccc,stroke:#333
146 | ```
147 | 
148 | The regularization strength (`λ`, lambda) is the length of the leash. A bigger `λ` means a shorter and stronger leash, a simpler model, and less overfitting.
149 | 
150 | ### The Two Flavors of Regularization: Lasso vs. Ridge
151 | 
152 | 1. **L2 Regularization (Ridge):** Uses a squared penalty: $\lambda \sum (\text{coefficient})^2$.
153 |    * **Effect:** Shrinks all coefficients, making the model more stable.
154 | 
155 | 2. **L1 Regularization (Lasso):** Uses an absolute value penalty: $\lambda \sum |\text{coefficient}|$.
156 |    * **Effect:** Shrinks some coefficients *all the way to zero*. This is powerful because it performs **automatic feature selection**, telling you which features are most important.
157 | 
158 | ### Python: Visualizing How Regularization Works
159 | 
160 | Let's see the "shrinking" effect. We'll create a dataset with 2 important features and 8 useless "noise" features.
161 | 
162 | ```python
163 | import pandas as pd
164 | from sklearn.linear_model import LinearRegression, Ridge, Lasso
import numpy as np                # needed for the data generation below
import matplotlib.pyplot as plt   # needed for the bar chart
165 | 
166 | # 1. Create a dataset with 2 important features and 8 useless "noise" features
167 | np.random.seed(42)
168 | n_samples = 50
169 | n_features = 10
170 | X = np.random.randn(n_samples, n_features)
171 | # Create a target y where only the first two features matter
172 | y = 2 * X[:, 0] + 3 * X[:, 1] + np.random.normal(0, 1, n_samples)
173 | 
174 | # 2. Train three models
175 | lr = LinearRegression()
176 | lr.fit(X, y)
177 | ridge = Ridge(alpha=10)  # alpha is lambda (λ) in scikit-learn
178 | ridge.fit(X, y)
179 | lasso = Lasso(alpha=0.1)
180 | lasso.fit(X, y)
181 | 
182 | # 3. Create a bar chart of the coefficients
183 | models = {'Linear Regression': lr, 'Ridge (L2)': ridge, 'Lasso (L1)': lasso}
184 | df_coeffs = pd.DataFrame({name: model.coef_ for name, model in models.items()})
185 | 
186 | df_coeffs.plot(kind='bar', figsize=(15, 7))
187 | plt.title('Comparing Coefficients of Different Models', fontsize=16)
188 | plt.ylabel('Coefficient Value', fontsize=12)
189 | plt.xlabel('Feature Index', fontsize=12)
190 | plt.axhline(0, color='black', linewidth=0.5)
191 | plt.xticks(rotation=0)
192 | plt.grid(axis='y', linestyle='--', alpha=0.7)
193 | plt.show()
194 | ```
195 | **Analysis of the Chart:**
196 | * **🔵 Linear Regression:** Coefficients are large and noisy. It's trying to use useless features.
197 | * **🟠 Ridge (L2):** All coefficients are shrunk towards zero, making the model more stable.
198 | * **🟢 Lasso (L1):** The magic! It has set the coefficients for most of the useless features to **exactly zero**, correctly identifying the two important features.
199 | 
200 | ---
201 | ## Part 3: The Kernel Method - The Dimension Trick
202 | 
203 | What if your data looks like this? A simple line can't separate the classes. This is a **non-linear problem**.
204 | 
205 | ```mermaid
206 | graph LR
207 |     Problem["Data in 2D
(( ))   (( ))
(( ))   (( ))"] -- "Can't be solved with a line" --> Diagnosis{"Non-Linear Pattern"} 208 | 209 | style Problem text-align:center 210 | ``` 211 | 212 | **The Big Idea:** Let's project the data into a higher dimension where it *is* linearly separable. 213 | 214 | ### Visualizing the Transformation 215 | 216 | The Kernel Method allows us to find a separating boundary in a high-dimensional space without ever actually calculating the new dimensions. It's a "trick" to handle non-linearity. 217 | 218 | ```mermaid 219 | graph LR 220 | A["Problem in 2D
Dots are not linearly separable"] --> B["The Transformation 'Trick'
Imagine a function that 'lifts'
the inner dots into a 3rd dimension"]; 221 | B --> C["Solution in High Dimension
Now a simple plane can
easily separate the classes!"]; 222 | ``` 223 | 224 | ### The Kernel "Trick" 225 | 226 | A **kernel** is a function that efficiently calculates the similarity between two points *as if* they were in that higher-dimensional space. The most powerful model using this is the **Support Vector Machine (SVM)**. The most popular kernel is the **RBF (Radial Basis Function) Kernel**, which is excellent for complex patterns. 227 | 228 | ### Python Ex: Solving Non-Linear with SVM 229 | 230 | Let's use `scikit-learn` to create "moon" data and watch a Kernel SVM solve it effortlessly. 231 | 232 | ```python 233 | from sklearn.datasets import make_moons 234 | from sklearn.svm import SVC 235 | import matplotlib.pyplot as plt 236 | import numpy as np 237 | # 1. Create non-linear "moon" data 238 | X, y = make_moons(n_samples=200, noise=0.15, random_state=42) 239 | 240 | # 2. Create two SVM models 241 | linear_svm = SVC(kernel='linear', C=1.0) 242 | rbf_svm = SVC(kernel='rbf', C=1.0, gamma='auto') # RBF Kernel is the magic! 243 | 244 | # 3. Fit the models and plot results 245 | # (Code for plotting is the same as the previous version) 246 | 247 | # Helper function to visualize the decision boundary 248 | def plot_decision_boundary(model, X, y, ax, title): 249 | h = .02 250 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 251 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 252 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 253 | Z = model.predict(np.c_[xx.ravel(), yy.ravel()]) 254 | Z = Z.reshape(xx.shape) 255 | 256 | ax.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu, alpha=0.3) 257 | ax.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.RdYlBu, edgecolors='k') 258 | ax.set_title(title, fontsize=14) 259 | ax.set_xlabel('Feature 1') 260 | ax.set_ylabel('Feature 2') 261 | 262 | fig, axes = plt.subplots(1, 2, figsize=(16, 7)) 263 | linear_svm.fit(X, y) 264 | plot_decision_boundary(linear_svm, X, y, axes[0], 'Linear Kernel SVM (Fails)') 265 | rbf_svm.fit(X, y) 266 | plot_decision_boundary(rbf_svm, X, y, axes[1], 'RBF Kernel SVM (Succeeds!)') 267 | 268 | plt.suptitle('The Power of the Kernel Trick', fontsize=20) 269 | plt.show() 270 | ``` 271 | **Analysis of the Plot:** 272 | * **Left (Linear Kernel):** Fails because it can only draw a straight line. 273 | * **Right (RBF Kernel):** Succeeds by creating a complex, non-linear boundary, effectively "seeing" the pattern in a higher dimension. 274 | 275 | 276 | --- 277 | ## Conclusion & Your Expanded ML Toolkit 🧰 278 | 279 | Congratulations! You now have a mental flowchart for diagnosing and fixing common machine learning problems. 
280 | ```mermaid 281 | graph LR 282 | Start("Start with your ML Problem") --> Diag{"Diagnose the Model's Behavior"}; 283 | 284 | Diag -->|"Model is too simple\n(Bad on train & test data)"| Bias("High Bias / Underfitting"); 285 | Bias --> Sol1["Solution: Use a more\ncomplex model, add features"]; 286 | 287 | Diag -->|"Model 'memorized' the data\n(Good on train, bad on test)"| Variance("High Variance / Overfitting"); 288 | Variance --> Sol2["Solution: Use Regularization\n(Lasso/Ridge) or add more data"]; 289 | 290 | Diag -->|"Pattern is not a straight line\n(Linear model fails)"| NonLinear("Non-Linear Problem"); 291 | NonLinear --> Sol3["Solution: Use the Kernel Trick\n(e.g., SVM with RBF Kernel)"]; 292 | 293 | Sol1 --> End((✅ A Better Model)); 294 | Sol2 --> End((✅ A Better Model)); 295 | Sol3 --> End((✅ A Better Model)); 296 | 297 | style End fill:#9f9,stroke:#333,stroke-width:2px 298 | ``` 299 | 300 | You are now equipped with the knowledge to build models that are not just predictive, but also robust, generalizable, and intelligent. Happy modeling -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.4. Principal Component Analysis, Dimensionality Reduction/Pizza.csv: -------------------------------------------------------------------------------- 1 | brand,id,mois,prot,fat,ash,sodium,carb,cal 2 | A,14069,27.82,21.43,44.87,5.11,1.77,0.77,4.93 3 | A,14053,28.49,21.26,43.89,5.34,1.79,1.02,4.84 4 | A,14025,28.35,19.99,45.78,5.08,1.63,0.8,4.95 5 | A,14016,30.55,20.15,43.13,4.79,1.61,1.38,4.74 6 | A,14005,30.49,21.28,41.65,4.82,1.64,1.76,4.67 7 | A,14075,31.14,20.23,42.31,4.92,1.65,1.4,4.67 8 | A,14082,31.21,20.97,41.34,4.71,1.58,1.77,4.63 9 | A,14097,28.76,21.41,41.6,5.28,1.75,2.95,4.72 10 | A,14117,28.22,20.48,45.1,5.02,1.71,1.18,4.93 11 | A,14133,27.72,21.19,45.29,5.16,1.66,0.64,4.95 12 | A,14101,27.35,21.2,45.59,4.94,1.65,0.92,4.98 13 | A,14108,26.98,21.2,45.03,5.15,1.67,1.64,4.97 14 | A,14164,28.7,20,45.12,4.93,1.56,1.25,4.91 15 | A,14154,30.91,19.65,42.45,4.81,1.65,2.81,4.72 16 | A,24005,30.91,20.77,42.03,4.9,1.61,1.39,4.67 17 | A,24026,30.83,17.88,44.33,5.26,1.76,1.7,4.77 18 | A,24094,32.73,20.06,39.74,5.24,1.69,2.23,4.47 19 | A,24108,34.58,17.53,40.87,5.05,1.61,1.97,4.46 20 | A,24102,31.8,20.35,40.44,5.43,1.61,1.98,4.53 21 | A,24082,31.02,19.05,42.29,5.27,1.71,2.37,4.66 22 | A,34017,27.02,19.56,47.2,4.95,1.65,1.27,5.08 23 | A,34020,27.78,20.01,45.59,4.97,1.7,1.65,4.97 24 | A,24136,30.88,20.58,42.26,4.96,1.63,1.32,4.68 25 | A,24122,32.2,19.25,43.42,4.62,1.5,0.51,4.7 26 | A,24115,33.19,18.05,41.88,5.22,1.7,1.66,4.56 27 | A,34012,30.43,19.78,44.2,4.8,1.61,0.79,4.8 28 | A,34006,28.93,19.99,45.2,4.78,1.62,1.1,4.91 29 | A,24146,30.41,18.71,43.99,4.86,1.62,2.03,4.79 30 | A,24138,29.62,21.1,43.37,5.05,1.69,0.86,4.78 31 | B,14015,49.57,13.7,29.07,3.62,1.1,4.04,3.33 32 | B,14006,52.68,14.38,25.72,3.26,0.93,3.96,3.05 33 | B,14024,48.53,13.14,30.38,3.55,0.99,4.4,3.44 34 | B,14052,50.19,13.78,28.39,3.56,1,4.08,3.27 35 | B,14062,50.67,13.21,27.66,3.64,1,4.82,3.21 36 | B,14047,49.99,13.35,29.2,3.52,1.05,3.94,3.32 37 | B,14074,50.72,12.93,29.88,3.6,1.03,2.87,3.32 38 | B,14083,50.81,12.56,29.95,2.99,0.81,3.69,3.35 39 | B,14094,54.08,13.28,25.25,3.1,0.8,4.29,2.98 40 | B,14124,51.9,14.27,24.92,3.85,1.06,5.06,3.02 41 | B,24019,50.33,13.96,29.25,3.42,0.96,3.04,3.31 42 | B,24012,49.69,13.63,29.59,3.41,0.98,3.68,3.36 43 | B,14132,51.12,14.02,27.37,3.71,1.11,3.78,3.18 44 | B,14146,49.77,13.24,28.91,3.59,1.06,4.49,3.31 45 | 
B,14149,54.96,14.26,22.99,3.19,0.9,4.6,2.82 46 | B,14161,55.11,14.87,21.9,3.29,0.86,4.83,2.76 47 | B,14113,54.12,14.06,24.95,3.14,0.82,3.73,2.96 48 | B,24008,49.34,13.79,29.57,3.52,0.95,3.78,3.36 49 | B,24029,50.65,13.14,28.79,3.73,1.09,3.69,3.26 50 | B,24045,52.46,14.18,24.6,3.57,1.12,5.19,2.99 51 | B,24040,49.96,13.4,28.84,3.48,0.98,4.32,3.3 52 | B,24093,49.57,13.17,29.39,3.59,1.06,4.28,3.35 53 | B,24083,49.34,13.06,29.46,3.51,1.04,4.63,3.36 54 | B,24049,50.87,13.85,27.64,3.71,1.1,3.93,3.2 55 | B,24101,51.03,13.9,27.56,3.73,1.08,3.78,3.19 56 | B,34011,53.98,14.05,24.73,3.32,0.92,3.92,2.94 57 | B,24125,52.23,13.64,27.04,3.57,0.98,3.52,3.12 58 | B,24145,51.74,13.95,27.75,3.6,1.04,2.96,3.17 59 | B,24151,51.52,13.72,28.28,3.62,1.05,2.86,3.21 60 | B,34005,51.86,13.13,28.82,2.94,0.8,3.25,3.25 61 | B,24118,51.75,13.18,28.38,3.04,0.86,3.65,3.23 62 | C,14058,48.4,26.05,21.4,3.44,0.5,0.71,3 63 | C,14022,48.69,28.48,17.37,3.53,0.43,1.93,2.78 64 | C,14036,48.88,25.23,20.89,3.22,0.47,1.78,2.96 65 | C,14093,48.97,25.63,19.28,3.48,0.55,2.64,2.87 66 | C,14080,48.84,23.98,23.37,3,0.52,0.81,3.09 67 | C,14029,49.73,25.65,19.98,2.51,0.52,2.13,2.91 68 | C,14106,50.18,28.3,15.79,3.47,0.45,2.26,2.64 69 | C,14014,48.15,27.98,18.69,3.58,0.48,1.7,2.87 70 | C,14073,49.72,27.31,16.89,3.08,0.25,3,2.73 71 | C,14151,51.59,26.24,16.41,3.61,0.6,2.15,2.61 72 | C,14162,52.26,26.31,14.77,3.51,0.53,3.15,2.51 73 | C,14139,49.35,25.23,20.03,3.02,0.44,2.37,2.91 74 | C,14115,47.91,26.03,21.54,3.71,0.6,0.81,3.01 75 | C,24006,47.83,25.82,20.79,3.33,0.54,2.23,2.99 76 | C,24020,47.9,25.55,21.1,3.04,0.43,2.41,3.02 77 | C,24031,49.1,24.53,21.08,2.84,0.34,2.45,2.98 78 | C,24038,50.04,24.13,19.75,3.21,0.52,2.87,2.86 79 | C,24043,52.19,26,16.64,4.17,0.61,1,2.58 80 | C,24152,47.11,26.17,21.29,3.36,0.48,2.07,3.05 81 | C,24144,48.48,26.76,19.99,3.64,0.46,1.13,2.91 82 | C,24135,52.22,26.25,16.45,3.92,0.38,1.16,2.58 83 | C,24124,49.57,26.91,18,2.21,0.41,3.31,2.83 84 | C,24113,51.71,24.98,17.2,3.01,0.34,3.1,2.67 85 | C,24058,49.27,27.42,17.42,3.05,0.33,2.84,2.78 86 | C,24091,47.25,23.95,24.24,3.47,0.57,1.09,3.18 87 | C,34010,50.98,26.34,16.47,3.2,0.43,3.01,2.66 88 | C,34003,49.57,25.46,20.79,3.04,0.37,1.14,2.94 89 | D,14092,46.64,21.38,24.96,4.6,0.77,2.42,3.2 90 | D,14081,45.93,21.6,25.87,4.51,0.73,2.09,3.28 91 | D,14072,47.6,22.07,21.13,4.07,0.72,5.13,2.99 92 | D,14116,47.61,22.44,19.61,4.06,0.6,6.28,2.91 93 | D,14128,46.91,21.79,21.17,4.06,0.76,6.07,3.02 94 | D,14107,46.88,21.71,23.6,4.59,0.75,3.22,3.12 95 | D,14050,47.49,21.75,20.83,4.01,0.67,5.92,2.98 96 | D,14013,48.03,21.96,20.88,4.02,0.7,5.11,2.96 97 | D,14004,49.16,27.99,17.49,3.29,0.39,2.07,2.78 98 | D,14003,47.17,22.29,21.3,4.08,0.74,5.16,3.02 99 | D,14037,47.29,21.48,21.69,4.03,0.67,5.51,3.03 100 | D,14023,47.53,21.11,21.54,4.02,0.7,5.8,3.02 101 | D,14059,47.86,22.25,19.53,4.04,0.68,6.32,2.9 102 | D,14163,48.09,22.65,21.59,5.22,0.93,2.45,2.95 103 | D,14150,47.73,22.38,21.39,5.21,0.99,3.29,2.95 104 | D,14140,48.44,22.73,21.05,5.22,0.98,2.56,2.91 105 | D,24007,47.43,22.13,21.01,4.09,0.72,5.34,2.99 106 | D,24021,47.68,21.84,20.45,4.06,0.71,5.97,2.95 107 | D,24011,48.05,22.05,20.57,4.04,0.7,5.29,2.94 108 | D,24030,48.01,21.31,21.05,4.01,0.73,5.62,2.97 109 | D,34009,47.45,22.37,20.97,4.06,0.7,5.15,2.99 110 | D,24039,47.8,22.36,20.39,4.02,0.7,5.43,2.95 111 | D,24044,48.31,22.49,19.53,4.18,0.62,5.49,2.88 112 | D,34004,46.19,21.19,25.18,4.66,0.8,2.78,3.23 113 | D,24153,48.81,22.43,18.68,4.1,0.72,5.98,2.82 114 | D,24143,48.89,22.95,21.93,5.26,0.85,0.97,2.93 115 | 
D,24134,47.91,22.22,20.4,4.07,0.56,5.4,2.94 116 | D,24123,46.28,21.51,25.44,4.58,0.6,2.19,3.24 117 | D,24114,46.29,21.43,26,4.71,0.69,1.57,3.26 118 | D,24084,47.03,20.84,25.68,4.52,0.69,1.93,3.22 119 | D,24092,46.8,20.7,25.1,4.55,0.7,2.85,3.2 120 | D,24043,52.19,26,16.64,4.17,0.61,1,2.58 121 | E,14089,34.58,7.44,16.24,1.31,0.39,40.43,3.38 122 | E,14056,36.84,7.77,17.07,1.37,0.4,36.95,3.33 123 | E,14099,35.14,8.05,15.77,1.38,0.41,39.66,3.33 124 | E,14033,39.25,8.67,4.44,1.54,0.51,46.1,2.59 125 | E,14063,34.51,7.75,14.87,1.42,0.42,41.45,3.31 126 | E,14029,39.59,8.36,4.39,1.52,0.48,46.14,2.58 127 | E,14039,34.94,7.81,13.67,1.36,0.4,42.22,3.23 128 | E,14142,39.36,8.1,16.44,1.45,0.44,34.65,3.19 129 | E,14122,36.04,7.74,15.49,1.45,0.45,39.28,3.27 130 | E,14078,36.54,7.75,15.67,1.43,0.44,38.61,3.26 131 | E,14126,37.78,8.3,13.05,1.64,0.49,39.23,3.08 132 | E,14109,35.3,7.92,13.85,1.46,0.4,41.47,3.22 133 | E,14155,34.47,7.62,19.07,1.44,0.44,37.4,3.52 134 | E,14160,33.24,7.54,19.56,1.32,0.43,38.34,3.6 135 | E,24110,37.34,7.33,19.61,1.6,0.45,34.12,3.42 136 | E,24088,37.59,7.93,13.58,1.43,0.45,39.47,3.12 137 | E,24053,36.5,7.52,12.46,1.51,0.47,42.01,3.1 138 | E,24110,37.34,7.33,19.61,1.6,0.45,34.12,3.42 139 | E,24035,34.31,7.98,14.54,1.46,0.49,41.71,3.3 140 | E,24099,36.69,7.8,14.77,1.46,0.46,39.28,3.21 141 | E,24140,34.23,7.75,17.94,1.61,0.44,38.47,3.46 142 | E,34031,35.54,7.47,17.67,1.44,0.47,37.88,3.4 143 | E,34030,35.21,6.98,20.02,1.35,0.46,36.44,3.54 144 | E,34032,33.65,7.11,19.5,1.48,0.45,38.26,3.57 145 | E,24142,34.77,7.26,18.8,1.58,0.43,37.59,3.49 146 | E,34033,37.32,7.4,8.18,1.63,0.53,45.47,2.85 147 | E,14126,37.78,8.3,13.05,1.64,0.49,39.23,3.08 148 | E,24104,34.48,7.54,13.93,1.45,0.44,42.6,3.26 149 | F,34037,28.03,7.65,18.39,1.53,0.49,44.4,3.74 150 | F,14054,30.09,7.99,15.16,1.46,0.48,45.3,3.5 151 | F,14118,29.79,8.17,14.35,1.49,0.46,46.2,3.46 152 | F,14110,30.07,8.02,20.39,1.45,0.45,40.07,3.76 153 | F,14096,28.46,7.7,18.88,1.4,0.43,43.56,3.75 154 | F,14079,30.29,8.09,14.82,1.52,0.51,45.28,3.47 155 | F,14065,28.66,7.67,16.12,1.41,0.43,46.14,3.6 156 | F,14084,30.96,8.31,13.42,1.49,0.43,45.82,3.37 157 | F,14018,29.78,8.2,14.51,1.5,0.5,46.01,3.47 158 | F,14038,30.28,7.76,16.04,1.4,0.42,44.52,3.53 159 | F,14152,29.92,8.11,19.23,1.51,0.48,41.23,3.7 160 | F,14143,29.54,7.79,15.08,1.41,0.45,46.18,3.52 161 | F,14130,28.93,8.18,19.35,1.39,0.58,42.15,3.75 162 | F,14165,29.89,7.95,15.08,1.45,0.47,45.63,3.5 163 | F,24035,27.65,7.78,17.3,1.29,0.4,46.25,3.72 164 | F,24049,28.33,7.82,17.96,1.41,0.45,44.48,3.71 165 | F,24042,29.1,8.07,20.05,1.45,0.45,41.33,3.78 166 | F,24046,29.59,8.05,14.07,1.44,0.45,46.22,3.49 167 | F,24055,27.93,7.88,17.49,1.44,0.47,45.26,3.96 168 | F,24086,30.53,8.02,14.17,1.49,0.47,45.79,3.43 169 | F,24097,30.68,8.11,12.92,1.56,0.47,46.73,3.36 170 | F,24103,30.15,8.06,12.23,1.5,0.47,48.06,3.35 171 | F,24109,29.3,8.02,16.34,1.43,0.45,44.91,3.59 172 | F,24150,29.69,7.63,15.71,1.63,0.46,45.34,3.53 173 | F,24157,29,7.51,17.78,1.58,0.43,44.13,3.67 174 | F,34035,28.98,7.7,16.67,1.48,0.47,45.17,3.62 175 | F,24106,28.84,7.88,17.21,1.42,0.46,44.65,3.65 176 | F,34036,28.34,7.6,18.51,1.45,0.46,44.1,3.73 177 | F,34034,28.36,7.62,19.29,1.45,0.47,43.28,3.77 178 | F,24137,30.97,7.6,14.22,1.78,0.45,45.43,3.4 179 | G,14145,28.15,8.23,15.45,1.41,0.45,46.76,3.59 180 | G,24015,28.35,8.19,14.9,1.4,0.43,47.16,3.56 181 | G,24004,30.85,8.03,13.67,1.41,0.42,46.04,3.39 182 | G,14095,28.21,8.3,15,1.41,0.43,47.08,3.57 183 | G,14153,28.83,8.26,20.1,1.37,0.42,41.44,3.8 184 | 
G,14017,28.29,8.05,16.72,1.31,0.43,45.63,3.65 185 | G,14010,28.68,8.3,16.07,1.41,0.45,45.54,3.6 186 | G,14055,27.71,8.28,15.62,1.53,0.5,46.86,3.61 187 | G,14040,28.03,8.27,16.4,1.37,0.41,45.93,3.64 188 | G,14026,28.13,8.34,16.19,1.42,0.45,45.92,3.63 189 | G,14119,28.09,8.42,14.06,1.47,0.45,47.96,3.52 190 | G,14131,28.19,8.57,14.16,1.76,0.42,47.32,3.51 191 | G,14112,28.63,8.42,15.24,1.43,0.43,46.28,3.56 192 | G,14077,33.09,7.87,12.07,1.37,0.44,45.6,3.23 193 | G,14066,26.33,8.03,19.98,1.43,0.45,44.23,3.89 194 | G,14085,27.28,8.55,15.18,1.51,0.46,47.48,3.61 195 | G,14166,27.56,8.25,14.65,1.45,0.46,48.09,3.57 196 | G,24022,27.72,8.06,15.34,1.35,0.42,47.53,3.6 197 | G,24041,30.63,8.21,17.33,1.39,0.42,42.44,3.59 198 | G,24047,29.06,8.46,14.12,1.47,0.47,46.89,3.48 199 | G,24033,28.55,8.25,17.3,1.48,0.47,44.42,3.66 200 | G,24119,27.16,8.27,14.68,1.79,0.46,48.1,3.58 201 | G,24121,28.33,8.17,13.64,1.45,0.47,48.41,3.49 202 | G,24156,26.19,7.99,17.53,1.42,0.44,46.87,3.77 203 | G,34008,26.45,7.89,17.97,1.3,0.39,46.39,3.79 204 | G,34018,27.72,8.24,15.16,1.46,0.45,47.42,3.59 205 | G,24059,25,8.49,16.87,1.45,0.47,48.19,3.79 206 | G,34013,29.14,8.46,12.25,1.51,0.46,48.64,3.39 207 | G,24149,28.64,8.01,16.02,1.43,0.45,45.9,3.6 208 | H,14127,36.12,8.33,11.9,1.42,0.39,42.23,3.09 209 | H,24001,35.1,8.04,19.07,1.36,0.41,36.43,3.5 210 | H,14144,35.48,7.66,15.39,1.37,0.46,40.1,3.3 211 | H,14123,35.66,9.04,6.24,1.5,0.42,47.56,2.83 212 | H,14111,35.61,8.07,16.15,1.41,0.4,38.81,3.33 213 | H,14088,35.68,8.01,16.36,1.3,0.39,38.65,3.34 214 | H,14028,40.72,8.34,4.96,1.42,0.43,44.56,2.56 215 | H,14009,35.55,7.32,16.4,1.76,0.36,38.97,3.33 216 | H,14007,33.05,7.34,15.78,1.34,0.42,42.49,3.41 217 | H,14057,33.75,8.07,14.93,1.31,0.4,41.94,3.34 218 | H,14032,38.84,8.55,4.38,1.41,0.43,46.82,2.61 219 | H,14100,33.44,7.45,18.49,1.39,0.4,39.23,3.53 220 | H,14041,36.43,8.67,15.05,1.35,0.38,38.5,3.24 221 | H,14064,35.84,7.9,14.09,1.25,0.42,40.92,3.22 222 | H,14076,36.63,8.38,12.59,1.36,0.41,41.04,3.11 223 | H,14159,35.5,8.11,13.45,1.34,0.42,41.59,3.2 224 | H,14156,36.87,7.82,14.58,1.32,0.4,39.41,3.2 225 | H,24009,35.5,7.51,15.63,1.42,0.42,39.94,3.3 226 | H,24016,33.9,7.76,15.84,1.3,0.42,41.2,3.38 227 | H,24024,35.8,7.82,14.08,1.43,0.42,40.87,3.21 228 | H,24089,35.74,7.86,14.27,1.4,0.44,40.73,3.23 229 | H,24054,35.74,7.47,19.14,1.34,0.4,36.31,3.47 230 | H,24037,35.99,8.23,16.6,1.4,0.45,37.78,3.33 231 | H,24034,35.5,7.9,13.83,1.41,0.46,41.36,3.22 232 | H,24120,35.91,7.41,16.41,1.75,0.43,38.52,3.31 233 | H,34028,36.01,7.7,15.6,1.37,0.43,39.32,3.28 234 | H,34007,36.73,7.42,18.16,1.17,0.39,36.52,3.39 235 | H,24139,35.86,7.47,14.72,1.67,0.4,40.28,3.23 236 | H,34029,36.32,8.06,12.54,1.35,0.43,41.73,3.12 237 | H,24100,34.61,7.17,17.88,1.29,0.4,39.05,3.46 238 | H,24132,36.67,7.81,9.34,1.64,0.44,44.54,2.93 239 | H,24112,35.94,7.97,13.5,1.45,0.42,41.14,3.18 240 | H,24089,35.74,7.86,14.27,1.4,0.44,40.73,3.23 241 | I,14134,54.64,10.36,12.89,2.21,0.53,19.9,2.37 242 | I,14046,54.52,9.85,13.55,2.04,0.47,20.05,2.42 243 | I,14019,53.84,10.22,13.05,2.07,0.48,20.82,2.42 244 | I,24107,54.32,10.66,14.04,2.03,0.46,18.95,2.45 245 | I,24111,52.9,10.19,14.35,2.02,0.49,20.54,2.52 246 | I,24128,55,11.11,11.3,2.05,0.47,20.54,2.28 247 | I,24052,56.24,9.06,11.49,1.93,0.48,21.28,2.25 248 | I,14068,56.25,9.96,13.22,2.05,0.46,18.52,2.33 249 | I,14157,54.43,9.61,12.18,2.16,0.51,21.62,2.35 250 | I,14158,53.69,11,15.23,2.25,0.55,17.83,2.52 251 | I,14102,54.69,10.32,13.77,2.15,0.48,19.07,2.41 252 | I,14103,53.16,11.17,13.83,2.18,0.47,19.66,2.48 253 | 
I,14031,55.15,10.92,12.51,2.2,0.51,19.22,2.33 254 | I,14067,54.87,10.78,13.65,2.08,0.5,18.62,2.4 255 | I,14030,55.43,10.46,12.37,2.17,0.51,19.57,2.31 256 | I,24023,55.6,9.69,13.89,1.99,0.48,18.83,2.39 257 | I,14114,54.51,9.93,12.66,2.07,0.53,20.83,2.37 258 | I,14098,54.74,10.18,12.6,2.06,0.48,20.42,2.36 259 | I,14125,54.17,10.67,12.18,2.08,0.49,20.9,2.36 260 | I,24048,52.75,10.78,12.98,2.12,0.49,21.37,2.45 261 | I,34023,54.36,11.43,13.09,2.24,0.48,18.88,2.39 262 | I,34022,54.28,10.75,13.87,2.13,0.46,18.97,2.4 263 | I,34021,54.54,10.4,13.22,2.1,0.47,19.74,2.4 264 | I,24141,54.06,10.68,13.53,2.04,0.46,19.69,2.43 265 | I,34019,57.22,9.66,10.95,2.04,0.47,20.13,2.18 266 | I,34027,54.8,10.57,13.42,2.08,0.5,19.13,2.4 267 | I,34026,54.17,10.13,13.25,2.07,0.46,20.38,2.41 268 | I,34025,53.57,10.73,12.78,2.16,0.48,20.76,2.41 269 | I,34024,55.29,9.84,12.91,2.08,0.51,19.88,2.35 270 | J,34042,46.16,10.34,16.14,2.47,0.67,24.89,2.86 271 | J,34045,43.8,10.97,16.49,2.46,0.65,26.28,2.97 272 | J,14044,47.6,10.43,15.18,2.32,0.56,24.47,2.76 273 | J,14045,46.84,9.91,15.5,2.27,0.57,25.48,2.81 274 | J,14042,46.1,9.87,15.97,2.19,0.53,25.87,2.87 275 | J,14043,47.84,10.16,14.56,2.27,0.54,25.17,2.72 276 | J,24065,45.86,10.5,17.07,2.33,0.61,24.24,2.93 277 | J,24064,46.55,10.75,16.72,2.24,0.61,23.74,2.88 278 | J,24063,46.13,10.71,17.24,2.36,0.61,23.56,2.92 279 | J,24062,48.58,9.76,16.01,2.3,0.6,23.35,2.77 280 | J,24074,44.76,12.91,15.56,2.34,0.61,24.43,2.89 281 | J,24073,46.13,10.84,13.99,2.38,0.64,26.66,2.76 282 | J,24072,47.43,10.45,16.5,2.35,0.61,23.27,2.83 283 | J,24071,46.22,11.26,15.93,2.47,0.63,24.12,2.85 284 | J,24068,45.2,10.67,16.38,2.44,0.6,25.31,2.91 285 | J,24078,47.05,10.46,15.16,2.34,0.64,24.99,2.78 286 | J,24056,43.45,10.81,19.49,2.51,0.68,23.74,3.14 287 | J,34038,47.74,10.23,16.31,2.5,0.67,23.22,2.81 288 | J,34041,45.17,10.42,17.2,2.34,0.61,24.87,2.96 289 | J,34040,44.77,10.52,16.12,2.44,0.66,26.15,2.92 290 | J,24067,47.91,10.75,16,2.4,0.64,22.94,2.79 291 | J,24066,45.69,10.23,16.5,2.3,0.59,25.28,2.91 292 | J,24076,46.12,10.3,16.38,2.35,0.61,24.85,2.88 293 | J,24077,47.35,10.31,15.45,2.34,0.62,24.55,2.78 294 | J,24070,45.21,9.39,16.23,2.14,0.55,27.03,2.92 295 | J,24075,46.34,10,17.73,2.32,0.59,23.61,2.94 296 | J,34043,44.07,10.96,18.39,2.56,0.66,24.02,3.05 297 | J,34044,44.91,11.07,17,2.49,0.66,25.36,2.91 298 | J,24069,43.15,11.79,18.46,2.43,0.67,24.17,3.1 299 | J,34039,44.55,11.01,16.03,2.43,0.64,25.98,2.92 300 | J,14044,47.6,10.43,15.18,2.32,0.56,24.47,2.76 301 | J,14045,46.84,9.91,15.5,2.27,0.57,25.48,2.81 302 | -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.1. Support Vector Machines/SVM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1e0b81b2", 6 | "metadata": {}, 7 | "source": [ 8 | "# Support Vector Machines (SVM)\n", 9 | "\n", 10 | "_Welcome back! 
Today we’ll master Support Vector Machines — powerful, margin‑based classifiers that shine in high‑dimensional spaces._\n", 11 | "\n", 12 | "- **Linear SVM** — maximum‑margin hyperplanes\n", 13 | "- **Soft margins & hinge loss** — robust to overlap and noise\n", 14 | "- **Dual form & kernels** — non‑linear decision boundaries via the kernel trick\n", 15 | "- **Practical tuning** — scaling, `C`, `gamma`, and when to use `LinearSVC` vs `SVC`.\n", 16 | "\n", 17 | "---\n", 18 | "\n", 19 | "## What you’ll learn\n", 20 | "\n", 21 | "- The margin idea and why “wider is better.”\n", 22 | "- Hard-margin vs. soft-margin SVMs.\n", 23 | "- Primal, hinge-loss view; dual and kernels.\n", 24 | "- How to tune `C`, `gamma`, and choose a kernel.\n", 25 | "- Minimal scikit-learn code for linear and kernel SVMs.\n", 26 | "\n", 27 | "---\n", 28 | "\n", 29 | "## 1) The SVM idea — separate with the **widest margin**\n", 30 | "\n", 31 | "Think of a line (or plane) that splits the classes. Among all lines that correctly split them, prefer the one that leaves the **largest gap** to the nearest points of either class. That gap is the **margin**. A larger margin usually means a simpler, more robust boundary that generalizes better.\n", 32 | "\n", 33 | "Given labeled points $(x_i, y_i)$ with $y_i \\in \\{-1,+1\\}$, SVM finds a hyperplane\n", 34 | "\n", 35 | "$$\n", 36 | "f(x) = w^\\top x + b\n", 37 | "$$\n", 38 | "\n", 39 | "that separates the classes while **maximizing the margin**\n", 40 | "\n", 41 | "- **Geometric margin** of a point: $\\displaystyle \\gamma_i = \\frac{y_i (w^\\top x_i + b)}{\\lVert w\\rVert}$\n", 42 | "- **Margin width** between class boundaries: $\\displaystyle \\frac{2}{\\lVert w\\rVert}$ \n", 43 | " → Max margin $\\Longleftrightarrow$ minimize $\\lVert w\\rVert$ (subject to correct classification).\n", 44 | "\n", 45 | "There are two types of SVMs:\n", 46 | "\n", 47 | "- **Support Vector Classification** (SVC): in scikit-learn, $SVC$ is the kernel SVM classifier, while LinearSVC is the fast linear-only solver. It is for classification tasks\n", 48 | "- **Support Vector Regression** (SVR): for regression tasks\n", 49 | "\n", 50 | "---\n", 51 | "\n", 52 | "## 2) Hard‑margin SVM (separable case)\n", 53 | "\n", 54 | "Here, every point must be on the correct side with **room to spare** (at least distance 1 in the scaled units). Minimizing $\\tfrac12\\lVert w\\rVert^2$ is equivalent to **maximizing the margin**. 
This version only works when data are perfectly separable; a single mislabeled or noisy point can break feasibility.\n", 55 | "\n", 56 | "**Optimization:**\n", 57 | "\n", 58 | "$$\n", 59 | "\\begin{aligned}\n", 60 | "\\min_{w,b}\\quad & \\tfrac12 \\lVert w\\rVert^2 \\\\\n", 61 | "\\text{s.t.}\\quad & y_i (w^\\top x_i + b) \\ge 1,\\quad i=1,\\dots,n\n", 62 | "\\end{aligned}\n", 63 | "$$\n", 64 | "\n", 65 | "- Constraints enforce that every point sits **outside** the margin band.\n", 66 | "- Works only when data are perfectly separable.\n", 67 | "\n", 68 | "---\n", 69 | "\n", 70 | "## 3) Soft‑margin SVM (realistic case)\n", 71 | "\n", 72 | "Real data must overlap.\n", 73 | "\n", 74 | "We introduce **slack** variables $\\xi_i \\ge 0$ that measures how much a point breaks the margin rule:\n", 75 | "\n", 76 | "- $\\xi_i=0$ means safely outside\n", 77 | "- $0<\\xi_i<1$ means inside the margin but on the correct side\n", 78 | "- $\\xi_i>1$ means misclassified.\n", 79 | "\n", 80 | "$$\n", 81 | "\\begin{aligned}\n", 82 | "\\min_{w,b,\\xi}\\quad & \\tfrac12 \\lVert w\\rVert^2 + C \\sum_{i=1}^n \\xi_i \\\\\n", 83 | "\\text{s.t.}\\quad & y_i (w^\\top x_i + b) \\ge 1 - \\xi_i,\\quad \\xi_i \\ge 0\n", 84 | "\\end{aligned}\n", 85 | "$$\n", 86 | "\n", 87 | "- The constant **C>0** balances two desires: keep the margin wide (small $\\lVert w\\rVert$), yet don’t allow too many/too large violations (small $\\sum\\xi_i$):\n", 88 | " - large $C$ → penalize violations heavily (lower bias, higher variance, risking overfit)\n", 89 | " - small $C$ → wider margin, more violations allowed (higher bias, lower variance, smoother, possibly underfit)\n", 90 | "\n", 91 | "**Hinge‑loss view (equivalent):**\n", 92 | "\n", 93 | "$$\n", 94 | "\\min_{w,b}\\quad \\frac{\\lambda}{2}\\lVert w\\rVert^2 + \\frac{1}{n}\\sum_{i=1}^n \\max\\!\\big(0, 1 - y_i(w^\\top x_i + b)\\big),\n", 95 | "$$\n", 96 | "\n", 97 | "with $\\lambda$ inversely related to $C$ (roughly, $\\lambda \\approx 1/(nC)$ in many libraries).\n", 98 | "\n", 99 | "---\n", 100 | "\n", 101 | "## 4) Support vectors & the decision function\n", 102 | "\n", 103 | "Only points that lie **on or inside** the margin influence the final classifier; these are the **support vectors**. Points far from the boundary have zero hinge loss and do not change $w,b$. This is why SVM solutions are often sparse: the model depends on a subset of the training data.\n", 104 | "\n", 105 | "Points with zero loss **away from the margin** don’t affect the solution. The model depends only on a subset — the **support vectors** — that lie **on or inside** the margin band.\n", 106 | "\n", 107 | "Prediction:\n", 108 | "\n", 109 | "$$\n", 110 | "\\hat y = \\mathrm{sign}(w^\\top x + b).\n", 111 | "$$\n", 112 | "\n", 113 | "---\n", 114 | "\n", 115 | "## 5) Dual problem & the kernel trick (non‑linear SVM)\n", 116 | "\n", 117 | "The dual re-expresses the problem in terms of **similarities between pairs of points** via a kernel $K(x_i,x_j)$. Replacing dot-products with kernels lets the classifier act as if data were mapped into a higher-dimensional space **without computing that mapping explicitly** (the “kernel trick”). 
The prediction becomes a weighted sum over support vectors: only those with $\\alpha_i>0$ matter.\n", 118 | "\n", 119 | "The Lagrange dual of the soft‑margin problem (for kernel $K$) is:\n", 120 | "\n", 121 | "$$\n", 122 | "\\begin{aligned}\n", 123 | "\\max_{\\alpha}\\quad & \\sum_{i=1}^n \\alpha_i - \\frac12 \\sum_{i=1}^n \\sum_{j=1}^n \\alpha_i \\alpha_j y_i y_j\\, K(x_i,x_j) \\\\\n", 124 | "\\text{s.t.}\\quad & 0 \\le \\alpha_i \\le C,\\quad \\sum_{i=1}^n \\alpha_i y_i = 0\n", 125 | "\\end{aligned}\n", 126 | "$$\n", 127 | "\n", 128 | "The decision function becomes:\n", 129 | "\n", 130 | "$$\n", 131 | "f(x) = \\sum_{i=1}^n \\alpha_i y_i\\, K(x_i, x) + b.\n", 132 | "$$\n", 133 | "\n", 134 | "**Common kernels $K(x,z)$**\n", 135 | "\n", 136 | "- **Linear:** $x^\\top z$ (useful for very high‑dimensional sparse features; scalable with `LinearSVC`).\n", 137 | "- **RBF (Gaussian):** $\\exp(-\\gamma \\lVert x - z\\rVert^2)$ with $\\gamma > 0$\n", 138 | "- **Polynomial:** $(\\gamma\\, x^\\top z + r)^d$ (degree $d$)\n", 139 | "- **Sigmoid:** $\\tanh(\\gamma\\, x^\\top z + r)$ (less common)\n", 140 | "\n", 141 | "**Hyperparameters**\n", 142 | "\n", 143 | "- `C` — regularization (as above)\n", 144 | "- `gamma` — RBF/poly scale; large `gamma` → tighter, more wiggly boundaries; small `gamma` → smoother\n", 145 | "\n", 146 | "---\n", 147 | "\n", 148 | "## 6) Best practices\n", 149 | "\n", 150 | "- **Scale features** (standardize) — SVMs are distance‑based.\n", 151 | "- Start with **linear SVM** for many features/large $n$ (`LinearSVC` or `SGDClassifier(loss=\"hinge\")`).\n", 152 | "- Use **RBF SVC** for moderate $n$ when nonlinearity helps.\n", 153 | "- For **imbalanced** data, set `class_weight=\"balanced\"` or provide weights.\n", 154 | "- Enable probability estimates (Platt scaling) via `probability=True` in `SVC` (costs extra fitting).\n", 155 | "\n", 156 | "---\n", 157 | "\n", 158 | "## 7) Minimal code — linear and RBF SVM (scikit‑learn)\n", 159 | "\n", 160 | "The flow below is: make data → train/test split → **pipeline** with `StandardScaler` → fit a **LinearSVC** (fast for large/high‑dimensional sets). Then try an **RBF SVC** and use a tiny **grid search** to pick `C` and `gamma`. `LinearSVC(dual=\"auto\")` chooses an efficient solver depending on the feature/sample ratio." 
161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "264c5c42", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "import numpy as np\n", 171 | "from sklearn.datasets import make_classification\n", 172 | "from sklearn.model_selection import train_test_split, GridSearchCV\n", 173 | "from sklearn.preprocessing import StandardScaler\n", 174 | "from sklearn.pipeline import make_pipeline\n", 175 | "from sklearn.svm import LinearSVC, SVC\n", 176 | "from sklearn.metrics import accuracy_score, classification_report\n", 177 | "\n", 178 | "# Data\n", 179 | "X, y = make_classification(n_samples=2000, n_features=50, n_informative=10,\n", 180 | " class_sep=1.5, random_state=0)\n", 181 | "Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.25, random_state=0, stratify=y)\n", 182 | "\n", 183 | "# 1) Linear SVM (good for large & high-dim data)\n", 184 | "lin_clf = make_pipeline(StandardScaler(), LinearSVC(class_weight=\"balanced\"))\n", 185 | "lin_clf.fit(Xtr, ytr)\n", 186 | "print(\"LinearSVC acc:\", accuracy_score(yte, lin_clf.predict(Xte)))\n", 187 | "\n", 188 | "# 2) RBF-kernel SVM with small grid search (for moderate-sized data)\n", 189 | "rbf_pipe = make_pipeline(StandardScaler(), SVC(kernel=\"rbf\"))\n", 190 | "param_grid = {\"svc__C\": [0.1, 1, 10], \"svc__gamma\": [\"scale\", 0.01, 0.1]}\n", 191 | "grid = GridSearchCV(rbf_pipe, param_grid, cv=3, n_jobs=-1)\n", 192 | "grid.fit(Xtr, ytr)\n", 193 | "print(\"Best RBF params:\", grid.best_params_)\n", 194 | "print(\"RBF acc:\", accuracy_score(yte, grid.predict(Xte)))\n", 195 | "print(classification_report(yte, grid.predict(Xte)))" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "id": "fa0aad12", 201 | "metadata": {}, 202 | "source": [ 203 | "---\n", 204 | "\n", 205 | "## 8) (Optional) Hinge‑loss SGD — from scratch (toy)\n", 206 | "\n", 207 | "This toy optimizer does **stochastic subgradient descent** on the hinge loss plus $\\ell_2$ penalty. If a sample is correctly classified with margin $\\ge 1$, we only apply weight decay. If it violates the margin, we also step in the direction that reduces the hinge loss. This mirrors what large‑scale linear SVM libraries do under the hood." 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "id": "652686aa", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "import numpy as np\n", 218 | "\n", 219 | "def sgd_linear_svm(X, y, lr=0.1, lam=1e-3, epochs=10):\n", 220 | " # y in {-1, +1}\n", 221 | " n, d = X.shape\n", 222 | " w = np.zeros(d); b = 0.0\n", 223 | " for _ in range(epochs):\n", 224 | " idx = np.random.permutation(n)\n", 225 | " for i in idx:\n", 226 | " margin = y[i]*(X[i] @ w + b)\n", 227 | " if margin < 1:\n", 228 | " # subgradient of hinge + L2\n", 229 | " w = (1 - lr*lam)*w + lr*y[i]*X[i]\n", 230 | " b = b + lr*y[i]\n", 231 | " else:\n", 232 | " w = (1 - lr*lam)*w\n", 233 | " return w, b" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "72a3aaf5", 239 | "metadata": {}, 240 | "source": [ 241 | "---\n", 242 | "\n", 243 | "## 9) How to choose `C`, `gamma`, and the kernel\n", 244 | "\n", 245 | "1. **Start simple:** linear vs RBF; pick by validation.\n", 246 | "2. **Grid search / log‑scale**: Use a small **log‑grid** `C ∈ {0.01, 0.1, 1, 10, 100}`, `gamma ∈ {\"scale\", 0.001, 0.01, 0.1, 1}`.\n", 247 | "3. **Watch for overfit:** very high training acc + lower validation acc → reduce `C` or `gamma`.\n", 248 | "4. 
**Speed:** If training is slow with kernel SVMs, reduce the grid size first and/or switch to `LinearSVC`.\n", 249 | "\n", 250 | "---\n", 251 | "\n", 252 | "## 10) Multiclass strategies\n", 253 | "\n", 254 | "SVMs are inherently binary. For $K$ classes:\n", 255 | "\n", 256 | "- **One‑vs‑One (OvO):** train $K(K-1)/2$ binary classifiers (default in `SVC`). It compares every pair of classes and tends to work well when classes are balanced\n", 257 | "- **One‑vs‑Rest (OvR):** train $K$ classifiers vs the rest (default in `LinearSVC`). It is simpler and pairs well with linear models on high‑dimensional data.\n", 258 | "\n", 259 | "---\n", 260 | "\n", 261 | "## 11) FAQs & Gotchas\n", 262 | "\n", 263 | "Probability outputs from `SVC` come from an extra calibration step (Platt scaling); they can look conservative on small datasets. For speed problems, try fewer features, subsampling, or a linear model. Always re‑check that features are standardized.\n", 264 | "\n", 265 | "- **“My SVM is slow.”** → Too many samples with kernel SVC; try `LinearSVC` or sub-sample + RBF.\n", 266 | "- **“Predicted probabilities look odd.”** → They’re calibrated via Platt scaling; try `CalibratedClassifierCV`.\n", 267 | "- **“Decision boundary is jagged.”** → Likely large `gamma` (RBF) or very large `C`; reduce them and re‑scale features.\n", 268 | "- **“Imbalanced classes.”** → Use `class_weight=\"balanced\"` and evaluate with F1/ROC‑AUC, not just accuracy.\n", 269 | "\n", 270 | "---\n", 271 | "\n", 272 | "## 12) Quick cheat sheet\n", 273 | "\n", 274 | "Linear SVMs shine with many features (e.g., text). Kernel SVMs capture curvature at the cost of speed and tuning more hyperparameters. Both require scaling; both are sensitive to `C`, and kernels add `gamma` (and `degree` for polynomial).\n", 275 | "\n", 276 | "| Aspect | Linear SVM | Kernel SVM (RBF/Poly) |\n", 277 | "| ------------------- | ------------------------------- | ------------------------------------- |\n", 278 | "| Nonlinearity | No | Yes (via kernels) |\n", 279 | "| Scale with #samples | Great (use `LinearSVC`) | Moderate/Slow for large $n$ |\n", 280 | "| Hyperparams | `C` | `C`, `gamma`, (degree for poly) |\n", 281 | "| Feature scaling | **Required** | **Required** |\n", 282 | "| Typical use | Text, high‑dim sparse, big data | Moderate data with nonlinear boundary |\n", 283 | "\n", 284 | "---\n", 285 | "\n", 286 | "## 13) Practice\n", 287 | "\n", 288 | "Try plotting the margin band (the two lines where $y(w^\\top x + b)=1$) on a 2‑D toy set to see which points become support vectors. Then vary `C` and watch how the number of support vectors and the margin width change.\n", 289 | "\n", 290 | "1. Standardize features, then compare `LinearSVC` vs `SVC(RBF)` on your dataset.\n", 291 | "2. Grid‑search `C` and `gamma`; report validation curves and best test score.\n", 292 | "3. Inspect support vectors: how many are there, and which points become SVs?\n", 293 | "4. 
Try class imbalance: set `class_weight=\"balanced\"` and compare metrics.\n", 294 | "\n", 295 | "---\n", 296 | "\n", 297 | "## Summary\n", 298 | "\n", 299 | "- SVMs maximize margin → robust decision boundaries.\n", 300 | "- Soft margins + hinge loss handle overlap and noise.\n", 301 | "- Dual form enables **kernels** for powerful nonlinear separation.\n", 302 | "- In practice: **scale**, pick kernel by validation, tune `C`/`gamma`, and use `LinearSVC` for large datasets.\n", 303 | "\n", 304 | "**Next:** Principal component analysis + Dimensionality reduction" 305 | ] 306 | } 307 | ], 308 | "metadata": { 309 | "jupytext": { 310 | "cell_metadata_filter": "-all", 311 | "main_language": "python", 312 | "notebook_metadata_filter": "-all" 313 | }, 314 | "language_info": { 315 | "name": "python" 316 | } 317 | }, 318 | "nbformat": 4, 319 | "nbformat_minor": 5 320 | } 321 | -------------------------------------------------------------------------------- /1. Mathematical Methods For AI/1.1. Linear Algebra/Linear Algebra.md: -------------------------------------------------------------------------------- 1 | # Linear Algebra 2 | ## Notation 3 | - `∀` : "for any" 4 | - `∃` : "there exists" 5 | 6 | --- 7 | 8 | ## Vector Operations 9 | ### Basic Operations 10 | - **Addition/Subtraction**: 11 | $(u_1,u_2) + (v_1,v_2) = (u_1+v_1,u_2+v_2)$ 12 | - **Scalar Multiplication**: 13 | $k \cdot (v_1,v_2) = (k\cdot v_1, k\cdot v_2)$ 14 | 15 | ### Dot Product (Inner Product) 16 | $$\vec{u} \cdot \vec{v} = \sum_{i=1}^{n} u_i v_i$$ 17 | - **Geometric Interpretation**: 18 | $\vec{u} \cdot \vec{v} = \|\vec{u}\|\|\vec{v}\| \cos{\theta}$ 19 | ($\theta$ = angle between vectors) 20 | - **Orthogonality Condition**: 21 | $\vec{u} \cdot \vec{v} = 0 \iff \vec{u} \perp \vec{v}$ 22 | 23 | ### Cross Product (3D Only) 24 | $$\vec{u} \times \vec{v} = (u_2v_3 - u_3v_2,\ u_3v_1 - u_1v_3,\ u_1v_2 - u_2v_1)$$ 25 | - **Properties**: 26 | - Result is perpendicular to input vectors (right-hand rule determines direction) 27 | - $\|\vec{u} \times \vec{v}\| = \|\vec{u}\|\|\vec{v}\| \sin{\theta}$ = area of parallelogram spanned by $\vec{u},\vec{v}$ 28 | 29 | ### Norms 30 | $$ \|\vec{x}\|_p = \left( \sum_{i=1}^{n} |x_i|^p \right)^{1/p} \quad (p \geq 1) $$ 31 | - **Special Cases**: 32 | - Euclidean ($L^2$): $\|\vec{x}\|_2 = \sqrt{\sum x_i^2}$ 33 | - Manhattan ($L^1$): $\|\vec{x}\|_1 = \sum |x_i|$ 34 | - Infinity ($L^\infty$): $\|\vec{x}\|_\infty = \max |x_i|$ 35 | - **Convention**: $|\vec{v}|$ denotes Euclidean norm 36 | 37 | ### Linear Combinations & Span 38 | - **Linear Combination**: 39 | $\vec{w} = \sum_{i=1}^{n} c_i \vec{v}_i$ 40 | (Scalars $c_i$, vectors $\vec{v}_i$) 41 | - **Span**: Set of all linear combinations of $\{\vec{v}_1,\dots,\vec{v}_n\}$ 42 | Example: $\text{span}\left(\begin{bmatrix}1\\0\\0\end{bmatrix},\begin{bmatrix}0\\1\\0\end{bmatrix}\right)$ = $xy$-plane in $\mathbb{R}^3$ 43 | 44 | ### Orthonormal Vectors 45 | Set $\{\vec{v}_1,\dots,\vec{v}_n\}$ satisfies: 46 | 1. **Orthogonal**: $\vec{v}_i \cdot \vec{v}_j = 0 \quad (i \neq j)$ 47 | 2. **Unit Length**: $ \forall i \quad \|\vec{v}_i\| = 1$ 48 | 49 | --- 50 | 51 | ## Matrix Operations 52 | 53 | ### Notations 54 | - $A \in \mathbb{R}^{n\times m}$ Represents a matrix $A$ is a matrix of dimensions $n \times n$ whose entries are real numbers. 55 | - $I$ is the identity matrix. for $I_3$, or the three dimension one, it would be $\begin{pmatrix} 1 & 0 & 0 \\ 0 & 1 & 0 \\ 0 & 0 & 1 \end{pmatrix}$. 56 | - $A_{i,j}$ means the $i$ th row and $j$ th column of matrix $A$. 
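If you want to play with this notation in code, here is a tiny NumPy sketch of the same objects; keep in mind that NumPy indexes rows and columns from 0, while the $A_{i,j}$ notation above counts from 1.

```python
import numpy as np

I3 = np.eye(3)                 # the 3x3 identity matrix I_3
A = np.array([[1, 2, 3],
              [4, 5, 6]])      # A is in R^{2x3}: 2 rows (n), 3 columns (m)

print(A.shape)   # (2, 3)
print(A[0, 2])   # 3 -> this is the entry written A_{1,3} in the notation above
print(I3)
```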
57 | 
58 | ### Basic Operations
59 | - **Addition and Subtraction**:
60 | Only matrices of the same dimensions can be added or subtracted. $(A+B)_{i,j} = A_{i,j} + B_{i,j}$. $(A-B)_{i,j} = A_{i,j} - B_{i,j}$.
61 | - **Scalar Multiplication**:
62 | Let $k$ be a scalar. $(kA)_{i,j} = k \times A_{i,j}$.
63 | 
64 | ### Transpose
65 | This is essentially flipping a matrix over its main diagonal (the upper-left to lower-right diagonal). Let $A = \begin{pmatrix} a & b \\ c & d \end{pmatrix}$; then its transpose is $A^T = \begin{pmatrix} a & c \\ b & d \end{pmatrix}$.
66 | 
67 | ### Determinant
68 | The determinant of a matrix, denoted as $\text{det}(A)$, provides important information about a matrix.
69 | - A matrix $A$ is invertible if and only if $\text{det}(A) \neq 0$. We call a matrix without an inverse singular.
70 | - The determinant represents the scaling factor of the volume change when the matrix is applied as a linear transformation. In $\mathbb{R}^2$, the determinant of a $\mathbb{R}^{2\times2}$ matrix represents the area scaling factor of the parallelogram formed by the column vectors of the matrix.
71 | 
72 | **Properties**:
73 | - Cramer's rule: The solution components of $A\vec{x}=\vec{b}$ are given by $x_i = \frac{\text{det}(A_i)}{\text{det}(A)}$, where $A_i$ is the matrix formed by replacing the $i$th column of $A$ with the vector $\vec{b}$.
74 | - Row operations:
75 |   - Swapping two rows changes the sign of the determinant
76 |   - Multiplying a row by a scalar $k$ multiplies the determinant by $k$
77 |   - Adding a multiple of one row to another row doesn't change the determinant
78 | - $\text{det}(AB) = \text{det}(A)\cdot \text{det}(B)$
79 | - $\text{det}(A^T)=\text{det}(A)$
80 | - $\text{det}(0) = 0 \quad \text{det}(I) = 1$ (at any dimension where the identity matrix is defined, its determinant is 1)
81 | - Diagonal matrix: a matrix whose only non-zero entries lie on its main diagonal. For a diagonal (or triangular) matrix, the determinant is the product of its diagonal entries.
82 | 
83 | **Calculation**
84 | - For a $1 \times 1$ matrix $A=\begin{pmatrix} a \end{pmatrix}$, $\text{det}(A) = a$.
85 | - For a $2 \times 2$ matrix $A=\begin{pmatrix} a & b \\ c & d \end{pmatrix}$, $\text{det}(A) = ad - bc$.
86 | - For an $n \times n$ matrix, use Gaussian elimination (covered below) to reduce the matrix to triangular form and take the product of the diagonal entries. Remember to account for the row operations you used: every row swap flips the sign, and scaling a row by $k$ scales the determinant by $k$, so undo these factors at the end.
87 | 
88 | ### Inverse
89 | **Only square matrices that are non-singular (non-zero determinant) have inverses.** The inverse of $A$, denoted by $A^{-1}$, is the matrix such that $A \times A^{-1} = A^{-1} \times A = I$. To find the inverse, you can use Gaussian elimination on the augmented matrix $A|I$: when the left half has been reduced to $I$, the right half is $A^{-1}$. For a $2\times2$ matrix $\begin{pmatrix} a & b \\ c & d \end{pmatrix}$ with $ad-bc \neq 0$, the inverse is $\frac{1}{ad-bc}\begin{pmatrix} d & -b \\ -c & a \end{pmatrix}$.
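As a quick numerical check of the transpose, determinant, and inverse formulas above, here is a small NumPy sketch; it verifies on a concrete $2 \times 2$ matrix that $\text{det}(A) = ad - bc$ and that $A A^{-1} = I$.

```python
import numpy as np

A = np.array([[4., 7.],
              [2., 6.]])

print(A.T)                 # transpose: [[4. 2.], [7. 6.]]
print(np.linalg.det(A))    # ad - bc = 4*6 - 7*2 = 10 (up to floating-point error)
A_inv = np.linalg.inv(A)   # (1/10) * [[6, -7], [-2, 4]]
print(A_inv)               # [[ 0.6 -0.7]
                           #  [-0.2  0.4]]
print(np.allclose(A @ A_inv, np.eye(2)))   # True: A times its inverse is the identity
```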
90 | 91 | ### Matrix Multiplication 92 | For $A \in \mathbb{R}^{m \times p}$, $B \in \mathbb{R}^{p \times n}$: 93 | $$C = AB \quad \text{where} \quad c_{ij} = \sum_{k=1}^{p} a_{ik}b_{kj}$$ 94 | - $c_{ij}$ = dot product of $i$-th row of $A$ and $j$-th column of $B$ 95 | 96 | ![image.png](attachment:image.png) 97 | 98 | --- 99 | ## Linear Transformations and Eigen Theory 100 | ### Linear Maps 101 | A linear map (linear transformation) is a function $T:V \to W$ between vector spaces $V$ and $W$ over field $\mathbb{F}$ satisfying $\forall \vec{u},\vec{v} \in V, k\in \mathbb{F}$: 102 | 1. $T(\vec{u} + \vec{v}) = T(\vec{u}) + T(\vec{v})$ 103 | 2. $T(k\vec{v}) = kT(\vec{v})$ 104 | 105 | ### Matrix Representation 106 | Matrix multiplication represents linear maps. For $A \in \mathbb{F}^{m \times n}$, define $T_A : \mathbb{F}^n \to \mathbb{F}^m$ by: 107 | $$T_A(\vec{x}) = A\vec{x}$$ 108 | This satisfies linearity: 109 | - $T_A(\vec{x} + \vec{y}) = A(\vec{x} + \vec{y}) = A\vec{x} + A\vec{y} = T_A(\vec{x}) + T_A(\vec{y})$ 110 | - $T_A(k\vec{x}) = A(k\vec{x}) = k(A\vec{x}) = kT_A(\vec{x})$ 111 | 112 | ### Example: Rotation in $\mathbb{R}^2$ 113 | The rotation matrix for counter-clockwise rotation by $\theta$: 114 | $$R_\theta = \begin{bmatrix} 115 | \cos\theta & -\sin\theta \\ 116 | \sin\theta & \cos\theta 117 | \end{bmatrix}$$ 118 | defines linear map $T_\theta(\vec{x}) = R_\theta \vec{x}$. 119 | 120 | For $\theta = 60^\circ = \frac{\pi}{3}$: 121 | $$R_{60^\circ} = \begin{bmatrix} 122 | \frac{1}{2} & -\frac{\sqrt{3}}{2} \\ 123 | \frac{\sqrt{3}}{2} & \frac{1}{2} 124 | \end{bmatrix}$$ 125 | 126 | #### Transforming Standard Basis: 127 | $$\mathbf{e}_1 = \begin{bmatrix}1\\0\end{bmatrix},\quad \mathbf{e}_2 = \begin{bmatrix}0\\1\end{bmatrix}$$ 128 | 129 | $$T_{60^\circ}(\mathbf{e}_1) = \begin{bmatrix}\frac{1}{2} \\ \frac{\sqrt{3}}{2}\end{bmatrix},\quad T_{60^\circ}(\mathbf{e}_2) = \begin{bmatrix}-\frac{\sqrt{3}}{2} \\ \frac{1}{2}\end{bmatrix}$$ 130 | 131 | Geometrically: All vectors rotated counter-clockwise by $60^\circ$ about origin, preserving lengths and angles. 132 | 133 | ### Eigenvectors and Eigenvalues 134 | An eigenvector $\vec{v} \neq \vec{0}$ of linear transformation $T$ is a direction invariant under $T$, scaled by eigenvalue $\lambda$: 135 | $$T(\vec{v}) = \lambda \vec{v}$$ 136 | 137 | - $\lambda > 0$: Direction preserved 138 | - $\lambda < 0$: Direction reversed 139 | 140 | #### Finding Eigenvalues/Eigenvectors 141 | For matrix $A$ representing $T$: 142 | 1. Solve characteristic equation: $\det(A - \lambda I) = 0$ 143 | 2. 
For each $\lambda_i$, solve $(A - \lambda_i I)\vec{v} = \vec{0}$ 144 | 145 | ##### Example 146 | For $A = \begin{pmatrix} 3 & 1 \\ 0 & 2 \end{pmatrix}$: 147 | Characteristic equation: 148 | $$\det\begin{pmatrix} 3-\lambda & 1 \\ 0 & 2-\lambda \end{pmatrix} = (3-\lambda)(2-\lambda) = 0$$ 149 | Eigenvalues: $\lambda_1 = 3, \lambda_2 = 2$ 150 | 151 | - For $\lambda_1=3$: 152 | $$\begin{pmatrix}0 & 1 \\ 0 & -1\end{pmatrix}\begin{pmatrix}v_1\\v_2\end{pmatrix} = \begin{pmatrix}0\\0\end{pmatrix} \implies v_2=0$$ 153 | Eigenvectors: $\begin{pmatrix}v_1\\0\end{pmatrix} = v_1\begin{pmatrix}1\\0\end{pmatrix} \ (v_1 \neq 0)$ 154 | 155 | - For $\lambda_2=2$: 156 | $$\begin{pmatrix}1 & 1 \\ 0 & 0\end{pmatrix}\begin{pmatrix}v_1\\v_2\end{pmatrix} = \begin{pmatrix}0\\0\end{pmatrix} \implies v_1 = -v_2$$ 157 | Eigenvectors: $\begin{pmatrix}v_1\\-v_1\end{pmatrix} = v_1\begin{pmatrix}1\\-1\end{pmatrix} \ (v_1 \neq 0)$ 158 | 159 | *Note that the determinant of a matrix is equal to the product of its eigenvalues. 160 | 161 | --- 162 | 163 | ## Matrix Decomposition 164 | ### Gaussian Elimination 165 | 166 | This is an algorithm for solving a linear equation system using linear algebra methods. The idea is to rewrite the matrix into an upper triangular matrix that represents the equation system. For example, for the linear equation: 167 | $$\begin{cases} x + 2y - 4z = 5 \\ 2x + y - 6z = 8 \\ 4x - y - 12z = 13 \end{cases}$$ 168 | We have a coefficient matrix and an augmented matrix 169 | $$ A_{\text{Coefficient}} = \begin{bmatrix} 1 & 2 & {-4} \\ 2 & 1 & {-6} \\ 4 & {-1} & {-12} \end{bmatrix}$$ 170 | $$ 171 | A_{\text{Augmented}} = \left[ 172 | \begin{array}{rrr|r} 173 | 1 & 2 & -4 & 5 \\ 174 | 2 & 1 & -6 & 8 \\ 175 | 4 & -1 & -12 & 13 176 | \end{array} 177 | \right] 178 | $$ 179 | 180 | The corresponding upper triangular matrix would be 181 | $$ 182 | A_{\text{Augmented}} = \left[ 183 | \begin{array}{rrr|r} 184 | 1 & 2 & -4 & 5 \\ 185 | 0 & -3 & 2 & -2 \\ 186 | 0 & 0 & -2 & -1 187 | \end{array} 188 | \right] 189 | $$ 190 | The equivalent linear system is 191 | $$\begin{cases} x + 2y - 4z = 5 \\ -3y + 2z = -2 \\ -2z = -1 \end{cases}$$ 192 | 193 | How is this done? This is an example to show the process. We name the $i$th row as $R_i$, the $i$th column as $C_i$, and $M_{i,j}$ the value in the $i$th row and the $j$th column. we start with $i=1$ and end at $i=n$, which is the number of rows the matrix has. Define constant $k\in \mathbb{R}$. 194 | 195 | $$ 196 | \left[ 197 | \begin{array}{rrr|r} 198 | 2 & 2 & 6 & 4 \\ 199 | 2 & 1 & 7 & 6 \\ 200 | -2 & -6 & -7 & -1 201 | \end{array} 202 | \right] 203 | $$ 204 | 205 | We first scale $R_i$ such that $M_{i,i}=1$. 206 | 207 | $$ 208 | \left[ 209 | \begin{array}{rrr|r} 210 | 1 & 1 & 3 & 2 \\ 211 | 2 & 1 & 7 & 6 \\ 212 | -2 & -6 & -7 & -1 213 | \end{array} 214 | \right] 215 | $$ 216 | 217 | We add $k \cdot R_i$ to all $R_j$ with $j>i$ such that $M_{j,i} = 0$. 218 | 219 | $$ 220 | \left[ 221 | \begin{array}{rrr|r} 222 | 1 & 1 & 3 & 2 \\ 223 | 0 & -1 & 1 & 2 \\ 224 | 0 & -4 & -1 & 3 225 | \end{array} 226 | \right] 227 | $$ 228 | 229 | We repeat the process on the next $i$. Here $i=2$, so we scale the second row such that $M_{i,i}=1$. 230 | 231 | $$ 232 | \left[ 233 | \begin{array}{rrr|r} 234 | 1 & 1 & 3 & 2 \\ 235 | 0 & 1 & -1 & -2 \\ 236 | 0 & -4 & -1 & 3 237 | \end{array} 238 | \right] 239 | $$ 240 | 241 | We then add a multiple of $R_2$ to all rows under it such that the second element of each such row becomes $0$. Here, we add $4\cdot R_2$ to $R_3$. 
242 | 243 | $$ 244 | \left[ 245 | \begin{array}{rrr|r} 246 | 1 & 1 & 3 & 2 \\ 247 | 0 & 1 & -1 & -2 \\ 248 | 0 & 0 & -5 & -5 249 | \end{array} 250 | \right] 251 | $$ 252 | 253 | We then repeat the process for the last row. 254 | 255 | $$ 256 | \left[ 257 | \begin{array}{rrr|r} 258 | 1 & 1 & 3 & 2 \\ 259 | 0 & 1 & -1 & -2 \\ 260 | 0 & 0 & 1 & 1 261 | \end{array} 262 | \right] 263 | $$ 264 | 265 | We now have an upper triangular matrix that can easily be used to find the solutions of the initial linear equation system. To solve the system, we reverse the process: starting with $i = n$ and working upward, we take $k \cdot R_i$ and add it to all $R_j$ with $j < i$ such that $M_{j,i} = 0$ (back substitution). Once only the diagonal entries $M_{i,i} = 1$ remain on the left, the augmented column contains the solution; for the example above this gives $x = 0$, $y = -1$, $z = 1$. To summarize, for each $i$ from $1$ to $n$ we: 1. Scale $R_i$ such that $M_{i,i} = 1$ 294 | 2. Add $k \cdot R_i$ to all $R_j$ with $j > i$ such that $M_{j,i} = 0$. 295 | 3. Move to the next $i$ by adding $1$ 296 | 297 | After this has been done to all rows: 298 | 1. Add $k \cdot R_i$ to all $R_j$ with $j < i$ such that $M_{j,i} = 0$, starting from $i = n$ and moving back to $i = 1$. 2. Read the solution off the augmented column. The full procedure, written as pseudocode: $$ \begin{aligned} &\textbf{Begin} \\ &\text{1. Forward Elimination:} \\ &\text{for } i = 1 \text{ to } n \text{ do} \hspace{1em} \\ &\hspace{2em} \text{(a) Scale } R_i \text{ such that } M_{i,i} = 1 \\ &\hspace{2em} \text{(b) Add } k \cdot R_i \text{ to all } R_j \text{ with } j > i \text{ such that } M_{j,i} = 0: \\ 317 | &\hspace{3em} \text{for } j = i+1 \text{ to } n \text{ do} \hspace{1em} \\ 318 | &\hspace{4em} \text{if } M_{j,i} \neq 0 \text{ then} \\ 319 | &\hspace{5em} k = -M_{j,i} \\ 320 | &\hspace{5em} R_j = R_j + k \cdot R_i \\ 321 | &\hspace{4em} \text{end if} \\ 322 | &\hspace{3em} \text{end for} \\ 323 | &\text{end for} \\ 324 | & \\ 325 | &\text{2. Backward Elimination:} \\ 326 | &\text{for } i = n \text{ down to } 2 \text{ do} \hspace{1em} \\ 327 | &\hspace{2em} \text{(a) Add } k \cdot R_i \text{ to all } R_j \text{ with } j < i \text{ such that } M_{j,i} = 0: \\ 328 | &\hspace{3em} \text{for } j = i-1 \text{ down to } 1 \text{ do} \hspace{1em} \\ 329 | &\hspace{4em} \text{if } M_{j,i} \neq 0 \text{ then} \\ 330 | &\hspace{5em} k = -M_{j,i} \\ 331 | &\hspace{5em} R_j = R_j + k \cdot R_i \\ 332 | &\hspace{4em} \text{end if} \\ 333 | &\hspace{3em} \text{end for} \\ 334 | &\text{end for} \\ 335 | & \\ 336 | &\textbf{End} 337 | \end{aligned} 338 | $$ 339 | 340 | --- 341 | 342 | ## Algebraic Structures 343 | ### Fields ($\mathbb{F}$) 344 | Set of scalars with two operations (`+`, `·`) satisfying: 345 | 1. **Closure** under addition and multiplication 346 | 2. **Commutativity**: $a+b=b+a$, $a·b=b·a$ 347 | 3. **Associativity**: $(a+b)+c=a+(b+c)$, $(a·b)·c=a·(b·c)$ 348 | 4. **Identities**: 349 | - Additive: $a+0=a$ 350 | - Multiplicative: $a·1=a$ $(1 \neq 0)$ 351 | 5. **Inverses**: 352 | - Additive: $a + (-a) = 0$ 353 | - Multiplicative: $a·a^{-1}=1$ $(a \neq 0)$ 354 | 6. **Distributivity**: $a·(b+c) = a·b + a·c$ 355 | 356 | *Example*: $\mathbb{R}$ (real numbers) 357 | 358 | ### Vector Spaces ($V$ over $\mathbb{F}$) 359 | Set of vectors with two operations (`+`, scalar multiplication) satisfying: 360 | 1. **Closure** under vector addition and scalar multiplication 361 | 2. **Commutativity/Associativity** of addition 362 | 3. **Zero vector**: $\vec{v} + \vec{0} = \vec{v}$ 363 | 4. **Additive inverse**: $\vec{v} + (-\vec{v}) = \vec{0}$ 364 | 5. **Distributivity**: 365 | - $k(\vec{u} + \vec{v}) = k\vec{u} + k\vec{v}$ 366 | - $(α+β)\vec{v} = α\vec{v} + β\vec{v}$ 367 | 6. **Associativity of scalars**: $α(β\vec{v}) = (αβ)\vec{v}$ 368 | 7. **Multiplicative identity**: $1·\vec{v} = \vec{v}$ 369 | 370 | *Examples*: $\mathbb{R}^n$, polynomials of degree $\leq 4$, $2\times 3$ matrices 371 | 372 | ### Subspaces ($W \subseteq V$) 373 | Subset satisfying: 374 | 1. $k \in \mathbb{F}, \vec{w} \in W \implies k\vec{w} \in W$ 375 | 2. 
$\vec{u},\vec{v} \in W \implies \vec{u} + \vec{v} \in W$ 376 | 377 | *Equivalent to*: $W$ is itself a vector space 378 | *Example*: Plane $x+y+z=0$ in $\mathbb{R}^3$ 379 | 380 | ### Basis 381 | Set $B = \{\vec{b}_1,\dots,\vec{b}_n\}$ that is: 382 | 1. **Linearly Independent**: 383 | $\sum c_i\vec{b}_i = \vec{0} \implies c_i = 0 \ \forall i$ 384 | 2. **Spanning**: $\forall \vec{v} \in V,\ \exists c_i : \vec{v} = \sum c_i\vec{b}_i$ 385 | - **Dimension** ($\dim V$): Number of basis vectors ($n$) 386 | - *Example*: Standard basis $\{(1,0,0),(0,1,0),(0,0,1)\}$ for $\mathbb{R}^3$ 387 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.2. Backpropagation, Gradient Descent, Adaptive Moment Estimation/Backpropagation.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # All About **Backpropagation** 4 | ###### **Will Chen** | SHSID Data Science Group 5 | 6 | ## Key Questions 7 | 8 |
9 | 10 | Backpropagation is an important step in supervised learning procedures. By the end of the lesson, you will be able to answer the following **key questions**: 11 | 1. What is backpropagation and how is it used? 12 | 2. What are the similarities and differences between forward propagation and backpropagation? 13 | 3. How does backpropagation specifically function? 14 | 4. Why is it such an important process and why is it used so much? 15 | 16 |
17 | 18 | ## Key Terms 19 |
20 | In order to understand this lesson, you should have a grasp of the following key concepts and terms: 21 | 22 | - **Layers**: Each network has many layers. The first layer takes in the input vector and the last layer produces the output. In the diagram below, each layer is one column. 23 | 24 | - **Vector**: The list of numbers held by each layer of a neural network. 25 | 26 | - **Neural Network**: _Input vectors_ (blue) are passed through the _hidden layers_ (green) to form an _output vector_ (yellow). 27 | 28 | - **Neuron**: A node in the network, each containing special numbers called weights. In the diagram, it is each circle (🟢). When running the model, each neuron multiplies its weights with the previous layer’s vector and adds the products together, forming its result, which is then ready to be used by the next layer’s neurons. 29 | 30 | - **Hidden Layer**: Contains a set of neurons whose weights multiply the results of the previous layer, producing its output values. 31 | 32 | - **Matrix Multiplication**: The method used to combine vectors with weights. Each layer’s vector is multiplied by a weight matrix and passed on to the next layer. 33 | 34 | - **Epoch**: One training generation. One round of backpropagation (explained later) is used per epoch. 35 | 36 | - **Loss**: During supervised training, the model’s output is continuously compared with the real outcome. The purpose of the model is to form results as close to the real outcome as possible, in other words, to minimize the loss. The difference between the output vector and the real outcome vector is known as the loss. 37 | 38 | - **Supervised training**: A type of training method where the expected input and result vectors are labeled for the model to train with. 39 | 40 | 41 | *(diagrams: a neural network with the input layer in blue, hidden layers in green, and the output layer in yellow)* 46 | 47 |
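To make these key terms concrete, here is a minimal NumPy sketch of one forward pass through a tiny network (the layer sizes, random weights, and ReLU activation are assumptions chosen for illustration, not values from this lesson):

```python
import numpy as np

# A tiny network: 3 inputs -> 4 hidden neurons -> 2 outputs.
rng = np.random.default_rng(0)

x = rng.normal(size=3)          # input vector (the "blue" layer)
W1 = rng.normal(size=(4, 3))    # each row holds one hidden neuron's weights
b1 = np.zeros(4)                # biases of the hidden layer
W2 = rng.normal(size=(2, 4))    # weights of the output layer
b2 = np.zeros(2)

h = np.maximum(0, W1 @ x + b1)  # hidden layer: matrix multiplication + ReLU
y = W2 @ h + b2                 # output vector (the "yellow" layer)

print("hidden layer vector:", h)
print("output vector:", y)
```

Each `@` here is exactly the matrix multiplication described above: every neuron’s weights are multiplied with the previous layer’s vector and summed to form that neuron’s result.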
48 | 49 | ## Introduction to Backpropagation 50 |
51 | 52 | ### What is backpropagation? 53 | 54 | **Backpropagation** is a crucial step in neural networks that improves the accuracy of a model by traversing a network in reverse to find out how much each weight contributed to the model’s inaccuracy. 55 | 56 | 57 | 58 | As we know from **forward propagation**, multilayer perceptron networks work by moving through layers, from the input vector layer to the output. Backpropagation, as the name suggests, moves in reverse: It **starts with the very last layer** (output) and **propagates back to the previous layers**. Where forward propagation is useful for running the model, backpropagation is useful for training it. 59 | 60 | **Think of it like solving a math formula**: You know you made a mistake along the way because your answer is different from the answer key. So, you want to look through your steps in reverse to adjust them. Similarly, when a model returns something we don’t expect, we can use backpropagation to **adjust each neuron until it returns the things we like**. 61 | 62 | ### What happens in training? 63 | 64 | Training a model is a **very repetitive process**. From earlier on, you might recall that **epochs** are the individual rounds of training, and that the goal of training a model is to **minimize the errors it makes**. During each epoch, several things happen: 65 | 66 | 1. We run the model through its neurons (aka Forward Propagation). 67 | 2. The model gives us its predictions (in the final output vector layer). 68 | 3. We compare the prediction with the real, expected result (given by our training dataset). 69 | 4. **We use backpropagation as a tool to know which neurons went wrong, and to what extent**. 70 | 5. We adjust the neurons and move on to the next epoch, and so on until we’re happy. 71 | 72 | 73 | 74 | **But what happens before any epoch**? We must set up the model. Usually, models begin with “blank” neurons, or neurons that don’t contain any special information about their weights and biases. When we run this blank model for the first time with our training features, it probably won’t return something we’d like. 75 | 76 | When we compare the model’s predictions with the labels, we get a difference, which is known as loss. This difference is crucial because it tells us if our predictions are close enough or not. But it doesn’t tell us what exactly went wrong. How do we know that? **The answer? We do it through backpropagation**. 77 | 78 | ### How does backpropagation really work? 79 | 80 | Think of it like **taking blame**. We start off knowing a difference (or loss) between the expected and predicted results, which we don’t like. Of course, the neurons at the very end of the network are **most directly responsible for the loss**, since they produce the output directly, so they take all the blame first. Then, using a special math property called the **chain rule**, the blame is further **passed down along the line from back to front**, and **distributed across the entire network**. 81 | 82 | #### Understanding loss calculation 83 | 84 | First, we need to understand how exactly the loss is calculated. As we know, the model produces predictions that can be compared with the expected results. For each forward propagation, the loss is taken as a single number. Since there can be a lot of output neurons, we need to take the mean of the errors. 85 | 86 | But it’s not just any mean. 
Because we’re dealing with some calculus (later on), scientists realized that taking the **MSE (Mean Squared Error)** makes the calculations a lot easier than taking the plain mean of absolute differences, the MAE (Mean Absolute Error). **MSE is defined as the average squared difference between the predictions and the expected results**. 87 | 88 | #### Dealing with gradients 89 | 90 | A key optimizer within backpropagation is **gradient descent**. It’s an iterative optimization algorithm; in other words, it works toward the solution step by step rather than solving the problem in one go. Its use in backpropagation is that it tells us **how much we need to change each parameter in order to minimize the loss function**. 91 | 92 | *(figure: a warped 3D error surface used to visualize gradient descent)* 93 | 94 | Reviewing some calculus terminology: 95 | 96 | - The **gradient** of a function is a vector of its _partial derivatives_. 97 | - A **partial derivative** is the rate of change of the function along one input direction (for example, the x direction) while the other inputs are held fixed. 98 | - The **derivative** of the function measures how the output changes with respect to a change in the input. Its sign tells us the _direction in which the function increases_. 99 | - The **gradient** of the function, then, tells us the direction of steepest increase and its magnitude, in other words _how much our parameters need to change_. 100 | - **E_tot** is our _“surface”_ for gradient descent minimization, similar to a _“warped 3D surface”_ like the figure above. The picture has 3 dimensions because there are 2 weights to work with, and the third axis is the error value itself. 101 | 102 | With this knowledge in mind, our aims in backpropagation become clear. We want to: 103 | 104 | - Get the _gradient_ of our error (the magnitude by which to adjust our parameters). 105 | - The _opposite_ of our gradient is then the _direction for us to descend_ on E_tot. 106 | - Using the same picture as above, gradient descent is similar to **rolling a ball down the hill**. 107 | - When the ball reaches the very bottom, the slope of the surface is 0, and the loss is as small as that path can make it. 108 | 109 | In summary, the concept of Gradient Descent follows the derivatives to essentially “roll” down the slope until it finds its way to the minimum. 110 | 111 | #### Using the Chain Rule 112 | 113 | So, we know we need to find the gradient (the slope of our error surface) to roll the ball downhill towards the minimum loss. But a neural network has many layers and many weights. How do we figure out the specific gradient for a weight buried deep inside the network? This is where a fundamental concept from calculus, the **Chain Rule**, becomes the key to our lesson. 114 | 115 | The Chain Rule allows us to calculate how one variable affects another indirectly, through a chain of intermediate variables. The final loss is directly affected by the output of the last hidden layer. The output of that last hidden layer is affected by its weights and the output of the layer before it, and so on. **The Chain Rule gives us a precise mathematical way to quantify and pass this blame backward through the network**. 116 | 117 | For any given weight in the network, we want to calculate its "blame", or more formally, the partial derivative of the total error (E_tot) with respect to that weight (w). This tells us: "_If I change this specific weight just a tiny bit, how much will the total error of the network change_?" 118 | 119 | Using the Chain Rule, we can break this down into **three manageable pieces** for a weight connected to an output neuron: 120 | 121 | 1. 
How much did the total **error** change with respect to the neuron's **final output**? 122 | 2. How much did the neuron's **final output** change with respect to **its pre-activated input**? 123 | 3. How much did the neuron's **pre-activated input** change with respect to the **weight**? 124 | 125 | By multiplying these three rates of change together, **the Chain Rule gives us the overall gradient for that one weight**. For weights in earlier hidden layers, the chain just gets longer, but the principle is exactly the same. The error is propagated backward, layer by layer, **with each layer using the error signal calculated from the layer in front of it**. 126 | 127 | #### Making adjustments with the Update Rule 128 | 129 | Once backpropagation has used the Chain Rule to calculate the gradient for every single weight in the network, we know **two things** for each weight: 130 | 131 | 1. The **direction** of the steepest increase in error _(the gradient itself)_. 132 | 2. The **magnitude** of that slope _(how much the error will increase)_. 133 | 134 | Since our goal is to decrease the error, we simply go in the opposite direction of the gradient. This brings us to the **Update Rule**. For each weight, _we perform the following calculation_: 135 | 136 | $$ W_{new} = W_{old} - (\text{LR} \times \text{Gradient}) $$ 137 | 138 | Let's break this down: 139 | 140 | - **Old Weight** (W_old): The current value of the weight before the update. 141 | - **Gradient**: The value we just calculated through backpropagation for this specific weight. It tells us which way is "uphill." 142 | - **Learning Rate** (LR): This is a small number (e.g., 0.01) that we choose before training starts. It's a crucial parameter that controls _how big of a step we take downhill_. 143 | > If the learning rate is _too large_, we might overshoot the bottom of the valley and _end up on the other side_, bouncing back and forth or even diverging. 144 | > If it's _too small_, training will _take an incredibly long time_, like taking tiny baby steps down a huge mountain, and it may stall in a shallow dip before reaching a good minimum. 145 | 146 | This update process is performed for _every weight and bias_ in the entire network during each training epoch. 147 | 148 | #### Linear regression 149 | 150 | If this still seems abstract, let's think about the simplest possible "network": a **Linear Regression model**. A linear regression model tries to fit a _straight line_ (y = mx + b) to a set of data points. 151 | 152 | In the context of this equation, 153 | 154 | $$ y = mx + b $$ 155 | 156 | - x is our input. 157 | - m (the slope) and b (the y-intercept) are our "weights" or parameters. 158 | - y is our prediction. 159 | 160 | The "loss" is typically the **Mean Squared Error (MSE)** between our predicted y values and the actual y values from the data. To find the best line, we need to find the values of m and b that minimize this loss. How do we do that? With gradient descent! We would: 161 | 162 | - Calculate the partial derivative of the MSE **with respect to m**. 163 | - Calculate the partial derivative of the MSE **with respect to b**. 164 | - Use these gradients in the update rule to slowly _adjust m and b until the loss is at a minimum_. 165 | 166 | Backpropagation is simply a **generalization of this exact process** for a much more complex model with many layers and non-linear activation functions. It's a clever, systematic way of applying gradient descent to millions of parameters simultaneously. 
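To see the update rule and the linear-regression example in action, here is a short from-scratch sketch of gradient descent on $y = mx + b$ with an MSE loss (the synthetic data, learning rate, and number of epochs are made-up values for illustration):

```python
import numpy as np

# Toy data that roughly follows y = 2x + 1, plus a little noise.
rng = np.random.default_rng(42)
x = rng.uniform(-1, 1, size=100)
y = 2.0 * x + 1.0 + rng.normal(scale=0.1, size=100)

m, b = 0.0, 0.0   # start with "blank" parameters
lr = 0.1          # learning rate

for epoch in range(200):
    y_pred = m * x + b
    error = y_pred - y
    loss = np.mean(error ** 2)       # MSE

    # Partial derivatives of the MSE with respect to m and b
    grad_m = np.mean(2 * error * x)
    grad_b = np.mean(2 * error)

    # Update rule: W_new = W_old - LR * Gradient
    m -= lr * grad_m
    b -= lr * grad_b

print(f"m = {m:.3f}, b = {b:.3f}, final MSE = {loss:.4f}")
```

After a few hundred epochs the parameters settle close to the values used to generate the data, which is the same "roll downhill until the slope is flat" behaviour that backpropagation produces for every weight in a deep network.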
167 | 168 | #### Other key aspects of backpropagation 169 | 170 | ##### Activation functions 171 | 172 | Remember that each neuron applies an **activation function** (like Sigmoid or ReLU) to its input. When we use the Chain Rule, we need to calculate the _derivative of this activation function_. This is a critical reason why activation functions **must be differentiable** (i.e., we can find their slope at essentially any point). _A function with a "clean" and easy-to-calculate derivative makes the math of backpropagation much more efficient_. 173 | 174 | ##### Different versions of gradient descent 175 | 176 | We don't have to calculate the loss over the entire dataset before making one single update. This would be very slow. Instead, we use different strategies: 177 | 178 | - **Batch Gradient Descent**: _The "classic" approach_. We run all training examples through the network, average their gradients, and then update the weights once. It's stable but _memory-intensive and slow for large datasets_. 179 | - **Stochastic Gradient Descent** (SGD): _The opposite extreme_. We update the weights after every single training example. It's much faster and can escape shallow local minima, but the _updates can be very "noisy" and erratic_. 180 | - **Mini-Batch Gradient Descent**: The _best of both worlds_ and the most common method. We divide the training data into small batches (e.g., 32, 64, or 128 examples), and we update the weights once per batch. This provides _a good balance between the stability of Batch GD and the speed of SGD_. 181 | 182 | ##### Vanishing and exploding gradients 183 | 184 | Backpropagation isn't without its challenges. In very deep networks (networks with many layers), _the error signal can run into problems as it's propagated backward_. 185 | 186 | - **Vanishing Gradients**: As the gradient is passed back, it can be multiplied by numbers less than one over and over. This can cause the gradient to become incredibly small, effectively "vanishing" by the time it reaches the early layers. When this happens, the weights in the early layers stop updating, and the network stops learning. 187 | - **Exploding Gradients**: The opposite can also occur. The gradient can be repeatedly multiplied by numbers greater than one, causing it to become astronomically large. This leads to massive weight updates and makes the model unstable and unable to learn. 188 | 189 | Modern deep learning has developed solutions to these problems, such as using specific activation functions like ReLU (whose derivative is a constant 1 for all positive inputs), better weight initialization methods, and normalization techniques, all designed to keep the backpropagated signal healthy and effective. 190 | 191 |
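Before moving to the conclusion, here is a minimal PyTorch sketch of a single mini-batch update that ties these pieces together: `loss.backward()` runs backpropagation and `optimizer.step()` applies the update rule (the network shape, random data, and learning rate are invented for illustration):

```python
import torch
import torch.nn as nn

torch.manual_seed(0)

# A small model: 10 features -> 32 hidden units (ReLU) -> 1 output
model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 1))
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# One mini-batch of 64 examples with 10 features each
inputs = torch.randn(64, 10)
targets = torch.randn(64, 1)

optimizer.zero_grad()                     # clear gradients from the previous batch
loss = criterion(model(inputs), targets)  # forward pass + MSE loss
loss.backward()                           # backpropagation: chain rule fills in .grad for every weight
optimizer.step()                          # update rule: w <- w - lr * gradient

first_layer = model[0]
print("loss:", loss.item())
print("gradient shape of first-layer weights:", first_layer.weight.grad.shape)
```

Changing how many examples go into `inputs` per step is exactly the difference between stochastic, mini-batch, and full-batch gradient descent described above.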
192 | 193 | ### Conclusion 194 |
195 | 196 | That’s all for our lesson on backpropagation! A lot of it relies on complex math, but it’s the heart of the model training process. This clever algorithm is what allows a neural network to truly learn from its mistakes by translating the final error into actionable feedback for every single weight. 197 | 198 | By using the **chain rule** to propagate this error signal backward, it calculates the precise **gradient** needed to guide the model's optimization. These gradients ensure each **weight** is nudged in the exact direction that will **minimize the overall loss**. Repeated over thousands of **epochs**, this **iterative refinement** transforms a randomly initialized **network** into a powerful and accurate **predictive tool**. 199 | 200 | Ultimately, backpropagation is the engine that drives intelligence in most modern ML systems, turning abstract data into concrete knowledge. 201 | 202 | -------------------------------------------------------------------------------- /2. Machine Learning Generics/2.5. K-Nearest Neighbors, Clustering K-Means/KNN, Kmeans.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "222d8505", 6 | "metadata": {}, 7 | "source": [ 8 | "# k-Nearest Neighbors & K-Means\n", 9 | "\n", 10 | "_Welcome back! Today we’ll build two core ML tools that you’ll use again and again._\n", 11 | "\n", 12 | "- **k-Nearest Neighbors (kNN)** — a simple, powerful **supervised** method\n", 13 | "- **K-Means** — a fast, practical **unsupervised** clustering method\n", 14 | "\n", 15 | "---\n", 16 | "\n", 17 | "## What you'll learn\n", 18 | "\n", 19 | "- kNN intuition and how prediction works\n", 20 | "- Distances, feature scaling, and choosing **k**\n", 21 | "- K-Means objective and the Lloyd’s algorithm loop\n", 22 | "- How to pick **K** (elbow & silhouette)\n", 23 | "- Minimal, copy-pasteable code for both\n", 24 | "\n", 25 | "---\n", 26 | "\n", 27 | "## Requirements\n", 28 | "\n", 29 | "- Vectors & basic distance (e.g., Euclidean)\n", 30 | "- Train/validation/test split\n", 31 | "- Python + NumPy/Matplotlib (optionally scikit-learn)\n", 32 | "\n", 33 | "---\n", 34 | "\n", 35 | "## 1) kNN — the core idea (intuition)\n", 36 | "\n", 37 | "It works on the idea that nearby points tend to share similar labels or values. The only real knob is **k**: small k focuses on very local patterns (can overfit to noise), while larger k smooths decisions (can miss fine detail). For classification, the class with the most support among the k neighbors is chosen; for regression, the neighbors’ values are averaged. Distance-weighting lets nearer neighbors count more than farther ones.\n", 38 | "\n", 39 | "> **“Tell me who your neighbors are, and I’ll tell you who you are.”**\n", 40 | "\n", 41 | "For a new point $x$:\n", 42 | "\n", 43 | "1. Find the **k closest** training points.\n", 44 | "2. **Classification:** majority vote of their labels.\n", 45 | "3. **Regression:** average their target values.\n", 46 | "\n", 47 | "```\n", 48 | " ● ● Class A\n", 49 | " x ? 
→ ▲ Query\n", 50 | " ○ ○ Class B\n", 51 | "```\n", 52 | "\n", 53 | "`x` takes the class of whichever neighbors are more common (or closer, if weighted).\n", 54 | "\n", 55 | "---\n", 56 | "\n", 57 | "## 2) Distance & scaling (the make-or-break)\n", 58 | "\n", 59 | "Common distances for vectors $x, y$:\n", 60 | "\n", 61 | "- **Euclidean:** $\\lVert x - y\\rVert_2$ (default for geometry)\n", 62 | "- **Manhattan:** $\\lVert x - y\\rVert_1$ (robust to outliers)\n", 63 | "- **Cosine distance:** $1 - \\dfrac{x^\\top y}{\\lVert x\\rVert\\,\\lVert y\\rVert}$ — “angle” difference (great for text/high-dim sparse)\n", 64 | "\n", 65 | "**Euclidean** distance treats differences along each feature dimension equally, so features with larger scales can dominate if you don’t standardize. **Manhattan** distance is less sensitive to outliers and can work better with high-variance features. **Cosine** similarity ignores magnitude and compares only direction, which is useful for sparse/high‑dimensional data (e.g., text). Standardizing features (mean 0, std 1) before kNN keeps distances meaningful. If two classes are tied by count, distance‑weighted voting often resolves the tie sensibly.\n", 66 | "\n", 67 | "> ⚠️ So **Always scale features** (e.g., standardize to mean 0, std 1). \n", 68 | "> Otherwise the feature with the largest units dominates distance.\n", 69 | "\n", 70 | "**Distance-weighted vote (optional):** \n", 71 | "$w_i=\\dfrac{1}{\\operatorname{dist}(x,x_i)+\\epsilon}$ so closer neighbors count more.\n", 72 | "\n", 73 | "---\n", 74 | "\n", 75 | "## 3) Choosing **k** & common pitfalls\n", 76 | "\n", 77 | "Pick k by validation or cross‑validation: try several values and choose the one with the best validation score.\n", 78 | "\n", 79 | "- **Small k** (1–3): low bias, high variance → can be noisy.\n", 80 | "- **Larger k** (5–21+): smoother, higher bias → may underfit.\n", 81 | "- Pick **k** via validation or cross-validation.\n", 82 | "\n", 83 | "**Pitfalls**\n", 84 | "\n", 85 | "- Not scaling features.\n", 86 | "- Using an unfitting distance (e.g., Euclidean on sparse text).\n", 87 | "- Class imbalance: consider distance-weighted vote or class-weighted strategies.\n", 88 | "\n", 89 | "---\n", 90 | "\n", 91 | "## 4) kNN — minimal code (easy to try)\n", 92 | "\n", 93 | "Before running the code below, here’s the flow you’ll see:\n", 94 | "\n", 95 | "- Make a small, separable dataset and split into train/test.\n", 96 | "- Fit the scaler **on training data only**, then transform both train and test.\n", 97 | "- Loop over a few k values; each model just stores the scaled training set.\n", 98 | "- Predict by finding the k nearest training points for each test point and aggregating their labels.\n", 99 | "\n", 100 | "> Tip: run locally. 
Install: `pip install numpy scikit-learn`" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "1b8b2ade", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "import numpy as np\n", 111 | "from sklearn.datasets import make_classification\n", 112 | "from sklearn.model_selection import train_test_split\n", 113 | "from sklearn.preprocessing import StandardScaler\n", 114 | "from sklearn.neighbors import KNeighborsClassifier\n", 115 | "from sklearn.metrics import accuracy_score\n", 116 | "\n", 117 | "# 1) Toy 2D dataset\n", 118 | "X, y = make_classification(n_samples=400, n_features=2, n_redundant=0,\n", 119 | " n_informative=2, n_clusters_per_class=1, random_state=7)\n", 120 | "Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.25, random_state=7)\n", 121 | "\n", 122 | "# 2) Scale for distance-based methods\n", 123 | "scaler = StandardScaler().fit(Xtr)\n", 124 | "Xtr = scaler.transform(Xtr)\n", 125 | "Xte = scaler.transform(Xte)\n", 126 | "\n", 127 | "# 3) Try a few k values (distance-weighted)\n", 128 | "for k in [1, 3, 5, 11, 21]:\n", 129 | " clf = KNeighborsClassifier(n_neighbors=k, weights=\"distance\", metric=\"euclidean\")\n", 130 | " clf.fit(Xtr, ytr)\n", 131 | " acc = accuracy_score(yte, clf.predict(Xte))\n", 132 | " print(f\"k={k:>2} acc={acc:.3f}\")" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "47cefdd1", 138 | "metadata": {}, 139 | "source": [ 140 | "**From-scratch (core idea):**" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "dc7cb8af", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "import numpy as np\n", 151 | "\n", 152 | "def knn_predict(Xtr, ytr, x, k=5):\n", 153 | " d = ((Xtr - x)**2).sum(axis=1)**0.5 # Euclidean\n", 154 | " idx = np.argpartition(d, k)[:k] # top-k neighbors (unordered)\n", 155 | " votes = np.bincount(ytr[idx])\n", 156 | " return votes.argmax()" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "id": "6164d341", 162 | "metadata": {}, 163 | "source": [ 164 | "---\n", 165 | "\n", 166 | "## 5) K-Means — the core idea (intuition)\n", 167 | "\n", 168 | "Choose **K** cluster centers (centroids) so that points are as close as possible to the centroid of the cluster they’re assigned to. The loss being minimized is the sum of **squared** distances to the nearest centroid, which naturally makes the mean the best representative for each cluster and favors compact, roughly spherical clusters.\n", 169 | "\n", 170 | "> **Goal:** group similar points into **K** clusters. \n", 171 | "> Each cluster has a center (the **mean**), and points belong to their nearest center.\n", 172 | "\n", 173 | "**Plain-English objective:** place $K $ centers so points are as close as possible (on average, squared) to their assigned center.\n", 174 | "\n", 175 | "**Mathematically, with clusters $C_k$ and centroids $\\mu_k$:**\n", 176 | "\n", 177 | "$$\n", 178 | "\\min_{\\{C_k\\},\\{\\mu_k\\}} \\sum_{k=1}^{K} \\sum_{x\\in C_k} \\lVert x-\\mu_k\\rVert_2^2\n", 179 | "$$\n", 180 | "\n", 181 | "and the centroid update\n", 182 | "\n", 183 | "$$\n", 184 | "\\mu_k \\;=\\; \\frac{1}{\\lvert C_k\\rvert}\\sum_{x\\in C_k} x\\;.\n", 185 | "$$\n", 186 | "\n", 187 | "---\n", 188 | "\n", 189 | "## 6) Lloyd’s algorithm (the standard K-Means loop)\n", 190 | "\n", 191 | "Two repeated steps drive the method:\n", 192 | "\n", 193 | "1. **assignment** — give each point to its nearest centroid.\n", 194 | "2. 
**update** — move each centroid to the mean of its assigned points. Each step never increases the objective, so the loop converges to a local optimum. Initialization matters; using **k‑means++** spreads starting centroids apart and improves results. If a cluster becomes empty, re‑seed that centroid (e.g., pick a faraway point).\n", 195 | "\n", 196 | "Repeat until nothing changes.\n", 197 | "\n", 198 | "Each iteration never increases the objective → **converges** to a local optimum.\n", 199 | "\n", 200 | "**Good practice**\n", 201 | "\n", 202 | "- **k-means++** initialization (smart starting centers)\n", 203 | "- **Multiple restarts** (keep the best run)\n", 204 | "- **Scale features** before clustering\n", 205 | "- Handle empty clusters by re-seeding (e.g., to a farthest point)\n", 206 | "\n", 207 | "---\n", 208 | "\n", 209 | "## 7) Picking **K** (how many clusters?)\n", 210 | "\n", 211 | "- **Elbow method:** plot $K$ vs **inertia** (sum of squared distances to centers). \n", 212 | " Look for the “bend” where adding clusters gives diminishing returns.\n", 213 | "- **Silhouette score** ($-1 \\ldots 1$): higher is better. Rough guide: \n", 214 | " $0.5+$ good separation, $\\sim 0.25$ weak structure.\n", 215 | "\n", 216 | "---\n", 217 | "\n", 218 | "## 8) K-Means — minimal code (easy to try)\n", 219 | "\n", 220 | "Scan a few K values, compute inertia and silhouette for each, and then pick a reasonable K. Use `init=\"k-means++\"` and multiple `n_init` runs to avoid poor local optima. After selecting K, fit once more and evaluate." 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "id": "28510354", 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "import numpy as np\n", 231 | "from sklearn.datasets import make_blobs\n", 232 | "from sklearn.preprocessing import StandardScaler\n", 233 | "from sklearn.cluster import KMeans\n", 234 | "from sklearn.metrics import silhouette_score\n", 235 | "\n", 236 | "# 1) Synthetic data with 4 clusters\n", 237 | "X, _, _ = make_blobs(n_samples=600, centers=4, cluster_std=1.2, random_state=42)\n", 238 | "X = StandardScaler().fit_transform(X)\n", 239 | "\n", 240 | "# 2) Try several K to compute inertia and silhouette\n", 241 | "Ks = range(2, 9)\n", 242 | "inertias, sils = [], []\n", 243 | "for K in Ks:\n", 244 | " km = KMeans(n_clusters=K, n_init=10, init=\"k-means++\", random_state=42).fit(X)\n", 245 | " inertias.append(km.inertia_)\n", 246 | " sils.append(silhouette_score(X, km.labels_))\n", 247 | "\n", 248 | "print(\"K:\", list(Ks))\n", 249 | "print(\"Inertia:\", [round(v, 1) for v in inertias])\n", 250 | "print(\"Silhouette:\", [round(v, 3) for v in sils])\n", 251 | "\n", 252 | "# 3) Pick K (say best-looking), then fit once more\n", 253 | "bestK = 4\n", 254 | "km = KMeans(n_clusters=bestK, n_init=10, init=\"k-means++\", random_state=42).fit(X)\n", 255 | "print(\"Final silhouette at K=4:\", round(silhouette_score(X, km.labels_), 3))" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "id": "5c3088e7", 261 | "metadata": {}, 262 | "source": [ 263 | "**From-scratch (one iteration):**" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "id": "36aa578a", 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "import numpy as np\n", 274 | "\n", 275 | "def kmeans_step(X, centers):\n", 276 | " # Assign\n", 277 | " d = np.stack([np.linalg.norm(X - c, axis=1) for c in centers], axis=1)\n", 278 | " labels = d.argmin(axis=1)\n", 279 | " # Update\n", 280 | " 
new_centers = np.array([X[labels == k].mean(axis=0) for k in range(centers.shape[0])])\n", 281 | " return labels, new_centers" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "id": "14451fe4", 287 | "metadata": {}, 288 | "source": [ 289 | "---\n", 290 | "\n", 291 | "## 9) kNN vs K-Means — quick cheat sheet\n", 292 | "\n", 293 | "Use **kNN** when you have labels and want predictions; use **K‑Means** when you don’t have labels and want segments. kNN has almost no training cost but can be slow at prediction time; K‑Means costs more to fit but predicts fast by checking the nearest centroid. Scaling is important for both.\n", 294 | "\n", 295 | "| Aspect | kNN (Supervised) | K-Means (Unsupervised) |\n", 296 | "| ------------ | -------------------------------------------- | ------------------------------------------- |\n", 297 | "| Purpose | Predict labels/values from neighbors | Discover groups without labels |\n", 298 | "| “Training” | Just store data | Learn K centers |\n", 299 | "| Compute cost | Cheap train, slower predict (need neighbors) | Fast predict, cost during fitting |\n", 300 | "| Key choices | **k**, distance, weighting, scaling | **K**, init (k-means++), scaling, restarts |\n", 301 | "| When to use | Strong local patterns, simple baseline | Quick segmentation, preprocessing, insights |\n", 302 | "\n", 303 | "---\n", 304 | "\n", 305 | "## 10) Checklists, FAQs, and practice\n", 306 | "\n", 307 | "kNN checklist: scale features; tune **k** with validation; consider distance‑weighted voting; watch for class imbalance.\n", 308 | "\n", 309 | "K‑Means checklist: scale features; use k‑means++; try several K and compare inertia and silhouette; handle empty clusters robustly.\n", 310 | "\n", 311 | "Practice ideas: try kNN on Iris or a small MNIST subset with various k; for K‑Means, sweep K=2…10 and plot inertia and silhouette; as a speed‑up experiment, cluster first, then run kNN within each cluster and compare runtime/accuracy to vanilla kNN.\n", 312 | "\n", 313 | "**kNN checklist**\n", 314 | "\n", 315 | "- [ ] Scale features\n", 316 | "- [ ] Pick **k** via validation\n", 317 | "- [ ] Consider distance-weighted vote\n", 318 | "- [ ] Beware class imbalance\n", 319 | "\n", 320 | "**K-Means checklist**\n", 321 | "\n", 322 | "- [ ] Scale features\n", 323 | "- [ ] Use **k-means++** + multiple **n_init**\n", 324 | "- [ ] Pick **K** via elbow/silhouette\n", 325 | "- [ ] Watch for empty clusters (reseed)\n", 326 | "\n", 327 | "**FAQs**\n", 328 | "\n", 329 | "- _“kNN is slow at prediction.”_ → Use KD-Tree/Ball-Tree/ANN, or pre-cluster and search within cluster.\n", 330 | "- _“My clusters look weird.”_ → Scale features; try different K; check for non-spherical structure (K-Means likes spherical blobs).\n", 331 | "- _“Do I need labels for K-Means?”_ → No — it’s unsupervised.\n", 332 | "\n", 333 | "**Practice**\n", 334 | "\n", 335 | "1. On a real dataset (e.g., Iris or a MNIST subset), tune **k** for kNN and report accuracy.\n", 336 | "2. Run K-Means with $K = 2,\\dots,10$; plot inertia and silhouette; choose $K$ and visualize clusters.\n", 337 | "3. 
Hybrid: cluster first (K-Means), then run kNN **within each cluster** — compare speed/accuracy.\n", 338 | "\n", 339 | "---\n", 340 | "\n", 341 | "## Summary\n", 342 | "\n", 343 | "kNN: predict using the answers from your **k nearest labeled neighbors**.\n", 344 | "\n", 345 | "K‑Means: place **K centroids** and alternate between assigning points to the nearest centroid and recomputing centroids until stable.\n", 346 | "\n", 347 | "You learned:\n", 348 | "\n", 349 | "- **kNN**: how neighbors + distances make predictions; how **k** and scaling affect results.\n", 350 | "- **K-Means**: how Lloyd’s algorithm works; how to pick **K** and get stable clusters.\n", 351 | "- Minimal code to try both methods today.\n", 352 | "\n", 353 | "**Next:** Support vector machines" 354 | ] 355 | } 356 | ], 357 | "metadata": { 358 | "jupytext": { 359 | "cell_metadata_filter": "-all", 360 | "main_language": "python", 361 | "notebook_metadata_filter": "-all" 362 | }, 363 | "language_info": { 364 | "name": "python" 365 | } 366 | }, 367 | "nbformat": 4, 368 | "nbformat_minor": 5 369 | } 370 | -------------------------------------------------------------------------------- /4. Deep Learning & Computer Vision/4.4. Convolutional Layers, Pooling Layers, Convolutional Neural Network/CNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3fc7e788-61f8-475b-87d6-78f2601e4b45", 6 | "metadata": {}, 7 | "source": [ 8 | "# Seeing the world - Convolutional Neural Networks\n", 9 | "\n", 10 | "* Gordon.H | SHSID Data Science Group \n" , 11 | "\n", 12 | "*Welcome back to the course, Junior ML Engineers !*\n", 13 | "\n", 14 | "Today we will be learning about the ultimate solution for image processing, **Convulutional Neural Netorks**\n", 15 | "\n", 16 | "---\n", 17 | "### Requirements \n", 18 | "* Understanding of the fundamentals of Machine learning\n", 19 | "* Basic Knowledge of Neural Networks\n", 20 | "* Basic Python and Numpy Library Usage\n", 21 | "\n", 22 | "---\n", 23 | "### 1. Problem with Images\n", 24 | "\n", 25 | "You have a small gray scale image of size 28*28 pixels\n", 26 | "* Size = 28*28 = 784\n", 27 | "* To feed it into a dense layer, we flatten into a vector of **784** numbers.\n", 28 | "* If first layer has 128 neurons we will need **100,352** weights\n", 29 | "\n", 30 | "This is a huge problem because:\n", 31 | "* It is inefficient with such large parameters to train\n", 32 | "* Spatial information is lost when we flatten the image\n", 33 | "\n", 34 | "Now, CNN's are designed to solve the problem with a smart approach\n", 35 | "\n", 36 | "```mermaid\n", 37 | "graph TD\n", 38 | " subgraph Dense Network Approach\n", 39 | " A[Input Image
28x28x1] --> B{Flatten};\n", 40 | " B --> C[1D Vector
784 neurons];\n", 41 | " C --> D[Dense Layer];\n", 42 | " style A fill:#f9f,stroke:#333,stroke-width:2px\n", 43 | " end\n", 44 | " \n", 45 | " subgraph CNN Approach\n", 46 | " E[Input Image
28x28x1] --> F{Convolutional Layer};\n", 47 | " F --> G[Feature Map
e.g., 26x26x32];\n", 48 | " style E fill:#9cf,stroke:#333,stroke-width:2px\n", 49 | " end\n", 50 | "\n", 51 | " A -- \"Loses spatial structure\" --> C\n", 52 | " E -- \"Preserves spatial structure\" --> G\n", 53 | "```\n", 54 | "As you see, CNN keeps the image's 2D structure, allowing it to learn from pixel neighborhoods.\n", 55 | "\n", 56 | "---\n", 57 | "\n", 58 | "### 2. The core of CNN : Convolutional Layer\n", 59 | "\n", 60 | "Instead of looking at a large image at once, CNN looks at it in small chunks. \n", 61 | "\n", 62 | "A **filter** is a small matrix of weights that the network learns. The process of sliding the filter and computing the output is called a **convolution**.\n", 63 | "\n", 64 | "Here's a mini-example of a 2x2 filter sliding over a 3x3 image to produce a 2x2 feature map.\n", 65 | "\n", 66 | "```\n", 67 | "Input Image (I) Filter (K) Feature Map (O)\n", 68 | "+---+---+---+ +---+---+ +---+---+\n", 69 | "| 1 | 5 | 2 | | 1 | 0 | | 9 | ? |\n", 70 | "+---+---+---+ +---+---+ +---+---+\n", 71 | "| 8 | 1 | 6 | | 1 | 0 | | ? | ? |\n", 72 | "+---+---+---+ +---+---+ +---+---+\n", 73 | "| 3 | 4 | 7 |\n", 74 | "+---+---+---+\n", 75 | "```\n", 76 | "To calculate the top-left value of the output: `(1*1) + (5*0) + (8*1) + (1*0) = 9`.\n", 77 | "\n", 78 | "#### The Mathematical Logic\n", 79 | "\n", 80 | "The mathematical formula for such operation, **cross-correlation**, looks like this:\n", 81 | "$$\n", 82 | "O_{i,j} = b + \\sum_{u=0}^{F-1} \\sum_{v=0}^{F-1} I_{i+u, j+v} \\cdot K_{u,v}\n", 83 | "$$\n", 84 | "\n", 85 | "Looks complicated right? Lets break it down:\n", 86 | "\n", 87 | "* $O_{i,j}$: The output value at row `i`, column `j` in the feature map.\n", 88 | "\n", 89 | "* $b$: A learnable **bias** term, which helps the filter make better predictions.\n", 90 | "\n", 91 | "* $\\sum$: The \"sum\" symbol. We sum over the filter's dimensions (`u` and `v`).\n", 92 | "\n", 93 | "* $I_{i+u, j+v}$: A pixel value from the **Input** image patch.\n", 94 | "\n", 95 | "* $K_{u,v}$: A weight from our **Kernel** (aka **the filter**).\n", 96 | "\n", 97 | "This formula is a precise mathematical formula for cross correlation in Machine Learning, in mathematics convolution is a bit different, it involves flipping the filter (both horizontally and vertically) before sliding it over the image. The reason for such difference is due to the special nature of neural networks, the values in the filter are learned during training, the network can simply learn the flipped version of the filter if it needs to. The cross correlation is easier to implement.\n", 98 | "\n", 99 | "#### Hyperparameters and Output Size\n", 100 | "The size of our output feature map depends on the hyperparameters we choose. 
The output width ($W_{out}$) and height ($H_{out}$) can be calculated with this formula:\n", 101 | "\n", 102 | "$$\n", 103 | "W_{out} = \\frac{W_{in} - F + 2P}{S} + 1\n", 104 | "$$\n", 105 | "$$\n", 106 | "H_{out} = \\frac{H_{in} - F + 2P}{S} + 1\n", 107 | "$$\n", 108 | "\n", 109 | "Where:\n", 110 | "* $W_{in}, H_{in}$: Input width and height.\n", 111 | "* $F$: Filter size.\n", 112 | "* $P$: Padding (number of pixels added to the border).\n", 113 | "* $S$: Stride (how many pixels the filter slides at a time).\n", 114 | "\n", 115 | "#### Example Code\n", 116 | " *Note: You can run the following code locally to try out convolutional layers!*\n", 117 | "\n", 118 | "\n", 119 | "\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "id": "73bcea1c-0c71-4fbe-9001-e63093a3fa01", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "# Remember to use pip to install numpy and matplotlib!\n", 130 | "import numpy as np\n", 131 | "import matplotlib.pyplot as plt\n", 132 | "\n", 133 | "# 1. Define the Input and Filter\n", 134 | "# A simple 6x6 grayscale image. \n", 135 | "# It has a sharp vertical edge down the middle.\n", 136 | "# (Low values = dark, high values = light)\n", 137 | "input_image = np.array([\n", 138 | " [10, 10, 10, 100, 100, 100],\n", 139 | " [10, 10, 10, 100, 100, 100],\n", 140 | " [10, 10, 10, 100, 100, 100],\n", 141 | " [10, 10, 10, 100, 100, 100],\n", 142 | " [10, 10, 10, 100, 100, 100],\n", 143 | " [10, 10, 10, 100, 100, 100]\n", 144 | "])\n", 145 | "\n", 146 | "# A 3x3 filter designed to detect vertical edges.\n", 147 | "# The positive values on the left and negative on the right\n", 148 | "# will give a high response when we move from dark to light.\n", 149 | "vertical_edge_filter = np.array([\n", 150 | " [ 1, 0, -1],\n", 151 | " [ 2, 0, -2], # This is a \"Sobel\" filter, common in image processing\n", 152 | " [ 1, 0, -1]\n", 153 | "])\n", 154 | "\n", 155 | "# 2. 
The Convolution Operation\n", 156 | "# Get dimensions (assuming no padding, stride=1)\n", 157 | "img_h, img_w = input_image.shape\n", 158 | "filter_h, filter_w = vertical_edge_filter.shape\n", 159 | "out_h = (img_h - filter_h) + 1\n", 160 | "out_w = (img_w - filter_w) + 1\n", 161 | "\n", 162 | "# Create an empty feature map to store the output\n", 163 | "output_feature_map = np.zeros((out_h, out_w))\n", 164 | "\n", 165 | "# Slide filter over the image\n", 166 | "for y in range(out_h):\n", 167 | " for x in range(out_w):\n", 168 | " # Get current patch of the image\n", 169 | " image_patch = input_image[y : y + filter_h, x : x + filter_w]\n", 170 | " \n", 171 | " # Perform element-wise multiplication and sum the result\n", 172 | " # This is the core of the convolution!\n", 173 | " convolution_sum = np.sum(image_patch * vertical_edge_filter)\n", 174 | " \n", 175 | " # Store result in the map\n", 176 | " output_feature_map[y, x] = convolution_sum \n", 177 | "# 3.Display Results\n", 178 | "print(\"--- Manual NumPy Convolution ---\\n\")\n", 179 | "print(\"Input Image:\\n\", input_image)\n", 180 | "print(\"\\nVertical Edge Filter:\\n\", vertical_edge_filter)\n", 181 | "print(\"\\nOutput Feature Map:\\n\", output_feature_map)\n", 182 | "print(\"\\nNotice the high values in the output where the vertical edge was detected!\")\n", 183 | "# Visualize the images\n", 184 | "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))\n", 185 | "ax1.imshow(input_image, cmap='gray')\n", 186 | "ax1.set_title(\"Original Image\")\n", 187 | "ax2.imshow(output_feature_map, cmap='gray')\n", 188 | "ax2.set_title(\"Feature Map (Edges)\")\n", 189 | "plt.show()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "id": "e70522ea-6596-413f-aadf-e37042a51b87", 195 | "metadata": {}, 196 | "source": [ 197 | "---\n", 198 | "### 3. Making it Robust: The Pooling layer\n", 199 | "\n", 200 | "A Pooling layer shrinks the feature map to make the network faster and robust. 
The most common type of pooling is **Max Pooling**.\n", 201 | "\n", 202 | "#### Visualizing Max Pooling\n", 203 | "\n", 204 | "Imagine a 2x2 Max Pooling operation on a 4x4 feature map.\n", 205 | "\n", 206 | "```\n", 207 | "Feature Map Pooled Output\n", 208 | "+---+---+---+---+ +---+---+\n", 209 | "|12 |20 | 30| 0 | max(12,20,8,12)--> |20 |\n", 210 | "+---+---+---+---+ +---+---+\n", 211 | "| 8 |12 | 2 | 0 | max(30,0,2,0)--> |30 |\n", 212 | "+---+---+---+---+ +---+---+\n", 213 | "|34 |70 | 37| 4 | max(34,70,112,100)-->|112|\n", 214 | "+---+---+---+---+ +---+---+\n", 215 | "|112|100| 25| 12| max(37,4,25,12)--> |37 |\n", 216 | "+---+---+---+---+ +---+---+\n", 217 | "```\n", 218 | "This keeps only the strongest signal, reducing the map size from 4x4 to 2x2.\n", 219 | "\n", 220 | "#### The Math Behind Pooling\n", 221 | "\n", 222 | "Here is the formula for Max Pooling:\n", 223 | "\n", 224 | "$$\n", 225 | "P_{i,j} = \\max_{0 \\le u < F_p, 0 \\le v < F_p} A_{i \\cdot S_p + u, j \\cdot S_p + v}\n", 226 | "$$\n", 227 | "\n", 228 | "This formally states: \"The output $P_{i,j}$ is the `max` value from the input feature map `A` within the pooling window.\"\n", 229 | "\n", 230 | "---\n", 231 | "### Putting it all together: A full CNN Architecture\n", 232 | "\n", 233 | "A real world CNN stacks up all these layers\n", 234 | "\n", 235 | "``` mermaid\n", 236 | "graph LR\n", 237 | " A[\"Input Image (28x28x1)\"] --> B[\"Conv2D Layer\\n32 filters, 3x3\\nOutput: 26x26x32\"]\n", 238 | " B --> C[\"MaxPooling2D\\n2x2 window\\nOutput: 13x13x32\"]\n", 239 | " C --> D[\"Conv2D Layer\\n64 filters, 3x3\\nOutput: 11x11x64\"]\n", 240 | " D --> E[\"MaxPooling2D\\n2x2 window\\nOutput: 5x5x64\"]\n", 241 | " E --> F[\"Flatten Layer\\nOutput: 1600 nodes\"]\n", 242 | " F --> G[\"Dense Layer\\n128 nodes\"]\n", 243 | " G --> H[\"Output Layer\\n10 nodes (Softmax)\"]\n", 244 | "\n", 245 | " subgraph Feature Extractor\n", 246 | " B; C; D; E;\n", 247 | " end\n", 248 | "\n", 249 | " subgraph Classifier\n", 250 | " F; G; H;\n", 251 | " end\n", 252 | "```\n", 253 | "The final layer uses a **Softmax** activation function to convert the network's scores into a probability distribution.\n", 254 | "\n", 255 | "The **Softmax** function for a specific output class `i` is:\n", 256 | "\n", 257 | "$$\n", 258 | "\\sigma(\\mathbf{z})_i = \\frac{e^{z_i}}{\\sum_{j=1}^{C} e^{z_j}}\n", 259 | "$$\n", 260 | "\n", 261 | "The formula gurantees that all output values are between 0 to 1 and sums up to be 1. This allows us to treat them as the model's confidence for each class.\n", 262 | "\n", 263 | "---\n", 264 | "\n", 265 | "### 5. Coding Example: Full Functional CNN Architecture\n", 266 | "\n", 267 | "The following example uses Pytorch and Matplotlib to create an example CNN architecture." 
268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "id": "a426d453-c908-4108-8af4-b896b9a8e95a", 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "import torch\n", 278 | "import torch.nn as nn\n", 279 | "import torch.optim as optim\n", 280 | "from torchvision import datasets, transforms\n", 281 | "from torch.utils.data import DataLoader\n", 282 | "import matplotlib.pyplot as plt\n", 283 | "\n", 284 | "# Define the CNN architecture\n", 285 | "class MNIST_CNN(nn.Module):\n", 286 | " def __init__(self):\n", 287 | " super(MNIST_CNN, self).__init__()\n", 288 | " # Feature extractor\n", 289 | " self.features = nn.Sequential(\n", 290 | " nn.Conv2d(1, 32, kernel_size=3), # 28x28x1 -> 26x26x32\n", 291 | " nn.ReLU(),\n", 292 | " nn.MaxPool2d(2), # 26x26x32 -> 13x13x32\n", 293 | " nn.Conv2d(32, 64, kernel_size=3), # 13x13x32 -> 11x11x64\n", 294 | " nn.ReLU(),\n", 295 | " nn.MaxPool2d(2) # 11x11x64 -> 5x5x64\n", 296 | " )\n", 297 | " \n", 298 | " # Classifier\n", 299 | " self.classifier = nn.Sequential(\n", 300 | " nn.Flatten(), # 5x5x64 -> 1600\n", 301 | " nn.Linear(5*5*64, 128), # 1600 -> 128\n", 302 | " nn.ReLU(),\n", 303 | " nn.Linear(128, 10) # 128 -> 10\n", 304 | " )\n", 305 | " \n", 306 | " def forward(self, x):\n", 307 | " x = self.features(x)\n", 308 | " x = self.classifier(x)\n", 309 | " return x\n", 310 | "\n", 311 | "# Initialize model, loss function, and optimizer\n", 312 | "model = MNIST_CNN()\n", 313 | "criterion = nn.CrossEntropyLoss()\n", 314 | "optimizer = optim.Adam(model.parameters(), lr=0.001)\n", 315 | "\n", 316 | "# Load MNIST data\n", 317 | "transform = transforms.Compose([\n", 318 | " transforms.ToTensor(),\n", 319 | " transforms.Normalize((0.1307,), (0.3081,))\n", 320 | "])\n", 321 | "\n", 322 | "train_data = datasets.MNIST('./data', train=True, download=True, transform=transform)\n", 323 | "test_data = datasets.MNIST('./data', train=False, transform=transform)\n", 324 | "\n", 325 | "train_loader = DataLoader(train_data, batch_size=64, shuffle=True)\n", 326 | "test_loader = DataLoader(test_data, batch_size=1000)\n", 327 | "\n", 328 | "# Training function\n", 329 | "def train(model, device, train_loader, optimizer, epoch):\n", 330 | " model.train()\n", 331 | " for batch_idx, (data, target) in enumerate(train_loader):\n", 332 | " optimizer.zero_grad()\n", 333 | " output = model(data)\n", 334 | " loss = criterion(output, target)\n", 335 | " loss.backward()\n", 336 | " optimizer.step()\n", 337 | " \n", 338 | " if batch_idx % 100 == 0:\n", 339 | " print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} '\n", 340 | " f'({100. * batch_idx / len(train_loader):.0f}%)]\\tLoss: {loss.item():.6f}')\n", 341 | "\n", 342 | "# Test function\n", 343 | "def test(model, device, test_loader):\n", 344 | " model.eval()\n", 345 | " test_loss = 0\n", 346 | " correct = 0\n", 347 | " with torch.no_grad():\n", 348 | " for data, target in test_loader:\n", 349 | " output = model(data)\n", 350 | " test_loss += criterion(output, target).item()\n", 351 | " pred = output.argmax(dim=1, keepdim=True)\n", 352 | " correct += pred.eq(target.view_as(pred)).sum().item()\n", 353 | " \n", 354 | " test_loss /= len(test_loader)\n", 355 | " accuracy = 100. 
* correct / len(test_loader.dataset)\n", 356 | " print(f'\\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} '\n", 357 | " f'({accuracy:.0f}%)\\n')\n", 358 | " return accuracy\n", 359 | "\n", 360 | "# Training loop\n", 361 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 362 | "model.to(device)\n", 363 | "\n", 364 | "accuracies = []\n", 365 | "for epoch in range(1, 6): # 5 epochs\n", 366 | " train(model, device, train_loader, optimizer, epoch)\n", 367 | " acc = test(model, device, test_loader)\n", 368 | " accuracies.append(acc)\n", 369 | "\n", 370 | "# Plot accuracy\n", 371 | "plt.plot(range(1, 6), accuracies)\n", 372 | "plt.title('Model Accuracy')\n", 373 | "plt.xlabel('Epoch')\n", 374 | "plt.ylabel('Accuracy (%)')\n", 375 | "plt.show()\n", 376 | "\n", 377 | "# Save model\n", 378 | "torch.save(model.state_dict(), 'mnist_cnn.pth')" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "id": "12cd21d0-27ed-41c3-81a0-6fb694aeab29", 384 | "metadata": {}, 385 | "source": [ 386 | "### Summary & Conclusion\n", 387 | "\n", 388 | "**Congratulations!** You have just completed your lesson on Convolutional Neural Networks!\n", 389 | "\n", 390 | "Throughout this lesson you have learned:\n", 391 | "\n", 392 | "* How **Convolutional Layers** use filters to find features, and you've seen the formal math behind the process.\n", 393 | "* How **Pooling Layers** make the network robust and efficient.\n", 394 | "* Understanded the **CNN** architecture and has saw the code to build it.\n", 395 | "\n", 396 | "In the next lesson, we will learn about video data augmentation." 397 | ] 398 | } 399 | ], 400 | "metadata": { 401 | "kernelspec": { 402 | "display_name": "Python 3 (ipykernel)", 403 | "language": "python", 404 | "name": "python3" 405 | }, 406 | "language_info": { 407 | "codemirror_mode": { 408 | "name": "ipython", 409 | "version": 3 410 | }, 411 | "file_extension": ".py", 412 | "mimetype": "text/x-python", 413 | "name": "python", 414 | "nbconvert_exporter": "python", 415 | "pygments_lexer": "ipython3", 416 | "version": "3.12.7" 417 | } 418 | }, 419 | "nbformat": 4, 420 | "nbformat_minor": 5 421 | } 422 | --------------------------------------------------------------------------------