├── .gitignore ├── 01-deep-neural-networks ├── 01-dnn │ ├── 01_dnn.ipynb │ ├── backprop.pdf │ └── clean_backprop.py ├── 02-cnn │ ├── cnn.ipynb │ └── cnn_jax.py ├── 03-alex-net │ └── alexnet.ipynb ├── 04-u-net │ ├── unet.ipynb │ └── unet.py ├── 05-vis-cnn │ └── visulisation_cnn.ipynb └── 06-yolo │ └── yolo.ipynb ├── 02-optimization-and-regularization ├── 01-weight-decay │ └── weightdecay.ipynb ├── 02-relu │ └── relu.ipynb ├── 03-residuals │ └── resnet.ipynb ├── 04-dropout │ └── dropout.ipynb ├── 05-batch-norm │ └── Batch-norm.ipynb ├── 06-layer-norm │ └── layernorm.ipynb ├── 07-gelu │ ├── gelu.ipynb │ └── probability.ipynb ├── 08-adam │ ├── adam.ipynb │ └── loss.ipynb └── 09-early-stopping │ └── early-stopping.ipynb ├── 03-sequence-modeling ├── 01-rnn │ ├── rnn-data │ │ └── Nietzsche_Articles.txt │ ├── rnn.ipynb │ └── simplernn.py ├── 02-lstm │ └── lstm.ipynb ├── 03-learning-to-forget │ └── learning.ipynb ├── 04-word2vec │ └── word2vec.ipynb ├── 05-seq2seq │ └── seq2seq.ipynb ├── 06-attention │ └── attention.ipynb └── 07-mixture-of-experts │ └── mixture-of-experts.ipynb ├── 04-transformers ├── 01-transformer │ └── transformer.ipynb ├── 02-bert │ └── bert.ipynb ├── 03-t5 │ └── t5.ipynb ├── 04-gpt │ └── gpt.ipynb ├── 05-lora │ └── lora.ipynb ├── 06-rlhf │ └── rlhf.ipynb └── 07-vision-transformer │ └── vit.ipynb ├── 05-image-generation ├── 01-gan │ └── gan.ipynb ├── 02-vae │ └── vae.ipynb ├── 03-diffusion │ └── sd.ipynb ├── 04-clip │ └── clip.ipynb └── 05-dall-e │ └── dalle.ipynb ├── README.md └── images ├── .gitkeep ├── 3-11.png ├── 3-12-2.png ├── 3-7.png ├── Cowboy-Bebop-Quotes1.jpeg ├── GRU.png ├── GRulCXpaUAAm5Up.jpeg ├── LKE.png ├── RNN-vs-FNN-660.png ├── T5.jpg ├── T5_1.jpg ├── add-1.drawio.png ├── alexnet-arc.png ├── batcnorm.jpeg ├── bert.jpg ├── bot-res.png ├── bottleneck.png ├── cnn.jpg ├── convolution-2.gif ├── decoder.png ├── dropout.png ├── dropoutex.jpg ├── dropoutt.png ├── earlystopping.jpg ├── encoder.png ├── f_pdf.jpg ├── for_revered_guest.png ├── imagent.png ├── imagnet-win.png ├── last-lstm.png ├── lstm-2.png ├── lstm-add.png ├── lstm-core.png ├── lstm-input.png ├── lstm.png ├── maxpool.gif ├── mul.drawio.png ├── newunet.png ├── nor-res.png ├── overlapping.png ├── pos-cal.png ├── pos-emb.png ├── probs.jpg ├── relu.png ├── reluu.png ├── res-arc.png ├── resnet.jpg ├── rnn_arc.png ├── self-feedback-loop.jpg ├── skip.png ├── transformer.jpeg ├── transistor.png ├── trig.png ├── typesofrnn.png ├── unet.png ├── unetimg.png ├── unetsd.png ├── vis-4.png ├── vis-cnn.png ├── vis_0.png ├── vis_1.png ├── vis_2.png ├── word2vec-embed.png └── word2vec.png /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .ipynb_checkpoints 3 | datasets 4 | .DS_Store 5 | .vscode 6 | data -------------------------------------------------------------------------------- /01-deep-neural-networks/01-dnn/backprop.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/01-deep-neural-networks/01-dnn/backprop.pdf -------------------------------------------------------------------------------- /01-deep-neural-networks/01-dnn/clean_backprop.py: -------------------------------------------------------------------------------- 1 | #rewrite the backprop from ipynb 2 | 3 | class Tensor: 4 | 5 | def __init__(self,data,children=()): 6 | 7 | self.data = data 8 | self._prev = set(children) 9 | self.grad = 0.0 10 | self._backward = lambda : None 11 | 12 | 13 | 14 | pass -------------------------------------------------------------------------------- /01-deep-neural-networks/02-cnn/cnn_jax.py: -------------------------------------------------------------------------------- 1 | #import 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import pandas as pd 7 | from sklearn.metrics import ConfusionMatrixDisplay 8 | 9 | import torch 10 | import torchvision 11 | import torchvision.transforms as transforms 12 | from torch.utils import data 13 | 14 | import jax 15 | import jax.numpy as jnp 16 | import flax 17 | import flax.linen as nn 18 | import optax 19 | from jax.tree_util import tree_map 20 | from flax.training import train_state 21 | 22 | 23 | # data 24 | 25 | 26 | IMAGE_SIZE = 32 27 | BATCH_SIZE = 128 28 | DATA_MEANS = np.array([0.49139968, 0.48215841, 0.44653091]) 29 | DATA_STD = np.array([0.24703223, 0.24348513, 0.26158784]) 30 | CROP_SCALES = (0.8, 1.0) 31 | CROP_RATIO = (0.9, 1.1) 32 | SEED = 42 33 | 34 | 35 | 36 | # CNN 37 | 38 | class CNN(nn.Module): 39 | 40 | 41 | @nn.compact 42 | def __call__(self, x): 43 | x = nn.Conv(features=32, kernel_size=(3, 3))(x) 44 | x = nn.relu(x) 45 | x = nn.avg_pool(x, window_shape=(2, 2), strides=(2, 2)) 46 | x = nn.Conv(features=64, kernel_size=(3, 3))(x) 47 | x = nn.relu(x) 48 | x = nn.avg_pool(x, window_shape=(2, 2), strides=(2, 2)) 49 | x = x.reshape((x.shape[0], -1)) # flatten 50 | x = nn.Dense(features=256)(x) 51 | x = nn.relu(x) 52 | x = nn.Dense(features=10)(x) 53 | return x 54 | 55 | 56 | 57 | def image_to_numpy(img): 58 | img = np.array(img, dtype=np.float32) 59 | img = (img / 255. - DATA_MEANS) / DATA_STD 60 | return img 61 | 62 | 63 | def numpy_collate(batch): 64 | if isinstance(batch[0], np.ndarray): 65 | return np.stack(batch) 66 | elif isinstance(batch[0], (tuple, list)): 67 | transposed = zip(*batch) 68 | return [numpy_collate(samples) for samples in transposed] 69 | else: 70 | return np.array(batch) 71 | 72 | classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') 73 | test_transform = image_to_numpy 74 | train_transform = transforms.Compose([ 75 | transforms.RandomHorizontalFlip(), 76 | transforms.RandomResizedCrop((IMAGE_SIZE, IMAGE_SIZE), scale=CROP_SCALES, ratio=CROP_RATIO), 77 | image_to_numpy 78 | ]) 79 | 80 | # Validation set should not use train_transform. 81 | train_dataset = torchvision.datasets.CIFAR10('data', train=True, transform=train_transform, download=True) 82 | val_dataset = torchvision.datasets.CIFAR10('data', train=True, transform=test_transform, download=True) 83 | train_set, _ = torch.utils.data.random_split(train_dataset, [45000, 5000], generator=torch.Generator().manual_seed(SEED)) 84 | _, val_set = torch.utils.data.random_split(val_dataset, [45000, 5000], generator=torch.Generator().manual_seed(SEED)) 85 | test_set = torchvision.datasets.CIFAR10('data', train=False, transform=test_transform, download=True) 86 | 87 | train_data_loader = torch.utils.data.DataLoader( 88 | train_set, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=2, persistent_workers=True, collate_fn=numpy_collate, 89 | ) 90 | val_data_loader = torch.utils.data.DataLoader( 91 | val_set, batch_size=BATCH_SIZE, shuffle=False, drop_last=False, num_workers=2, persistent_workers=True, collate_fn=numpy_collate, 92 | ) 93 | test_data_loader = torch.utils.data.DataLoader( 94 | test_set, batch_size=BATCH_SIZE, shuffle=False, drop_last=False, num_workers=2, persistent_workers=True, collate_fn=numpy_collate, 95 | ) 96 | 97 | 98 | #model init 99 | 100 | model = CNN() 101 | 102 | optimizer = optax.adam(learning_rate=1e-4) 103 | 104 | rng, inp_rng, init_rng = jax.random.split(jax.random.PRNGKey(SEED), 3) 105 | params = model.init(jax.random.PRNGKey(SEED), 106 | jax.random.normal(inp_rng, (BATCH_SIZE, 32, 32, 3))) 107 | 108 | model_state = train_state.TrainState.create(apply_fn=model.apply, 109 | params=params, 110 | tx=optimizer) 111 | 112 | 113 | # training 114 | 115 | 116 | @jax.jit 117 | def apply_model(state, images, labels): 118 | 119 | 120 | def loss_fn(params): 121 | logits = state.apply_fn(params, images) 122 | one_hot = jax.nn.one_hot(labels, logits.shape[1]) 123 | loss = jnp.mean(optax.softmax_cross_entropy(logits=logits, labels=one_hot)) 124 | return loss, logits 125 | 126 | grad_fn = jax.value_and_grad(loss_fn, has_aux=True) 127 | (loss, logits), grads = grad_fn(state.params) 128 | accuracy = jnp.mean(jnp.argmax(logits, -1) == labels) 129 | return grads, loss, accuracy 130 | 131 | 132 | @jax.jit 133 | def update_model(state, grads): 134 | return state.apply_gradients(grads=grads) 135 | 136 | 137 | def train_epoch(state, data_loader): 138 | 139 | 140 | epoch_loss = [] 141 | epoch_accuracy = [] 142 | 143 | for batch in data_loader: 144 | batch_images, batch_labels = batch 145 | grads, loss, accuracy = apply_model(state, batch_images, batch_labels) 146 | state = update_model(state, grads) 147 | epoch_loss.append(loss) 148 | epoch_accuracy.append(accuracy) 149 | train_loss = np.mean(epoch_loss) 150 | train_accuracy = np.mean(epoch_accuracy) 151 | return state, train_loss, train_accuracy 152 | 153 | 154 | def train_model(state, train_data_loader, num_epochs): 155 | # Training loop 156 | for epoch in range(num_epochs): 157 | state, train_loss, train_accuracy = train_epoch(state, train_data_loader) 158 | print(f'epoch: {epoch:03d}, train loss: {train_loss:.4f}, train accuracy: {train_accuracy:.4f}') 159 | return state 160 | 161 | 162 | trained_model_state = train_model(model_state, train_data_loader, num_epochs=100) 163 | 164 | #testing 165 | 166 | test_loss = [] 167 | test_accuracy = [] 168 | 169 | for batch in test_data_loader: 170 | batch_images, batch_labels = batch 171 | _, loss, accuracy = apply_model(trained_model_state, batch_images, batch_labels) 172 | test_loss.append(loss) 173 | test_accuracy.append(accuracy) 174 | 175 | print(f'loss: {np.mean(test_loss):.4f}, accuracy: {np.mean(test_accuracy):.4f}') -------------------------------------------------------------------------------- /01-deep-neural-networks/04-u-net/unet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | class DoubleConv(nn.Module): 5 | def __init__(self, in_channels, out_channels): 6 | super(DoubleConv, self).__init__() 7 | self.conv = nn.Sequential( 8 | nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1), 9 | nn.ReLU(inplace=True), 10 | nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1), 11 | nn.ReLU(inplace=True), 12 | ) 13 | 14 | def forward(self, x): 15 | return self.conv(x) 16 | 17 | class DownSample(nn.Module): 18 | def __init__(self, in_channels, out_channels): 19 | super(DownSample, self).__init__() 20 | self.conv = DoubleConv(in_channels, out_channels) 21 | self.pool = nn.MaxPool2d(kernel_size=2, stride=2) 22 | 23 | def forward(self, x): 24 | down = self.conv(x) 25 | pooled = self.pool(down) 26 | return down, pooled 27 | 28 | class UpSample(nn.Module): 29 | def __init__(self, in_channels, out_channels): 30 | super(UpSample, self).__init__() 31 | self.up_conv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=2, stride=2) 32 | self.conv = DoubleConv(in_channels, out_channels) 33 | 34 | def forward(self, x1, x2): 35 | x1 = self.up_conv(x1) 36 | diffY = x2.size()[2] - x1.size()[2] 37 | diffX = x2.size()[3] - x1.size()[3] 38 | x1 = nn.functional.pad(x1, [diffX // 2, diffX - diffX // 2, diffY // 2, diffY - diffY // 2]) 39 | x = torch.cat([x2, x1], dim=1) 40 | return self.conv(x) 41 | 42 | class UNet(nn.Module): 43 | def __init__(self, in_channels, out_channels): 44 | super(UNet, self).__init__() 45 | self.down_conv_1 = DownSample(in_channels, 64) 46 | self.down_conv_2 = DownSample(64, 128) 47 | self.down_conv_3 = DownSample(128, 256) 48 | self.down_conv_4 = DownSample(256, 512) 49 | 50 | self.bottle_neck = DoubleConv(512, 1024) 51 | 52 | self.up_conv_1 = UpSample(1024, 512) 53 | self.up_conv_2 = UpSample(512, 256) 54 | self.up_conv_3 = UpSample(256, 128) 55 | self.up_conv_4 = UpSample(128, 64) 56 | 57 | self.out = nn.Conv2d(64, out_channels, kernel_size=1) 58 | 59 | def forward(self, x): 60 | down_1, p1 = self.down_conv_1(x) 61 | down_2, p2 = self.down_conv_2(p1) 62 | down_3, p3 = self.down_conv_3(p2) 63 | down_4, p4 = self.down_conv_4(p3) 64 | 65 | b = self.bottle_neck(p4) 66 | 67 | up_1 = self.up_conv_1(b, down_4) 68 | up_2 = self.up_conv_2(up_1, down_3) 69 | up_3 = self.up_conv_3(up_2, down_2) 70 | up_4 = self.up_conv_4(up_3, down_1) 71 | 72 | out = self.out(up_4) 73 | return out 74 | 75 | -------------------------------------------------------------------------------- /01-deep-neural-networks/06-yolo/yolo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### YOLO " 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [] 16 | } 17 | ], 18 | "metadata": { 19 | "language_info": { 20 | "name": "python" 21 | } 22 | }, 23 | "nbformat": 4, 24 | "nbformat_minor": 2 25 | } 26 | -------------------------------------------------------------------------------- /02-optimization-and-regularization/06-layer-norm/layernorm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Layer-Norm !!! (will come back to this at the time of transformer for now sayonara :) " 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [] 16 | } 17 | ], 18 | "metadata": { 19 | "language_info": { 20 | "name": "python" 21 | } 22 | }, 23 | "nbformat": 4, 24 | "nbformat_minor": 2 25 | } 26 | -------------------------------------------------------------------------------- /02-optimization-and-regularization/07-gelu/gelu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "> Before you get into gelu you should know what is CDF ?\n", 8 | "\n", 9 | "- after learning **CDF** i think you can skip it if you want. \n", 10 | "- in short cdf helps gelu to skip negative values.\n", 11 | "\n", 12 | "- -> [text](probability.ipynb) (Link to CDF)" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "> Lest understand the gelu\n", 20 | "\n", 21 | "- gelu (gaussian error linear unit), an activation function used in neural networks.\n", 22 | "\n", 23 | "- gelu makes a hard decision by outputting zero for all negative inputs and passing through positive inputs as they are.\n", 24 | "\n", 25 | "- gelu, on the other hand, provides a probabilistic gating mechanism by using the Gaussian CDF to weight the inputs. This gives a smoother transition for input values, particularly around zero.\n", 26 | "\n", 27 | "- relu and leaky relu, which have a sharp transition at x = 0 , GELU transitions smoothly, which can lead to better performance.\n", 28 | "\n", 29 | "- gelu considers the input’s probability of being positive, which integrates both the input value and its likelihood of activation.\n", 30 | "\n", 31 | "> Why Use the Gaussian CDF?\n", 32 | "\n", 33 | "- inputs often follow a normal distribution.\n", 34 | "\n", 35 | "- gelu used this distribution of input \n", 36 | "\n", 37 | "- this can result in better performance across various tasks, as the gelu function smoothly blends in.\n", 38 | "\n", 39 | "> computation \n", 40 | "\n", 41 | "- ReLU is very simple and computationally efficient.\n", 42 | "\n", 43 | " - GELU involves more complex calculations due to the Gaussian CDF, but the approximation helps mitigate this complexity.\n", 44 | "\n", 45 | "> math (approximation )\n", 46 | "\n", 47 | "- for computational efficiency, the GELU function can be approximated as:\n", 48 | "\n", 49 | "$$ \\text{GELU}(x) \\approx 0.5x \\left(1 + \\tanh\\left(\\sqrt{\\frac{2}{\\pi}} \\left(x + 0.044715x^3\\right)\\right)\\right) $$\n", 50 | "\n", 51 | "> uses \n", 52 | "\n", 53 | "- used in bert and gpts " 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 24, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "image/png": "", 64 | "text/plain": [ 65 | "
" 66 | ] 67 | }, 68 | "metadata": {}, 69 | "output_type": "display_data" 70 | } 71 | ], 72 | "source": [ 73 | "import torch\n", 74 | "import torch.nn.functional as F\n", 75 | "import matplotlib.pyplot as plt\n", 76 | "import numpy as np\n", 77 | "\n", 78 | "\n", 79 | "x = torch.linspace(-10, 10, 100)\n", 80 | "\n", 81 | "\n", 82 | "relu_output = F.relu(x) # relu\n", 83 | "gelu_output = F.gelu(x) # gelu\n", 84 | "\n", 85 | "\n", 86 | "x_np = x.numpy()\n", 87 | "relu_output_np = relu_output.numpy()\n", 88 | "gelu_output_np = gelu_output.numpy()\n", 89 | "\n", 90 | "# Plot the results\n", 91 | "plt.figure(figsize=(12, 6))\n", 92 | "\n", 93 | "# Plot ReLU\n", 94 | "plt.subplot(1, 2, 1)\n", 95 | "plt.plot(x_np, relu_output_np, label='ReLU')\n", 96 | "plt.title('ReLU Activation Function')\n", 97 | "plt.xlabel('Input')\n", 98 | "plt.ylabel('Output')\n", 99 | "plt.grid(True)\n", 100 | "plt.legend()\n", 101 | "\n", 102 | "# Plot GELU\n", 103 | "plt.subplot(1, 2, 2)\n", 104 | "plt.plot(x_np, gelu_output_np, label='GELU', color='orange')\n", 105 | "plt.title('GELU Activation Function')\n", 106 | "plt.xlabel('Input')\n", 107 | "plt.ylabel('Output')\n", 108 | "plt.grid(True)\n", 109 | "plt.legend()\n", 110 | "\n", 111 | "plt.tight_layout()\n", 112 | "plt.show()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "#### as you can see the smooth curve around zero in gelu plot that helps gelu in better performance !" 120 | ] 121 | } 122 | ], 123 | "metadata": { 124 | "kernelspec": { 125 | "display_name": "Python 3", 126 | "language": "python", 127 | "name": "python3" 128 | }, 129 | "language_info": { 130 | "name": "python", 131 | "version": "3.9.6" 132 | } 133 | }, 134 | "nbformat": 4, 135 | "nbformat_minor": 2 136 | } 137 | -------------------------------------------------------------------------------- /02-optimization-and-regularization/08-adam/loss.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Loss Function (will be talking in terms of probability) (working on this)\n", 8 | "\n", 9 | "> **What is a loss fucntion ?**\n", 10 | "\n", 11 | "- while training the model the loss fucntion is used to calculate the mismatch between actual vlaues and predicted values.\n", 12 | "\n", 13 | "- it requires to parameters to calulate it Y_hat(predicted) and y(actual value).\n", 14 | "\n", 15 | "- the goal is to find paramaters values that minimize the loss.\n", 16 | "\n", 17 | "> **Maximum Likelihood**\n", 18 | "\n", 19 | "- here we compute the distribution over a outputs.(y_hat)\n", 20 | "\n", 21 | "- instead of guessing single outcomes the model predicts a range of possible outcomes and how likely each one is.\n", 22 | "\n", 23 | "- ex. if we guess someone's height it 6ft but we are guessing so it might be less or more. So we represent this using bell curve.\n", 24 | "\n", 25 | " - to plot this we need mean and variance.\n", 26 | "\n", 27 | "> **Maximum Likelihood Criterion**\n", 28 | "\n", 29 | "- Maximum Likeihood Estimation : we want to get the right of values for our model's parameter in order to reduce the gap between our prediceted output and actual outputs.\n", 30 | "\n", 31 | "$$\n", 32 | "\\hat{\\phi} = \\argmax_{\\phi} \\left[ \\prod_{i=1}^{n} \\Pr(y_i \\mid x_i, \\phi) \\right]\n", 33 | "$$\n", 34 | "\n", 35 | "**Break Down of above equaton**\n", 36 | "\n", 37 | "1. **Model Prediction (Pr(y_i | x_i, φ)):**\n", 38 | " - Pr(y_i | x_i, φ) represents the probability that our model predicts the actual outcome (y_i) given the input data (x_i) and the model parameters (φ).\n", 39 | "\n", 40 | " - Think of this as the model saying, \"Based on my current parameters, there's an X% chance that this specific input (x_i) will result in this specific output (y_i).\"\n", 41 | "\n", 42 | "2. **Product of Probabilities:**\n", 43 | " - ∏(i=1 to n) means we multiply these probabilities together for all the data points (from 1 to n).\n", 44 | "\n", 45 | " - This gives us a combined probability that shows how likely it is that all our predictions match the actual data.\n", 46 | "\n", 47 | "3. **Maximizing the Combined Probability:**\n", 48 | " - argmax_φ means we want to find the specific parameters (φ) that make this combined probability as large as possible.\n", 49 | "\n", 50 | " - inshort, we're adjusting our model's paramters to [maximize] the chances that it predicts the actual outcomes correctly.\n", 51 | "\n", 52 | "> **Maximum log Likelihood**\n", 53 | "\n", 54 | "- MLC uses the product of the ouputs prob and it can genrate the very small values.\n", 55 | "\n", 56 | "- and aslo it is not a numericaly stable.\n", 57 | "\n", 58 | "**Example**\n", 59 | "\n", 60 | "If you flip a coin 10 times and each flip has a probability of 0.5 for heads, the product of probabilities for all heads is:\n", 61 | "\n", 62 | "$$\n", 63 | "0.5 \\times 0.5 \\times 0.5 \\times \\ldots = (0.5)^{10} = 0.0009765625\n", 64 | "$$\n", 65 | "\n", 66 | "This is a very small number, and if you had more flips, it would get even smaller.\n", 67 | "\n", 68 | "Taking the log of each probability and summing them:\n", 69 | "\n", 70 | "$$\n", 71 | "\\log(0.5) + \\log(0.5) + \\log(0.5) + \\ldots = 10 \\times \\log(0.5) \\approx 10 \\times (-0.693) = -6.93\n", 72 | "$$\n", 73 | "\n", 74 | "This sum is a manageable number and avoids the numerical issues of multiplying many small numbers.\n", 75 | "\n", 76 | "- This is why using log-liklihood is more practical where we take the sum and not product.\n", 77 | "\n", 78 | "> **Negative log Likelihood**\n", 79 | "\n", 80 | "- this reframed the problem as minimizes prolem.\n", 81 | "\n", 82 | "- most of optimizestion algorithms are the build to solve the minimizestion problem.\n", 83 | "\n", 84 | "- this changes our goal from finding the maximum value of the log-likelihood to finding the minimum value of the negative log-likelihood.\n", 85 | "\n", 86 | "\n", 87 | "- The negative log-likelihood is calculated as:\n", 88 | "\n", 89 | "$$\n", 90 | "-\\sum_{i=1}^{n} \\log \\Pr(y_i \\mid x_i, \\phi)\n", 91 | "$$\n", 92 | "\n", 93 | "- To find the parameter values (\\(\\phi\\)) that minimize the negative log-likelihood, we use:\n", 94 | "\n", 95 | "$$\n", 96 | "\\hat{\\phi} = \\argmin_{\\phi} \\left[ -\\sum_{i=1}^{n} \\log \\Pr(y_i \\mid x_i, \\phi) \\right]\n", 97 | "$$\n", 98 | "\n", 99 | "- comapare to **Maximum log Likelihood** its now minizing problem with neg.\n", 100 | "\n", 101 | "- this is the final formula of **LOSS.**\n", 102 | "\n", 103 | "> **Inference**\n", 104 | "\n", 105 | "- instead of the network directly predicting a specific value (y), it now predicts a range of possible values with different likelihoods. This is called a probability distribution.\n", 106 | "\n", 107 | "- During inference, we need a single \"best guess\" answer. We choose the single value from the distribution that has the highest probability.\n", 108 | "\n", 109 | "- **finding the Best Guess:**\n", 110 | "- We use the argmax operation to find this best guess:\n", 111 | "$$\\hat{y} = \\arg\\max_y \\text{Pr}(y|f[x,\\hat{\\phi}])$$\n", 112 | "- This means \"find the value of $y$ that gives the highest probability, given our model's output.\"\n", 113 | "\n", 114 | "> **IN-short**\n", 115 | "\n", 116 | "- Log-Likelihood: A measure of model fit that we want to maximize.\n", 117 | "- Negative Log-Likelihood: The negative of the log-likelihood, which we want to minimize.\n", 118 | " \n", 119 | "> **Recipe for constructing loss function**\n", 120 | "\n", 121 | "- This recipe outlines the process of creating loss functions for training probabilistic \n", 122 | "neural networks using the maximum likelihood approach.\n", 123 | "\n", 124 | "\n", 125 | "**1. Choose a Suitable Probability Distribution**\n", 126 | "\n", 127 | "$$\\text{Pr}(y|\\theta)$$\n", 128 | "\n", 129 | "- Choose a probability distribution that's appropriate for your prediction task.\n", 130 | "- This distribution is defined over the domain of the predictions $y$.\n", 131 | "$\\theta$ represents the parameters of this distribution.\n", 132 | "\n", 133 | "- Examples:\n", 134 | "\n", 135 | "- For regression tasks, you might choose a Normal (Gaussian) distribution.\n", 136 | "\n", 137 | "- For binary classification, you might choose a Bernoulli distribution.\n", 138 | "- For multi-class classification, you might choose a Categorical distribution.\n", 139 | "\n", 140 | "**2. Set the Machine Learning Model to Predict Distribution Parameters**\n", 141 | "\n", 142 | "$$\\theta = f[x, \\phi]$$\n", 143 | "$$\\text{Pr}(y|\\theta) = \\text{Pr}(y|f[x, \\phi])$$\n", 144 | "\n", 145 | "- Your neural network $f[x, \\phi]$ is set to predict the parameters $\\theta$ of the chosen distribution.\n", 146 | "$x$ is the input to the network.\n", 147 | "$\\phi$ represents the parameters of the neural network itself.\n", 148 | "\n", 149 | "- Example:\n", 150 | "\n", 151 | "- For a Normal distribution, the network might output the mean $\\mu$ and standard deviation $\\sigma$.\n", 152 | "\n", 153 | "**3. Train the Model by Minimizing Negative Log-Likelihood**\n", 154 | "$$\\hat{\\phi} = \\arg\\min_{\\phi} L[\\phi] = \\arg\\min_{\\phi} -\\sum_{i=1}^N \\log \\text{Pr}(y_i|f[x_i, \\phi])$$\n", 155 | "\n", 156 | "- We want to find the network parameters $\\hat{\\phi}$ that minimize the negative log-likelihood loss function.\n", 157 | "\n", 158 | "This is done over the entire training dataset of pairs ${x_i, y_i}$.\n", 159 | "$N$ is the number of training examples.\n", 160 | "\n", 161 | "- **Why negative log-likelihood?**\n", 162 | "\n", 163 | "- Using log transforms the product of probabilities into a sum, which is computationally easier to handle.\n", 164 | "\n", 165 | "- The negative sign turns the maximization problem into a minimization problem, \n", 166 | "which is conventionally used in optimization algorithms.\n", 167 | "\n", 168 | "**4. Perform Inference**\n", 169 | "\n", 170 | "- For a new test example $x$, you have two options:\n", 171 | "\n", 172 | "- Return the full distribution: $\\text{Pr}(y|f[x,\\hat{\\phi}])$\n", 173 | "\n", 174 | "- This gives you the complete probability distribution over possible outputs.\n", 175 | "\n", 176 | "\n", 177 | "- Return the maximum of this distribution:\n", 178 | "$$\\hat{y} = \\arg\\max_y \\text{Pr}(y|f[x,\\hat{\\phi}])$$\n", 179 | "\n", 180 | "- This gives you a single point estimate, which is often more practical for decision-making.\n", 181 | "\n", 182 | "\n", 183 | "> **IN-Short**\n", 184 | "\n", 185 | "- This approach allows the model to learn to predict not just a single value, but a full probability distribution over possible outputs.\n", 186 | "\n", 187 | "- By minimizing the negative log-likelihood, we're effectively maximizing the probability of the observed data given our model.\n", 188 | "- This method naturally handles uncertainty: the predicted distribution will be wider (more uncertain) for inputs the model is less confident about.\n", 189 | "- The choice of distribution in step 1 is crucial and should reflect the nature of your data and task.\n", 190 | "- This recipe forms the basis for many modern machine learning approaches.\n", 191 | "\n", 192 | "\n", 193 | "> **Ex. 1: Univariate regression**\n", 194 | "\n", 195 | "\n", 196 | "\n", 197 | "\n", 198 | "\n", 199 | "> **Ex. 2: Binary Classification**\n", 200 | "\n", 201 | "> **Ex. 3: Multiclass classification**\n", 202 | "\n", 203 | "> **Cross Entropy Loss**" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 28, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "[1, 2, 2]\n" 216 | ] 217 | }, 218 | { 219 | "data": { 220 | "text/plain": [ 221 | "2" 222 | ] 223 | }, 224 | "execution_count": 28, 225 | "metadata": {}, 226 | "output_type": "execute_result" 227 | } 228 | ], 229 | "source": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [] 237 | } 238 | ], 239 | "metadata": { 240 | "kernelspec": { 241 | "display_name": "Python 3", 242 | "language": "python", 243 | "name": "python3" 244 | }, 245 | "language_info": { 246 | "codemirror_mode": { 247 | "name": "ipython", 248 | "version": 3 249 | }, 250 | "file_extension": ".py", 251 | "mimetype": "text/x-python", 252 | "name": "python", 253 | "nbconvert_exporter": "python", 254 | "pygments_lexer": "ipython3", 255 | "version": "3.9.6" 256 | } 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 2 260 | } 261 | -------------------------------------------------------------------------------- /03-sequence-modeling/01-rnn/simplernn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | 6 | 7 | class RNNModel(nn.Module): 8 | 9 | 10 | def __init__(self, rnn_layer, **kwargs): 11 | super(RNNModel, self).__init__(**kwargs) 12 | self.rnn = rnn_layer 13 | if self.rnn.bidirectional: 14 | self.num_directions = 2 15 | else: 16 | self.num_directions = 1 17 | self.linear = nn.Linear( 18 | self.num_directions * self.rnn.hidden_size, self.rnn.input_size) 19 | 20 | # forward 21 | 22 | def forward(self, inputs, state): 23 | X = F.one_hot(inputs.T.long(), self.rnn.input_size) 24 | X = X.to(torch.float32) 25 | Y, state = self.rnn(X, state) 26 | output = self.linear(Y.reshape((-1, Y.shape[-1]))) 27 | return output, state # Change begin_state to state 28 | 29 | 30 | #begin state 31 | 32 | def begin_state(self, device, batch_size=1): 33 | tensor = torch.zeros((self.num_directions * self.rnn.num_layers, 34 | batch_size, self.rnn.hidden_size), 35 | device=device) 36 | if isinstance(self.rnn, nn.LSTM): 37 | return (tensor, tensor) 38 | else: 39 | return tensor -------------------------------------------------------------------------------- /03-sequence-modeling/03-learning-to-forget/learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# backgroung \n", 8 | "\n", 9 | "rnn suffers from the vanishing or exploding gradients problem and some them porblem solved by std lstm to some extend.\n", 10 | "lstms creats the bridge which help in solving the context problems through the cells.\n", 11 | "\n", 12 | "but lstms may suffers in input seq get too big. there is chance the magnitude of thst bridge carries may lead overflow and eventually failed. the proposed solution is an adaptive \"forget gate\" that allows an lstm cell to learn to reset itself at appropriate times, releasing internal resources.\n", 13 | "\n", 14 | "## standard lstm\n", 15 | "\n", 16 | "the basic unit in the hidden layer of an lstm network is the memory block, which contains:\n", 17 | "- one or more memory cells\n", 18 | "- a pair of adaptive, multiplicative gating units (input and output gates)\n", 19 | "\n", 20 | "each memory cell has a recurrently self-connected linear unit called the \"constant error carousel\" (cec), which helps prevent the vanishing gradient problem. the cell state, denoted as $s_c$, is updated as follows:\n", 21 | "\n", 22 | "$$\n", 23 | "s_{c}(t) = s_{c}(t-1) + y_{in}(t) \\cdot g(\\text{net}_{c}(t))\n", 24 | "$$\n", 25 | "\n", 26 | "where:\n", 27 | "- $y_{in}(t)$ is the input gate activation\n", 28 | "- $g(\\cdot)$ is a centered logistic sigmoid function with range $[-2, 2]$\n", 29 | "\n", 30 | "the cell output $y_c$ is calculated as:\n", 31 | "\n", 32 | "$$\n", 33 | "y_{c}(t) = y_{out}(t) \\cdot h(s_{c}(t))\n", 34 | "$$\n", 35 | "\n", 36 | "where:\n", 37 | "- $y_{out}(t)$ is the output gate activation\n", 38 | "- $h(\\cdot)$ is a centered sigmoid function with range $[-1, 1]$\n", 39 | "\n", 40 | "## solution: forget gates\n", 41 | "to address the issue of indefinite growth of cell states, the \"forget gate\" is introduced. the forget gate activation $y'_j$ is calculated similarly to other gates and is squashed using a logistic sigmoid function:\n", 42 | "\n", 43 | "$$\n", 44 | "y'_j (t) = f'_j \\left( \\sum_m w'_{jm} y_m(t-1) \\right)\n", 45 | "$$\n", 46 | "\n", 47 | "the revised update equation for the cell state $s_c$ in the extended lstm is:\n", 48 | "\n", 49 | "$$\n", 50 | "s_{cvj}(t) = y'_{j}(t) \\cdot s_{cvj}(t-1) + y_{in}(t) \\cdot g(\\text{net}_{cvj}(t))\n", 51 | "$$\n", 52 | "\n", 53 | "forget gates learn to reset the memory block when its contents are no longer useful, thereby preventing unbounded growth of internal states.\n", 54 | "\n", 55 | "## experiments\n", 56 | "\n", 57 | "to test the effectiveness of forget gates, the authors extended the embedded reber grammar (erg) problem to create a continual version, where the network must handle concatenated sequences without explicit resets. the results show that extended lstm with forget gates can solve the task more efficiently than standard lstm, especially when combined with learning rate decay.\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [] 64 | } 65 | ], 66 | "metadata": { 67 | "language_info": { 68 | "name": "python" 69 | } 70 | }, 71 | "nbformat": 4, 72 | "nbformat_minor": 2 73 | } 74 | -------------------------------------------------------------------------------- /03-sequence-modeling/04-word2vec/word2vec.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": { 5 | "word2vec.png": { 6 | "image/png": "" 7 | } 8 | }, 9 | "cell_type": "markdown", 10 | "metadata": {}, 11 | "source": [ 12 | "# word representations in vector space\n", 13 | "\n", 14 | "\n", 15 | "to work with nlp we need embedding(converting txt into meaningful vec representation). word2vec was one the intial idea that guided the innovation of embedding.\n", 16 | "\n", 17 | "![word2vec.png](attachment:word2vec.png)\n", 18 | "\n", 19 | "as we can see here words is getting converted into vectors(list of numbers). ther are new methods do this in current time.\n", 20 | "\n", 21 | "highy recommended this [blog-post](https://jalammar.github.io/illustrated-word2vec/) by jalmmar to read more about Word2vec.\n", 22 | "\n", 23 | "\n", 24 | "## model architectures\n", 25 | "\n", 26 | "many models have been proposed for estimating continuous word representations, including latent semantic analysis (lsa) and latent dirichlet allocation (lda). but the distributed representations learned by neural networks have shown better performance. the computational complexity of these models is defined as:\n", 27 | "\n", 28 | "$$\n", 29 | "o = e \\times t \\times q\n", 30 | "$$\n", 31 | "\n", 32 | "where:\n", 33 | "- $e$ is the number of training epochs\n", 34 | "- $t$ is the number of words in the training set\n", 35 | "- $q$ is defined for each model architecture\n", 36 | "\n", 37 | "### feedforward neural net language model \n", 38 | "the feedforward neural net language model consists of input, projection, hidden, and output layers. at the input layer, $n$ previous words are encoded using a 1-of-v coding, where $v$ is the vocabulary size. the projection layer has dimensionality $n \\times d$. the computational complexity per training example is:\n", 39 | "\n", 40 | "$$\n", 41 | "q = n \\times d + n \\times d \\times h + h \\times v\n", 42 | "$$\n", 43 | "\n", 44 | "where:\n", 45 | "- $d$ is the dimensionality of the word vectors\n", 46 | "- $h$ is the size of the hidden layer\n", 47 | "- $v$ is the size of the vocabulary\n", 48 | "\n", 49 | "## new log-linear models\n", 50 | "the paper introduces two new model architectures to learn distributed representations of words with reduced computational complexity:\n", 51 | "\n", 52 | "1. **continuous bag-of-words model (cbow)**: the cbow model predicts the current word based on its context by averaging the word vectors in the context. the training complexity is:\n", 53 | "\n", 54 | "$$\n", 55 | "q = n \\times d + d \\times \\log_2(v)\n", 56 | "$$\n", 57 | "\n", 58 | "2. **continuous skip-gram model**: the skip-gram model maximizes the classification of a word based on another word in the same sentence. the training complexity is:\n", 59 | "\n", 60 | "$$\n", 61 | "q = c \\times (d + d \\times \\log_2(v))\n", 62 | "$$\n", 63 | "\n", 64 | "where $c$ is the maximum distance between words.\n", 65 | "\n", 66 | "## results\n", 67 | "the paper compares the quality of different word vector models by measuring their performance on semantic-syntactic word relationships. it shows that the new architectures (cbow and skip-gram) outperform the previous neural network models in accuracy while requiring significantly less computational cost.\n", 68 | "\n", 69 | "## conclusion\n", 70 | "the study shows that it is possible to train high-quality word vectors using simpler model architectures, such as cbow and skip-gram, which are computationally efficient and capable of learning from much larger datasets. this advancement could lead to new applications in nlp tasks like machine translation, information retrieval, and question-answering systems.\n" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [] 79 | } 80 | ], 81 | "metadata": { 82 | "language_info": { 83 | "name": "python" 84 | } 85 | }, 86 | "nbformat": 4, 87 | "nbformat_minor": 2 88 | } 89 | -------------------------------------------------------------------------------- /03-sequence-modeling/05-seq2seq/seq2seq.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# seq2seq \n", 8 | "\n", 9 | "## background\n", 10 | "\n", 11 | "### the problem with traditional neural networks\n", 12 | "\n", 13 | "traditional neural networks like anns (artificial neural networks) and cnns (convolutional neural networks) weren't cutting it for text data.\n", 14 | "\n", 15 | "Two main significant reasons : \n", 16 | "\n", 17 | "- **fixed input size**: these models typically expect a fixed input size, which doesn't work well for variable-length sequences like sentences.\n", 18 | "- **lack of temporal understanding**: they don't naturally capture the order and context of words in a sequence.\n", 19 | "\n", 20 | "### rnns (recurrent neural networks)\n", 21 | "\n", 22 | "rnns were introduced to handle sequential data better. they process input sequentially, maintaining a hidden state that can capture some context. however, they had their own issues:\n", 23 | "\n", 24 | "- **vanishing gradient problem**: as the sequence gets longer, rnns struggle to carry information from earlier time steps.\n", 25 | "- **limited context**: they have trouble capturing long-term dependencies in the data.\n", 26 | "\n", 27 | "### lstm (kind of solve)\n", 28 | "\n", 29 | "long short-term memory (lstm) networks were design to solve the RNN problem \n", 30 | "\n", 31 | "- **gating mechanisms**: lstms use gates to control the flow of information, helping to mitigate the vanishing gradient problem.\n", 32 | "- **better at long-term dependencies**: they can carry relevant information across longer sequences.\n", 33 | "\n", 34 | "but even lstms (and their variants like grus - gated recurrent units) struggle with very long sequences.\n", 35 | "\n", 36 | "## seq2seq: \n", 37 | "\n", 38 | "seq2seq (sequence-to-sequence) models were designed to handle tasks where both input and output are sequences, like machine translation.\n", 39 | "\n", 40 | "### core idea\n", 41 | "\n", 42 | "the seq2seq model consists of two main parts:\n", 43 | "\n", 44 | "1. **encoder**: processes the input sequence\n", 45 | "2. **decoder**: generates the output sequence\n", 46 | "\n", 47 | "this architecture allows the model to map sequences of different lengths, which is crucial for tasks like translation where input and output lengths may vary.\n", 48 | "\n", 49 | "### how seq2seq works\n", 50 | "\n", 51 | "let's break down the process:\n", 52 | "\n", 53 | "1. **input processing**:\n", 54 | " - text input is tokenized (split into words or subwords)\n", 55 | " - tokens are converted to numerical representations via an embedding layer\n", 56 | "\n", 57 | "2. **encoding**:\n", 58 | " - the embedded input sequence is fed into the encoder (usually lstm-based)\n", 59 | " - encoder processes the sequence, updating its hidden state at each step\n", 60 | " - final hidden state of the encoder captures the essence of the input sequence\n", 61 | "\n", 62 | "3. **context vector**:\n", 63 | " - the final hidden state of the encoder becomes the \"context vector\"\n", 64 | " - this vector is meant to encapsulate the meaning of the entire input sequence\n", 65 | "\n", 66 | "4. **decoding**:\n", 67 | " - decoder initializes its hidden state with the context vector\n", 68 | " - at each step, the decoder:\n", 69 | " - takes the previous output and its current hidden state as input\n", 70 | " - produces a probability distribution over the output vocabulary\n", 71 | " - selects the most likely token as the output for that step\n", 72 | "\n", 73 | "5. **output generation**:\n", 74 | " - the process continues until the decoder generates an end-of-sequence token or reaches a maximum length\n", 75 | "\n", 76 | "### IMP finding in seq2seq\n", 77 | "\n", 78 | "1. **separate encoder and decoder**:\n", 79 | " - allows handling different languages or domains for input and output\n", 80 | " - enables more parameters without excessive computational cost\n", 81 | " - can be trained separately, adding flexibility\n", 82 | "\n", 83 | "2. **deep lstms**:\n", 84 | " - stacking multiple lstm layers (typically 4) in both encoder and decoder\n", 85 | " - increases model capacity to capture complex patterns\n", 86 | " - helps maintain long-term dependencies\n", 87 | "\n", 88 | "3. **input reversal**:\n", 89 | " - reversing the order of input tokens (but not output tokens)\n", 90 | " - creates shorter dependencies between source and target\n", 91 | " - makes optimization easier for gradient-based methods like sgd\n", 92 | "\n", 93 | "4. **attention mechanism** (a later addition):\n", 94 | " - allows decoder to focus on different parts of input for each output token\n", 95 | " - significantly improves performance, especially for long sequences\n", 96 | " - paved the way for transformer models\n", 97 | "\n", 98 | "\n", 99 | "### beam search decoding\n", 100 | "\n", 101 | "instead of greedily selecting the most probable token at each step, beam search maintains multiple candidate sequences:\n", 102 | "\n", 103 | "- keeps top-k most likely sequences at each step\n", 104 | "- improves output quality by exploring more possibilities\n", 105 | "\n", 106 | "### handling unknown words\n", 107 | "\n", 108 | "seq2seq models struggle with words not in their vocabulary. solutions include:\n", 109 | "\n", 110 | "- subword tokenization (e.g., byte-pair encoding)\n", 111 | "- pointer-generator networks for copying unknown words from input\n", 112 | "\n", 113 | "### bidirectional encoders\n", 114 | "\n", 115 | "using bidirectional lstms in the encoder to capture context from both directions of the input sequence.\n", 116 | "\n", 117 | "\n", 118 | "## limitations \n", 119 | "\n", 120 | "\n", 121 | "- still struggle with very long sequences\n", 122 | "- computationally intensive, especially during training\n", 123 | "- require large amounts of parallel data for training\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [] 130 | } 131 | ], 132 | "metadata": { 133 | "language_info": { 134 | "name": "python" 135 | } 136 | }, 137 | "nbformat": 4, 138 | "nbformat_minor": 2 139 | } 140 | -------------------------------------------------------------------------------- /04-transformers/04-gpt/gpt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "### understanding generative pre-trained transformer (gpt) \n", 9 | "\n", 10 | "\n", 11 | "\n", 12 | "the generative pre-trained transformer (gpt) family of models, introduced by openai, represents a significant advancement in language modeling and natural language processing. these models employ a decoder-only transformer architecture and utilize an autoregressive approach to text generation. the evolution from gpt-1 through subsequent versions has demonstrated remarkable scaling properties, with each iteration showing significant improvements in performance and capabilities.\n", 13 | "\n", 14 | "\n", 15 | "\n", 16 | "gpt models utilize a decoder-only transformer architecture, which differs from encoder-decoder models like t5. the architecture consists of multiple transformer blocks stacked upon each other, with each block containing self-attention mechanisms and feed-forward neural networks. the model processes text as a sequence of tokens and predicts the next token based on all previous tokens in the sequence.\n", 17 | "\n", 18 | "\n", 19 | "> attention mechanism\n", 20 | "\n", 21 | "gpt implements a masked self-attention mechanism where each token can only attend to its previous tokens and itself. this causality constraint is crucial for the autoregressive nature of the model. the attention mechanism computes queries, keys, and values for each token and uses scaled dot-product attention to weight the importance of different tokens in the sequence.\n", 22 | "\n", 23 | "\n", 24 | "\n", 25 | "\n", 26 | "\n", 27 | "the pre-training of gpt models follows an autoregressive language modeling objective. the model learns to predict the next token in a sequence given all previous tokens. this unsupervised learning approach allows the model to learn from vast amounts of text data without requiring labeled examples.\n", 28 | "\n", 29 | "\n", 30 | "\n", 31 | "> tokenization\n", 32 | "\n", 33 | "gpt models use byte-pair encoding (bpe) tokenization, which breaks down text into subword units. this approach provides a balance between character-level and word-level tokenization, allowing the model to handle both common and rare words effectively.\n", 34 | "\n", 35 | "\n", 36 | "the model uses learned position embeddings to maintain awareness of token positions in the sequence. these embeddings are added to the token embeddings before being processed by the transformer layers.\n", 37 | "\n", 38 | "\n", 39 | "gpt models demonstrate impressive scaling properties, with performance improving predictably with model size. key scaling factors include:\n", 40 | "\n", 41 | "\n", 42 | "during text generation, the model employs various decoding strategies such as greedy decoding, beam search, or sampling with temperature control. these strategies help balance between output quality and diversity.\n", 43 | "\n", 44 | "the success of gpt models has significantly influenced the direction of nlp research and applications, demonstrating the potential of large-scale language models trained on vast amounts of text data. their ability to generate coherent and contextually appropriate text has opened new possibilities in various domains, from creative writing to technical documentation.\n", 45 | "\n", 46 | "\n", 47 | "> model size\n", 48 | "\n", 49 | "increasing the number of parameters by expanding model depth and width helps in performance. gpt-2 ranges from 117 million to 1.5 billion parameters across its variants.\n", 50 | "\n", 51 | "> dataset size\n", 52 | "\n", 53 | " gpt-2 was trained on a diverse dataset of 8 million web pages selected for quality.\n", 54 | "\n", 55 | "\n", 56 | "> context window\n", 57 | "\n", 58 | "gpt-2 processes sequences of up to 1024 tokens, allowing it to maintain longer-range dependencies than previous models. this expanded context window enables more coherent generation of long passages.\n", 59 | "\n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [] 66 | } 67 | ], 68 | "metadata": { 69 | "kernelspec": { 70 | "display_name": "Python 3", 71 | "language": "python", 72 | "name": "python3" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 3 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython3", 84 | "version": "3.9.6" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 2 89 | } 90 | -------------------------------------------------------------------------------- /04-transformers/05-lora/lora.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### let's understand lora (low-rank adaptation)\n", 8 | "\n", 9 | "> **why do we need lora?**\n", 10 | "- when we fine-tune large language models, it's incredibly expensive to update all parameters.\n", 11 | "- lora provides a memory-efficient alternative that achieves similar results while only training a small fraction of parameters.\n", 12 | "- traditional fine-tuning requires storing and updating the entire model, which is impractical for most users without expensive hardware.\n", 13 | "- lora introduces a clever \"bypass\" solution that keeps the original pre-trained weights frozen and only trains small adapter modules.\n", 14 | "\n", 15 | "> **what is lora?**\n", 16 | "- lora stands for low-rank adaptation, a technique that makes fine-tuning large models more accessible.\n", 17 | "- instead of modifying all weights directly, lora decomposes weight updates into smaller matrices through low-rank decomposition.\n", 18 | "- this dramatically reduces the number of trainable parameters (often by 10,000x or more) while maintaining performance.\n", 19 | "- example: instead of training billions of parameters in a large model, lora might only train a few million parameters.\n", 20 | "\n", 21 | "> **how lora works?**\n", 22 | "- lora freezes the pre-trained model weights completely.\n", 23 | "- for each weight matrix we want to adapt, lora adds a parallel \"bypass\" connection.\n", 24 | "- this bypass consists of two smaller matrices: a down-projection and an up-projection.\n", 25 | "- the original path: input → original frozen weight → output\n", 26 | "- the lora path: input → down-projection → up-projection → output\n", 27 | "- the final output combines both paths.\n", 28 | "\n", 29 | "> **three key steps**\n", 30 | "- 1. decompose each weight matrix update into two smaller matrices (down-projection and up-projection)\n", 31 | "- 2. initialize these matrices so their product is zero (ensuring no change to behavior initially)\n", 32 | "- 3. train only these small matrices while keeping the original weights frozen\n", 33 | "\n", 34 | "> **why is this efficient?**\n", 35 | "- the rank of these matrices (r) is tiny compared to the original dimensions.\n", 36 | "- this makes the number of trainable parameters much smaller than the original model.\n", 37 | "- storage requirements are reduced significantly, often enabling fine-tuning on consumer hardware.\n", 38 | "- during inference, lora matrices can be merged with the original weights with no performance penalty.\n", 39 | "\n", 40 | "> **benefits of lora**\n", 41 | "- dramatically reduced memory requirements for fine-tuning\n", 42 | "- faster training times\n", 43 | "- lower computational costs\n", 44 | "- ability to switch between different adaptations quickly\n", 45 | "- preserves the general knowledge of the base model while adding specialized capabilities" 46 | ] 47 | } 48 | ], 49 | "metadata": { 50 | "language_info": { 51 | "name": "python" 52 | } 53 | }, 54 | "nbformat": 4, 55 | "nbformat_minor": 2 56 | } 57 | -------------------------------------------------------------------------------- /04-transformers/06-rlhf/rlhf.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### let's understand rlhf (reinforcement learning from human feedback)\n", 8 | "\n", 9 | "> **why do we need rlhf?**\n", 10 | "- large language models trained only on text prediction don't naturally align with human values and preferences.\n", 11 | "- models may generate harmful, misleading, or unhelpful content if optimized solely to predict the next token.\n", 12 | "- we need a way to teach models to produce outputs that humans actually prefer and find helpful.\n", 13 | "- supervised fine-tuning alone can't capture nuanced human preferences about quality, safety, and helpfulness.\n", 14 | "\n", 15 | "> **what is rlhf?**\n", 16 | "- rlhf stands for reinforcement learning from human feedback, a technique for aligning ai systems with human preferences.\n", 17 | "- it teaches models to generate content humans prefer by using human judgments as rewards.\n", 18 | "- human evaluators compare different model outputs, ranking them from most to least preferred.\n", 19 | "- these preferences are used to train a reward model that scores outputs, which then guides the language model's learning.\n", 20 | "\n", 21 | "> **how rlhf works?**\n", 22 | "- rlhf typically follows a three-stage process:\n", 23 | "- 1. supervised fine-tuning: first train the model on high-quality examples to get a decent starting point.\n", 24 | "- 2. reward model training: collect human preferences between outputs and train a model to predict which responses humans prefer.\n", 25 | "- 3. reinforcement learning: optimize the language model using the reward model's scores as feedback.\n", 26 | "\n", 27 | "> **the key components**\n", 28 | "- the policy model: the language model being trained to generate preferred outputs\n", 29 | "- the reward model: evaluates outputs based on human preferences\n", 30 | "- ppo (proximal policy optimization): the reinforcement learning algorithm typically used\n", 31 | "- kl penalty: ensures the model doesn't deviate too far from its original capabilities\n", 32 | "\n", 33 | "> **practical implementation**\n", 34 | "- human evaluators compare pairs of model responses and select which one better satisfies criteria like helpfulness.\n", 35 | "- these preferences create a dataset for training the reward model.\n", 36 | "- during reinforcement learning, the model generates many variations of responses to prompts.\n", 37 | "- these responses are scored by the reward model, and the policy is updated to maximize these scores.\n", 38 | "- a kl divergence penalty prevents the model from changing too drastically and forgetting its capabilities.\n", 39 | "\n", 40 | "> **benefits of rlhf**\n", 41 | "- models that produce more helpful, harmless, and honest outputs\n", 42 | "- reduced likelihood of generating harmful content\n", 43 | "- better alignment with complex human values that are difficult to specify explicitly\n", 44 | "- improved ability to follow instructions and understand user intent\n", 45 | "- more natural, helpful interactions that better meet human expectations" 46 | ] 47 | } 48 | ], 49 | "metadata": { 50 | "language_info": { 51 | "name": "python" 52 | } 53 | }, 54 | "nbformat": 4, 55 | "nbformat_minor": 2 56 | } 57 | -------------------------------------------------------------------------------- /04-transformers/07-vision-transformer/vit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### let's understand vision transformer (vit)\n", 8 | "\n", 9 | "> **why do we need vision transformer?**\n", 10 | "- traditionally, convolutional neural networks (cnns) dominated computer vision tasks.\n", 11 | "- cnns have built-in inductive biases for images, but these can sometimes limit what the model learns.\n", 12 | "- transformer architecture had revolutionized nlp, suggesting it might also benefit vision.\n", 13 | "- vision transformer brings the power of self-attention to image processing, enabling better global understanding of images.\n", 14 | "- vit can capture long-range dependencies between image patches that might be missed by cnns.\n", 15 | "\n", 16 | "> **what is a vision transformer?**\n", 17 | "- vision transformer (vit) adapts the transformer architecture from nlp to computer vision.\n", 18 | "- instead of processing text tokens, it processes image patches as tokens.\n", 19 | "- the key insight: split an image into fixed-size patches and treat each patch like a word token.\n", 20 | "- this approach removes the convolution operations entirely in the pure vit design.\n", 21 | "- it applies the same self-attention mechanism that made transformers successful in language tasks.\n", 22 | "\n", 23 | "> **how vision transformer works?**\n", 24 | "- 1. split the image into fixed-size patches (like 16×16 pixels)\n", 25 | "- 2. flatten each patch into a 1d vector\n", 26 | "- 3. project these vectors to the model dimension\n", 27 | "- 4. add position embeddings to retain spatial information\n", 28 | "- 5. process through standard transformer encoder blocks\n", 29 | "- 6. use the output of the special [class] token for classification\n", 30 | "\n", 31 | "\n", 32 | "\n", 33 | "> **transformer encoder block:**\n", 34 | "- each block contains:\n", 35 | " - multi-head self-attention (msa)\n", 36 | " - layer normalization (ln)\n", 37 | " - multilayer perceptron (mlp)\n", 38 | " - residual connections\n", 39 | "\n", 40 | "\n", 41 | "\n", 42 | "> **key advantages of vit**\n", 43 | "- global receptive field from the start (unlike cnns which build this gradually)\n", 44 | "- flexible attention to relevant parts of the image regardless of distance\n", 45 | "- fewer inductive biases, allowing the model to learn more complex patterns\n", 46 | "- excellent scaling properties - performance improves predictably with more data and compute\n", 47 | "- ability to visualize attention maps to see what the model focuses on\n", 48 | "- transfer learning capabilities across different vision tasks\n", 49 | "\n", 50 | "> **challenges and solutions**\n", 51 | "- requires more data than cnns to reach similar performance\n", 52 | "- computationally intensive for high-resolution images\n", 53 | "- positional information must be explicitly added\n", 54 | "- hybrid approaches combining cnn features with transformers often work best in practice\n", 55 | "- data augmentation and regularization are crucial for good performance\n" 56 | ] 57 | } 58 | ], 59 | "metadata": { 60 | "language_info": { 61 | "name": "python" 62 | } 63 | }, 64 | "nbformat": 4, 65 | "nbformat_minor": 2 66 | } 67 | -------------------------------------------------------------------------------- /05-image-generation/01-gan/gan.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### let's understand gan (generative adversarial networks)\n", 8 | "\n", 9 | "> **why do we need gan?**\n", 10 | "\n", 11 | "traditional generative models like variational autoencoders often produce blurry or unrealistic outputs. we needed a way to generate sharper, more realistic images, audio, and other media that truly captures the complexity of real-world data. supervised learning requires labeled data, but generative tasks often need to learn the underlying distribution of unlabeled data. gans introduced a revolutionary approach: instead of explicitly modeling probability distributions with mathematical formulas, they learn to generate data through an adversarial game between two neural networks.\n", 12 | "\n", 13 | "> **what is gan?**\n", 14 | "\n", 15 | "generative adversarial networks consist of two neural networks; a generator and a discriminator that compete against each other in a minimax game. the generator creates fake samples trying to fool the discriminator, while the discriminator works to distinguish between real and fake samples. this adversarial process forces both networks to improve: the generator creates increasingly realistic data, and the discriminator becomes better at spotting subtle flaws. after training, the generator can create new, never-before-seen samples that closely resemble the training data distribution.\n", 16 | "\n", 17 | "> **how gan works?**\n", 18 | "\n", 19 | "the generator takes random noise (typically from a normal or uniform distribution) as input and transforms it into synthetic data samples. the discriminator receives both real samples from the training dataset and fake samples from the generator, outputting a probability indicating whether each sample is real or fake. the generator aims to maximize the discriminator's error rate, while the discriminator aims to minimize its own error rate. mathematically, this forms a two-player minimax game where the generator minimizes and the discriminator maximizes the same objective function. training alternates between updating the discriminator and the generator, gradually improving both networks.\n", 20 | "\n", 21 | "\n", 22 | "> **training algorithm**\n", 23 | "\n", 24 | "the training process alternates between:\n", 25 | "\n", 26 | "1. training the discriminator:\n", 27 | " $$\\max_D V(D, G) = \\mathbb{E}_{x \\sim p_{data}(x)}[\\log D(x)] + \\mathbb{E}_{z \\sim p_z(z)}[\\log(1 - D(G(z)))]$$\n", 28 | "\n", 29 | "2. training the generator:\n", 30 | " $$\\min_G V(D, G) = \\mathbb{E}_{z \\sim p_z(z)}[\\log(1 - D(G(z)))]$$\n", 31 | " or equivalently:\n", 32 | " $$\\max_G \\mathbb{E}_{z \\sim p_z(z)}[\\log D(G(z))]$$\n", 33 | "\n", 34 | "when the system reaches equilibrium, the generator produces samples indistinguishable from real data, and the discriminator outputs 0.5 for all inputs, indicating it can no longer differentiate between real and fake samples.\n", 35 | "\n", 36 | "> **challenges in gan training**\n", 37 | "\n", 38 | "training gans is notoriously difficult due to several issues. mode collapse occurs when the generator produces limited varieties of outputs, failing to capture the full data distribution. vanishing gradients can happen when the discriminator becomes too effective too quickly, providing minimal feedback to the generator. training instability manifests as oscillating losses rather than convergence. these issues have led to numerous gan variants like wasserstein gan (wgan), which uses wasserstein distance instead of jensen shannon divergence, and spectral normalization gan (sn-gan), which stabilizes discriminator training through weight normalization.\n", 39 | "\n", 40 | "> **applications of gan**\n", 41 | "\n", 42 | "gans have revolutionized multiple fields with their ability to generate realistic data. in computer vision, they create photorealistic images, perform image-to-image translation (like converting satellite images to maps), and enhance low resolution photos. in medicine, gans generate synthetic medical images for training algorithms and data augmentation when real samples are scarce. they've also been applied to audio synthesis for creating realistic speech and music, text generation for creating coherent passages, and even drug discovery by generating molecular structures with specific properties. perhaps most famously, deepfakes highly realistic fake videos and images—are created using gan-based approaches.\n", 43 | "\n", 44 | "> **recent advances in gan**\n", 45 | "\n", 46 | "stylegan represents a significant advancement with its ability to generate incredibly realistic faces and control different aspects of image generation separately. biggan scaled up gan training to produce high-resolution, diverse images. cyclegan enabled unpaired image-to-image translation, allowing transformation between domains without paired examples. diffusion models, while technically different from gans, have recently outperformed them in image generation quality by gradually denoising random noise. gans continue to evolve, with research focusing on improving training stability, increasing output diversity, and extending their capabilities to new domains and applications.\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [] 55 | } 56 | ], 57 | "metadata": { 58 | "kernelspec": { 59 | "display_name": "Python 3", 60 | "language": "python", 61 | "name": "python3" 62 | }, 63 | "language_info": { 64 | "name": "python", 65 | "version": "3.9.6" 66 | } 67 | }, 68 | "nbformat": 4, 69 | "nbformat_minor": 2 70 | } 71 | -------------------------------------------------------------------------------- /05-image-generation/02-vae/vae.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### let's understand vae (variational autoencoders)\n", 8 | "\n", 9 | "> **why do we need vae?**\n", 10 | "\n", 11 | "traditional autoencoders compress data into a lower-dimensional latent space but lack the ability to generate new samples effectively. we needed a generative model that could not only reconstruct inputs but also produce new, realistic samples by sampling from a continuous latent space. vae solves this by introducing probabilistic encoding that forces the latent space to be well-structured and meaningful. unlike gans which require adversarial training, vaes offer a more stable training process based on a clear mathematical foundation of variational inference. vaes also provide explicit probability distributions, allowing us to reason about the underlying data structure and uncertainty in a principled way.\n", 12 | "\n", 13 | "> **what is vae?**\n", 14 | "\n", 15 | "variational autoencoder is a generative model that combines deep learning with bayesian inference. it consists of an encoder network that maps input data to a probability distribution in latent space, and a decoder network that reconstructs the input from samples of this distribution. the key innovation is representing each point in latent space not as a single value but as a distribution (typically gaussian) defined by mean and variance parameters. this probabilistic approach enables smooth interpolation between data points and generation of new samples by sampling from the latent space. vae training optimizes a balance between reconstruction quality and ensuring the latent space follows a predefined prior distribution, usually a standard normal distribution.\n", 16 | "\n", 17 | "> **how vae works?**\n", 18 | "\n", 19 | "the encoder in a vae takes input data and outputs parameters of a probability distribution (mean μ and variance σ²) rather than a fixed encoding. the model then uses the reparameterization trick to sample from this distribution in a way that allows gradient flow during backpropagation: z = μ + σ * ε, where ε is random noise from a standard normal distribution. the decoder takes this sampled point z and reconstructs the input. during training, the vae optimizes two components: the reconstruction loss (how well the decoder reconstructs the input) and the kullback-leibler divergence between the encoder's distribution and a prior distribution (usually standard normal). this second term acts as a regularizer, ensuring the latent space is well-structured and continuous.\n", 20 | "\n", 21 | "\n", 22 | "> **detailed architecture**\n", 23 | "\n", 24 | "the encoder network typically consists of several layers that process the input and output two vectors: one for the means (μ) and one for the log-variances (log σ²) of the latent dimensions. we use log-variance instead of variance directly for numerical stability. these parameters define a multivariate gaussian distribution for each input. during training, we sample from this distribution using the reparameterization trick. the decoder network takes this sample and attempts to reconstruct the original input. the loss function combines reconstruction error (often mean squared error for continuous data or binary cross-entropy for binary data) with the kl divergence term that regularizes the latent space distributions to be close to the prior.\n", 25 | "\n", 26 | "> **vae vs traditional autoencoders**\n", 27 | "\n", 28 | "unlike traditional autoencoders that encode inputs as single points in latent space, vaes encode inputs as probability distributions. this probabilistic approach creates a continuous, structured latent space where similar inputs cluster together and interpolation between points produces meaningful outputs. standard autoencoders may have gaps or discontinuities in their latent space, making generation of new samples difficult. vaes solve this by enforcing a smooth, continuous latent space through the kl divergence regularization. this structure allows for semantic operations in latent space, such as attribute manipulation through vector arithmetic, and enables generation of diverse samples by sampling different points from the prior distribution and decoding them.\n", 29 | "\n", 30 | "> **applications of vae**\n", 31 | "\n", 32 | "vaes excel in various generative applications across different domains. in computer vision, they generate images, perform image inpainting to fill missing regions, and enable controlled image generation and editing. in natural language processing, text vaes can generate coherent paragraphs and perform sentence interpolation. for anomaly detection, vaes learn the normal data distribution, allowing them to identify outliers as samples with high reconstruction error. in drug discovery, vaes generate novel molecular structures with desired properties by learning the distribution of valid chemical compounds. vaes also excel at learning disentangled representations, where different dimensions in latent space correspond to interpretable features of the data, enabling controlled generation and attribute manipulation.\n", 33 | "\n", 34 | "> **limitations and extensions**\n", 35 | "\n", 36 | "despite their elegant mathematical foundation, vaes often produce blurrier outputs than gans, especially for images. this is partially due to the pixel-wise reconstruction loss, which doesn't capture perceptual quality effectively. to address these limitations, numerous vae variants have been developed. β-vae introduces a hyperparameter to control the trade-off between reconstruction quality and latent space regularity. vq-vae (vector quantized vae) uses discrete latent variables instead of continuous ones, producing sharper outputs. conditional vaes incorporate additional information like class labels to control the generation process. flow-based models and diffusion models extend vae concepts with more expressive transformation functions. hybrid approaches like vae-gans combine the stable training of vaes with the perceptual quality of gans." 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [] 43 | } 44 | ], 45 | "metadata": { 46 | "language_info": { 47 | "name": "python" 48 | } 49 | }, 50 | "nbformat": 4, 51 | "nbformat_minor": 2 52 | } 53 | -------------------------------------------------------------------------------- /05-image-generation/03-diffusion/sd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### let's understand stable diffusion\n", 8 | "\n", 9 | "> **why do we need stable diffusion?**\n", 10 | "\n", 11 | "previous generative models like gans and vaes struggled with generating high quality, diverse images at high resolutions. despite advances, these models required enormous computational resources and often produced artifacts or unrealistic features. we needed a fundamentally different approach that could create photorealistic images with precise control, while being more computationally efficient. text-to-image generation remained particularly challenging, with models failing to accurately interpret complex prompts and generate corresponding images. stable diffusion addresses these limitations by leveraging diffusion probabilistic models combined with latent space compression, resulting in remarkable image generation capabilities with reasonable computational requirements.\n", 12 | "\n", 13 | "> **what is stable diffusion?**\n", 14 | "\n", 15 | "stable diffusion is a latent diffusion model that generates images from text prompts or other images. unlike earlier diffusion models that operated in pixel space, stable diffusion works in a compressed latent space, making it much more computationally efficient. it consists of three key components: a text encoder (usually clip) that transforms text prompts into embeddings, a unet model that performs the diffusion process in latent space, and a variational autoencoder that decodes the latent representations into images. the model works by gradually denoising random noise into coherent images guided by text embeddings. by operating in a compressed latent space rather than full pixel space, stable diffusion can generate high-resolution images with significantly lower computational requirements than previous state-of-the-art models.\n", 16 | "\n", 17 | "> **how diffusion models work?**\n", 18 | "\n", 19 | "diffusion models are based on the concept of gradually adding noise to data and then learning to reverse this process. the forward diffusion process systematically destroys structure in data by adding gaussian noise over multiple steps until the data becomes pure noise. the model then learns the reverse diffusion process starting from random noise and gradually denoising it into meaningful data. mathematically, each step of the forward process can be seen as adding a small amount of gaussian noise to the previous state. the neural network is trained to predict the noise component at each step of the reverse process, allowing it to gradually denoise random samples into data that matches the training distribution. this approach creates a smooth path between random noise and structured data.\n", 20 | "\n", 21 | "> **the math behind diffusion models**\n", 22 | "\n", 23 | "the forward diffusion process adds noise to the data in t steps according to:\n", 24 | "\n", 25 | "$$q(x_t|x_{t-1}) = \\mathcal{N}(x_t; \\sqrt{1-\\beta_t}x_{t-1}, \\beta_t\\mathbf{I})$$\n", 26 | "\n", 27 | "where $$\\beta_t$$ is the noise schedule parameter at step t.\n", 28 | "\n", 29 | "this leads to:\n", 30 | "\n", 31 | "$$q(x_t|x_0) = \\mathcal{N}(x_t; \\sqrt{\\bar{\\alpha}_t}x_0, (1-\\bar{\\alpha}_t)\\mathbf{I})$$\n", 32 | "\n", 33 | "where $$\\alpha_t = 1 - \\beta_t$$ and $$\\bar{\\alpha}_t = \\prod_{s=1}^{t}\\alpha_s$$\n", 34 | "\n", 35 | "the model learns to reverse this process by predicting the noise $$\\epsilon$$ added at each step:\n", 36 | "\n", 37 | "$$\\epsilon_\\theta(x_t, t) \\approx \\epsilon$$\n", 38 | "\n", 39 | "the loss function is:\n", 40 | "\n", 41 | "$$L = \\mathbb{E}_{x_0,\\epsilon,t}[||\\epsilon - \\epsilon_\\theta(x_t, t)||^2]$$\n", 42 | "\n", 43 | "where $$x_t = \\sqrt{\\bar{\\alpha}_t}x_0 + \\sqrt{1-\\bar{\\alpha}_t}\\epsilon$$ and $$\\epsilon \\sim \\mathcal{N}(0, \\mathbf{I})$$\n", 44 | "\n", 45 | "for sampling, we use:\n", 46 | "\n", 47 | "$$x_{t-1} = \\frac{1}{\\sqrt{\\alpha_t}}(x_t - \\frac{1-\\alpha_t}{\\sqrt{1-\\bar{\\alpha}_t}}\\epsilon_\\theta(x_t, t)) + \\sigma_t\\mathbf{z}$$\n", 48 | "\n", 49 | "where $$\\mathbf{z} \\sim \\mathcal{N}(0, \\mathbf{I})$$ and $$\\sigma_t$$ controls the sampling stochasticity.\n", 50 | "\n", 51 | "> **stable diffusion architecture**\n", 52 | "\n", 53 | "stable diffusion's key innovation is performing the diffusion process in a compressed latent space instead of pixel space. the architecture consists of three main components working together. first, a text encoder (commonly clip) processes text prompts into embeddings that guide the image generation. next, a u-net with cross-attention layers performs the actual diffusion process in latent space, conditioned on the text embeddings. the u-net predicts noise to be removed at each denoising step. finally, a variational autoencoder decodes the final latent representation into a high-resolution image. this latent-space approach dramatically reduces computation requirements—working with 64×64 latent representations versus 512×512 or larger pixel images—while maintaining generation quality.\n", 54 | "\n", 55 | "> **conditioning and guidance**\n", 56 | "\n", 57 | "one of stable diffusion's powerful features is its ability to be conditioned on various inputs. text conditioning is the most common, where the diffusion process is guided by text embeddings to generate images matching textual descriptions. classifier free guidance improves this by interpolating between conditional and unconditional generation, controlled by a guidance scale parameter that determines how strongly the generation follows the conditioning signal. higher guidance values produce images that more closely match the prompt but may sacrifice some natural variation. stable diffusion can also be conditioned on images for tasks like inpainting (filling in missing parts), outpainting (extending images beyond their boundaries), and image-to-image translation where an input image is transformed according to a text prompt.\n", 58 | "\n", 59 | "> **applications and extensions**\n", 60 | "\n", 61 | "stable diffusion has found applications across numerous domains due to its versatility and accessibility. beyond basic text-to-image generation, it powers creative tools that help artists, designers, and content creators generate and edit visual content. in entertainment and media, it's used for concept art, storyboarding, and asset creation. researchers have extended stable diffusion for video generation by adding temporal layers, 3d model generation by incorporating additional 3d constraints, and personalized image generation by fine-tuning on specific concepts or styles. techniques like dreambooth, textual inversion, and lora allow users to teach the model new concepts or styles with just a few reference images. its open-source nature has led to a flourishing ecosystem of innovations built upon the base model.\n", 62 | "\n", 63 | "> **limitations and ethical considerations**\n", 64 | "\n", 65 | "despite its capabilities, stable diffusion faces several limitations. it sometimes struggles with complex compositions, accurate text rendering, precise counting, and consistent object rendering across images. the model can also reproduce biases present in its training data, potentially reinforcing stereotypes. since it's trained on internet data, it may generate inappropriate content without proper safeguards. there are also concerns about copyright infringement, as the model may reproduce styles of specific artists or copyrighted characters. to address these issues, researchers have implemented various safety mechanisms, including prompt filtering, output checking, and image watermarking. ongoing research focuses on making these models more controllable, accurate, and aligned with human values while preserving their creative capabilities.\n", 66 | "\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 83 | "name": "ipython", 84 | "version": 3 85 | }, 86 | "file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.9.6" 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 2 96 | } 97 | -------------------------------------------------------------------------------- /05-image-generation/04-clip/clip.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# clip (contrastive language-image pre-training)\n", 8 | "\n", 9 | "clip is a neural network model developed to connect text and images. in the context of stable diffusion, clip serves as the text encoder that transforms text prompts into embeddings that guide the image generation process.\n", 10 | "\n", 11 | "the core principle behind clip is **contrastive learning** between text and image pairs. clip learns to align text and images in a shared embedding space, where related text and images are positioned closer together and unrelated ones are farther apart.\n", 12 | "\n", 13 | "## how clip works\n", 14 | "\n", 15 | "clip consists of two encoders:\n", 16 | "1. a text encoder (typically a transformer)\n", 17 | "2. an image encoder (typically a vision transformer or cnn)\n", 18 | "\n", 19 | "these encoders map text and images into a shared, high-dimensional embedding space. during training, clip maximizes the cosine similarity between correct text-image pairs while minimizing similarity between incorrect pairs.\n", 20 | "\n", 21 | "## the main formula\n", 22 | "\n", 23 | "the contrastive loss function that clip optimizes can be represented as:\n", 24 | "\n", 25 | "$$l = -\\log \\frac{\\exp(sim(t_i, i_i)/\\tau)}{\\sum_{j=1}^{n} \\exp(sim(t_i, i_j)/\\tau)}$$\n", 26 | "\n", 27 | "where:\n", 28 | "- $t_i$ is the text embedding\n", 29 | "- $i_i$ is the corresponding image embedding\n", 30 | "- $sim$ is the cosine similarity function\n", 31 | "- $\\tau$ is a temperature parameter\n", 32 | "- $n$ is the batch size\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [] 39 | } 40 | ], 41 | "metadata": { 42 | "language_info": { 43 | "name": "python" 44 | } 45 | }, 46 | "nbformat": 4, 47 | "nbformat_minor": 2 48 | } 49 | -------------------------------------------------------------------------------- /05-image-generation/05-dall-e/dalle.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# dall-e (a neural network that creates images from text)\n", 8 | "\n", 9 | "dall-e is a generative ai system developed by openai that creates images from text descriptions. it represents a breakthrough in the field of text-to-image synthesis, allowing users to generate novel and creative visual content simply by describing what they want to see.\n", 10 | "\n", 11 | "the core principle behind dall-e is **autoregressive generation** based on transformer architecture. dall-e understands the relationships between images and their textual descriptions to produce images that match the given text prompts with remarkable accuracy.\n", 12 | "\n", 13 | "## how dall-e works\n", 14 | "\n", 15 | "dall-e follows a two-stage approach:\n", 16 | "1. a text encoder processes the input prompt\n", 17 | "2. a generative model produces images based on the encoded text\n", 18 | "\n", 19 | "the original dall-e used a discrete vae (variational autoencoder) to compress images into tokens, treating image generation similar to language modeling. dall-e 2 and dall-e 3 use diffusion models for higher quality generation.\n", 20 | "\n", 21 | "## the technical approach\n", 22 | "\n", 23 | "dall-e 3 specifically uses:\n", 24 | "\n", 25 | "1. a text encoder to understand the prompt\n", 26 | "2. a diffusion model that gradually removes noise from random pixels\n", 27 | "3. a refinement process that ensures adherence to the text prompt\n", 28 | "\n", 29 | "the simplified process can be represented as:\n", 30 | "\n", 31 | "$$x_t = \\sqrt{\\alpha_t}x_0 + \\sqrt{1-\\alpha_t}\\epsilon$$\n", 32 | "\n", 33 | "where:\n", 34 | "- $x_t$ is the noisy image at timestep $t$\n", 35 | "- $x_0$ is the original image\n", 36 | "- $\\alpha_t$ is a noise schedule parameter\n", 37 | "- $\\epsilon$ is gaussian noise\n", 38 | "\n", 39 | "## dall-e's capabilities\n", 40 | "\n", 41 | "dall-e excels at:\n", 42 | "- creating photorealistic images from detailed descriptions\n", 43 | "- generating artistic compositions in various styles\n", 44 | "- understanding complex spatial relationships\n", 45 | "- maintaining coherence across complex prompts\n", 46 | "- rendering text within images (especially in dall-e 3)\n", 47 | "\n", 48 | "when you provide a text prompt to dall-e, it interprets your description and generates an image that visually represents the concepts, styles, and relationships you've described, demonstrating an impressive understanding of both language and visual content." 49 | ] 50 | } 51 | ], 52 | "metadata": { 53 | "language_info": { 54 | "name": "python" 55 | } 56 | }, 57 | "nbformat": 4, 58 | "nbformat_minor": 2 59 | } 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Learning Research Paper Collection 2 | 3 | ## Overview 4 | 5 | This repository is a collection of IMPORTANT deep learning research papers, organized by research area and implementation. The goal is to provide a structured approach to understanding the evolution and core concepts of deep learning. 6 | 7 | ## Disclaimer 8 | 9 | > [!IMPORTANT] 10 | > This is a personal learning project. The implementations and notes may contain errors or simplifications. Use with caution and always refer to the original papers. 11 | 12 | ## Inspiration and Credits 13 | 14 | Inspired by [adam-maj](https://github.com/adam-maj) and expanded with additional research papers and implementations. 15 | 16 | ## Project Goals 17 | 18 | - Implement approximately 60 important deep learning papers 19 | - Provide scratch implementations for learning and understanding 20 | - Create a comprehensive resource for deep learning research 21 | 22 | ## Contents 23 | 24 | ### 1. Foundational Deep Neural Networks 25 | 26 | #### Papers 27 | - **DNN** (1987): Learning Internal Representations by Error Propagation [pdf](https://www.iro.umontreal.ca/~vincentp/ift3395/lectures/backprop_old.pdf) 28 | - **CNN** (1989): Backpropagation Applied to Handwritten Zip Code Recognition [pdf](http://yann.lecun.com/exdb/publis/pdf/lecun-89e.pdf) 29 | - **LeNet** (1998): Gradient-Based Learning Applied to Document Recognition [pdf](http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf) 30 | - **AlexNet** (2012): ImageNet Classification with Deep Convolutional Networks [pdf](https://papers.nips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf) 31 | - **U-Net** (2015): Convolutional Networks for Biomedical Image Segmentation [pdf](https://arxiv.org/pdf/1505.04597.pdf) 32 | 33 | ### 2. Optimization and Regularization Techniques 34 | 35 | #### Papers 36 | - **Weight Decay** (1991): A Simple Weight Decay Can Improve Generalization [pdf](https://www.cs.toronto.edu/~hinton/absps/nips93.pdf) 37 | - **ReLU** (2011): Deep Sparse Rectified Neural Networks [pdf](https://www.cs.toronto.edu/~hinton/absps/reluICML.pdf) 38 | - **Residuals** (2015): Deep Residual Learning for Image Recognition [pdf](https://arxiv.org/pdf/1512.03385.pdf) 39 | - **Dropout** (2014): Preventing Neural Networks from Overfitting [pdf](https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf) 40 | - **BatchNorm** (2015): Accelerating Deep Network Training [pdf](https://arxiv.org/pdf/1502.03167.pdf) 41 | - **LayerNorm** (2016): Layer Normalization [pdf](https://arxiv.org/pdf/1607.06450.pdf) 42 | - **GELU** (2016): Gaussian Error Linear Units [pdf](https://arxiv.org/pdf/1606.08415.pdf) 43 | - **Adam** (2014): Stochastic Optimization Method [pdf](https://arxiv.org/pdf/1412.6980.pdf) 44 | 45 | ### 3. Sequence Modeling 46 | 47 | #### Papers 48 | - **RNN** (1989): Continually Running Fully Recurrent Neural Networks [pdf](https://www.bioinf.jku.at/publications/older/2604.pdf) 49 | - **LSTM** (1997): Long-Short Term Memory [pdf](https://www.bioinf.jku.at/publications/older/2308.pdf) 50 | - **Learning to Forget** (2000): Continual Prediction with LSTM [pdf](https://www.researchgate.net/publication/221601044_Learning_to_Forget_Continual_Prediction_with_LSTM) 51 | - **Word2Vec** (2013): Word Representations in Vector Space [pdf](https://arxiv.org/pdf/1301.3781.pdf) 52 | - **Phrase2Vec** (2013): Distributed Representations of Words and Phrases [pdf](https://arxiv.org/pdf/1310.4546.pdf) 53 | - **Encoder-Decoder** (2014): RNN Encoder-Decoder for Machine Translation [pdf](https://arxiv.org/pdf/1406.1078.pdf) 54 | - **Seq2Seq** (2014): Sequence to Sequence Learning [pdf](https://arxiv.org/pdf/1409.3215.pdf) 55 | - **Attention** (2014): Neural Machine Translation with Alignment [pdf](https://arxiv.org/pdf/1409.0473.pdf) 56 | - **Mixture of Experts** (2017): Sparsely-Gated Neural Networks [pdf](https://arxiv.org/pdf/1701.06538.pdf) 57 | 58 | ### 4. Language Modeling 59 | 60 | #### Papers 61 | - **Transformer** (2017): Attention Is All You Need [pdf](https://arxiv.org/pdf/1706.03762.pdf) 62 | - **BERT** (2018): Bidirectional Transformers for Language Understanding [pdf](https://arxiv.org/pdf/1810.04805.pdf) 63 | - **RoBERTa** (2019): Robustly Optimized BERT Pretraining [pdf](https://arxiv.org/pdf/1907.11692.pdf) 64 | - **T5** (2019): Unified Text-to-Text Transformer [pdf](https://arxiv.org/pdf/1910.10683.pdf) 65 | - **GPT Series**: 66 | - GPT (2018): Generative Pre-Training [pdf](https://arxiv.org/pdf/1810.04805.pdf) 67 | - GPT-2 (2018): Unsupervised Multitask Learning [pdf](https://arxiv.org/pdf/1902.01082.pdf) 68 | - GPT-3 (2020): Few-Shot Learning [pdf](https://arxiv.org/pdf/2005.14165.pdf) 69 | - GPT-4 (2023): Advanced Language Model [pdf](https://arxiv.org/pdf/2303.08774.pdf) 70 | - **LoRA** (2021): Low-Rank Adaptation of Large Language Models [pdf](https://arxiv.org/pdf/2106.09685.pdf) 71 | - **RLHF** (2019): Fine-Tuning from Human Preferences [pdf](https://arxiv.org/pdf/1909.08593.pdf) 72 | - **InstructGPT** (2022): Following Instructions with Human Feedback [pdf](https://arxiv.org/pdf/2203.02155.pdf) 73 | - **Vision Transformer** (2020): Image Recognition with Transformers [pdf](https://arxiv.org/pdf/2010.11929.pdf) 74 | - **ELECTRA** (2020): Discriminative Pre-training [pdf](https://arxiv.org/pdf/2003.10555.pdf) 75 | 76 | ### 5. Image Generative Modeling 77 | 78 | #### Papers 79 | - **GAN** (2014): Generative Adversarial Networks [pdf](https://arxiv.org/pdf/1406.2661.pdf) 80 | - **VAE** (2013): Auto-Encoding Variational Bayes [pdf](https://arxiv.org/pdf/1312.6114.pdf) 81 | - **VQ VAE** (2017): Neural Discrete Representation Learning [pdf](https://arxiv.org/pdf/1711.00937.pdf) 82 | - **Diffusion Models**: 83 | - Initial Diffusion (2015): Nonequilibrium Thermodynamics [pdf](https://arxiv.org/pdf/1503.03585.pdf) 84 | - Denoising Diffusion (2020): Probabilistic Models [pdf](https://arxiv.org/pdf/2006.11239.pdf) 85 | - Improved Denoising Diffusion (2021) [pdf](https://arxiv.org/pdf/2102.09672.pdf) 86 | - **CLIP** (2021): Visual Models from Natural Language Supervision [pdf](https://arxiv.org/pdf/2103.00020.pdf) 87 | - **DALL-E** (2021-2022): Text-to-Image Generation [pdf](https://arxiv.org/pdf/2102.12092.pdf) 88 | - **SimCLR** (2020): Contrastive Learning of Visual Representations [pdf](https://arxiv.org/pdf/2002.05709.pdf) 89 | 90 | ### 6. Deep Reinforcement Learning 91 | 92 | #### Papers 93 | - **Deep Reinforcement Learning** (2017): Mastering Chess and Shogi [pdf](https://arxiv.org/pdf/1712.01815.pdf) 94 | - **Deep Q-Learning** (2013): Playing Atari Games [pdf](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) 95 | - **AlphaGo** (2016): Mastering the Game of Go [pdf](https://www.nature.com/articles/nature16961.pdf) 96 | - **AlphaFold** (2021): Protein Structure Prediction [pdf](https://www.nature.com/articles/s41586-021-03819-2.pdf) 97 | 98 | ### 7. Additional Influential Papers 99 | 100 | - **Deep Learning Survey** (2015): By LeCun, Bengio, and Hinton [pdf](https://www.cs.toronto.edu/~hinton/absps/NatureDeepReview.pdf) 101 | - **BigGAN** (2018): Large Scale GAN Training [pdf](https://arxiv.org/pdf/1809.11096.pdf) 102 | - **WaveNet** (2016): Generative Model for Raw Audio [pdf](https://arxiv.org/pdf/1609.03499.pdf) 103 | - **BERTology** (2020): Survey of BERT Use Cases [pdf](https://arxiv.org/pdf/2002.10063.pdf) 104 | 105 | #### Scaling and Model Optimization 106 | - **Scaling Laws for Neural Language Models** (2020): Predicting Model Performance [pdf](https://arxiv.org/pdf/2001.08361.pdf) 107 | - **Chinchilla** (2022): Training Compute-Optimal Large Language Models [pdf](https://arxiv.org/pdf/2203.15556.pdf) 108 | - **Gopher** (2022): Scaling Language Models with Massive Compute [pdf](https://arxiv.org/pdf/2112.11446.pdf) 109 | 110 | #### Fine-tuning and Adaptation 111 | - **P-Tuning** (2021): Prompt Tuning with Soft Prompts [pdf](https://arxiv.org/pdf/2103.10385.pdf) 112 | - **Prefix-Tuning** (2021): Optimizing Continuous Prompts [pdf](https://arxiv.org/pdf/2101.00190.pdf) 113 | - **AdaLoRA** (2023): Adaptive Low-Rank Adaptation [pdf](https://arxiv.org/pdf/2303.10512.pdf) 114 | - **QLoRA** (2023): Efficient Fine-Tuning of Quantized Models [pdf](https://arxiv.org/pdf/2305.14314.pdf) 115 | 116 | #### Inference and Optimization Techniques 117 | - **FlashAttention** (2022): Fast and Memory-Efficient Attention [pdf](https://arxiv.org/pdf/2205.14135.pdf) 118 | - **FlashAttention-2** (2023): Faster Attention Mechanism [pdf](https://arxiv.org/pdf/2307.08691.pdf) 119 | - **Direct Preference Optimization (DPO)** (2023): Aligning Language Models with Human Preferences [pdf](https://arxiv.org/pdf/2305.18046.pdf) 120 | - **LoRA** (2021): Low-Rank Adaptation of Large Language Models [pdf](https://arxiv.org/pdf/2106.09685.pdf) 121 | 122 | #### Pre-training and Model Architecture 123 | - **Mixture of Experts (MoE)** (2022): Scaling Language Models with Sparse Experts [pdf](https://arxiv.org/pdf/2201.05596.pdf) 124 | - **GLaM** (2021): Efficient Scaling with Mixture of Experts [pdf](https://arxiv.org/pdf/2112.06905.pdf) 125 | - **Switch Transformers** (2022): Scaling to Trillion Parameter Models [pdf](https://arxiv.org/pdf/2101.03961.pdf) 126 | 127 | #### Reasoning and Capabilities 128 | - **Chain of Thought Prompting** (2022): Reasoning with Language Models [pdf](https://arxiv.org/pdf/2201.11903.pdf) 129 | - **Self-Consistency** (2022): Improving Language Model Reasoning [pdf](https://arxiv.org/pdf/2203.11171.pdf) 130 | - **Tree of Thoughts** (2023): Deliberate Problem Solving [pdf](https://arxiv.org/pdf/2305.10601.pdf) 131 | 132 | #### Efficiency and Compression 133 | - **DistilBERT** (2019): Distilled Version of BERT [pdf](https://arxiv.org/pdf/1910.01108.pdf) 134 | - **Knowledge Distillation** (2022): Comprehensive Survey [pdf](https://arxiv.org/pdf/2006.05525.pdf) 135 | - **Pruning and Quantization Techniques** (2022): Model Compression Survey [pdf](https://arxiv.org/pdf/2102.06322.pdf) -------------------------------------------------------------------------------- /images/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/.gitkeep -------------------------------------------------------------------------------- /images/3-11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/3-11.png -------------------------------------------------------------------------------- /images/3-12-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/3-12-2.png -------------------------------------------------------------------------------- /images/3-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/3-7.png -------------------------------------------------------------------------------- /images/Cowboy-Bebop-Quotes1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/Cowboy-Bebop-Quotes1.jpeg -------------------------------------------------------------------------------- /images/GRU.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/GRU.png -------------------------------------------------------------------------------- /images/GRulCXpaUAAm5Up.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/GRulCXpaUAAm5Up.jpeg -------------------------------------------------------------------------------- /images/LKE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/LKE.png -------------------------------------------------------------------------------- /images/RNN-vs-FNN-660.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/RNN-vs-FNN-660.png -------------------------------------------------------------------------------- /images/T5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/T5.jpg -------------------------------------------------------------------------------- /images/T5_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/T5_1.jpg -------------------------------------------------------------------------------- /images/add-1.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/add-1.drawio.png -------------------------------------------------------------------------------- /images/alexnet-arc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/alexnet-arc.png -------------------------------------------------------------------------------- /images/batcnorm.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/batcnorm.jpeg -------------------------------------------------------------------------------- /images/bert.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/bert.jpg -------------------------------------------------------------------------------- /images/bot-res.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/bot-res.png -------------------------------------------------------------------------------- /images/bottleneck.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/bottleneck.png -------------------------------------------------------------------------------- /images/cnn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/cnn.jpg -------------------------------------------------------------------------------- /images/convolution-2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/convolution-2.gif -------------------------------------------------------------------------------- /images/decoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/decoder.png -------------------------------------------------------------------------------- /images/dropout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/dropout.png -------------------------------------------------------------------------------- /images/dropoutex.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/dropoutex.jpg -------------------------------------------------------------------------------- /images/dropoutt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/dropoutt.png -------------------------------------------------------------------------------- /images/earlystopping.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/earlystopping.jpg -------------------------------------------------------------------------------- /images/encoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/encoder.png -------------------------------------------------------------------------------- /images/f_pdf.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/f_pdf.jpg -------------------------------------------------------------------------------- /images/for_revered_guest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/for_revered_guest.png -------------------------------------------------------------------------------- /images/imagent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/imagent.png -------------------------------------------------------------------------------- /images/imagnet-win.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/imagnet-win.png -------------------------------------------------------------------------------- /images/last-lstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/last-lstm.png -------------------------------------------------------------------------------- /images/lstm-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/lstm-2.png -------------------------------------------------------------------------------- /images/lstm-add.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/lstm-add.png -------------------------------------------------------------------------------- /images/lstm-core.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/lstm-core.png -------------------------------------------------------------------------------- /images/lstm-input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/lstm-input.png -------------------------------------------------------------------------------- /images/lstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/lstm.png -------------------------------------------------------------------------------- /images/maxpool.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/maxpool.gif -------------------------------------------------------------------------------- /images/mul.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/mul.drawio.png -------------------------------------------------------------------------------- /images/newunet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/newunet.png -------------------------------------------------------------------------------- /images/nor-res.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/nor-res.png -------------------------------------------------------------------------------- /images/overlapping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/overlapping.png -------------------------------------------------------------------------------- /images/pos-cal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/pos-cal.png -------------------------------------------------------------------------------- /images/pos-emb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/pos-emb.png -------------------------------------------------------------------------------- /images/probs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/probs.jpg -------------------------------------------------------------------------------- /images/relu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/relu.png -------------------------------------------------------------------------------- /images/reluu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/reluu.png -------------------------------------------------------------------------------- /images/res-arc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/res-arc.png -------------------------------------------------------------------------------- /images/resnet.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/resnet.jpg -------------------------------------------------------------------------------- /images/rnn_arc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/rnn_arc.png -------------------------------------------------------------------------------- /images/self-feedback-loop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/self-feedback-loop.jpg -------------------------------------------------------------------------------- /images/skip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/skip.png -------------------------------------------------------------------------------- /images/transformer.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/transformer.jpeg -------------------------------------------------------------------------------- /images/transistor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/transistor.png -------------------------------------------------------------------------------- /images/trig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/trig.png -------------------------------------------------------------------------------- /images/typesofrnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/typesofrnn.png -------------------------------------------------------------------------------- /images/unet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/unet.png -------------------------------------------------------------------------------- /images/unetimg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/unetimg.png -------------------------------------------------------------------------------- /images/unetsd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/unetsd.png -------------------------------------------------------------------------------- /images/vis-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/vis-4.png -------------------------------------------------------------------------------- /images/vis-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/vis-cnn.png -------------------------------------------------------------------------------- /images/vis_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/vis_0.png -------------------------------------------------------------------------------- /images/vis_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/vis_1.png -------------------------------------------------------------------------------- /images/vis_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/vis_2.png -------------------------------------------------------------------------------- /images/word2vec-embed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/word2vec-embed.png -------------------------------------------------------------------------------- /images/word2vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saurabhaloneai/History-of-Deep-Learning/3bf46e64963175f6607ab97672c59ae8ec25832f/images/word2vec.png --------------------------------------------------------------------------------