├── .labml.yaml ├── utils ├── __init__.py └── sitemap.py ├── docs ├── CNAME ├── icon.png ├── cnn │ └── utils │ │ ├── cv-folds.png │ │ ├── overfitting.png │ │ ├── Underfitting.png │ │ ├── early-stopping.png │ │ ├── ground_truth.png │ │ └── Cross-validation.png ├── optimizers │ ├── noam_lr.png │ └── radam_r_t.png ├── gan │ └── stylegan │ │ └── generated_64.png ├── resnets │ ├── index.html │ ├── utils │ │ └── index.html │ └── models │ │ └── index.html ├── experiments │ └── index.html └── transformers │ ├── basic │ └── index.html │ └── relative_mha.html ├── labml_nn ├── resnets │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ └── mlp.py │ ├── utils │ │ ├── __init__.py │ │ ├── labelsmoothing.py │ │ ├── utils.py │ │ └── train.py │ ├── accuracy_graph_85.png │ ├── pretrained_nets.py │ └── resnet_net.py ├── experiments │ ├── __init__.py │ ├── cifar10.py │ └── mnist.py ├── transformers │ ├── basic │ │ └── __init__.py │ ├── relative_mha.py │ ├── glu_variants │ │ ├── __init__.py │ │ └── experiment.py │ ├── utils.py │ ├── gmlp │ │ ├── readme.md │ │ └── experiment.py │ ├── fnet │ │ ├── readme.md │ │ ├── __init__.py │ │ └── experiment.py │ ├── aft │ │ └── readme.md │ ├── fast_weights │ │ ├── readme.md │ │ ├── experiment.py │ │ └── token_wise.py │ ├── xl │ │ └── readme.md │ ├── knn │ │ ├── __init__.py │ │ └── train_model.py │ ├── switch │ │ └── readme.md │ ├── feedback │ │ └── readme.md │ ├── compressive │ │ └── readme.md │ ├── label_smoothing_loss.py │ ├── positional_encoding.py │ ├── mlm │ │ └── readme.md │ ├── __init__.py │ └── feed_forward.py ├── activations │ ├── __init__.py │ └── swish.py ├── cnn │ ├── save │ │ └── Basic_CNN-best-model │ │ │ └── model.pt │ ├── cross_validation.py │ ├── utils │ │ └── dataloader.py │ └── ray_tune.py ├── gan │ ├── wasserstein │ │ ├── readme.md │ │ ├── gradient_penalty │ │ │ ├── readme.md │ │ │ ├── experiment.py │ │ │ └── __init__.py │ │ └── experiment.py │ ├── original │ │ └── readme.md │ ├── cycle_gan │ │ └── readme.md │ ├── dcgan │ │ ├── readme.md │ │ └── __init__.py │ ├── __init__.py │ └── stylegan │ │ └── readme.md ├── hypernetworks │ ├── __init__.py │ └── experiment.py ├── normalization │ ├── weight_standardization │ │ ├── readme.md │ │ ├── conv2d.py │ │ ├── experiment.py │ │ └── __init__.py │ ├── __init__.py │ ├── instance_norm │ │ ├── readme.md │ │ ├── experiment.py │ │ └── __init__.py │ ├── layer_norm │ │ └── readme.md │ ├── group_norm │ │ ├── readme.md │ │ └── experiment.py │ └── batch_norm │ │ ├── cifar10.py │ │ ├── mnist.py │ │ └── readme.md ├── optimizers │ ├── readme.md │ ├── performance_test.py │ ├── adam_warmup.py │ ├── noam.py │ ├── adam_warmup_cosine_decay.py │ └── mnist_experiment.py ├── rl │ ├── __init__.py │ ├── ppo │ │ ├── readme.md │ │ └── gae.py │ └── dqn │ │ └── model.py ├── utils │ ├── tokenizer.py │ └── __init__.py ├── capsule_networks │ └── readme.md └── __init__.py ├── MANIFEST.in ├── images └── dqn.png ├── requirements.txt ├── .gitignore ├── Makefile ├── license └── setup.py /.labml.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | nn.labml.ai -------------------------------------------------------------------------------- 
/labml_nn/resnets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /labml_nn/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include readme.rst 2 | -------------------------------------------------------------------------------- /labml_nn/resnets/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /labml_nn/resnets/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /labml_nn/transformers/basic/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /labml_nn/activations/__init__.py: -------------------------------------------------------------------------------- 1 | from .swish import Swish 2 | -------------------------------------------------------------------------------- /docs/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/icon.png -------------------------------------------------------------------------------- /images/dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/images/dqn.png -------------------------------------------------------------------------------- /docs/cnn/utils/cv-folds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/cv-folds.png -------------------------------------------------------------------------------- /docs/optimizers/noam_lr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/optimizers/noam_lr.png -------------------------------------------------------------------------------- /docs/cnn/utils/overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/overfitting.png -------------------------------------------------------------------------------- /docs/optimizers/radam_r_t.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/optimizers/radam_r_t.png -------------------------------------------------------------------------------- /docs/cnn/utils/Underfitting.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/Underfitting.png -------------------------------------------------------------------------------- /docs/cnn/utils/early-stopping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/early-stopping.png -------------------------------------------------------------------------------- /docs/cnn/utils/ground_truth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/ground_truth.png -------------------------------------------------------------------------------- /docs/cnn/utils/Cross-validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/Cross-validation.png -------------------------------------------------------------------------------- /docs/gan/stylegan/generated_64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/gan/stylegan/generated_64.png -------------------------------------------------------------------------------- /labml_nn/resnets/accuracy_graph_85.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/labml_nn/resnets/accuracy_graph_85.png -------------------------------------------------------------------------------- /labml_nn/cnn/save/Basic_CNN-best-model/model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/labml_nn/cnn/save/Basic_CNN-best-model/model.pt -------------------------------------------------------------------------------- /labml_nn/gan/wasserstein/readme.md: -------------------------------------------------------------------------------- 1 | # [Wasserstein GAN - WGAN](https://nn.labml.ai/gan/wasserstein/index.html) 2 | 3 | This is an implementation of 4 | [Wasserstein GAN](https://arxiv.org/abs/1701.07875). 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.7 2 | labml>=0.4.94 3 | labml-helpers>=0.4.77 4 | torchvision 5 | numpy>=1.16.3 6 | matplotlib>=3.0.3 7 | einops>=0.3.0 8 | gym[atari] 9 | opencv-python 10 | Pillow>=6.2.1 11 | -------------------------------------------------------------------------------- /labml_nn/hypernetworks/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: HyperNetworks 4 | summary: A PyTorch implementation/tutorial of HyperLSTM introduced in paper HyperNetworks. 
5 | --- 6 | 7 | ## [HyperLSTM](hyper_lstm.html) 8 | """ -------------------------------------------------------------------------------- /labml_nn/gan/original/readme.md: -------------------------------------------------------------------------------- 1 | # [Generative Adversarial Networks - GAN](https://nn.labml.ai/gan/original/index.html) 2 | 3 | This is an annotated implementation of 4 | [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | __pycache__ 3 | .DS_Store 4 | .*.swp 5 | *.egg-info/ 6 | dist/ 7 | build/ 8 | .idea/* 9 | !.idea/dictionaries 10 | labml 11 | labml_helpers 12 | labml_samples 13 | data 14 | logs 15 | html/ 16 | diagrams/ -------------------------------------------------------------------------------- /labml_nn/transformers/relative_mha.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Relative Multi-Headed Attention 4 | summary: Relative Multi-Headed Attention from paper Transformer-XL. 5 | redirect: https://nn.labml.ai/transformers/xl/relative_mha.html 6 | --- 7 | """ 8 | -------------------------------------------------------------------------------- /labml_nn/gan/cycle_gan/readme.md: -------------------------------------------------------------------------------- 1 | # [Cycle GAN](https://nn.labml.ai/gan/cycle_gan/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation/tutorial of the paper 4 | [Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks](https://arxiv.org/abs/1703.10593). 5 | -------------------------------------------------------------------------------- /labml_nn/gan/dcgan/readme.md: -------------------------------------------------------------------------------- 1 | # [Deep Convolutional Generative Adversarial Networks - DCGAN](https://nn.labml.ai/gan/dcgan/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation of paper 4 | [Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](https://arxiv.org/abs/1511.06434). 
5 | -------------------------------------------------------------------------------- /labml_nn/activations/swish.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from labml_helpers.module import Module 5 | 6 | 7 | class Swish(Module): 8 | def __init__(self): 9 | super().__init__() 10 | self.sigmoid = nn.Sigmoid() 11 | 12 | def forward(self, x: torch.Tensor) -> torch.Tensor: 13 | return x * self.sigmoid(x) 14 | -------------------------------------------------------------------------------- /labml_nn/experiments/cifar10.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: CIFAR10 Experiment 4 | summary: > 5 | This is a reusable trainer for CIFAR10 dataset 6 | --- 7 | 8 | # CIFAR10 Experiment 9 | """ 10 | 11 | from labml_helpers.datasets.cifar10 import CIFAR10Configs as CIFAR10DatasetConfigs 12 | from labml_nn.experiments.mnist import MNISTConfigs 13 | 14 | 15 | class CIFAR10Configs(CIFAR10DatasetConfigs, MNISTConfigs): 16 | dataset_name: str = 'CIFAR10' 17 | -------------------------------------------------------------------------------- /labml_nn/transformers/glu_variants/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Gated Linear Units and Variants 4 | summary: > 5 | Train an auto-regressive transformer with Gated Linear Units and variants 6 | for the position-wise feedforward network (FFN). 7 | --- 8 | 9 | # Gated Linear Units and Variants 10 | 11 | * [Experiment that uses `labml.configs`](glu_variants/experiment.html) 12 | * [Simpler version from scratch](glu_variants/simple.html) 13 | """ 14 | -------------------------------------------------------------------------------- /labml_nn/normalization/weight_standardization/readme.md: -------------------------------------------------------------------------------- 1 | # [Weight Standardization](https://nn.labml.ai/normalization/weight_standardization/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation of Weight Standardization from the paper 4 | [Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://arxiv.org/abs/1903.10520). 5 | We also have an 6 | [annotated implementation of Batch-Channel Normalization](https://nn.labml.ai/normalization/batch_channel_norm/index.html). 7 | -------------------------------------------------------------------------------- /labml_nn/gan/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Generative Adversarial Networks 4 | summary: > 5 | A set of PyTorch implementations/tutorials of GANs. 
6 | --- 7 | 8 | # Generative Adversarial Networks 9 | 10 | * [Original GAN](original/index.html) 11 | * [GAN with deep convolutional network](dcgan/index.html) 12 | * [Cycle GAN](cycle_gan/index.html) 13 | * [Wasserstein GAN](wasserstein/index.html) 14 | * [Wasserstein GAN with Gradient Penalty](wasserstein/gradient_penalty/index.html) 15 | * [Style GAN 2](stylegan/index.html) 16 | """ -------------------------------------------------------------------------------- /labml_nn/optimizers/readme.md: -------------------------------------------------------------------------------- 1 | # [Optimizers](https://nn.labml.ai/optimizers/index.html) 2 | 3 | ## Optimizer Implementations 4 | * [Adam Optimizer](https://nn.labml.ai/optimizers/adam.html) 5 | * [AMSGrad Optimizer](https://nn.labml.ai/optimizers/amsgrad.html) 6 | * [Adam Optimizer with warmup](https://nn.labml.ai/optimizers/adam_warmup.html) 7 | * [Noam Optimizer](https://nn.labml.ai/optimizers/noam.html) 8 | * [Rectified Adam Optimizer](https://nn.labml.ai/optimizers/radam.html) 9 | * [AdaBelief Optimizer](https://nn.labml.ai/optimizers/ada_belief.html) 10 | -------------------------------------------------------------------------------- /labml_nn/normalization/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Normalization Layers 4 | summary: > 5 | A set of PyTorch implementations/tutorials of normalization layers. 6 | --- 7 | 8 | # Normalization Layers 9 | 10 | * [Batch Normalization](batch_norm/index.html) 11 | * [Layer Normalization](layer_norm/index.html) 12 | * [Instance Normalization](instance_norm/index.html) 13 | * [Group Normalization](group_norm/index.html) 14 | * [Weight Standardization](weight_standardization/index.html) 15 | * [Batch-Channel Normalization](batch_channel_norm/index.html) 16 | """ 17 | -------------------------------------------------------------------------------- /labml_nn/resnets/utils/labelsmoothing.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | from torch import nn 3 | 4 | class LabelSmoothingLoss(nn.Module): 5 | def __init__(self, epsilon= 0.5, reduction='mean'): 6 | super().__init__() 7 | self.epsilon = epsilon 8 | self.reduction = reduction 9 | 10 | def forward(self, pred, target): 11 | n = pred.size()[-1] 12 | log_pred = F.log_softmax(pred, dim=-1) 13 | loss = -log_pred.sum(dim=-1).mean() 14 | nll = F.nll_loss(log_pred, target, reduction=self.reduction) 15 | out = (1-self.epsilon)*nll + self.epsilon*(loss / n) 16 | return out 17 | -------------------------------------------------------------------------------- /labml_nn/normalization/instance_norm/readme.md: -------------------------------------------------------------------------------- 1 | # [Instance Normalization](https://nn.labml.ai/normalization/instance_norm/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation of 4 | [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022). 5 | 6 | Instance normalization was introduced to improve [style transfer](https://paperswithcode.com/task/style-transfer). 7 | It is based on the observation that stylization should not depend on the contrast of the content image. 8 | Since it's hard for a convolutional network to learn "contrast normalization", this paper 9 | introduces instance normalization which does that. 
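To make the idea concrete, here is a minimal sketch of what instance normalization computes (the annotated implementation linked above is the reference; it also supports optional learnable scale and shift parameters): each channel of each sample is normalized over its own spatial dimensions, so the result does not depend on the contrast of the content image.

```python
import torch


def instance_norm(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # `x` has shape `[batch_size, channels, height, width]`;
    # normalize each channel of each sample over its own spatial dimensions
    mean = x.mean(dim=(2, 3), keepdim=True)
    var = x.var(dim=(2, 3), keepdim=True, unbiased=False)
    return (x - mean) / torch.sqrt(var + eps)
```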
--------------------------------------------------------------------------------
/labml_nn/transformers/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Utilities for Transformer
4 | summary: A bunch of utility functions and classes for transformers.
5 | ---
6 | 
7 | # Utilities for Transformer
8 | """
9 | 
10 | import torch
11 | 
12 | 
13 | def subsequent_mask(seq_len):
14 |     """
15 |     ## Subsequent mask to mask out data from future (subsequent) time steps
16 |     """
17 |     mask = torch.tril(torch.ones(seq_len, seq_len)).to(torch.bool).unsqueeze(-1)
18 |     return mask
19 | 
20 | 
21 | def _subsequent_mask():
22 |     from labml.logger import inspect
23 |     inspect(subsequent_mask(10)[:, :, 0])
24 | 
25 | 
26 | if __name__ == '__main__':
27 |     _subsequent_mask()
28 | 
--------------------------------------------------------------------------------
/labml_nn/transformers/gmlp/readme.md:
--------------------------------------------------------------------------------
1 | # [Pay Attention to MLPs (gMLP)](https://nn.labml.ai/transformers/gmlp/index.html)
2 | 
3 | This is a [PyTorch](https://pytorch.org) implementation of the paper
4 | [Pay Attention to MLPs](https://papers.labml.ai/paper/2105.08050).
5 | 
6 | This paper introduces a Multilayer Perceptron (MLP) based architecture with gating,
7 | which they name **gMLP**. It consists of a stack of $L$ *gMLP* blocks.
8 | 
9 | Here is [the training code](https://nn.labml.ai/transformers/gmlp/experiment.html) for a gMLP-based autoregressive model.
10 | 
11 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/01bd941ac74c11eb890c1d9196651a4a)
12 | 
--------------------------------------------------------------------------------
/labml_nn/transformers/fnet/readme.md:
--------------------------------------------------------------------------------
1 | # [FNet: Mixing Tokens with Fourier Transforms](https://nn.labml.ai/transformers/fnet/index.html)
2 | 
3 | This is a [PyTorch](https://pytorch.org) implementation of the paper
4 | [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824).
5 | 
6 | This paper replaces the [self-attention layer](https://nn.labml.ai/transformers/mha.html) with two
7 | [Fourier transforms](https://en.wikipedia.org/wiki/Discrete_Fourier_transform) to
8 | *mix* tokens.
9 | This is 7X more efficient than self-attention.
10 | It achieves about 92% of the accuracy of
11 | [BERT](https://paperswithcode.com/method/bert) on the
12 | [GLUE benchmark](https://paperswithcode.com/dataset/glue).
13 | 
--------------------------------------------------------------------------------
/labml_nn/gan/stylegan/readme.md:
--------------------------------------------------------------------------------
1 | # [Style GAN 2](https://nn.labml.ai/gan/stylegan/index.html)
2 | 
3 | This is a [PyTorch](https://pytorch.org) implementation of the paper
4 | [Analyzing and Improving the Image Quality of StyleGAN](https://arxiv.org/abs/1912.04958)
5 | which introduces **Style GAN2**.
6 | Style GAN2 is an improvement over **Style GAN** from the paper
7 | [A Style-Based Generator Architecture for Generative Adversarial Networks](https://arxiv.org/abs/1812.04948).
8 | And Style GAN is based on **Progressive GAN** from the paper
9 | [Progressive Growing of GANs for Improved Quality, Stability, and Variation](https://arxiv.org/abs/1710.10196).
10 | All three papers are from the same authors from [NVIDIA AI](https://twitter.com/NVIDIAAI). 11 | -------------------------------------------------------------------------------- /labml_nn/transformers/aft/readme.md: -------------------------------------------------------------------------------- 1 | # [An Attention Free Transformer](https://nn.labml.ai/transformers/aft/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation of the paper 4 | [An Attention Free Transformer](https://papers.labml.ai/paper/2105.14103). 5 | 6 | This paper replaces the [self-attention layer](https://nn.labml.ai/transformers/mha.html) 7 | with a new efficient operation, 8 | that has memory complexity of O(Td), where T is the sequence length 9 | and $d$ is the dimensionality of embeddings. 10 | 11 | The paper introduces AFT along with AFT-local and AFT-conv. 12 | Here we have implemented AFT-local which pays attention to closeby tokens 13 | in an autoregressive model. 14 | 15 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/6348e504c3a511eba9529daa283fb495) 16 | -------------------------------------------------------------------------------- /labml_nn/rl/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Reinforcement Learning Algorithms 4 | summary: > 5 | This is a collection of PyTorch implementations/tutorials of reinforcement learning algorithms. 6 | It currently includes Proximal Policy Optimization, Generalized Advantage Estimation, and 7 | Deep Q Networks. 8 | --- 9 | 10 | # Reinforcement Learning Algorithms 11 | 12 | * [Proximal Policy Optimization](ppo) 13 | * [This is an experiment](ppo/experiment.html) that runs a PPO agent on Atari Breakout. 14 | * [Generalized advantage estimation](ppo/gae.html) 15 | * [Deep Q Networks](dqn) 16 | * [This is an experiment](dqn/experiment.html) that runs a DQN agent on Atari Breakout. 17 | * [Model](dqn/model.html) with dueling network 18 | * [Prioritized Experience Replay Buffer](dqn/replay_buffer.html) 19 | 20 | [This is the implementation for OpenAI game wrapper](game.html) using `multiprocessing`. 21 | """ -------------------------------------------------------------------------------- /labml_nn/transformers/fast_weights/readme.md: -------------------------------------------------------------------------------- 1 | # [Fast weights transformer](https://nn.labml.ai/transformers/fast_weights/index.html) 2 | 3 | This is an annotated implementation of the paper 4 | [Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch](https://arxiv.org/abs/2102.11174). 5 | 6 | Here is the [annotated implementation](https://nn.labml.ai/transformers/fast_weights/index.html). 7 | Here are [the training code](https://nn.labml.ai/transformers/fast_weights/experiment.html) 8 | and a notebook for training a fast weights transformer on the Tiny Shakespeare dataset. 
9 | 
10 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/fast_weights/experiment.ipynb)
11 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/928aadc0846c11eb85710242ac1c0002)
12 | 
--------------------------------------------------------------------------------
/labml_nn/gan/wasserstein/gradient_penalty/readme.md:
--------------------------------------------------------------------------------
1 | # [Gradient Penalty for Wasserstein GAN (WGAN-GP)](https://nn.labml.ai/gan/wasserstein/gradient_penalty/index.html)
2 | 
3 | This is an implementation of
4 | [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028).
5 | 
6 | [WGAN](https://nn.labml.ai/gan/wasserstein/index.html) suggests
7 | clipping weights to enforce the Lipschitz constraint
8 | on the discriminator network (critic).
9 | This and other weight constraints such as L2 norm clipping, weight normalization,
10 | and L1 or L2 weight decay have problems:
11 | 
12 | 1. Limiting the capacity of the discriminator
13 | 2. Exploding and vanishing gradients (without [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html)).
14 | 
15 | The paper [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028)
16 | proposes a better way to enforce the Lipschitz constraint: a gradient penalty.
17 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | clean: ## Clean
2 | 	rm -rf dist
3 | 	rm -rf build
4 | 	rm -rf *.egg-info
5 | 
6 | build: clean ## Build PIPy Package
7 | 	python setup.py sdist bdist_wheel
8 | 
9 | check-content: build ## List contents of PIPy Package
10 | 	tar -tvf dist/*.tar.gz
11 | 
12 | check: build ## Check PIPy Package
13 | 	twine check dist/*
14 | 
15 | upload: build ## Upload PIPy Package
16 | 	twine upload dist/*
17 | 
18 | install: ## Install from repo
19 | 	pip install -e .
20 | 
21 | uninstall: ## Uninstall
22 | 	pip uninstall labml_nn
23 | 
24 | docs: ## Render annotated HTML
25 | 	find ./docs/ -name "*.html" -type f -delete
26 | 	find ./docs/ -name "*.svg" -type f -delete
27 | 	python utils/sitemap.py
28 | 	python utils/diagrams.py
29 | 	cd labml_nn; pylit --remove_empty_sections --title_md -t ../../../pylit/templates/nn -d ../docs -w *
30 | 
31 | help: ## Show this help.
32 | 	@fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##//'
33 | 
34 | .PHONY: clean build check upload help docs
35 | .DEFAULT_GOAL := help
36 | 
--------------------------------------------------------------------------------
/license:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2020 Varuna Jayasiri
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/labml_nn/utils/tokenizer.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 | 
3 | from labml.configs import BaseConfigs, option
4 | 
5 | 
6 | class TokenizerConfigs(BaseConfigs):
7 |     """
8 | 
9 |     ## Tokenizer Configurations
10 | 
11 |     """
12 | 
13 |     tokenizer: Callable = 'character'
14 | 
15 |     def __init__(self):
16 |         super().__init__(_primary='tokenizer')
17 | 
18 | 
19 | @option(TokenizerConfigs.tokenizer)
20 | def basic_english():
21 |     """
22 |     ### Basic English tokenizer
23 | 
24 |     We use a character-level tokenizer in this experiment.
25 |     You can switch to this tokenizer by setting
26 | 
27 |     ```
28 |     'tokenizer': 'basic_english',
29 |     ```
30 | 
31 |     in the configurations dictionary when starting the experiment.
32 | 
33 |     """
34 |     from torchtext.data import get_tokenizer
35 |     return get_tokenizer('basic_english')
36 | 
37 | 
38 | def character_tokenizer(x: str):
39 |     """
40 |     ### Character-level tokenizer
41 |     """
42 |     return list(x)
43 | 
44 | 
45 | @option(TokenizerConfigs.tokenizer)
46 | def character():
47 |     """
48 |     Character-level tokenizer configuration
49 |     """
50 |     return character_tokenizer
51 | 
--------------------------------------------------------------------------------
/labml_nn/normalization/layer_norm/readme.md:
--------------------------------------------------------------------------------
1 | # [Layer Normalization](https://nn.labml.ai/normalization/layer_norm/index.html)
2 | 
3 | This is a [PyTorch](https://pytorch.org) implementation of
4 | [Layer Normalization](https://arxiv.org/abs/1607.06450).
5 | 
6 | ### Limitations of [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html)
7 | 
8 | * You need to maintain running means.
9 | * Tricky for RNNs. Do you need different normalizations for each step?
10 | * Doesn't work with small batch sizes;
11 | large NLP models are usually trained with small batch sizes.
12 | * Need to compute means and variances across devices in distributed training.
13 | 
14 | ## Layer Normalization
15 | 
16 | Layer normalization is a simpler normalization method that works
17 | on a wider range of settings.
18 | Layer normalization transforms the inputs to have zero mean and unit variance
19 | across the features.
20 | *Note that batch normalization fixes the zero mean and unit variance for each element across the batch.*
21 | Layer normalization does it for each input in the batch across all elements.
22 | 
23 | Layer normalization is generally used for NLP tasks.
24 | 
25 | We have used layer normalization in most of the
26 | [transformer implementations](https://nn.labml.ai/transformers/gpt/index.html).
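As a rough sketch of the difference (the annotated implementation linked above is the reference and typically also has learned gain and bias parameters), layer normalization computes its statistics per input over the feature dimension, so it needs no running means and no batch statistics:

```python
import torch


def layer_norm(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # `x` has shape `[batch_size, ..., features]`;
    # each input is normalized over its own feature dimension,
    # so nothing depends on the rest of the batch
    mean = x.mean(dim=-1, keepdim=True)
    var = x.var(dim=-1, keepdim=True, unbiased=False)
    return (x - mean) / torch.sqrt(var + eps)
```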
--------------------------------------------------------------------------------
/labml_nn/capsule_networks/readme.md:
--------------------------------------------------------------------------------
1 | # [Capsule Networks](https://nn.labml.ai/capsule_networks/index.html)
2 | 
3 | This is a [PyTorch](https://pytorch.org) implementation/tutorial of
4 | [Dynamic Routing Between Capsules](https://arxiv.org/abs/1710.09829).
5 | 
6 | Capsule network is a neural network architecture that embeds features
7 | as capsules and routes them with a voting mechanism to the next layer of capsules.
8 | 
9 | Unlike in other implementations of models, we've included a sample, because
10 | it is difficult to understand some concepts with just the modules.
11 | [This is the annotated code for a model that uses capsules to classify the MNIST dataset](mnist.html).
12 | 
13 | This file holds the implementations of the core modules of Capsule Networks.
14 | 
15 | I used [jindongwang/Pytorch-CapsuleNet](https://github.com/jindongwang/Pytorch-CapsuleNet) to clarify some
16 | confusions I had with the paper.
17 | 
18 | Here's a notebook for training a Capsule Network on the MNIST dataset.
19 | 
20 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/capsule_networks/mnist.ipynb)
21 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/e7c08e08586711ebb3e30242ac1c0002)
22 | 
--------------------------------------------------------------------------------
/labml_nn/rl/ppo/readme.md:
--------------------------------------------------------------------------------
1 | # [Proximal Policy Optimization - PPO](https://nn.labml.ai/rl/ppo/index.html)
2 | 
3 | This is a [PyTorch](https://pytorch.org) implementation of
4 | [Proximal Policy Optimization - PPO](https://arxiv.org/abs/1707.06347).
5 | 
6 | PPO is a policy gradient method for reinforcement learning.
7 | Simple policy gradient methods do a single gradient update per sample (or a set of samples).
8 | Doing multiple gradient steps for a single sample causes problems
9 | because the policy deviates too much, producing a bad policy.
10 | PPO lets us do multiple gradient updates per sample by trying to keep the
11 | policy close to the policy that was used to sample data.
12 | It does so by clipping gradient flow if the updated policy
13 | is not close to the policy used to sample the data.
14 | 
15 | You can find an experiment that uses it [here](https://nn.labml.ai/rl/ppo/experiment.html).
16 | The experiment uses [Generalized Advantage Estimation](https://nn.labml.ai/rl/ppo/gae.html).
17 | 
18 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/rl/ppo/experiment.ipynb)
19 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/6eff28a0910e11eb9b008db315936e2f)
20 | 
--------------------------------------------------------------------------------
/labml_nn/gan/wasserstein/experiment.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: WGAN experiment with MNIST
4 | summary: This experiment generates MNIST images using a convolutional neural network.
5 | --- 6 | 7 | # WGAN experiment with MNIST 8 | """ 9 | from labml import experiment 10 | 11 | from labml.configs import calculate 12 | # Import configurations from [DCGAN experiment](../dcgan/index.html) 13 | from labml_nn.gan.dcgan import Configs 14 | 15 | # Import [Wasserstein GAN losses](./index.html) 16 | from labml_nn.gan.wasserstein import GeneratorLoss, DiscriminatorLoss 17 | 18 | # Set configurations options for Wasserstein GAN losses 19 | calculate(Configs.generator_loss, 'wasserstein', lambda c: GeneratorLoss()) 20 | calculate(Configs.discriminator_loss, 'wasserstein', lambda c: DiscriminatorLoss()) 21 | 22 | 23 | def main(): 24 | # Create configs object 25 | conf = Configs() 26 | # Create experiment 27 | experiment.create(name='mnist_wassertein_dcgan', comment='test') 28 | # Override configurations 29 | experiment.configs(conf, 30 | { 31 | 'discriminator': 'cnn', 32 | 'generator': 'cnn', 33 | 'label_smoothing': 0.01, 34 | 'generator_loss': 'wasserstein', 35 | 'discriminator_loss': 'wasserstein', 36 | }) 37 | 38 | # Start the experiment and run training loop 39 | with experiment.start(): 40 | conf.run() 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /labml_nn/normalization/group_norm/readme.md: -------------------------------------------------------------------------------- 1 | # [Group Normalization](https://nn.labml.ai/normalization/group_norm/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation of 4 | the [Group Normalization](https://arxiv.org/abs/1803.08494) paper. 5 | 6 | [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html) works well for large enough batch sizes 7 | but not well for small batch sizes, because it normalizes over the batch. 8 | Training large models with large batch sizes is not possible due to the memory capacity of the 9 | devices. 10 | 11 | This paper introduces Group Normalization, which normalizes a set of features together as a group. 12 | This is based on the observation that classical features such as 13 | [SIFT](https://en.wikipedia.org/wiki/Scale-invariant_feature_transform) and 14 | [HOG](https://en.wikipedia.org/wiki/Histogram_of_oriented_gradients) are group-wise features. 15 | The paper proposes dividing feature channels into groups and then separately normalizing 16 | all channels within each group. 17 | 18 | Here's a [CIFAR 10 classification model](https://nn.labml.ai/normalization/group_norm/experiment.html) that uses instance normalization. 19 | 20 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/group_norm/experiment.ipynb) 21 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/081d950aa4e011eb8f9f0242ac1c0002) 22 | [![WandB](https://img.shields.io/badge/wandb-run-yellow)](https://wandb.ai/vpj/cifar10/runs/310etthp) -------------------------------------------------------------------------------- /labml_nn/transformers/xl/readme.md: -------------------------------------------------------------------------------- 1 | # [Transformer XL](https://nn.labml.ai/transformers/xl/index.html) 2 | 3 | This is an implementation of 4 | [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 5 | in [PyTorch](https://pytorch.org). 
6 | 7 | Transformer has a limited attention span, 8 | equal to the length of the sequence trained in parallel. 9 | All these positions have a fixed positional encoding. 10 | Transformer XL increases this attention span by letting 11 | each of the positions pay attention to precalculated past embeddings. 12 | For instance if the context length is $l$, it will keep the embeddings of 13 | all layers for previous batch of length $l$ and feed them to current step. 14 | If we use fixed-positional encodings these pre-calculated embeddings will have 15 | the same positions as the current context. 16 | They introduce relative positional encoding, where the positional encodings 17 | are introduced at the attention calculation. 18 | 19 | Annotated implementation of relative multi-headed attention is in [`relative_mha.py`](https://nn.labml.ai/transformers/xl/relative_mha.html). 20 | 21 | Here's [the training code](https://nn.labml.ai/transformers/xl/experiment.html) and a notebook for training a transformer XL model on Tiny Shakespeare dataset. 22 | 23 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/xl/experiment.ipynb) 24 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/d3b6760c692e11ebb6a70242ac1c0002) 25 | -------------------------------------------------------------------------------- /utils/sitemap.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import git 4 | 5 | HOME = Path('./labml_nn') 6 | REPO = git.Repo('.') 7 | 8 | 9 | def collect(path: Path): 10 | if path.is_file(): 11 | try: 12 | commit = next(iter(REPO.iter_commits(paths=path))) 13 | except StopIteration: 14 | return [] 15 | 16 | html = path.relative_to(HOME) 17 | if html.stem == '__init__': 18 | html = html.parent / 'index.html' 19 | else: 20 | html = html.parent / f'{html.stem}.html' 21 | 22 | return [{'path': str(html), 'date': str(commit.committed_datetime.date())}] 23 | 24 | urls = [] 25 | for f in path.iterdir(): 26 | urls += collect(f) 27 | 28 | return urls 29 | 30 | 31 | def main(): 32 | urls = [] 33 | for f in HOME.iterdir(): 34 | urls += collect(f) 35 | 36 | urls = [f''' 37 | 38 | https://nn.labml.ai/{u['path']} 39 | {u['date']}T16:30:00+00:00 40 | 1.00 41 | 42 | ''' for u in urls] 43 | 44 | urls = '\n'.join(urls) 45 | xml = f''' 46 | 47 | 52 | {urls} 53 | 54 | ''' 55 | 56 | with open(str(HOME.parent / 'docs' / 'sitemap.xml'), 'w') as f: 57 | f.write(xml) 58 | 59 | 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("readme.md", "r") as f: 4 | long_description = f.read() 5 | 6 | setuptools.setup( 7 | name='labml-nn', 8 | version='0.4.99', 9 | author="Varuna Jayasiri, Nipun Wijerathne", 10 | author_email="vpjayasiri@gmail.com, hnipun@gmail.com", 11 | description="A collection of PyTorch implementations of neural network architectures and layers.", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/lab-ml/nn", 15 | project_urls={ 16 | 'Documentation': 'https://lab-ml.com/' 17 | }, 18 | packages=setuptools.find_packages(exclude=('labml', 'labml.*', 19 | 'labml_samples', 'labml_samples.*', 20 | 'labml_helpers', 
'labml_helpers.*', 21 | 'test', 22 | 'test.*')), 23 | install_requires=['labml>=0.4.110', 24 | 'labml-helpers>=0.4.77', 25 | 'torch', 26 | 'einops', 27 | 'numpy'], 28 | classifiers=[ 29 | "Programming Language :: Python :: 3", 30 | "License :: OSI Approved :: MIT License", 31 | 'Intended Audience :: Developers', 32 | 'Intended Audience :: Science/Research', 33 | 'Topic :: Scientific/Engineering', 34 | 'Topic :: Scientific/Engineering :: Mathematics', 35 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 36 | 'Topic :: Software Development', 37 | 'Topic :: Software Development :: Libraries', 38 | 'Topic :: Software Development :: Libraries :: Python Modules', 39 | ], 40 | keywords='machine learning', 41 | ) 42 | -------------------------------------------------------------------------------- /labml_nn/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Utilities 4 | summary: A bunch of utility functions and classes 5 | --- 6 | 7 | # Utilities 8 | """ 9 | 10 | import copy 11 | 12 | from torch.utils.data import Dataset, IterableDataset 13 | 14 | from labml_helpers.module import M, TypedModuleList 15 | 16 | 17 | def clone_module_list(module: M, n: int) -> TypedModuleList[M]: 18 | """ 19 | ## Clone Module 20 | 21 | Make a `nn.ModuleList` with clones of a given module 22 | """ 23 | return TypedModuleList([copy.deepcopy(module) for _ in range(n)]) 24 | 25 | 26 | def cycle_dataloader(data_loader): 27 | """ 28 | 29 | ## Cycle Data Loader 30 | 31 | Infinite loader that recycles the data loader after each epoch 32 | """ 33 | while True: 34 | for batch in data_loader: 35 | yield batch 36 | 37 | 38 | class MapStyleDataset(Dataset): 39 | """ 40 | 41 | ## Map Style Dataset 42 | 43 | This converts an [`IterableDataset`](https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset) 44 | to a [map-style dataset](https://pytorch.org/docs/stable/data.html#map-style-datasets) 45 | so that we can shuffle the dataset. 46 | 47 | *This only works when the dataset size is small and can be held in memory.* 48 | """ 49 | 50 | def __init__(self, dataset: IterableDataset): 51 | # Load the data to memory 52 | self.data = [d for d in dataset] 53 | 54 | def __getitem__(self, idx: int): 55 | """Get a sample by index""" 56 | return self.data[idx] 57 | 58 | def __iter__(self): 59 | """Create an iterator""" 60 | return iter(self.data) 61 | 62 | def __len__(self): 63 | """Size of the dataset""" 64 | return len(self.data) 65 | -------------------------------------------------------------------------------- /labml_nn/optimizers/performance_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Test performance of Adam implementations 4 | summary: This experiment compares performance of Adam implementations. 
5 | --- 6 | 7 | # Performance testing Adam 8 | 9 | ``` 10 | TorchAdam warmup...[DONE] 222.59ms 11 | TorchAdam...[DONE] 1,356.01ms 12 | MyAdam warmup...[DONE] 119.15ms 13 | MyAdam...[DONE] 1,192.89ms 14 | ``` 15 | 16 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ngowaAsADj8VdZfBifu_6L6rtjGoEeoR?usp=sharing) 17 | """ 18 | 19 | import torch 20 | import torch.nn as nn 21 | from labml_helpers.device import DeviceInfo 22 | from torch.optim import Adam as TorchAdam 23 | 24 | from labml import monit 25 | from labml_nn.optimizers.adam import Adam as MyAdam 26 | from labml_nn.optimizers.mnist_experiment import Model 27 | 28 | 29 | def test(): 30 | device_info = DeviceInfo(use_cuda=True, cuda_device=0) 31 | print(device_info) 32 | inp = torch.randn((64, 1, 28, 28), device=device_info.device) 33 | target = torch.ones(64, dtype=torch.long, device=device_info.device) 34 | loss_func = nn.CrossEntropyLoss() 35 | model = Model().to(device_info.device) 36 | my_adam = MyAdam(model.parameters()) 37 | torch_adam = TorchAdam(model.parameters()) 38 | loss = loss_func(model(inp), target) 39 | loss.backward() 40 | with monit.section('MyAdam warmup'): 41 | for i in range(100): 42 | my_adam.step() 43 | with monit.section('MyAdam'): 44 | for i in range(1000): 45 | my_adam.step() 46 | with monit.section('TorchAdam warmup'): 47 | for i in range(100): 48 | torch_adam.step() 49 | with monit.section('TorchAdam'): 50 | for i in range(1000): 51 | torch_adam.step() 52 | 53 | 54 | if __name__ == '__main__': 55 | test() 56 | -------------------------------------------------------------------------------- /labml_nn/transformers/knn/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: k-Nearest Neighbor Language Models 4 | summary: > 5 | This is a simple PyTorch implementation/tutorial of the paper 6 | Generalization through Memorization: Nearest Neighbor Language Models using FAISS. 7 | It runs a kNN model on the final transformer layer embeddings to improve the 8 | loss of transformer based language models. 9 | It's also great for domain adaptation without pre-training. 10 | --- 11 | 12 | # k-Nearest Neighbor Language Models 13 | 14 | This is a [PyTorch](https://pytorch.org) implementation of the paper 15 | [Generalization through Memorization: Nearest Neighbor Language Models](https://arxiv.org/abs/1911.00172). 16 | It uses k-nearest neighbors to improve perplexity of autoregressive transformer models. 17 | 18 | An autoregressive language model estimates $p(w_t | \color{yellowgreen}{c_t})$, 19 | where $w_t$ is the token at step $t$ 20 | and $c_t$ is the context, $\color{yellowgreen}{c_t} = (w_1, w_2, ..., w_{t-1})$. 21 | 22 | This paper, improves $p(w_t | \color{yellowgreen}{c_t})$ using a k-nearest neighbor search 23 | on key-value pairs $\big(f(c_i), w_i\big)$, with search key $f(\color{yellowgreen}{c_t})$. 24 | Here $f(\color{yellowgreen}{c_t})$ is an embedding of the context $\color{yellowgreen}{c_t}$. 25 | The paper (and this implementation) uses the **input to the feed-forward layer of the 26 | final layer of the transformer** as $f(\color{yellowgreen}{c_t})$. 27 | 28 | We use [FAISS](https://github.com/facebookresearch/faiss) to index $f(c_i)$. 
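As a rough sketch of the indexing step (the shapes and names below are illustrative; [building the index](build_index.html) is the actual implementation), the keys $f(c_i)$ go into a FAISS index and query embeddings retrieve the nearest keys:

```python
import faiss
import numpy as np

# illustrative shapes: `keys` holds f(c_i) for the indexed contexts
d = 512
keys = np.random.rand(10_000, d).astype('float32')

index = faiss.IndexFlatL2(d)  # exact L2 nearest-neighbor index over d-dimensional keys
index.add(keys)               # add all f(c_i)

# query with a batch of f(c_t) embeddings for the 10 nearest keys
queries = np.random.rand(8, d).astype('float32')
distances, neighbors = index.search(queries, 10)
```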
29 | 
30 | ### Implementation
31 | 
32 | So to run $k$NN-LM we need to:
33 | 
34 | * [Train a transformer model](train_model.html)
35 | * [Build an index](build_index.html) of $\big(f(c_i), w_i\big)$
36 | * [Evaluate kNN-LM](eval_knn.html) using $k$NN search on $\big(f(c_i), w_i\big)$
37 |   with $f(\color{yellowgreen}{c_t})$
38 | 
39 | This experiment uses a small dataset so that we can run this without using up a few hundred gigabytes
40 | of disk space for the index.
41 | 
42 | The official implementation of $k$NN-LM can be found [here](https://github.com/urvashik/knnlm).
43 | """
44 | 
--------------------------------------------------------------------------------
/labml_nn/transformers/switch/readme.md:
--------------------------------------------------------------------------------
1 | # [Switch Transformer](https://nn.labml.ai/transformers/switch/index.html)
2 | 
3 | This is a miniature [PyTorch](https://pytorch.org) implementation of the paper
4 | [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961).
5 | Our implementation only has a few million parameters and doesn't do model parallel distributed training.
6 | It does single GPU training, but we implement the concept of switching as described in the paper.
7 | 
8 | The Switch Transformer uses different parameters for each token by switching among parameters
9 | based on the token.
10 | Therefore, only a fraction of parameters are chosen for each token.
11 | So you can have more parameters but less computational cost.
12 | 
13 | The switching happens at the Position-wise Feedforward network (FFN) of each transformer block.
14 | The position-wise feedforward network consists of two fully connected layers applied sequentially.
15 | In the Switch Transformer we have multiple FFNs (multiple experts),
16 | and we choose which one to use based on a router.
17 | The output is a set of probabilities for picking an FFN,
18 | and we pick the one with the highest probability and only evaluate that.
19 | So essentially the computational cost is the same as having a single FFN.
20 | In our implementation this doesn't parallelize well when you have many or large FFNs since it's all
21 | happening on a single GPU.
22 | In a distributed setup you would have each FFN (each very large) on a different device.
23 | 
24 | The paper introduces another loss term to balance load among the experts (FFNs) and
25 | discusses dropping tokens when routing is not balanced.
26 | 
27 | Here's [the training code](experiment.html) and a notebook for training a switch transformer on the Tiny Shakespeare dataset.
28 | 
29 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/switch/experiment.ipynb)
30 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/c4656c605b9311eba13d0242ac1c0002)
31 | 
--------------------------------------------------------------------------------
/labml_nn/normalization/weight_standardization/conv2d.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: 2D Convolution Layer with Weight Standardization
4 | summary: >
5 |  A PyTorch implementation/tutorial of a 2D Convolution Layer with Weight Standardization.
6 | --- 7 | 8 | # 2D Convolution Layer with Weight Standardization 9 | 10 | This is an implementation of a 2 dimensional convolution layer with [Weight Standardization](./index.html) 11 | """ 12 | 13 | import torch 14 | import torch.nn as nn 15 | from torch.nn import functional as F 16 | 17 | from labml_nn.normalization.weight_standardization import weight_standardization 18 | 19 | 20 | class Conv2d(nn.Conv2d): 21 | """ 22 | ## 2D Convolution Layer 23 | 24 | This extends the standard 2D Convolution layer and standardize the weights before the convolution step. 25 | """ 26 | def __init__(self, in_channels, out_channels, kernel_size, 27 | stride=1, 28 | padding=0, 29 | dilation=1, 30 | groups: int = 1, 31 | bias: bool = True, 32 | padding_mode: str = 'zeros', 33 | eps: float = 1e-5): 34 | super(Conv2d, self).__init__(in_channels, out_channels, kernel_size, 35 | stride=stride, 36 | padding=padding, 37 | dilation=dilation, 38 | groups=groups, 39 | bias=bias, 40 | padding_mode=padding_mode) 41 | self.eps = eps 42 | 43 | def forward(self, x: torch.Tensor): 44 | return F.conv2d(x, weight_standardization(self.weight, self.eps), self.bias, self.stride, 45 | self.padding, self.dilation, self.groups) 46 | 47 | 48 | def _test(): 49 | """ 50 | A simple test to verify the tensor sizes 51 | """ 52 | conv2d = Conv2d(10, 20, 5) 53 | from labml.logger import inspect 54 | inspect(conv2d.weight) 55 | import torch 56 | inspect(conv2d(torch.zeros(10, 10, 100, 100))) 57 | 58 | 59 | if __name__ == '__main__': 60 | _test() 61 | -------------------------------------------------------------------------------- /labml_nn/normalization/batch_norm/cifar10.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: CIFAR10 Experiment to try Group Normalization 4 | summary: > 5 | This trains is a simple convolutional neural network that uses group normalization 6 | to classify CIFAR10 images. 
7 | --- 8 | 9 | # CIFAR10 Experiment for Group Normalization 10 | """ 11 | 12 | import torch.nn as nn 13 | 14 | from labml import experiment 15 | from labml.configs import option 16 | from labml_helpers.module import Module 17 | from labml_nn.experiments.cifar10 import CIFAR10Configs 18 | from labml_nn.normalization.batch_norm import BatchNorm 19 | 20 | 21 | class Model(Module): 22 | def __init__(self): 23 | super().__init__() 24 | layers = [] 25 | in_channels = 3 26 | for block in [[64, 64], [128, 128], [256, 256, 256], [512, 512, 512], [512, 512, 512]]: 27 | for channels in block: 28 | layers += [nn.Conv2d(in_channels, channels, kernel_size=3, padding=1), 29 | BatchNorm(channels), 30 | nn.ReLU(inplace=True)] 31 | in_channels = channels 32 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 33 | layers += [nn.AvgPool2d(kernel_size=1, stride=1)] 34 | self.layers = nn.Sequential(*layers) 35 | self.fc = nn.Linear(512, 10) 36 | 37 | def __call__(self, x): 38 | x = self.layers(x) 39 | x = x.view(x.shape[0], -1) 40 | return self.fc(x) 41 | 42 | 43 | @option(CIFAR10Configs.model) 44 | def model(c: CIFAR10Configs): 45 | """ 46 | ### Create model 47 | """ 48 | return Model().to(c.device) 49 | 50 | 51 | def main(): 52 | # Create experiment 53 | experiment.create(name='cifar10', comment='batch norm') 54 | # Create configurations 55 | conf = CIFAR10Configs() 56 | # Load configurations 57 | experiment.configs(conf, { 58 | 'optimizer.optimizer': 'Adam', 59 | 'optimizer.learning_rate': 2.5e-4, 60 | 'train_batch_size': 64, 61 | }) 62 | # Start the experiment and run the training loop 63 | with experiment.start(): 64 | conf.run() 65 | 66 | 67 | # 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /labml_nn/transformers/feedback/readme.md: -------------------------------------------------------------------------------- 1 | # [Feedback Transformer](https://nn.labml.ai/transformers/feedback/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation of the paper 4 | [Accessing Higher-level Representations in Sequential Transformers with Feedback Memory](https://arxiv.org/abs/2002.09402). 5 | 6 | Normal transformers process tokens in parallel. Each transformer layer pays attention 7 | to the outputs of the previous layer. 8 | Feedback transformer pays attention to the output of all layers in previous steps. 9 | So this adds recurrence, and we need to process token-by-token. 10 | This slows down the training significantly (about 5X - 10X depending on the sequence length). 11 | However, when predicting Feedback Transformer is faster because you can predict the next token 12 | if you cache the memory vectors. 13 | 14 | In order to speed up the training the paper discusses starting with a short sequence length and 15 | gradually increasing it. 16 | They also discuss using a pretrained parallel transformer as the starting point. 17 | 18 | The original feedback transformer doesn't keep the outputs of all layers. 19 | Instead it keeps weighted sum of the output of all layers. 20 | This reduces the memory used for caching during prediction. 21 | The first half of this file implements this. 22 | 23 | The updated feedback transformer shares weights used 24 | to calculate keys and values among the layers. 25 | We then calculate the keys and values for each step only once and keep 26 | them cached. 27 | The [second half](#shared_kv) of this file implements this. 
28 | We implemented a custom PyTorch function to improve performance. 29 | 30 | Here's [the training code](experiment.html) and a notebook for training a feedback transformer on Tiny Shakespeare dataset. 31 | 32 | [Colab Notebook](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/feedback/experiment.ipynb) 33 | 34 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/feedback/experiment.ipynb) 35 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/d8eb9416530a11eb8fb50242ac1c0002) 36 | -------------------------------------------------------------------------------- /labml_nn/cnn/cross_validation.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torchvision 4 | import torchvision.transforms as transforms 5 | from torch.utils.data.sampler import SubsetRandomSampler 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import torch.optim as optim 9 | from torchsummary import summary 10 | import torch.nn as nn 11 | 12 | # from models.mlp import MLP 13 | # from utils.utils import * 14 | # from utils.train_dataset import * 15 | #from nutsflow import Take, Consume 16 | #from nutsml import * 17 | from utils.dataloader import * 18 | from models.cnn import CNN 19 | from utils.train import Trainer 20 | 21 | from utils.cv_train import * 22 | 23 | # Check if GPU is available 24 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 25 | print("Device: " + str(device)) 26 | 27 | # Cifar 10 Datasets location 28 | save='./data/Cifar10' 29 | 30 | # Transformations train 31 | transform_train = transforms.Compose( 32 | [transforms.ToTensor(), 33 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 34 | 35 | # Load train dataset and dataloader 36 | trainset = LoadCifar10DatasetTrain(save, transform_train) 37 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, 38 | shuffle=True, num_workers=4) 39 | 40 | # Transformations test (for inference later) 41 | transform_test = transforms.Compose( 42 | [transforms.ToTensor(), 43 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 44 | 45 | # Load test dataset and dataloader (for inference later) 46 | testset = LoadCifar10DatasetTest(save, transform_test) 47 | testloader = torch.utils.data.DataLoader(testset, batch_size=64, 48 | shuffle=False, num_workers=4) 49 | 50 | # Specify loss function 51 | cost = nn.CrossEntropyLoss() 52 | 53 | epochs=25 #10 54 | splits = 4 #5 55 | 56 | # Training - Cross-validation 57 | history = cross_val_train(cost, trainset, epochs, splits, device=device) 58 | 59 | # Inference 60 | best_model, best_val_accuracy = retreive_best_trial() 61 | print("Best Validation Accuracy = %.3f"%(best_val_accuracy)) 62 | 63 | # Testing 64 | accuracy = Test(best_model, cost, testloader, device=device) 65 | print("Test Accuracy = %.3f"%(accuracy['val_acc'])) 66 | -------------------------------------------------------------------------------- /labml_nn/resnets/pretrained_nets.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | 3 | from utils.train import Trainer # Default custom training class 4 | from models.resnet import * 5 | from torchvision import models 6 | 7 | # GPU Check 8 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 9 | print("Device: " + str(device)) 10 | 
11 | # Use different train/test data augmentations 12 | transform_test = transforms.Compose( 13 | [transforms.ToTensor(), 14 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 15 | 16 | # Get Cifar 10 Datasets 17 | save='./data/Cifar10' 18 | transform_train = transforms.Compose([ 19 | transforms.RandomHorizontalFlip(p=1.0), 20 | transforms.RandomRotation(20), 21 | transforms.RandomCrop(32, (2, 2), pad_if_needed=False, padding_mode='constant'), 22 | transforms.ToTensor(), 23 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 24 | 25 | # Get Cifar 10 Datasets 26 | trainset = torchvision.datasets.CIFAR10(root=save, train=True, download=True, transform=transform_train) 27 | testset = torchvision.datasets.CIFAR10(root=save, train=False, download=True, transform=transform_test) 28 | 29 | # Get Cifar 10 Dataloaders 30 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, 31 | shuffle=True, num_workers=4) 32 | 33 | testloader = torch.utils.data.DataLoader(testset, batch_size=64, 34 | shuffle=False, num_workers=4) 35 | 36 | ################################# 37 | # Load the pre-trained model 38 | ################################# 39 | 40 | model_ft = models.resnet18(pretrained=True) 41 | num_ftrs = model_ft.fc.in_features 42 | model_ft.fc = nn.Sequential( 43 | nn.Dropout(0.5), 44 | nn.Linear(num_ftrs, 10) 45 | ) 46 | 47 | 48 | model_ft = model_ft.to(device) 49 | 50 | # Loss function 51 | cost = nn.CrossEntropyLoss() 52 | 53 | # Optimizer 54 | lr = 0.0005 55 | # opt = optim.SGD(model_ft.parameters(), lr=lr, momentum=0.9) 56 | opt = torch.optim.Adam(model_ft.parameters(), lr=lr, betas=(0.9, 0.95), weight_decay=1e-4) #0.0005 l2_factor.item() 57 | 58 | # Create a trainer 59 | trainer = Trainer(model_ft, opt, cost, name="Transfer-learning",lr=lr , use_lr_schedule=True, device=device) 60 | 61 | # Run training 62 | epochs = 25 63 | trainer.Train(trainloader, epochs, testloader=testloader) 64 | # trainer.Train(trainloader, epochs) # check train error 65 | 66 | print('done') 67 | -------------------------------------------------------------------------------- /labml_nn/transformers/compressive/readme.md: -------------------------------------------------------------------------------- 1 | # [Compressive Transformer](https://nn.labml.ai/transformers/compressive/index.html) 2 | 3 | This is an implementation of 4 | [Compressive Transformers for Long-Range Sequence Modelling](https://arxiv.org/abs/1911.05507) 5 | in [PyTorch](https://pytorch.org). 6 | 7 | This is an extension of [Transformer XL](https://nn.labml.ai/transformers/xl/index.html) where past memories 8 | are compressed to give a longer attention range. 9 | That is, the furthest $n_{cm} c$ memories are compressed into 10 | $n_{cm}$ memories, where $c$ is the compression rate. 11 | 12 | ## Compression operation 13 | 14 | The compression operation is defined as 15 | $f_c: \mathbb{R}^{nc \times d} \rightarrow \mathbb{R}^{n \times d}$. 16 | The paper introduces multiple choices for $f_c$ and we have only implemented 17 | 1D convolution which seems to give the best results. 18 | Each layer has a separate compression operation $f_c^{(i)}$ where 19 | $i$ is the layer number. 20 | 21 | ## Training compression operation 22 | 23 | Since training compression with BPTT requires maintaining 24 | a very large computational graph (many time steps), the paper proposes 25 | an *auto-encoding loss* and an *attention reconstruction loss*. 
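As a rough illustration of the 1D-convolution compression function $f_c$ described in the previous section (the class name and tensor layout are assumptions, not this repository's exact code):

```python
import torch
import torch.nn as nn


class ConvCompression(nn.Module):
    """Compress `n * c` memories of size `d_model` into `n` memories, where
    `c` is the compression rate (illustrative sketch only)."""

    def __init__(self, d_model: int, compression_rate: int):
        super().__init__()
        # Kernel size and stride both equal to the compression rate `c`
        self.conv = nn.Conv1d(d_model, d_model,
                              kernel_size=compression_rate, stride=compression_rate)

    def forward(self, mem: torch.Tensor) -> torch.Tensor:
        # `mem` has shape `[seq_len, batch_size, d_model]`
        mem = mem.permute(1, 2, 0)   # -> `[batch_size, d_model, seq_len]`
        mem = self.conv(mem)         # -> `[batch_size, d_model, seq_len // c]`
        return mem.permute(2, 0, 1)  # -> `[seq_len // c, batch_size, d_model]`
```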
26 | The auto-encoding loss decodes the original memories from the compressed memories 27 | and calculates the loss. 28 | Attention reconstruction loss computes the multi-headed attention results 29 | on the compressed memory and on uncompressed memory and gets a mean squared error 30 | between them. 31 | We have implemented the latter here since it gives better results. 32 | 33 | This implementation uses pre-layer normalization 34 | while the paper uses post-layer normalization. 35 | Pre-layer norm does the layer norm before FFN[../feedforward.html) and 36 | self-attention, and the pass-through in the residual connection is not normalized. 37 | This is supposed to be more stable in standard transformer setups. 38 | 39 | Here are [the training code](https://nn.labml.ai/transformers/compressive/experiment.html) and a notebook for training a compressive transformer 40 | model on the Tiny Shakespeare dataset. 41 | 42 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/compressive/experiment.ipynb) 43 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/0d9b5338726c11ebb7c80242ac1c0002) 44 | -------------------------------------------------------------------------------- /labml_nn/transformers/label_smoothing_loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Label Smoothing Loss 4 | summary: > 5 | This is an implementation of label smoothing loss, that can be used as 6 | an alternative to cross entropy loss for improved accuracy. 7 | --- 8 | 9 | # Label Smoothing Loss 10 | """ 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | import torch 14 | import torch.nn as nn 15 | 16 | from labml_helpers.module import Module 17 | 18 | 19 | class LabelSmoothingLoss(Module): 20 | def __init__(self, size: int, padding_idx: int, smoothing: float = 0.0): 21 | super().__init__() 22 | self.loss = nn.KLDivLoss(reduction='sum') 23 | self.padding_idx = padding_idx 24 | self.confidence = 1.0 - smoothing 25 | self.smoothing = smoothing 26 | self.size = size 27 | self.true_dist = None 28 | 29 | def forward(self, x: torch.Tensor, target: torch.Tensor): 30 | assert x.shape[1] == self.size 31 | true_dist = x.clone() 32 | true_dist.fill_(self.smoothing / (self.size - 2)) 33 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 34 | true_dist[:, self.padding_idx] = 0 35 | mask = torch.nonzero(target == self.padding_idx, as_tuple=False) 36 | if mask.dim() > 0: 37 | true_dist.index_fill_(0, mask.squeeze(), 0.0) 38 | self.true_dist = true_dist 39 | return self.loss(x, true_dist.detach()) 40 | 41 | 42 | def _test_label_smoothing(): 43 | smooth_loss = LabelSmoothingLoss(5, 0, 0.4) 44 | predict = torch.tensor([[0, 0.2, 0.7, 0.1, 0], 45 | [0, 0.2, 0.7, 0.1, 0], 46 | [0, 0.2, 0.7, 0.1, 0]], dtype=torch.float) 47 | _ = smooth_loss(predict.log(), 48 | torch.tensor([2, 1, 0], dtype=torch.long)) 49 | 50 | # Show the target distributions expected by the system. 
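    # For example, with `size=5`, `padding_idx=0` and `smoothing=0.4`, the confidence is
    # $1 - 0.4 = 0.6$ and each remaining class gets $0.4 / (5 - 2) \approx 0.1333$,
    # so the target row for label 2 is roughly `[0, 0.1333, 0.6, 0.1333, 0.1333]`
    # (the padding column is zeroed, and rows whose label equals the padding index are zeroed entirely).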
51 | plt.imshow(smooth_loss.true_dist) 52 | plt.show() 53 | 54 | smooth_loss = LabelSmoothingLoss(5, 0, 0.1) 55 | 56 | def loss_sample(x): 57 | d = x + 3 * 1 58 | predict2 = torch.tensor([[0, x / d, 1 / d, 1 / d, 1 / d], 59 | ], dtype=torch.float) 60 | # print(predict) 61 | return smooth_loss(predict2.log(), 62 | torch.tensor([1], dtype=torch.long)).item() 63 | 64 | plt.plot(np.arange(1, 100), [loss_sample(x) for x in range(1, 100)]) 65 | plt.show() 66 | 67 | 68 | if __name__ == '__main__': 69 | _test_label_smoothing() 70 | -------------------------------------------------------------------------------- /labml_nn/optimizers/adam_warmup.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Adam optimizer with warm-up 4 | summary: A simple PyTorch implementation/tutorial of Adam optimizer with warm-up. 5 | --- 6 | 7 | # Adam Optimizer with Warmup 8 | 9 | This extends [AMSGrad optimizer](amsgrad.html) and adds a warmup stage. 10 | """ 11 | 12 | from typing import Dict 13 | 14 | from labml_nn.optimizers import WeightDecay 15 | from labml_nn.optimizers.amsgrad import AMSGrad 16 | 17 | 18 | class AdamWarmup(AMSGrad): 19 | """ 20 | ## Adam Optimizer with Warmup 21 | 22 | This class extends from AMSGrad optimizer defined in [`amsgrad.py`](amsgrad.html). 23 | """ 24 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, 25 | weight_decay: WeightDecay = WeightDecay(), 26 | optimized_update: bool = True, 27 | amsgrad=False, warmup=0, defaults=None): 28 | """ 29 | ### Initialize the optimizer 30 | 31 | * `params` is the list of parameters 32 | * `lr` is the learning rate $\alpha$ 33 | * `betas` is a tuple of ($\beta_1$, $\beta_2$) 34 | * `eps` is $\hat{\epsilon}$ or $\epsilon$ based on `optimized_update` 35 | * `weight_decay` is an instance of class `WeightDecay` defined in [`__init__.py`](index.html) 36 | * 'optimized_update' is a flag whether to optimize the bias correction of the second moment 37 | by doing it after adding $\epsilon$ 38 | * `amsgrad` is a flag indicating whether to use AMSGrad or fallback to plain Adam 39 | * `warmup` number of warmup steps 40 | * `defaults` is a dictionary of default for group values. 41 | This is useful when you want to extend the class `AdamWarmup`. 42 | """ 43 | 44 | defaults = {} if defaults is None else defaults 45 | defaults.update(dict(warmup=warmup)) 46 | super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults) 47 | 48 | def get_lr(self, state: Dict[str, any], group: Dict[str, any]): 49 | """ 50 | ### Get learning-rate 51 | 52 | $$\alpha \min \bigg(1, \frac{t}{w}\bigg)$$ 53 | where $w$ is the number of warmup steps. 54 | """ 55 | # If we are in warmup stage 56 | if group['warmup'] > state['step']: 57 | # A linearly increasing learning rate from $0$ to $\alpha$ 58 | return 1e-8 + state['step'] * group['lr'] / group['warmup'] 59 | else: 60 | # Constant learning rate $\alpha$ 61 | return group['lr'] 62 | -------------------------------------------------------------------------------- /labml_nn/normalization/weight_standardization/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: CIFAR10 Experiment to try Weight Standardization and Batch-Channel Normalization 4 | summary: > 5 | This trains is a VGG net that uses weight standardization and batch-channel normalization 6 | to classify CIFAR10 images. 
7 | --- 8 | 9 | # CIFAR10 Experiment to try Weight Standardization and Batch-Channel Normalization 10 | """ 11 | 12 | import torch.nn as nn 13 | 14 | from labml import experiment 15 | from labml.configs import option 16 | from labml_helpers.module import Module 17 | from labml_nn.experiments.cifar10 import CIFAR10Configs 18 | from labml_nn.normalization.batch_channel_norm import BatchChannelNorm 19 | from labml_nn.normalization.weight_standardization.conv2d import Conv2d 20 | 21 | 22 | class Model(Module): 23 | """ 24 | ### Model 25 | 26 | A VGG model that use [Weight Standardization](./index.html) and 27 | [Batch-Channel Normalization](../batch_channel_norm/index.html). 28 | """ 29 | def __init__(self): 30 | super().__init__() 31 | layers = [] 32 | in_channels = 3 33 | for block in [[64, 64], [128, 128], [256, 256, 256], [512, 512, 512], [512, 512, 512]]: 34 | for channels in block: 35 | layers += [Conv2d(in_channels, channels, kernel_size=3, padding=1), 36 | BatchChannelNorm(channels, 32), 37 | nn.ReLU(inplace=True)] 38 | in_channels = channels 39 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 40 | layers += [nn.AvgPool2d(kernel_size=1, stride=1)] 41 | self.layers = nn.Sequential(*layers) 42 | self.fc = nn.Linear(512, 10) 43 | 44 | def __call__(self, x): 45 | x = self.layers(x) 46 | x = x.view(x.shape[0], -1) 47 | return self.fc(x) 48 | 49 | 50 | @option(CIFAR10Configs.model) 51 | def model(c: CIFAR10Configs): 52 | """ 53 | ### Create model 54 | """ 55 | return Model().to(c.device) 56 | 57 | 58 | def main(): 59 | # Create experiment 60 | experiment.create(name='cifar10', comment='weight standardization') 61 | # Create configurations 62 | conf = CIFAR10Configs() 63 | # Load configurations 64 | experiment.configs(conf, { 65 | 'optimizer.optimizer': 'Adam', 66 | 'optimizer.learning_rate': 2.5e-4, 67 | 'train_batch_size': 64, 68 | }) 69 | # Start the experiment and run the training loop 70 | with experiment.start(): 71 | conf.run() 72 | 73 | 74 | # 75 | if __name__ == '__main__': 76 | main() 77 | -------------------------------------------------------------------------------- /labml_nn/resnets/utils/utils.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | 3 | import torch 4 | import torchvision 5 | import torchvision.transforms as transforms 6 | 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | 13 | from sklearn.model_selection import KFold 14 | from torch.utils.data.sampler import SubsetRandomSampler 15 | 16 | 17 | 18 | # Plot the loss of multiple runs together 19 | def PlotLosses(losses, titles, save=None): 20 | fig = plt.figure() 21 | fig.set_size_inches(14, 22) 22 | # Plot results on 3 subgraphs 23 | # subplot integers: 24 | # nrows 25 | # ncols 26 | # index 27 | sublplot_str_start = "" + str(len(losses)) + "1" 28 | 29 | for i in range(len(losses)): 30 | subplot = sublplot_str_start + str(i+1) 31 | loss = losses[i] 32 | title = titles[i] 33 | 34 | ax = plt.subplot(int(subplot)) 35 | ax.plot(range(len(loss)), loss) 36 | ax.set_xlabel("Epoch") 37 | ax.set_title(title) 38 | ax.set_ylabel("Loss") 39 | 40 | # Save Figure 41 | if save: 42 | plt.savefig(save) 43 | else: 44 | plt.show() 45 | 46 | 47 | 48 | def ClassSpecificTestCifar10(net, testdata, device=None): 49 | classes = ('plane', 'car', 'bird', 'cat','deer', 'dog', 'frog', 'horse', 'ship', 'truck') 50 | class_correct = list(0. for i in range(10)) 51 | class_total = list(0. 
for i in range(10)) 52 | with torch.no_grad(): 53 | for data in testdata: 54 | if device: 55 | images, labels = data[0].to(device), data[1].to(device) 56 | else: 57 | images, labels = data 58 | 59 | outputs = net(images) 60 | _, predicted = torch.max(outputs, 1) 61 | c = (predicted == labels).squeeze() 62 | for i in range(4): 63 | label = labels[i] 64 | class_correct[label] += c[i].item() 65 | class_total[label] += 1 66 | 67 | # Print out 68 | for i in range(10): 69 | print('Accuracy of %5s : %2d %%' % ( 70 | classes[i], 100 * class_correct[i] / class_total[i])) 71 | 72 | 73 | 74 | def GetActivation(name="relu"): 75 | if name == "relu": 76 | return nn.ReLU() 77 | elif name == "leakyrelu": 78 | return nn.LeakyReLU() 79 | elif name == "Sigmoid": 80 | return nn.Sigmoid() 81 | elif name == "Tanh": 82 | return nn.Tanh() 83 | elif name == "Identity": 84 | return nn.Identity() 85 | else: 86 | return nn.ReLU() -------------------------------------------------------------------------------- /labml_nn/transformers/positional_encoding.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Fixed Positional Encodings 4 | summary: > 5 | Implementation with explanation of fixed positional encodings as 6 | described in paper Attention is All You Need. 7 | --- 8 | 9 | # Fixed Positional Encodings 10 | 11 | The positional encoding encodes the position along the sequence into 12 | a vector of size `d_model`. 13 | 14 | \begin{align} 15 | PE_{p,2i} &= sin\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg) \\ 16 | PE_{p,2i + 1} &= cos\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg) 17 | \end{align} 18 | 19 | Where $1 \leq 2i, 2i + 1 \leq d_{model}$ 20 | are the feature indexes in the encoding, and $p$ is the position. 
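For example, a quick shape check with the helper defined below (illustrative usage only):

```python
pe = get_positional_encoding(d_model=512, max_len=100)
assert pe.shape == (100, 1, 512)  # `[max_len, 1, d_model]`, ready to be added to embeddings
```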
21 | """ 22 | 23 | import math 24 | 25 | import numpy as np 26 | import torch 27 | import torch.nn as nn 28 | 29 | from labml_helpers.module import Module 30 | 31 | 32 | class PositionalEncoding(Module): 33 | def __init__(self, d_model: int, dropout_prob: float, max_len: int = 5000): 34 | super().__init__() 35 | self.dropout = nn.Dropout(dropout_prob) 36 | 37 | self.register_buffer('positional_encodings', get_positional_encoding(d_model, max_len), False) 38 | 39 | def forward(self, x: torch.Tensor): 40 | pe = self.positional_encodings[:x.shape[0]].detach().requires_grad_(False) 41 | x = x + pe 42 | x = self.dropout(x) 43 | return x 44 | 45 | 46 | def get_positional_encoding(d_model: int, max_len: int = 5000): 47 | # Empty encodings vectors 48 | encodings = torch.zeros(max_len, d_model) 49 | # Position indexes 50 | position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1) 51 | # $2 * i$ 52 | two_i = torch.arange(0, d_model, 2, dtype=torch.float32) 53 | # $10000^{\frac{2i}{d_{model}}$ 54 | div_term = torch.exp(two_i * -(math.log(10000.0) / d_model)) 55 | # $PE_{p,2i} = sin\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg)$ 56 | encodings[:, 0::2] = torch.sin(position * div_term) 57 | # $PE_{p,2i + 1} = cos\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg)$ 58 | encodings[:, 1::2] = torch.cos(position * div_term) 59 | 60 | # Add batch dimension 61 | encodings = encodings.unsqueeze(1).requires_grad_(False) 62 | 63 | return encodings 64 | 65 | 66 | def _test_positional_encoding(): 67 | import matplotlib.pyplot as plt 68 | 69 | plt.figure(figsize=(15, 5)) 70 | pe = get_positional_encoding(20, 100) 71 | plt.plot(np.arange(100), pe[:, 0, 4:8].numpy()) 72 | plt.legend(["dim %d" % p for p in [4, 5, 6, 7]]) 73 | plt.title("Positional encoding") 74 | plt.show() 75 | 76 | 77 | if __name__ == '__main__': 78 | _test_positional_encoding() 79 | -------------------------------------------------------------------------------- /labml_nn/normalization/group_norm/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: CIFAR10 Experiment to try Group Normalization 4 | summary: > 5 | This trains is a simple convolutional neural network that uses group normalization 6 | to classify CIFAR10 images. 
7 | --- 8 | 9 | # CIFAR10 Experiment for Group Normalization 10 | """ 11 | 12 | import torch.nn as nn 13 | 14 | from labml import experiment 15 | from labml.configs import option 16 | from labml_helpers.module import Module 17 | from labml_nn.experiments.cifar10 import CIFAR10Configs 18 | from labml_nn.normalization.group_norm import GroupNorm 19 | 20 | 21 | class Model(Module): 22 | """ 23 | ### VGG model for CIFAR-10 classification 24 | """ 25 | 26 | def __init__(self, groups: int = 32): 27 | super().__init__() 28 | layers = [] 29 | # RGB channels 30 | in_channels = 3 31 | # Number of channels in each layer in each block 32 | for block in [[64, 64], [128, 128], [256, 256, 256], [512, 512, 512], [512, 512, 512]]: 33 | # Convolution, Normalization and Activation layers 34 | for channels in block: 35 | layers += [nn.Conv2d(in_channels, channels, kernel_size=3, padding=1), 36 | GroupNorm(groups, channels), 37 | nn.ReLU(inplace=True)] 38 | in_channels = channels 39 | # Max pooling at end of each block 40 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 41 | 42 | # Create a sequential model with the layers 43 | self.layers = nn.Sequential(*layers) 44 | # Final logits layer 45 | self.fc = nn.Linear(512, 10) 46 | 47 | def __call__(self, x): 48 | # The VGG layers 49 | x = self.layers(x) 50 | # Reshape for classification layer 51 | x = x.view(x.shape[0], -1) 52 | # Final linear layer 53 | return self.fc(x) 54 | 55 | 56 | class Configs(CIFAR10Configs): 57 | # Number of groups 58 | groups: int = 16 59 | 60 | 61 | @option(Configs.model) 62 | def model(c: Configs): 63 | """ 64 | ### Create model 65 | """ 66 | return Model(c.groups).to(c.device) 67 | 68 | 69 | def main(): 70 | # Create experiment 71 | experiment.create(name='cifar10', comment='group norm') 72 | # Create configurations 73 | conf = Configs() 74 | # Load configurations 75 | experiment.configs(conf, { 76 | 'optimizer.optimizer': 'Adam', 77 | 'optimizer.learning_rate': 2.5e-4, 78 | }) 79 | # Start the experiment and run the training loop 80 | with experiment.start(): 81 | conf.run() 82 | 83 | 84 | # 85 | if __name__ == '__main__': 86 | main() 87 | -------------------------------------------------------------------------------- /labml_nn/normalization/batch_norm/mnist.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: MNIST Experiment to try Batch Normalization 4 | summary: > 5 | This trains is a simple convolutional neural network that uses batch normalization 6 | to classify MNIST digits. 7 | --- 8 | 9 | # MNIST Experiment for Batch Normalization 10 | """ 11 | 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import torch.utils.data 15 | 16 | from labml import experiment 17 | from labml.configs import option 18 | from labml_helpers.module import Module 19 | from labml_nn.experiments.mnist import MNISTConfigs 20 | from labml_nn.normalization.batch_norm import BatchNorm 21 | 22 | 23 | class Model(Module): 24 | """ 25 | ### Model definition 26 | """ 27 | 28 | def __init__(self): 29 | super().__init__() 30 | # Note that we omit the bias parameter 31 | self.conv1 = nn.Conv2d(1, 20, 5, 1, bias=False) 32 | # Batch normalization with 20 channels (output of convolution layer). 33 | # The input to this layer will have shape `[batch_size, 20, height(24), width(24)]` 34 | self.bn1 = BatchNorm(20) 35 | # 36 | self.conv2 = nn.Conv2d(20, 50, 5, 1, bias=False) 37 | # Batch normalization with 50 channels. 
38 | # The input to this layer will have shape `[batch_size, 50, height(8), width(8)]` 39 | self.bn2 = BatchNorm(50) 40 | # 41 | self.fc1 = nn.Linear(4 * 4 * 50, 500, bias=False) 42 | # Batch normalization with 500 channels (output of fully connected layer). 43 | # The input to this layer will have shape `[batch_size, 500]` 44 | self.bn3 = BatchNorm(500) 45 | # 46 | self.fc2 = nn.Linear(500, 10) 47 | 48 | def __call__(self, x: torch.Tensor): 49 | x = F.relu(self.bn1(self.conv1(x))) 50 | x = F.max_pool2d(x, 2, 2) 51 | x = F.relu(self.bn2(self.conv2(x))) 52 | x = F.max_pool2d(x, 2, 2) 53 | x = x.view(-1, 4 * 4 * 50) 54 | x = F.relu(self.bn3(self.fc1(x))) 55 | return self.fc2(x) 56 | 57 | 58 | @option(MNISTConfigs.model) 59 | def model(c: MNISTConfigs): 60 | """ 61 | ### Create model 62 | 63 | We use [`MNISTConfigs`](../../experiments/mnist.html#MNISTConfigs) configurations 64 | and set a new function to calculate the model. 65 | """ 66 | return Model().to(c.device) 67 | 68 | 69 | def main(): 70 | # Create experiment 71 | experiment.create(name='mnist_batch_norm') 72 | # Create configurations 73 | conf = MNISTConfigs() 74 | # Load configurations 75 | experiment.configs(conf, {'optimizer.optimizer': 'Adam'}) 76 | # Start the experiment and run the training loop 77 | with experiment.start(): 78 | conf.run() 79 | 80 | 81 | # 82 | if __name__ == '__main__': 83 | main() 84 | -------------------------------------------------------------------------------- /labml_nn/normalization/instance_norm/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: CIFAR10 Experiment to try Instance Normalization 4 | summary: > 5 | This trains a simple convolutional neural network that uses instance normalization 6 | to classify CIFAR10 images. 7 | --- 8 | 9 | # CIFAR10 Experiment for Instance Normalization 10 | 11 | This demonstrates the use of an instance normalization layer in a convolutional 12 | neural network for classification. Note that instance normalization was designed for 13 | style transfer, and this is only a demo. 
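As a rough reference for what the layer computes (ignoring the learnable scale and shift; this is an illustrative sketch, not the `InstanceNorm` class imported below):

```python
import torch


def instance_norm_reference(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # `x` has shape `[batch_size, channels, height, width]`;
    # normalize each channel of each sample over the spatial dimensions
    mean = x.mean(dim=(2, 3), keepdim=True)
    var = x.var(dim=(2, 3), unbiased=False, keepdim=True)
    return (x - mean) / torch.sqrt(var + eps)
```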
14 | """ 15 | 16 | import torch.nn as nn 17 | 18 | from labml import experiment 19 | from labml.configs import option 20 | from labml_helpers.module import Module 21 | from labml_nn.experiments.cifar10 import CIFAR10Configs 22 | from labml_nn.normalization.instance_norm import InstanceNorm 23 | 24 | 25 | class Model(Module): 26 | """ 27 | ### VGG model for CIFAR-10 classification 28 | """ 29 | 30 | def __init__(self): 31 | super().__init__() 32 | layers = [] 33 | # RGB channels 34 | in_channels = 3 35 | # Number of channels in each layer in each block 36 | for block in [[64, 64], [128, 128], [256, 256, 256], [512, 512, 512], [512, 512, 512]]: 37 | # Convolution, Normalization and Activation layers 38 | for channels in block: 39 | layers += [nn.Conv2d(in_channels, channels, kernel_size=3, padding=1), 40 | InstanceNorm(channels), 41 | nn.ReLU(inplace=True)] 42 | in_channels = channels 43 | # Max pooling at end of each block 44 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 45 | 46 | # Create a sequential model with the layers 47 | self.layers = nn.Sequential(*layers) 48 | # Final logits layer 49 | self.fc = nn.Linear(512, 10) 50 | 51 | def __call__(self, x): 52 | # The VGG layers 53 | x = self.layers(x) 54 | # Reshape for classification layer 55 | x = x.view(x.shape[0], -1) 56 | # Final linear layer 57 | return self.fc(x) 58 | 59 | 60 | @option(CIFAR10Configs.model) 61 | def model(c: CIFAR10Configs): 62 | """ 63 | ### Create model 64 | """ 65 | return Model().to(c.device) 66 | 67 | 68 | def main(): 69 | # Create experiment 70 | experiment.create(name='cifar10', comment='instance norm') 71 | # Create configurations 72 | conf = CIFAR10Configs() 73 | # Load configurations 74 | experiment.configs(conf, { 75 | 'optimizer.optimizer': 'Adam', 76 | 'optimizer.learning_rate': 2.5e-4, 77 | }) 78 | # Start the experiment and run the training loop 79 | with experiment.start(): 80 | conf.run() 81 | 82 | 83 | # 84 | if __name__ == '__main__': 85 | main() 86 | -------------------------------------------------------------------------------- /labml_nn/resnets/resnet_net.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | 3 | # Custom classes 4 | from models.mlp import MLP 5 | from utils.train import Trainer 6 | from models.resnet import * 7 | 8 | # GPU Check 9 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 10 | print("Device: " + str(device)) 11 | 12 | #Use different train/test data augmentations 13 | transform_test = transforms.Compose( 14 | [transforms.ToTensor(), 15 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 16 | 17 | transform_train = transforms.Compose([ 18 | transforms.RandomHorizontalFlip(p=1.0), 19 | transforms.RandomRotation(20), 20 | transforms.RandomCrop(32, (2, 2), pad_if_needed=False, padding_mode='constant'), 21 | transforms.ToTensor(), 22 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 23 | 24 | 25 | # Get Cifar 10 Datasets 26 | save='./data/Cifar10' 27 | trainset = torchvision.datasets.CIFAR10(root=save, train=True, download=True, transform=transform_train) 28 | testset = torchvision.datasets.CIFAR10(root=save, train=False, download=True, transform=transform_test) 29 | 30 | # Get Cifar 10 Dataloaders 31 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, 32 | shuffle=True, num_workers=4) 33 | 34 | testloader = torch.utils.data.DataLoader(testset, batch_size=64, 35 | shuffle=False, num_workers=4) 36 | 37 | epochs = 50 38 | 39 | ################################# 40 
| # Create the assignment Resnet (part a) 41 | ################################# 42 | def MyResNet(): 43 | resnet = ResNet(in_features= [32, 32, 3], 44 | num_class=10, 45 | feature_channel_list = [128, 256, 512], 46 | batch_norm= True, 47 | num_stacks=1 48 | ) 49 | 50 | # Create MLP 51 | # Calculate the input shape 52 | s = resnet.GetCurShape() 53 | in_features = s[0]*s[1]*s[2] 54 | 55 | mlp = MLP(in_features, 56 | 10, 57 | [], #512, 1024, 512 58 | [], 59 | use_batch_norm=False, 60 | use_dropout=False, 61 | use_softmax=False, 62 | device=device) 63 | 64 | resnet.AddMLP(mlp) 65 | return resnet 66 | 67 | model = MyResNet() 68 | model.to(device=device) 69 | summary(model, (3, 32,32)) 70 | 71 | # Optimizer 72 | opt = torch.optim.Adam(model.parameters(), lr=0.0005, betas=(0.9, 0.95), weight_decay=1e-8) #0.0005 l2_factor.item() 73 | 74 | # Loss function 75 | cost = nn.CrossEntropyLoss() 76 | 77 | # Create a trainer 78 | trainer = Trainer(model, opt, cost, name="MyResNet", device=device, use_lr_schedule =True) 79 | 80 | # Run training 81 | trainer.Train(trainloader, epochs, testloader=testloader) 82 | 83 | print('done') 84 | -------------------------------------------------------------------------------- /labml_nn/resnets/models/mlp.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | class MLP(nn.Module): 7 | def __init__(self 8 | , in_features 9 | , out_features 10 | , hidden_layers 11 | , actv_func 12 | , pre_module_list=None 13 | , use_dropout=False 14 | , use_batch_norm=False 15 | , use_softmax=True 16 | , device="cpu" 17 | ): 18 | super(MLP, self).__init__() 19 | 20 | self.in_features = in_features 21 | self.out_features = out_features 22 | self.num_hidden_layers = len(hidden_layers) 23 | self.hidden_layers = hidden_layers 24 | self.use_dropout = use_dropout 25 | self.use_batch_norm = use_batch_norm 26 | self.actv_func = actv_func 27 | self.use_softmax = use_softmax 28 | 29 | self.device = device 30 | 31 | # Add on to another model 32 | if pre_module_list: 33 | self.module_list = pre_module_list 34 | else: 35 | self.module_list = nn.ModuleList() 36 | 37 | self.build_() 38 | 39 | # Send to gpu 40 | self.to(self.device) 41 | 42 | def build_(self): 43 | # Activation Functions for Fully connected layers # 44 | # Start with input dimensions 45 | dim = self.in_features 46 | for i in range(self.num_hidden_layers): 47 | # Create a fully connected layer between the last layer 48 | # and the current hidden layer 49 | self.module_list.append(nn.Linear(dim, self.hidden_layers[i])) 50 | # Update the current dimension 51 | dim = self.hidden_layers[i] 52 | 53 | if self.use_batch_norm: 54 | self.module_list.append( nn.BatchNorm1d(dim, affine=True) ) 55 | 56 | # Add the Activation function 57 | self.module_list.append( self.GetActivation(name=self.actv_func[i]) ) 58 | 59 | if self.use_dropout: 60 | self.module_list.append( nn.Dropout(p=0.10) ) 61 | 62 | # Fully connect to output dimensions 63 | if dim != self.out_features: 64 | self.module_list.append( nn.Linear(dim, self.out_features) ) 65 | 66 | 67 | def forward(self, x): 68 | # Flatten the 2d image into 1d 69 | # Also convert into float for FC layer 70 | x = torch.flatten(x.float(), start_dim=1) 71 | 72 | # Apply each layer in the module list 73 | for i in range( len(self.module_list) ): 74 | x = self.module_list[i](x) 75 | 76 | return x 77 | 78 | def GetActivation(self, name="relu"): 79 | if name == "relu": 80 | return nn.ReLU() 81 | elif name == 
"leakyrelu": 82 | return nn.LeakyReLU() 83 | elif name == "Sigmoid": 84 | return nn.Sigmoid() 85 | elif name == "Tanh": 86 | return nn.Tanh() 87 | elif name == "Identity": 88 | return nn.Identity() 89 | else: 90 | return nn.ReLU() -------------------------------------------------------------------------------- /labml_nn/gan/wasserstein/gradient_penalty/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: WGAN-GP experiment with MNIST 4 | summary: This experiment generates MNIST images using convolutional neural network. 5 | --- 6 | 7 | # WGAN-GP experiment with MNIST 8 | """ 9 | 10 | import torch 11 | 12 | from labml import experiment, tracker 13 | # Import configurations from [Wasserstein experiment](../experiment.html) 14 | from labml_nn.gan.wasserstein.experiment import Configs as OriginalConfigs 15 | # 16 | from labml_nn.gan.wasserstein.gradient_penalty import GradientPenalty 17 | 18 | 19 | class Configs(OriginalConfigs): 20 | """ 21 | ## Configuration class 22 | 23 | We extend [original GAN implementation](../../original/experiment.html) and override the discriminator (critic) loss 24 | calculation to include gradient penalty. 25 | """ 26 | 27 | # Gradient penalty coefficient $\lambda$ 28 | gradient_penalty_coefficient: float = 10.0 29 | # 30 | gradient_penalty = GradientPenalty() 31 | 32 | def calc_discriminator_loss(self, data: torch.Tensor): 33 | """ 34 | This overrides the original discriminator loss calculation and 35 | includes gradient penalty. 36 | """ 37 | # Require gradients on $x$ to calculate gradient penalty 38 | data.requires_grad_() 39 | # Sample $z \sim p(z)$ 40 | latent = self.sample_z(data.shape[0]) 41 | # $D(x)$ 42 | f_real = self.discriminator(data) 43 | # $D(G_\theta(z))$ 44 | f_fake = self.discriminator(self.generator(latent).detach()) 45 | # Get discriminator losses 46 | loss_true, loss_false = self.discriminator_loss(f_real, f_fake) 47 | # Calculate gradient penalties in training mode 48 | if self.mode.is_train: 49 | gradient_penalty = self.gradient_penalty(data, f_real) 50 | tracker.add("loss.gp.", gradient_penalty) 51 | loss = loss_true + loss_false + self.gradient_penalty_coefficient * gradient_penalty 52 | # Skip gradient penalty otherwise 53 | else: 54 | loss = loss_true + loss_false 55 | 56 | # Log stuff 57 | tracker.add("loss.discriminator.true.", loss_true) 58 | tracker.add("loss.discriminator.false.", loss_false) 59 | tracker.add("loss.discriminator.", loss) 60 | 61 | return loss 62 | 63 | 64 | def main(): 65 | # Create configs object 66 | conf = Configs() 67 | # Create experiment 68 | experiment.create(name='mnist_wassertein_gp_dcgan') 69 | # Override configurations 70 | experiment.configs(conf, 71 | { 72 | 'discriminator': 'cnn', 73 | 'generator': 'cnn', 74 | 'label_smoothing': 0.01, 75 | 'generator_loss': 'wasserstein', 76 | 'discriminator_loss': 'wasserstein', 77 | 'discriminator_k': 5, 78 | }) 79 | 80 | # Start the experiment and run training loop 81 | with experiment.start(): 82 | conf.run() 83 | 84 | 85 | if __name__ == '__main__': 86 | main() 87 | -------------------------------------------------------------------------------- /labml_nn/gan/wasserstein/gradient_penalty/__init__.py: -------------------------------------------------------------------------------- 1 | r""" 2 | --- 3 | title: Gradient Penalty for Wasserstein GAN (WGAN-GP) 4 | summary: > 5 | An annotated PyTorch implementation/tutorial of 6 | Improved Training of Wasserstein GANs. 
7 | --- 8 | 9 | # Gradient Penalty for Wasserstein GAN (WGAN-GP) 10 | 11 | This is an implementation of 12 | [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028). 13 | 14 | [WGAN](../index.html) suggests clipping weights to enforce Lipschitz constraint 15 | on the discriminator network (critic). 16 | This and other weight constraints like L2 norm clipping, weight normalization, 17 | L1, L2 weight decay have problems: 18 | 19 | 1. Limiting the capacity of the discriminator 20 | 2. Exploding and vanishing gradients (without [Batch Normalization](../../../normalization/batch_norm/index.html)). 21 | 22 | The paper [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028) 23 | proposal a better way to improve Lipschitz constraint, a gradient penalty. 24 | 25 | $$\mathcal{L}_{GP} = \lambda \underset{\hat{x} \sim \mathbb{P}_{\hat{x}}}{\mathbb{E}} 26 | \Big[ \big(\Vert \nabla_{\hat{x}} D(\hat{x}) \Vert_2 - 1\big)^2 \Big] 27 | $$ 28 | 29 | where $\lambda$ is the penalty weight and 30 | 31 | \begin{align} 32 | x &\sim \mathbb{P}_r \\ 33 | z &\sim p(z) \\ 34 | \epsilon &\sim U[0,1] \\ 35 | \tilde{x} &\leftarrow G_\theta (z) \\ 36 | \hat{x} &\leftarrow \epsilon x + (1 - \epsilon) \tilde{x} 37 | \end{align} 38 | 39 | That is we try to keep the gradient norm $\Vert \nabla_{\hat{x}} D(\hat{x}) \Vert_2$ close to $1$. 40 | 41 | In this implementation we set $\epsilon = 1$. 42 | 43 | Here is the [code for an experiment](experiment.html) that uses gradient penalty. 44 | """ 45 | 46 | import torch 47 | import torch.autograd 48 | 49 | from labml_helpers.module import Module 50 | 51 | 52 | class GradientPenalty(Module): 53 | """ 54 | ## Gradient Penalty 55 | """ 56 | 57 | def __call__(self, x: torch.Tensor, f: torch.Tensor): 58 | """ 59 | * `x` is $x \sim \mathbb{P}_r$ 60 | * `f` is $D(x)$ 61 | 62 | $\hat{x} \leftarrow x$ 63 | since we set $\epsilon = 1$ for this implementation. 64 | """ 65 | 66 | # Get batch size 67 | batch_size = x.shape[0] 68 | 69 | # Calculate gradients of $D(x)$ with respect to $x$. 70 | # `grad_outputs` is set to ones since we want the gradients of $D(x)$, 71 | # and we need to create and retain graph since we have to compute gradients 72 | # with respect to weight on this loss. 73 | gradients, *_ = torch.autograd.grad(outputs=f, 74 | inputs=x, 75 | grad_outputs=f.new_ones(f.shape), 76 | create_graph=True) 77 | 78 | # Reshape gradients to calculate the norm 79 | gradients = gradients.reshape(batch_size, -1) 80 | # Calculate the norm $\Vert \nabla_{\hat{x}} D(\hat{x}) \Vert_2$ 81 | norm = gradients.norm(2, dim=-1) 82 | # Return the loss $\big(\Vert \nabla_{\hat{x}} D(\hat{x}) \Vert_2 - 1\big)^2$ 83 | return torch.mean((norm - 1) ** 2) 84 | -------------------------------------------------------------------------------- /labml_nn/transformers/mlm/readme.md: -------------------------------------------------------------------------------- 1 | # [Masked Language Model (MLM)](https://nn.labml.ai/transformers/mlm/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation of Masked Language Model (MLM) 4 | used to pre-train the BERT model introduced in the paper 5 | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805). 6 | 7 | ## BERT Pretraining 8 | 9 | BERT model is a transformer model. 10 | The paper pre-trains the model using MLM and with next sentence prediction. 11 | We have only implemented MLM here. 
12 | 13 | ### Next sentence prediction 14 | 15 | In *next sentence prediction*, the model is given two sentences `A` and `B` and the model 16 | makes a binary prediction whether `B` is the sentence that follows `A` in the actual text. 17 | The model is fed with actual sentence pairs 50% of the time and random pairs 50% of the time. 18 | This classification is done while applying MLM. *We haven't implemented this here.* 19 | 20 | ## Masked LM 21 | 22 | This masks a percentage of tokens at random and trains the model to predict 23 | the masked tokens. 24 | They **mask 15% of the tokens** by replacing them with a special `[MASK]` token. 25 | 26 | The loss is computed on predicting the masked tokens only. 27 | This causes a problem during fine-tuning and actual usage since there are no `[MASK]` tokens 28 | at that time. 29 | Therefore we might not get any meaningful representations. 30 | 31 | To overcome this **10% of the masked tokens are replaced with the original token**, 32 | and another **10% of the masked tokens are replaced with a random token**. 33 | This trains the model to give representations about the actual token whether or not the 34 | input token at that position is a `[MASK]`. 35 | And replacing with a random token causes it to 36 | give a representation that has information from the context as well; 37 | because it has to use the context to fix randomly replaced tokens. 38 | 39 | ## Training 40 | 41 | MLMs are harder to train than autoregressive models because they have a smaller training signal. 42 | i.e. only a small percentage of predictions are trained per sample. 43 | 44 | Another problem is since the model is bidirectional, any token can see any other token. 45 | This makes the "credit assignment" harder. 46 | Let's say you have the character level model trying to predict `home *s where i want to be`. 47 | At least during the early stages of the training, it'll be super hard to figure out why the 48 | replacement for `*` should be `i`, it could be anything from the whole sentence. 49 | Whilst, in an autoregressive setting the model will only have to use `h` to predict `o` and 50 | `hom` to predict `e` and so on. So the model will initially start predicting with a shorter context first 51 | and then learn to use longer contexts later. 52 | Since MLMs have this problem it's a lot faster to train if you start with a smaller sequence length 53 | initially and then use a longer sequence length later. 54 | 55 | Here is [the training code](https://nn.labml.ai/transformers/mlm/experiment.html) for a simple MLM model. 
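As a minimal sketch of the masking scheme described above (the function name, the `-100` ignore-index convention, and the shapes are illustrative assumptions, not this repository's exact code):

```python
import torch


def mask_tokens(tokens: torch.Tensor, mask_token: int, n_vocab: int,
                masking_prob: float = 0.15):
    """`tokens` is a long tensor of token ids; returns masked inputs and labels."""
    tokens = tokens.clone()
    # Select ~15% of the positions for prediction
    is_selected = torch.rand(tokens.shape) < masking_prob
    # Only selected positions contribute to the loss; mark the rest with -100
    labels = torch.where(is_selected, tokens, torch.full_like(tokens, -100))
    r = torch.rand(tokens.shape)
    # 80% of the selected tokens become `[MASK]`
    tokens[is_selected & (r < 0.8)] = mask_token
    # 10% become a random token; the remaining 10% keep the original token
    replace_random = is_selected & (r >= 0.8) & (r < 0.9)
    random_tokens = torch.randint_like(tokens, n_vocab)
    tokens[replace_random] = random_tokens[replace_random]
    return tokens, labels
```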
56 | 57 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/3a6d22b6c67111ebb03d6764d13a38d1) 58 | -------------------------------------------------------------------------------- /labml_nn/cnn/utils/dataloader.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | 3 | import torch 4 | import torchvision 5 | import torchvision.transforms as transforms 6 | from torch.utils.data import Dataset, random_split 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | 10 | def LoadCifar10DatasetTrain(save, transform=None): 11 | trainset = torchvision.datasets.CIFAR10(root=save, train=True, 12 | download=True, transform=transform) 13 | return trainset 14 | 15 | def LoadCifar10DatasetTest(save, transform): 16 | return torchvision.datasets.CIFAR10(root=save, train=False, 17 | download=False, transform=transform) 18 | 19 | def GetCustTransform(): 20 | transform_train = transforms.Compose([ 21 | transforms.RandomRotation(20), 22 | transforms.RandomCrop(32, (2, 2), pad_if_needed=False, padding_mode='constant'), 23 | transforms.ToTensor(), 24 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 25 | return transform_train 26 | 27 | def Dataloader_train_valid(save, batch_size): 28 | 29 | # See utils/dataloader.py for data augmentations 30 | transform_train_valid = GetCustTransform() 31 | 32 | # Get Cifar 10 Datasets 33 | trainset = LoadCifar10DatasetTrain(save, transform_train_valid) 34 | train_val_abs = int(len(trainset) * 0.8) 35 | train_subset, val_subset = random_split(trainset, [train_val_abs, len(trainset) - train_val_abs]) 36 | 37 | # Get Cifar 10 Dataloaders 38 | trainloader = torch.utils.data.DataLoader(train_subset, batch_size=batch_size, 39 | shuffle=True, num_workers=4) 40 | 41 | valloader = torch.utils.data.DataLoader(val_subset, batch_size=batch_size, 42 | shuffle=True, num_workers=4) 43 | return trainloader, valloader 44 | 45 | def Dataloader_train(save, batch_size): 46 | 47 | # See utils/dataloader.py for data augmentations 48 | transform_train = GetCustTransform() 49 | 50 | # Get Cifar 10 Datasets 51 | trainset = LoadCifar10DatasetTrain(save, transform_train) 52 | # Get Cifar 10 Dataloaders 53 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, 54 | shuffle=True, num_workers=4) 55 | 56 | return trainloader 57 | 58 | def Dataloader_test(save, batch_size): 59 | 60 | # transformation test set 61 | transform_test = transforms.Compose( 62 | [transforms.ToTensor(), 63 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 64 | 65 | # initialize test dataset and dataloader 66 | testset = LoadCifar10DatasetTest(save, transform_test) 67 | testloader = torch.utils.data.DataLoader(testset, batch_size=64, 68 | shuffle=False, num_workers=4) 69 | 70 | return testloader 71 | 72 | def imshow(im): 73 | image = im.cpu().clone().detach().numpy() 74 | image = image.transpose(1, 2, 0) 75 | image = image * np.array((0.5, 0.5, 0.5)) + np.array((0.5, 0.5, 0.5)) # unnormalize 76 | plt.imshow(image) 77 | plt.show() 78 | 79 | def imretrun(im): 80 | image = im.cpu().clone().detach().numpy() 81 | image = image.transpose(1, 2, 0) 82 | image = image * np.array((0.5, 0.5, 0.5)) + np.array((0.5, 0.5, 0.5)) # unnormalize 83 | return image -------------------------------------------------------------------------------- /labml_nn/hypernetworks/experiment.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 
from labml import experiment 4 | from labml.configs import option 5 | from labml.utils.pytorch import get_modules 6 | from labml_helpers.module import Module 7 | 8 | from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs 9 | from labml_nn.hypernetworks.hyper_lstm import HyperLSTM 10 | from labml_nn.lstm import LSTM 11 | 12 | 13 | class AutoregressiveModel(Module): 14 | """ 15 | ## Auto regressive model 16 | """ 17 | 18 | def __init__(self, n_vocab: int, d_model: int, rnn_model: Module): 19 | super().__init__() 20 | # Token embedding module 21 | self.src_embed = nn.Embedding(n_vocab, d_model) 22 | self.lstm = rnn_model 23 | self.generator = nn.Linear(d_model, n_vocab) 24 | 25 | def __call__(self, x: torch.Tensor): 26 | x = self.src_embed(x) 27 | # Embed the tokens (`src`) and run it through the the transformer 28 | res, state = self.lstm(x) 29 | # Generate logits of the next token 30 | return self.generator(res), state 31 | 32 | 33 | class Configs(NLPAutoRegressionConfigs): 34 | """ 35 | ## Configurations 36 | 37 | The default configs can and will be over-ridden when we start the experiment 38 | """ 39 | 40 | model: AutoregressiveModel 41 | rnn_model: Module 42 | 43 | d_model: int = 512 44 | n_rhn: int = 16 45 | n_z: int = 16 46 | 47 | 48 | @option(Configs.model) 49 | def autoregressive_model(c: Configs): 50 | """ 51 | Initialize the auto-regressive model 52 | """ 53 | m = AutoregressiveModel(c.n_tokens, c.d_model, c.rnn_model) 54 | return m.to(c.device) 55 | 56 | 57 | @option(Configs.rnn_model) 58 | def hyper_lstm(c: Configs): 59 | return HyperLSTM(c.d_model, c.d_model, c.n_rhn, c.n_z, 1) 60 | 61 | 62 | @option(Configs.rnn_model) 63 | def lstm(c: Configs): 64 | return LSTM(c.d_model, c.d_model, 1) 65 | 66 | 67 | def main(): 68 | # Create experiment 69 | experiment.create(name="hyper_lstm", comment='') 70 | # Create configs 71 | conf = Configs() 72 | # Load configurations 73 | experiment.configs(conf, 74 | # A dictionary of configurations to override 75 | {'tokenizer': 'character', 76 | 'text': 'tiny_shakespeare', 77 | 'optimizer.learning_rate': 2.5e-4, 78 | 'optimizer.optimizer': 'Adam', 79 | 'prompt': 'It is', 80 | 'prompt_separator': '', 81 | 82 | 'rnn_model': 'hyper_lstm', 83 | 84 | 'train_loader': 'shuffled_train_loader', 85 | 'valid_loader': 'shuffled_valid_loader', 86 | 87 | 'seq_len': 512, 88 | 'epochs': 128, 89 | 'batch_size': 2, 90 | 'inner_iterations': 25}) 91 | 92 | # Set models for saving and loading 93 | experiment.add_pytorch_models(get_modules(conf)) 94 | 95 | # Start the experiment 96 | with experiment.start(): 97 | # `TrainValidConfigs.run` 98 | conf.run() 99 | 100 | 101 | if __name__ == '__main__': 102 | main() 103 | -------------------------------------------------------------------------------- /labml_nn/rl/ppo/gae.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Generalized Advantage Estimation (GAE) 4 | summary: A PyTorch implementation/tutorial of Generalized Advantage Estimation (GAE). 5 | --- 6 | 7 | # Generalized Advantage Estimation (GAE) 8 | 9 | This is a [PyTorch](https://pytorch.org) implementation of paper 10 | [Generalized Advantage Estimation](https://arxiv.org/abs/1506.02438). 11 | 12 | You can find an experiment that uses it [here](experiment.html). 
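A minimal usage sketch (the array shapes are assumptions based on the constructor and `__call__` below; in particular, `values` is assumed to carry one extra column holding the bootstrap value of the state after the last step):

```python
import numpy as np

n_workers, worker_steps = 8, 128
gae = GAE(n_workers, worker_steps, gamma=0.99, lambda_=0.95)

done = np.zeros((n_workers, worker_steps), dtype=bool)
rewards = np.random.randn(n_workers, worker_steps).astype(np.float32)
values = np.random.randn(n_workers, worker_steps + 1).astype(np.float32)

advantages = gae(done, rewards, values)  # shape: (n_workers, worker_steps)
```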
13 | """ 14 | 15 | import numpy as np 16 | 17 | 18 | class GAE: 19 | def __init__(self, n_workers: int, worker_steps: int, gamma: float, lambda_: float): 20 | self.lambda_ = lambda_ 21 | self.gamma = gamma 22 | self.worker_steps = worker_steps 23 | self.n_workers = n_workers 24 | 25 | def __call__(self, done: np.ndarray, rewards: np.ndarray, values: np.ndarray) -> np.ndarray: 26 | """ 27 | ### Calculate advantages 28 | \begin{align} 29 | \hat{A_t^{(1)}} &= r_t + \gamma V(s_{t+1}) - V(s_t) 30 | \\ 31 | \hat{A_t^{(2)}} &= r_t + \gamma r_{t+1} +\gamma^2 V(s_{t+2}) - V(s_t) 32 | \\ 33 | ... 34 | \\ 35 | \hat{A_t^{(\infty)}} &= r_t + \gamma r_{t+1} +\gamma^2 r_{t+2} + ... - V(s_t) 36 | \end{align} 37 | 38 | $\hat{A_t^{(1)}}$ has high bias and low variance, whilst 39 | $\hat{A_t^{(\infty)}}$ is unbiased but has high variance. 40 | 41 | We take a weighted average of $\hat{A_t^{(k)}}$ to balance bias and variance. 42 | This is called Generalized Advantage Estimation. 43 | $$\hat{A_t} = \hat{A_t^{GAE}} = \sum_k w_k \hat{A_t^{(k)}}$$ 44 | We set $w_k = \lambda^{k-1}$, which gives a clean calculation for 45 | $\hat{A_t}$: 46 | 47 | \begin{align} 48 | \delta_t &= r_t + \gamma V(s_{t+1}) - V(s_t) 49 | \\ 50 | \hat{A_t} &= \delta_t + \gamma \lambda \delta_{t+1} + ... + 51 | (\gamma \lambda)^{T - t - 1} \delta_{T - 1} 52 | \\ 53 | &= \delta_t + \gamma \lambda \hat{A_{t+1}} 54 | \end{align} 55 | """ 56 | 57 | # advantages table 58 | advantages = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32) 59 | last_advantage = 0 60 | 61 | # $V(s_{t+1})$ 62 | last_value = values[:, -1] 63 | 64 | for t in reversed(range(self.worker_steps)): 65 | # mask if episode completed after step $t$ 66 | mask = 1.0 - done[:, t] 67 | last_value = last_value * mask 68 | last_advantage = last_advantage * mask 69 | # $\delta_t$ 70 | delta = rewards[:, t] + self.gamma * last_value - values[:, t] 71 | 72 | # $\hat{A_t} = \delta_t + \gamma \lambda \hat{A_{t+1}}$ 73 | last_advantage = delta + self.gamma * self.lambda_ * last_advantage 74 | 75 | # note that we are collecting in reverse order. 76 | # *My initial code was appending to a list and 77 | # I forgot to reverse it later. 78 | # It took me around 4 to 5 hours to find the bug. 79 | # The performance of the model was improving 80 | # slightly during initial runs, 81 | # probably because the samples are similar.* 82 | advantages[:, t] = last_advantage 83 | 84 | last_value = values[:, t] 85 | 86 | return advantages 87 | -------------------------------------------------------------------------------- /labml_nn/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Transformers 4 | summary: > 5 | This is a collection of PyTorch implementations/tutorials of 6 | transformers and related techniques. 7 | --- 8 | 9 | # Transformers 10 | 11 | This module contains [PyTorch](https://pytorch.org/) 12 | implementations and explanations of the original transformer 13 | from the paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762), 14 | and derivatives and enhancements of it. 
15 | 16 | * [Multi-head attention](mha.html) 17 | * [Transformer Encoder and Decoder Models](models.html) 18 | * [Fixed positional encoding](positional_encoding.html) 19 | 20 | ## [Transformer XL](xl/index.html) 21 | This implements Transformer XL model using 22 | [relative multi-head attention](xl/relative_mha.html) 23 | 24 | ## [Compressive Transformer](compressive/index.html) 25 | 26 | This is an implementation of compressive transformer 27 | that extends upon [Transformer XL](xl/index.html) by compressing 28 | oldest memories to give a longer attention span. 29 | 30 | ## [GPT Architecture](gpt/index.html) 31 | 32 | This is an implementation of GPT-2 architecture. 33 | 34 | ## [GLU Variants](glu_variants/simple.html) 35 | 36 | This is an implementation of the paper 37 | [GLU Variants Improve Transformer](https://arxiv.org/abs/2002.05202). 38 | 39 | ## [kNN-LM](knn/index.html) 40 | 41 | This is an implementation of the paper 42 | [Generalization through Memorization: Nearest Neighbor Language Models](https://arxiv.org/abs/1911.00172). 43 | 44 | ## [Feedback Transformer](feedback/index.html) 45 | 46 | This is an implementation of the paper 47 | [Accessing Higher-level Representations in Sequential Transformers with Feedback Memory](https://arxiv.org/abs/2002.09402). 48 | 49 | ## [Switch Transformer](switch/index.html) 50 | 51 | This is a miniature implementation of the paper 52 | [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961). 53 | Our implementation only has a few million parameters and doesn't do model parallel distributed training. 54 | It does single GPU training but we implement the concept of switching as described in the paper. 55 | 56 | ## [Fast Weights Transformer](fast_weights/index.html) 57 | 58 | This is an implementation of the paper 59 | [Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch](https://arxiv.org/abs/2102.11174). 60 | 61 | ## [FNet: Mixing Tokens with Fourier Transforms](fnet/index.html) 62 | 63 | This is an implementation of the paper 64 | [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824). 65 | 66 | ## [Attention Free Transformer](aft/index.html) 67 | 68 | This is an implementation of the paper 69 | [An Attention Free Transformer](https://papers.labml.ai/paper/2105.14103). 70 | 71 | ## [Masked Language Model](mlm/index.html) 72 | 73 | This is an implementation of Masked Language Model used for pre-training in paper 74 | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805). 75 | 76 | ## [Pay Attention to MLPs (gMLP)](gmlp/index.html) 77 | 78 | This is an implementation of the paper 79 | [Pay Attention to MLPs](https://papers.labml.ai/paper/2105.08050). 80 | """ 81 | 82 | from .configs import TransformerConfigs 83 | from .models import TransformerLayer, Encoder, Decoder, Generator, EncoderDecoder 84 | from .mha import MultiHeadAttention 85 | from labml_nn.transformers.xl.relative_mha import RelativeMultiHeadAttention 86 | -------------------------------------------------------------------------------- /labml_nn/optimizers/noam.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Noam optimizer from Attention is All You Need paper 4 | summary: > 5 | This is a tutorial/implementation of Noam optimizer. 6 | Noam optimizer has a warm-up period and then an exponentially decaying learning rate. 
7 | --- 8 | 9 | # Noam Optimizer 10 | 11 | This is the [PyTorch](https://pytorch.org) implementation of optimizer introduced in the paper 12 | [Attention Is All You Need](https://arxiv.org/abs/1706.03762). 13 | """ 14 | from typing import Dict 15 | 16 | from labml_nn.optimizers import WeightDecay 17 | from labml_nn.optimizers.amsgrad import AMSGrad 18 | 19 | 20 | class Noam(AMSGrad): 21 | """ 22 | ## Noam Optimizer 23 | 24 | This class extends from Adam optimizer defined in [`adam.py`](adam.html). 25 | """ 26 | 27 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, 28 | weight_decay: WeightDecay = WeightDecay(), 29 | optimized_update: bool = True, 30 | amsgrad=False, 31 | warmup=0, d_model=512, defaults=None): 32 | """ 33 | ### Initialize the optimizer 34 | 35 | * `params` is the list of parameters 36 | * `lr` is the learning rate $\alpha$ 37 | * `betas` is a tuple of ($\beta_1$, $\beta_2$) 38 | * `eps` is $\hat{\epsilon}$ or $\epsilon$ based on `optimized_update` 39 | * `weight_decay` is an instance of class `WeightDecay` defined in [`__init__.py`](index.html) 40 | * 'optimized_update' is a flag whether to optimize the bias correction of the second moment 41 | by doing it after adding $\epsilon$ 42 | * `amsgrad` is a flag indicating whether to use AMSGrad or fallback to plain Adam 43 | * `warmup` number of warmup steps 44 | * `d_model` model size; i.e. number of dimensions in the transformer 45 | * `defaults` is a dictionary of default for group values. 46 | This is useful when you want to extend the class `AdamWarmup`. 47 | """ 48 | 49 | defaults = {} if defaults is None else defaults 50 | defaults.update(dict(warmup=warmup)) 51 | super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults) 52 | self.d_model = d_model 53 | 54 | def get_lr(self, state: Dict[str, any], group: Dict[str, any]): 55 | """ 56 | ### Get learning-rate 57 | 58 | $$\alpha \frac{1}{\sqrt{d_{model}}} \min \bigg(\frac{1}{\sqrt{t}}, \frac{t}{w^{3/2}}\bigg)$$ 59 | where $w$ is the number of warmup steps. 60 | """ 61 | # $$\min \bigg(\frac{1}{\sqrt{t}}, \frac{t}{w^{3/2}}\bigg)$$ 62 | factor = min(state['step'] ** (-0.5), state['step'] * group['warmup'] ** (-1.5)) 63 | # $$\alpha \frac{1}{\sqrt{d_{model}}} \min \bigg(\frac{1}{\sqrt{t}}, \frac{t}{w^{3/2}}\bigg)$$ 64 | return group['lr'] * self.d_model ** (-0.5) * factor 65 | 66 | 67 | def _test_noam_lr(): 68 | """ 69 | ### Plot learning rate for different warmups and model sizes 70 | 71 | ![Plot of learning rate](noam_lr.png) 72 | """ 73 | import matplotlib.pyplot as plt 74 | import numpy as np 75 | from torch import nn 76 | 77 | model = nn.Linear(10, 10) 78 | opts = [Noam(model.parameters(), d_model=512, warmup=4000, lr=1), 79 | Noam(model.parameters(), d_model=512, warmup=8000, lr=1), 80 | Noam(model.parameters(), d_model=2048, warmup=2000, lr=1)] 81 | plt.plot(np.arange(1, 20000), [[opt.get_lr({'step': i}, opt.defaults) for opt in opts] for i in range(1, 20000)]) 82 | plt.legend(["512:4000", "512:8000", "2048:2000"]) 83 | plt.title("Learning Rate") 84 | plt.show() 85 | 86 | 87 | if __name__ == '__main__': 88 | _test_noam_lr() 89 | -------------------------------------------------------------------------------- /labml_nn/rl/dqn/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Deep Q Network (DQN) Model 4 | summary: Implementation of neural network model for Deep Q Network (DQN). 
5 | --- 6 | 7 | # Deep Q Network (DQN) Model 8 | """ 9 | 10 | import torch 11 | from torch import nn 12 | 13 | from labml_helpers.module import Module 14 | 15 | 16 | class Model(Module): 17 | """ 18 | ## Dueling Network ⚔️ Model for $Q$ Values 19 | 20 | We are using a [dueling network](https://arxiv.org/abs/1511.06581) 21 | to calculate Q-values. 22 | Intuition behind dueling network architecture is that in most states 23 | the action doesn't matter, 24 | and in some states the action is significant. Dueling network allows 25 | this to be represented very well. 26 | 27 | \begin{align} 28 | Q^\pi(s,a) &= V^\pi(s) + A^\pi(s, a) 29 | \\ 30 | \mathop{\mathbb{E}}_{a \sim \pi(s)} 31 | \Big[ 32 | A^\pi(s, a) 33 | \Big] 34 | &= 0 35 | \end{align} 36 | 37 | So we create two networks for $V$ and $A$ and get $Q$ from them. 38 | $$ 39 | Q(s, a) = V(s) + 40 | \Big( 41 | A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a' \in \mathcal{A}} A(s, a') 42 | \Big) 43 | $$ 44 | We share the initial layers of the $V$ and $A$ networks. 45 | """ 46 | 47 | def __init__(self): 48 | super().__init__() 49 | self.conv = nn.Sequential( 50 | # The first convolution layer takes a 51 | # $84\times84$ frame and produces a $20\times20$ frame 52 | nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4), 53 | nn.ReLU(), 54 | 55 | # The second convolution layer takes a 56 | # $20\times20$ frame and produces a $9\times9$ frame 57 | nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2), 58 | nn.ReLU(), 59 | 60 | # The third convolution layer takes a 61 | # $9\times9$ frame and produces a $7\times7$ frame 62 | nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1), 63 | nn.ReLU(), 64 | ) 65 | 66 | # A fully connected layer takes the flattened 67 | # frame from third convolution layer, and outputs 68 | # $512$ features 69 | self.lin = nn.Linear(in_features=7 * 7 * 64, out_features=512) 70 | self.activation = nn.ReLU() 71 | 72 | # This head gives the state value $V$ 73 | self.state_value = nn.Sequential( 74 | nn.Linear(in_features=512, out_features=256), 75 | nn.ReLU(), 76 | nn.Linear(in_features=256, out_features=1), 77 | ) 78 | # This head gives the action value $A$ 79 | self.action_value = nn.Sequential( 80 | nn.Linear(in_features=512, out_features=256), 81 | nn.ReLU(), 82 | nn.Linear(in_features=256, out_features=4), 83 | ) 84 | 85 | def __call__(self, obs: torch.Tensor): 86 | # Convolution 87 | h = self.conv(obs) 88 | # Reshape for linear layers 89 | h = h.reshape((-1, 7 * 7 * 64)) 90 | 91 | # Linear layer 92 | h = self.activation(self.lin(h)) 93 | 94 | # $A$ 95 | action_value = self.action_value(h) 96 | # $V$ 97 | state_value = self.state_value(h) 98 | 99 | # $A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a' \in \mathcal{A}} A(s, a')$ 100 | action_score_centered = action_value - action_value.mean(dim=-1, keepdim=True) 101 | # $Q(s, a) =V(s) + \Big(A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a' \in \mathcal{A}} A(s, a')\Big)$ 102 | q = state_value + action_score_centered 103 | 104 | return q 105 | -------------------------------------------------------------------------------- /labml_nn/transformers/fnet/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: FNet - Mixing Tokens with Fourier Transforms 4 | summary: > 5 | This is an annotated implementation/tutorial the FNet in PyTorch. 
6 | --- 7 | 8 | # FNet: Mixing Tokens with Fourier Transforms 9 | 10 | This is a [PyTorch](https://pytorch.org) implementation of the paper 11 | [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824). 12 | 13 | This paper replaces the [self-attention layer](../mha.html) with two 14 | [Fourier transforms](https://en.wikipedia.org/wiki/Discrete_Fourier_transform) to 15 | *mix* tokens. 16 | This is a $7 \times$ more efficient than self-attention. 17 | The accuracy loss of using this over self-attention is about 92% for 18 | [BERT](https://paperswithcode.com/method/bert) on 19 | [GLUE benchmark](https://paperswithcode.com/dataset/glue). 20 | 21 | ## Mixing tokens with two Fourier transforms 22 | 23 | We apply Fourier transform along the hidden dimension (embedding dimension) 24 | and then along the sequence dimension. 25 | 26 | $$ 27 | \mathcal{R}\big(\mathcal{F}_\text{seq} \big(\mathcal{F}_\text{hidden} (x) \big) \big) 28 | $$ 29 | 30 | where $x$ is the embedding input, $\mathcal{F}$ stands for the fourier transform and 31 | $\mathcal{R}$ stands for the real component in complex numbers. 32 | 33 | This is very simple to implement on PyTorch - just 1 line of code. 34 | The paper suggests using a precomputed DFT matrix and doing matrix multiplication to get the 35 | Fourier transformation. 36 | 37 | Here is [the training code](experiment.html) for using a FNet based model for classifying 38 | [AG News](https://paperswithcode.com/dataset/ag-news). 39 | """ 40 | 41 | from typing import Optional 42 | 43 | import torch 44 | from torch import nn 45 | 46 | 47 | class FNetMix(nn.Module): 48 | """ 49 | ## FNet - Mix tokens 50 | 51 | This module simply implements 52 | $$ 53 | \mathcal{R}\big(\mathcal{F}_\text{seq} \big(\mathcal{F}_\text{hidden} (x) \big) \big) 54 | $$ 55 | 56 | The structure of this module is made similar to a [standard attention module](../mha.html) so that we can simply 57 | replace it. 58 | """ 59 | 60 | def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: Optional[torch.Tensor] = None): 61 | """ 62 | The [normal attention module](../mha.html) can be fed with different token embeddings for 63 | $\text{query}$,$\text{key}$, and $\text{value}$ and a mask. 64 | 65 | We follow the same function signature so that we can replace it directly. 66 | 67 | For FNet mixing, $$x = \text{query} = \text{key} = \text{value}$$ and masking is not possible. 68 | Shape of `query` (and `key` and `value`) is `[seq_len, batch_size, d_model]`. 69 | """ 70 | 71 | # $\text{query}$,$\text{key}$, and $\text{value}$ all should be equal to $x$ for token mixing 72 | assert query is key and key is value 73 | # Token mixing doesn't support masking. i.e. all tokens will see all other token embeddings. 74 | assert mask is None 75 | 76 | # Assign to `x` for clarity 77 | x = query 78 | 79 | # Apply the Fourier transform along the hidden (embedding) dimension 80 | # $$\mathcal{F}_\text{hidden} (x)$$ 81 | # 82 | # The output of the Fourier transform is a tensor of 83 | # [complex numbers](https://pytorch.org/docs/stable/complex_numbers.html). 
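        # (Aside: under the stated assumption that `x` is `[seq_len, batch_size, d_model]`,
        # the two transforms below could equivalently be fused into a single call,
        # `torch.fft.fft2(x, dim=(0, 2)).real`; this is only a note, the code is unchanged.)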
84 | fft_hidden = torch.fft.fft(x, dim=2) 85 | # Apply the Fourier transform along the sequence dimension 86 | # $$\mathcal{F}_\text{seq} \big(\mathcal{F}_\text{hidden} (x) \big)$$ 87 | fft_seq = torch.fft.fft(fft_hidden, dim=0) 88 | 89 | # Get the real component 90 | # $$\mathcal{R}\big(\mathcal{F}_\text{seq} \big(\mathcal{F}_\text{hidden} (x) \big) \big)$$ 91 | return torch.real(fft_seq) 92 | -------------------------------------------------------------------------------- /labml_nn/experiments/mnist.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: MNIST Experiment 4 | summary: > 5 | This is a reusable trainer for MNIST dataset 6 | --- 7 | 8 | # MNIST Experiment 9 | """ 10 | 11 | import torch.nn as nn 12 | import torch.utils.data 13 | from labml_helpers.module import Module 14 | 15 | from labml import tracker 16 | from labml.configs import option 17 | from labml_helpers.datasets.mnist import MNISTConfigs as MNISTDatasetConfigs 18 | from labml_helpers.device import DeviceConfigs 19 | from labml_helpers.metrics.accuracy import Accuracy 20 | from labml_helpers.train_valid import TrainValidConfigs, BatchIndex, hook_model_outputs 21 | from labml_nn.optimizers.configs import OptimizerConfigs 22 | 23 | 24 | class MNISTConfigs(MNISTDatasetConfigs, TrainValidConfigs): 25 | """ 26 | 27 | ## Trainer configurations 28 | 29 | """ 30 | 31 | # Optimizer 32 | optimizer: torch.optim.Adam 33 | # Training device 34 | device: torch.device = DeviceConfigs() 35 | 36 | # Classification model 37 | model: Module 38 | # Number of epochs to train for 39 | epochs: int = 10 40 | 41 | # Number of times to switch between training and validation within an epoch 42 | inner_iterations = 10 43 | 44 | # Accuracy function 45 | accuracy = Accuracy() 46 | # Loss function 47 | loss_func = nn.CrossEntropyLoss() 48 | 49 | def init(self): 50 | """ 51 | ### Initialization 52 | """ 53 | # Set tracker configurations 54 | tracker.set_scalar("loss.*", True) 55 | tracker.set_scalar("accuracy.*", True) 56 | # Add a hook to log module outputs 57 | hook_model_outputs(self.mode, self.model, 'model') 58 | # Add accuracy as a state module. 59 | # The name is probably confusing, since it's meant to store 60 | # states between training and validation for RNNs. 61 | # This will keep the accuracy metric stats separate for training and validation. 62 | self.state_modules = [self.accuracy] 63 | 64 | def step(self, batch: any, batch_idx: BatchIndex): 65 | """ 66 | ### Training or validation step 67 | """ 68 | 69 | # Move data to the device 70 | data, target = batch[0].to(self.device), batch[1].to(self.device) 71 | 72 | # Update global step (number of samples processed) when in training mode 73 | if self.mode.is_train: 74 | tracker.add_global_step(len(data)) 75 | 76 | # Whether to capture model outputs 77 | with self.mode.update(is_log_activations=batch_idx.is_last): 78 | # Get model outputs. 
79 | output = self.model(data) 80 | 81 | # Calculate and log loss 82 | loss = self.loss_func(output, target) 83 | tracker.add("loss.", loss) 84 | 85 | # Calculate and log accuracy 86 | self.accuracy(output, target) 87 | self.accuracy.track() 88 | 89 | # Train the model 90 | if self.mode.is_train: 91 | # Calculate gradients 92 | loss.backward() 93 | # Take optimizer step 94 | self.optimizer.step() 95 | # Log the model parameters and gradients on last batch of every epoch 96 | if batch_idx.is_last: 97 | tracker.add('model', self.model) 98 | # Clear the gradients 99 | self.optimizer.zero_grad() 100 | 101 | # Save the tracked metrics 102 | tracker.save() 103 | 104 | 105 | @option(MNISTConfigs.optimizer) 106 | def _optimizer(c: MNISTConfigs): 107 | """ 108 | ### Default optimizer configurations 109 | """ 110 | opt_conf = OptimizerConfigs() 111 | opt_conf.parameters = c.model.parameters() 112 | opt_conf.optimizer = 'Adam' 113 | return opt_conf 114 | -------------------------------------------------------------------------------- /labml_nn/transformers/gmlp/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Pay Attention to MLPs (gMLP) Experiment 4 | summary: This experiment trains a gMLP based model on Tiny Shakespeare dataset. 5 | --- 6 | 7 | # [Pay Attention to MLPs (gMLP)](index.html) Experiment 8 | 9 | This is an annotated PyTorch experiment to train a [gMLP model](index.html). 10 | The paper also applies a Stochastic Depth regularization where some layers are removed randomly during training. 11 | We have not implemented that here. 12 | 13 | This is based on 14 | [training loop and configurations for a simple transformer auto-regressive NLP task](../basic/autoregressive_experiment.html). 15 | 16 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/01bd941ac74c11eb890c1d9196651a4a) 17 | """ 18 | from labml import experiment 19 | from labml.configs import option 20 | from labml_nn.transformers import TransformerConfigs 21 | from labml_nn.transformers.basic.autoregressive_experiment import Configs as BasicAutoRegressionConfigs 22 | from labml_nn.transformers.gmlp import GMLPBlock 23 | 24 | 25 | class Configs(BasicAutoRegressionConfigs): 26 | """ 27 | ## Configurations 28 | 29 | This inherits from 30 | [training loop and configurations for a simple transformer auto-regressive NLP task](../basic/autoregressive_transformer.html). 
31 | """ 32 | 33 | # Transformer 34 | transformer: TransformerConfigs = 'gMLP' 35 | # gMLP Block 36 | gmlp: GMLPBlock 37 | # `d_ffn` for gMLP projection layer 38 | d_ffn: int = 2048 39 | 40 | 41 | @option(Configs.gmlp, 'gMLP') 42 | def _gmlp_configs(c: Configs): 43 | """ 44 | ### Create a gMLP block 45 | """ 46 | return GMLPBlock(c.d_model, c.d_ffn, c.seq_len) 47 | 48 | 49 | @option(Configs.transformer, 'gMLP') 50 | def _transformer_configs(c: Configs): 51 | """ 52 | ### Transformer configurations 53 | """ 54 | 55 | # We use our 56 | # [configurable transformer implementation](../configs.html#TransformerConfigs) 57 | conf = TransformerConfigs() 58 | # Set the vocabulary sizes for embeddings and generating logits 59 | conf.n_src_vocab = c.n_tokens 60 | conf.n_tgt_vocab = c.n_tokens 61 | # Set model size 62 | conf.d_model = c.d_model 63 | # Replace the encoder layer with a gMLP layer 64 | conf.encoder_layer = c.gmlp 65 | 66 | return conf 67 | 68 | 69 | def main(): 70 | # Create experiment 71 | experiment.create(name="gMLP") 72 | # Create configs 73 | conf = Configs() 74 | # Override configurations 75 | experiment.configs(conf, { 76 | # Use character level tokenizer 77 | 'tokenizer': 'character', 78 | # Prompt separator is blank 79 | 'prompt_separator': '', 80 | # Starting prompt for sampling 81 | 'prompt': 'It is ', 82 | # Use Tiny Shakespeare dataset 83 | 'text': 'tiny_shakespeare', 84 | 85 | # Use a context size of $256$ 86 | 'seq_len': 256, 87 | # Train for $128$ epochs 88 | 'epochs': 128, 89 | # Batch size $32$ 90 | 'batch_size': 32, 91 | # Switch between training and validation for $10$ times 92 | # per epoch 93 | 'inner_iterations': 10, 94 | 95 | # Model size 96 | 'd_model': 512, 97 | 'd_ffn': 2048, 98 | 99 | # Use [Noam optimizer](../../optimizers/noam.html) 100 | 'optimizer.optimizer': 'Noam', 101 | 'optimizer.learning_rate': 1., 102 | }) 103 | 104 | # Set models for saving and loading 105 | experiment.add_pytorch_models({'model': conf.model}) 106 | 107 | # Start the experiment 108 | with experiment.start(): 109 | # Run training 110 | conf.run() 111 | 112 | 113 | # 114 | if __name__ == '__main__': 115 | main() 116 | -------------------------------------------------------------------------------- /labml_nn/transformers/feed_forward.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Position-wise Feed-Forward Network (FFN) 4 | summary: Documented reusable implementation of the position wise feedforward network. 5 | --- 6 | 7 | # Position-wise Feed-Forward Network (FFN) 8 | 9 | This is a [PyTorch](https://pytorch.org) implementation 10 | of position-wise feedforward network used in transformer. 11 | 12 | FFN consists of two fully connected layers. 13 | Number of dimensions in the hidden layer $d_{ff}$, is generally set to around 14 | four times that of the token embedding $d_{model}$. 15 | So it is sometime also called the expand-and-contract network. 16 | 17 | There is an activation at the hidden layer, which is 18 | usually set to ReLU (Rectified Linear Unit) activation, $$\max(0, x)$$ 19 | 20 | That is, the FFN function is, 21 | $$FFN(x, W_1, W_2, b_1, b_2) = \max(0, x W_1 + b_1) W_2 + b_2$$ 22 | where $W_1$, $W_2$, $b_1$ and $b_2$ are learnable parameters. 23 | 24 | Sometimes the 25 | GELU (Gaussian Error Linear Unit) activation is also used instead of ReLU. 
26 | $$x \Phi(x)$$ where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$ 27 | 28 | ### Gated Linear Units 29 | 30 | This is a generic implementation that supports different variants including 31 | [Gated Linear Units](https://arxiv.org/abs/2002.05202) (GLU). 32 | We have also implemented experiments on these: 33 | 34 | * [experiment that uses `labml.configs`](glu_variants/experiment.html) 35 | * [simpler version from scratch](glu_variants/simple.html) 36 | """ 37 | 38 | import torch 39 | from torch import nn as nn 40 | 41 | from labml_helpers.module import Module 42 | 43 | 44 | class FeedForward(Module): 45 | """ 46 | ## FFN module 47 | """ 48 | 49 | def __init__(self, d_model: int, d_ff: int, 50 | dropout: float = 0.1, 51 | activation=nn.ReLU(), 52 | is_gated: bool = False, 53 | bias1: bool = True, 54 | bias2: bool = True, 55 | bias_gate: bool = True): 56 | """ 57 | * `d_model` is the number of features in a token embedding 58 | * `d_ff` is the number of features in the hidden layer of the FFN 59 | * `dropout` is dropout probability for the hidden layer 60 | * `is_gated` specifies whether the hidden layer is gated 61 | * `bias1` specified whether the first fully connected layer should have a learnable bias 62 | * `bias2` specified whether the second fully connected layer should have a learnable bias 63 | * `bias_gate` specified whether the fully connected layer for the gate should have a learnable bias 64 | """ 65 | super().__init__() 66 | # Layer one parameterized by weight $W_1$ and bias $b_1$ 67 | self.layer1 = nn.Linear(d_model, d_ff, bias=bias1) 68 | # Layer one parameterized by weight $W_1$ and bias $b_1$ 69 | self.layer2 = nn.Linear(d_ff, d_model, bias=bias2) 70 | # Hidden layer dropout 71 | self.dropout = nn.Dropout(dropout) 72 | # Activation function $f$ 73 | self.activation = activation 74 | # Whether there is a gate 75 | self.is_gated = is_gated 76 | if is_gated: 77 | # If there is a gate the linear layer to transform inputs to 78 | # be multiplied by the gate, parameterized by weight $V$ and bias $c$ 79 | self.linear_v = nn.Linear(d_model, d_ff, bias=bias_gate) 80 | 81 | def forward(self, x: torch.Tensor): 82 | # $f(x W_1 + b_1)$ 83 | g = self.activation(self.layer1(x)) 84 | # If gated, $f(x W_1 + b_1) \otimes (x V + b) $ 85 | if self.is_gated: 86 | x = g * self.linear_v(x) 87 | # Otherwise 88 | else: 89 | x = g 90 | # Apply dropout 91 | x = self.dropout(x) 92 | # $(f(x W_1 + b_1) \otimes (x V + b)) W_2 + b_2$ or $f(x W_1 + b_1) W_2 + b_2$ 93 | # depending on whether it is gated 94 | return self.layer2(x) 95 | -------------------------------------------------------------------------------- /labml_nn/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | # [labml.ai Annotated PyTorch Paper Implementations](index.html) 3 | 4 | This is a collection of simple PyTorch implementations of 5 | neural networks and related algorithms. 6 | [These implementations](https://github.com/lab-ml/nn) are documented with explanations, 7 | and the [website](index.html) 8 | renders these as side-by-side formatted notes. 9 | We believe these would help you understand these algorithms better. 10 | 11 | We are actively maintaining this repo and adding new 12 | implementations. 
13 | 14 | ## Modules 15 | 16 | #### ✨ [Transformers](transformers/index.html) 17 | 18 | * [Multi-headed attention](transformers/mha.html) 19 | * [Transformer building blocks](transformers/models.html) 20 | * [Transformer XL](transformers/xl/index.html) 21 | * [Relative multi-headed attention](transformers/xl/relative_mha.html) 22 | * [Compressive Transformer](transformers/compressive/index.html) 23 | * [GPT Architecture](transformers/gpt/index.html) 24 | * [GLU Variants](transformers/glu_variants/simple.html) 25 | * [kNN-LM: Generalization through Memorization](transformers/knn/index.html) 26 | * [Feedback Transformer](transformers/feedback/index.html) 27 | * [Switch Transformer](transformers/switch/index.html) 28 | * [Fast Weights Transformer](transformers/fast_weights/index.html) 29 | * [FNet](transformers/fnet/index.html) 30 | * [Attention Free Transformer](transformers/aft/index.html) 31 | * [Masked Language Model](transformers/mlm/index.html) 32 | * [Pay Attention to MLPs (gMLP)](transformers/gmlp/index.html) 33 | 34 | #### ✨ [Recurrent Highway Networks](recurrent_highway_networks/index.html) 35 | 36 | #### ✨ [LSTM](lstm/index.html) 37 | 38 | #### ✨ [HyperNetworks - HyperLSTM](hypernetworks/hyper_lstm.html) 39 | 40 | #### ✨ [Capsule Networks](capsule_networks/index.html) 41 | 42 | #### ✨ [Generative Adversarial Networks](gan/index.html) 43 | * [Original GAN](gan/original/index.html) 44 | * [GAN with deep convolutional network](gan/dcgan/index.html) 45 | * [Cycle GAN](gan/cycle_gan/index.html) 46 | * [Wasserstein GAN](gan/wasserstein/index.html) 47 | * [Wasserstein GAN with Gradient Penalty](gan/wasserstein/gradient_penalty/index.html) 48 | * [Style GAN 2](gan/stylegan/index.html) 49 | 50 | #### ✨ [Sketch RNN](sketch_rnn/index.html) 51 | 52 | #### ✨ [Reinforcement Learning](rl/index.html) 53 | * [Proximal Policy Optimization](rl/ppo/index.html) with 54 | [Generalized Advantage Estimation](rl/ppo/gae.html) 55 | * [Deep Q Networks](rl/dqn/index.html) with 56 | with [Dueling Network](rl/dqn/model.html), 57 | [Prioritized Replay](rl/dqn/replay_buffer.html) 58 | and Double Q Network. 59 | 60 | #### ✨ [Optimizers](optimizers/index.html) 61 | * [Adam](optimizers/adam.html) 62 | * [AMSGrad](optimizers/amsgrad.html) 63 | * [Adam Optimizer with warmup](optimizers/adam_warmup.html) 64 | * [Noam Optimizer](optimizers/noam.html) 65 | * [Rectified Adam Optimizer](optimizers/radam.html) 66 | * [AdaBelief Optimizer](optimizers/ada_belief.html) 67 | 68 | #### ✨ [Normalization Layers](https://nn.labml.ai/normalization/index.html) 69 | * [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html) 70 | * [Layer Normalization](https://nn.labml.ai/normalization/layer_norm/index.html) 71 | * [Instance Normalization](https://nn.labml.ai/normalization/instance_norm/index.html) 72 | * [Group Normalization](https://nn.labml.ai/normalization/group_norm/index.html) 73 | * [Weight Standardization](https://nn.labml.ai/normalization/weight_standardization/index.html) 74 | * [Batch-Channel Normalization](https://nn.labml.ai/normalization/batch_channel_norm/index.html) 75 | 76 | ### Installation 77 | 78 | ```bash 79 | pip install labml-nn 80 | ``` 81 | 82 | ### Citing LabML 83 | 84 | If you use LabML for academic research, please cite the library using the following BibTeX entry. 
85 | 86 | ```bibtex 87 | @misc{labml, 88 | author = {Varuna Jayasiri, Nipun Wijerathne}, 89 | title = {LabML: A library to organize machine learning experiments}, 90 | year = {2020}, 91 | url = {https://nn.labml.ai/}, 92 | } 93 | ``` 94 | """ 95 | -------------------------------------------------------------------------------- /labml_nn/optimizers/adam_warmup_cosine_decay.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Adam optimizer with warm-up and cosine decay 4 | summary: A PyTorch implementation/tutorial of Adam optimizer with warm-up and cosine decay for GPT. 5 | --- 6 | 7 | # Adam Optimizer with Warmup and Cosine Decay 8 | 9 | This extends [AMSGrad optimizer](adam.html) and adds a warmup stage. 10 | """ 11 | import math 12 | from typing import Dict 13 | 14 | from labml_nn.optimizers import WeightDecay 15 | from labml_nn.optimizers.amsgrad import AMSGrad 16 | 17 | 18 | class AdamWarmupCosineDecay(AMSGrad): 19 | """ 20 | 21 | ## Adam Optimizer with Warmup and Cosine Decay 22 | 23 | 24 | This class extends from AMSGrad optimizer defined in [`amsgrad.py`](amsgrad.html). 25 | """ 26 | 27 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, 28 | weight_decay: WeightDecay = WeightDecay(), 29 | optimized_update: bool = True, 30 | amsgrad=False, warmup=0, total_steps=1e10, defaults=None): 31 | """ 32 | ### Initialize the optimizer 33 | 34 | * `params` is the list of parameters 35 | * `lr` is the learning rate $\alpha$ 36 | * `betas` is a tuple of ($\beta_1$, $\beta_2$) 37 | * `eps` is $\hat{\epsilon}$ or $\epsilon$ based on `optimized_update` 38 | * `weight_decay` is an instance of class `WeightDecay` defined in [`__init__.py`](index.html) 39 | * 'optimized_update' is a flag whether to optimize the bias correction of the second moment 40 | by doing it after adding $\epsilon$ 41 | * `amsgrad` is a flag indicating whether to use AMSGrad or fallback to plain Adam 42 | * `warmup` number of warmup steps 43 | * `total_steps` total number of steps. Cosine decay reaches 0 at this, 44 | but stays at 10% of `lr` because we take $\alpha * \max(0.1, decay)$ 45 | * `defaults` is a dictionary of default for group values. 46 | This is useful when you want to extend the class `AdamWarmup`. 47 | """ 48 | 49 | defaults = {} if defaults is None else defaults 50 | defaults.update(dict(warmup=warmup, total_steps=total_steps)) 51 | super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults) 52 | 53 | def get_lr(self, state: Dict[str, any], group: Dict[str, any]): 54 | """ 55 | ### Get learning-rate 56 | 57 | $$\alpha \min \bigg(1, \frac{t}{w}\bigg)$$ 58 | where $w$ is the number of warmup steps. 
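        After the warmup stage, the code below decays the learning rate with a cosine
        schedule that is floored at $0.1 \alpha$. A sketch of that part of the schedule,
        writing $T$ for `total_steps` (this mirrors the `progress` computation in the code
        rather than coming from a separate reference), is

        $$\alpha \max\Bigg(0.1, \frac{1}{2}\bigg(1 + \cos\Big(\pi \frac{t - w}{T - w}\Big)\bigg)\Bigg)$$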
59 | """ 60 | # If we are in warmup stage 61 | if group['warmup'] > state['step']: 62 | # A linearly increasing learning rate from $0$ to $\alpha$ 63 | return 1e-8 + state['step'] * group['lr'] / group['warmup'] 64 | else: 65 | # Constant learning rate $\alpha$ 66 | progress = (state['step'] - group['warmup']) / max(1, group['total_steps'] - group['warmup']) 67 | return group['lr'] * max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress))) 68 | 69 | 70 | def _test_lr(): 71 | """ 72 | ### Plot learning rate for different warmups and model sizes 73 | 74 | ![Plot of learning rate](noam_lr.png) 75 | """ 76 | import matplotlib.pyplot as plt 77 | import numpy as np 78 | from torch import nn 79 | 80 | model = nn.Linear(10, 10) 81 | opt = AdamWarmupCosineDecay(model.parameters(), warmup=5000, lr=1e-4, total_steps=4e6) 82 | steps = 20_000 83 | plt.plot(np.arange(1, steps), [opt.get_lr({'step': i}, opt.defaults) for i in range(1, steps)]) 84 | plt.legend(["5000:4e6", "5000:2e6", "5000:1e6"]) 85 | plt.title("Learning Rate") 86 | plt.show() 87 | 88 | steps = int(6e6) 89 | step_size = 1000 90 | plt.plot(np.arange(1, steps, step_size), [opt.get_lr({'step': i}, opt.defaults) for i in range(1, steps, step_size)]) 91 | plt.legend(["5000:4e6", "5000:2e6", "5000:1e6"]) 92 | plt.title("Learning Rate") 93 | plt.show() 94 | 95 | 96 | if __name__ == '__main__': 97 | _test_lr() 98 | -------------------------------------------------------------------------------- /labml_nn/normalization/weight_standardization/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Weight Standardization 4 | summary: > 5 | A PyTorch implementation/tutorial of Weight Standardization. 6 | --- 7 | 8 | # Weight Standardization 9 | 10 | This is a [PyTorch](https://pytorch.org) implementation of Weight Standardization from the paper 11 | [Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://arxiv.org/abs/1903.10520). 12 | We also have an [annotated implementation of Batch-Channel Normalization](../batch_channel_norm/index.html). 13 | 14 | Batch normalization **gives a smooth loss landscape** and 15 | **avoids elimination singularities**. 16 | Elimination singularities are nodes of the network that become 17 | useless (e.g. a ReLU that gives 0 all the time). 18 | 19 | However, batch normalization doesn't work well when the batch size is too small, 20 | which happens when training large networks because of device memory limitations. 21 | The paper introduces Weight Standardization with Batch-Channel Normalization as 22 | a better alternative. 23 | 24 | Weight Standardization: 25 | 1. Normalizes the gradients 26 | 2. Smoothes the landscape (reduced Lipschitz constant) 27 | 3. Avoids elimination singularities 28 | 29 | The Lipschitz constant is the maximum slope a function has between two points. 30 | That is, $L$ is the Lipschitz constant where $L$ is the smallest value that satisfies, 31 | $\forall a,b \in A: \lVert f(a) - f(b) \rVert \le L \lVert a - b \rVert$ 32 | where $f: A \rightarrow \mathbb{R}^m, A \in \mathbb{R}^n$. 33 | 34 | Elimination singularities are avoided because it keeps the statistics of the outputs similar to the 35 | inputs. So as long as the inputs are normally distributed the outputs remain close to normal. 36 | This avoids outputs of nodes from always falling beyond the active range of the activation function 37 | (e.g. always negative input for a ReLU). 
38 | 39 | *[Refer to the paper for proofs](https://arxiv.org/abs/1903.10520)*. 40 | 41 | Here is [the training code](experiment.html) for training 42 | a VGG network that uses weight standardization to classify CIFAR-10 data. 43 | This uses a [2D-Convolution Layer with Weight Standardization](../conv2d.html). 44 | 45 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/weight_standardization/experiment.ipynb) 46 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/f4a783a2a7df11eb921d0242ac1c0002) 47 | [![WandB](https://img.shields.io/badge/wandb-run-yellow)](https://wandb.ai/vpj/cifar10/runs/3flr4k8w) 48 | """ 49 | 50 | import torch 51 | 52 | 53 | def weight_standardization(weight: torch.Tensor, eps: float): 54 | r""" 55 | ## Weight Standardization 56 | 57 | $$\hat{W}_{i,j} = \frac{W_{i,j} - \mu_{W_{i,\cdot}}} {\sigma_{W_{i,\cdot}}}$$ 58 | 59 | where, 60 | 61 | \begin{align} 62 | W &\in \mathbb{R}^{O \times I} \\ 63 | \mu_{W_{i,\cdot}} &= \frac{1}{I} \sum_{j=1}^I W_{i,j} \\ 64 | \sigma_{W_{i,\cdot}} &= \sqrt{\frac{1}{I} \sum_{j=1}^I W^2_{i,j} - \mu^2_{W_{i,\cdot}} + \epsilon} \\ 65 | \end{align} 66 | 67 | for a 2D-convolution layer $O$ is the number of output channels ($O = C_{out}$) 68 | and $I$ is the number of input channels times the kernel size ($I = C_{in} \times k_H \times k_W$) 69 | """ 70 | 71 | # Get $C_{out}$, $C_{in}$ and kernel shape 72 | c_out, c_in, *kernel_shape = weight.shape 73 | # Reshape $W$ to $O \times I$ 74 | weight = weight.view(c_out, -1) 75 | # Calculate 76 | # 77 | # \begin{align} 78 | # \mu_{W_{i,\cdot}} &= \frac{1}{I} \sum_{j=1}^I W_{i,j} \\ 79 | # \sigma^2_{W_{i,\cdot}} &= \frac{1}{I} \sum_{j=1}^I W^2_{i,j} - \mu^2_{W_{i,\cdot}} 80 | # \end{align} 81 | var, mean = torch.var_mean(weight, dim=1, keepdim=True) 82 | # Normalize 83 | # $$\hat{W}_{i,j} = \frac{W_{i,j} - \mu_{W_{i,\cdot}}} {\sigma_{W_{i,\cdot}}}$$ 84 | weight = (weight - mean) / (torch.sqrt(var + eps)) 85 | # Change back to original shape and return 86 | return weight.view(c_out, c_in, *kernel_shape) 87 | -------------------------------------------------------------------------------- /labml_nn/transformers/fast_weights/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Train Fast Weights Transformer 4 | summary: This is training code with notes for a Fast Weights Transformer. 5 | --- 6 | 7 | # Train Fast Weights Transformer 8 | 9 | This trains a fast weights transformer model for auto-regression. 10 | 11 | Here’s a Colab notebook for training a fast weights transformer on Tiny Shakespeare dataset. 
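For intuition about what the attention layers in this model compute, here is a toy sketch of the delta-rule fast weight update described in the paper. The ReLU feature map, sizes, and random inputs are stand-in assumptions; the actual model below uses `DPFP` features and the `FastWeightsAttention` module:

```python
import torch

# Toy, step-by-step sketch of the delta-rule fast weight update
# (assumptions: single head, ReLU feature map instead of DPFP, random inputs).
torch.manual_seed(0)
d_k, d_v = 8, 8
W = torch.zeros(d_v, d_k)                       # fast weight memory
phi = lambda u: torch.relu(u)                   # stand-in feature map

for step in range(16):
    k = phi(torch.randn(d_k)); k = k / (k.sum() + 1e-6)   # normalized key features
    q = phi(torch.randn(d_k)); q = q / (q.sum() + 1e-6)   # normalized query features
    v = torch.randn(d_v)                                   # value to store
    beta = torch.sigmoid(torch.randn(()))                  # write strength

    v_bar = W @ k                               # what the memory currently returns for k
    W = W + beta * torch.outer(v - v_bar, k)    # delta-rule write
    y = W @ q                                   # read-out used as the layer output
```

The memory `W` is only modified in the direction of the current key features, which is what lets the layer edit previously stored associations instead of just accumulating them.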
12 | 13 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/fast_weights/experiment.ipynb) 14 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/928aadc0846c11eb85710242ac1c0002) 15 | """ 16 | 17 | import torch 18 | from torch import nn 19 | 20 | from labml import experiment 21 | from labml.configs import option 22 | from labml.utils.pytorch import get_modules 23 | from labml_helpers.module import Module 24 | from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs 25 | 26 | 27 | class AutoregressiveModel(Module): 28 | """ 29 | ## Auto regressive model 30 | """ 31 | 32 | def __init__(self, n_vocab: int, d_model: int, transformer: Module): 33 | super().__init__() 34 | # Token embedding module 35 | self.src_embed = nn.Embedding(n_vocab, d_model) 36 | self.transformer = transformer 37 | self.generator = nn.Linear(d_model, n_vocab) 38 | 39 | def forward(self, x: torch.Tensor): 40 | # Embed the tokens 41 | x = self.src_embed(x) 42 | # Run it through the the transformer 43 | res = self.transformer(x) 44 | # Generate logits of the next token 45 | return self.generator(res), None 46 | 47 | 48 | class Configs(NLPAutoRegressionConfigs): 49 | """ 50 | ## Configurations 51 | 52 | The default configs can and will be over-ridden when we start the experiment 53 | """ 54 | 55 | model: AutoregressiveModel 56 | 57 | d_model: int = 512 58 | nu: int = 1 59 | heads: int = 8 60 | dropout: float = 0.0 61 | d_ff: int = 2048 62 | n_layers: int = 6 63 | 64 | 65 | @option(Configs.model) 66 | def fast_weights_transformer(c: Configs): 67 | """ 68 | Create [fast weights transformer](index.html). 69 | """ 70 | from labml_nn.transformers.fast_weights import FastWeightsAttentionTransformer, \ 71 | FastWeightsAttentionTransformerLayer, FastWeightsAttention, FeedForward 72 | 73 | from labml_nn.transformers.fast_weights import DPFP 74 | return AutoregressiveModel( 75 | c.n_tokens, c.d_model, 76 | FastWeightsAttentionTransformer( 77 | FastWeightsAttentionTransformerLayer(d_model=c.d_model, 78 | attn=FastWeightsAttention(c.heads, c.d_model, c.dropout, DPFP(nu=c.nu)), 79 | feed_forward=FeedForward(c.d_model, c.d_ff, c.dropout), 80 | dropout_prob=c.dropout), 81 | c.n_layers)).to(c.device) 82 | 83 | 84 | def main(): 85 | # Create experiment 86 | experiment.create(name="fast_weights_transformer") 87 | # Create configs 88 | conf = Configs() 89 | # Load configurations 90 | experiment.configs(conf, 91 | # A dictionary of configurations to override 92 | {'tokenizer': 'character', 93 | 'text': 'tiny_shakespeare', 94 | 'optimizer.learning_rate': 1.0, 95 | 'optimizer.optimizer': 'Noam', 96 | 'prompt': 'It is', 97 | 'prompt_separator': '', 98 | 99 | 'train_loader': 'shuffled_train_loader', 100 | 'valid_loader': 'shuffled_valid_loader', 101 | 102 | 'seq_len': 128, 103 | 'epochs': 128, 104 | 'batch_size': 16, 105 | 'inner_iterations': 25}) 106 | 107 | # Set models for saving and loading 108 | experiment.add_pytorch_models(get_modules(conf)) 109 | 110 | # Start the experiment 111 | with experiment.start(): 112 | # Run the training loop 113 | conf.run() 114 | 115 | 116 | if __name__ == '__main__': 117 | main() 118 | -------------------------------------------------------------------------------- /labml_nn/gan/dcgan/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Deep Convolutional Generative 
Adversarial Networks (DCGAN) 4 | summary: A simple PyTorch implementation/tutorial of Deep Convolutional Generative Adversarial Networks (DCGAN). 5 | --- 6 | 7 | # Deep Convolutional Generative Adversarial Networks (DCGAN) 8 | 9 | This is a [PyTorch](https://pytorch.org) implementation of paper 10 | [Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](https://arxiv.org/abs/1511.06434). 11 | 12 | This implementation is based on the [PyTorch DCGAN Tutorial](https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html). 13 | """ 14 | 15 | import torch.nn as nn 16 | 17 | from labml import experiment 18 | from labml.configs import calculate 19 | from labml_helpers.module import Module 20 | from labml_nn.gan.original.experiment import Configs 21 | 22 | 23 | class Generator(Module): 24 | """ 25 | ### Convolutional Generator Network 26 | 27 | This is similar to the de-convolutional network used for CelebA faces, 28 | but modified for MNIST images. 29 | 30 | 31 | """ 32 | 33 | def __init__(self): 34 | super().__init__() 35 | # The input is $1 \times 1$ with 100 channels 36 | self.layers = nn.Sequential( 37 | # This gives $3 \times 3$ output 38 | nn.ConvTranspose2d(100, 1024, 3, 1, 0, bias=False), 39 | nn.BatchNorm2d(1024), 40 | nn.ReLU(True), 41 | # This gives $7 \times 7$ 42 | nn.ConvTranspose2d(1024, 512, 3, 2, 0, bias=False), 43 | nn.BatchNorm2d(512), 44 | nn.ReLU(True), 45 | # This gives $14 \times 14$ 46 | nn.ConvTranspose2d(512, 256, 4, 2, 1, bias=False), 47 | nn.BatchNorm2d(256), 48 | nn.ReLU(True), 49 | # This gives $28 \times 28$ 50 | nn.ConvTranspose2d(256, 1, 4, 2, 1, bias=False), 51 | nn.Tanh() 52 | ) 53 | 54 | self.apply(_weights_init) 55 | 56 | def __call__(self, x): 57 | # Change from shape `[batch_size, 100]` to `[batch_size, 100, 1, 1]` 58 | x = x.unsqueeze(-1).unsqueeze(-1) 59 | x = self.layers(x) 60 | return x 61 | 62 | 63 | class Discriminator(Module): 64 | """ 65 | ### Convolutional Discriminator Network 66 | """ 67 | 68 | def __init__(self): 69 | super().__init__() 70 | # The input is $28 \times 28$ with one channel 71 | self.layers = nn.Sequential( 72 | # This gives $14 \times 14$ 73 | nn.Conv2d(1, 256, 4, 2, 1, bias=False), 74 | nn.LeakyReLU(0.2, inplace=True), 75 | # This gives $7 \times 7$ 76 | nn.Conv2d(256, 512, 4, 2, 1, bias=False), 77 | nn.BatchNorm2d(512), 78 | nn.LeakyReLU(0.2, inplace=True), 79 | # This gives $3 \times 3$ 80 | nn.Conv2d(512, 1024, 3, 2, 0, bias=False), 81 | nn.BatchNorm2d(1024), 82 | nn.LeakyReLU(0.2, inplace=True), 83 | # This gives $1 \times 1$ 84 | nn.Conv2d(1024, 1, 3, 1, 0, bias=False), 85 | ) 86 | self.apply(_weights_init) 87 | 88 | def forward(self, x): 89 | x = self.layers(x) 90 | return x.view(x.shape[0], -1) 91 | 92 | 93 | def _weights_init(m): 94 | classname = m.__class__.__name__ 95 | if classname.find('Conv') != -1: 96 | nn.init.normal_(m.weight.data, 0.0, 0.02) 97 | elif classname.find('BatchNorm') != -1: 98 | nn.init.normal_(m.weight.data, 1.0, 0.02) 99 | nn.init.constant_(m.bias.data, 0) 100 | 101 | 102 | # We import the [simple gan experiment]((simple_mnist_experiment.html) and change the 103 | # generator and discriminator networks 104 | calculate(Configs.generator, 'cnn', lambda c: Generator().to(c.device)) 105 | calculate(Configs.discriminator, 'cnn', lambda c: Discriminator().to(c.device)) 106 | 107 | 108 | def main(): 109 | conf = Configs() 110 | experiment.create(name='mnist_dcgan') 111 | experiment.configs(conf, 112 | {'discriminator': 'cnn', 113 | 'generator': 'cnn', 114 | 
'label_smoothing': 0.01}) 115 | with experiment.start(): 116 | conf.run() 117 | 118 | 119 | if __name__ == '__main__': 120 | main() 121 | -------------------------------------------------------------------------------- /labml_nn/resnets/utils/train.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import torch 4 | from torch.utils.data import DataLoader, ConcatDataset 5 | # from sklearn.model_selection import KFold 6 | # from torch.utils.data.sampler import SubsetRandomSampler 7 | 8 | import matplotlib.pyplot as plt 9 | from pylab import * 10 | import os 11 | 12 | from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR 13 | 14 | 15 | 16 | class Trainer(): 17 | def __init__(self, net, opt, cost, name="default", lr=0.0005, use_lr_schedule =False , device=None): 18 | self.net = net 19 | self.opt = opt 20 | self.cost = cost 21 | self.device = device 22 | self.epoch = 0 23 | self.start_epoch = 0 24 | self.name = name 25 | 26 | self.lr = lr 27 | self.use_lr_schedule = use_lr_schedule 28 | if self.use_lr_schedule: 29 | self.scheduler = ReduceLROnPlateau( self.opt, 'max', factor=0.1, patience=5, threshold=0.00001, verbose=True) 30 | # self.scheduler = StepLR(self.opt, step_size=15, gamma=0.1) 31 | 32 | # Train loop over epochs. Optinal use testloader to return test accuracy after each epoch 33 | def Train(self, trainloader, epochs, testloader=None): 34 | # Enable Dropout 35 | 36 | # Record loss/accuracies 37 | loss = torch.zeros(epochs) 38 | self.epoch = 0 39 | 40 | # If testloader is used, loss will be the accuracy 41 | for epoch in range(self.start_epoch, self.start_epoch+epochs): 42 | self.epoch = epoch+1 43 | 44 | self.net.train() # Enable Dropout 45 | for data in trainloader: 46 | # Get the inputs; data is a list of [inputs, labels] 47 | if self.device: 48 | images, labels = data[0].to(self.device), data[1].to(self.device) 49 | else: 50 | images, labels = data 51 | 52 | self.opt.zero_grad() 53 | # Forward + backward + optimize 54 | outputs = self.net(images) 55 | epoch_loss = self.cost(outputs, labels) 56 | epoch_loss.backward() 57 | self.opt.step() 58 | 59 | loss[epoch] += epoch_loss.item() 60 | 61 | if testloader: 62 | loss[epoch] = self.Test(testloader) 63 | else: 64 | loss[epoch] /= len(trainloader) 65 | 66 | print("Epoch %d Learning rate %.6f %s: %.3f" % ( 67 | self.epoch, self.opt.param_groups[0]['lr'], "Accuracy" if testloader else "Loss", loss[epoch])) 68 | 69 | #learning rate scheduler 70 | if self.use_lr_schedule: 71 | self.scheduler.step(loss[epoch]) 72 | # self.scheduler.step() 73 | 74 | # Saving best model 75 | if loss[epoch] >= torch.max(loss): 76 | self.save_best_model({ 77 | 'epoch': self.epoch, 78 | 'state_dict': self.net.state_dict(), 79 | 'optimizer': self.opt.state_dict(), 80 | }) 81 | 82 | return loss 83 | 84 | # Testing 85 | def Test(self, testloader, ret="accuracy"): 86 | # Disable Dropout 87 | self.net.eval() 88 | 89 | # Track correct and total 90 | correct = 0.0 91 | total = 0.0 92 | with torch.no_grad(): 93 | for data in testloader: 94 | if self.device: 95 | images, labels = data[0].to(self.device), data[1].to(self.device) 96 | else: 97 | images, labels = data 98 | 99 | outputs = self.net(images) 100 | _, predicted = torch.max(outputs.data, 1) 101 | total += labels.size(0) 102 | correct += (predicted == labels).sum().item() 103 | 104 | return correct / total 105 | 106 | def save_best_model(self, state): 107 | directory = os.path.dirname("./save/%s-best-model/"%(self.name)) 108 | if not os.path.exists(directory): 
109 | os.mkdir(directory) 110 | torch.save(state, "%s/model.pt" %(directory)) 111 | 112 | def save_checkpoint(self, state): 113 | directory = os.path.dirname("./save/%s-checkpoints/"%(self.name)) 114 | if not os.path.exists(directory): 115 | os.mkdir(directory) 116 | torch.save(state, "%s/model_epoch_%s.pt" %(directory, self.epoch)) 117 | # torch.save(state, "./save/checkpoints/model_epoch_%s.pt" % (self.epoch)) 118 | -------------------------------------------------------------------------------- /labml_nn/normalization/batch_norm/readme.md: -------------------------------------------------------------------------------- 1 | # [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation of Batch Normalization from paper 4 | [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167). 5 | 6 | ### Internal Covariate Shift 7 | 8 | The paper defines *Internal Covariate Shift* as the change in the 9 | distribution of network activations due to the change in 10 | network parameters during training. 11 | For example, let's say there are two layers $l_1$ and $l_2$. 12 | During the beginning of the training $l_1$ outputs (inputs to $l_2$) 13 | could be in distribution $\mathcal{N}(0.5, 1)$. 14 | Then, after some training steps, it could move to $\mathcal{N}(0.6, 1.5)$. 15 | This is *internal covariate shift*. 16 | 17 | Internal covariate shift will adversely affect training speed because the later layers 18 | ($l_2$ in the above example) have to adapt to this shifted distribution. 19 | 20 | By stabilizing the distribution, batch normalization minimizes the internal covariate shift. 21 | 22 | ## Normalization 23 | 24 | It is known that whitening improves training speed and convergence. 25 | *Whitening* is linearly transforming inputs to have zero mean, unit variance, 26 | and be uncorrelated. 27 | 28 | ### Normalizing outside gradient computation doesn't work 29 | 30 | Normalizing outside the gradient computation using pre-computed (detached) 31 | means and variances doesn't work. For instance. (ignoring variance), let 32 | $$\hat{x} = x - \mathbb{E}[x]$$ 33 | where $x = u + b$ and $b$ is a trained bias 34 | and $\mathbb{E}[x]$ is an outside gradient computation (pre-computed constant). 35 | 36 | Note that $\hat{x}$ has no effect on $b$. 37 | Therefore, 38 | $b$ will increase or decrease based 39 | $\frac{\partial{\mathcal{L}}}{\partial x}$, 40 | and keep on growing indefinitely in each training update. 41 | The paper notes that similar explosions happen with variances. 42 | 43 | ### Batch Normalization 44 | 45 | Whitening is computationally expensive because you need to de-correlate and 46 | the gradients must flow through the full whitening calculation. 47 | 48 | The paper introduces a simplified version which they call *Batch Normalization*. 49 | First simplification is that it normalizes each feature independently to have 50 | zero mean and unit variance: 51 | $$\hat{x}^{(k)} = \frac{x^{(k)} - \mathbb{E}[x^{(k)}]}{\sqrt{Var[x^{(k)}]}}$$ 52 | where $x = (x^{(1)} ... x^{(d)})$ is the $d$-dimensional input. 53 | 54 | The second simplification is to use estimates of mean $\mathbb{E}[x^{(k)}]$ 55 | and variance $Var[x^{(k)}]$ from the mini-batch 56 | for normalization; instead of calculating the mean and variance across the whole dataset. 57 | 58 | Normalizing each feature to zero mean and unit variance could affect what the layer 59 | can represent. 
60 | As an example paper illustrates that, if the inputs to a sigmoid are normalized 61 | most of it will be within $[-1, 1]$ range where the sigmoid is linear. 62 | To overcome this each feature is scaled and shifted by two trained parameters 63 | $\gamma^{(k)}$ and $\beta^{(k)}$. 64 | $$y^{(k)} =\gamma^{(k)} \hat{x}^{(k)} + \beta^{(k)}$$ 65 | where $y^{(k)}$ is the output of the batch normalization layer. 66 | 67 | Note that when applying batch normalization after a linear transform 68 | like $Wu + b$ the bias parameter $b$ gets cancelled due to normalization. 69 | So you can and should omit bias parameter in linear transforms right before the 70 | batch normalization. 71 | 72 | Batch normalization also makes the back propagation invariant to the scale of the weights 73 | and empirically it improves generalization, so it has regularization effects too. 74 | 75 | ## Inference 76 | 77 | We need to know $\mathbb{E}[x^{(k)}]$ and $Var[x^{(k)}]$ in order to 78 | perform the normalization. 79 | So during inference, you either need to go through the whole (or part of) dataset 80 | and find the mean and variance, or you can use an estimate calculated during training. 81 | The usual practice is to calculate an exponential moving average of 82 | mean and variance during the training phase and use that for inference. 83 | 84 | Here's [the training code](mnist.html) and a notebook for training 85 | a CNN classifier that uses batch normalization for MNIST dataset. 86 | 87 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/batch_norm/mnist.ipynb) 88 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/011254fe647011ebbb8e0242ac1c0002) 89 | -------------------------------------------------------------------------------- /labml_nn/optimizers/mnist_experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: MNIST example to test the optimizers 4 | summary: This is a simple MNIST example with a CNN model to test the optimizers. 
5 | --- 6 | 7 | # MNIST example to test the optimizers 8 | """ 9 | import torch.nn as nn 10 | import torch.utils.data 11 | from labml_helpers.module import Module 12 | 13 | from labml import experiment, tracker 14 | from labml.configs import option 15 | from labml_helpers.datasets.mnist import MNISTConfigs 16 | from labml_helpers.device import DeviceConfigs 17 | from labml_helpers.metrics.accuracy import Accuracy 18 | from labml_helpers.seed import SeedConfigs 19 | from labml_helpers.train_valid import TrainValidConfigs, BatchIndex, hook_model_outputs 20 | from labml_nn.optimizers.configs import OptimizerConfigs 21 | 22 | 23 | class Model(Module): 24 | """ 25 | ## The model 26 | """ 27 | def __init__(self): 28 | super().__init__() 29 | self.conv1 = nn.Conv2d(1, 20, 5, 1) 30 | self.pool1 = nn.MaxPool2d(2) 31 | self.conv2 = nn.Conv2d(20, 50, 5, 1) 32 | self.pool2 = nn.MaxPool2d(2) 33 | self.fc1 = nn.Linear(16 * 50, 500) 34 | self.fc2 = nn.Linear(500, 10) 35 | self.activation = nn.ReLU() 36 | 37 | def forward(self, x): 38 | x = self.activation(self.conv1(x)) 39 | x = self.pool1(x) 40 | x = self.activation(self.conv2(x)) 41 | x = self.pool2(x) 42 | x = self.activation(self.fc1(x.view(-1, 16 * 50))) 43 | return self.fc2(x) 44 | 45 | 46 | class Configs(MNISTConfigs, TrainValidConfigs): 47 | """ 48 | ## Configurable Experiment Definition 49 | """ 50 | optimizer: torch.optim.Adam 51 | model: nn.Module 52 | set_seed = SeedConfigs() 53 | device: torch.device = DeviceConfigs() 54 | epochs: int = 10 55 | 56 | is_save_models = True 57 | model: nn.Module 58 | inner_iterations = 10 59 | 60 | accuracy_func = Accuracy() 61 | loss_func = nn.CrossEntropyLoss() 62 | 63 | def init(self): 64 | tracker.set_queue("loss.*", 20, True) 65 | tracker.set_scalar("accuracy.*", True) 66 | hook_model_outputs(self.mode, self.model, 'model') 67 | self.state_modules = [self.accuracy_func] 68 | 69 | def step(self, batch: any, batch_idx: BatchIndex): 70 | # Get the batch 71 | data, target = batch[0].to(self.device), batch[1].to(self.device) 72 | 73 | # Add global step if we are in training mode 74 | if self.mode.is_train: 75 | tracker.add_global_step(len(data)) 76 | 77 | # Run the model and specify whether to log the activations 78 | with self.mode.update(is_log_activations=batch_idx.is_last): 79 | output = self.model(data) 80 | 81 | # Calculate the loss 82 | loss = self.loss_func(output, target) 83 | # Calculate the accuracy 84 | self.accuracy_func(output, target) 85 | # Log the loss 86 | tracker.add("loss.", loss) 87 | 88 | # Optimize if we are in training mode 89 | if self.mode.is_train: 90 | # Calculate the gradients 91 | loss.backward() 92 | 93 | # Take optimizer step 94 | self.optimizer.step() 95 | # Log the parameter and gradient L2 norms once per epoch 96 | if batch_idx.is_last: 97 | tracker.add('model', self.model) 98 | tracker.add('optimizer', (self.optimizer, {'model': self.model})) 99 | # Clear the gradients 100 | self.optimizer.zero_grad() 101 | 102 | # Save logs 103 | tracker.save() 104 | 105 | 106 | @option(Configs.model) 107 | def model(c: Configs): 108 | return Model().to(c.device) 109 | 110 | 111 | @option(Configs.optimizer) 112 | def _optimizer(c: Configs): 113 | """ 114 | Create a configurable optimizer. 115 | We can change the optimizer type and hyper-parameters using configurations. 
116 | """ 117 | opt_conf = OptimizerConfigs() 118 | opt_conf.parameters = c.model.parameters() 119 | return opt_conf 120 | 121 | 122 | def main(): 123 | conf = Configs() 124 | conf.inner_iterations = 10 125 | experiment.create(name='mnist_ada_belief') 126 | experiment.configs(conf, {'inner_iterations': 10, 127 | # Specify the optimizer 128 | 'optimizer.optimizer': 'Adam', 129 | 'optimizer.learning_rate': 1.5e-4}) 130 | conf.set_seed.set() 131 | experiment.add_pytorch_models(dict(model=conf.model)) 132 | with experiment.start(): 133 | conf.run() 134 | 135 | 136 | if __name__ == '__main__': 137 | main() 138 | -------------------------------------------------------------------------------- /docs/resnets/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | None 24 | 25 | 26 | 27 | 28 | 29 | 40 | 41 | 42 |
-------------------------------------------------------------------------------- /docs/experiments/index.html: --------------------------------------------------------------------------------
65 | 66 | 68 | 69 | 83 | 123 | 124 | -------------------------------------------------------------------------------- /labml_nn/normalization/instance_norm/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Instance Normalization 4 | summary: > 5 | A PyTorch implementation/tutorial of instance normalization. 6 | --- 7 | 8 | # Instance Normalization 9 | 10 | This is a [PyTorch](https://pytorch.org) implementation of 11 | [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022). 12 | 13 | Instance normalization was introduced to improve [style transfer](https://paperswithcode.com/task/style-transfer). 14 | It is based on the observation that stylization should not depend on the contrast of the content image. 15 | The "contrast normalization" is 16 | 17 | $$y_{t,i,j,k} = \frac{x_{t,i,j,k}}{\sum_{l=1}^H \sum_{m=1}^W x_{t,i,l,m}}$$ 18 | 19 | where $x$ is a batch of images with dimensions image index $t$, 20 | feature channel $i$, and 21 | spatial position $j, k$. 22 | 23 | Since it's hard for a convolutional network to learn "contrast normalization", this paper 24 | introduces instance normalization which does that. 25 | 26 | Here's a [CIFAR 10 classification model](experiment.html) that uses instance normalization. 27 | """ 28 | 29 | import torch 30 | from torch import nn 31 | 32 | from labml_helpers.module import Module 33 | 34 | 35 | class InstanceNorm(Module): 36 | r""" 37 | ## Instance Normalization Layer 38 | 39 | Instance normalization layer $\text{IN}$ normalizes the input $X$ as follows: 40 | 41 | When input $X \in \mathbb{R}^{B \times C \times H \times W}$ is a batch of image representations, 42 | where $B$ is the batch size, $C$ is the number of channels, $H$ is the height and $W$ is the width. 43 | $\gamma \in \mathbb{R}^{C}$ and $\beta \in \mathbb{R}^{C}$. The affine transformation with $gamma$ and 44 | $beta$ are optional. 45 | 46 | $$\text{IN}(X) = \gamma 47 | \frac{X - \underset{H, W}{\mathbb{E}}[X]}{\sqrt{\underset{H, W}{Var}[X] + \epsilon}} 48 | + \beta$$ 49 | """ 50 | 51 | def __init__(self, channels: int, *, 52 | eps: float = 1e-5, affine: bool = True): 53 | """ 54 | * `channels` is the number of features in the input 55 | * `eps` is $\epsilon$, used in $\sqrt{Var[X] + \epsilon}$ for numerical stability 56 | * `affine` is whether to scale and shift the normalized value 57 | """ 58 | super().__init__() 59 | 60 | self.channels = channels 61 | 62 | self.eps = eps 63 | self.affine = affine 64 | # Create parameters for $\gamma$ and $\beta$ for scale and shift 65 | if self.affine: 66 | self.scale = nn.Parameter(torch.ones(channels)) 67 | self.shift = nn.Parameter(torch.zeros(channels)) 68 | 69 | def forward(self, x: torch.Tensor): 70 | """ 71 | `x` is a tensor of shape `[batch_size, channels, *]`. 72 | `*` denotes any number of (possibly 0) dimensions. 73 | For example, in an image (2D) convolution this will be 74 | `[batch_size, channels, height, width]` 75 | """ 76 | # Keep the original shape 77 | x_shape = x.shape 78 | # Get the batch size 79 | batch_size = x_shape[0] 80 | # Sanity check to make sure the number of features is the same 81 | assert self.channels == x.shape[1] 82 | 83 | # Reshape into `[batch_size, channels, n]` 84 | x = x.view(batch_size, self.channels, -1) 85 | 86 | # Calculate the mean across last dimension 87 | # i.e. 
the means for each feature $\mathbb{E}[x_{t,i}]$ 88 | mean = x.mean(dim=[-1], keepdim=True) 89 | # Calculate the mean of the squares across the last dimension; 90 | # i.e. the mean of the squares for each feature $\mathbb{E}[x_{t,i}^2]$ 91 | mean_x2 = (x ** 2).mean(dim=[-1], keepdim=True) 92 | # Variance for each feature $Var[x_{t,i}] = \mathbb{E}[x_{t,i}^2] - \mathbb{E}[x_{t,i}]^2$ 93 | var = mean_x2 - mean ** 2 94 | 95 | # Normalize $$\hat{x}_{t,i} = \frac{x_{t,i} - \mathbb{E}[x_{t,i}]}{\sqrt{Var[x_{t,i}] + \epsilon}}$$ 96 | x_norm = (x - mean) / torch.sqrt(var + self.eps) 97 | x_norm = x_norm.view(batch_size, self.channels, -1) 98 | 99 | # Scale and shift $$y_{t,i} = \gamma_i \hat{x}_{t,i} + \beta_i$$ 100 | if self.affine: 101 | x_norm = self.scale.view(1, -1, 1) * x_norm + self.shift.view(1, -1, 1) 102 | 103 | # Reshape to the original shape and return 104 | return x_norm.view(x_shape) 105 | 106 | 107 | def _test(): 108 | """ 109 | Simple test 110 | """ 111 | from labml.logger import inspect 112 | 113 | x = torch.zeros([2, 6, 2, 4]) 114 | inspect(x.shape) 115 | bn = InstanceNorm(6) 116 | 117 | x = bn(x) 118 | inspect(x.shape) 119 | 120 | 121 | # 122 | if __name__ == '__main__': 123 | _test() 124 |
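A quick way to sanity-check the `InstanceNorm` module above is to compare it with PyTorch's built-in `nn.InstanceNorm2d`. This is a minimal sketch, assuming the `labml-nn` package is installed and importable; right after construction the affine parameters are initialized to $\gamma = 1$ and $\beta = 0$, so the output should match the non-affine built-in layer.

import torch
from torch import nn
from labml_nn.normalization.instance_norm import InstanceNorm

# A random batch of 4 images with 3 channels
x = torch.randn(4, 3, 8, 8)
# The module implemented above (scale = 1, shift = 0 right after construction)
custom = InstanceNorm(3)
# PyTorch's reference implementation, without the affine transformation
builtin = nn.InstanceNorm2d(3, affine=False)
# Both normalize each (sample, channel) plane over its H x W values,
# so the outputs should agree up to numerical precision
assert torch.allclose(custom(x), builtin(x), atol=1e-5)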
-------------------------------------------------------------------------------- /labml_nn/cnn/ray_tune.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import os 5 | import torch 6 | from ray import tune 7 | from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining 8 | from utils.train import Trainer 9 | from models.cnn import GetCNN 10 | 11 | # Check if a GPU is available 12 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 13 | print("Device: " + str(device)) 14 | 15 | # 16 | num_samples = 40 # number of trials to run 17 | max_num_epochs = 25 18 | gpus_per_trial = 1 19 | 20 | # CIFAR-10 dataset location 21 | data_dir = './data/Cifar10' 22 | 23 | """ 24 | Code adapted from the official Ray Tune documentation: 25 | ASHA 26 | https://docs.ray.io/en/master/tune/api_docs/schedulers.html#tune-scheduler-hyperband 27 | 28 | PBT 29 | https://docs.ray.io/en/latest/tune/api_docs/schedulers.html#tune-scheduler-pbt 30 | """ 31 | 32 | """config - a dict of hyperparameter search spaces 33 | 34 | Hyperparameters selected for tuning: 35 | l1 : Number of units in the first fully connected layer 36 | l2 : Number of units in the second fully connected layer 37 | lr : Learning rate 38 | decay : Decay rate for regularization 39 | batch_size : Batch size of the train and test data 40 | """ 41 | config = { 42 | "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)), # e.g. 4, 8, 16, ..., 256 43 | "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)), # e.g. 4, 8, 16, ..., 256 44 | "lr": tune.loguniform(1e-4, 1e-1), # sampled from a log-uniform distribution 45 | "decay": tune.sample_from(lambda _: 10 ** np.random.randint(-7, -3)), # e.g. 1e-7, 1e-6, ..., 1e-4 46 | "batch_size": tune.choice([32, 64, 128, 256]) 47 | } 48 | 49 | # Create the trainer 50 | trainer = Trainer(device=device) 51 | 52 | """ASHA (Asynchronous Successive Halving Algorithm) scheduler 53 | max_t : Maximum number of units per trial (can be time or epochs) 54 | grace_period : Stop a trial after this many units if it is not performing well (can be time or epochs) 55 | reduction_factor : Sets the halving rate 56 | """ 57 | scheduler = ASHAScheduler( 58 | max_t=max_num_epochs, 59 | grace_period=4, 60 | reduction_factor=4) 61 | 62 | 63 | 64 | """Population based training (PBT) scheduler; note that this assignment replaces the ASHA scheduler above, so keep whichever scheduler you want to use 65 | time_attr : Can be time or epochs 66 | metric : Objective of training (loss or accuracy) 67 | perturbation_interval : Perturbations occur after the specified number of units (can be time or epochs) 68 | hyperparam_mutations : Hyperparameters to mutate 69 | """ 70 | scheduler = PopulationBasedTraining( 71 | time_attr="training_iteration", # epochs 72 | metric='loss', # loss is the objective function 73 | mode='min', # minimizing the loss is the objective of training 74 | perturbation_interval=5.0, # perturb every 5 epochs 75 | hyperparam_mutations={ 76 | "lr": [1e-3, 5e-4, 1e-4, 5e-4, 1e-5], # choose from the given learning rates 77 | "batch_size": [64, 128, 256], # choose from the given batch sizes 78 | "decay": tune.uniform(10**-8, 10**-4) # sample from a uniform distribution 79 | } 80 | ) 81 | 82 | result = tune.run( 83 | tune.with_parameters(trainer.Train_ray, data_dir=data_dir), 84 | name="ray_test_basic-CNN", # name for identifying models (checkpoints) 85 | scheduler=scheduler, # select scheduler: PBT or ASHA 86 | resources_per_trial={"cpu": 8, "gpu": gpus_per_trial}, # number of CPUs and GPUs per trial 87 | config=config, # config dict of hyperparameter search spaces 88 | stop={ 89 | "training_iteration": max_num_epochs, # stopping criteria 90 | }, 91 | # metric="loss", # uncomment when using the ASHA scheduler (PBT above already sets its own metric/mode) 92 | # mode="min", # uncomment when using the ASHA scheduler 93 | num_samples=num_samples, 94 | verbose=True, # keep True to check how training progresses 95 | fail_fast=True, # fail on the first error 96 | keep_checkpoints_num=5, # number of checkpoints to keep per trial 97 | 98 | ) 99 | 100 | best_trial = result.get_best_trial("loss", "min", "last") 101 | print("Best configuration: {}".format(best_trial.config)) 102 | print("Best validation loss: {}".format(best_trial.last_result["loss"])) 103 | print("Best validation accuracy: {}".format( 104 | best_trial.last_result["accuracy"])) 105 | 106 | 107 | best_trained_model = GetCNN(best_trial.config["l1"], best_trial.config["l2"]) 108 | best_trained_model.to(device) 109 | checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint") 110 | model_state, optimizer_state = torch.load(checkpoint_path) 111 | best_trained_model.load_state_dict(model_state) 112 | 113 | # Check the accuracy of the best model 114 | test_acc = trainer.Test(best_trained_model, save=data_dir) 115 | print("Best Test accuracy: {}".format(test_acc))
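`utils.train.Trainer.Train_ray` is not shown in this listing, so here is a minimal, self-contained sketch (a toy trainable, not the actual `Trainer`) of the kind of function `tune.run()` drives: Ray calls it once per trial with a sampled `config`, and the per-epoch `tune.report(...)` calls supply the `loss` metric that lets ASHA stop weak trials early and PBT decide when to perturb.

from ray import tune

def toy_trainable(config):
    # Stand-in for a training loop: a scalar shrinks at a rate set by the sampled learning rate
    w = 10.0
    for epoch in range(20):
        w -= config["lr"] * w
        # Report the metric that the scheduler and tune.run() are configured to optimize
        tune.report(loss=abs(w))

analysis = tune.run(
    toy_trainable,
    config={"lr": tune.loguniform(1e-3, 1e-1)},
    num_samples=4,
    metric="loss",
    mode="min",
)
print(analysis.get_best_trial("loss", "min", "last").config)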
-------------------------------------------------------------------------------- /docs/resnets/utils/index.html: --------------------------------------------------------------------------------
-------------------------------------------------------------------------------- /docs/resnets/models/index.html: --------------------------------------------------------------------------------
66 | 67 | 69 | 70 | 84 | 124 | 125 | -------------------------------------------------------------------------------- /labml_nn/transformers/fast_weights/token_wise.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Fast Weight Systems 4 | summary: > 5 | This is an annotated implementation/tutorial of 6 | Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch. 7 | --- 8 | """ 9 | from typing import Optional 10 | 11 | import torch 12 | from torch import nn 13 | 14 | from labml_helpers.module import Module 15 | from labml_nn.transformers.fast_weights import DPFP 16 | from labml_nn.transformers.feed_forward import FeedForward 17 | from labml_nn.transformers.mha import PrepareForMultiHeadAttention 18 | from labml_nn.utils import clone_module_list 19 | 20 | 21 | class FastWeightsAttention(Module): 22 | def __init__(self, heads: int, d_model: int, dropout_prob: float, phi: DPFP): 23 | super().__init__() 24 | 25 | # Number of features per head 26 | self.d_k = d_model // heads 27 | # 28 | self.heads = heads 29 | 30 | # These transform the `query` multi-headed attention. 31 | self.query = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=False) 32 | # These transform the `key` and `value` for multi-headed attention. 33 | self.key = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=False) 34 | self.value = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=False) 35 | 36 | self.gate = nn.Sequential(PrepareForMultiHeadAttention(d_model, heads, 1, bias=False), 37 | nn.Sigmoid()) 38 | 39 | self.phi = phi 40 | 41 | # Output layer 42 | self.output = nn.Linear(d_model, d_model) 43 | # Dropout 44 | self.dropout = nn.Dropout(dropout_prob) 45 | 46 | def __call__(self, x: torch.Tensor, weights: Optional[torch.Tensor]): 47 | query = self.phi(self.query(x)) 48 | key = self.phi(self.key(x)) 49 | value = self.value(x) 50 | 51 | if weights is None: 52 | weights = key.new_zeros((key.shape[0], key.shape[1], value.shape[2], key.shape[2])) 53 | 54 | value_existing = torch.einsum('bhvk,bhk->bhv', weights, key) 55 | 56 | beta = self.gate(x) 57 | 58 | weights = weights + torch.einsum('bhv,bhk->bhvk', beta * (value - value_existing), key) 59 | 60 | x = torch.einsum('bhvk,bhk->bhv', weights, query) 61 | 62 | # Concatenate multiple heads 63 | x = x.reshape(x.shape[0], -1) 64 | 65 | # Output layer 66 | return self.output(x), weights 67 | 68 | 69 | class FastWeightsAttentionTransformerLayer(Module): 70 | def __init__(self, *, 71 | d_model: int, 72 | attn: FastWeightsAttention, 73 | feed_forward: FeedForward, 74 | dropout_prob: float): 75 | super().__init__() 76 | # Transformer size $d_{model}$ 77 | self.size = d_model 78 | # 79 | self.attn = attn 80 | self.feed_forward = feed_forward 81 | self.dropout = nn.Dropout(dropout_prob) 82 | 83 | # Normalization layers 84 | self.norm_self_attn = nn.LayerNorm([d_model]) 85 | self.norm_ff = nn.LayerNorm([d_model]) 86 | 87 | def __call__(self, x: torch.Tensor, weights: Optional[torch.Tensor]): 88 | attn, weights = self.attn(x, weights) 89 | # Add the self attention results 90 | x = x + self.dropout(attn) 91 | 92 | # Normalize for feed-forward 93 | z = self.norm_ff(x) 94 | # Pass through the feed-forward network 95 | ff = self.feed_forward(z) 96 | # Add the feed-forward results back 97 | x = x + self.dropout(ff) 98 | 99 | # 100 | return x, weights 101 | 102 | 103 | class FastWeightsAttentionTransformer(Module): 104 | def __init__(self, layer: FastWeightsAttentionTransformerLayer, 
n_layers: int): 105 | super().__init__() 106 | # Make copies of the transformer layer 107 | self.layers = clone_module_list(layer, n_layers) 108 | # Final normalization layer 109 | self.norm = nn.LayerNorm([layer.size]) 110 | 111 | def __call__(self, x_seq: torch.Tensor): 112 | # Split the input to a list along the sequence axis 113 | x_seq = torch.unbind(x_seq, dim=0) 114 | # List to store the outputs 115 | res = [] 116 | # For each input step 117 | weights = [None for _ in range(len(self.layers))] 118 | 119 | for x in x_seq: 120 | # Run through each layer 121 | for i, layer in enumerate(self.layers): 122 | # Get layer output 123 | x, weights[i] = layer(x, weights[i]) 124 | 125 | res.append(x) 126 | 127 | # Stack the output tensors 128 | res = torch.stack(res) 129 | # Normalize the output 130 | return self.norm(res) 131 | -------------------------------------------------------------------------------- /docs/transformers/basic/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | None 24 | 25 | 26 | 27 | 28 | 29 | 40 | 41 | 42 |
66 | 67 | 69 | 70 | 84 | 124 | 125 | -------------------------------------------------------------------------------- /labml_nn/transformers/glu_variants/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Gated Linear Units and Variants 4 | summary: > 5 | Train an auto-regressive transformer with Gated Linear Units and variants 6 | for the position-wise feedforward network (FFN). 7 | --- 8 | 9 | # Gated Linear Units and Variants 10 | 11 | This trains a simple [transformer](../../) model for auto-regression. 12 | We try different variants for the [position-wise feedforward network](../feed_forward). 13 | The reusable & configurable are defined in [`configs.py`](configs.html). 14 | """ 15 | 16 | import torch 17 | from labml import experiment 18 | from labml.configs import option 19 | from labml.utils.pytorch import get_modules 20 | from labml_helpers.module import Module 21 | 22 | from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs 23 | from labml_nn.transformers import Encoder, Generator, TransformerConfigs 24 | from labml_nn.transformers.utils import subsequent_mask 25 | 26 | 27 | class AutoregressiveModel(Module): 28 | """ 29 | ## Auto regressive model 30 | """ 31 | 32 | def __init__(self, src_embed: Module, encoder: Encoder, generator: Generator): 33 | super().__init__() 34 | # Token embedding module 35 | self.src_embed = src_embed 36 | # Transformer based encoder 37 | self.encoder = encoder 38 | # Next token generation layer; 39 | # this give logits of the the next token 40 | self.generator = generator 41 | # This will be initialized on the first call 42 | self.src_mask = None 43 | 44 | def forward(self, src: torch.Tensor): 45 | # Create subsequent mask, so that the transformer can only pay attention to past tokens. 46 | if self.src_mask is None or self.src_mask.size(0) != len(src): 47 | self.src_mask = subsequent_mask(len(src)).to(src.device) 48 | # Embed the tokens (`src`) and run it through the the transformer 49 | res = self.encoder(self.src_embed(src), self.src_mask) 50 | # Generate logits of the next token 51 | return self.generator(res), None 52 | 53 | 54 | class Configs(NLPAutoRegressionConfigs): 55 | """ 56 | ## Configurations 57 | 58 | The default configs can and will be over-ridden when we start the experiment 59 | """ 60 | 61 | transformer: TransformerConfigs 62 | model: AutoregressiveModel 63 | 64 | 65 | @option(Configs.model) 66 | def autoregressive_model(c: Configs): 67 | """ 68 | Initialize the auto-regressive model 69 | """ 70 | m = AutoregressiveModel(c.transformer.src_embed, c.transformer.encoder, c.transformer.generator) 71 | return m.to(c.device) 72 | 73 | 74 | @option(Configs.transformer) 75 | def transformer_c(c: Configs): 76 | """ 77 | Initialize the [configurable transformer](../configs.html) encoder for our autoregressive model. 
78 | """ 79 | tc = TransformerConfigs() 80 | tc.n_src_vocab = c.n_tokens 81 | tc.n_tgt_vocab = c.n_tokens 82 | 83 | return tc 84 | 85 | 86 | def main(): 87 | # Create experiment 88 | experiment.create(name="glu_variants") 89 | # Create configs 90 | conf = Configs() 91 | # Load configurations 92 | experiment.configs(conf, 93 | # A dictionary of configurations to override 94 | {'tokenizer': 'character', 95 | 'prompt_separator': '', 96 | 'prompt': 'It is ', 97 | 'text': 'tiny_shakespeare', 98 | 99 | 'optimizer.optimizer': 'Noam', 100 | 'optimizer.learning_rate': 1., 101 | 'optimizer.d_model': 256, 102 | 103 | 'seq_len': 1024, 104 | 'epochs': 128, 105 | 'batch_size': 6, 106 | 'inner_iterations': 10, 107 | 108 | # GLU Variant, one of GLU, Bilinear, ReGLU, GEGLU, SwiGLU 109 | # 110 | # These are defined in the [configurable FFN](../configs.html#FFN) 111 | # implementation 112 | 'transformer.ffn.glu_variant': 'Bilinear', 113 | 114 | # Transformer configurations 115 | 'transformer.d_model': 256, 116 | 'transformer.ffn.d_ff': 1024, 117 | 'transformer.n_heads': 8, 118 | 'transformer.n_layers': 6}) 119 | 120 | # This is needed to initialize models 121 | conf.n_tokens = conf.text.n_tokens 122 | 123 | # Set models for saving and loading 124 | experiment.add_pytorch_models(get_modules(conf)) 125 | 126 | # Start the experiment 127 | with experiment.start(): 128 | # `TrainValidConfigs.run` 129 | conf.run() 130 | 131 | 132 | if __name__ == '__main__': 133 | main() 134 | -------------------------------------------------------------------------------- /labml_nn/transformers/fnet/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: FNet Experiment 4 | summary: This experiment trains a FNet based model on AG News dataset. 5 | --- 6 | 7 | # [FNet](index.html) Experiment 8 | 9 | This is an annotated PyTorch experiment to train a [FNet model](index.html). 10 | 11 | This is based on 12 | [general training loop and configurations for AG News classification task](../../experiments/nlp_classification.html). 13 | """ 14 | 15 | import torch 16 | from torch import nn 17 | 18 | from labml import experiment 19 | from labml.configs import option 20 | from labml_helpers.module import Module 21 | from labml_nn.experiments.nlp_classification import NLPClassificationConfigs 22 | from labml_nn.transformers import Encoder 23 | from labml_nn.transformers import TransformerConfigs 24 | 25 | 26 | class TransformerClassifier(nn.Module): 27 | """ 28 | # Transformer based classifier model 29 | """ 30 | def __init__(self, encoder: Encoder, src_embed: Module, generator: nn.Linear): 31 | """ 32 | * `encoder` is the transformer [Encoder](../models.html#Encoder) 33 | * `src_embed` is the token 34 | [embedding module (with positional encodings)](../models.html#EmbeddingsWithLearnedPositionalEncoding) 35 | * `generator` is the [final fully connected layer](../models.html#Generator) that gives the logits. 36 | """ 37 | super().__init__() 38 | self.src_embed = src_embed 39 | self.encoder = encoder 40 | self.generator = generator 41 | 42 | def forward(self, x: torch.Tensor): 43 | # Get the token embeddings with positional encodings 44 | x = self.src_embed(x) 45 | # Transformer encoder 46 | x = self.encoder(x, None) 47 | # Get logits for classification. 48 | # 49 | # We set the `[CLS]` token at the last position of the sequence. 
50 | # This is extracted by `x[-1]`, where `x` is of 51 | # shape `[seq_len, batch_size, d_model]` 52 | x = self.generator(x[-1]) 53 | 54 | # Return results 55 | # (second value is for state, since our trainer is used with RNNs also) 56 | return x, None 57 | 58 | 59 | class Configs(NLPClassificationConfigs): 60 | """ 61 | ## Configurations 62 | 63 | This inherits from 64 | [`NLPClassificationConfigs`](../../experiments/nlp_classification.html) 65 | """ 66 | 67 | # Classification model 68 | model: TransformerClassifier 69 | # Transformer 70 | transformer: TransformerConfigs 71 | 72 | 73 | @option(Configs.transformer) 74 | def _transformer_configs(c: Configs): 75 | """ 76 | ### Transformer configurations 77 | """ 78 | 79 | # We use our 80 | # [configurable transformer implementation](../configs.html#TransformerConfigs) 81 | conf = TransformerConfigs() 82 | # Set the vocabulary sizes for embeddings and generating logits 83 | conf.n_src_vocab = c.n_tokens 84 | conf.n_tgt_vocab = c.n_tokens 85 | 86 | # 87 | return conf 88 | 89 | 90 | @option(TransformerConfigs.encoder_attn) 91 | def fnet_mix(): 92 | """ 93 | Create `FNetMix` module that can replace the self-attention in 94 | [transformer encoder layer](../models.html#TransformerLayer) 95 | . 96 | """ 97 | from labml_nn.transformers.fnet import FNetMix 98 | return FNetMix() 99 | 100 | 101 | @option(Configs.model) 102 | def _model(c: Configs): 103 | """ 104 | Create classification model 105 | """ 106 | m = TransformerClassifier(c.transformer.encoder, 107 | c.transformer.src_embed, 108 | nn.Linear(c.d_model, c.n_classes)).to(c.device) 109 | 110 | return m 111 | 112 | 113 | def main(): 114 | # Create experiment 115 | experiment.create(name="fnet") 116 | # Create configs 117 | conf = Configs() 118 | # Override configurations 119 | experiment.configs(conf, { 120 | # Use world level tokenizer 121 | 'tokenizer': 'basic_english', 122 | 123 | # Train for $32$ epochs 124 | 'epochs': 32, 125 | # Switch between training and validation for $10$ times 126 | # per epoch 127 | 'inner_iterations': 10, 128 | 129 | # Transformer configurations (same as defaults) 130 | 'transformer.d_model': 512, 131 | 'transformer.ffn.d_ff': 2048, 132 | 'transformer.n_heads': 8, 133 | 'transformer.n_layers': 6, 134 | 135 | # Use [FNet](index.html) instead of self-a 136 | # ttention 137 | 'transformer.encoder_attn': 'fnet_mix', 138 | 139 | # Use [Noam optimizer](../../optimizers/noam.html) 140 | 'optimizer.optimizer': 'Noam', 141 | 'optimizer.learning_rate': 1., 142 | }) 143 | 144 | # Set models for saving and loading 145 | experiment.add_pytorch_models({'model': conf.model}) 146 | 147 | # Start the experiment 148 | with experiment.start(): 149 | # Run training 150 | conf.run() 151 | 152 | 153 | # 154 | if __name__ == '__main__': 155 | main() 156 | -------------------------------------------------------------------------------- /docs/transformers/relative_mha.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | Relative Multi-Headed Attention 25 | 26 | 27 | 28 | 29 | 30 | 41 | 42 | 43 |
66 | 67 | 69 | 70 | 84 | 124 | 125 | -------------------------------------------------------------------------------- /labml_nn/transformers/knn/train_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Train Autoregressive Transformer 4 | summary: This is training code with notes for a basic auto-regressive transformer. 5 | --- 6 | 7 | # Train Autoregressive Transformer 8 | 9 | This trains a simple [transformer](../../) model for auto-regression. 10 | """ 11 | 12 | import torch 13 | from labml import experiment 14 | from labml.configs import option 15 | from labml.utils.pytorch import get_modules 16 | from labml_helpers.module import Module 17 | 18 | from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs 19 | from labml_nn.transformers import Encoder, Generator, TransformerConfigs 20 | from labml_nn.transformers.utils import subsequent_mask 21 | 22 | 23 | class AutoregressiveModel(Module): 24 | """ 25 | ## Auto regressive model 26 | """ 27 | 28 | def __init__(self, src_embed: Module, encoder: Encoder, generator: Generator, *, 29 | is_save_ff_input: bool = False): 30 | super().__init__() 31 | # Token embedding module 32 | self.src_embed = src_embed 33 | # Transformer based encoder 34 | self.encoder = encoder 35 | # Whether the last layer of the encoder should 36 | # save the input to the feed-forward layer. 37 | # This is out $f(c_t)$, the embedding of the context. 38 | self.encoder.layers[-1].is_save_ff_input = is_save_ff_input 39 | # Next token generation layer; 40 | # this give logits of the the next token 41 | self.generator = generator 42 | # This will be initialized on the first call 43 | self.src_mask = None 44 | 45 | @property 46 | def ff_input(self) -> torch.Tensor: 47 | """ 48 | Retrieve saved $f(c_t)$ 49 | """ 50 | return self.encoder.layers[-1].ff_input 51 | 52 | def forward(self, src: torch.Tensor): 53 | # Create subsequent mask, so that the transformer can only pay attention to past tokens. 
54 | if self.src_mask is None or self.src_mask.size(0) != len(src): 55 | self.src_mask = subsequent_mask(len(src)).to(src.device) 56 | # Embed the tokens (`src`) and run it through the the transformer 57 | res = self.encoder(self.src_embed(src), self.src_mask) 58 | # Generate logits of the next token 59 | return self.generator(res), None 60 | 61 | 62 | class Configs(NLPAutoRegressionConfigs): 63 | """ 64 | ## Configurations 65 | 66 | The default configs can and will be over-ridden when we start the experiment 67 | """ 68 | 69 | transformer: TransformerConfigs 70 | model: AutoregressiveModel 71 | 72 | is_save_ff_input = False 73 | 74 | 75 | @option(Configs.model) 76 | def autoregressive_model(c: Configs): 77 | """ 78 | Initialize the auto-regressive model 79 | """ 80 | m = AutoregressiveModel( 81 | # Get the source token embedding layer, encoder and 82 | # final token generator from configurable transformer 83 | src_embed=c.transformer.src_embed, 84 | encoder=c.transformer.encoder, 85 | generator=c.transformer.generator, 86 | # Whether to save $f(c_t)$ 87 | is_save_ff_input=c.is_save_ff_input) 88 | return m.to(c.device) 89 | 90 | 91 | @option(Configs.transformer) 92 | def transformer_c(c: Configs): 93 | """ 94 | Initialize the configurable transformer encoder for our autoregressive model 95 | """ 96 | tc = TransformerConfigs() 97 | tc.n_src_vocab = c.n_tokens 98 | tc.n_tgt_vocab = c.n_tokens 99 | 100 | return tc 101 | 102 | 103 | def main(): 104 | # Create experiment 105 | experiment.create(name="knn_lm") 106 | # Create configs 107 | conf = Configs() 108 | # Load configurations 109 | experiment.configs(conf, 110 | # A dictionary of configurations to override 111 | {'tokenizer': 'character', 112 | 'prompt_separator': '', 113 | 'prompt': 'It is ', 114 | 'text': 'tiny_shakespeare', 115 | 116 | 'optimizer.optimizer': 'Noam', 117 | 'optimizer.learning_rate': 1., 118 | 'optimizer.d_model': 256, 119 | 120 | 'seq_len': 1024, 121 | 'epochs': 128, 122 | 'batch_size': 6, 123 | 'inner_iterations': 10, 124 | 125 | # Transformer configurations 126 | 'transformer.d_model': 256, 127 | 'transformer.ffn.d_ff': 1024, 128 | 'transformer.n_heads': 8, 129 | 'transformer.n_layers': 6}) 130 | 131 | # This is needed to initialize models 132 | conf.n_tokens = conf.text.n_tokens 133 | 134 | # Set models for saving and loading 135 | experiment.add_pytorch_models(get_modules(conf)) 136 | 137 | # Start the experiment 138 | with experiment.start(): 139 | # `TrainValidConfigs.run` 140 | conf.run() 141 | 142 | 143 | if __name__ == '__main__': 144 | main() 145 | --------------------------------------------------------------------------------
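The autoregressive training scripts above build a `subsequent_mask` so that the transformer can only pay attention to past tokens. As a reference point, here is a minimal sketch of such a causal ("subsequent") mask in plain PyTorch; the repository's own `labml_nn.transformers.utils.subsequent_mask` may differ in exact shape and dtype.

import torch

def causal_mask(seq_len: int) -> torch.Tensor:
    # Lower-triangular boolean matrix: position i may attend to positions j <= i
    return torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))

print(causal_mask(4))
# tensor([[ True, False, False, False],
#         [ True,  True, False, False],
#         [ True,  True,  True, False],
#         [ True,  True,  True,  True]])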