├── .labml.yaml
├── utils
├── __init__.py
└── sitemap.py
├── docs
├── CNAME
├── icon.png
├── cnn
│ └── utils
│ │ ├── cv-folds.png
│ │ ├── overfitting.png
│ │ ├── Underfitting.png
│ │ ├── early-stopping.png
│ │ ├── ground_truth.png
│ │ └── Cross-validation.png
├── optimizers
│ ├── noam_lr.png
│ └── radam_r_t.png
├── gan
│ └── stylegan
│ │ └── generated_64.png
├── resnets
│ ├── index.html
│ ├── utils
│ │ └── index.html
│ └── models
│ │ └── index.html
├── experiments
│ └── index.html
└── transformers
│ ├── basic
│ └── index.html
│ └── relative_mha.html
├── labml_nn
├── resnets
│ ├── __init__.py
│ ├── models
│ │ ├── __init__.py
│ │ └── mlp.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── labelsmoothing.py
│ │ ├── utils.py
│ │ └── train.py
│ ├── accuracy_graph_85.png
│ ├── pretrained_nets.py
│ └── resnet_net.py
├── experiments
│ ├── __init__.py
│ ├── cifar10.py
│ └── mnist.py
├── transformers
│ ├── basic
│ │ └── __init__.py
│ ├── relative_mha.py
│ ├── glu_variants
│ │ ├── __init__.py
│ │ └── experiment.py
│ ├── utils.py
│ ├── gmlp
│ │ ├── readme.md
│ │ └── experiment.py
│ ├── fnet
│ │ ├── readme.md
│ │ ├── __init__.py
│ │ └── experiment.py
│ ├── aft
│ │ └── readme.md
│ ├── fast_weights
│ │ ├── readme.md
│ │ ├── experiment.py
│ │ └── token_wise.py
│ ├── xl
│ │ └── readme.md
│ ├── knn
│ │ ├── __init__.py
│ │ └── train_model.py
│ ├── switch
│ │ └── readme.md
│ ├── feedback
│ │ └── readme.md
│ ├── compressive
│ │ └── readme.md
│ ├── label_smoothing_loss.py
│ ├── positional_encoding.py
│ ├── mlm
│ │ └── readme.md
│ ├── __init__.py
│ └── feed_forward.py
├── activations
│ ├── __init__.py
│ └── swish.py
├── cnn
│ ├── save
│ │ └── Basic_CNN-best-model
│ │ │ └── model.pt
│ ├── cross_validation.py
│ ├── utils
│ │ └── dataloader.py
│ └── ray_tune.py
├── gan
│ ├── wasserstein
│ │ ├── readme.md
│ │ ├── gradient_penalty
│ │ │ ├── readme.md
│ │ │ ├── experiment.py
│ │ │ └── __init__.py
│ │ └── experiment.py
│ ├── original
│ │ └── readme.md
│ ├── cycle_gan
│ │ └── readme.md
│ ├── dcgan
│ │ ├── readme.md
│ │ └── __init__.py
│ ├── __init__.py
│ └── stylegan
│ │ └── readme.md
├── hypernetworks
│ ├── __init__.py
│ └── experiment.py
├── normalization
│ ├── weight_standardization
│ │ ├── readme.md
│ │ ├── conv2d.py
│ │ ├── experiment.py
│ │ └── __init__.py
│ ├── __init__.py
│ ├── instance_norm
│ │ ├── readme.md
│ │ ├── experiment.py
│ │ └── __init__.py
│ ├── layer_norm
│ │ └── readme.md
│ ├── group_norm
│ │ ├── readme.md
│ │ └── experiment.py
│ └── batch_norm
│ │ ├── cifar10.py
│ │ ├── mnist.py
│ │ └── readme.md
├── optimizers
│ ├── readme.md
│ ├── performance_test.py
│ ├── adam_warmup.py
│ ├── noam.py
│ ├── adam_warmup_cosine_decay.py
│ └── mnist_experiment.py
├── rl
│ ├── __init__.py
│ ├── ppo
│ │ ├── readme.md
│ │ └── gae.py
│ └── dqn
│ │ └── model.py
├── utils
│ ├── tokenizer.py
│ └── __init__.py
├── capsule_networks
│ └── readme.md
└── __init__.py
├── MANIFEST.in
├── images
└── dqn.png
├── requirements.txt
├── .gitignore
├── Makefile
├── license
└── setup.py
/.labml.yaml:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/CNAME:
--------------------------------------------------------------------------------
1 | nn.labml.ai
--------------------------------------------------------------------------------
/labml_nn/resnets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/labml_nn/experiments/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include readme.rst
2 |
--------------------------------------------------------------------------------
/labml_nn/resnets/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/labml_nn/resnets/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/labml_nn/transformers/basic/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/labml_nn/activations/__init__.py:
--------------------------------------------------------------------------------
1 | from .swish import Swish
2 |
--------------------------------------------------------------------------------
/docs/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/icon.png
--------------------------------------------------------------------------------
/images/dqn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/images/dqn.png
--------------------------------------------------------------------------------
/docs/cnn/utils/cv-folds.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/cv-folds.png
--------------------------------------------------------------------------------
/docs/optimizers/noam_lr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/optimizers/noam_lr.png
--------------------------------------------------------------------------------
/docs/cnn/utils/overfitting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/overfitting.png
--------------------------------------------------------------------------------
/docs/optimizers/radam_r_t.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/optimizers/radam_r_t.png
--------------------------------------------------------------------------------
/docs/cnn/utils/Underfitting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/Underfitting.png
--------------------------------------------------------------------------------
/docs/cnn/utils/early-stopping.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/early-stopping.png
--------------------------------------------------------------------------------
/docs/cnn/utils/ground_truth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/ground_truth.png
--------------------------------------------------------------------------------
/docs/cnn/utils/Cross-validation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/Cross-validation.png
--------------------------------------------------------------------------------
/docs/gan/stylegan/generated_64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/gan/stylegan/generated_64.png
--------------------------------------------------------------------------------
/labml_nn/resnets/accuracy_graph_85.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/labml_nn/resnets/accuracy_graph_85.png
--------------------------------------------------------------------------------
/labml_nn/cnn/save/Basic_CNN-best-model/model.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/labml_nn/cnn/save/Basic_CNN-best-model/model.pt
--------------------------------------------------------------------------------
/labml_nn/gan/wasserstein/readme.md:
--------------------------------------------------------------------------------
1 | # [Wasserstein GAN - WGAN](https://nn.labml.ai/gan/wasserstein/index.html)
2 |
3 | This is an implementation of
4 | [Wasserstein GAN](https://arxiv.org/abs/1701.07875).
5 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch>=1.7
2 | labml>=0.4.94
3 | labml-helpers>=0.4.77
4 | torchvision
5 | numpy>=1.16.3
6 | matplotlib>=3.0.3
7 | einops>=0.3.0
8 | gym[atari]
9 | opencv-python
10 | Pillow>=6.2.1
11 |
--------------------------------------------------------------------------------
/labml_nn/hypernetworks/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: HyperNetworks
4 | summary: A PyTorch implementation/tutorial of HyperLSTM introduced in paper HyperNetworks.
5 | ---
6 |
7 | ## [HyperLSTM](hyper_lstm.html)
8 | """
--------------------------------------------------------------------------------
/labml_nn/gan/original/readme.md:
--------------------------------------------------------------------------------
1 | # [Generative Adversarial Networks - GAN](https://nn.labml.ai/gan/original/index.html)
2 |
3 | This is an annotated implementation of
4 | [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
5 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | __pycache__
3 | .DS_Store
4 | .*.swp
5 | *.egg-info/
6 | dist/
7 | build/
8 | .idea/*
9 | !.idea/dictionaries
10 | labml
11 | labml_helpers
12 | labml_samples
13 | data
14 | logs
15 | html/
16 | diagrams/
--------------------------------------------------------------------------------
/labml_nn/transformers/relative_mha.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Relative Multi-Headed Attention
4 | summary: Relative Multi-Headed Attention from paper Transformer-XL.
5 | redirect: https://nn.labml.ai/transformers/xl/relative_mha.html
6 | ---
7 | """
8 |
--------------------------------------------------------------------------------
/labml_nn/gan/cycle_gan/readme.md:
--------------------------------------------------------------------------------
1 | # [Cycle GAN](https://nn.labml.ai/gan/cycle_gan/index.html)
2 |
3 | This is a [PyTorch](https://pytorch.org) implementation/tutorial of the paper
4 | [Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks](https://arxiv.org/abs/1703.10593).
5 |
--------------------------------------------------------------------------------
/labml_nn/gan/dcgan/readme.md:
--------------------------------------------------------------------------------
1 | # [Deep Convolutional Generative Adversarial Networks - DCGAN](https://nn.labml.ai/gan/dcgan/index.html)
2 |
3 | This is a [PyTorch](https://pytorch.org) implementation of paper
4 | [Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](https://arxiv.org/abs/1511.06434).
5 |
--------------------------------------------------------------------------------
/labml_nn/activations/swish.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | from labml_helpers.module import Module
5 |
6 |
7 | class Swish(Module):
8 | def __init__(self):
9 | super().__init__()
10 | self.sigmoid = nn.Sigmoid()
11 |
12 | def forward(self, x: torch.Tensor) -> torch.Tensor:
13 | return x * self.sigmoid(x)
14 |
--------------------------------------------------------------------------------
/labml_nn/experiments/cifar10.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: CIFAR10 Experiment
4 | summary: >
5 | This is a reusable trainer for the CIFAR10 dataset
6 | ---
7 |
8 | # CIFAR10 Experiment
9 | """
10 |
11 | from labml_helpers.datasets.cifar10 import CIFAR10Configs as CIFAR10DatasetConfigs
12 | from labml_nn.experiments.mnist import MNISTConfigs
13 |
14 |
15 | class CIFAR10Configs(CIFAR10DatasetConfigs, MNISTConfigs):
16 | dataset_name: str = 'CIFAR10'
17 |
--------------------------------------------------------------------------------
/labml_nn/transformers/glu_variants/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Gated Linear Units and Variants
4 | summary: >
5 | Train an auto-regressive transformer with Gated Linear Units and variants
6 | for the position-wise feedforward network (FFN).
7 | ---
8 |
9 | # Gated Linear Units and Variants
10 |
11 | * [Experiment that uses `labml.configs`](glu_variants/experiment.html)
12 | * [Simpler version from scratch](glu_variants/simple.html)
13 | """
14 |
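# A minimal sketch (added for illustration; not the module's own implementation) of one
# of the variants, GEGLU, where the position-wise FFN becomes `(GELU(x W1) * (x V)) W2`.
# The layer sizes below are arbitrary example values.
import torch
from torch import nn


class GEGLU(nn.Module):
    def __init__(self, d_model: int = 512, d_ff: int = 2048):
        super().__init__()
        self.w1 = nn.Linear(d_model, d_ff)  # gate path, passed through GELU
        self.v = nn.Linear(d_model, d_ff)   # linear path
        self.w2 = nn.Linear(d_ff, d_model)  # projection back to the model dimension
        self.activation = nn.GELU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # element-wise gating of the linear path by the activated path
        return self.w2(self.activation(self.w1(x)) * self.v(x))


if __name__ == '__main__':
    _ffn = GEGLU()
    print(_ffn(torch.randn(10, 4, 512)).shape)  # torch.Size([10, 4, 512])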
--------------------------------------------------------------------------------
/labml_nn/normalization/weight_standardization/readme.md:
--------------------------------------------------------------------------------
1 | # [Weight Standardization](https://nn.labml.ai/normalization/weight_standardization/index.html)
2 |
3 | This is a [PyTorch](https://pytorch.org) implementation of Weight Standardization from the paper
4 | [Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://arxiv.org/abs/1903.10520).
5 | We also have an
6 | [annotated implementation of Batch-Channel Normalization](https://nn.labml.ai/normalization/batch_channel_norm/index.html).
7 |
--------------------------------------------------------------------------------
/labml_nn/gan/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Generative Adversarial Networks
4 | summary: >
5 | A set of PyTorch implementations/tutorials of GANs.
6 | ---
7 |
8 | # Generative Adversarial Networks
9 |
10 | * [Original GAN](original/index.html)
11 | * [GAN with deep convolutional network](dcgan/index.html)
12 | * [Cycle GAN](cycle_gan/index.html)
13 | * [Wasserstein GAN](wasserstein/index.html)
14 | * [Wasserstein GAN with Gradient Penalty](wasserstein/gradient_penalty/index.html)
15 | * [Style GAN 2](stylegan/index.html)
16 | """
--------------------------------------------------------------------------------
/labml_nn/optimizers/readme.md:
--------------------------------------------------------------------------------
1 | # [Optimizers](https://nn.labml.ai/optimizers/index.html)
2 |
3 | ## Optimizer Implementations
4 | * [Adam Optimizer](https://nn.labml.ai/optimizers/adam.html)
5 | * [AMSGrad Optimizer](https://nn.labml.ai/optimizers/amsgrad.html)
6 | * [Adam Optimizer with warmup](https://nn.labml.ai/optimizers/adam_warmup.html)
7 | * [Noam Optimizer](https://nn.labml.ai/optimizers/noam.html)
8 | * [Rectified Adam Optimizer](https://nn.labml.ai/optimizers/radam.html)
9 | * [AdaBelief Optimizer](https://nn.labml.ai/optimizers/ada_belief.html)
10 |
--------------------------------------------------------------------------------
/labml_nn/normalization/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Normalization Layers
4 | summary: >
5 | A set of PyTorch implementations/tutorials of normalization layers.
6 | ---
7 |
8 | # Normalization Layers
9 |
10 | * [Batch Normalization](batch_norm/index.html)
11 | * [Layer Normalization](layer_norm/index.html)
12 | * [Instance Normalization](instance_norm/index.html)
13 | * [Group Normalization](group_norm/index.html)
14 | * [Weight Standardization](weight_standardization/index.html)
15 | * [Batch-Channel Normalization](batch_channel_norm/index.html)
16 | """
17 |
--------------------------------------------------------------------------------
/labml_nn/resnets/utils/labelsmoothing.py:
--------------------------------------------------------------------------------
1 | import torch.nn.functional as F
2 | from torch import nn
3 |
4 | class LabelSmoothingLoss(nn.Module):
5 | def __init__(self, epsilon= 0.5, reduction='mean'):
6 | super().__init__()
7 | self.epsilon = epsilon
8 | self.reduction = reduction
9 |
10 | def forward(self, pred, target):
11 | n = pred.size()[-1]
12 | log_pred = F.log_softmax(pred, dim=-1)
13 | loss = -log_pred.sum(dim=-1).mean()
14 | nll = F.nll_loss(log_pred, target, reduction=self.reduction)
15 | out = (1-self.epsilon)*nll + self.epsilon*(loss / n)
16 | return out
17 |
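def _test():
    # A minimal usage sketch (added for illustration): the loss mixes standard negative
    # log-likelihood with a uniform-distribution term weighted by `epsilon`.
    # The logits and targets below are made-up example values.
    import torch

    criterion = LabelSmoothingLoss(epsilon=0.1)
    logits = torch.randn(8, 10)           # batch of 8 samples, 10 classes
    targets = torch.randint(0, 10, (8,))  # integer class labels
    print(criterion(logits, targets))     # a scalar loss


if __name__ == '__main__':
    _test()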
--------------------------------------------------------------------------------
/labml_nn/normalization/instance_norm/readme.md:
--------------------------------------------------------------------------------
1 | # [Instance Normalization](https://nn.labml.ai/normalization/instance_norm/index.html)
2 |
3 | This is a [PyTorch](https://pytorch.org) implementation of
4 | [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022).
5 |
6 | Instance normalization was introduced to improve [style transfer](https://paperswithcode.com/task/style-transfer).
7 | It is based on the observation that stylization should not depend on the contrast of the content image.
8 | Since it's hard for a convolutional network to learn "contrast normalization", this paper
9 | introduces instance normalization which does that.
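
As a quick illustration (a minimal sketch, not the annotated implementation in this module), instance normalization normalizes each channel of each sample over its spatial dimensions:

```python
import torch

def instance_norm(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # x has shape `[batch, channels, height, width]`;
    # compute the mean and variance per sample and per channel, over H and W
    mean = x.mean(dim=(2, 3), keepdim=True)
    var = x.var(dim=(2, 3), unbiased=False, keepdim=True)
    return (x - mean) / torch.sqrt(var + eps)

x = torch.randn(4, 3, 32, 32)
print(instance_norm(x).shape)  # torch.Size([4, 3, 32, 32])
```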
--------------------------------------------------------------------------------
/labml_nn/transformers/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Utilities for Transformer
4 | summary: A bunch of utility functions and classes for transformers.
5 | ---
6 |
7 | # Utilities for Transformer
8 | """
9 |
10 | import torch
11 |
12 |
13 | def subsequent_mask(seq_len):
14 | """
15 | ## Subsequent mask to mask out data from future (subsequent) time steps
16 | """
17 | mask = torch.tril(torch.ones(seq_len, seq_len)).to(torch.bool).unsqueeze(-1)
18 | return mask
19 |
20 |
21 | def _subsequent_mask():
22 | from labml.logger import inspect
23 | inspect(subsequent_mask(10)[:, :, 0])
24 |
25 |
26 | if __name__ == '__main__':
27 | _subsequent_mask()
28 |
--------------------------------------------------------------------------------
/labml_nn/transformers/gmlp/readme.md:
--------------------------------------------------------------------------------
1 | # [Pay Attention to MLPs (gMLP)](https://nn.labml.ai/transformers/gmlp/index.html)
2 |
3 | This is a [PyTorch](https://pytorch.org) implementation of the paper
4 | [Pay Attention to MLPs](https://papers.labml.ai/paper/2105.08050).
5 |
6 | This paper introduces a Multilayer Perceptron (MLP) based architecture with gating,
7 | which they name **gMLP**. It consists of a stack of $L$ *gMLP* blocks.
8 |
9 | Here is [the training code](https://nn.labml.ai/transformers/gmlp/experiment.html) for a gMLP-based autoregressive model.
10 |
11 | [](https://app.labml.ai/run/01bd941ac74c11eb890c1d9196651a4a)
12 |
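Below is a rough, non-causal sketch (for illustration only; the annotated implementation in this module is the reference) of the spatial gating unit at the core of a gMLP block. It splits the channels into two halves, applies a projection across the sequence dimension to one half, and uses the result to gate the other half:

```python
import torch
from torch import nn


class SpatialGatingUnit(nn.Module):
    def __init__(self, d_z: int, seq_len: int):
        super().__init__()
        self.norm = nn.LayerNorm(d_z // 2)
        # projection across the sequence (spatial) dimension
        self.proj = nn.Linear(seq_len, seq_len)
        # initialize near-identity behaviour: weights close to zero, bias at one,
        # so the unit starts out as (almost) a simple pass-through gate
        nn.init.zeros_(self.proj.weight)
        nn.init.ones_(self.proj.bias)

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        # z: [seq_len, batch, d_z]; split the channels into two halves
        z1, z2 = torch.chunk(z, 2, dim=-1)
        z2 = self.norm(z2)
        # project z2 along the sequence dimension and gate z1 with it
        z2 = self.proj(z2.permute(1, 2, 0)).permute(2, 0, 1)
        return z1 * z2


sgu = SpatialGatingUnit(d_z=512, seq_len=128)
print(sgu(torch.randn(128, 4, 512)).shape)  # torch.Size([128, 4, 256])
```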
--------------------------------------------------------------------------------
/labml_nn/transformers/fnet/readme.md:
--------------------------------------------------------------------------------
1 | # [FNet: Mixing Tokens with Fourier Transforms](https://nn.labml.ai/transformers/fnet/index.html)
2 |
3 | This is a [PyTorch](https://pytorch.org) implementation of the paper
4 | [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824).
5 |
6 | This paper replaces the [self-attention layer](https://nn.labml.ai/transformers/mha.html) with two
7 | [Fourier transforms](https://en.wikipedia.org/wiki/Discrete_Fourier_transform) to
8 | *mix* tokens.
9 | This is about 7X more efficient than self-attention.
10 | Using it instead of self-attention retains about 92% of the accuracy of
11 | [BERT](https://paperswithcode.com/method/bert) on the
12 | [GLUE benchmark](https://paperswithcode.com/dataset/glue).
13 |
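The token-mixing operation itself is tiny. A minimal sketch (assuming a PyTorch version with the `torch.fft` module; this is not the annotated implementation in this directory):

```python
import torch

def fnet_mix(x: torch.Tensor) -> torch.Tensor:
    # x: [batch, seq_len, d_model]
    # apply a DFT over the hidden dimension, then over the sequence dimension,
    # and keep only the real part of the result
    return torch.fft.fft(torch.fft.fft(x, dim=-1), dim=-2).real

x = torch.randn(2, 16, 64)
print(fnet_mix(x).shape)  # torch.Size([2, 16, 64])
```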
--------------------------------------------------------------------------------
/labml_nn/gan/stylegan/readme.md:
--------------------------------------------------------------------------------
1 | # [Style GAN 2](https://nn.labml.ai/gan/stylegan/index.html)
2 |
3 | This is a [PyTorch](https://pytorch.org) implementation of the paper
4 | [Analyzing and Improving the Image Quality of StyleGAN](https://arxiv.org/abs/1912.04958)
5 | which introduces **Style GAN2**.
6 | Style GAN2 is an improvement over **Style GAN** from the paper
7 | [A Style-Based Generator Architecture for Generative Adversarial Networks](https://arxiv.org/abs/1812.04948).
8 | And Style GAN is based on **Progressive GAN** from the paper
9 | [Progressive Growing of GANs for Improved Quality, Stability, and Variation](https://arxiv.org/abs/1710.10196).
10 | All three papers are from the same authors from [NVIDIA AI](https://twitter.com/NVIDIAAI).
11 |
--------------------------------------------------------------------------------
/labml_nn/transformers/aft/readme.md:
--------------------------------------------------------------------------------
1 | # [An Attention Free Transformer](https://nn.labml.ai/transformers/aft/index.html)
2 |
3 | This is a [PyTorch](https://pytorch.org) implementation of the paper
4 | [An Attention Free Transformer](https://papers.labml.ai/paper/2105.14103).
5 |
6 | This paper replaces the [self-attention layer](https://nn.labml.ai/transformers/mha.html)
7 | with a new efficient operation,
8 | that has a memory complexity of $O(Td)$, where $T$ is the sequence length
9 | and $d$ is the dimensionality of the embeddings.
10 |
11 | The paper introduces AFT along with AFT-local and AFT-conv.
12 | Here we have implemented AFT-local, which pays attention to nearby tokens
13 | in an autoregressive model.
14 |
15 | [](https://app.labml.ai/run/6348e504c3a511eba9529daa283fb495)
16 |
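As a rough illustration of the idea (this sketches the simpler, non-causal AFT-simple variant from the paper, not the AFT-local implementation in this directory):

```python
import torch

def aft_simple(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    # q, k, v: [seq_len, batch, d]
    # per-feature weights over positions, computed from the keys
    weights = torch.softmax(k, dim=0)
    # a single weighted sum of the values, shared across query positions
    context = (weights * v).sum(dim=0, keepdim=True)
    # each position gates this context with a sigmoid of its query
    return torch.sigmoid(q) * context

q, k, v = (torch.randn(16, 2, 64) for _ in range(3))
print(aft_simple(q, k, v).shape)  # torch.Size([16, 2, 64])
```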
--------------------------------------------------------------------------------
/labml_nn/rl/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Reinforcement Learning Algorithms
4 | summary: >
5 | This is a collection of PyTorch implementations/tutorials of reinforcement learning algorithms.
6 | It currently includes Proximal Policy Optimization, Generalized Advantage Estimation, and
7 | Deep Q Networks.
8 | ---
9 |
10 | # Reinforcement Learning Algorithms
11 |
12 | * [Proximal Policy Optimization](ppo)
13 | * [This is an experiment](ppo/experiment.html) that runs a PPO agent on Atari Breakout.
14 | * [Generalized advantage estimation](ppo/gae.html)
15 | * [Deep Q Networks](dqn)
16 | * [This is an experiment](dqn/experiment.html) that runs a DQN agent on Atari Breakout.
17 | * [Model](dqn/model.html) with dueling network
18 | * [Prioritized Experience Replay Buffer](dqn/replay_buffer.html)
19 |
20 | [This is the implementation for OpenAI game wrapper](game.html) using `multiprocessing`.
21 | """
--------------------------------------------------------------------------------
/labml_nn/transformers/fast_weights/readme.md:
--------------------------------------------------------------------------------
1 | # [Fast weights transformer](https://nn.labml.ai/transformers/fast_weights/index.html)
2 |
3 | This is an annotated [PyTorch](https://pytorch.org) implementation of the paper
4 | [Linear Transformers Are Secretly Fast Weight Memory Systems](https://arxiv.org/abs/2102.11174).
5 |
6 | Here is the [annotated implementation](https://nn.labml.ai/transformers/fast_weights/index.html).
7 | Here are [the training code](https://nn.labml.ai/transformers/fast_weights/experiment.html)
8 | and a notebook for training a fast weights transformer on the Tiny Shakespeare dataset.
9 |
10 | [](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/fast_weights/experiment.ipynb)
11 | [](https://app.labml.ai/run/928aadc0846c11eb85710242ac1c0002)
12 |
--------------------------------------------------------------------------------
/labml_nn/gan/wasserstein/gradient_penalty/readme.md:
--------------------------------------------------------------------------------
1 | # [Gradient Penalty for Wasserstein GAN (WGAN-GP)](https://nn.labml.ai/gan/wasserstein/gradient_penalty/index.html)
2 |
3 | This is an implementation of
4 | [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028).
5 |
6 | [WGAN](https://nn.labml.ai/gan/wasserstein/index.html) suggests
7 | clipping weights to enforce Lipschitz constraint
8 | on the discriminator network (critic).
9 | This and other weight constraints, such as L2 norm clipping, weight normalization, and
10 | L1 or L2 weight decay, have problems:
11 |
12 | 1. They limit the capacity of the discriminator.
13 | 2. They can cause exploding and vanishing gradients (without [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html)).
14 |
15 | The paper [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028)
16 | proposes a better way to enforce the Lipschitz constraint: a gradient penalty.
17 |
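A minimal sketch of the penalty term (assuming image-shaped inputs and a `critic` callable; the annotated implementation in this directory is the reference):

```python
import torch

def gradient_penalty(critic, real: torch.Tensor, fake: torch.Tensor) -> torch.Tensor:
    # sample random points on straight lines between real and generated samples
    alpha = torch.rand(real.size(0), 1, 1, 1, device=real.device)
    x = (alpha * real + (1 - alpha) * fake).requires_grad_(True)
    # gradient of the critic output with respect to these interpolated samples
    grad, = torch.autograd.grad(outputs=critic(x).sum(), inputs=x, create_graph=True)
    # penalize deviation of the gradient norm from 1
    return ((grad.reshape(grad.size(0), -1).norm(2, dim=1) - 1) ** 2).mean()
```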
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | clean: ## Clean
2 | rm -rf dist
3 | rm -rf build
4 | rm -rf *.egg-info
5 |
6 | build: clean ## Build PIPy Package
7 | python setup.py sdist bdist_wheel
8 |
9 | check-content: build ## List contents of PIPy Package
10 | tar -tvf dist/*.tar.gz
11 |
12 | check: build ## Check PIPy Package
13 | twine check dist/*
14 |
15 | upload: build ## Upload PIPy Package
16 | twine upload dist/*
17 |
18 | install: ## Install from repo
19 | pip install -e .
20 |
21 | uninstall: ## Uninstall
22 | pip uninstall labml_nn
23 |
24 | docs: ## Render annotated HTML
25 | find ./docs/ -name "*.html" -type f -delete
26 | find ./docs/ -name "*.svg" -type f -delete
27 | python utils/sitemap.py
28 | python utils/diagrams.py
29 | cd labml_nn; pylit --remove_empty_sections --title_md -t ../../../pylit/templates/nn -d ../docs -w *
30 |
31 | help: ## Show this help.
32 | @fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##//'
33 |
34 | .PHONY: clean build check upload help docs
35 | .DEFAULT_GOAL := help
36 |
--------------------------------------------------------------------------------
/license:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2020 Varuna Jayasiri
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/labml_nn/utils/tokenizer.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 |
3 | from labml.configs import BaseConfigs, option
4 |
5 |
6 | class TokenizerConfigs(BaseConfigs):
7 | """
8 |
9 | ## Tokenizer Configurations
10 |
11 | """
12 |
13 | tokenizer: Callable = 'character'
14 |
15 | def __init__(self):
16 | super().__init__(_primary='tokenizer')
17 |
18 |
19 | @option(TokenizerConfigs.tokenizer)
20 | def basic_english():
21 | """
22 | ### Basic English tokenizer
23 |
24 | We use a character-level tokenizer in this experiment.
25 | You can switch to this tokenizer by setting,
26 |
27 | ```
28 | 'tokenizer': 'basic_english',
29 | ```
30 |
31 | in the configurations dictionary when starting the experiment.
32 |
33 | """
34 | from torchtext.data import get_tokenizer
35 | return get_tokenizer('basic_english')
36 |
37 |
38 | def character_tokenizer(x: str):
39 | """
40 | ### Character level tokenizer
41 | """
42 | return list(x)
43 |
44 |
45 | @option(TokenizerConfigs.tokenizer)
46 | def character():
47 | """
48 | Character level tokenizer configuration
49 | """
50 | return character_tokenizer
51 |
--------------------------------------------------------------------------------
/labml_nn/normalization/layer_norm/readme.md:
--------------------------------------------------------------------------------
1 | # [Layer Normalization](https://nn.labml.ai/normalization/layer_norm/index.html)
2 |
3 | This is a [PyTorch](https://pytorch.org) implementation of
4 | [Layer Normalization](https://arxiv.org/abs/1607.06450).
5 |
6 | ### Limitations of [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html)
7 |
8 | * You need to maintain running means.
9 | * Tricky for RNNs. Do you need different normalizations for each step?
10 | * Doesn't work with small batch sizes;
11 | large NLP models are usually trained with small batch sizes.
12 | * Need to compute means and variances across devices in distributed training.
13 |
14 | ## Layer Normalization
15 |
16 | Layer normalization is a simpler normalization method that works
17 | on a wider range of settings.
18 | Layer normalization transforms the inputs to have zero mean and unit variance
19 | across the features.
20 | *Note that batch normalization normalizes each feature across the batch,*
21 | *whereas layer normalization normalizes each sample across its features.*
22 |
23 | Layer normalization is generally used for NLP tasks.
24 |
25 | We have used layer normalization in most of the
26 | [transformer implementations](https://nn.labml.ai/transformers/gpt/index.html).
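
A minimal sketch of the transformation (not this module's implementation), comparing a manual per-sample normalization with PyTorch's built-in `nn.LayerNorm`:

```python
import torch
from torch import nn

x = torch.randn(32, 128)                      # [batch, features]
mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, unbiased=False, keepdim=True)
manual = (x - mean) / torch.sqrt(var + 1e-5)  # zero mean, unit variance per sample

ln = nn.LayerNorm(128, elementwise_affine=False)
print(torch.allclose(manual, ln(x), atol=1e-5))  # True
```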
--------------------------------------------------------------------------------
/labml_nn/capsule_networks/readme.md:
--------------------------------------------------------------------------------
1 | # [Capsule Networks](https://nn.labml.ai/capsule_networks/index.html)
2 |
3 | This is a [PyTorch](https://pytorch.org) implementation/tutorial of
4 | [Dynamic Routing Between Capsules](https://arxiv.org/abs/1710.09829).
5 |
6 | A capsule network is a neural network architecture that embeds features
7 | as capsules and routes them with a voting mechanism to the next layer of capsules.
8 |
9 | Unlike in other implementations of models, we've included a sample, because
10 | it is difficult to understand some concepts with just the modules.
11 | [This is the annotated code for a model that uses capsules to classify MNIST dataset](mnist.html)
12 |
13 | This file holds the implementations of the core modules of Capsule Networks.
14 |
15 | I used [jindongwang/Pytorch-CapsuleNet](https://github.com/jindongwang/Pytorch-CapsuleNet) to clarify some
16 | confusions I had with the paper.
17 |
18 | Here's a notebook for training a Capsule Network on the MNIST dataset.
19 |
20 | [](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/capsule_networks/mnist.ipynb)
21 | [](https://app.labml.ai/run/e7c08e08586711ebb3e30242ac1c0002)
22 |
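As a small illustration of one core piece (a sketch only, not the annotated module), the squashing non-linearity maps each capsule's output vector to a length in $[0, 1)$ while preserving its direction:

```python
import torch

def squash(s: torch.Tensor, dim: int = -1, eps: float = 1e-8) -> torch.Tensor:
    # shrink each vector so its length is in [0, 1) without changing its direction
    sq_norm = (s ** 2).sum(dim=dim, keepdim=True)
    return (sq_norm / (1 + sq_norm)) * (s / torch.sqrt(sq_norm + eps))

u = torch.randn(4, 10, 16)                 # [batch, capsules, capsule dimension]
print(squash(u).norm(dim=-1).max() < 1.0)  # tensor(True)
```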
--------------------------------------------------------------------------------
/labml_nn/rl/ppo/readme.md:
--------------------------------------------------------------------------------
1 | # [Proximal Policy Optimization - PPO](https://nn.labml.ai/rl/ppo/index.html)
2 |
3 | This is a [PyTorch](https://pytorch.org) implementation of
4 | [Proximal Policy Optimization - PPO](https://arxiv.org/abs/1707.06347).
5 |
6 | PPO is a policy gradient method for reinforcement learning.
7 | Simple policy gradient methods do a single gradient update per sample (or a set of samples).
8 | Doing multiple gradient steps for a single sample causes problems
9 | because the policy deviates too much, producing a bad policy.
10 | PPO lets us do multiple gradient updates per sample by trying to keep the
11 | policy close to the policy that was used to sample data.
12 | It does so by clipping the objective when the updated policy
13 | is not close to the policy used to sample the data.
14 |
15 | You can find an experiment that uses it [here](https://nn.labml.ai/rl/ppo/experiment.html).
16 | The experiment uses [Generalized Advantage Estimation](https://nn.labml.ai/rl/ppo/gae.html).
17 |
18 | [](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/rl/ppo/experiment.ipynb)
19 | [](https://app.labml.ai/run/6eff28a0910e11eb9b008db315936e2f)
20 |
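The clipping described above can be sketched as follows (a minimal, illustrative version; see the annotated implementation for the full loss with value and entropy terms):

```python
import torch

def ppo_policy_loss(log_pi: torch.Tensor, log_pi_old: torch.Tensor,
                    advantage: torch.Tensor, clip_eps: float = 0.2) -> torch.Tensor:
    # probability ratio between the current policy and the policy used for sampling
    ratio = torch.exp(log_pi - log_pi_old)
    # clipped surrogate objective: take the pessimistic (minimum) of the two terms
    clipped = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantage
    return -torch.min(ratio * advantage, clipped).mean()
```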
--------------------------------------------------------------------------------
/labml_nn/gan/wasserstein/experiment.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: WGAN experiment with MNIST
4 | summary: This experiment generates MNIST images using a convolutional neural network.
5 | ---
6 |
7 | # WGAN experiment with MNIST
8 | """
9 | from labml import experiment
10 |
11 | from labml.configs import calculate
12 | # Import configurations from [DCGAN experiment](../dcgan/index.html)
13 | from labml_nn.gan.dcgan import Configs
14 |
15 | # Import [Wasserstein GAN losses](./index.html)
16 | from labml_nn.gan.wasserstein import GeneratorLoss, DiscriminatorLoss
17 |
18 | # Set configurations options for Wasserstein GAN losses
19 | calculate(Configs.generator_loss, 'wasserstein', lambda c: GeneratorLoss())
20 | calculate(Configs.discriminator_loss, 'wasserstein', lambda c: DiscriminatorLoss())
21 |
22 |
23 | def main():
24 | # Create configs object
25 | conf = Configs()
26 | # Create experiment
27 | experiment.create(name='mnist_wassertein_dcgan', comment='test')
28 | # Override configurations
29 | experiment.configs(conf,
30 | {
31 | 'discriminator': 'cnn',
32 | 'generator': 'cnn',
33 | 'label_smoothing': 0.01,
34 | 'generator_loss': 'wasserstein',
35 | 'discriminator_loss': 'wasserstein',
36 | })
37 |
38 | # Start the experiment and run training loop
39 | with experiment.start():
40 | conf.run()
41 |
42 |
43 | if __name__ == '__main__':
44 | main()
45 |
--------------------------------------------------------------------------------
/labml_nn/normalization/group_norm/readme.md:
--------------------------------------------------------------------------------
1 | # [Group Normalization](https://nn.labml.ai/normalization/group_norm/index.html)
2 |
3 | This is a [PyTorch](https://pytorch.org) implementation of
4 | the [Group Normalization](https://arxiv.org/abs/1803.08494) paper.
5 |
6 | [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html) works well for large enough batch sizes
7 | but not well for small batch sizes, because it normalizes over the batch.
8 | Training large models with large batch sizes is not possible due to the memory capacity of the
9 | devices.
10 |
11 | This paper introduces Group Normalization, which normalizes a set of features together as a group.
12 | This is based on the observation that classical features such as
13 | [SIFT](https://en.wikipedia.org/wiki/Scale-invariant_feature_transform) and
14 | [HOG](https://en.wikipedia.org/wiki/Histogram_of_oriented_gradients) are group-wise features.
15 | The paper proposes dividing feature channels into groups and then separately normalizing
16 | all channels within each group.
17 |
18 | Here's a [CIFAR 10 classification model](https://nn.labml.ai/normalization/group_norm/experiment.html) that uses group normalization.
19 |
20 | [](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/group_norm/experiment.ipynb)
21 | [](https://app.labml.ai/run/081d950aa4e011eb8f9f0242ac1c0002)
22 | [](https://wandb.ai/vpj/cifar10/runs/310etthp)
--------------------------------------------------------------------------------
/labml_nn/transformers/xl/readme.md:
--------------------------------------------------------------------------------
1 | # [Transformer XL](https://nn.labml.ai/transformers/xl/index.html)
2 |
3 | This is an implementation of
4 | [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860)
5 | in [PyTorch](https://pytorch.org).
6 |
7 | The transformer has a limited attention span,
8 | equal to the length of the sequence trained in parallel.
9 | All these positions have a fixed positional encoding.
10 | Transformer XL increases this attention span by letting
11 | each of the positions pay attention to precalculated past embeddings.
12 | For instance, if the context length is $l$, it will keep the embeddings of
13 | all layers for the previous batch of length $l$ and feed them to the current step.
14 | If we use fixed positional encodings, these pre-calculated embeddings will have
15 | the same positions as the current context.
16 | They introduce relative positional encoding, where the positional encodings
17 | are introduced at the attention calculation.
18 |
19 | Annotated implementation of relative multi-headed attention is in [`relative_mha.py`](https://nn.labml.ai/transformers/xl/relative_mha.html).
20 |
21 | Here's [the training code](https://nn.labml.ai/transformers/xl/experiment.html) and a notebook for training a transformer XL model on Tiny Shakespeare dataset.
22 |
23 | [](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/xl/experiment.ipynb)
24 | [](https://app.labml.ai/run/d3b6760c692e11ebb6a70242ac1c0002)
25 |
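A toy sketch of the memory mechanism (shapes are made-up example values; the relative positional encodings that make attending over this concatenation work are in `relative_mha.py`):

```python
import torch

x = torch.randn(64, 4, 512)     # current segment: [seq_len, batch, d_model]
mem = torch.randn(128, 4, 512)  # cached states from the previous segment: [mem_len, batch, d_model]

# keys and values span the memory and the current segment, while queries come
# from the current segment only; the memory is detached so gradients don't
# flow back into past segments
kv_input = torch.cat([mem.detach(), x], dim=0)
print(kv_input.shape)           # torch.Size([192, 4, 512])
```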
--------------------------------------------------------------------------------
/utils/sitemap.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import git
4 |
5 | HOME = Path('./labml_nn')
6 | REPO = git.Repo('.')
7 |
8 |
9 | def collect(path: Path):
10 | if path.is_file():
11 | try:
12 | commit = next(iter(REPO.iter_commits(paths=path)))
13 | except StopIteration:
14 | return []
15 |
16 | html = path.relative_to(HOME)
17 | if html.stem == '__init__':
18 | html = html.parent / 'index.html'
19 | else:
20 | html = html.parent / f'{html.stem}.html'
21 |
22 | return [{'path': str(html), 'date': str(commit.committed_datetime.date())}]
23 |
24 | urls = []
25 | for f in path.iterdir():
26 | urls += collect(f)
27 |
28 | return urls
29 |
30 |
31 | def main():
32 | urls = []
33 | for f in HOME.iterdir():
34 | urls += collect(f)
35 |
36 | urls = [f'''
37 |
38 | https://nn.labml.ai/{u['path']}
39 | {u['date']}T16:30:00+00:00
40 | 1.00
41 |
42 | ''' for u in urls]
43 |
44 | urls = '\n'.join(urls)
45 | xml = f'''
46 |
47 |
52 | {urls}
53 |
54 | '''
55 |
56 | with open(str(HOME.parent / 'docs' / 'sitemap.xml'), 'w') as f:
57 | f.write(xml)
58 |
59 |
60 | if __name__ == '__main__':
61 | main()
62 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | with open("readme.md", "r") as f:
4 | long_description = f.read()
5 |
6 | setuptools.setup(
7 | name='labml-nn',
8 | version='0.4.99',
9 | author="Varuna Jayasiri, Nipun Wijerathne",
10 | author_email="vpjayasiri@gmail.com, hnipun@gmail.com",
11 | description="A collection of PyTorch implementations of neural network architectures and layers.",
12 | long_description=long_description,
13 | long_description_content_type="text/markdown",
14 | url="https://github.com/lab-ml/nn",
15 | project_urls={
16 | 'Documentation': 'https://lab-ml.com/'
17 | },
18 | packages=setuptools.find_packages(exclude=('labml', 'labml.*',
19 | 'labml_samples', 'labml_samples.*',
20 | 'labml_helpers', 'labml_helpers.*',
21 | 'test',
22 | 'test.*')),
23 | install_requires=['labml>=0.4.110',
24 | 'labml-helpers>=0.4.77',
25 | 'torch',
26 | 'einops',
27 | 'numpy'],
28 | classifiers=[
29 | "Programming Language :: Python :: 3",
30 | "License :: OSI Approved :: MIT License",
31 | 'Intended Audience :: Developers',
32 | 'Intended Audience :: Science/Research',
33 | 'Topic :: Scientific/Engineering',
34 | 'Topic :: Scientific/Engineering :: Mathematics',
35 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
36 | 'Topic :: Software Development',
37 | 'Topic :: Software Development :: Libraries',
38 | 'Topic :: Software Development :: Libraries :: Python Modules',
39 | ],
40 | keywords='machine learning',
41 | )
42 |
--------------------------------------------------------------------------------
/labml_nn/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Utilities
4 | summary: A bunch of utility functions and classes
5 | ---
6 |
7 | # Utilities
8 | """
9 |
10 | import copy
11 |
12 | from torch.utils.data import Dataset, IterableDataset
13 |
14 | from labml_helpers.module import M, TypedModuleList
15 |
16 |
17 | def clone_module_list(module: M, n: int) -> TypedModuleList[M]:
18 | """
19 | ## Clone Module
20 |
21 | Make a `nn.ModuleList` with clones of a given module
22 | """
23 | return TypedModuleList([copy.deepcopy(module) for _ in range(n)])
24 |
25 |
26 | def cycle_dataloader(data_loader):
27 | """
28 |
29 | ## Cycle Data Loader
30 |
31 | Infinite loader that recycles the data loader after each epoch
32 | """
33 | while True:
34 | for batch in data_loader:
35 | yield batch
36 |
37 |
38 | class MapStyleDataset(Dataset):
39 | """
40 |
41 | ## Map Style Dataset
42 |
43 | This converts an [`IterableDataset`](https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset)
44 | to a [map-style dataset](https://pytorch.org/docs/stable/data.html#map-style-datasets)
45 | so that we can shuffle the dataset.
46 |
47 | *This only works when the dataset size is small and can be held in memory.*
48 | """
49 |
50 | def __init__(self, dataset: IterableDataset):
51 | # Load the data to memory
52 | self.data = [d for d in dataset]
53 |
54 | def __getitem__(self, idx: int):
55 | """Get a sample by index"""
56 | return self.data[idx]
57 |
58 | def __iter__(self):
59 | """Create an iterator"""
60 | return iter(self.data)
61 |
62 | def __len__(self):
63 | """Size of the dataset"""
64 | return len(self.data)
65 |
--------------------------------------------------------------------------------
/labml_nn/optimizers/performance_test.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Test performance of Adam implementations
4 | summary: This experiment compares performance of Adam implementations.
5 | ---
6 |
7 | # Performance testing Adam
8 |
9 | ```
10 | TorchAdam warmup...[DONE] 222.59ms
11 | TorchAdam...[DONE] 1,356.01ms
12 | MyAdam warmup...[DONE] 119.15ms
13 | MyAdam...[DONE] 1,192.89ms
14 | ```
15 |
16 | [](https://colab.research.google.com/drive/1ngowaAsADj8VdZfBifu_6L6rtjGoEeoR?usp=sharing)
17 | """
18 |
19 | import torch
20 | import torch.nn as nn
21 | from labml_helpers.device import DeviceInfo
22 | from torch.optim import Adam as TorchAdam
23 |
24 | from labml import monit
25 | from labml_nn.optimizers.adam import Adam as MyAdam
26 | from labml_nn.optimizers.mnist_experiment import Model
27 |
28 |
29 | def test():
30 | device_info = DeviceInfo(use_cuda=True, cuda_device=0)
31 | print(device_info)
32 | inp = torch.randn((64, 1, 28, 28), device=device_info.device)
33 | target = torch.ones(64, dtype=torch.long, device=device_info.device)
34 | loss_func = nn.CrossEntropyLoss()
35 | model = Model().to(device_info.device)
36 | my_adam = MyAdam(model.parameters())
37 | torch_adam = TorchAdam(model.parameters())
38 | loss = loss_func(model(inp), target)
39 | loss.backward()
40 | with monit.section('MyAdam warmup'):
41 | for i in range(100):
42 | my_adam.step()
43 | with monit.section('MyAdam'):
44 | for i in range(1000):
45 | my_adam.step()
46 | with monit.section('TorchAdam warmup'):
47 | for i in range(100):
48 | torch_adam.step()
49 | with monit.section('TorchAdam'):
50 | for i in range(1000):
51 | torch_adam.step()
52 |
53 |
54 | if __name__ == '__main__':
55 | test()
56 |
--------------------------------------------------------------------------------
/labml_nn/transformers/knn/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: k-Nearest Neighbor Language Models
4 | summary: >
5 | This is a simple PyTorch implementation/tutorial of the paper
6 | Generalization through Memorization: Nearest Neighbor Language Models using FAISS.
7 | It runs a kNN model on the final transformer layer embeddings to improve the
8 | loss of transformer based language models.
9 | It's also great for domain adaptation without pre-training.
10 | ---
11 |
12 | # k-Nearest Neighbor Language Models
13 |
14 | This is a [PyTorch](https://pytorch.org) implementation of the paper
15 | [Generalization through Memorization: Nearest Neighbor Language Models](https://arxiv.org/abs/1911.00172).
16 | It uses k-nearest neighbors to improve perplexity of autoregressive transformer models.
17 |
18 | An autoregressive language model estimates $p(w_t | \color{yellowgreen}{c_t})$,
19 | where $w_t$ is the token at step $t$
20 | and $c_t$ is the context, $\color{yellowgreen}{c_t} = (w_1, w_2, ..., w_{t-1})$.
21 |
22 | This paper improves $p(w_t | \color{yellowgreen}{c_t})$ using a k-nearest neighbor search
23 | on key-value pairs $\big(f(c_i), w_i\big)$, with search key $f(\color{yellowgreen}{c_t})$.
24 | Here $f(\color{yellowgreen}{c_t})$ is an embedding of the context $\color{yellowgreen}{c_t}$.
25 | The paper (and this implementation) uses the **input to the feed-forward layer of the
26 | final layer of the transformer** as $f(\color{yellowgreen}{c_t})$.
27 |
28 | We use [FAISS](https://github.com/facebookresearch/faiss) to index $f(c_i)$.
29 |
30 | ### Implementation
31 |
32 | So to run $k$NN-LM we need to:
33 |
34 | * [Train a transformer model](train_model.html)
35 | * [Build an index](build_index.html) of $\big(f(c_i), w_i\big)$
36 | * [Evaluate kNN-LM](eval_knn.html) using $k$NN search on $\big(f(c_i), w_i\big)$
37 | with $f(\color{yellowgreen}{c_t})$
38 |
39 | This experiment uses a small dataset so that we can run this without using up a few hundred gigabytes
40 | of disk space for the index.
41 |
42 | The official implementation of $k$NN-LM can be found [here](https://github.com/urvashik/knnlm).
43 | """
44 |
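# A minimal sketch (added for illustration; the linked files implement the real pipeline)
# of the final step: a $k$NN distribution built from neighbor distances and their stored
# target tokens is interpolated with the model's own next-token distribution.
def _knn_interpolation_sketch():
    import torch

    n_tokens, k = 100, 8
    p_lm = torch.softmax(torch.randn(n_tokens), dim=-1)  # model's next-token distribution
    dists = torch.rand(k)                                 # distances to the k nearest keys f(c_i)
    nbr_tokens = torch.randint(0, n_tokens, (k,))         # the tokens w_i stored with those keys

    # neighbors that are closer in key space get a larger weight
    weights = torch.softmax(-dists, dim=-1)
    p_knn = torch.zeros(n_tokens).scatter_add_(0, nbr_tokens, weights)

    lam = 0.25                                            # interpolation coefficient
    p = lam * p_knn + (1 - lam) * p_lm
    print(p.sum())                                        # tensor(1.0000)


if __name__ == '__main__':
    _knn_interpolation_sketch()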
--------------------------------------------------------------------------------
/labml_nn/transformers/switch/readme.md:
--------------------------------------------------------------------------------
1 | # [Switch Transformer](https://nn.labml.ai/transformers/switch/index.html)
2 |
3 | This is a miniature [PyTorch](https://pytorch.org) implementation of the paper
4 | [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961).
5 | Our implementation only has a few million parameters and doesn't do model parallel distributed training.
6 | It does single GPU training, but we implement the concept of switching as described in the paper.
7 |
8 | The Switch Transformer uses different parameters for each token by switching among parameters
9 | based on the token.
10 | Therefore, only a fraction of parameters are chosen for each token.
11 | So you can have more parameters but less computational cost.
12 |
13 | The switching happens at the Position-wise Feedforward network (FFN) of each transformer block.
14 | The position-wise feedforward network consists of two fully connected layers applied sequentially.
15 | In the Switch Transformer we have multiple FFNs (multiple experts),
16 | and we choose which one to use based on a router.
17 | The router outputs a set of probabilities for picking an FFN,
18 | and we pick the one with the highest probability and only evaluate that.
19 | So essentially the computational cost is the same as having a single FFN.
20 | In our implementation this doesn't parallelize well when you have many or large FFNs since it's all
21 | happening on a single GPU.
22 | In a distributed setup you would have each FFN (each very large) on a different device.
23 |
24 | The paper introduces another loss term to balance load among the experts (FFNs) and
25 | discusses dropping tokens when routing is not balanced.
26 |
27 | Here's [the training code](experiment.html) and a notebook for training a switch transformer on Tiny Shakespeare dataset.
28 |
29 | [](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/switch/experiment.ipynb)
30 | [](https://app.labml.ai/run/c4656c605b9311eba13d0242ac1c0002)
31 |
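A rough sketch of the switching itself (hard routing with a probability-scaled output; it ignores the load-balancing loss, capacity limits, and token dropping discussed above):

```python
import torch
from torch import nn

d_model, n_experts = 512, 4
router = nn.Linear(d_model, n_experts)
experts = nn.ModuleList([nn.Sequential(nn.Linear(d_model, 2048), nn.ReLU(), nn.Linear(2048, d_model))
                         for _ in range(n_experts)])

x = torch.randn(16, d_model)              # a batch of token representations
probs = torch.softmax(router(x), dim=-1)  # routing probabilities per token
route = probs.argmax(dim=-1)              # each token goes to exactly one expert

y = torch.zeros_like(x)
for i in range(n_experts):
    idx = route == i
    if idx.any():
        # scale by the routing probability so gradients also reach the router
        y[idx] = probs[idx, i, None] * experts[i](x[idx])
print(y.shape)                            # torch.Size([16, 512])
```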
--------------------------------------------------------------------------------
/labml_nn/normalization/weight_standardization/conv2d.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: 2D Convolution Layer with Weight Standardization
4 | summary: >
5 | A PyTorch implementation/tutorial of a 2D Convolution Layer with Weight Standardization.
6 | ---
7 |
8 | # 2D Convolution Layer with Weight Standardization
9 |
10 | This is an implementation of a 2 dimensional convolution layer with [Weight Standardization](./index.html)
11 | """
12 |
13 | import torch
14 | import torch.nn as nn
15 | from torch.nn import functional as F
16 |
17 | from labml_nn.normalization.weight_standardization import weight_standardization
18 |
19 |
20 | class Conv2d(nn.Conv2d):
21 | """
22 | ## 2D Convolution Layer
23 |
24 | This extends the standard 2D convolution layer and standardizes the weights before the convolution step.
25 | """
26 | def __init__(self, in_channels, out_channels, kernel_size,
27 | stride=1,
28 | padding=0,
29 | dilation=1,
30 | groups: int = 1,
31 | bias: bool = True,
32 | padding_mode: str = 'zeros',
33 | eps: float = 1e-5):
34 | super(Conv2d, self).__init__(in_channels, out_channels, kernel_size,
35 | stride=stride,
36 | padding=padding,
37 | dilation=dilation,
38 | groups=groups,
39 | bias=bias,
40 | padding_mode=padding_mode)
41 | self.eps = eps
42 |
43 | def forward(self, x: torch.Tensor):
44 | return F.conv2d(x, weight_standardization(self.weight, self.eps), self.bias, self.stride,
45 | self.padding, self.dilation, self.groups)
46 |
47 |
48 | def _test():
49 | """
50 | A simple test to verify the tensor sizes
51 | """
52 | conv2d = Conv2d(10, 20, 5)
53 | from labml.logger import inspect
54 | inspect(conv2d.weight)
55 | import torch
56 | inspect(conv2d(torch.zeros(10, 10, 100, 100)))
57 |
58 |
59 | if __name__ == '__main__':
60 | _test()
61 |
--------------------------------------------------------------------------------
/labml_nn/normalization/batch_norm/cifar10.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: CIFAR10 Experiment to try Batch Normalization
4 | summary: >
5 | This trains a simple convolutional neural network that uses batch normalization
6 | to classify CIFAR10 images.
7 | ---
8 |
9 | # CIFAR10 Experiment for Batch Normalization
10 | """
11 |
12 | import torch.nn as nn
13 |
14 | from labml import experiment
15 | from labml.configs import option
16 | from labml_helpers.module import Module
17 | from labml_nn.experiments.cifar10 import CIFAR10Configs
18 | from labml_nn.normalization.batch_norm import BatchNorm
19 |
20 |
21 | class Model(Module):
22 | def __init__(self):
23 | super().__init__()
24 | layers = []
25 | in_channels = 3
26 | for block in [[64, 64], [128, 128], [256, 256, 256], [512, 512, 512], [512, 512, 512]]:
27 | for channels in block:
28 | layers += [nn.Conv2d(in_channels, channels, kernel_size=3, padding=1),
29 | BatchNorm(channels),
30 | nn.ReLU(inplace=True)]
31 | in_channels = channels
32 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
33 | layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
34 | self.layers = nn.Sequential(*layers)
35 | self.fc = nn.Linear(512, 10)
36 |
37 | def __call__(self, x):
38 | x = self.layers(x)
39 | x = x.view(x.shape[0], -1)
40 | return self.fc(x)
41 |
42 |
43 | @option(CIFAR10Configs.model)
44 | def model(c: CIFAR10Configs):
45 | """
46 | ### Create model
47 | """
48 | return Model().to(c.device)
49 |
50 |
51 | def main():
52 | # Create experiment
53 | experiment.create(name='cifar10', comment='batch norm')
54 | # Create configurations
55 | conf = CIFAR10Configs()
56 | # Load configurations
57 | experiment.configs(conf, {
58 | 'optimizer.optimizer': 'Adam',
59 | 'optimizer.learning_rate': 2.5e-4,
60 | 'train_batch_size': 64,
61 | })
62 | # Start the experiment and run the training loop
63 | with experiment.start():
64 | conf.run()
65 |
66 |
67 | #
68 | if __name__ == '__main__':
69 | main()
70 |
--------------------------------------------------------------------------------
/labml_nn/transformers/feedback/readme.md:
--------------------------------------------------------------------------------
1 | # [Feedback Transformer](https://nn.labml.ai/transformers/feedback/index.html)
2 |
3 | This is a [PyTorch](https://pytorch.org) implementation of the paper
4 | [Accessing Higher-level Representations in Sequential Transformers with Feedback Memory](https://arxiv.org/abs/2002.09402).
5 |
6 | Normal transformers process tokens in parallel. Each transformer layer pays attention
7 | to the outputs of the previous layer.
8 | Feedback transformer pays attention to the output of all layers in previous steps.
9 | So this adds recurrence, and we need to process token-by-token.
10 | This slows down the training significantly (about 5X - 10X depending on the sequence length).
11 | However, the Feedback Transformer is faster at prediction time, because you can predict the next token
12 | incrementally if you cache the memory vectors.
13 |
14 | In order to speed up the training the paper discusses starting with a short sequence length and
15 | gradually increasing it.
16 | They also discuss using a pretrained parallel transformer as the starting point.
17 |
18 | The original feedback transformer doesn't keep the outputs of all layers.
19 | Instead, it keeps a weighted sum of the outputs of all layers.
20 | This reduces the memory used for caching during prediction.
21 | The first half of this file implements this.
22 |
23 | The updated feedback transformer shares weights used
24 | to calculate keys and values among the layers.
25 | We then calculate the keys and values for each step only once and keep
26 | them cached.
27 | The [second half](#shared_kv) of this file implements this.
28 | We implemented a custom PyTorch function to improve performance.
29 |
30 | Here's [the training code](experiment.html) and a notebook for training a feedback transformer on Tiny Shakespeare dataset.
31 |
32 | [Colab Notebook](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/feedback/experiment.ipynb)
33 |
34 | [](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/feedback/experiment.ipynb)
35 | [](https://app.labml.ai/run/d8eb9416530a11eb8fb50242ac1c0002)
36 |
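A toy sketch of the weighted memory described above (illustration only; shapes and names are made up):

```python
import torch

n_layers, d_model = 6, 512
layer_outputs = torch.randn(n_layers, d_model)  # outputs of every layer at the current step
w = torch.randn(n_layers, requires_grad=True)   # learnable layer-mixing weights

# the memory stored for this step is a softmax-weighted sum of all layer outputs,
# so later steps attend to a single vector per position instead of one per layer
memory = torch.einsum('l,ld->d', torch.softmax(w, dim=0), layer_outputs)
print(memory.shape)                             # torch.Size([512])
```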
--------------------------------------------------------------------------------
/labml_nn/cnn/cross_validation.py:
--------------------------------------------------------------------------------
1 |
2 | import torch
3 | import torchvision
4 | import torchvision.transforms as transforms
5 | from torch.utils.data.sampler import SubsetRandomSampler
6 | import matplotlib.pyplot as plt
7 | import numpy as np
8 | import torch.optim as optim
9 | from torchsummary import summary
10 | import torch.nn as nn
11 |
12 | # from models.mlp import MLP
13 | # from utils.utils import *
14 | # from utils.train_dataset import *
15 | #from nutsflow import Take, Consume
16 | #from nutsml import *
17 | from utils.dataloader import *
18 | from models.cnn import CNN
19 | from utils.train import Trainer
20 |
21 | from utils.cv_train import *
22 |
23 | # Check if GPU is available
24 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
25 | print("Device: " + str(device))
26 |
27 | # Cifar 10 Datasets location
28 | save='./data/Cifar10'
29 |
30 | # Transformations train
31 | transform_train = transforms.Compose(
32 | [transforms.ToTensor(),
33 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
34 |
35 | # Load train dataset and dataloader
36 | trainset = LoadCifar10DatasetTrain(save, transform_train)
37 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=64,
38 | shuffle=True, num_workers=4)
39 |
40 | # Transformations test (for inference later)
41 | transform_test = transforms.Compose(
42 | [transforms.ToTensor(),
43 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
44 |
45 | # Load test dataset and dataloader (for inference later)
46 | testset = LoadCifar10DatasetTest(save, transform_test)
47 | testloader = torch.utils.data.DataLoader(testset, batch_size=64,
48 | shuffle=False, num_workers=4)
49 |
50 | # Specify loss function
51 | cost = nn.CrossEntropyLoss()
52 |
53 | epochs = 25
54 | splits = 4  # number of cross-validation folds
55 |
56 | # Training - Cross-validation
57 | history = cross_val_train(cost, trainset, epochs, splits, device=device)
58 |
59 | # Inference
60 | best_model, best_val_accuracy = retreive_best_trial()
61 | print("Best Validation Accuracy = %.3f"%(best_val_accuracy))
62 |
63 | # Testing
64 | accuracy = Test(best_model, cost, testloader, device=device)
65 | print("Test Accuracy = %.3f"%(accuracy['val_acc']))
66 |
--------------------------------------------------------------------------------
/labml_nn/resnets/pretrained_nets.py:
--------------------------------------------------------------------------------
1 | #!/bin/python
2 |
3 | from utils.train import Trainer # Default custom training class
4 | from models.resnet import *
5 | from torchvision import models
6 |
7 | # GPU Check
8 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
9 | print("Device: " + str(device))
10 |
11 | # Use different train/test data augmentations
12 | transform_test = transforms.Compose(
13 | [transforms.ToTensor(),
14 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
15 |
16 | # Get Cifar 10 Datasets
17 | save='./data/Cifar10'
18 | transform_train = transforms.Compose([
19 | transforms.RandomHorizontalFlip(p=1.0),
20 | transforms.RandomRotation(20),
21 | transforms.RandomCrop(32, (2, 2), pad_if_needed=False, padding_mode='constant'),
22 | transforms.ToTensor(),
23 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
24 |
25 | # Get Cifar 10 Datasets
26 | trainset = torchvision.datasets.CIFAR10(root=save, train=True, download=True, transform=transform_train)
27 | testset = torchvision.datasets.CIFAR10(root=save, train=False, download=True, transform=transform_test)
28 |
29 | # Get Cifar 10 Dataloaders
30 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=64,
31 | shuffle=True, num_workers=4)
32 |
33 | testloader = torch.utils.data.DataLoader(testset, batch_size=64,
34 | shuffle=False, num_workers=4)
35 |
36 | #################################
37 | # Load the pre-trained model
38 | #################################
39 |
40 | model_ft = models.resnet18(pretrained=True)
41 | num_ftrs = model_ft.fc.in_features
42 | model_ft.fc = nn.Sequential(
43 | nn.Dropout(0.5),
44 | nn.Linear(num_ftrs, 10)
45 | )
46 |
47 |
48 | model_ft = model_ft.to(device)
49 |
50 | # Loss function
51 | cost = nn.CrossEntropyLoss()
52 |
53 | # Optimizer
54 | lr = 0.0005
55 | # opt = optim.SGD(model_ft.parameters(), lr=lr, momentum=0.9)
56 | opt = torch.optim.Adam(model_ft.parameters(), lr=lr, betas=(0.9, 0.95), weight_decay=1e-4) #0.0005 l2_factor.item()
57 |
58 | # Create a trainer
59 | trainer = Trainer(model_ft, opt, cost, name="Transfer-learning",lr=lr , use_lr_schedule=True, device=device)
60 |
61 | # Run training
62 | epochs = 25
63 | trainer.Train(trainloader, epochs, testloader=testloader)
64 | # trainer.Train(trainloader, epochs) # check train error
65 |
66 | print('done')
67 |
--------------------------------------------------------------------------------
/labml_nn/transformers/compressive/readme.md:
--------------------------------------------------------------------------------
1 | # [Compressive Transformer](https://nn.labml.ai/transformers/compressive/index.html)
2 |
3 | This is an implementation of
4 | [Compressive Transformers for Long-Range Sequence Modelling](https://arxiv.org/abs/1911.05507)
5 | in [PyTorch](https://pytorch.org).
6 |
7 | This is an extension of [Transformer XL](https://nn.labml.ai/transformers/xl/index.html) where past memories
8 | are compressed to give a longer attention range.
9 | That is, the furthest $n_{cm} c$ memories are compressed into
10 | $n_{cm}$ memories, where $c$ is the compression rate.
11 |
12 | ## Compression operation
13 |
14 | The compression operation is defined as
15 | $f_c: \mathbb{R}^{nc \times d} \rightarrow \mathbb{R}^{n \times d}$.
16 | The paper introduces multiple choices for $f_c$ and we have only implemented
17 | 1D convolution which seems to give the best results.
18 | Each layer has a separate compression operation $f_c^{(i)}$ where
19 | $i$ is the layer number.
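
For intuition, here is a minimal sketch of the 1D-convolution choice for $f_c$ (the compression rate and sizes are illustrative assumptions): a convolution with kernel size and stride equal to $c$ maps $n c$ memory vectors to $n$ compressed ones.

```python
import torch
import torch.nn as nn

c, d_model = 4, 512                    # compression rate and model size (assumed)
f_c = nn.Conv1d(d_model, d_model, kernel_size=c, stride=c)

mems = torch.randn(8 * c, 1, d_model)  # [n * c, batch, d_model]
x = mems.permute(1, 2, 0)              # Conv1d expects [batch, d_model, n * c]
compressed = f_c(x).permute(2, 0, 1)   # back to [n, batch, d_model]
```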
20 |
21 | ## Training compression operation
22 |
23 | Since training compression with BPTT requires maintaining
24 | a very large computational graph (many time steps), the paper proposes
25 | an *auto-encoding loss* and an *attention reconstruction loss*.
26 | The auto-encoding loss decodes the original memories from the compressed memories
27 | and calculates the loss.
28 | The attention reconstruction loss computes the multi-head attention results
29 | over the compressed memory and over the uncompressed memory, and takes the mean squared error
30 | between them.
31 | We have implemented the latter here since it gives better results.
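
A rough sketch of the attention reconstruction loss under these assumptions, where `attn` is a frozen copy of the layer's multi-head attention and `h` is that layer's input:

```python
import torch.nn.functional as F

def attention_reconstruction_loss(attn, h, mem, compressed_mem):
    # Attention over the original (uncompressed) memories is the target
    target = attn(query=h, key=mem, value=mem)
    # Attention over the compressed memories should reproduce it
    pred = attn(query=h, key=compressed_mem, value=compressed_mem)
    return F.mse_loss(pred, target)
```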
32 |
33 | This implementation uses pre-layer normalization
34 | while the paper uses post-layer normalization.
35 | Pre-layer norm does the layer norm before the [FFN](../feedforward.html) and
36 | self-attention, and the pass-through in the residual connection is not normalized.
37 | This is supposed to be more stable in standard transformer setups.
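
The difference, as a rough sketch where `sublayer` stands for self-attention or the FFN:

```python
def post_layer_norm(x, sublayer, norm):
    # The paper's setup: normalize after adding the residual
    return norm(x + sublayer(x))

def pre_layer_norm(x, sublayer, norm):
    # This implementation: normalize the sublayer input; the residual path is left untouched
    return x + sublayer(norm(x))
```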
38 |
39 | Here are [the training code](https://nn.labml.ai/transformers/compressive/experiment.html) and a notebook for training a compressive transformer
40 | model on the Tiny Shakespeare dataset.
41 |
42 | [](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/compressive/experiment.ipynb)
43 | [](https://app.labml.ai/run/0d9b5338726c11ebb7c80242ac1c0002)
44 |
--------------------------------------------------------------------------------
/labml_nn/transformers/label_smoothing_loss.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Label Smoothing Loss
4 | summary: >
5 | This is an implementation of label smoothing loss, that can be used as
6 | an alternative to cross entropy loss for improved accuracy.
7 | ---
8 |
9 | # Label Smoothing Loss
10 | """
11 | import matplotlib.pyplot as plt
12 | import numpy as np
13 | import torch
14 | import torch.nn as nn
15 |
16 | from labml_helpers.module import Module
17 |
18 |
19 | class LabelSmoothingLoss(Module):
20 | def __init__(self, size: int, padding_idx: int, smoothing: float = 0.0):
21 | super().__init__()
22 | self.loss = nn.KLDivLoss(reduction='sum')
23 | self.padding_idx = padding_idx
24 | self.confidence = 1.0 - smoothing
25 | self.smoothing = smoothing
26 | self.size = size
27 | self.true_dist = None
28 |
29 | def forward(self, x: torch.Tensor, target: torch.Tensor):
30 | assert x.shape[1] == self.size
31 | true_dist = x.clone()
32 |         true_dist.fill_(self.smoothing / (self.size - 2))  # spread the smoothing mass uniformly over non-target, non-padding classes
33 |         true_dist.scatter_(1, target.unsqueeze(1), self.confidence)  # the target class gets 1 - smoothing
34 |         true_dist[:, self.padding_idx] = 0  # no probability mass on the padding index
35 |         mask = torch.nonzero(target == self.padding_idx, as_tuple=False)
36 |         if mask.dim() > 0:
37 |             true_dist.index_fill_(0, mask.squeeze(), 0.0)  # zero out rows whose target is padding
38 |         self.true_dist = true_dist
39 |         return self.loss(x, true_dist.detach())  # `x` must be log-probabilities since this is a KL-divergence loss
40 |
41 |
42 | def _test_label_smoothing():
43 | smooth_loss = LabelSmoothingLoss(5, 0, 0.4)
44 | predict = torch.tensor([[0, 0.2, 0.7, 0.1, 0],
45 | [0, 0.2, 0.7, 0.1, 0],
46 | [0, 0.2, 0.7, 0.1, 0]], dtype=torch.float)
47 | _ = smooth_loss(predict.log(),
48 | torch.tensor([2, 1, 0], dtype=torch.long))
49 |
50 | # Show the target distributions expected by the system.
51 | plt.imshow(smooth_loss.true_dist)
52 | plt.show()
53 |
54 | smooth_loss = LabelSmoothingLoss(5, 0, 0.1)
55 |
56 | def loss_sample(x):
57 | d = x + 3 * 1
58 | predict2 = torch.tensor([[0, x / d, 1 / d, 1 / d, 1 / d],
59 | ], dtype=torch.float)
60 | # print(predict)
61 | return smooth_loss(predict2.log(),
62 | torch.tensor([1], dtype=torch.long)).item()
63 |
64 | plt.plot(np.arange(1, 100), [loss_sample(x) for x in range(1, 100)])
65 | plt.show()
66 |
67 |
68 | if __name__ == '__main__':
69 | _test_label_smoothing()
70 |
--------------------------------------------------------------------------------
/labml_nn/optimizers/adam_warmup.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Adam optimizer with warm-up
4 | summary: A simple PyTorch implementation/tutorial of Adam optimizer with warm-up.
5 | ---
6 |
7 | # Adam Optimizer with Warmup
8 |
9 | This extends [AMSGrad optimizer](amsgrad.html) and adds a warmup stage.
10 | """
11 |
12 | from typing import Dict
13 |
14 | from labml_nn.optimizers import WeightDecay
15 | from labml_nn.optimizers.amsgrad import AMSGrad
16 |
17 |
18 | class AdamWarmup(AMSGrad):
19 | """
20 | ## Adam Optimizer with Warmup
21 |
22 | This class extends from AMSGrad optimizer defined in [`amsgrad.py`](amsgrad.html).
23 | """
24 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
25 | weight_decay: WeightDecay = WeightDecay(),
26 | optimized_update: bool = True,
27 | amsgrad=False, warmup=0, defaults=None):
28 | """
29 | ### Initialize the optimizer
30 |
31 | * `params` is the list of parameters
32 | * `lr` is the learning rate $\alpha$
33 | * `betas` is a tuple of ($\beta_1$, $\beta_2$)
34 | * `eps` is $\hat{\epsilon}$ or $\epsilon$ based on `optimized_update`
35 | * `weight_decay` is an instance of class `WeightDecay` defined in [`__init__.py`](index.html)
36 |         * `optimized_update` is a flag indicating whether to optimize the bias correction of the second moment
37 | by doing it after adding $\epsilon$
38 | * `amsgrad` is a flag indicating whether to use AMSGrad or fallback to plain Adam
39 | * `warmup` number of warmup steps
40 |         * `defaults` is a dictionary of defaults for group values.
41 | This is useful when you want to extend the class `AdamWarmup`.
42 | """
43 |
44 | defaults = {} if defaults is None else defaults
45 | defaults.update(dict(warmup=warmup))
46 | super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults)
47 |
48 | def get_lr(self, state: Dict[str, any], group: Dict[str, any]):
49 | """
50 | ### Get learning-rate
51 |
52 | $$\alpha \min \bigg(1, \frac{t}{w}\bigg)$$
53 | where $w$ is the number of warmup steps.
54 | """
55 | # If we are in warmup stage
56 | if group['warmup'] > state['step']:
57 | # A linearly increasing learning rate from $0$ to $\alpha$
58 | return 1e-8 + state['step'] * group['lr'] / group['warmup']
59 | else:
60 | # Constant learning rate $\alpha$
61 | return group['lr']
62 |
--------------------------------------------------------------------------------
/labml_nn/normalization/weight_standardization/experiment.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: CIFAR10 Experiment to try Weight Standardization and Batch-Channel Normalization
4 | summary: >
5 |   This trains a VGG net that uses weight standardization and batch-channel normalization
6 | to classify CIFAR10 images.
7 | ---
8 |
9 | # CIFAR10 Experiment to try Weight Standardization and Batch-Channel Normalization
10 | """
11 |
12 | import torch.nn as nn
13 |
14 | from labml import experiment
15 | from labml.configs import option
16 | from labml_helpers.module import Module
17 | from labml_nn.experiments.cifar10 import CIFAR10Configs
18 | from labml_nn.normalization.batch_channel_norm import BatchChannelNorm
19 | from labml_nn.normalization.weight_standardization.conv2d import Conv2d
20 |
21 |
22 | class Model(Module):
23 | """
24 | ### Model
25 |
26 |     A VGG model that uses [Weight Standardization](./index.html) and
27 | [Batch-Channel Normalization](../batch_channel_norm/index.html).
28 | """
29 | def __init__(self):
30 | super().__init__()
31 | layers = []
32 | in_channels = 3
33 | for block in [[64, 64], [128, 128], [256, 256, 256], [512, 512, 512], [512, 512, 512]]:
34 | for channels in block:
35 | layers += [Conv2d(in_channels, channels, kernel_size=3, padding=1),
36 | BatchChannelNorm(channels, 32),
37 | nn.ReLU(inplace=True)]
38 | in_channels = channels
39 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
40 | layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
41 | self.layers = nn.Sequential(*layers)
42 | self.fc = nn.Linear(512, 10)
43 |
44 | def __call__(self, x):
45 | x = self.layers(x)
46 | x = x.view(x.shape[0], -1)
47 | return self.fc(x)
48 |
49 |
50 | @option(CIFAR10Configs.model)
51 | def model(c: CIFAR10Configs):
52 | """
53 | ### Create model
54 | """
55 | return Model().to(c.device)
56 |
57 |
58 | def main():
59 | # Create experiment
60 | experiment.create(name='cifar10', comment='weight standardization')
61 | # Create configurations
62 | conf = CIFAR10Configs()
63 | # Load configurations
64 | experiment.configs(conf, {
65 | 'optimizer.optimizer': 'Adam',
66 | 'optimizer.learning_rate': 2.5e-4,
67 | 'train_batch_size': 64,
68 | })
69 | # Start the experiment and run the training loop
70 | with experiment.start():
71 | conf.run()
72 |
73 |
74 | #
75 | if __name__ == '__main__':
76 | main()
77 |
--------------------------------------------------------------------------------
/labml_nn/resnets/utils/utils.py:
--------------------------------------------------------------------------------
1 | #!/bin/python
2 |
3 | import torch
4 | import torchvision
5 | import torchvision.transforms as transforms
6 |
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 |
10 | import matplotlib.pyplot as plt
11 | import numpy as np
12 |
13 | from sklearn.model_selection import KFold
14 | from torch.utils.data.sampler import SubsetRandomSampler
15 |
16 |
17 |
18 | # Plot the loss of multiple runs together
19 | def PlotLosses(losses, titles, save=None):
20 | fig = plt.figure()
21 | fig.set_size_inches(14, 22)
22 |     # Plot the results, one subplot per loss curve (stacked vertically)
23 |     # subplot integers:
24 |     #     nrows
25 |     #     ncols
26 |     #     index
27 |     subplot_str_start = str(len(losses)) + "1"
28 | 
29 |     for i in range(len(losses)):
30 |         subplot = subplot_str_start + str(i+1)
31 | loss = losses[i]
32 | title = titles[i]
33 |
34 | ax = plt.subplot(int(subplot))
35 | ax.plot(range(len(loss)), loss)
36 | ax.set_xlabel("Epoch")
37 | ax.set_title(title)
38 | ax.set_ylabel("Loss")
39 |
40 | # Save Figure
41 | if save:
42 | plt.savefig(save)
43 | else:
44 | plt.show()
45 |
46 |
47 |
48 | def ClassSpecificTestCifar10(net, testdata, device=None):
49 | classes = ('plane', 'car', 'bird', 'cat','deer', 'dog', 'frog', 'horse', 'ship', 'truck')
50 | class_correct = list(0. for i in range(10))
51 | class_total = list(0. for i in range(10))
52 | with torch.no_grad():
53 | for data in testdata:
54 | if device:
55 | images, labels = data[0].to(device), data[1].to(device)
56 | else:
57 | images, labels = data
58 |
59 | outputs = net(images)
60 | _, predicted = torch.max(outputs, 1)
61 | c = (predicted == labels).squeeze()
62 |             for i in range(labels.size(0)):  # iterate over the whole batch, not just the first four samples
63 | label = labels[i]
64 | class_correct[label] += c[i].item()
65 | class_total[label] += 1
66 |
67 | # Print out
68 | for i in range(10):
69 | print('Accuracy of %5s : %2d %%' % (
70 | classes[i], 100 * class_correct[i] / class_total[i]))
71 |
72 |
73 |
74 | def GetActivation(name="relu"):
75 | if name == "relu":
76 | return nn.ReLU()
77 | elif name == "leakyrelu":
78 | return nn.LeakyReLU()
79 | elif name == "Sigmoid":
80 | return nn.Sigmoid()
81 | elif name == "Tanh":
82 | return nn.Tanh()
83 | elif name == "Identity":
84 | return nn.Identity()
85 | else:
86 | return nn.ReLU()
--------------------------------------------------------------------------------
/labml_nn/transformers/positional_encoding.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Fixed Positional Encodings
4 | summary: >
5 | Implementation with explanation of fixed positional encodings as
6 | described in paper Attention is All You Need.
7 | ---
8 |
9 | # Fixed Positional Encodings
10 |
11 | The positional encoding encodes the position along the sequence into
12 | a vector of size `d_model`.
13 |
14 | \begin{align}
15 | PE_{p,2i} &= sin\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg) \\
16 | PE_{p,2i + 1} &= cos\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg)
17 | \end{align}
18 |
19 | Where $1 \leq 2i, 2i + 1 \leq d_{model}$
20 | are the feature indexes in the encoding, and $p$ is the position.
21 | """
22 |
23 | import math
24 |
25 | import numpy as np
26 | import torch
27 | import torch.nn as nn
28 |
29 | from labml_helpers.module import Module
30 |
31 |
32 | class PositionalEncoding(Module):
33 | def __init__(self, d_model: int, dropout_prob: float, max_len: int = 5000):
34 | super().__init__()
35 | self.dropout = nn.Dropout(dropout_prob)
36 |
37 | self.register_buffer('positional_encodings', get_positional_encoding(d_model, max_len), False)
38 |
39 | def forward(self, x: torch.Tensor):
40 | pe = self.positional_encodings[:x.shape[0]].detach().requires_grad_(False)
41 | x = x + pe
42 | x = self.dropout(x)
43 | return x
44 |
45 |
46 | def get_positional_encoding(d_model: int, max_len: int = 5000):
47 | # Empty encodings vectors
48 | encodings = torch.zeros(max_len, d_model)
49 | # Position indexes
50 | position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
51 | # $2 * i$
52 | two_i = torch.arange(0, d_model, 2, dtype=torch.float32)
53 |     # $10000^{\frac{2i}{d_{model}}}$
54 | div_term = torch.exp(two_i * -(math.log(10000.0) / d_model))
55 | # $PE_{p,2i} = sin\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg)$
56 | encodings[:, 0::2] = torch.sin(position * div_term)
57 | # $PE_{p,2i + 1} = cos\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg)$
58 | encodings[:, 1::2] = torch.cos(position * div_term)
59 |
60 | # Add batch dimension
61 | encodings = encodings.unsqueeze(1).requires_grad_(False)
62 |
63 | return encodings
64 |
65 |
66 | def _test_positional_encoding():
67 | import matplotlib.pyplot as plt
68 |
69 | plt.figure(figsize=(15, 5))
70 | pe = get_positional_encoding(20, 100)
71 | plt.plot(np.arange(100), pe[:, 0, 4:8].numpy())
72 | plt.legend(["dim %d" % p for p in [4, 5, 6, 7]])
73 | plt.title("Positional encoding")
74 | plt.show()
75 |
76 |
77 | if __name__ == '__main__':
78 | _test_positional_encoding()
79 |
--------------------------------------------------------------------------------
/labml_nn/normalization/group_norm/experiment.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: CIFAR10 Experiment to try Group Normalization
4 | summary: >
5 |   This trains a simple convolutional neural network that uses group normalization
6 | to classify CIFAR10 images.
7 | ---
8 |
9 | # CIFAR10 Experiment for Group Normalization
10 | """
11 |
12 | import torch.nn as nn
13 |
14 | from labml import experiment
15 | from labml.configs import option
16 | from labml_helpers.module import Module
17 | from labml_nn.experiments.cifar10 import CIFAR10Configs
18 | from labml_nn.normalization.group_norm import GroupNorm
19 |
20 |
21 | class Model(Module):
22 | """
23 | ### VGG model for CIFAR-10 classification
24 | """
25 |
26 | def __init__(self, groups: int = 32):
27 | super().__init__()
28 | layers = []
29 | # RGB channels
30 | in_channels = 3
31 | # Number of channels in each layer in each block
32 | for block in [[64, 64], [128, 128], [256, 256, 256], [512, 512, 512], [512, 512, 512]]:
33 | # Convolution, Normalization and Activation layers
34 | for channels in block:
35 | layers += [nn.Conv2d(in_channels, channels, kernel_size=3, padding=1),
36 | GroupNorm(groups, channels),
37 | nn.ReLU(inplace=True)]
38 | in_channels = channels
39 | # Max pooling at end of each block
40 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
41 |
42 | # Create a sequential model with the layers
43 | self.layers = nn.Sequential(*layers)
44 | # Final logits layer
45 | self.fc = nn.Linear(512, 10)
46 |
47 | def __call__(self, x):
48 | # The VGG layers
49 | x = self.layers(x)
50 | # Reshape for classification layer
51 | x = x.view(x.shape[0], -1)
52 | # Final linear layer
53 | return self.fc(x)
54 |
55 |
56 | class Configs(CIFAR10Configs):
57 | # Number of groups
58 | groups: int = 16
59 |
60 |
61 | @option(Configs.model)
62 | def model(c: Configs):
63 | """
64 | ### Create model
65 | """
66 | return Model(c.groups).to(c.device)
67 |
68 |
69 | def main():
70 | # Create experiment
71 | experiment.create(name='cifar10', comment='group norm')
72 | # Create configurations
73 | conf = Configs()
74 | # Load configurations
75 | experiment.configs(conf, {
76 | 'optimizer.optimizer': 'Adam',
77 | 'optimizer.learning_rate': 2.5e-4,
78 | })
79 | # Start the experiment and run the training loop
80 | with experiment.start():
81 | conf.run()
82 |
83 |
84 | #
85 | if __name__ == '__main__':
86 | main()
87 |
--------------------------------------------------------------------------------
/labml_nn/normalization/batch_norm/mnist.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: MNIST Experiment to try Batch Normalization
4 | summary: >
5 |   This trains a simple convolutional neural network that uses batch normalization
6 | to classify MNIST digits.
7 | ---
8 |
9 | # MNIST Experiment for Batch Normalization
10 | """
11 |
12 | import torch.nn as nn
13 | import torch.nn.functional as F
14 | import torch.utils.data
15 |
16 | from labml import experiment
17 | from labml.configs import option
18 | from labml_helpers.module import Module
19 | from labml_nn.experiments.mnist import MNISTConfigs
20 | from labml_nn.normalization.batch_norm import BatchNorm
21 |
22 |
23 | class Model(Module):
24 | """
25 | ### Model definition
26 | """
27 |
28 | def __init__(self):
29 | super().__init__()
30 |         # Note that we omit the bias parameter; batch normalization cancels it and adds its own learned shift
31 | self.conv1 = nn.Conv2d(1, 20, 5, 1, bias=False)
32 | # Batch normalization with 20 channels (output of convolution layer).
33 | # The input to this layer will have shape `[batch_size, 20, height(24), width(24)]`
34 | self.bn1 = BatchNorm(20)
35 | #
36 | self.conv2 = nn.Conv2d(20, 50, 5, 1, bias=False)
37 | # Batch normalization with 50 channels.
38 | # The input to this layer will have shape `[batch_size, 50, height(8), width(8)]`
39 | self.bn2 = BatchNorm(50)
40 | #
41 | self.fc1 = nn.Linear(4 * 4 * 50, 500, bias=False)
42 | # Batch normalization with 500 channels (output of fully connected layer).
43 | # The input to this layer will have shape `[batch_size, 500]`
44 | self.bn3 = BatchNorm(500)
45 | #
46 | self.fc2 = nn.Linear(500, 10)
47 |
48 | def __call__(self, x: torch.Tensor):
49 | x = F.relu(self.bn1(self.conv1(x)))
50 | x = F.max_pool2d(x, 2, 2)
51 | x = F.relu(self.bn2(self.conv2(x)))
52 | x = F.max_pool2d(x, 2, 2)
53 | x = x.view(-1, 4 * 4 * 50)
54 | x = F.relu(self.bn3(self.fc1(x)))
55 | return self.fc2(x)
56 |
57 |
58 | @option(MNISTConfigs.model)
59 | def model(c: MNISTConfigs):
60 | """
61 | ### Create model
62 |
63 | We use [`MNISTConfigs`](../../experiments/mnist.html#MNISTConfigs) configurations
64 | and set a new function to calculate the model.
65 | """
66 | return Model().to(c.device)
67 |
68 |
69 | def main():
70 | # Create experiment
71 | experiment.create(name='mnist_batch_norm')
72 | # Create configurations
73 | conf = MNISTConfigs()
74 | # Load configurations
75 | experiment.configs(conf, {'optimizer.optimizer': 'Adam'})
76 | # Start the experiment and run the training loop
77 | with experiment.start():
78 | conf.run()
79 |
80 |
81 | #
82 | if __name__ == '__main__':
83 | main()
84 |
--------------------------------------------------------------------------------
/labml_nn/normalization/instance_norm/experiment.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: CIFAR10 Experiment to try Instance Normalization
4 | summary: >
5 |   This trains a simple convolutional neural network that uses instance normalization
6 | to classify CIFAR10 images.
7 | ---
8 |
9 | # CIFAR10 Experiment for Instance Normalization
10 |
11 | This demonstrates the use of an instance normalization layer in a convolutional
12 | neural network for classification. Not that instance normalization was designed for
13 | style transfer and this is only a demo.
14 | """
15 |
16 | import torch.nn as nn
17 |
18 | from labml import experiment
19 | from labml.configs import option
20 | from labml_helpers.module import Module
21 | from labml_nn.experiments.cifar10 import CIFAR10Configs
22 | from labml_nn.normalization.instance_norm import InstanceNorm
23 |
24 |
25 | class Model(Module):
26 | """
27 | ### VGG model for CIFAR-10 classification
28 | """
29 |
30 | def __init__(self):
31 | super().__init__()
32 | layers = []
33 | # RGB channels
34 | in_channels = 3
35 | # Number of channels in each layer in each block
36 | for block in [[64, 64], [128, 128], [256, 256, 256], [512, 512, 512], [512, 512, 512]]:
37 | # Convolution, Normalization and Activation layers
38 | for channels in block:
39 | layers += [nn.Conv2d(in_channels, channels, kernel_size=3, padding=1),
40 | InstanceNorm(channels),
41 | nn.ReLU(inplace=True)]
42 | in_channels = channels
43 | # Max pooling at end of each block
44 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
45 |
46 | # Create a sequential model with the layers
47 | self.layers = nn.Sequential(*layers)
48 | # Final logits layer
49 | self.fc = nn.Linear(512, 10)
50 |
51 | def __call__(self, x):
52 | # The VGG layers
53 | x = self.layers(x)
54 | # Reshape for classification layer
55 | x = x.view(x.shape[0], -1)
56 | # Final linear layer
57 | return self.fc(x)
58 |
59 |
60 | @option(CIFAR10Configs.model)
61 | def model(c: CIFAR10Configs):
62 | """
63 | ### Create model
64 | """
65 | return Model().to(c.device)
66 |
67 |
68 | def main():
69 | # Create experiment
70 | experiment.create(name='cifar10', comment='instance norm')
71 | # Create configurations
72 | conf = CIFAR10Configs()
73 | # Load configurations
74 | experiment.configs(conf, {
75 | 'optimizer.optimizer': 'Adam',
76 | 'optimizer.learning_rate': 2.5e-4,
77 | })
78 | # Start the experiment and run the training loop
79 | with experiment.start():
80 | conf.run()
81 |
82 |
83 | #
84 | if __name__ == '__main__':
85 | main()
86 |
--------------------------------------------------------------------------------
/labml_nn/resnets/resnet_net.py:
--------------------------------------------------------------------------------
1 | #!/bin/python
2 |
3 | # Custom classes
4 | from models.mlp import MLP
5 | from utils.train import Trainer
6 | from models.resnet import *
7 |
8 | # GPU Check
9 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
10 | print("Device: " + str(device))
11 |
12 | #Use different train/test data augmentations
13 | transform_test = transforms.Compose(
14 | [transforms.ToTensor(),
15 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
16 |
17 | transform_train = transforms.Compose([
18 | transforms.RandomHorizontalFlip(p=1.0),
19 | transforms.RandomRotation(20),
20 | transforms.RandomCrop(32, (2, 2), pad_if_needed=False, padding_mode='constant'),
21 | transforms.ToTensor(),
22 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
23 |
24 |
25 | # Get Cifar 10 Datasets
26 | save='./data/Cifar10'
27 | trainset = torchvision.datasets.CIFAR10(root=save, train=True, download=True, transform=transform_train)
28 | testset = torchvision.datasets.CIFAR10(root=save, train=False, download=True, transform=transform_test)
29 |
30 | # Get Cifar 10 Dataloaders
31 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=64,
32 | shuffle=True, num_workers=4)
33 |
34 | testloader = torch.utils.data.DataLoader(testset, batch_size=64,
35 | shuffle=False, num_workers=4)
36 |
37 | epochs = 50
38 |
39 | #################################
40 | # Create the assignment Resnet (part a)
41 | #################################
42 | def MyResNet():
43 | resnet = ResNet(in_features= [32, 32, 3],
44 | num_class=10,
45 | feature_channel_list = [128, 256, 512],
46 | batch_norm= True,
47 | num_stacks=1
48 | )
49 |
50 | # Create MLP
51 | # Calculate the input shape
52 | s = resnet.GetCurShape()
53 | in_features = s[0]*s[1]*s[2]
54 |
55 | mlp = MLP(in_features,
56 | 10,
57 | [], #512, 1024, 512
58 | [],
59 | use_batch_norm=False,
60 | use_dropout=False,
61 | use_softmax=False,
62 | device=device)
63 |
64 | resnet.AddMLP(mlp)
65 | return resnet
66 |
67 | model = MyResNet()
68 | model.to(device=device)
69 | summary(model, (3, 32,32))
70 |
71 | # Optimizer
72 | opt = torch.optim.Adam(model.parameters(), lr=0.0005, betas=(0.9, 0.95), weight_decay=1e-8) #0.0005 l2_factor.item()
73 |
74 | # Loss function
75 | cost = nn.CrossEntropyLoss()
76 |
77 | # Create a trainer
78 | trainer = Trainer(model, opt, cost, name="MyResNet", device=device, use_lr_schedule =True)
79 |
80 | # Run training
81 | trainer.Train(trainloader, epochs, testloader=testloader)
82 |
83 | print('done')
84 |
--------------------------------------------------------------------------------
/labml_nn/resnets/models/mlp.py:
--------------------------------------------------------------------------------
1 | #!/bin/python
2 |
3 | import torch
4 | import torch.nn as nn
5 |
6 | class MLP(nn.Module):
7 | def __init__(self
8 | , in_features
9 | , out_features
10 | , hidden_layers
11 | , actv_func
12 | , pre_module_list=None
13 | , use_dropout=False
14 | , use_batch_norm=False
15 | , use_softmax=True
16 | , device="cpu"
17 | ):
18 | super(MLP, self).__init__()
19 |
20 | self.in_features = in_features
21 | self.out_features = out_features
22 | self.num_hidden_layers = len(hidden_layers)
23 | self.hidden_layers = hidden_layers
24 | self.use_dropout = use_dropout
25 | self.use_batch_norm = use_batch_norm
26 | self.actv_func = actv_func
27 | self.use_softmax = use_softmax
28 |
29 | self.device = device
30 |
31 | # Add on to another model
32 | if pre_module_list:
33 | self.module_list = pre_module_list
34 | else:
35 | self.module_list = nn.ModuleList()
36 |
37 | self.build_()
38 |
39 | # Send to gpu
40 | self.to(self.device)
41 |
42 | def build_(self):
43 | # Activation Functions for Fully connected layers #
44 | # Start with input dimensions
45 | dim = self.in_features
46 | for i in range(self.num_hidden_layers):
47 | # Create a fully connected layer between the last layer
48 | # and the current hidden layer
49 | self.module_list.append(nn.Linear(dim, self.hidden_layers[i]))
50 | # Update the current dimension
51 | dim = self.hidden_layers[i]
52 |
53 | if self.use_batch_norm:
54 | self.module_list.append( nn.BatchNorm1d(dim, affine=True) )
55 |
56 | # Add the Activation function
57 | self.module_list.append( self.GetActivation(name=self.actv_func[i]) )
58 |
59 | if self.use_dropout:
60 | self.module_list.append( nn.Dropout(p=0.10) )
61 |
62 | # Fully connect to output dimensions
63 | if dim != self.out_features:
64 | self.module_list.append( nn.Linear(dim, self.out_features) )
65 |
66 |
67 | def forward(self, x):
68 | # Flatten the 2d image into 1d
69 | # Also convert into float for FC layer
70 | x = torch.flatten(x.float(), start_dim=1)
71 |
72 | # Apply each layer in the module list
73 | for i in range( len(self.module_list) ):
74 | x = self.module_list[i](x)
75 |
76 | return x
77 |
78 | def GetActivation(self, name="relu"):
79 | if name == "relu":
80 | return nn.ReLU()
81 | elif name == "leakyrelu":
82 | return nn.LeakyReLU()
83 | elif name == "Sigmoid":
84 | return nn.Sigmoid()
85 | elif name == "Tanh":
86 | return nn.Tanh()
87 | elif name == "Identity":
88 | return nn.Identity()
89 | else:
90 | return nn.ReLU()
--------------------------------------------------------------------------------
/labml_nn/gan/wasserstein/gradient_penalty/experiment.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: WGAN-GP experiment with MNIST
4 | summary: This experiment generates MNIST images using convolutional neural network.
5 | ---
6 |
7 | # WGAN-GP experiment with MNIST
8 | """
9 |
10 | import torch
11 |
12 | from labml import experiment, tracker
13 | # Import configurations from [Wasserstein experiment](../experiment.html)
14 | from labml_nn.gan.wasserstein.experiment import Configs as OriginalConfigs
15 | #
16 | from labml_nn.gan.wasserstein.gradient_penalty import GradientPenalty
17 |
18 |
19 | class Configs(OriginalConfigs):
20 | """
21 | ## Configuration class
22 |
23 |     We extend the [Wasserstein GAN experiment](../experiment.html) and override the discriminator (critic) loss
24 | calculation to include gradient penalty.
25 | """
26 |
27 | # Gradient penalty coefficient $\lambda$
28 | gradient_penalty_coefficient: float = 10.0
29 | #
30 | gradient_penalty = GradientPenalty()
31 |
32 | def calc_discriminator_loss(self, data: torch.Tensor):
33 | """
34 | This overrides the original discriminator loss calculation and
35 | includes gradient penalty.
36 | """
37 | # Require gradients on $x$ to calculate gradient penalty
38 | data.requires_grad_()
39 | # Sample $z \sim p(z)$
40 | latent = self.sample_z(data.shape[0])
41 | # $D(x)$
42 | f_real = self.discriminator(data)
43 | # $D(G_\theta(z))$
44 | f_fake = self.discriminator(self.generator(latent).detach())
45 | # Get discriminator losses
46 | loss_true, loss_false = self.discriminator_loss(f_real, f_fake)
47 | # Calculate gradient penalties in training mode
48 | if self.mode.is_train:
49 | gradient_penalty = self.gradient_penalty(data, f_real)
50 | tracker.add("loss.gp.", gradient_penalty)
51 | loss = loss_true + loss_false + self.gradient_penalty_coefficient * gradient_penalty
52 | # Skip gradient penalty otherwise
53 | else:
54 | loss = loss_true + loss_false
55 |
56 | # Log stuff
57 | tracker.add("loss.discriminator.true.", loss_true)
58 | tracker.add("loss.discriminator.false.", loss_false)
59 | tracker.add("loss.discriminator.", loss)
60 |
61 | return loss
62 |
63 |
64 | def main():
65 | # Create configs object
66 | conf = Configs()
67 | # Create experiment
68 | experiment.create(name='mnist_wassertein_gp_dcgan')
69 | # Override configurations
70 | experiment.configs(conf,
71 | {
72 | 'discriminator': 'cnn',
73 | 'generator': 'cnn',
74 | 'label_smoothing': 0.01,
75 | 'generator_loss': 'wasserstein',
76 | 'discriminator_loss': 'wasserstein',
77 | 'discriminator_k': 5,
78 | })
79 |
80 | # Start the experiment and run training loop
81 | with experiment.start():
82 | conf.run()
83 |
84 |
85 | if __name__ == '__main__':
86 | main()
87 |
--------------------------------------------------------------------------------
/labml_nn/gan/wasserstein/gradient_penalty/__init__.py:
--------------------------------------------------------------------------------
1 | r"""
2 | ---
3 | title: Gradient Penalty for Wasserstein GAN (WGAN-GP)
4 | summary: >
5 | An annotated PyTorch implementation/tutorial of
6 | Improved Training of Wasserstein GANs.
7 | ---
8 |
9 | # Gradient Penalty for Wasserstein GAN (WGAN-GP)
10 |
11 | This is an implementation of
12 | [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028).
13 |
14 | [WGAN](../index.html) suggests clipping weights to enforce Lipschitz constraint
15 | on the discriminator network (critic).
16 | This and other weight constraints like L2 norm clipping, weight normalization,
17 | L1, L2 weight decay have problems:
18 |
19 | 1. Limiting the capacity of the discriminator
20 | 2. Exploding and vanishing gradients (without [Batch Normalization](../../../normalization/batch_norm/index.html)).
21 |
22 | The paper [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028)
23 | proposes a better way to enforce the Lipschitz constraint: a gradient penalty.
24 |
25 | $$\mathcal{L}_{GP} = \lambda \underset{\hat{x} \sim \mathbb{P}_{\hat{x}}}{\mathbb{E}}
26 | \Big[ \big(\Vert \nabla_{\hat{x}} D(\hat{x}) \Vert_2 - 1\big)^2 \Big]
27 | $$
28 |
29 | where $\lambda$ is the penalty weight and
30 |
31 | \begin{align}
32 | x &\sim \mathbb{P}_r \\
33 | z &\sim p(z) \\
34 | \epsilon &\sim U[0,1] \\
35 | \tilde{x} &\leftarrow G_\theta (z) \\
36 | \hat{x} &\leftarrow \epsilon x + (1 - \epsilon) \tilde{x}
37 | \end{align}
38 |
39 | That is we try to keep the gradient norm $\Vert \nabla_{\hat{x}} D(\hat{x}) \Vert_2$ close to $1$.
40 |
41 | In this implementation we set $\epsilon = 1$.
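
For reference, a sketch of the general interpolation (the implementation below skips this because setting $\epsilon = 1$ gives $\hat{x} = x$):

```python
import torch

def interpolate(x_real: torch.Tensor, x_fake: torch.Tensor) -> torch.Tensor:
    # One epsilon ~ U[0, 1] per sample, broadcast over the remaining dimensions
    eps = torch.rand(x_real.shape[0], *([1] * (x_real.dim() - 1)), device=x_real.device)
    return (eps * x_real + (1 - eps) * x_fake).requires_grad_(True)
```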
42 |
43 | Here is the [code for an experiment](experiment.html) that uses gradient penalty.
44 | """
45 |
46 | import torch
47 | import torch.autograd
48 |
49 | from labml_helpers.module import Module
50 |
51 |
52 | class GradientPenalty(Module):
53 | """
54 | ## Gradient Penalty
55 | """
56 |
57 | def __call__(self, x: torch.Tensor, f: torch.Tensor):
58 | """
59 | * `x` is $x \sim \mathbb{P}_r$
60 | * `f` is $D(x)$
61 |
62 | $\hat{x} \leftarrow x$
63 | since we set $\epsilon = 1$ for this implementation.
64 | """
65 |
66 | # Get batch size
67 | batch_size = x.shape[0]
68 |
69 | # Calculate gradients of $D(x)$ with respect to $x$.
70 | # `grad_outputs` is set to ones since we want the gradients of $D(x)$,
71 | # and we need to create and retain graph since we have to compute gradients
72 | # with respect to weight on this loss.
73 | gradients, *_ = torch.autograd.grad(outputs=f,
74 | inputs=x,
75 | grad_outputs=f.new_ones(f.shape),
76 | create_graph=True)
77 |
78 | # Reshape gradients to calculate the norm
79 | gradients = gradients.reshape(batch_size, -1)
80 | # Calculate the norm $\Vert \nabla_{\hat{x}} D(\hat{x}) \Vert_2$
81 | norm = gradients.norm(2, dim=-1)
82 | # Return the loss $\big(\Vert \nabla_{\hat{x}} D(\hat{x}) \Vert_2 - 1\big)^2$
83 | return torch.mean((norm - 1) ** 2)
84 |
--------------------------------------------------------------------------------
/labml_nn/transformers/mlm/readme.md:
--------------------------------------------------------------------------------
1 | # [Masked Language Model (MLM)](https://nn.labml.ai/transformers/mlm/index.html)
2 |
3 | This is a [PyTorch](https://pytorch.org) implementation of Masked Language Model (MLM)
4 | used to pre-train the BERT model introduced in the paper
5 | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805).
6 |
7 | ## BERT Pretraining
8 |
9 | BERT model is a transformer model.
10 | The paper pre-trains the model using MLM and next sentence prediction.
11 | We have only implemented MLM here.
12 |
13 | ### Next sentence prediction
14 |
15 | In *next sentence prediction*, the model is given two sentences `A` and `B` and the model
16 | makes a binary prediction whether `B` is the sentence that follows `A` in the actual text.
17 | The model is fed with actual sentence pairs 50% of the time and random pairs 50% of the time.
18 | This classification is done while applying MLM. *We haven't implemented this here.*
19 |
20 | ## Masked LM
21 |
22 | This masks a percentage of tokens at random and trains the model to predict
23 | the masked tokens.
24 | They **mask 15% of the tokens** by replacing them with a special `[MASK]` token.
25 |
26 | The loss is computed on predicting the masked tokens only.
27 | This causes a problem during fine-tuning and actual usage since there are no `[MASK]` tokens
28 | at that time.
29 | Therefore we might not get any meaningful representations.
30 |
31 | To overcome this **10% of the masked tokens are replaced with the original token**,
32 | and another **10% of the masked tokens are replaced with a random token**.
33 | This trains the model to give representations about the actual token whether or not the
34 | input token at that position is a `[MASK]`.
35 | And replacing with a random token causes it to
36 | give a representation that has information from the context as well;
37 | because it has to use the context to fix randomly replaced tokens.
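
A rough sketch of this masking scheme (token ids, the `[MASK]` id, and the vocabulary size are assumed inputs; the actual implementation may differ):

```python
import torch

def mask_tokens(tokens: torch.Tensor, mask_id: int, n_vocab: int):
    # Select 15% of positions; the loss is computed only on these
    selected = torch.rand(tokens.shape, device=tokens.device) < 0.15
    r = torch.rand(tokens.shape, device=tokens.device)
    masked = tokens.clone()
    # 80% of the selected positions become [MASK]
    masked[selected & (r < 0.8)] = mask_id
    # 10% become a random token
    random_pos = selected & (r >= 0.8) & (r < 0.9)
    masked[random_pos] = torch.randint_like(tokens, n_vocab)[random_pos]
    # The remaining 10% keep the original token
    return masked, selected
```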
38 |
39 | ## Training
40 |
41 | MLMs are harder to train than autoregressive models because they have a smaller training signal;
42 | i.e. only a small percentage of predictions are trained per sample.
43 |
44 | Another problem is that, since the model is bidirectional, any token can see any other token.
45 | This makes "credit assignment" harder.
46 | Let's say you have a character-level model trying to predict `home *s where i want to be`.
47 | At least during the early stages of training, it will be very hard to figure out that the
48 | replacement for `*` should be `i`; it could be almost anything given the whole sentence.
49 | In an autoregressive setting, by contrast, the model only has to use `h` to predict `o`,
50 | `hom` to predict `e`, and so on. So the model starts by predicting with a shorter context
51 | and then learns to use longer contexts later.
52 | Since MLMs have this problem it's a lot faster to train if you start with a smaller sequence length
53 | initially and then use a longer sequence length later.
54 |
55 | Here is [the training code](https://nn.labml.ai/transformers/mlm/experiment.html) for a simple MLM model.
56 |
57 | [](https://app.labml.ai/run/3a6d22b6c67111ebb03d6764d13a38d1)
58 |
--------------------------------------------------------------------------------
/labml_nn/cnn/utils/dataloader.py:
--------------------------------------------------------------------------------
1 | #!/bin/python
2 |
3 | import torch
4 | import torchvision
5 | import torchvision.transforms as transforms
6 | from torch.utils.data import Dataset, random_split
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 |
10 | def LoadCifar10DatasetTrain(save, transform=None):
11 | trainset = torchvision.datasets.CIFAR10(root=save, train=True,
12 | download=True, transform=transform)
13 | return trainset
14 |
15 | def LoadCifar10DatasetTest(save, transform):
16 | return torchvision.datasets.CIFAR10(root=save, train=False,
17 | download=False, transform=transform)
18 |
19 | def GetCustTransform():
20 | transform_train = transforms.Compose([
21 | transforms.RandomRotation(20),
22 | transforms.RandomCrop(32, (2, 2), pad_if_needed=False, padding_mode='constant'),
23 | transforms.ToTensor(),
24 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
25 | return transform_train
26 |
27 | def Dataloader_train_valid(save, batch_size):
28 |
29 | # See utils/dataloader.py for data augmentations
30 | transform_train_valid = GetCustTransform()
31 |
32 | # Get Cifar 10 Datasets
33 | trainset = LoadCifar10DatasetTrain(save, transform_train_valid)
34 | train_val_abs = int(len(trainset) * 0.8)
35 | train_subset, val_subset = random_split(trainset, [train_val_abs, len(trainset) - train_val_abs])
36 |
37 | # Get Cifar 10 Dataloaders
38 | trainloader = torch.utils.data.DataLoader(train_subset, batch_size=batch_size,
39 | shuffle=True, num_workers=4)
40 |
41 | valloader = torch.utils.data.DataLoader(val_subset, batch_size=batch_size,
42 | shuffle=True, num_workers=4)
43 | return trainloader, valloader
44 |
45 | def Dataloader_train(save, batch_size):
46 |
47 | # See utils/dataloader.py for data augmentations
48 | transform_train = GetCustTransform()
49 |
50 | # Get Cifar 10 Datasets
51 | trainset = LoadCifar10DatasetTrain(save, transform_train)
52 | # Get Cifar 10 Dataloaders
53 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
54 | shuffle=True, num_workers=4)
55 |
56 | return trainloader
57 |
58 | def Dataloader_test(save, batch_size):
59 |
60 | # transformation test set
61 | transform_test = transforms.Compose(
62 | [transforms.ToTensor(),
63 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
64 |
65 | # initialize test dataset and dataloader
66 | testset = LoadCifar10DatasetTest(save, transform_test)
67 | testloader = torch.utils.data.DataLoader(testset, batch_size=64,
68 | shuffle=False, num_workers=4)
69 |
70 | return testloader
71 |
72 | def imshow(im):
73 | image = im.cpu().clone().detach().numpy()
74 | image = image.transpose(1, 2, 0)
75 | image = image * np.array((0.5, 0.5, 0.5)) + np.array((0.5, 0.5, 0.5)) # unnormalize
76 | plt.imshow(image)
77 | plt.show()
78 |
79 | def imretrun(im):
80 | image = im.cpu().clone().detach().numpy()
81 | image = image.transpose(1, 2, 0)
82 | image = image * np.array((0.5, 0.5, 0.5)) + np.array((0.5, 0.5, 0.5)) # unnormalize
83 | return image
--------------------------------------------------------------------------------
/labml_nn/hypernetworks/experiment.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from labml import experiment
4 | from labml.configs import option
5 | from labml.utils.pytorch import get_modules
6 | from labml_helpers.module import Module
7 |
8 | from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
9 | from labml_nn.hypernetworks.hyper_lstm import HyperLSTM
10 | from labml_nn.lstm import LSTM
11 |
12 |
13 | class AutoregressiveModel(Module):
14 | """
15 | ## Auto regressive model
16 | """
17 |
18 | def __init__(self, n_vocab: int, d_model: int, rnn_model: Module):
19 | super().__init__()
20 | # Token embedding module
21 | self.src_embed = nn.Embedding(n_vocab, d_model)
22 | self.lstm = rnn_model
23 | self.generator = nn.Linear(d_model, n_vocab)
24 |
25 | def __call__(self, x: torch.Tensor):
26 | x = self.src_embed(x)
27 |         # Run the embeddings through the recurrent model (LSTM or HyperLSTM)
28 | res, state = self.lstm(x)
29 | # Generate logits of the next token
30 | return self.generator(res), state
31 |
32 |
33 | class Configs(NLPAutoRegressionConfigs):
34 | """
35 | ## Configurations
36 |
37 |     The default configs can and will be overridden when we start the experiment
38 | """
39 |
40 | model: AutoregressiveModel
41 | rnn_model: Module
42 |
43 | d_model: int = 512
44 | n_rhn: int = 16
45 | n_z: int = 16
46 |
47 |
48 | @option(Configs.model)
49 | def autoregressive_model(c: Configs):
50 | """
51 | Initialize the auto-regressive model
52 | """
53 | m = AutoregressiveModel(c.n_tokens, c.d_model, c.rnn_model)
54 | return m.to(c.device)
55 |
56 |
57 | @option(Configs.rnn_model)
58 | def hyper_lstm(c: Configs):
59 | return HyperLSTM(c.d_model, c.d_model, c.n_rhn, c.n_z, 1)
60 |
61 |
62 | @option(Configs.rnn_model)
63 | def lstm(c: Configs):
64 | return LSTM(c.d_model, c.d_model, 1)
65 |
66 |
67 | def main():
68 | # Create experiment
69 | experiment.create(name="hyper_lstm", comment='')
70 | # Create configs
71 | conf = Configs()
72 | # Load configurations
73 | experiment.configs(conf,
74 | # A dictionary of configurations to override
75 | {'tokenizer': 'character',
76 | 'text': 'tiny_shakespeare',
77 | 'optimizer.learning_rate': 2.5e-4,
78 | 'optimizer.optimizer': 'Adam',
79 | 'prompt': 'It is',
80 | 'prompt_separator': '',
81 |
82 | 'rnn_model': 'hyper_lstm',
83 |
84 | 'train_loader': 'shuffled_train_loader',
85 | 'valid_loader': 'shuffled_valid_loader',
86 |
87 | 'seq_len': 512,
88 | 'epochs': 128,
89 | 'batch_size': 2,
90 | 'inner_iterations': 25})
91 |
92 | # Set models for saving and loading
93 | experiment.add_pytorch_models(get_modules(conf))
94 |
95 | # Start the experiment
96 | with experiment.start():
97 | # `TrainValidConfigs.run`
98 | conf.run()
99 |
100 |
101 | if __name__ == '__main__':
102 | main()
103 |
--------------------------------------------------------------------------------
/labml_nn/rl/ppo/gae.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Generalized Advantage Estimation (GAE)
4 | summary: A PyTorch implementation/tutorial of Generalized Advantage Estimation (GAE).
5 | ---
6 |
7 | # Generalized Advantage Estimation (GAE)
8 |
9 | This is a [PyTorch](https://pytorch.org) implementation of paper
10 | [Generalized Advantage Estimation](https://arxiv.org/abs/1506.02438).
11 |
12 | You can find an experiment that uses it [here](experiment.html).
13 | """
14 |
15 | import numpy as np
16 |
17 |
18 | class GAE:
19 | def __init__(self, n_workers: int, worker_steps: int, gamma: float, lambda_: float):
20 | self.lambda_ = lambda_
21 | self.gamma = gamma
22 | self.worker_steps = worker_steps
23 | self.n_workers = n_workers
24 |
25 | def __call__(self, done: np.ndarray, rewards: np.ndarray, values: np.ndarray) -> np.ndarray:
26 | """
27 | ### Calculate advantages
28 | \begin{align}
29 |         \hat{A_t^{(1)}} &= r_t + \gamma V(s_{t+1}) - V(s_t)
30 |         \\
31 |         \hat{A_t^{(2)}} &= r_t + \gamma r_{t+1} + \gamma^2 V(s_{t+2}) - V(s_t)
32 |         \\
33 |         ...
34 |         \\
35 |         \hat{A_t^{(\infty)}} &= r_t + \gamma r_{t+1} + \gamma^2 r_{t+2} + ... - V(s_t)
36 | \end{align}
37 |
38 | $\hat{A_t^{(1)}}$ is high bias, low variance, whilst
39 | $\hat{A_t^{(\infty)}}$ is unbiased, high variance.
40 |
41 | We take a weighted average of $\hat{A_t^{(k)}}$ to balance bias and variance.
42 | This is called Generalized Advantage Estimation.
43 | $$\hat{A_t} = \hat{A_t^{GAE}} = \sum_k w_k \hat{A_t^{(k)}}$$
44 | We set $w_k = \lambda^{k-1}$, this gives clean calculation for
45 | $\hat{A_t}$
46 |
47 | \begin{align}
48 |         \delta_t &= r_t + \gamma V(s_{t+1}) - V(s_t)
49 |         \\
50 |         \hat{A_t} &= \delta_t + \gamma \lambda \delta_{t+1} + ... +
51 |         (\gamma \lambda)^{T - t - 1} \delta_{T - 1}
52 | \\
53 | &= \delta_t + \gamma \lambda \hat{A_{t+1}}
54 | \end{align}
55 | """
56 |
57 | # advantages table
58 | advantages = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32)
59 | last_advantage = 0
60 |
61 | # $V(s_{t+1})$
62 | last_value = values[:, -1]
63 |
64 | for t in reversed(range(self.worker_steps)):
65 | # mask if episode completed after step $t$
66 | mask = 1.0 - done[:, t]
67 | last_value = last_value * mask
68 | last_advantage = last_advantage * mask
69 | # $\delta_t$
70 | delta = rewards[:, t] + self.gamma * last_value - values[:, t]
71 |
72 | # $\hat{A_t} = \delta_t + \gamma \lambda \hat{A_{t+1}}$
73 | last_advantage = delta + self.gamma * self.lambda_ * last_advantage
74 |
75 | # note that we are collecting in reverse order.
76 | # *My initial code was appending to a list and
77 | # I forgot to reverse it later.
78 | # It took me around 4 to 5 hours to find the bug.
79 | # The performance of the model was improving
80 | # slightly during initial runs,
81 | # probably because the samples are similar.*
82 | advantages[:, t] = last_advantage
83 |
84 | last_value = values[:, t]
85 |
86 | return advantages
87 |
--------------------------------------------------------------------------------
/labml_nn/transformers/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Transformers
4 | summary: >
5 | This is a collection of PyTorch implementations/tutorials of
6 | transformers and related techniques.
7 | ---
8 |
9 | # Transformers
10 |
11 | This module contains [PyTorch](https://pytorch.org/)
12 | implementations and explanations of original transformer
13 | from paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762),
14 | and derivatives and enhancements of it.
15 |
16 | * [Multi-head attention](mha.html)
17 | * [Transformer Encoder and Decoder Models](models.html)
18 | * [Fixed positional encoding](positional_encoding.html)
19 |
20 | ## [Transformer XL](xl/index.html)
21 | This implements Transformer XL model using
22 | [relative multi-head attention](xl/relative_mha.html)
23 |
24 | ## [Compressive Transformer](compressive/index.html)
25 |
26 | This is an implementation of compressive transformer
27 | that extends upon [Transformer XL](xl/index.html) by compressing
28 | oldest memories to give a longer attention span.
29 |
30 | ## [GPT Architecture](gpt/index.html)
31 |
32 | This is an implementation of GPT-2 architecture.
33 |
34 | ## [GLU Variants](glu_variants/simple.html)
35 |
36 | This is an implementation of the paper
37 | [GLU Variants Improve Transformer](https://arxiv.org/abs/2002.05202).
38 |
39 | ## [kNN-LM](knn/index.html)
40 |
41 | This is an implementation of the paper
42 | [Generalization through Memorization: Nearest Neighbor Language Models](https://arxiv.org/abs/1911.00172).
43 |
44 | ## [Feedback Transformer](feedback/index.html)
45 |
46 | This is an implementation of the paper
47 | [Accessing Higher-level Representations in Sequential Transformers with Feedback Memory](https://arxiv.org/abs/2002.09402).
48 |
49 | ## [Switch Transformer](switch/index.html)
50 |
51 | This is a miniature implementation of the paper
52 | [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961).
53 | Our implementation only has a few million parameters and doesn't do model parallel distributed training.
54 | It does single GPU training but we implement the concept of switching as described in the paper.
55 |
56 | ## [Fast Weights Transformer](fast_weights/index.html)
57 |
58 | This is an implementation of the paper
59 | [Linear Transformers Are Secretly Fast Weight Memory Systems](https://arxiv.org/abs/2102.11174) in PyTorch.
60 |
61 | ## [FNet: Mixing Tokens with Fourier Transforms](fnet/index.html)
62 |
63 | This is an implementation of the paper
64 | [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824).
65 |
66 | ## [Attention Free Transformer](aft/index.html)
67 |
68 | This is an implementation of the paper
69 | [An Attention Free Transformer](https://papers.labml.ai/paper/2105.14103).
70 |
71 | ## [Masked Language Model](mlm/index.html)
72 |
73 | This is an implementation of Masked Language Model used for pre-training in paper
74 | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805).
75 |
76 | ## [Pay Attention to MLPs (gMLP)](gmlp/index.html)
77 |
78 | This is an implementation of the paper
79 | [Pay Attention to MLPs](https://papers.labml.ai/paper/2105.08050).
80 | """
81 |
82 | from .configs import TransformerConfigs
83 | from .models import TransformerLayer, Encoder, Decoder, Generator, EncoderDecoder
84 | from .mha import MultiHeadAttention
85 | from labml_nn.transformers.xl.relative_mha import RelativeMultiHeadAttention
86 |
--------------------------------------------------------------------------------
/labml_nn/optimizers/noam.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Noam optimizer from Attention is All You Need paper
4 | summary: >
5 | This is a tutorial/implementation of Noam optimizer.
6 | Noam optimizer has a warm-up period and then an exponentially decaying learning rate.
7 | ---
8 |
9 | # Noam Optimizer
10 |
11 | This is the [PyTorch](https://pytorch.org) implementation of optimizer introduced in the paper
12 | [Attention Is All You Need](https://arxiv.org/abs/1706.03762).
13 | """
14 | from typing import Dict
15 |
16 | from labml_nn.optimizers import WeightDecay
17 | from labml_nn.optimizers.amsgrad import AMSGrad
18 |
19 |
20 | class Noam(AMSGrad):
21 | """
22 | ## Noam Optimizer
23 |
24 |     This class extends from the AMSGrad optimizer defined in [`amsgrad.py`](amsgrad.html).
25 | """
26 |
27 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
28 | weight_decay: WeightDecay = WeightDecay(),
29 | optimized_update: bool = True,
30 | amsgrad=False,
31 | warmup=0, d_model=512, defaults=None):
32 | """
33 | ### Initialize the optimizer
34 |
35 | * `params` is the list of parameters
36 | * `lr` is the learning rate $\alpha$
37 | * `betas` is a tuple of ($\beta_1$, $\beta_2$)
38 | * `eps` is $\hat{\epsilon}$ or $\epsilon$ based on `optimized_update`
39 | * `weight_decay` is an instance of class `WeightDecay` defined in [`__init__.py`](index.html)
40 | * `optimized_update` is a flag whether to optimize the bias correction of the second moment
41 | by doing it after adding $\epsilon$
42 | * `amsgrad` is a flag indicating whether to use AMSGrad or fallback to plain Adam
43 | * `warmup` is the number of warmup steps
44 | * `d_model` is the model size; i.e. the number of dimensions in the transformer
45 | * `defaults` is a dictionary of defaults for group values.
46 | This is useful when you want to extend the class `Noam`.
47 | """
48 |
49 | defaults = {} if defaults is None else defaults
50 | defaults.update(dict(warmup=warmup))
51 | super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults)
52 | self.d_model = d_model
53 |
54 | def get_lr(self, state: Dict[str, any], group: Dict[str, any]):
55 | """
56 | ### Get learning-rate
57 |
58 | $$\alpha \frac{1}{\sqrt{d_{model}}} \min \bigg(\frac{1}{\sqrt{t}}, \frac{t}{w^{3/2}}\bigg)$$
59 | where $w$ is the number of warmup steps.
60 | """
61 | # $$\min \bigg(\frac{1}{\sqrt{t}}, \frac{t}{w^{3/2}}\bigg)$$
62 | factor = min(state['step'] ** (-0.5), state['step'] * group['warmup'] ** (-1.5))
63 | # $$\alpha \frac{1}{\sqrt{d_{model}}} \min \bigg(\frac{1}{\sqrt{t}}, \frac{t}{w^{3/2}}\bigg)$$
64 | return group['lr'] * self.d_model ** (-0.5) * factor
65 |
66 |
67 | def _test_noam_lr():
68 | """
69 | ### Plot learning rate for different warmups and model sizes
70 |
71 | 
72 | """
73 | import matplotlib.pyplot as plt
74 | import numpy as np
75 | from torch import nn
76 |
77 | model = nn.Linear(10, 10)
78 | opts = [Noam(model.parameters(), d_model=512, warmup=4000, lr=1),
79 | Noam(model.parameters(), d_model=512, warmup=8000, lr=1),
80 | Noam(model.parameters(), d_model=2048, warmup=2000, lr=1)]
81 | plt.plot(np.arange(1, 20000), [[opt.get_lr({'step': i}, opt.defaults) for opt in opts] for i in range(1, 20000)])
82 | plt.legend(["512:4000", "512:8000", "2048:2000"])
83 | plt.title("Learning Rate")
84 | plt.show()
85 |
86 |
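# A quick numeric sanity check of the schedule (an illustrative sketch added here,
# not part of the original module). With `d_model=512`, `warmup=4000` and `lr=1`,
# the learning rate peaks at `step == warmup`, where both arguments of the `min`
# coincide and the rate equals $(d_{model} \cdot w)^{-1/2}$.
def _check_noam_peak():
    from torch import nn

    opt = Noam(nn.Linear(10, 10).parameters(), d_model=512, warmup=4000, lr=1)
    peak = opt.get_lr({'step': 4_000}, opt.defaults)
    # $(512 \cdot 4000)^{-1/2} \approx 6.99 \times 10^{-4}$
    assert abs(peak - (512 * 4_000) ** -0.5) < 1e-9
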
87 | if __name__ == '__main__':
88 | _test_noam_lr()
89 |
--------------------------------------------------------------------------------
/labml_nn/rl/dqn/model.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Deep Q Network (DQN) Model
4 | summary: Implementation of neural network model for Deep Q Network (DQN).
5 | ---
6 |
7 | # Deep Q Network (DQN) Model
8 | """
9 |
10 | import torch
11 | from torch import nn
12 |
13 | from labml_helpers.module import Module
14 |
15 |
16 | class Model(Module):
17 | """
18 | ## Dueling Network ⚔️ Model for $Q$ Values
19 |
20 | We are using a [dueling network](https://arxiv.org/abs/1511.06581)
21 | to calculate Q-values.
22 | The intuition behind the dueling network architecture is that in most states
23 | the action doesn't matter,
24 | while in some states the action is significant. The dueling network allows
25 | this to be represented very well.
26 |
27 | \begin{align}
28 | Q^\pi(s,a) &= V^\pi(s) + A^\pi(s, a)
29 | \\
30 | \mathop{\mathbb{E}}_{a \sim \pi(s)}
31 | \Big[
32 | A^\pi(s, a)
33 | \Big]
34 | &= 0
35 | \end{align}
36 |
37 | So we create two networks for $V$ and $A$ and get $Q$ from them.
38 | $$
39 | Q(s, a) = V(s) +
40 | \Big(
41 | A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a' \in \mathcal{A}} A(s, a')
42 | \Big)
43 | $$
44 | We share the initial layers of the $V$ and $A$ networks.
45 | """
46 |
47 | def __init__(self):
48 | super().__init__()
49 | self.conv = nn.Sequential(
50 | # The first convolution layer takes a
51 | # $84\times84$ frame and produces a $20\times20$ frame
52 | nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
53 | nn.ReLU(),
54 |
55 | # The second convolution layer takes a
56 | # $20\times20$ frame and produces a $9\times9$ frame
57 | nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
58 | nn.ReLU(),
59 |
60 | # The third convolution layer takes a
61 | # $9\times9$ frame and produces a $7\times7$ frame
62 | nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
63 | nn.ReLU(),
64 | )
65 |
66 | # A fully connected layer takes the flattened
67 | # frame from third convolution layer, and outputs
68 | # $512$ features
69 | self.lin = nn.Linear(in_features=7 * 7 * 64, out_features=512)
70 | self.activation = nn.ReLU()
71 |
72 | # This head gives the state value $V$
73 | self.state_value = nn.Sequential(
74 | nn.Linear(in_features=512, out_features=256),
75 | nn.ReLU(),
76 | nn.Linear(in_features=256, out_features=1),
77 | )
78 | # This head gives the action value $A$
79 | self.action_value = nn.Sequential(
80 | nn.Linear(in_features=512, out_features=256),
81 | nn.ReLU(),
82 | nn.Linear(in_features=256, out_features=4),
83 | )
84 |
85 | def __call__(self, obs: torch.Tensor):
86 | # Convolution
87 | h = self.conv(obs)
88 | # Reshape for linear layers
89 | h = h.reshape((-1, 7 * 7 * 64))
90 |
91 | # Linear layer
92 | h = self.activation(self.lin(h))
93 |
94 | # $A$
95 | action_value = self.action_value(h)
96 | # $V$
97 | state_value = self.state_value(h)
98 |
99 | # $A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a' \in \mathcal{A}} A(s, a')$
100 | action_score_centered = action_value - action_value.mean(dim=-1, keepdim=True)
101 | # $Q(s, a) =V(s) + \Big(A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a' \in \mathcal{A}} A(s, a')\Big)$
102 | q = state_value + action_score_centered
103 |
104 | return q
105 |
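# A minimal shape check (added for illustration; not part of the original file):
# feed a batch of four stacked $84 \times 84$ frames and get $Q$ values for the
# four actions the heads above are sized for.
def _check_shapes():
    model = Model()
    obs = torch.zeros(2, 4, 84, 84)
    q = model(obs)
    assert q.shape == (2, 4)
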
--------------------------------------------------------------------------------
/labml_nn/transformers/fnet/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: FNet - Mixing Tokens with Fourier Transforms
4 | summary: >
5 | This is an annotated implementation/tutorial of the FNet in PyTorch.
6 | ---
7 |
8 | # FNet: Mixing Tokens with Fourier Transforms
9 |
10 | This is a [PyTorch](https://pytorch.org) implementation of the paper
11 | [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824).
12 |
13 | This paper replaces the [self-attention layer](../mha.html) with two
14 | [Fourier transforms](https://en.wikipedia.org/wiki/Discrete_Fourier_transform) to
15 | *mix* tokens.
16 | This is $7 \times$ more efficient than self-attention.
17 | It achieves about 92% of the accuracy of
18 | [BERT](https://paperswithcode.com/method/bert) on the
19 | [GLUE benchmark](https://paperswithcode.com/dataset/glue).
20 |
21 | ## Mixing tokens with two Fourier transforms
22 |
23 | We apply Fourier transform along the hidden dimension (embedding dimension)
24 | and then along the sequence dimension.
25 |
26 | $$
27 | \mathcal{R}\big(\mathcal{F}_\text{seq} \big(\mathcal{F}_\text{hidden} (x) \big) \big)
28 | $$
29 |
30 | where $x$ is the embedding input, $\mathcal{F}$ stands for the fourier transform and
31 | $\mathcal{R}$ stands for the real component in complex numbers.
32 |
33 | This is very simple to implement in PyTorch - just one line of code.
34 | The paper suggests using a precomputed DFT matrix and doing matrix multiplication to get the
35 | Fourier transformation.
36 |
37 | Here is [the training code](experiment.html) for using a FNet based model for classifying
38 | [AG News](https://paperswithcode.com/dataset/ag-news).
39 | """
40 |
41 | from typing import Optional
42 |
43 | import torch
44 | from torch import nn
45 |
46 |
47 | class FNetMix(nn.Module):
48 | """
49 | ## FNet - Mix tokens
50 |
51 | This module simply implements
52 | $$
53 | \mathcal{R}\big(\mathcal{F}_\text{seq} \big(\mathcal{F}_\text{hidden} (x) \big) \big)
54 | $$
55 |
56 | The structure of this module is made similar to a [standard attention module](../mha.html) so that we can simply
57 | replace it.
58 | """
59 |
60 | def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: Optional[torch.Tensor] = None):
61 | """
62 | The [normal attention module](../mha.html) can be fed with different token embeddings for
63 | $\text{query}$,$\text{key}$, and $\text{value}$ and a mask.
64 |
65 | We follow the same function signature so that we can replace it directly.
66 |
67 | For FNet mixing, $$x = \text{query} = \text{key} = \text{value}$$ and masking is not possible.
68 | Shape of `query` (and `key` and `value`) is `[seq_len, batch_size, d_model]`.
69 | """
70 |
71 | # $\text{query}$,$\text{key}$, and $\text{value}$ all should be equal to $x$ for token mixing
72 | assert query is key and key is value
73 | # Token mixing doesn't support masking. i.e. all tokens will see all other token embeddings.
74 | assert mask is None
75 |
76 | # Assign to `x` for clarity
77 | x = query
78 |
79 | # Apply the Fourier transform along the hidden (embedding) dimension
80 | # $$\mathcal{F}_\text{hidden} (x)$$
81 | #
82 | # The output of the Fourier transform is a tensor of
83 | # [complex numbers](https://pytorch.org/docs/stable/complex_numbers.html).
84 | fft_hidden = torch.fft.fft(x, dim=2)
85 | # Apply the Fourier transform along the sequence dimension
86 | # $$\mathcal{F}_\text{seq} \big(\mathcal{F}_\text{hidden} (x) \big)$$
87 | fft_seq = torch.fft.fft(fft_hidden, dim=0)
88 |
89 | # Get the real component
90 | # $$\mathcal{R}\big(\mathcal{F}_\text{seq} \big(\mathcal{F}_\text{hidden} (x) \big) \big)$$
91 | return torch.real(fft_seq)
92 |
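# A minimal usage sketch (added for illustration; the sizes are ours): mix a batch of
# token embeddings of shape `[seq_len, batch_size, d_model]`. Since the module mimics
# the attention interface, the same tensor is passed as query, key and value.
def _check_mix():
    mix = FNetMix()
    x = torch.randn(16, 2, 64)
    out = mix(x, x, x)
    assert out.shape == x.shape
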
--------------------------------------------------------------------------------
/labml_nn/experiments/mnist.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: MNIST Experiment
4 | summary: >
5 | This is a reusable trainer for MNIST dataset
6 | ---
7 |
8 | # MNIST Experiment
9 | """
10 |
11 | import torch.nn as nn
12 | import torch.utils.data
13 | from labml_helpers.module import Module
14 |
15 | from labml import tracker
16 | from labml.configs import option
17 | from labml_helpers.datasets.mnist import MNISTConfigs as MNISTDatasetConfigs
18 | from labml_helpers.device import DeviceConfigs
19 | from labml_helpers.metrics.accuracy import Accuracy
20 | from labml_helpers.train_valid import TrainValidConfigs, BatchIndex, hook_model_outputs
21 | from labml_nn.optimizers.configs import OptimizerConfigs
22 |
23 |
24 | class MNISTConfigs(MNISTDatasetConfigs, TrainValidConfigs):
25 | """
26 |
27 | ## Trainer configurations
28 |
29 | """
30 |
31 | # Optimizer
32 | optimizer: torch.optim.Adam
33 | # Training device
34 | device: torch.device = DeviceConfigs()
35 |
36 | # Classification model
37 | model: Module
38 | # Number of epochs to train for
39 | epochs: int = 10
40 |
41 | # Number of times to switch between training and validation within an epoch
42 | inner_iterations = 10
43 |
44 | # Accuracy function
45 | accuracy = Accuracy()
46 | # Loss function
47 | loss_func = nn.CrossEntropyLoss()
48 |
49 | def init(self):
50 | """
51 | ### Initialization
52 | """
53 | # Set tracker configurations
54 | tracker.set_scalar("loss.*", True)
55 | tracker.set_scalar("accuracy.*", True)
56 | # Add a hook to log module outputs
57 | hook_model_outputs(self.mode, self.model, 'model')
58 | # Add accuracy as a state module.
59 | # The name is probably confusing, since it's meant to store
60 | # states between training and validation for RNNs.
61 | # This will keep the accuracy metric stats separate for training and validation.
62 | self.state_modules = [self.accuracy]
63 |
64 | def step(self, batch: any, batch_idx: BatchIndex):
65 | """
66 | ### Training or validation step
67 | """
68 |
69 | # Move data to the device
70 | data, target = batch[0].to(self.device), batch[1].to(self.device)
71 |
72 | # Update global step (number of samples processed) when in training mode
73 | if self.mode.is_train:
74 | tracker.add_global_step(len(data))
75 |
76 | # Whether to capture model outputs
77 | with self.mode.update(is_log_activations=batch_idx.is_last):
78 | # Get model outputs.
79 | output = self.model(data)
80 |
81 | # Calculate and log loss
82 | loss = self.loss_func(output, target)
83 | tracker.add("loss.", loss)
84 |
85 | # Calculate and log accuracy
86 | self.accuracy(output, target)
87 | self.accuracy.track()
88 |
89 | # Train the model
90 | if self.mode.is_train:
91 | # Calculate gradients
92 | loss.backward()
93 | # Take optimizer step
94 | self.optimizer.step()
95 | # Log the model parameters and gradients on last batch of every epoch
96 | if batch_idx.is_last:
97 | tracker.add('model', self.model)
98 | # Clear the gradients
99 | self.optimizer.zero_grad()
100 |
101 | # Save the tracked metrics
102 | tracker.save()
103 |
104 |
105 | @option(MNISTConfigs.optimizer)
106 | def _optimizer(c: MNISTConfigs):
107 | """
108 | ### Default optimizer configurations
109 | """
110 | opt_conf = OptimizerConfigs()
111 | opt_conf.parameters = c.model.parameters()
112 | opt_conf.optimizer = 'Adam'
113 | return opt_conf
114 |
--------------------------------------------------------------------------------
/labml_nn/transformers/gmlp/experiment.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Pay Attention to MLPs (gMLP) Experiment
4 | summary: This experiment trains a gMLP based model on Tiny Shakespeare dataset.
5 | ---
6 |
7 | # [Pay Attention to MLPs (gMLP)](index.html) Experiment
8 |
9 | This is an annotated PyTorch experiment to train a [gMLP model](index.html).
10 | The paper also applies a Stochastic Depth regularization where some layers are removed randomly during training.
11 | We have not implemented that here.
12 |
13 | This is based on
14 | [training loop and configurations for a simple transformer auto-regressive NLP task](../basic/autoregressive_experiment.html).
15 |
16 | [](https://app.labml.ai/run/01bd941ac74c11eb890c1d9196651a4a)
17 | """
18 | from labml import experiment
19 | from labml.configs import option
20 | from labml_nn.transformers import TransformerConfigs
21 | from labml_nn.transformers.basic.autoregressive_experiment import Configs as BasicAutoRegressionConfigs
22 | from labml_nn.transformers.gmlp import GMLPBlock
23 |
24 |
25 | class Configs(BasicAutoRegressionConfigs):
26 | """
27 | ## Configurations
28 |
29 | This inherits from
30 | [training loop and configurations for a simple transformer auto-regressive NLP task](../basic/autoregressive_experiment.html).
31 | """
32 |
33 | # Transformer
34 | transformer: TransformerConfigs = 'gMLP'
35 | # gMLP Block
36 | gmlp: GMLPBlock
37 | # `d_ffn` for gMLP projection layer
38 | d_ffn: int = 2048
39 |
40 |
41 | @option(Configs.gmlp, 'gMLP')
42 | def _gmlp_configs(c: Configs):
43 | """
44 | ### Create a gMLP block
45 | """
46 | return GMLPBlock(c.d_model, c.d_ffn, c.seq_len)
47 |
48 |
49 | @option(Configs.transformer, 'gMLP')
50 | def _transformer_configs(c: Configs):
51 | """
52 | ### Transformer configurations
53 | """
54 |
55 | # We use our
56 | # [configurable transformer implementation](../configs.html#TransformerConfigs)
57 | conf = TransformerConfigs()
58 | # Set the vocabulary sizes for embeddings and generating logits
59 | conf.n_src_vocab = c.n_tokens
60 | conf.n_tgt_vocab = c.n_tokens
61 | # Set model size
62 | conf.d_model = c.d_model
63 | # Replace the encoder layer with a gMLP layer
64 | conf.encoder_layer = c.gmlp
65 |
66 | return conf
67 |
68 |
69 | def main():
70 | # Create experiment
71 | experiment.create(name="gMLP")
72 | # Create configs
73 | conf = Configs()
74 | # Override configurations
75 | experiment.configs(conf, {
76 | # Use character level tokenizer
77 | 'tokenizer': 'character',
78 | # Prompt separator is blank
79 | 'prompt_separator': '',
80 | # Starting prompt for sampling
81 | 'prompt': 'It is ',
82 | # Use Tiny Shakespeare dataset
83 | 'text': 'tiny_shakespeare',
84 |
85 | # Use a context size of $256$
86 | 'seq_len': 256,
87 | # Train for $128$ epochs
88 | 'epochs': 128,
89 | # Batch size $32$
90 | 'batch_size': 32,
91 | # Switch between training and validation for $10$ times
92 | # per epoch
93 | 'inner_iterations': 10,
94 |
95 | # Model size
96 | 'd_model': 512,
97 | 'd_ffn': 2048,
98 |
99 | # Use [Noam optimizer](../../optimizers/noam.html)
100 | 'optimizer.optimizer': 'Noam',
101 | 'optimizer.learning_rate': 1.,
102 | })
103 |
104 | # Set models for saving and loading
105 | experiment.add_pytorch_models({'model': conf.model})
106 |
107 | # Start the experiment
108 | with experiment.start():
109 | # Run training
110 | conf.run()
111 |
112 |
113 | #
114 | if __name__ == '__main__':
115 | main()
116 |
--------------------------------------------------------------------------------
/labml_nn/transformers/feed_forward.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Position-wise Feed-Forward Network (FFN)
4 | summary: Documented reusable implementation of the position wise feedforward network.
5 | ---
6 |
7 | # Position-wise Feed-Forward Network (FFN)
8 |
9 | This is a [PyTorch](https://pytorch.org) implementation
10 | of the position-wise feedforward network used in transformers.
11 |
12 | FFN consists of two fully connected layers.
13 | The number of dimensions in the hidden layer, $d_{ff}$, is generally set to around
14 | four times that of the token embedding, $d_{model}$.
15 | So it is sometimes also called the expand-and-contract network.
16 |
17 | There is an activation at the hidden layer, which is
18 | usually set to ReLU (Rectified Linear Unit) activation, $$\max(0, x)$$
19 |
20 | That is, the FFN function is,
21 | $$FFN(x, W_1, W_2, b_1, b_2) = \max(0, x W_1 + b_1) W_2 + b_2$$
22 | where $W_1$, $W_2$, $b_1$ and $b_2$ are learnable parameters.
23 |
24 | Sometimes the
25 | GELU (Gaussian Error Linear Unit) activation is also used instead of ReLU.
26 | $$x \Phi(x)$$ where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$
27 |
28 | ### Gated Linear Units
29 |
30 | This is a generic implementation that supports different variants including
31 | [Gated Linear Units](https://arxiv.org/abs/2002.05202) (GLU).
32 | We have also implemented experiments on these:
33 |
34 | * [experiment that uses `labml.configs`](glu_variants/experiment.html)
35 | * [simpler version from scratch](glu_variants/simple.html)
36 | """
37 |
38 | import torch
39 | from torch import nn as nn
40 |
41 | from labml_helpers.module import Module
42 |
43 |
44 | class FeedForward(Module):
45 | """
46 | ## FFN module
47 | """
48 |
49 | def __init__(self, d_model: int, d_ff: int,
50 | dropout: float = 0.1,
51 | activation=nn.ReLU(),
52 | is_gated: bool = False,
53 | bias1: bool = True,
54 | bias2: bool = True,
55 | bias_gate: bool = True):
56 | """
57 | * `d_model` is the number of features in a token embedding
58 | * `d_ff` is the number of features in the hidden layer of the FFN
59 | * `dropout` is dropout probability for the hidden layer
60 | * `is_gated` specifies whether the hidden layer is gated
61 | * `bias1` specifies whether the first fully connected layer should have a learnable bias
62 | * `bias2` specifies whether the second fully connected layer should have a learnable bias
63 | * `bias_gate` specifies whether the fully connected layer for the gate should have a learnable bias
64 | """
65 | super().__init__()
66 | # Layer one parameterized by weight $W_1$ and bias $b_1$
67 | self.layer1 = nn.Linear(d_model, d_ff, bias=bias1)
68 | # Layer two parameterized by weight $W_2$ and bias $b_2$
69 | self.layer2 = nn.Linear(d_ff, d_model, bias=bias2)
70 | # Hidden layer dropout
71 | self.dropout = nn.Dropout(dropout)
72 | # Activation function $f$
73 | self.activation = activation
74 | # Whether there is a gate
75 | self.is_gated = is_gated
76 | if is_gated:
77 | # If there is a gate the linear layer to transform inputs to
78 | # be multiplied by the gate, parameterized by weight $V$ and bias $c$
79 | self.linear_v = nn.Linear(d_model, d_ff, bias=bias_gate)
80 |
81 | def forward(self, x: torch.Tensor):
82 | # $f(x W_1 + b_1)$
83 | g = self.activation(self.layer1(x))
84 | # If gated, $f(x W_1 + b_1) \otimes (x V + b) $
85 | if self.is_gated:
86 | x = g * self.linear_v(x)
87 | # Otherwise
88 | else:
89 | x = g
90 | # Apply dropout
91 | x = self.dropout(x)
92 | # $(f(x W_1 + b_1) \otimes (x V + b)) W_2 + b_2$ or $f(x W_1 + b_1) W_2 + b_2$
93 | # depending on whether it is gated
94 | return self.layer2(x)
95 |
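# A minimal usage sketch (added for illustration; the sizes are arbitrary): a
# position-wise FFN with the usual 4x expansion and a GELU activation, applied to a
# `[seq_len, batch_size, d_model]` tensor.
def _check_ffn():
    ffn = FeedForward(d_model=64, d_ff=256, activation=nn.GELU())
    x = torch.randn(10, 2, 64)
    assert ffn(x).shape == x.shape
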
--------------------------------------------------------------------------------
/labml_nn/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | # [labml.ai Annotated PyTorch Paper Implementations](index.html)
3 |
4 | This is a collection of simple PyTorch implementations of
5 | neural networks and related algorithms.
6 | [These implementations](https://github.com/lab-ml/nn) are documented with explanations,
7 | and the [website](index.html)
8 | renders these as side-by-side formatted notes.
9 | We believe these would help you understand these algorithms better.
10 |
11 | We are actively maintaining this repo and adding new
12 | implementations.
13 |
14 | ## Modules
15 |
16 | #### ✨ [Transformers](transformers/index.html)
17 |
18 | * [Multi-headed attention](transformers/mha.html)
19 | * [Transformer building blocks](transformers/models.html)
20 | * [Transformer XL](transformers/xl/index.html)
21 | * [Relative multi-headed attention](transformers/xl/relative_mha.html)
22 | * [Compressive Transformer](transformers/compressive/index.html)
23 | * [GPT Architecture](transformers/gpt/index.html)
24 | * [GLU Variants](transformers/glu_variants/simple.html)
25 | * [kNN-LM: Generalization through Memorization](transformers/knn/index.html)
26 | * [Feedback Transformer](transformers/feedback/index.html)
27 | * [Switch Transformer](transformers/switch/index.html)
28 | * [Fast Weights Transformer](transformers/fast_weights/index.html)
29 | * [FNet](transformers/fnet/index.html)
30 | * [Attention Free Transformer](transformers/aft/index.html)
31 | * [Masked Language Model](transformers/mlm/index.html)
32 | * [Pay Attention to MLPs (gMLP)](transformers/gmlp/index.html)
33 |
34 | #### ✨ [Recurrent Highway Networks](recurrent_highway_networks/index.html)
35 |
36 | #### ✨ [LSTM](lstm/index.html)
37 |
38 | #### ✨ [HyperNetworks - HyperLSTM](hypernetworks/hyper_lstm.html)
39 |
40 | #### ✨ [Capsule Networks](capsule_networks/index.html)
41 |
42 | #### ✨ [Generative Adversarial Networks](gan/index.html)
43 | * [Original GAN](gan/original/index.html)
44 | * [GAN with deep convolutional network](gan/dcgan/index.html)
45 | * [Cycle GAN](gan/cycle_gan/index.html)
46 | * [Wasserstein GAN](gan/wasserstein/index.html)
47 | * [Wasserstein GAN with Gradient Penalty](gan/wasserstein/gradient_penalty/index.html)
48 | * [Style GAN 2](gan/stylegan/index.html)
49 |
50 | #### ✨ [Sketch RNN](sketch_rnn/index.html)
51 |
52 | #### ✨ [Reinforcement Learning](rl/index.html)
53 | * [Proximal Policy Optimization](rl/ppo/index.html) with
54 | [Generalized Advantage Estimation](rl/ppo/gae.html)
55 | * [Deep Q Networks](rl/dqn/index.html) with
56 | [Dueling Network](rl/dqn/model.html),
57 | [Prioritized Replay](rl/dqn/replay_buffer.html)
58 | and Double Q Network.
59 |
60 | #### ✨ [Optimizers](optimizers/index.html)
61 | * [Adam](optimizers/adam.html)
62 | * [AMSGrad](optimizers/amsgrad.html)
63 | * [Adam Optimizer with warmup](optimizers/adam_warmup.html)
64 | * [Noam Optimizer](optimizers/noam.html)
65 | * [Rectified Adam Optimizer](optimizers/radam.html)
66 | * [AdaBelief Optimizer](optimizers/ada_belief.html)
67 |
68 | #### ✨ [Normalization Layers](https://nn.labml.ai/normalization/index.html)
69 | * [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html)
70 | * [Layer Normalization](https://nn.labml.ai/normalization/layer_norm/index.html)
71 | * [Instance Normalization](https://nn.labml.ai/normalization/instance_norm/index.html)
72 | * [Group Normalization](https://nn.labml.ai/normalization/group_norm/index.html)
73 | * [Weight Standardization](https://nn.labml.ai/normalization/weight_standardization/index.html)
74 | * [Batch-Channel Normalization](https://nn.labml.ai/normalization/batch_channel_norm/index.html)
75 |
76 | ### Installation
77 |
78 | ```bash
79 | pip install labml-nn
80 | ```
81 |
82 | ### Citing LabML
83 |
84 | If you use LabML for academic research, please cite the library using the following BibTeX entry.
85 |
86 | ```bibtex
87 | @misc{labml,
88 | author = {Varuna Jayasiri, Nipun Wijerathne},
89 | title = {LabML: A library to organize machine learning experiments},
90 | year = {2020},
91 | url = {https://nn.labml.ai/},
92 | }
93 | ```
94 | """
95 |
--------------------------------------------------------------------------------
/labml_nn/optimizers/adam_warmup_cosine_decay.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Adam optimizer with warm-up and cosine decay
4 | summary: A PyTorch implementation/tutorial of Adam optimizer with warm-up and cosine decay for GPT.
5 | ---
6 |
7 | # Adam Optimizer with Warmup and Cosine Decay
8 |
9 | This extends the [AMSGrad optimizer](amsgrad.html) with a warm-up stage followed by a cosine decay of the learning rate.
10 | """
11 | import math
12 | from typing import Dict
13 |
14 | from labml_nn.optimizers import WeightDecay
15 | from labml_nn.optimizers.amsgrad import AMSGrad
16 |
17 |
18 | class AdamWarmupCosineDecay(AMSGrad):
19 | """
20 |
21 | ## Adam Optimizer with Warmup and Cosine Decay
22 |
23 |
24 | This class extends from AMSGrad optimizer defined in [`amsgrad.py`](amsgrad.html).
25 | """
26 |
27 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
28 | weight_decay: WeightDecay = WeightDecay(),
29 | optimized_update: bool = True,
30 | amsgrad=False, warmup=0, total_steps=1e10, defaults=None):
31 | """
32 | ### Initialize the optimizer
33 |
34 | * `params` is the list of parameters
35 | * `lr` is the learning rate $\alpha$
36 | * `betas` is a tuple of ($\beta_1$, $\beta_2$)
37 | * `eps` is $\hat{\epsilon}$ or $\epsilon$ based on `optimized_update`
38 | * `weight_decay` is an instance of class `WeightDecay` defined in [`__init__.py`](index.html)
39 | * `optimized_update` is a flag whether to optimize the bias correction of the second moment
40 | by doing it after adding $\epsilon$
41 | * `amsgrad` is a flag indicating whether to use AMSGrad or fallback to plain Adam
42 | * `warmup` number of warmup steps
43 | * `total_steps` total number of steps. Cosine decay reaches 0 at this,
44 | but stays at 10% of `lr` because we take $\alpha * \max(0.1, decay)$
45 | * `defaults` is a dictionary of defaults for group values.
46 | This is useful when you want to extend the class `AdamWarmupCosineDecay`.
47 | """
48 |
49 | defaults = {} if defaults is None else defaults
50 | defaults.update(dict(warmup=warmup, total_steps=total_steps))
51 | super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults)
52 |
53 | def get_lr(self, state: Dict[str, any], group: Dict[str, any]):
54 | """
55 | ### Get learning-rate
56 |
57 | During the warm-up, the learning rate increases linearly as $$\alpha \min \bigg(1, \frac{t}{w}\bigg)$$
58 | where $w$ is the number of warmup steps; afterwards it follows a cosine decay down to a floor of $0.1 \alpha$.
59 | """
60 | # If we are in warmup stage
61 | if group['warmup'] > state['step']:
62 | # A linearly increasing learning rate from $0$ to $\alpha$
63 | return 1e-8 + state['step'] * group['lr'] / group['warmup']
64 | else:
65 | # Cosine decay from $\alpha$ down to a floor of $0.1 \alpha$
66 | progress = (state['step'] - group['warmup']) / max(1, group['total_steps'] - group['warmup'])
67 | return group['lr'] * max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
68 |
69 |
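# A quick numeric sanity check of the schedule (an illustrative sketch, not part of the
# original module): at the end of warm-up the learning rate equals `lr`, and at
# `total_steps` the cosine factor bottoms out at the 10% floor.
def _check_schedule():
    from torch import nn

    opt = AdamWarmupCosineDecay(nn.Linear(10, 10).parameters(),
                                lr=1e-4, warmup=5_000, total_steps=100_000)
    assert abs(opt.get_lr({'step': 5_000}, opt.defaults) - 1e-4) < 1e-12
    assert abs(opt.get_lr({'step': 100_000}, opt.defaults) - 1e-5) < 1e-12
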
70 | def _test_lr():
71 | """
72 | ### Plot learning rate for different warmups and model sizes
73 |
74 | 
75 | """
76 | import matplotlib.pyplot as plt
77 | import numpy as np
78 | from torch import nn
79 |
80 | model = nn.Linear(10, 10)
81 | opt = AdamWarmupCosineDecay(model.parameters(), warmup=5000, lr=1e-4, total_steps=4e6)
82 | steps = 20_000
83 | plt.plot(np.arange(1, steps), [opt.get_lr({'step': i}, opt.defaults) for i in range(1, steps)])
84 |     plt.legend(["5000:4e6"])
85 | plt.title("Learning Rate")
86 | plt.show()
87 |
88 | steps = int(6e6)
89 | step_size = 1000
90 | plt.plot(np.arange(1, steps, step_size), [opt.get_lr({'step': i}, opt.defaults) for i in range(1, steps, step_size)])
91 |     plt.legend(["5000:4e6"])
92 | plt.title("Learning Rate")
93 | plt.show()
94 |
95 |
96 | if __name__ == '__main__':
97 | _test_lr()
98 |
--------------------------------------------------------------------------------
/labml_nn/normalization/weight_standardization/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Weight Standardization
4 | summary: >
5 | A PyTorch implementation/tutorial of Weight Standardization.
6 | ---
7 |
8 | # Weight Standardization
9 |
10 | This is a [PyTorch](https://pytorch.org) implementation of Weight Standardization from the paper
11 | [Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://arxiv.org/abs/1903.10520).
12 | We also have an [annotated implementation of Batch-Channel Normalization](../batch_channel_norm/index.html).
13 |
14 | Batch normalization **gives a smooth loss landscape** and
15 | **avoids elimination singularities**.
16 | Elimination singularities are nodes of the network that become
17 | useless (e.g. a ReLU that gives 0 all the time).
18 |
19 | However, batch normalization doesn't work well when the batch size is too small,
20 | which happens when training large networks because of device memory limitations.
21 | The paper introduces Weight Standardization with Batch-Channel Normalization as
22 | a better alternative.
23 |
24 | Weight Standardization:
25 | 1. Normalizes the gradients
26 | 2. Smoothes the landscape (reduced Lipschitz constant)
27 | 3. Avoids elimination singularities
28 |
29 | The Lipschitz constant is the maximum slope a function has between two points.
30 | That is, $L$ is the Lipschitz constant where $L$ is the smallest value that satisfies,
31 | $\forall a,b \in A: \lVert f(a) - f(b) \rVert \le L \lVert a - b \rVert$
32 | where $f: A \rightarrow \mathbb{R}^m, A \subseteq \mathbb{R}^n$.
33 |
34 | Elimination singularities are avoided because weight standardization keeps the statistics of the outputs similar to those of the
35 | inputs. So as long as the inputs are normally distributed, the outputs remain close to normal.
36 | This prevents the outputs of nodes from always falling outside the active range of the activation function
37 | (e.g. always getting a negative input at a ReLU).
38 |
39 | *[Refer to the paper for proofs](https://arxiv.org/abs/1903.10520)*.
40 |
41 | Here is [the training code](experiment.html) for training
42 | a VGG network that uses weight standardization to classify CIFAR-10 data.
43 | This uses a [2D-Convolution Layer with Weight Standardization](conv2d.html).
44 |
45 | [](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/weight_standardization/experiment.ipynb)
46 | [](https://app.labml.ai/run/f4a783a2a7df11eb921d0242ac1c0002)
47 | [](https://wandb.ai/vpj/cifar10/runs/3flr4k8w)
48 | """
49 |
50 | import torch
51 |
52 |
53 | def weight_standardization(weight: torch.Tensor, eps: float):
54 | r"""
55 | ## Weight Standardization
56 |
57 | $$\hat{W}_{i,j} = \frac{W_{i,j} - \mu_{W_{i,\cdot}}} {\sigma_{W_{i,\cdot}}}$$
58 |
59 | where,
60 |
61 | \begin{align}
62 | W &\in \mathbb{R}^{O \times I} \\
63 | \mu_{W_{i,\cdot}} &= \frac{1}{I} \sum_{j=1}^I W_{i,j} \\
64 | \sigma_{W_{i,\cdot}} &= \sqrt{\frac{1}{I} \sum_{j=1}^I W^2_{i,j} - \mu^2_{W_{i,\cdot}} + \epsilon} \\
65 | \end{align}
66 |
67 | for a 2D-convolution layer $O$ is the number of output channels ($O = C_{out}$)
68 | and $I$ is the number of input channels times the kernel size ($I = C_{in} \times k_H \times k_W$)
69 | """
70 |
71 | # Get $C_{out}$, $C_{in}$ and kernel shape
72 | c_out, c_in, *kernel_shape = weight.shape
73 | # Reshape $W$ to $O \times I$
74 | weight = weight.view(c_out, -1)
75 | # Calculate
76 | #
77 | # \begin{align}
78 | # \mu_{W_{i,\cdot}} &= \frac{1}{I} \sum_{j=1}^I W_{i,j} \\
79 | # \sigma^2_{W_{i,\cdot}} &= \frac{1}{I} \sum_{j=1}^I W^2_{i,j} - \mu^2_{W_{i,\cdot}}
80 | # \end{align}
81 | var, mean = torch.var_mean(weight, dim=1, keepdim=True)
82 | # Normalize
83 | # $$\hat{W}_{i,j} = \frac{W_{i,j} - \mu_{W_{i,\cdot}}} {\sigma_{W_{i,\cdot}}}$$
84 | weight = (weight - mean) / (torch.sqrt(var + eps))
85 | # Change back to original shape and return
86 | return weight.view(c_out, c_in, *kernel_shape)
87 |
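# A quick numeric check (an illustrative sketch, not part of the original module): after
# standardization every output-channel slice of a convolution weight should have zero mean
# and roughly unit standard deviation.
def _check_weight_standardization():
    w = torch.randn(16, 3, 3, 3)
    w_hat = weight_standardization(w, 1e-5)
    assert torch.allclose(w_hat.view(16, -1).mean(dim=1), torch.zeros(16), atol=1e-5)
    assert torch.allclose(w_hat.view(16, -1).std(dim=1), torch.ones(16), atol=1e-2)
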
--------------------------------------------------------------------------------
/labml_nn/transformers/fast_weights/experiment.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Train Fast Weights Transformer
4 | summary: This is training code with notes for a Fast Weights Transformer.
5 | ---
6 |
7 | # Train Fast Weights Transformer
8 |
9 | This trains a fast weights transformer model for auto-regression.
10 |
11 | Here’s a Colab notebook for training a fast weights transformer on the Tiny Shakespeare dataset.
12 |
13 | [](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/fast_weights/experiment.ipynb)
14 | [](https://app.labml.ai/run/928aadc0846c11eb85710242ac1c0002)
15 | """
16 |
17 | import torch
18 | from torch import nn
19 |
20 | from labml import experiment
21 | from labml.configs import option
22 | from labml.utils.pytorch import get_modules
23 | from labml_helpers.module import Module
24 | from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
25 |
26 |
27 | class AutoregressiveModel(Module):
28 | """
29 | ## Auto regressive model
30 | """
31 |
32 | def __init__(self, n_vocab: int, d_model: int, transformer: Module):
33 | super().__init__()
34 | # Token embedding module
35 | self.src_embed = nn.Embedding(n_vocab, d_model)
36 | self.transformer = transformer
37 | self.generator = nn.Linear(d_model, n_vocab)
38 |
39 | def forward(self, x: torch.Tensor):
40 | # Embed the tokens
41 | x = self.src_embed(x)
42 |         # Run it through the transformer
43 | res = self.transformer(x)
44 | # Generate logits of the next token
45 | return self.generator(res), None
46 |
47 |
48 | class Configs(NLPAutoRegressionConfigs):
49 | """
50 | ## Configurations
51 |
52 |     The default configs can and will be overridden when we start the experiment
53 | """
54 |
55 | model: AutoregressiveModel
56 |
57 | d_model: int = 512
58 | nu: int = 1
59 | heads: int = 8
60 | dropout: float = 0.0
61 | d_ff: int = 2048
62 | n_layers: int = 6
63 |
64 |
65 | @option(Configs.model)
66 | def fast_weights_transformer(c: Configs):
67 | """
68 | Create [fast weights transformer](index.html).
69 | """
70 | from labml_nn.transformers.fast_weights import FastWeightsAttentionTransformer, \
71 | FastWeightsAttentionTransformerLayer, FastWeightsAttention, FeedForward
72 |
73 | from labml_nn.transformers.fast_weights import DPFP
74 | return AutoregressiveModel(
75 | c.n_tokens, c.d_model,
76 | FastWeightsAttentionTransformer(
77 | FastWeightsAttentionTransformerLayer(d_model=c.d_model,
78 | attn=FastWeightsAttention(c.heads, c.d_model, c.dropout, DPFP(nu=c.nu)),
79 | feed_forward=FeedForward(c.d_model, c.d_ff, c.dropout),
80 | dropout_prob=c.dropout),
81 | c.n_layers)).to(c.device)
82 |
83 |
84 | def main():
85 | # Create experiment
86 | experiment.create(name="fast_weights_transformer")
87 | # Create configs
88 | conf = Configs()
89 | # Load configurations
90 | experiment.configs(conf,
91 | # A dictionary of configurations to override
92 | {'tokenizer': 'character',
93 | 'text': 'tiny_shakespeare',
94 | 'optimizer.learning_rate': 1.0,
95 | 'optimizer.optimizer': 'Noam',
96 | 'prompt': 'It is',
97 | 'prompt_separator': '',
98 |
99 | 'train_loader': 'shuffled_train_loader',
100 | 'valid_loader': 'shuffled_valid_loader',
101 |
102 | 'seq_len': 128,
103 | 'epochs': 128,
104 | 'batch_size': 16,
105 | 'inner_iterations': 25})
106 |
107 | # Set models for saving and loading
108 | experiment.add_pytorch_models(get_modules(conf))
109 |
110 | # Start the experiment
111 | with experiment.start():
112 | # Run the training loop
113 | conf.run()
114 |
115 |
116 | if __name__ == '__main__':
117 | main()
118 |
--------------------------------------------------------------------------------
/labml_nn/gan/dcgan/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Deep Convolutional Generative Adversarial Networks (DCGAN)
4 | summary: A simple PyTorch implementation/tutorial of Deep Convolutional Generative Adversarial Networks (DCGAN).
5 | ---
6 |
7 | # Deep Convolutional Generative Adversarial Networks (DCGAN)
8 |
9 | This is a [PyTorch](https://pytorch.org) implementation of paper
10 | [Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](https://arxiv.org/abs/1511.06434).
11 |
12 | This implementation is based on the [PyTorch DCGAN Tutorial](https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html).
13 | """
14 |
15 | import torch.nn as nn
16 |
17 | from labml import experiment
18 | from labml.configs import calculate
19 | from labml_helpers.module import Module
20 | from labml_nn.gan.original.experiment import Configs
21 |
22 |
23 | class Generator(Module):
24 | """
25 | ### Convolutional Generator Network
26 |
27 | This is similar to the de-convolutional network used for CelebA faces,
28 | but modified for MNIST images.
29 |
30 |
31 | """
32 |
33 | def __init__(self):
34 | super().__init__()
35 | # The input is $1 \times 1$ with 100 channels
36 | self.layers = nn.Sequential(
37 | # This gives $3 \times 3$ output
38 | nn.ConvTranspose2d(100, 1024, 3, 1, 0, bias=False),
39 | nn.BatchNorm2d(1024),
40 | nn.ReLU(True),
41 | # This gives $7 \times 7$
42 | nn.ConvTranspose2d(1024, 512, 3, 2, 0, bias=False),
43 | nn.BatchNorm2d(512),
44 | nn.ReLU(True),
45 | # This gives $14 \times 14$
46 | nn.ConvTranspose2d(512, 256, 4, 2, 1, bias=False),
47 | nn.BatchNorm2d(256),
48 | nn.ReLU(True),
49 | # This gives $28 \times 28$
50 | nn.ConvTranspose2d(256, 1, 4, 2, 1, bias=False),
51 | nn.Tanh()
52 | )
53 |
54 | self.apply(_weights_init)
55 |
56 | def __call__(self, x):
57 | # Change from shape `[batch_size, 100]` to `[batch_size, 100, 1, 1]`
58 | x = x.unsqueeze(-1).unsqueeze(-1)
59 | x = self.layers(x)
60 | return x
61 |
62 |
63 | class Discriminator(Module):
64 | """
65 | ### Convolutional Discriminator Network
66 | """
67 |
68 | def __init__(self):
69 | super().__init__()
70 | # The input is $28 \times 28$ with one channel
71 | self.layers = nn.Sequential(
72 | # This gives $14 \times 14$
73 | nn.Conv2d(1, 256, 4, 2, 1, bias=False),
74 | nn.LeakyReLU(0.2, inplace=True),
75 | # This gives $7 \times 7$
76 | nn.Conv2d(256, 512, 4, 2, 1, bias=False),
77 | nn.BatchNorm2d(512),
78 | nn.LeakyReLU(0.2, inplace=True),
79 | # This gives $3 \times 3$
80 | nn.Conv2d(512, 1024, 3, 2, 0, bias=False),
81 | nn.BatchNorm2d(1024),
82 | nn.LeakyReLU(0.2, inplace=True),
83 | # This gives $1 \times 1$
84 | nn.Conv2d(1024, 1, 3, 1, 0, bias=False),
85 | )
86 | self.apply(_weights_init)
87 |
88 | def forward(self, x):
89 | x = self.layers(x)
90 | return x.view(x.shape[0], -1)
91 |
92 |
93 | def _weights_init(m):
94 | classname = m.__class__.__name__
95 | if classname.find('Conv') != -1:
96 | nn.init.normal_(m.weight.data, 0.0, 0.02)
97 | elif classname.find('BatchNorm') != -1:
98 | nn.init.normal_(m.weight.data, 1.0, 0.02)
99 | nn.init.constant_(m.bias.data, 0)
100 |
101 |
102 | # We import the [simple GAN experiment](simple_mnist_experiment.html) and change the
103 | # generator and discriminator networks
104 | calculate(Configs.generator, 'cnn', lambda c: Generator().to(c.device))
105 | calculate(Configs.discriminator, 'cnn', lambda c: Discriminator().to(c.device))
106 |
107 |
108 | def main():
109 | conf = Configs()
110 | experiment.create(name='mnist_dcgan')
111 | experiment.configs(conf,
112 | {'discriminator': 'cnn',
113 | 'generator': 'cnn',
114 | 'label_smoothing': 0.01})
115 | with experiment.start():
116 | conf.run()
117 |
118 |
119 | if __name__ == '__main__':
120 | main()
121 |
--------------------------------------------------------------------------------
/labml_nn/resnets/utils/train.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | import torch
4 | from torch.utils.data import DataLoader, ConcatDataset
5 | # from sklearn.model_selection import KFold
6 | # from torch.utils.data.sampler import SubsetRandomSampler
7 |
8 | import matplotlib.pyplot as plt
9 | from pylab import *
10 | import os
11 |
12 | from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
13 |
14 |
15 |
16 | class Trainer():
17 |     def __init__(self, net, opt, cost, name="default", lr=0.0005, use_lr_schedule=False, device=None):
18 | self.net = net
19 | self.opt = opt
20 | self.cost = cost
21 | self.device = device
22 | self.epoch = 0
23 | self.start_epoch = 0
24 | self.name = name
25 |
26 | self.lr = lr
27 | self.use_lr_schedule = use_lr_schedule
28 | if self.use_lr_schedule:
29 | self.scheduler = ReduceLROnPlateau( self.opt, 'max', factor=0.1, patience=5, threshold=0.00001, verbose=True)
30 | # self.scheduler = StepLR(self.opt, step_size=15, gamma=0.1)
31 |
32 |     # Train loop over epochs. Optionally pass a testloader to report test accuracy after each epoch
33 | def Train(self, trainloader, epochs, testloader=None):
34 | # Enable Dropout
35 |
36 | # Record loss/accuracies
37 | loss = torch.zeros(epochs)
38 | self.epoch = 0
39 |
40 | # If testloader is used, loss will be the accuracy
41 | for epoch in range(self.start_epoch, self.start_epoch+epochs):
42 | self.epoch = epoch+1
43 |
44 | self.net.train() # Enable Dropout
45 | for data in trainloader:
46 | # Get the inputs; data is a list of [inputs, labels]
47 | if self.device:
48 | images, labels = data[0].to(self.device), data[1].to(self.device)
49 | else:
50 | images, labels = data
51 |
52 | self.opt.zero_grad()
53 | # Forward + backward + optimize
54 | outputs = self.net(images)
55 | epoch_loss = self.cost(outputs, labels)
56 | epoch_loss.backward()
57 | self.opt.step()
58 |
59 | loss[epoch] += epoch_loss.item()
60 |
61 | if testloader:
62 | loss[epoch] = self.Test(testloader)
63 | else:
64 | loss[epoch] /= len(trainloader)
65 |
66 | print("Epoch %d Learning rate %.6f %s: %.3f" % (
67 | self.epoch, self.opt.param_groups[0]['lr'], "Accuracy" if testloader else "Loss", loss[epoch]))
68 |
69 | #learning rate scheduler
70 | if self.use_lr_schedule:
71 | self.scheduler.step(loss[epoch])
72 | # self.scheduler.step()
73 |
74 |             # Save the model with the best metric so far (assumes a higher value is better, e.g. accuracy from the testloader)
75 | if loss[epoch] >= torch.max(loss):
76 | self.save_best_model({
77 | 'epoch': self.epoch,
78 | 'state_dict': self.net.state_dict(),
79 | 'optimizer': self.opt.state_dict(),
80 | })
81 |
82 | return loss
83 |
84 | # Testing
85 | def Test(self, testloader, ret="accuracy"):
86 | # Disable Dropout
87 | self.net.eval()
88 |
89 | # Track correct and total
90 | correct = 0.0
91 | total = 0.0
92 | with torch.no_grad():
93 | for data in testloader:
94 | if self.device:
95 | images, labels = data[0].to(self.device), data[1].to(self.device)
96 | else:
97 | images, labels = data
98 |
99 | outputs = self.net(images)
100 | _, predicted = torch.max(outputs.data, 1)
101 | total += labels.size(0)
102 | correct += (predicted == labels).sum().item()
103 |
104 | return correct / total
105 |
106 | def save_best_model(self, state):
107 | directory = os.path.dirname("./save/%s-best-model/"%(self.name))
108 | if not os.path.exists(directory):
109 |             os.makedirs(directory)  # `makedirs` also creates the `./save` parent directory if it is missing
110 | torch.save(state, "%s/model.pt" %(directory))
111 |
112 | def save_checkpoint(self, state):
113 | directory = os.path.dirname("./save/%s-checkpoints/"%(self.name))
114 | if not os.path.exists(directory):
115 |             os.makedirs(directory)  # `makedirs` also creates the `./save` parent directory if it is missing
116 | torch.save(state, "%s/model_epoch_%s.pt" %(directory, self.epoch))
117 | # torch.save(state, "./save/checkpoints/model_epoch_%s.pt" % (self.epoch))
118 |
--------------------------------------------------------------------------------
/labml_nn/normalization/batch_norm/readme.md:
--------------------------------------------------------------------------------
1 | # [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html)
2 |
3 | This is a [PyTorch](https://pytorch.org) implementation of Batch Normalization from paper
4 | [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167).
5 |
6 | ### Internal Covariate Shift
7 |
8 | The paper defines *Internal Covariate Shift* as the change in the
9 | distribution of network activations due to the change in
10 | network parameters during training.
11 | For example, let's say there are two layers $l_1$ and $l_2$.
12 | During the beginning of the training $l_1$ outputs (inputs to $l_2$)
13 | could be in distribution $\mathcal{N}(0.5, 1)$.
14 | Then, after some training steps, it could move to $\mathcal{N}(0.6, 1.5)$.
15 | This is *internal covariate shift*.
16 |
17 | Internal covariate shift will adversely affect training speed because the later layers
18 | ($l_2$ in the above example) have to adapt to this shifted distribution.
19 |
20 | By stabilizing the distribution, batch normalization minimizes the internal covariate shift.
21 |
22 | ## Normalization
23 |
24 | It is known that whitening improves training speed and convergence.
25 | *Whitening* is linearly transforming inputs to have zero mean, unit variance,
26 | and be uncorrelated.
27 |
28 | ### Normalizing outside gradient computation doesn't work
29 |
30 | Normalizing outside the gradient computation using pre-computed (detached)
31 | means and variances doesn't work. For instance (ignoring the variance), let
32 | $$\hat{x} = x - \mathbb{E}[x]$$
33 | where $x = u + b$ and $b$ is a trained bias
34 | and $\mathbb{E}[x]$ is an outside gradient computation (pre-computed constant).
35 |
36 | Note that $\hat{x}$ has no effect on $b$.
37 | Therefore,
38 | $b$ will increase or decrease based on
39 | $\frac{\partial{\mathcal{L}}}{\partial x}$,
40 | and keep on growing indefinitely in each training update.
41 | The paper notes that similar explosions happen with variances.
42 |
43 | ### Batch Normalization
44 |
45 | Whitening is computationally expensive because you need to de-correlate the features and
46 | the gradients must flow through the full whitening calculation.
47 |
48 | The paper introduces a simplified version which they call *Batch Normalization*.
49 | First simplification is that it normalizes each feature independently to have
50 | zero mean and unit variance:
51 | $$\hat{x}^{(k)} = \frac{x^{(k)} - \mathbb{E}[x^{(k)}]}{\sqrt{Var[x^{(k)}]}}$$
52 | where $x = (x^{(1)} ... x^{(d)})$ is the $d$-dimensional input.
53 |
54 | The second simplification is to use estimates of mean $\mathbb{E}[x^{(k)}]$
55 | and variance $Var[x^{(k)}]$ from the mini-batch
56 | for normalization, instead of calculating the mean and variance across the whole dataset.
57 |
58 | Normalizing each feature to zero mean and unit variance could affect what the layer
59 | can represent.
60 | As an example, the paper illustrates that if the inputs to a sigmoid are normalized
61 | most of them will be within the $[-1, 1]$ range where the sigmoid is approximately linear.
62 | To overcome this each feature is scaled and shifted by two trained parameters
63 | $\gamma^{(k)}$ and $\beta^{(k)}$.
64 | $$y^{(k)} =\gamma^{(k)} \hat{x}^{(k)} + \beta^{(k)}$$
65 | where $y^{(k)}$ is the output of the batch normalization layer.
66 |
67 | Note that when applying batch normalization after a linear transform
68 | like $Wu + b$ the bias parameter $b$ gets cancelled out by the normalization.
69 | So you can and should omit the bias parameter in linear transforms right before the
70 | batch normalization.
71 |
72 | Batch normalization also makes backpropagation invariant to the scale of the weights,
73 | and empirically it improves generalization, so it has regularization effects too.
74 |
75 | ## Inference
76 |
77 | We need to know $\mathbb{E}[x^{(k)}]$ and $Var[x^{(k)}]$ in order to
78 | perform the normalization.
79 | So during inference, you either need to go through the whole (or part of) dataset
80 | and find the mean and variance, or you can use an estimate calculated during training.
81 | The usual practice is to calculate an exponential moving average of
82 | mean and variance during the training phase and use that for inference.
83 |
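Here is a small, self-contained sketch (not the implementation in this repo; the variable
names and the momentum value are ours) of normalizing with mini-batch statistics, applying
the learned scale and shift, and keeping the exponential moving averages used at inference:

```python
import torch

# A toy batch of 2-dimensional features: shape `[batch_size, features]`
x = torch.randn(32, 2) * 5.0 + 3.0

# Training time: normalize each feature with mini-batch statistics
mean, var = x.mean(dim=0), x.var(dim=0, unbiased=False)
x_hat = (x - mean) / torch.sqrt(var + 1e-5)

# Learned scale and shift ($\gamma$, $\beta$), here left at their initial values
gamma, beta = torch.ones(2), torch.zeros(2)
y = gamma * x_hat + beta

# Keep exponential moving averages of the statistics for inference
momentum = 0.1
running_mean = (1 - momentum) * torch.zeros(2) + momentum * mean
running_var = (1 - momentum) * torch.ones(2) + momentum * var

# Inference time: use the running estimates instead of batch statistics
y_infer = gamma * (x - running_mean) / torch.sqrt(running_var + 1e-5) + beta
```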
84 | Here's [the training code](mnist.html) and a notebook for training
85 | a CNN classifier that uses batch normalization for the MNIST dataset.
86 |
87 | [](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/batch_norm/mnist.ipynb)
88 | [](https://app.labml.ai/run/011254fe647011ebbb8e0242ac1c0002)
89 |
--------------------------------------------------------------------------------
/labml_nn/optimizers/mnist_experiment.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: MNIST example to test the optimizers
4 | summary: This is a simple MNIST example with a CNN model to test the optimizers.
5 | ---
6 |
7 | # MNIST example to test the optimizers
8 | """
9 | import torch.nn as nn
10 | import torch.utils.data
11 | from labml_helpers.module import Module
12 |
13 | from labml import experiment, tracker
14 | from labml.configs import option
15 | from labml_helpers.datasets.mnist import MNISTConfigs
16 | from labml_helpers.device import DeviceConfigs
17 | from labml_helpers.metrics.accuracy import Accuracy
18 | from labml_helpers.seed import SeedConfigs
19 | from labml_helpers.train_valid import TrainValidConfigs, BatchIndex, hook_model_outputs
20 | from labml_nn.optimizers.configs import OptimizerConfigs
21 |
22 |
23 | class Model(Module):
24 | """
25 | ## The model
26 | """
27 | def __init__(self):
28 | super().__init__()
29 | self.conv1 = nn.Conv2d(1, 20, 5, 1)
30 | self.pool1 = nn.MaxPool2d(2)
31 | self.conv2 = nn.Conv2d(20, 50, 5, 1)
32 | self.pool2 = nn.MaxPool2d(2)
33 | self.fc1 = nn.Linear(16 * 50, 500)
34 | self.fc2 = nn.Linear(500, 10)
35 | self.activation = nn.ReLU()
36 |
37 | def forward(self, x):
38 | x = self.activation(self.conv1(x))
39 | x = self.pool1(x)
40 | x = self.activation(self.conv2(x))
41 | x = self.pool2(x)
42 | x = self.activation(self.fc1(x.view(-1, 16 * 50)))
43 | return self.fc2(x)
44 |
45 |
46 | class Configs(MNISTConfigs, TrainValidConfigs):
47 | """
48 | ## Configurable Experiment Definition
49 | """
50 | optimizer: torch.optim.Adam
51 | model: nn.Module
52 | set_seed = SeedConfigs()
53 | device: torch.device = DeviceConfigs()
54 | epochs: int = 10
55 |
56 | is_save_models = True
57 | model: nn.Module
58 | inner_iterations = 10
59 |
60 | accuracy_func = Accuracy()
61 | loss_func = nn.CrossEntropyLoss()
62 |
63 | def init(self):
64 | tracker.set_queue("loss.*", 20, True)
65 | tracker.set_scalar("accuracy.*", True)
66 | hook_model_outputs(self.mode, self.model, 'model')
67 | self.state_modules = [self.accuracy_func]
68 |
69 | def step(self, batch: any, batch_idx: BatchIndex):
70 | # Get the batch
71 | data, target = batch[0].to(self.device), batch[1].to(self.device)
72 |
73 | # Add global step if we are in training mode
74 | if self.mode.is_train:
75 | tracker.add_global_step(len(data))
76 |
77 | # Run the model and specify whether to log the activations
78 | with self.mode.update(is_log_activations=batch_idx.is_last):
79 | output = self.model(data)
80 |
81 | # Calculate the loss
82 | loss = self.loss_func(output, target)
83 | # Calculate the accuracy
84 | self.accuracy_func(output, target)
85 | # Log the loss
86 | tracker.add("loss.", loss)
87 |
88 | # Optimize if we are in training mode
89 | if self.mode.is_train:
90 | # Calculate the gradients
91 | loss.backward()
92 |
93 | # Take optimizer step
94 | self.optimizer.step()
95 | # Log the parameter and gradient L2 norms once per epoch
96 | if batch_idx.is_last:
97 | tracker.add('model', self.model)
98 | tracker.add('optimizer', (self.optimizer, {'model': self.model}))
99 | # Clear the gradients
100 | self.optimizer.zero_grad()
101 |
102 | # Save logs
103 | tracker.save()
104 |
105 |
106 | @option(Configs.model)
107 | def model(c: Configs):
108 | return Model().to(c.device)
109 |
110 |
111 | @option(Configs.optimizer)
112 | def _optimizer(c: Configs):
113 | """
114 | Create a configurable optimizer.
115 | We can change the optimizer type and hyper-parameters using configurations.
116 | """
117 | opt_conf = OptimizerConfigs()
118 | opt_conf.parameters = c.model.parameters()
119 | return opt_conf
120 |
121 |
122 | def main():
123 | conf = Configs()
124 | conf.inner_iterations = 10
125 | experiment.create(name='mnist_ada_belief')
126 | experiment.configs(conf, {'inner_iterations': 10,
127 | # Specify the optimizer
128 | 'optimizer.optimizer': 'Adam',
129 | 'optimizer.learning_rate': 1.5e-4})
130 | conf.set_seed.set()
131 | experiment.add_pytorch_models(dict(model=conf.model))
132 | with experiment.start():
133 | conf.run()
134 |
135 |
136 | if __name__ == '__main__':
137 | main()
138 |
--------------------------------------------------------------------------------
/docs/resnets/index.html:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/docs/experiments/index.html:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/labml_nn/normalization/instance_norm/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Instance Normalization
4 | summary: >
5 | A PyTorch implementation/tutorial of instance normalization.
6 | ---
7 |
8 | # Instance Normalization
9 |
10 | This is a [PyTorch](https://pytorch.org) implementation of
11 | [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022).
12 |
13 | Instance normalization was introduced to improve [style transfer](https://paperswithcode.com/task/style-transfer).
14 | It is based on the observation that stylization should not depend on the contrast of the content image.
15 | The "contrast normalization" is
16 |
17 | $$y_{t,i,j,k} = \frac{x_{t,i,j,k}}{\sum_{l=1}^H \sum_{m=1}^W x_{t,i,l,m}}$$
18 |
19 | where $x$ is a batch of images with dimensions image index $t$,
20 | feature channel $i$, and
21 | spatial position $j, k$.
22 |
23 | Since it's hard for a convolutional network to learn "contrast normalization", this paper
24 | introduces instance normalization which does that.
25 |
26 | Here's a [CIFAR 10 classification model](experiment.html) that uses instance normalization.
27 | """
28 |
29 | import torch
30 | from torch import nn
31 |
32 | from labml_helpers.module import Module
33 |
34 |
35 | class InstanceNorm(Module):
36 | r"""
37 | ## Instance Normalization Layer
38 |
39 | Instance normalization layer $\text{IN}$ normalizes the input $X$ as follows:
40 |
41 | When the input $X \in \mathbb{R}^{B \times C \times H \times W}$ is a batch of image representations,
42 | $B$ is the batch size, $C$ is the number of channels, $H$ is the height and $W$ is the width.
43 | $\gamma \in \mathbb{R}^{C}$ and $\beta \in \mathbb{R}^{C}$ are learnable scale and shift parameters.
44 | The affine transformation with $\gamma$ and $\beta$ is optional.
45 |
46 | $$\text{IN}(X) = \gamma
47 | \frac{X - \underset{H, W}{\mathbb{E}}[X]}{\sqrt{\underset{H, W}{Var}[X] + \epsilon}}
48 | + \beta$$
49 | """
50 |
51 | def __init__(self, channels: int, *,
52 | eps: float = 1e-5, affine: bool = True):
53 | """
54 | * `channels` is the number of features in the input
55 | * `eps` is $\epsilon$, used in $\sqrt{Var[X] + \epsilon}$ for numerical stability
56 | * `affine` is whether to scale and shift the normalized value
57 | """
58 | super().__init__()
59 |
60 | self.channels = channels
61 |
62 | self.eps = eps
63 | self.affine = affine
64 | # Create parameters for $\gamma$ and $\beta$ for scale and shift
65 | if self.affine:
66 | self.scale = nn.Parameter(torch.ones(channels))
67 | self.shift = nn.Parameter(torch.zeros(channels))
68 |
69 | def forward(self, x: torch.Tensor):
70 | """
71 | `x` is a tensor of shape `[batch_size, channels, *]`.
72 | `*` denotes any number of (possibly 0) dimensions.
73 | For example, in an image (2D) convolution this will be
74 | `[batch_size, channels, height, width]`
75 | """
76 | # Keep the original shape
77 | x_shape = x.shape
78 | # Get the batch size
79 | batch_size = x_shape[0]
80 | # Sanity check to make sure the number of features is the same
81 | assert self.channels == x.shape[1]
82 |
83 | # Reshape into `[batch_size, channels, n]`
84 | x = x.view(batch_size, self.channels, -1)
85 |
86 | # Calculate the mean across the last dimension,
87 | # i.e. the mean for each feature $\mathbb{E}[x_{t,i}]$
88 | mean = x.mean(dim=[-1], keepdim=True)
89 | # Calculate the mean of squares across the last dimension,
90 | # i.e. $\mathbb{E}[x_{t,i}^2]$ for each feature
91 | mean_x2 = (x ** 2).mean(dim=[-1], keepdim=True)
92 | # Variance for each feature $Var[x_{t,i}] = \mathbb{E}[x_{t,i}^2] - \mathbb{E}[x_{t,i}]^2$
93 | var = mean_x2 - mean ** 2
94 |
95 | # Normalize $$\hat{x}_{t,i} = \frac{x_{t,i} - \mathbb{E}[x_{t,i}]}{\sqrt{Var[x_{t,i}] + \epsilon}}$$
96 | x_norm = (x - mean) / torch.sqrt(var + self.eps)
97 | x_norm = x_norm.view(batch_size, self.channels, -1)
98 |
99 | # Scale and shift $$y_{t,i} =\gamma_i \hat{x}_{t,i} + \beta_i$$
100 | if self.affine:
101 | x_norm = self.scale.view(1, -1, 1) * x_norm + self.shift.view(1, -1, 1)
102 |
103 | # Reshape to original and return
104 | return x_norm.view(x_shape)
105 |
106 |
107 | def _test():
108 | """
109 | Simple test
110 | """
111 | from labml.logger import inspect
112 |
113 | x = torch.zeros([2, 6, 2, 4])
114 | inspect(x.shape)
115 | inorm = InstanceNorm(6)
116 |
117 | x = inorm(x)
118 | inspect(x.shape)
119 |
120 |
121 | #
122 | if __name__ == '__main__':
123 | _test()
124 |
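125 |
126 | # An illustrative check that is not part of the module above: for 4D inputs, and
127 | # while the affine scale/shift are still at their initial values ($\gamma = 1$,
128 | # $\beta = 0$), `InstanceNorm` should match PyTorch's built-in `nn.InstanceNorm2d`.
129 | def _compare_with_torch():
130 |     x = torch.randn([2, 6, 8, 8])
131 |     ours = InstanceNorm(6)(x)
132 |     ref = nn.InstanceNorm2d(6, affine=False)(x)
133 |     # The two outputs should agree up to floating point error
134 |     assert torch.allclose(ours, ref, atol=1e-5)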
--------------------------------------------------------------------------------
/labml_nn/cnn/ray_tune.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import numpy as np
4 | import os
5 | import torch
6 | from ray import tune
7 | from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
8 | from utils.train import Trainer
9 | from models.cnn import GetCNN
10 |
11 | # Check if GPU is available
12 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
13 | print("Device: " + str(device))
14 |
15 | #
16 | num_samples = 40  # number of trials to run
17 | max_num_epochs = 25  # maximum number of epochs per trial
18 | gpus_per_trial = 1  # GPUs allocated to each trial
19 |
20 | # CIFAR-10 dataset location
21 | data_dir = './data/Cifar10'
22 |
23 | """
24 | This code is adapted from the official Ray Tune documentation.
25 | ASHA:
26 | https://docs.ray.io/en/master/tune/api_docs/schedulers.html#tune-scheduler-hyperband
27 |
28 | PBT:
29 | https://docs.ray.io/en/latest/tune/api_docs/schedulers.html#tune-scheduler-pbt
30 | """
31 |
32 | """config - a dict defining the hyperparameter search space
33 |
34 | Hyperparameters selected for tuning:
35 | l1 : Number of units in the first fully connected layer
36 | l2 : Number of units in the second fully connected layer
37 | lr : Learning rate
38 | decay : Decay rate used for regularization
39 | batch_size : Batch size for the train and test data loaders
40 | """
41 | config = {
42 | "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),  # e.g. 4, 8, 16, ..., 256
43 | "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),  # e.g. 4, 8, 16, ..., 256
44 | "lr": tune.loguniform(1e-4, 1e-1),  # sampled from a log-uniform distribution
45 | "decay": tune.sample_from(lambda _: 10 ** np.random.randint(-7, -3)),  # e.g. 1e-7, 1e-6, ..., 1e-4
46 | "batch_size": tune.choice([32, 64, 128, 256])
47 | }
48 |
49 | # Create the trainer
50 | trainer = Trainer(device=device)
51 |
52 | """ASHA (Asynchronous Successive Halving Algorithm) scheduler
53 | max_t : Maximum number of training units per trial (can be time or epochs)
54 | grace_period : Only stop under-performing trials after this many units (time or epochs)
55 | reduction_factor : Halving rate
56 | """
57 | scheduler = ASHAScheduler(
58 | max_t=max_num_epochs,
59 | grace_period=4,
60 | reduction_factor=4)
61 |
62 |
63 |
64 | """Population Based Training (PBT) scheduler (an alternative to ASHA above)
65 | time_attr : Unit of training progress (can be time or epochs)
66 | metric : Objective of training (loss or accuracy)
67 | perturbation_interval : Perturbations happen after this many units (time or epochs)
68 | hyperparam_mutations : Hyperparameters to mutate
69 | Only one scheduler can be active at a time: uncomment the block below to use PBT
70 | instead of ASHA, and remove `metric`/`mode` from `tune.run`, since PBT sets them here.
71 | """
72 | # scheduler = PopulationBasedTraining(
73 | #     time_attr="training_iteration",  # epochs
74 | #     metric='loss',  # the loss is the objective function
75 | #     mode='min',  # minimizing the loss is the objective of training
76 | #     perturbation_interval=5.0,  # perturb after every 5 epochs
77 | #     hyperparam_mutations={
78 | #         "lr": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],  # choose from the given learning rates
79 | #         "batch_size": [64, 128, 256],  # choose from the given batch sizes
80 | #         "decay": tune.uniform(1e-8, 1e-4)})  # sample from a uniform distribution
81 |
82 | result = tune.run(
83 | tune.with_parameters(trainer.Train_ray, data_dir=data_dir),
84 | name="ray_test_basic-CNN", # name for identifying models (checkpoints)
85 | scheduler=scheduler, # select scheduler PBT or ASHA
86 | resources_per_trial={"cpu": 8, "gpu": gpus_per_trial}, # select number of CPUs or GPUs
87 | config=config, # input config dict consisting of different hyperparameters
88 | stop={
89 | "training_iteration": max_num_epochs, # stopping criterion
90 | },
91 | metric="loss", # objective to optimize (set here because ASHA does not set its own)
92 | mode="min", # minimize the objective
93 | num_samples=num_samples,
94 | verbose=True, # set to True to monitor training progress
95 | fail_fast=True, # stop the whole run on the first error
96 | keep_checkpoints_num=5, # number of checkpoints to keep per trial
97 |
98 | )
99 |
100 | best_trial = result.get_best_trial("loss", "min", "last")
101 | print("Best configuration: {}".format(best_trial.config))
102 | print("Best validation loss: {}".format(best_trial.last_result["loss"]))
103 | print("Best validation accuracy: {}".format(
104 | best_trial.last_result["accuracy"]))
105 |
106 |
107 | best_trained_model = GetCNN(best_trial.config["l1"], best_trial.config["l2"])
108 | best_trained_model.to(device)
109 | checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint")
110 | model_state, optimizer_state = torch.load(checkpoint_path)
111 | best_trained_model.load_state_dict(model_state)
112 |
113 | # Check accuracy of best model
114 | test_acc = trainer.Test(best_trained_model, save=data_dir)
115 | print("Best Test accuracy: {}".format(test_acc))
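116 |
117 | # Illustrative sketch only (not used above): the shape of a trainable like
118 | # `Trainer.Train_ray` that the schedulers expect. It should report `loss` and
119 | # `accuracy` to Tune every epoch and save checkpoints as a
120 | # `(model_state, optimizer_state)` tuple, which is how the best model is loaded
121 | # back above. The stand-in model below is only for this sketch; the real training
122 | # loop lives in `utils.train.Trainer` and the real model comes from `models.cnn.GetCNN`.
123 | def _train_ray_sketch(config, checkpoint_dir=None, data_dir=None):
124 |     model = torch.nn.Linear(32 * 32 * 3, 10).to(device)
125 |     optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"],
126 |                                 weight_decay=config["decay"])
127 |     for epoch in range(max_num_epochs):
128 |         # ... one epoch of training and validation on CIFAR-10 goes here ...
129 |         val_loss, val_acc = 0.0, 0.0  # placeholders for the computed metrics
130 |         # Save a checkpoint that `torch.load` above can unpack
131 |         with tune.checkpoint_dir(epoch) as ckpt_dir:
132 |             torch.save((model.state_dict(), optimizer.state_dict()),
133 |                        os.path.join(ckpt_dir, "checkpoint"))
134 |         # Report metrics; the schedulers use `loss` to stop or perturb trials
135 |         tune.report(loss=val_loss, accuracy=val_acc)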
--------------------------------------------------------------------------------
/docs/resnets/utils/index.html:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/docs/resnets/models/index.html:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/labml_nn/transformers/fast_weights/token_wise.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Fast Weight Systems
4 | summary: >
5 | This is an annotated implementation/tutorial of
6 | Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch.
7 | ---
8 | """
9 | from typing import Optional
10 |
11 | import torch
12 | from torch import nn
13 |
14 | from labml_helpers.module import Module
15 | from labml_nn.transformers.fast_weights import DPFP
16 | from labml_nn.transformers.feed_forward import FeedForward
17 | from labml_nn.transformers.mha import PrepareForMultiHeadAttention
18 | from labml_nn.utils import clone_module_list
19 |
20 |
21 | class FastWeightsAttention(Module):
22 | def __init__(self, heads: int, d_model: int, dropout_prob: float, phi: DPFP):
23 | super().__init__()
24 |
25 | # Number of features per head
26 | self.d_k = d_model // heads
27 | #
28 | self.heads = heads
29 |
30 | # This transforms the `query` for multi-headed attention.
31 | self.query = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=False)
32 | # These transform the `key` and `value` for multi-headed attention.
33 | self.key = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=False)
34 | self.value = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=False)
35 |
36 | self.gate = nn.Sequential(PrepareForMultiHeadAttention(d_model, heads, 1, bias=False),
37 | nn.Sigmoid())
38 |
39 | self.phi = phi
40 |
41 | # Output layer
42 | self.output = nn.Linear(d_model, d_model)
43 | # Dropout
44 | self.dropout = nn.Dropout(dropout_prob)
45 |
46 | def __call__(self, x: torch.Tensor, weights: Optional[torch.Tensor]):
47 | query = self.phi(self.query(x))
48 | key = self.phi(self.key(x))
49 | value = self.value(x)
50 |
51 | if weights is None:
52 | weights = key.new_zeros((key.shape[0], key.shape[1], value.shape[2], key.shape[2]))
53 |
54 | value_existing = torch.einsum('bhvk,bhk->bhv', weights, key)
55 |
56 | beta = self.gate(x)
57 |
58 | weights = weights + torch.einsum('bhv,bhk->bhvk', beta * (value - value_existing), key)
59 |
60 | x = torch.einsum('bhvk,bhk->bhv', weights, query)
61 |
62 | # Concatenate multiple heads
63 | x = x.reshape(x.shape[0], -1)
64 |
65 | # Output layer
66 | return self.output(x), weights
67 |
68 |
69 | class FastWeightsAttentionTransformerLayer(Module):
70 | def __init__(self, *,
71 | d_model: int,
72 | attn: FastWeightsAttention,
73 | feed_forward: FeedForward,
74 | dropout_prob: float):
75 | super().__init__()
76 | # Transformer size $d_{model}$
77 | self.size = d_model
78 | #
79 | self.attn = attn
80 | self.feed_forward = feed_forward
81 | self.dropout = nn.Dropout(dropout_prob)
82 |
83 | # Normalization layers
84 | self.norm_self_attn = nn.LayerNorm([d_model])
85 | self.norm_ff = nn.LayerNorm([d_model])
86 |
87 | def __call__(self, x: torch.Tensor, weights: Optional[torch.Tensor]):
88 | attn, weights = self.attn(x, weights)
89 | # Add the self attention results
90 | x = x + self.dropout(attn)
91 |
92 | # Normalize for feed-forward
93 | z = self.norm_ff(x)
94 | # Pass through the feed-forward network
95 | ff = self.feed_forward(z)
96 | # Add the feed-forward results back
97 | x = x + self.dropout(ff)
98 |
99 | #
100 | return x, weights
101 |
102 |
103 | class FastWeightsAttentionTransformer(Module):
104 | def __init__(self, layer: FastWeightsAttentionTransformerLayer, n_layers: int):
105 | super().__init__()
106 | # Make copies of the transformer layer
107 | self.layers = clone_module_list(layer, n_layers)
108 | # Final normalization layer
109 | self.norm = nn.LayerNorm([layer.size])
110 |
111 | def __call__(self, x_seq: torch.Tensor):
112 | # Split the input to a list along the sequence axis
113 | x_seq = torch.unbind(x_seq, dim=0)
114 | # List to store the outputs
115 | res = []
116 | # Fast weight memories for each layer, initialized to `None`
117 | weights = [None for _ in range(len(self.layers))]
118 | # For each step in the input sequence
119 | for x in x_seq:
120 | # Run through each layer
121 | for i, layer in enumerate(self.layers):
122 | # Get layer output
123 | x, weights[i] = layer(x, weights[i])
124 |
125 | res.append(x)
126 |
127 | # Stack the output tensors
128 | res = torch.stack(res)
129 | # Normalize the output
130 | return self.norm(res)
131 |
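132 |
133 | # A minimal, self-contained sketch (illustration only, not used by the classes
134 | # above) of the per-step fast weight update in `FastWeightsAttention.__call__`:
135 | # look up the value currently stored for a key, gate the correction with `beta`,
136 | # write it back as an outer product, and read the memory with the query.
137 | def _fast_weight_update_sketch():
138 |     batch, heads, d_k = 2, 4, 8
139 |     # Fast weight memory `W` of shape `[batch, heads, d_value, d_key]`
140 |     weights = torch.zeros(batch, heads, d_k, d_k)
141 |     key = torch.randn(batch, heads, d_k)
142 |     value = torch.randn(batch, heads, d_k)
143 |     query = torch.randn(batch, heads, d_k)
144 |     beta = torch.rand(batch, heads, 1)
145 |     # Value currently associated with the key
146 |     value_existing = torch.einsum('bhvk,bhk->bhv', weights, key)
147 |     # Delta-rule write: store the gated difference as an outer product
148 |     weights = weights + torch.einsum('bhv,bhk->bhvk', beta * (value - value_existing), key)
149 |     # Read the memory with the query
150 |     return torch.einsum('bhvk,bhk->bhv', weights, query)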
--------------------------------------------------------------------------------
/docs/transformers/basic/index.html:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/labml_nn/transformers/glu_variants/experiment.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Gated Linear Units and Variants
4 | summary: >
5 | Train an auto-regressive transformer with Gated Linear Units and variants
6 | for the position-wise feedforward network (FFN).
7 | ---
8 |
9 | # Gated Linear Units and Variants
10 |
11 | This trains a simple [transformer](../../) model for auto-regression.
12 | We try different variants for the [position-wise feedforward network](../feed_forward).
13 | The reusable & configurable components are defined in [`configs.py`](configs.html).
14 | """
15 |
16 | import torch
17 | from labml import experiment
18 | from labml.configs import option
19 | from labml.utils.pytorch import get_modules
20 | from labml_helpers.module import Module
21 |
22 | from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
23 | from labml_nn.transformers import Encoder, Generator, TransformerConfigs
24 | from labml_nn.transformers.utils import subsequent_mask
25 |
26 |
27 | class AutoregressiveModel(Module):
28 | """
29 | ## Auto regressive model
30 | """
31 |
32 | def __init__(self, src_embed: Module, encoder: Encoder, generator: Generator):
33 | super().__init__()
34 | # Token embedding module
35 | self.src_embed = src_embed
36 | # Transformer based encoder
37 | self.encoder = encoder
38 | # Next token generation layer;
39 | # this gives logits of the next token
40 | self.generator = generator
41 | # This will be initialized on the first call
42 | self.src_mask = None
43 |
44 | def forward(self, src: torch.Tensor):
45 | # Create subsequent mask, so that the transformer can only pay attention to past tokens.
46 | if self.src_mask is None or self.src_mask.size(0) != len(src):
47 | self.src_mask = subsequent_mask(len(src)).to(src.device)
48 | # Embed the tokens (`src`) and run it through the transformer
49 | res = self.encoder(self.src_embed(src), self.src_mask)
50 | # Generate logits of the next token
51 | return self.generator(res), None
52 |
53 |
54 | class Configs(NLPAutoRegressionConfigs):
55 | """
56 | ## Configurations
57 |
58 | The default configs can and will be overridden when we start the experiment
59 | """
60 |
61 | transformer: TransformerConfigs
62 | model: AutoregressiveModel
63 |
64 |
65 | @option(Configs.model)
66 | def autoregressive_model(c: Configs):
67 | """
68 | Initialize the auto-regressive model
69 | """
70 | m = AutoregressiveModel(c.transformer.src_embed, c.transformer.encoder, c.transformer.generator)
71 | return m.to(c.device)
72 |
73 |
74 | @option(Configs.transformer)
75 | def transformer_c(c: Configs):
76 | """
77 | Initialize the [configurable transformer](../configs.html) encoder for our autoregressive model.
78 | """
79 | tc = TransformerConfigs()
80 | tc.n_src_vocab = c.n_tokens
81 | tc.n_tgt_vocab = c.n_tokens
82 |
83 | return tc
84 |
85 |
86 | def main():
87 | # Create experiment
88 | experiment.create(name="glu_variants")
89 | # Create configs
90 | conf = Configs()
91 | # Load configurations
92 | experiment.configs(conf,
93 | # A dictionary of configurations to override
94 | {'tokenizer': 'character',
95 | 'prompt_separator': '',
96 | 'prompt': 'It is ',
97 | 'text': 'tiny_shakespeare',
98 |
99 | 'optimizer.optimizer': 'Noam',
100 | 'optimizer.learning_rate': 1.,
101 | 'optimizer.d_model': 256,
102 |
103 | 'seq_len': 1024,
104 | 'epochs': 128,
105 | 'batch_size': 6,
106 | 'inner_iterations': 10,
107 |
108 | # GLU Variant, one of GLU, Bilinear, ReGLU, GEGLU, SwiGLU
109 | #
110 | # These are defined in the [configurable FFN](../configs.html#FFN)
111 | # implementation
112 | 'transformer.ffn.glu_variant': 'Bilinear',
113 |
114 | # Transformer configurations
115 | 'transformer.d_model': 256,
116 | 'transformer.ffn.d_ff': 1024,
117 | 'transformer.n_heads': 8,
118 | 'transformer.n_layers': 6})
119 |
120 | # This is needed to initialize models
121 | conf.n_tokens = conf.text.n_tokens
122 |
123 | # Set models for saving and loading
124 | experiment.add_pytorch_models(get_modules(conf))
125 |
126 | # Start the experiment
127 | with experiment.start():
128 | # `TrainValidConfigs.run`
129 | conf.run()
130 |
131 |
132 | if __name__ == '__main__':
133 | main()
134 |
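135 |
136 | # Illustration only (not used by the experiment above): the gating selected by the
137 | # `transformer.ffn.glu_variant` option. For GEGLU the position-wise FFN computes
138 | # $FFN_{GEGLU}(x) = (\text{GELU}(x W_1) \otimes x V) W_2$; the other variants only
139 | # change the activation applied to $x W_1$ (identity for Bilinear, sigmoid for GLU,
140 | # ReLU for ReGLU, Swish for SwiGLU). The configurable implementation lives in
141 | # [`feed_forward.py`](../feed_forward.html).
142 | class _GEGLUSketch(torch.nn.Module):
143 |     def __init__(self, d_model: int, d_ff: int):
144 |         super().__init__()
145 |         self.w1 = torch.nn.Linear(d_model, d_ff)
146 |         self.v = torch.nn.Linear(d_model, d_ff)
147 |         self.w2 = torch.nn.Linear(d_ff, d_model)
148 |
149 |     def forward(self, x: torch.Tensor):
150 |         # The activated projection gates the second projection element-wise
151 |         return self.w2(torch.nn.functional.gelu(self.w1(x)) * self.v(x))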
--------------------------------------------------------------------------------
/labml_nn/transformers/fnet/experiment.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: FNet Experiment
4 | summary: This experiment trains a FNet based model on AG News dataset.
5 | ---
6 |
7 | # [FNet](index.html) Experiment
8 |
9 | This is an annotated PyTorch experiment to train a [FNet model](index.html).
10 |
11 | This is based on
12 | [general training loop and configurations for AG News classification task](../../experiments/nlp_classification.html).
13 | """
14 |
15 | import torch
16 | from torch import nn
17 |
18 | from labml import experiment
19 | from labml.configs import option
20 | from labml_helpers.module import Module
21 | from labml_nn.experiments.nlp_classification import NLPClassificationConfigs
22 | from labml_nn.transformers import Encoder
23 | from labml_nn.transformers import TransformerConfigs
24 |
25 |
26 | class TransformerClassifier(nn.Module):
27 | """
28 | ## Transformer based classifier model
29 | """
30 | def __init__(self, encoder: Encoder, src_embed: Module, generator: nn.Linear):
31 | """
32 | * `encoder` is the transformer [Encoder](../models.html#Encoder)
33 | * `src_embed` is the token
34 | [embedding module (with positional encodings)](../models.html#EmbeddingsWithLearnedPositionalEncoding)
35 | * `generator` is the [final fully connected layer](../models.html#Generator) that gives the logits.
36 | """
37 | super().__init__()
38 | self.src_embed = src_embed
39 | self.encoder = encoder
40 | self.generator = generator
41 |
42 | def forward(self, x: torch.Tensor):
43 | # Get the token embeddings with positional encodings
44 | x = self.src_embed(x)
45 | # Transformer encoder
46 | x = self.encoder(x, None)
47 | # Get logits for classification.
48 | #
49 | # We set the `[CLS]` token at the last position of the sequence.
50 | # This is extracted by `x[-1]`, where `x` is of
51 | # shape `[seq_len, batch_size, d_model]`
52 | x = self.generator(x[-1])
53 |
54 | # Return results
55 | # (second value is for state, since our trainer is used with RNNs also)
56 | return x, None
57 |
58 |
59 | class Configs(NLPClassificationConfigs):
60 | """
61 | ## Configurations
62 |
63 | This inherits from
64 | [`NLPClassificationConfigs`](../../experiments/nlp_classification.html)
65 | """
66 |
67 | # Classification model
68 | model: TransformerClassifier
69 | # Transformer
70 | transformer: TransformerConfigs
71 |
72 |
73 | @option(Configs.transformer)
74 | def _transformer_configs(c: Configs):
75 | """
76 | ### Transformer configurations
77 | """
78 |
79 | # We use our
80 | # [configurable transformer implementation](../configs.html#TransformerConfigs)
81 | conf = TransformerConfigs()
82 | # Set the vocabulary sizes for embeddings and generating logits
83 | conf.n_src_vocab = c.n_tokens
84 | conf.n_tgt_vocab = c.n_tokens
85 |
86 | #
87 | return conf
88 |
89 |
90 | @option(TransformerConfigs.encoder_attn)
91 | def fnet_mix():
92 | """
93 | Create `FNetMix` module that can replace the self-attention in
94 | [transformer encoder layer](../models.html#TransformerLayer)
95 | .
96 | """
97 | from labml_nn.transformers.fnet import FNetMix
98 | return FNetMix()
99 |
100 |
101 | @option(Configs.model)
102 | def _model(c: Configs):
103 | """
104 | Create classification model
105 | """
106 | m = TransformerClassifier(c.transformer.encoder,
107 | c.transformer.src_embed,
108 | nn.Linear(c.d_model, c.n_classes)).to(c.device)
109 |
110 | return m
111 |
112 |
113 | def main():
114 | # Create experiment
115 | experiment.create(name="fnet")
116 | # Create configs
117 | conf = Configs()
118 | # Override configurations
119 | experiment.configs(conf, {
120 | # Use word level tokenizer
121 | 'tokenizer': 'basic_english',
122 |
123 | # Train for $32$ epochs
124 | 'epochs': 32,
125 | # Switch between training and validation for $10$ times
126 | # per epoch
127 | 'inner_iterations': 10,
128 |
129 | # Transformer configurations (same as defaults)
130 | 'transformer.d_model': 512,
131 | 'transformer.ffn.d_ff': 2048,
132 | 'transformer.n_heads': 8,
133 | 'transformer.n_layers': 6,
134 |
135 | # Use [FNet](index.html) instead of self-attention
136 | #
137 | 'transformer.encoder_attn': 'fnet_mix',
138 |
139 | # Use [Noam optimizer](../../optimizers/noam.html)
140 | 'optimizer.optimizer': 'Noam',
141 | 'optimizer.learning_rate': 1.,
142 | })
143 |
144 | # Set models for saving and loading
145 | experiment.add_pytorch_models({'model': conf.model})
146 |
147 | # Start the experiment
148 | with experiment.start():
149 | # Run training
150 | conf.run()
151 |
152 |
153 | #
154 | if __name__ == '__main__':
155 | main()
156 |
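157 |
158 | # Illustration only (not used by the experiment above): the core of the `fnet_mix`
159 | # option selected in `main`. FNet replaces self-attention with a parameter-free
160 | # Fourier mixing step: for `x` of shape `[seq_len, batch_size, d_model]` it applies
161 | # an FFT over the hidden dimension, another over the sequence dimension, and keeps
162 | # the real part. The actual module is `labml_nn.transformers.fnet.FNetMix`.
163 | def _fourier_mix_sketch(x: torch.Tensor) -> torch.Tensor:
164 |     # FFT along the hidden (feature) dimension
165 |     x = torch.fft.fft(x, dim=-1)
166 |     # FFT along the sequence dimension
167 |     x = torch.fft.fft(x, dim=0)
168 |     # Keep only the real component
169 |     return x.real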
--------------------------------------------------------------------------------
/docs/transformers/relative_mha.html:
--------------------------------------------------------------------------------
24 | Relative Multi-Headed Attention
--------------------------------------------------------------------------------
/labml_nn/transformers/knn/train_model.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Train Autoregressive Transformer
4 | summary: This is training code with notes for a basic auto-regressive transformer.
5 | ---
6 |
7 | # Train Autoregressive Transformer
8 |
9 | This trains a simple [transformer](../../) model for auto-regression.
10 | """
11 |
12 | import torch
13 | from labml import experiment
14 | from labml.configs import option
15 | from labml.utils.pytorch import get_modules
16 | from labml_helpers.module import Module
17 |
18 | from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
19 | from labml_nn.transformers import Encoder, Generator, TransformerConfigs
20 | from labml_nn.transformers.utils import subsequent_mask
21 |
22 |
23 | class AutoregressiveModel(Module):
24 | """
25 | ## Auto regressive model
26 | """
27 |
28 | def __init__(self, src_embed: Module, encoder: Encoder, generator: Generator, *,
29 | is_save_ff_input: bool = False):
30 | super().__init__()
31 | # Token embedding module
32 | self.src_embed = src_embed
33 | # Transformer based encoder
34 | self.encoder = encoder
35 | # Whether the last layer of the encoder should
36 | # save the input to the feed-forward layer.
37 | # This is our $f(c_t)$, the embedding of the context.
38 | self.encoder.layers[-1].is_save_ff_input = is_save_ff_input
39 | # Next token generation layer;
40 | # this gives logits of the next token
41 | self.generator = generator
42 | # This will be initialized on the first call
43 | self.src_mask = None
44 |
45 | @property
46 | def ff_input(self) -> torch.Tensor:
47 | """
48 | Retrieve saved $f(c_t)$
49 | """
50 | return self.encoder.layers[-1].ff_input
51 |
52 | def forward(self, src: torch.Tensor):
53 | # Create subsequent mask, so that the transformer can only pay attention to past tokens.
54 | if self.src_mask is None or self.src_mask.size(0) != len(src):
55 | self.src_mask = subsequent_mask(len(src)).to(src.device)
56 | # Embed the tokens (`src`) and run it through the transformer
57 | res = self.encoder(self.src_embed(src), self.src_mask)
58 | # Generate logits of the next token
59 | return self.generator(res), None
60 |
61 |
62 | class Configs(NLPAutoRegressionConfigs):
63 | """
64 | ## Configurations
65 |
66 | The default configs can and will be overridden when we start the experiment
67 | """
68 |
69 | transformer: TransformerConfigs
70 | model: AutoregressiveModel
71 |
72 | is_save_ff_input = False
73 |
74 |
75 | @option(Configs.model)
76 | def autoregressive_model(c: Configs):
77 | """
78 | Initialize the auto-regressive model
79 | """
80 | m = AutoregressiveModel(
81 | # Get the source token embedding layer, encoder and
82 | # final token generator from configurable transformer
83 | src_embed=c.transformer.src_embed,
84 | encoder=c.transformer.encoder,
85 | generator=c.transformer.generator,
86 | # Whether to save $f(c_t)$
87 | is_save_ff_input=c.is_save_ff_input)
88 | return m.to(c.device)
89 |
90 |
91 | @option(Configs.transformer)
92 | def transformer_c(c: Configs):
93 | """
94 | Initialize the configurable transformer encoder for our autoregressive model
95 | """
96 | tc = TransformerConfigs()
97 | tc.n_src_vocab = c.n_tokens
98 | tc.n_tgt_vocab = c.n_tokens
99 |
100 | return tc
101 |
102 |
103 | def main():
104 | # Create experiment
105 | experiment.create(name="knn_lm")
106 | # Create configs
107 | conf = Configs()
108 | # Load configurations
109 | experiment.configs(conf,
110 | # A dictionary of configurations to override
111 | {'tokenizer': 'character',
112 | 'prompt_separator': '',
113 | 'prompt': 'It is ',
114 | 'text': 'tiny_shakespeare',
115 |
116 | 'optimizer.optimizer': 'Noam',
117 | 'optimizer.learning_rate': 1.,
118 | 'optimizer.d_model': 256,
119 |
120 | 'seq_len': 1024,
121 | 'epochs': 128,
122 | 'batch_size': 6,
123 | 'inner_iterations': 10,
124 |
125 | # Transformer configurations
126 | 'transformer.d_model': 256,
127 | 'transformer.ffn.d_ff': 1024,
128 | 'transformer.n_heads': 8,
129 | 'transformer.n_layers': 6})
130 |
131 | # This is needed to initialize models
132 | conf.n_tokens = conf.text.n_tokens
133 |
134 | # Set models for saving and loading
135 | experiment.add_pytorch_models(get_modules(conf))
136 |
137 | # Start the experiment
138 | with experiment.start():
139 | # `TrainValidConfigs.run`
140 | conf.run()
141 |
142 |
143 | if __name__ == '__main__':
144 | main()
145 |
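146 |
147 | # Illustration only (not used by the training code above): how the saved context
148 | # embeddings $f(c_t)$ are typically used by a kNN-LM at inference time
149 | # (Khandelwal et al., "Generalization through Memorization"). The nearest
150 | # neighbours of $f(c_t)$ are retrieved from a datastore, and their distribution is
151 | # interpolated with the model's own prediction:
152 | # $p(w) = \lambda \, p_{knn}(w) + (1 - \lambda) \, p_{model}(w)$.
153 | # All tensors here are for a single prediction position; `knn_targets` holds the
154 | # next-token ids stored with the retrieved neighbours.
155 | def _knn_lm_interpolate(logits: torch.Tensor, knn_dists: torch.Tensor,
156 |                         knn_targets: torch.Tensor, n_tokens: int, knn_weight: float = 0.5):
157 |     # Distribution predicted by the transformer, shape `[n_tokens]`
158 |     p_model = torch.softmax(logits, dim=-1)
159 |     # Softmax over negative distances to the `k` retrieved neighbours, shape `[k]`
160 |     knn_probs = torch.softmax(-knn_dists, dim=-1)
161 |     # Scatter the neighbour weights onto their target tokens
162 |     p_knn = torch.zeros(n_tokens).scatter_add_(0, knn_targets, knn_probs)
163 |     # Interpolate the two distributions
164 |     return knn_weight * p_knn + (1 - knn_weight) * p_model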
--------------------------------------------------------------------------------