├── .labml.yaml ├── utils ├── __init__.py └── sitemap.py ├── docs ├── CNAME ├── icon.png ├── cnn │ └── utils │ │ ├── cv-folds.png │ │ ├── overfitting.png │ │ ├── Underfitting.png │ │ ├── early-stopping.png │ │ ├── ground_truth.png │ │ └── Cross-validation.png ├── optimizers │ ├── noam_lr.png │ └── radam_r_t.png ├── gan │ └── stylegan │ │ └── generated_64.png ├── resnets │ ├── index.html │ ├── utils │ │ └── index.html │ └── models │ │ └── index.html ├── experiments │ └── index.html └── transformers │ ├── basic │ └── index.html │ └── relative_mha.html ├── labml_nn ├── resnets │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ └── mlp.py │ ├── utils │ │ ├── __init__.py │ │ ├── labelsmoothing.py │ │ ├── utils.py │ │ └── train.py │ ├── accuracy_graph_85.png │ ├── pretrained_nets.py │ └── resnet_net.py ├── experiments │ ├── __init__.py │ ├── cifar10.py │ └── mnist.py ├── transformers │ ├── basic │ │ └── __init__.py │ ├── relative_mha.py │ ├── glu_variants │ │ ├── __init__.py │ │ └── experiment.py │ ├── utils.py │ ├── gmlp │ │ ├── readme.md │ │ └── experiment.py │ ├── fnet │ │ ├── readme.md │ │ ├── __init__.py │ │ └── experiment.py │ ├── aft │ │ └── readme.md │ ├── fast_weights │ │ ├── readme.md │ │ ├── experiment.py │ │ └── token_wise.py │ ├── xl │ │ └── readme.md │ ├── knn │ │ ├── __init__.py │ │ └── train_model.py │ ├── switch │ │ └── readme.md │ ├── feedback │ │ └── readme.md │ ├── compressive │ │ └── readme.md │ ├── label_smoothing_loss.py │ ├── positional_encoding.py │ ├── mlm │ │ └── readme.md │ ├── __init__.py │ └── feed_forward.py ├── activations │ ├── __init__.py │ └── swish.py ├── cnn │ ├── save │ │ └── Basic_CNN-best-model │ │ │ └── model.pt │ ├── cross_validation.py │ ├── utils │ │ └── dataloader.py │ └── ray_tune.py ├── gan │ ├── wasserstein │ │ ├── readme.md │ │ ├── gradient_penalty │ │ │ ├── readme.md │ │ │ ├── experiment.py │ │ │ └── __init__.py │ │ └── experiment.py │ ├── original │ │ └── readme.md │ ├── cycle_gan │ │ └── readme.md │ ├── dcgan │ │ ├── readme.md │ │ └── __init__.py │ ├── __init__.py │ └── stylegan │ │ └── readme.md ├── hypernetworks │ ├── __init__.py │ └── experiment.py ├── normalization │ ├── weight_standardization │ │ ├── readme.md │ │ ├── conv2d.py │ │ ├── experiment.py │ │ └── __init__.py │ ├── __init__.py │ ├── instance_norm │ │ ├── readme.md │ │ ├── experiment.py │ │ └── __init__.py │ ├── layer_norm │ │ └── readme.md │ ├── group_norm │ │ ├── readme.md │ │ └── experiment.py │ └── batch_norm │ │ ├── cifar10.py │ │ ├── mnist.py │ │ └── readme.md ├── optimizers │ ├── readme.md │ ├── performance_test.py │ ├── adam_warmup.py │ ├── noam.py │ ├── adam_warmup_cosine_decay.py │ └── mnist_experiment.py ├── rl │ ├── __init__.py │ ├── ppo │ │ ├── readme.md │ │ └── gae.py │ └── dqn │ │ └── model.py ├── utils │ ├── tokenizer.py │ └── __init__.py ├── capsule_networks │ └── readme.md └── __init__.py ├── MANIFEST.in ├── images └── dqn.png ├── requirements.txt ├── .gitignore ├── Makefile ├── license └── setup.py /.labml.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | nn.labml.ai -------------------------------------------------------------------------------- 
/labml_nn/resnets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /labml_nn/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include readme.rst 2 | -------------------------------------------------------------------------------- /labml_nn/resnets/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /labml_nn/resnets/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /labml_nn/transformers/basic/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /labml_nn/activations/__init__.py: -------------------------------------------------------------------------------- 1 | from .swish import Swish 2 | -------------------------------------------------------------------------------- /docs/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/icon.png -------------------------------------------------------------------------------- /images/dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/images/dqn.png -------------------------------------------------------------------------------- /docs/cnn/utils/cv-folds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/cv-folds.png -------------------------------------------------------------------------------- /docs/optimizers/noam_lr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/optimizers/noam_lr.png -------------------------------------------------------------------------------- /docs/cnn/utils/overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/overfitting.png -------------------------------------------------------------------------------- /docs/optimizers/radam_r_t.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/optimizers/radam_r_t.png -------------------------------------------------------------------------------- /docs/cnn/utils/Underfitting.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/Underfitting.png -------------------------------------------------------------------------------- /docs/cnn/utils/early-stopping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/early-stopping.png -------------------------------------------------------------------------------- /docs/cnn/utils/ground_truth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/ground_truth.png -------------------------------------------------------------------------------- /docs/cnn/utils/Cross-validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/cnn/utils/Cross-validation.png -------------------------------------------------------------------------------- /docs/gan/stylegan/generated_64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/docs/gan/stylegan/generated_64.png -------------------------------------------------------------------------------- /labml_nn/resnets/accuracy_graph_85.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/labml_nn/resnets/accuracy_graph_85.png -------------------------------------------------------------------------------- /labml_nn/cnn/save/Basic_CNN-best-model/model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashishpatel26/annotated_deep_learning_paper_implementations/master/labml_nn/cnn/save/Basic_CNN-best-model/model.pt -------------------------------------------------------------------------------- /labml_nn/gan/wasserstein/readme.md: -------------------------------------------------------------------------------- 1 | # [Wasserstein GAN - WGAN](https://nn.labml.ai/gan/wasserstein/index.html) 2 | 3 | This is an implementation of 4 | [Wasserstein GAN](https://arxiv.org/abs/1701.07875). 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.7 2 | labml>=0.4.94 3 | labml-helpers>=0.4.77 4 | torchvision 5 | numpy>=1.16.3 6 | matplotlib>=3.0.3 7 | einops>=0.3.0 8 | gym[atari] 9 | opencv-python 10 | Pillow>=6.2.1 11 | -------------------------------------------------------------------------------- /labml_nn/hypernetworks/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: HyperNetworks 4 | summary: A PyTorch implementation/tutorial of HyperLSTM introduced in paper HyperNetworks. 
5 | --- 6 | 7 | ## [HyperLSTM](hyper_lstm.html) 8 | """ -------------------------------------------------------------------------------- /labml_nn/gan/original/readme.md: -------------------------------------------------------------------------------- 1 | # [Generative Adversarial Networks - GAN](https://nn.labml.ai/gan/original/index.html) 2 | 3 | This is an annotated implementation of 4 | [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | __pycache__ 3 | .DS_Store 4 | .*.swp 5 | *.egg-info/ 6 | dist/ 7 | build/ 8 | .idea/* 9 | !.idea/dictionaries 10 | labml 11 | labml_helpers 12 | labml_samples 13 | data 14 | logs 15 | html/ 16 | diagrams/ -------------------------------------------------------------------------------- /labml_nn/transformers/relative_mha.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Relative Multi-Headed Attention 4 | summary: Relative Multi-Headed Attention from paper Transformer-XL. 5 | redirect: https://nn.labml.ai/transformers/xl/relative_mha.html 6 | --- 7 | """ 8 | -------------------------------------------------------------------------------- /labml_nn/gan/cycle_gan/readme.md: -------------------------------------------------------------------------------- 1 | # [Cycle GAN](https://nn.labml.ai/gan/cycle_gan/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation/tutorial of the paper 4 | [Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks](https://arxiv.org/abs/1703.10593). 5 | -------------------------------------------------------------------------------- /labml_nn/gan/dcgan/readme.md: -------------------------------------------------------------------------------- 1 | # [Deep Convolutional Generative Adversarial Networks - DCGAN](https://nn.labml.ai/gan/dcgan/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation of paper 4 | [Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](https://arxiv.org/abs/1511.06434). 
5 | -------------------------------------------------------------------------------- /labml_nn/activations/swish.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from labml_helpers.module import Module 5 | 6 | 7 | class Swish(Module): 8 | def __init__(self): 9 | super().__init__() 10 | self.sigmoid = nn.Sigmoid() 11 | 12 | def forward(self, x: torch.Tensor) -> torch.Tensor: 13 | return x * self.sigmoid(x) 14 | -------------------------------------------------------------------------------- /labml_nn/experiments/cifar10.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: CIFAR10 Experiment 4 | summary: > 5 | This is a reusable trainer for CIFAR10 dataset 6 | --- 7 | 8 | # CIFAR10 Experiment 9 | """ 10 | 11 | from labml_helpers.datasets.cifar10 import CIFAR10Configs as CIFAR10DatasetConfigs 12 | from labml_nn.experiments.mnist import MNISTConfigs 13 | 14 | 15 | class CIFAR10Configs(CIFAR10DatasetConfigs, MNISTConfigs): 16 | dataset_name: str = 'CIFAR10' 17 | -------------------------------------------------------------------------------- /labml_nn/transformers/glu_variants/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Gated Linear Units and Variants 4 | summary: > 5 | Train an auto-regressive transformer with Gated Linear Units and variants 6 | for the position-wise feedforward network (FFN). 7 | --- 8 | 9 | # Gated Linear Units and Variants 10 | 11 | * [Experiment that uses `labml.configs`](glu_variants/experiment.html) 12 | * [Simpler version from scratch](glu_variants/simple.html) 13 | """ 14 | -------------------------------------------------------------------------------- /labml_nn/normalization/weight_standardization/readme.md: -------------------------------------------------------------------------------- 1 | # [Weight Standardization](https://nn.labml.ai/normalization/weight_standardization/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation of Weight Standardization from the paper 4 | [Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://arxiv.org/abs/1903.10520). 5 | We also have an 6 | [annotated implementation of Batch-Channel Normalization](https://nn.labml.ai/normalization/batch_channel_norm/index.html). 7 | -------------------------------------------------------------------------------- /labml_nn/gan/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Generative Adversarial Networks 4 | summary: > 5 | A set of PyTorch implementations/tutorials of GANs. 
6 | --- 7 | 8 | # Generative Adversarial Networks 9 | 10 | * [Original GAN](original/index.html) 11 | * [GAN with deep convolutional network](dcgan/index.html) 12 | * [Cycle GAN](cycle_gan/index.html) 13 | * [Wasserstein GAN](wasserstein/index.html) 14 | * [Wasserstein GAN with Gradient Penalty](wasserstein/gradient_penalty/index.html) 15 | * [Style GAN 2](stylegan/index.html) 16 | """ -------------------------------------------------------------------------------- /labml_nn/optimizers/readme.md: -------------------------------------------------------------------------------- 1 | # [Optimizers](https://nn.labml.ai/optimizers/index.html) 2 | 3 | ## Optimizer Implementations 4 | * [Adam Optimizer](https://nn.labml.ai/optimizers/adam.html) 5 | * [AMSGrad Optimizer](https://nn.labml.ai/optimizers/amsgrad.html) 6 | * [Adam Optimizer with warmup](https://nn.labml.ai/optimizers/adam_warmup.html) 7 | * [Noam Optimizer](https://nn.labml.ai/optimizers/noam.html) 8 | * [Rectified Adam Optimizer](https://nn.labml.ai/optimizers/radam.html) 9 | * [AdaBelief Optimizer](https://nn.labml.ai/optimizers/ada_belief.html) 10 | -------------------------------------------------------------------------------- /labml_nn/normalization/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Normalization Layers 4 | summary: > 5 | A set of PyTorch implementations/tutorials of normalization layers. 6 | --- 7 | 8 | # Normalization Layers 9 | 10 | * [Batch Normalization](batch_norm/index.html) 11 | * [Layer Normalization](layer_norm/index.html) 12 | * [Instance Normalization](instance_norm/index.html) 13 | * [Group Normalization](group_norm/index.html) 14 | * [Weight Standardization](weight_standardization/index.html) 15 | * [Batch-Channel Normalization](batch_channel_norm/index.html) 16 | """ 17 | -------------------------------------------------------------------------------- /labml_nn/resnets/utils/labelsmoothing.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | from torch import nn 3 | 4 | class LabelSmoothingLoss(nn.Module): 5 | def __init__(self, epsilon= 0.5, reduction='mean'): 6 | super().__init__() 7 | self.epsilon = epsilon 8 | self.reduction = reduction 9 | 10 | def forward(self, pred, target): 11 | n = pred.size()[-1] 12 | log_pred = F.log_softmax(pred, dim=-1) 13 | loss = -log_pred.sum(dim=-1).mean() 14 | nll = F.nll_loss(log_pred, target, reduction=self.reduction) 15 | out = (1-self.epsilon)*nll + self.epsilon*(loss / n) 16 | return out 17 | -------------------------------------------------------------------------------- /labml_nn/normalization/instance_norm/readme.md: -------------------------------------------------------------------------------- 1 | # [Instance Normalization](https://nn.labml.ai/normalization/instance_norm/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation of 4 | [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022). 5 | 6 | Instance normalization was introduced to improve [style transfer](https://paperswithcode.com/task/style-transfer). 7 | It is based on the observation that stylization should not depend on the contrast of the content image. 8 | Since it's hard for a convolutional network to learn "contrast normalization", this paper 9 | introduces instance normalization which does that. 
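To make the idea concrete, here is a minimal sketch of what instance normalization computes (the annotated implementation linked above is the reference; it also supports optional learnable scale and shift parameters): each channel of each sample is normalized over its own spatial dimensions, so the result does not depend on the contrast of the content image.

```python
import torch


def instance_norm(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # `x` has shape `[batch_size, channels, height, width]`;
    # normalize each channel of each sample over its own spatial dimensions
    mean = x.mean(dim=(2, 3), keepdim=True)
    var = x.var(dim=(2, 3), keepdim=True, unbiased=False)
    return (x - mean) / torch.sqrt(var + eps)
```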
--------------------------------------------------------------------------------
/labml_nn/transformers/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: Utilities for Transformer
4 | summary: A bunch of utility functions and classes for transformers.
5 | ---
6 | 
7 | # Utilities for Transformer
8 | """
9 | 
10 | import torch
11 | 
12 | 
13 | def subsequent_mask(seq_len):
14 |     """
15 |     ## Subsequent mask to mask out data from future (subsequent) time steps
16 |     """
17 |     mask = torch.tril(torch.ones(seq_len, seq_len)).to(torch.bool).unsqueeze(-1)
18 |     return mask
19 | 
20 | 
21 | def _subsequent_mask():
22 |     from labml.logger import inspect
23 |     inspect(subsequent_mask(10)[:, :, 0])
24 | 
25 | 
26 | if __name__ == '__main__':
27 |     _subsequent_mask()
28 | 
--------------------------------------------------------------------------------
/labml_nn/transformers/gmlp/readme.md:
--------------------------------------------------------------------------------
1 | # [Pay Attention to MLPs (gMLP)](https://nn.labml.ai/transformers/gmlp/index.html)
2 | 
3 | This is a [PyTorch](https://pytorch.org) implementation of the paper
4 | [Pay Attention to MLPs](https://papers.labml.ai/paper/2105.08050).
5 | 
6 | This paper introduces a Multilayer Perceptron (MLP) based architecture with gating,
7 | which they name **gMLP**. It consists of a stack of $L$ *gMLP* blocks.
8 | 
9 | Here is [the training code](https://nn.labml.ai/transformers/gmlp/experiment.html) for a gMLP-based autoregressive model.
10 | 
11 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/01bd941ac74c11eb890c1d9196651a4a)
12 | 
--------------------------------------------------------------------------------
/labml_nn/transformers/fnet/readme.md:
--------------------------------------------------------------------------------
1 | # [FNet: Mixing Tokens with Fourier Transforms](https://nn.labml.ai/transformers/fnet/index.html)
2 | 
3 | This is a [PyTorch](https://pytorch.org) implementation of the paper
4 | [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824).
5 | 
6 | This paper replaces the [self-attention layer](https://nn.labml.ai/transformers/mha.html) with two
7 | [Fourier transforms](https://en.wikipedia.org/wiki/Discrete_Fourier_transform) to
8 | *mix* tokens.
9 | This is 7X more efficient than self-attention.
10 | It achieves about 92% of the accuracy of
11 | [BERT](https://paperswithcode.com/method/bert) on the
12 | [GLUE benchmark](https://paperswithcode.com/dataset/glue).
13 | 
--------------------------------------------------------------------------------
/labml_nn/gan/stylegan/readme.md:
--------------------------------------------------------------------------------
1 | # [Style GAN 2](https://nn.labml.ai/gan/stylegan/index.html)
2 | 
3 | This is a [PyTorch](https://pytorch.org) implementation of the paper
4 | [Analyzing and Improving the Image Quality of StyleGAN](https://arxiv.org/abs/1912.04958)
5 | which introduces **Style GAN2**.
6 | Style GAN2 is an improvement over **Style GAN** from the paper
7 | [A Style-Based Generator Architecture for Generative Adversarial Networks](https://arxiv.org/abs/1812.04948).
8 | And Style GAN is based on **Progressive GAN** from the paper
9 | [Progressive Growing of GANs for Improved Quality, Stability, and Variation](https://arxiv.org/abs/1710.10196).
10 | All three papers are from the same authors from [NVIDIA AI](https://twitter.com/NVIDIAAI). 11 | -------------------------------------------------------------------------------- /labml_nn/transformers/aft/readme.md: -------------------------------------------------------------------------------- 1 | # [An Attention Free Transformer](https://nn.labml.ai/transformers/aft/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation of the paper 4 | [An Attention Free Transformer](https://papers.labml.ai/paper/2105.14103). 5 | 6 | This paper replaces the [self-attention layer](https://nn.labml.ai/transformers/mha.html) 7 | with a new efficient operation, 8 | that has memory complexity of O(Td), where T is the sequence length 9 | and $d$ is the dimensionality of embeddings. 10 | 11 | The paper introduces AFT along with AFT-local and AFT-conv. 12 | Here we have implemented AFT-local which pays attention to closeby tokens 13 | in an autoregressive model. 14 | 15 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/6348e504c3a511eba9529daa283fb495) 16 | -------------------------------------------------------------------------------- /labml_nn/rl/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Reinforcement Learning Algorithms 4 | summary: > 5 | This is a collection of PyTorch implementations/tutorials of reinforcement learning algorithms. 6 | It currently includes Proximal Policy Optimization, Generalized Advantage Estimation, and 7 | Deep Q Networks. 8 | --- 9 | 10 | # Reinforcement Learning Algorithms 11 | 12 | * [Proximal Policy Optimization](ppo) 13 | * [This is an experiment](ppo/experiment.html) that runs a PPO agent on Atari Breakout. 14 | * [Generalized advantage estimation](ppo/gae.html) 15 | * [Deep Q Networks](dqn) 16 | * [This is an experiment](dqn/experiment.html) that runs a DQN agent on Atari Breakout. 17 | * [Model](dqn/model.html) with dueling network 18 | * [Prioritized Experience Replay Buffer](dqn/replay_buffer.html) 19 | 20 | [This is the implementation for OpenAI game wrapper](game.html) using `multiprocessing`. 21 | """ -------------------------------------------------------------------------------- /labml_nn/transformers/fast_weights/readme.md: -------------------------------------------------------------------------------- 1 | # [Fast weights transformer](https://nn.labml.ai/transformers/fast_weights/index.html) 2 | 3 | This is an annotated implementation of the paper 4 | [Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch](https://arxiv.org/abs/2102.11174). 5 | 6 | Here is the [annotated implementation](https://nn.labml.ai/transformers/fast_weights/index.html). 7 | Here are [the training code](https://nn.labml.ai/transformers/fast_weights/experiment.html) 8 | and a notebook for training a fast weights transformer on the Tiny Shakespeare dataset. 
9 | 
10 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/fast_weights/experiment.ipynb)
11 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/928aadc0846c11eb85710242ac1c0002)
12 | 
--------------------------------------------------------------------------------
/labml_nn/gan/wasserstein/gradient_penalty/readme.md:
--------------------------------------------------------------------------------
1 | # [Gradient Penalty for Wasserstein GAN (WGAN-GP)](https://nn.labml.ai/gan/wasserstein/gradient_penalty/index.html)
2 | 
3 | This is an implementation of
4 | [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028).
5 | 
6 | [WGAN](https://nn.labml.ai/gan/wasserstein/index.html) suggests
7 | clipping weights to enforce the Lipschitz constraint
8 | on the discriminator network (critic).
9 | This and other weight constraints such as L2 norm clipping, weight normalization,
10 | and L1 or L2 weight decay have problems:
11 | 
12 | 1. Limiting the capacity of the discriminator
13 | 2. Exploding and vanishing gradients (without [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html)).
14 | 
15 | The paper [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028)
16 | proposes a better way to enforce the Lipschitz constraint: a gradient penalty.
17 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | clean: ## Clean
2 | 	rm -rf dist
3 | 	rm -rf build
4 | 	rm -rf *.egg-info
5 | 
6 | build: clean ## Build PIPy Package
7 | 	python setup.py sdist bdist_wheel
8 | 
9 | check-content: build ## List contents of PIPy Package
10 | 	tar -tvf dist/*.tar.gz
11 | 
12 | check: build ## Check PIPy Package
13 | 	twine check dist/*
14 | 
15 | upload: build ## Upload PIPy Package
16 | 	twine upload dist/*
17 | 
18 | install: ## Install from repo
19 | 	pip install -e .
20 | 
21 | uninstall: ## Uninstall
22 | 	pip uninstall labml_nn
23 | 
24 | docs: ## Render annotated HTML
25 | 	find ./docs/ -name "*.html" -type f -delete
26 | 	find ./docs/ -name "*.svg" -type f -delete
27 | 	python utils/sitemap.py
28 | 	python utils/diagrams.py
29 | 	cd labml_nn; pylit --remove_empty_sections --title_md -t ../../../pylit/templates/nn -d ../docs -w *
30 | 
31 | help: ## Show this help.
32 | 	@fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##//'
33 | 
34 | .PHONY: clean build check upload help docs
35 | .DEFAULT_GOAL := help
36 | 
--------------------------------------------------------------------------------
/license:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2020 Varuna Jayasiri
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/labml_nn/utils/tokenizer.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 | 
3 | from labml.configs import BaseConfigs, option
4 | 
5 | 
6 | class TokenizerConfigs(BaseConfigs):
7 |     """
8 | 
9 |     ## Tokenizer Configurations
10 | 
11 |     """
12 | 
13 |     tokenizer: Callable = 'character'
14 | 
15 |     def __init__(self):
16 |         super().__init__(_primary='tokenizer')
17 | 
18 | 
19 | @option(TokenizerConfigs.tokenizer)
20 | def basic_english():
21 |     """
22 |     ### Basic English tokenizer
23 | 
24 |     We use a character-level tokenizer in this experiment.
25 |     You can switch to this tokenizer by setting
26 | 
27 |     ```
28 |     'tokenizer': 'basic_english',
29 |     ```
30 | 
31 |     in the configurations dictionary when starting the experiment.
32 | 
33 |     """
34 |     from torchtext.data import get_tokenizer
35 |     return get_tokenizer('basic_english')
36 | 
37 | 
38 | def character_tokenizer(x: str):
39 |     """
40 |     ### Character-level tokenizer
41 |     """
42 |     return list(x)
43 | 
44 | 
45 | @option(TokenizerConfigs.tokenizer)
46 | def character():
47 |     """
48 |     Character-level tokenizer configuration
49 |     """
50 |     return character_tokenizer
51 | 
--------------------------------------------------------------------------------
/labml_nn/normalization/layer_norm/readme.md:
--------------------------------------------------------------------------------
1 | # [Layer Normalization](https://nn.labml.ai/normalization/layer_norm/index.html)
2 | 
3 | This is a [PyTorch](https://pytorch.org) implementation of
4 | [Layer Normalization](https://arxiv.org/abs/1607.06450).
5 | 
6 | ### Limitations of [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html)
7 | 
8 | * You need to maintain running means.
9 | * Tricky for RNNs. Do you need different normalizations for each step?
10 | * Doesn't work with small batch sizes;
11 | large NLP models are usually trained with small batch sizes.
12 | * Need to compute means and variances across devices in distributed training.
13 | 
14 | ## Layer Normalization
15 | 
16 | Layer normalization is a simpler normalization method that works
17 | on a wider range of settings.
18 | Layer normalization transforms the inputs to have zero mean and unit variance
19 | across the features.
20 | *Note that batch normalization fixes the zero mean and unit variance for each element across the batch.*
21 | Layer normalization does it for each input in the batch across all elements.
22 | 
23 | Layer normalization is generally used for NLP tasks.
24 | 
25 | We have used layer normalization in most of the
26 | [transformer implementations](https://nn.labml.ai/transformers/gpt/index.html).
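As a rough sketch of the difference (the annotated implementation linked above is the reference and typically also has learned gain and bias parameters), layer normalization computes its statistics per input over the feature dimension, so it needs no running means and no batch statistics:

```python
import torch


def layer_norm(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # `x` has shape `[batch_size, ..., features]`;
    # each input is normalized over its own feature dimension,
    # so nothing depends on the rest of the batch
    mean = x.mean(dim=-1, keepdim=True)
    var = x.var(dim=-1, keepdim=True, unbiased=False)
    return (x - mean) / torch.sqrt(var + eps)
```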
--------------------------------------------------------------------------------
/labml_nn/capsule_networks/readme.md:
--------------------------------------------------------------------------------
1 | # [Capsule Networks](https://nn.labml.ai/capsule_networks/index.html)
2 | 
3 | This is a [PyTorch](https://pytorch.org) implementation/tutorial of
4 | [Dynamic Routing Between Capsules](https://arxiv.org/abs/1710.09829).
5 | 
6 | Capsule network is a neural network architecture that embeds features
7 | as capsules and routes them with a voting mechanism to the next layer of capsules.
8 | 
9 | Unlike in other implementations of models, we've included a sample, because
10 | it is difficult to understand some concepts with just the modules.
11 | [This is the annotated code for a model that uses capsules to classify the MNIST dataset](mnist.html).
12 | 
13 | This file holds the implementations of the core modules of Capsule Networks.
14 | 
15 | I used [jindongwang/Pytorch-CapsuleNet](https://github.com/jindongwang/Pytorch-CapsuleNet) to clarify some
16 | confusions I had with the paper.
17 | 
18 | Here's a notebook for training a Capsule Network on the MNIST dataset.
19 | 
20 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/capsule_networks/mnist.ipynb)
21 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/e7c08e08586711ebb3e30242ac1c0002)
22 | 
--------------------------------------------------------------------------------
/labml_nn/rl/ppo/readme.md:
--------------------------------------------------------------------------------
1 | # [Proximal Policy Optimization - PPO](https://nn.labml.ai/rl/ppo/index.html)
2 | 
3 | This is a [PyTorch](https://pytorch.org) implementation of
4 | [Proximal Policy Optimization - PPO](https://arxiv.org/abs/1707.06347).
5 | 
6 | PPO is a policy gradient method for reinforcement learning.
7 | Simple policy gradient methods do a single gradient update per sample (or a set of samples).
8 | Doing multiple gradient steps for a single sample causes problems
9 | because the policy deviates too much, producing a bad policy.
10 | PPO lets us do multiple gradient updates per sample by trying to keep the
11 | policy close to the policy that was used to sample data.
12 | It does so by clipping gradient flow if the updated policy
13 | is not close to the policy used to sample the data.
14 | 
15 | You can find an experiment that uses it [here](https://nn.labml.ai/rl/ppo/experiment.html).
16 | The experiment uses [Generalized Advantage Estimation](https://nn.labml.ai/rl/ppo/gae.html).
17 | 
18 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/rl/ppo/experiment.ipynb)
19 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/6eff28a0910e11eb9b008db315936e2f)
20 | 
--------------------------------------------------------------------------------
/labml_nn/gan/wasserstein/experiment.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: WGAN experiment with MNIST
4 | summary: This experiment generates MNIST images using a convolutional neural network.
5 | --- 6 | 7 | # WGAN experiment with MNIST 8 | """ 9 | from labml import experiment 10 | 11 | from labml.configs import calculate 12 | # Import configurations from [DCGAN experiment](../dcgan/index.html) 13 | from labml_nn.gan.dcgan import Configs 14 | 15 | # Import [Wasserstein GAN losses](./index.html) 16 | from labml_nn.gan.wasserstein import GeneratorLoss, DiscriminatorLoss 17 | 18 | # Set configurations options for Wasserstein GAN losses 19 | calculate(Configs.generator_loss, 'wasserstein', lambda c: GeneratorLoss()) 20 | calculate(Configs.discriminator_loss, 'wasserstein', lambda c: DiscriminatorLoss()) 21 | 22 | 23 | def main(): 24 | # Create configs object 25 | conf = Configs() 26 | # Create experiment 27 | experiment.create(name='mnist_wassertein_dcgan', comment='test') 28 | # Override configurations 29 | experiment.configs(conf, 30 | { 31 | 'discriminator': 'cnn', 32 | 'generator': 'cnn', 33 | 'label_smoothing': 0.01, 34 | 'generator_loss': 'wasserstein', 35 | 'discriminator_loss': 'wasserstein', 36 | }) 37 | 38 | # Start the experiment and run training loop 39 | with experiment.start(): 40 | conf.run() 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /labml_nn/normalization/group_norm/readme.md: -------------------------------------------------------------------------------- 1 | # [Group Normalization](https://nn.labml.ai/normalization/group_norm/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation of 4 | the [Group Normalization](https://arxiv.org/abs/1803.08494) paper. 5 | 6 | [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html) works well for large enough batch sizes 7 | but not well for small batch sizes, because it normalizes over the batch. 8 | Training large models with large batch sizes is not possible due to the memory capacity of the 9 | devices. 10 | 11 | This paper introduces Group Normalization, which normalizes a set of features together as a group. 12 | This is based on the observation that classical features such as 13 | [SIFT](https://en.wikipedia.org/wiki/Scale-invariant_feature_transform) and 14 | [HOG](https://en.wikipedia.org/wiki/Histogram_of_oriented_gradients) are group-wise features. 15 | The paper proposes dividing feature channels into groups and then separately normalizing 16 | all channels within each group. 17 | 18 | Here's a [CIFAR 10 classification model](https://nn.labml.ai/normalization/group_norm/experiment.html) that uses instance normalization. 19 | 20 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/group_norm/experiment.ipynb) 21 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/081d950aa4e011eb8f9f0242ac1c0002) 22 | [![WandB](https://img.shields.io/badge/wandb-run-yellow)](https://wandb.ai/vpj/cifar10/runs/310etthp) -------------------------------------------------------------------------------- /labml_nn/transformers/xl/readme.md: -------------------------------------------------------------------------------- 1 | # [Transformer XL](https://nn.labml.ai/transformers/xl/index.html) 2 | 3 | This is an implementation of 4 | [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 5 | in [PyTorch](https://pytorch.org). 
6 | 7 | Transformer has a limited attention span, 8 | equal to the length of the sequence trained in parallel. 9 | All these positions have a fixed positional encoding. 10 | Transformer XL increases this attention span by letting 11 | each of the positions pay attention to precalculated past embeddings. 12 | For instance if the context length is $l$, it will keep the embeddings of 13 | all layers for previous batch of length $l$ and feed them to current step. 14 | If we use fixed-positional encodings these pre-calculated embeddings will have 15 | the same positions as the current context. 16 | They introduce relative positional encoding, where the positional encodings 17 | are introduced at the attention calculation. 18 | 19 | Annotated implementation of relative multi-headed attention is in [`relative_mha.py`](https://nn.labml.ai/transformers/xl/relative_mha.html). 20 | 21 | Here's [the training code](https://nn.labml.ai/transformers/xl/experiment.html) and a notebook for training a transformer XL model on Tiny Shakespeare dataset. 22 | 23 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/xl/experiment.ipynb) 24 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/d3b6760c692e11ebb6a70242ac1c0002) 25 | -------------------------------------------------------------------------------- /utils/sitemap.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import git 4 | 5 | HOME = Path('./labml_nn') 6 | REPO = git.Repo('.') 7 | 8 | 9 | def collect(path: Path): 10 | if path.is_file(): 11 | try: 12 | commit = next(iter(REPO.iter_commits(paths=path))) 13 | except StopIteration: 14 | return [] 15 | 16 | html = path.relative_to(HOME) 17 | if html.stem == '__init__': 18 | html = html.parent / 'index.html' 19 | else: 20 | html = html.parent / f'{html.stem}.html' 21 | 22 | return [{'path': str(html), 'date': str(commit.committed_datetime.date())}] 23 | 24 | urls = [] 25 | for f in path.iterdir(): 26 | urls += collect(f) 27 | 28 | return urls 29 | 30 | 31 | def main(): 32 | urls = [] 33 | for f in HOME.iterdir(): 34 | urls += collect(f) 35 | 36 | urls = [f''' 37 | 38 | https://nn.labml.ai/{u['path']} 39 | {u['date']}T16:30:00+00:00 40 | 1.00 41 | 42 | ''' for u in urls] 43 | 44 | urls = '\n'.join(urls) 45 | xml = f''' 46 | 47 | 52 | {urls} 53 | 54 | ''' 55 | 56 | with open(str(HOME.parent / 'docs' / 'sitemap.xml'), 'w') as f: 57 | f.write(xml) 58 | 59 | 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("readme.md", "r") as f: 4 | long_description = f.read() 5 | 6 | setuptools.setup( 7 | name='labml-nn', 8 | version='0.4.99', 9 | author="Varuna Jayasiri, Nipun Wijerathne", 10 | author_email="vpjayasiri@gmail.com, hnipun@gmail.com", 11 | description="A collection of PyTorch implementations of neural network architectures and layers.", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/lab-ml/nn", 15 | project_urls={ 16 | 'Documentation': 'https://lab-ml.com/' 17 | }, 18 | packages=setuptools.find_packages(exclude=('labml', 'labml.*', 19 | 'labml_samples', 'labml_samples.*', 20 | 'labml_helpers', 
'labml_helpers.*', 21 | 'test', 22 | 'test.*')), 23 | install_requires=['labml>=0.4.110', 24 | 'labml-helpers>=0.4.77', 25 | 'torch', 26 | 'einops', 27 | 'numpy'], 28 | classifiers=[ 29 | "Programming Language :: Python :: 3", 30 | "License :: OSI Approved :: MIT License", 31 | 'Intended Audience :: Developers', 32 | 'Intended Audience :: Science/Research', 33 | 'Topic :: Scientific/Engineering', 34 | 'Topic :: Scientific/Engineering :: Mathematics', 35 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 36 | 'Topic :: Software Development', 37 | 'Topic :: Software Development :: Libraries', 38 | 'Topic :: Software Development :: Libraries :: Python Modules', 39 | ], 40 | keywords='machine learning', 41 | ) 42 | -------------------------------------------------------------------------------- /labml_nn/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Utilities 4 | summary: A bunch of utility functions and classes 5 | --- 6 | 7 | # Utilities 8 | """ 9 | 10 | import copy 11 | 12 | from torch.utils.data import Dataset, IterableDataset 13 | 14 | from labml_helpers.module import M, TypedModuleList 15 | 16 | 17 | def clone_module_list(module: M, n: int) -> TypedModuleList[M]: 18 | """ 19 | ## Clone Module 20 | 21 | Make a `nn.ModuleList` with clones of a given module 22 | """ 23 | return TypedModuleList([copy.deepcopy(module) for _ in range(n)]) 24 | 25 | 26 | def cycle_dataloader(data_loader): 27 | """ 28 | 29 | ## Cycle Data Loader 30 | 31 | Infinite loader that recycles the data loader after each epoch 32 | """ 33 | while True: 34 | for batch in data_loader: 35 | yield batch 36 | 37 | 38 | class MapStyleDataset(Dataset): 39 | """ 40 | 41 | ## Map Style Dataset 42 | 43 | This converts an [`IterableDataset`](https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset) 44 | to a [map-style dataset](https://pytorch.org/docs/stable/data.html#map-style-datasets) 45 | so that we can shuffle the dataset. 46 | 47 | *This only works when the dataset size is small and can be held in memory.* 48 | """ 49 | 50 | def __init__(self, dataset: IterableDataset): 51 | # Load the data to memory 52 | self.data = [d for d in dataset] 53 | 54 | def __getitem__(self, idx: int): 55 | """Get a sample by index""" 56 | return self.data[idx] 57 | 58 | def __iter__(self): 59 | """Create an iterator""" 60 | return iter(self.data) 61 | 62 | def __len__(self): 63 | """Size of the dataset""" 64 | return len(self.data) 65 | -------------------------------------------------------------------------------- /labml_nn/optimizers/performance_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Test performance of Adam implementations 4 | summary: This experiment compares performance of Adam implementations. 
5 | --- 6 | 7 | # Performance testing Adam 8 | 9 | ``` 10 | TorchAdam warmup...[DONE] 222.59ms 11 | TorchAdam...[DONE] 1,356.01ms 12 | MyAdam warmup...[DONE] 119.15ms 13 | MyAdam...[DONE] 1,192.89ms 14 | ``` 15 | 16 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ngowaAsADj8VdZfBifu_6L6rtjGoEeoR?usp=sharing) 17 | """ 18 | 19 | import torch 20 | import torch.nn as nn 21 | from labml_helpers.device import DeviceInfo 22 | from torch.optim import Adam as TorchAdam 23 | 24 | from labml import monit 25 | from labml_nn.optimizers.adam import Adam as MyAdam 26 | from labml_nn.optimizers.mnist_experiment import Model 27 | 28 | 29 | def test(): 30 | device_info = DeviceInfo(use_cuda=True, cuda_device=0) 31 | print(device_info) 32 | inp = torch.randn((64, 1, 28, 28), device=device_info.device) 33 | target = torch.ones(64, dtype=torch.long, device=device_info.device) 34 | loss_func = nn.CrossEntropyLoss() 35 | model = Model().to(device_info.device) 36 | my_adam = MyAdam(model.parameters()) 37 | torch_adam = TorchAdam(model.parameters()) 38 | loss = loss_func(model(inp), target) 39 | loss.backward() 40 | with monit.section('MyAdam warmup'): 41 | for i in range(100): 42 | my_adam.step() 43 | with monit.section('MyAdam'): 44 | for i in range(1000): 45 | my_adam.step() 46 | with monit.section('TorchAdam warmup'): 47 | for i in range(100): 48 | torch_adam.step() 49 | with monit.section('TorchAdam'): 50 | for i in range(1000): 51 | torch_adam.step() 52 | 53 | 54 | if __name__ == '__main__': 55 | test() 56 | -------------------------------------------------------------------------------- /labml_nn/transformers/knn/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: k-Nearest Neighbor Language Models 4 | summary: > 5 | This is a simple PyTorch implementation/tutorial of the paper 6 | Generalization through Memorization: Nearest Neighbor Language Models using FAISS. 7 | It runs a kNN model on the final transformer layer embeddings to improve the 8 | loss of transformer based language models. 9 | It's also great for domain adaptation without pre-training. 10 | --- 11 | 12 | # k-Nearest Neighbor Language Models 13 | 14 | This is a [PyTorch](https://pytorch.org) implementation of the paper 15 | [Generalization through Memorization: Nearest Neighbor Language Models](https://arxiv.org/abs/1911.00172). 16 | It uses k-nearest neighbors to improve perplexity of autoregressive transformer models. 17 | 18 | An autoregressive language model estimates $p(w_t | \color{yellowgreen}{c_t})$, 19 | where $w_t$ is the token at step $t$ 20 | and $c_t$ is the context, $\color{yellowgreen}{c_t} = (w_1, w_2, ..., w_{t-1})$. 21 | 22 | This paper, improves $p(w_t | \color{yellowgreen}{c_t})$ using a k-nearest neighbor search 23 | on key-value pairs $\big(f(c_i), w_i\big)$, with search key $f(\color{yellowgreen}{c_t})$. 24 | Here $f(\color{yellowgreen}{c_t})$ is an embedding of the context $\color{yellowgreen}{c_t}$. 25 | The paper (and this implementation) uses the **input to the feed-forward layer of the 26 | final layer of the transformer** as $f(\color{yellowgreen}{c_t})$. 27 | 28 | We use [FAISS](https://github.com/facebookresearch/faiss) to index $f(c_i)$. 
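As a rough sketch of the indexing step (the shapes and names below are illustrative; [building the index](build_index.html) is the actual implementation), the keys $f(c_i)$ go into a FAISS index and query embeddings retrieve the nearest keys:

```python
import faiss
import numpy as np

# illustrative shapes: `keys` holds f(c_i) for the indexed contexts
d = 512
keys = np.random.rand(10_000, d).astype('float32')

index = faiss.IndexFlatL2(d)  # exact L2 nearest-neighbor index over d-dimensional keys
index.add(keys)               # add all f(c_i)

# query with a batch of f(c_t) embeddings for the 10 nearest keys
queries = np.random.rand(8, d).astype('float32')
distances, neighbors = index.search(queries, 10)
```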
29 | 
30 | ### Implementation
31 | 
32 | So to run $k$NN-LM we need to:
33 | 
34 | * [Train a transformer model](train_model.html)
35 | * [Build an index](build_index.html) of $\big(f(c_i), w_i\big)$
36 | * [Evaluate kNN-LM](eval_knn.html) using $k$NN search on $\big(f(c_i), w_i\big)$
37 |   with $f(\color{yellowgreen}{c_t})$
38 | 
39 | This experiment uses a small dataset so that we can run this without using up a few hundred gigabytes
40 | of disk space for the index.
41 | 
42 | The official implementation of $k$NN-LM can be found [here](https://github.com/urvashik/knnlm).
43 | """
44 | 
--------------------------------------------------------------------------------
/labml_nn/transformers/switch/readme.md:
--------------------------------------------------------------------------------
1 | # [Switch Transformer](https://nn.labml.ai/transformers/switch/index.html)
2 | 
3 | This is a miniature [PyTorch](https://pytorch.org) implementation of the paper
4 | [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961).
5 | Our implementation only has a few million parameters and doesn't do model parallel distributed training.
6 | It does single GPU training, but we implement the concept of switching as described in the paper.
7 | 
8 | The Switch Transformer uses different parameters for each token by switching among parameters
9 | based on the token.
10 | Therefore, only a fraction of parameters are chosen for each token.
11 | So you can have more parameters but less computational cost.
12 | 
13 | The switching happens at the Position-wise Feedforward network (FFN) of each transformer block.
14 | The position-wise feedforward network consists of two fully connected layers applied sequentially.
15 | In the Switch Transformer we have multiple FFNs (multiple experts),
16 | and we choose which one to use based on a router.
17 | The output is a set of probabilities for picking an FFN,
18 | and we pick the one with the highest probability and only evaluate that.
19 | So essentially the computational cost is the same as having a single FFN.
20 | In our implementation this doesn't parallelize well when you have many or large FFNs since it's all
21 | happening on a single GPU.
22 | In a distributed setup you would have each FFN (each very large) on a different device.
23 | 
24 | The paper introduces another loss term to balance load among the experts (FFNs) and
25 | discusses dropping tokens when routing is not balanced.
26 | 
27 | Here's [the training code](experiment.html) and a notebook for training a switch transformer on the Tiny Shakespeare dataset.
28 | 
29 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/switch/experiment.ipynb)
30 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/c4656c605b9311eba13d0242ac1c0002)
31 | 
--------------------------------------------------------------------------------
/labml_nn/normalization/weight_standardization/conv2d.py:
--------------------------------------------------------------------------------
1 | """
2 | ---
3 | title: 2D Convolution Layer with Weight Standardization
4 | summary: >
5 |  A PyTorch implementation/tutorial of a 2D Convolution Layer with Weight Standardization.
6 | --- 7 | 8 | # 2D Convolution Layer with Weight Standardization 9 | 10 | This is an implementation of a 2 dimensional convolution layer with [Weight Standardization](./index.html) 11 | """ 12 | 13 | import torch 14 | import torch.nn as nn 15 | from torch.nn import functional as F 16 | 17 | from labml_nn.normalization.weight_standardization import weight_standardization 18 | 19 | 20 | class Conv2d(nn.Conv2d): 21 | """ 22 | ## 2D Convolution Layer 23 | 24 | This extends the standard 2D Convolution layer and standardize the weights before the convolution step. 25 | """ 26 | def __init__(self, in_channels, out_channels, kernel_size, 27 | stride=1, 28 | padding=0, 29 | dilation=1, 30 | groups: int = 1, 31 | bias: bool = True, 32 | padding_mode: str = 'zeros', 33 | eps: float = 1e-5): 34 | super(Conv2d, self).__init__(in_channels, out_channels, kernel_size, 35 | stride=stride, 36 | padding=padding, 37 | dilation=dilation, 38 | groups=groups, 39 | bias=bias, 40 | padding_mode=padding_mode) 41 | self.eps = eps 42 | 43 | def forward(self, x: torch.Tensor): 44 | return F.conv2d(x, weight_standardization(self.weight, self.eps), self.bias, self.stride, 45 | self.padding, self.dilation, self.groups) 46 | 47 | 48 | def _test(): 49 | """ 50 | A simple test to verify the tensor sizes 51 | """ 52 | conv2d = Conv2d(10, 20, 5) 53 | from labml.logger import inspect 54 | inspect(conv2d.weight) 55 | import torch 56 | inspect(conv2d(torch.zeros(10, 10, 100, 100))) 57 | 58 | 59 | if __name__ == '__main__': 60 | _test() 61 | -------------------------------------------------------------------------------- /labml_nn/normalization/batch_norm/cifar10.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: CIFAR10 Experiment to try Group Normalization 4 | summary: > 5 | This trains is a simple convolutional neural network that uses group normalization 6 | to classify CIFAR10 images. 
7 | --- 8 | 9 | # CIFAR10 Experiment for Group Normalization 10 | """ 11 | 12 | import torch.nn as nn 13 | 14 | from labml import experiment 15 | from labml.configs import option 16 | from labml_helpers.module import Module 17 | from labml_nn.experiments.cifar10 import CIFAR10Configs 18 | from labml_nn.normalization.batch_norm import BatchNorm 19 | 20 | 21 | class Model(Module): 22 | def __init__(self): 23 | super().__init__() 24 | layers = [] 25 | in_channels = 3 26 | for block in [[64, 64], [128, 128], [256, 256, 256], [512, 512, 512], [512, 512, 512]]: 27 | for channels in block: 28 | layers += [nn.Conv2d(in_channels, channels, kernel_size=3, padding=1), 29 | BatchNorm(channels), 30 | nn.ReLU(inplace=True)] 31 | in_channels = channels 32 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 33 | layers += [nn.AvgPool2d(kernel_size=1, stride=1)] 34 | self.layers = nn.Sequential(*layers) 35 | self.fc = nn.Linear(512, 10) 36 | 37 | def __call__(self, x): 38 | x = self.layers(x) 39 | x = x.view(x.shape[0], -1) 40 | return self.fc(x) 41 | 42 | 43 | @option(CIFAR10Configs.model) 44 | def model(c: CIFAR10Configs): 45 | """ 46 | ### Create model 47 | """ 48 | return Model().to(c.device) 49 | 50 | 51 | def main(): 52 | # Create experiment 53 | experiment.create(name='cifar10', comment='batch norm') 54 | # Create configurations 55 | conf = CIFAR10Configs() 56 | # Load configurations 57 | experiment.configs(conf, { 58 | 'optimizer.optimizer': 'Adam', 59 | 'optimizer.learning_rate': 2.5e-4, 60 | 'train_batch_size': 64, 61 | }) 62 | # Start the experiment and run the training loop 63 | with experiment.start(): 64 | conf.run() 65 | 66 | 67 | # 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /labml_nn/transformers/feedback/readme.md: -------------------------------------------------------------------------------- 1 | # [Feedback Transformer](https://nn.labml.ai/transformers/feedback/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation of the paper 4 | [Accessing Higher-level Representations in Sequential Transformers with Feedback Memory](https://arxiv.org/abs/2002.09402). 5 | 6 | Normal transformers process tokens in parallel. Each transformer layer pays attention 7 | to the outputs of the previous layer. 8 | Feedback transformer pays attention to the output of all layers in previous steps. 9 | So this adds recurrence, and we need to process token-by-token. 10 | This slows down the training significantly (about 5X - 10X depending on the sequence length). 11 | However, when predicting Feedback Transformer is faster because you can predict the next token 12 | if you cache the memory vectors. 13 | 14 | In order to speed up the training the paper discusses starting with a short sequence length and 15 | gradually increasing it. 16 | They also discuss using a pretrained parallel transformer as the starting point. 17 | 18 | The original feedback transformer doesn't keep the outputs of all layers. 19 | Instead it keeps weighted sum of the output of all layers. 20 | This reduces the memory used for caching during prediction. 21 | The first half of this file implements this. 22 | 23 | The updated feedback transformer shares weights used 24 | to calculate keys and values among the layers. 25 | We then calculate the keys and values for each step only once and keep 26 | them cached. 27 | The [second half](#shared_kv) of this file implements this. 
28 | We implemented a custom PyTorch function to improve performance. 29 | 30 | Here's [the training code](experiment.html) and a notebook for training a feedback transformer on Tiny Shakespeare dataset. 31 | 32 | [Colab Notebook](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/feedback/experiment.ipynb) 33 | 34 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/feedback/experiment.ipynb) 35 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/d8eb9416530a11eb8fb50242ac1c0002) 36 | -------------------------------------------------------------------------------- /labml_nn/cnn/cross_validation.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torchvision 4 | import torchvision.transforms as transforms 5 | from torch.utils.data.sampler import SubsetRandomSampler 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import torch.optim as optim 9 | from torchsummary import summary 10 | import torch.nn as nn 11 | 12 | # from models.mlp import MLP 13 | # from utils.utils import * 14 | # from utils.train_dataset import * 15 | #from nutsflow import Take, Consume 16 | #from nutsml import * 17 | from utils.dataloader import * 18 | from models.cnn import CNN 19 | from utils.train import Trainer 20 | 21 | from utils.cv_train import * 22 | 23 | # Check if GPU is available 24 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 25 | print("Device: " + str(device)) 26 | 27 | # Cifar 10 Datasets location 28 | save='./data/Cifar10' 29 | 30 | # Transformations train 31 | transform_train = transforms.Compose( 32 | [transforms.ToTensor(), 33 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 34 | 35 | # Load train dataset and dataloader 36 | trainset = LoadCifar10DatasetTrain(save, transform_train) 37 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, 38 | shuffle=True, num_workers=4) 39 | 40 | # Transformations test (for inference later) 41 | transform_test = transforms.Compose( 42 | [transforms.ToTensor(), 43 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 44 | 45 | # Load test dataset and dataloader (for inference later) 46 | testset = LoadCifar10DatasetTest(save, transform_test) 47 | testloader = torch.utils.data.DataLoader(testset, batch_size=64, 48 | shuffle=False, num_workers=4) 49 | 50 | # Specify loss function 51 | cost = nn.CrossEntropyLoss() 52 | 53 | epochs=25 #10 54 | splits = 4 #5 55 | 56 | # Training - Cross-validation 57 | history = cross_val_train(cost, trainset, epochs, splits, device=device) 58 | 59 | # Inference 60 | best_model, best_val_accuracy = retreive_best_trial() 61 | print("Best Validation Accuracy = %.3f"%(best_val_accuracy)) 62 | 63 | # Testing 64 | accuracy = Test(best_model, cost, testloader, device=device) 65 | print("Test Accuracy = %.3f"%(accuracy['val_acc'])) 66 | -------------------------------------------------------------------------------- /labml_nn/resnets/pretrained_nets.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | 3 | from utils.train import Trainer # Default custom training class 4 | from models.resnet import * 5 | from torchvision import models 6 | 7 | # GPU Check 8 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 9 | print("Device: " + str(device)) 10 | 
11 | # Use different train/test data augmentations 12 | transform_test = transforms.Compose( 13 | [transforms.ToTensor(), 14 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 15 | 16 | # Get Cifar 10 Datasets 17 | save='./data/Cifar10' 18 | transform_train = transforms.Compose([ 19 | transforms.RandomHorizontalFlip(p=1.0), 20 | transforms.RandomRotation(20), 21 | transforms.RandomCrop(32, (2, 2), pad_if_needed=False, padding_mode='constant'), 22 | transforms.ToTensor(), 23 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 24 | 25 | # Get Cifar 10 Datasets 26 | trainset = torchvision.datasets.CIFAR10(root=save, train=True, download=True, transform=transform_train) 27 | testset = torchvision.datasets.CIFAR10(root=save, train=False, download=True, transform=transform_test) 28 | 29 | # Get Cifar 10 Dataloaders 30 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, 31 | shuffle=True, num_workers=4) 32 | 33 | testloader = torch.utils.data.DataLoader(testset, batch_size=64, 34 | shuffle=False, num_workers=4) 35 | 36 | ################################# 37 | # Load the pre-trained model 38 | ################################# 39 | 40 | model_ft = models.resnet18(pretrained=True) 41 | num_ftrs = model_ft.fc.in_features 42 | model_ft.fc = nn.Sequential( 43 | nn.Dropout(0.5), 44 | nn.Linear(num_ftrs, 10) 45 | ) 46 | 47 | 48 | model_ft = model_ft.to(device) 49 | 50 | # Loss function 51 | cost = nn.CrossEntropyLoss() 52 | 53 | # Optimizer 54 | lr = 0.0005 55 | # opt = optim.SGD(model_ft.parameters(), lr=lr, momentum=0.9) 56 | opt = torch.optim.Adam(model_ft.parameters(), lr=lr, betas=(0.9, 0.95), weight_decay=1e-4) #0.0005 l2_factor.item() 57 | 58 | # Create a trainer 59 | trainer = Trainer(model_ft, opt, cost, name="Transfer-learning",lr=lr , use_lr_schedule=True, device=device) 60 | 61 | # Run training 62 | epochs = 25 63 | trainer.Train(trainloader, epochs, testloader=testloader) 64 | # trainer.Train(trainloader, epochs) # check train error 65 | 66 | print('done') 67 | -------------------------------------------------------------------------------- /labml_nn/transformers/compressive/readme.md: -------------------------------------------------------------------------------- 1 | # [Compressive Transformer](https://nn.labml.ai/transformers/compressive/index.html) 2 | 3 | This is an implementation of 4 | [Compressive Transformers for Long-Range Sequence Modelling](https://arxiv.org/abs/1911.05507) 5 | in [PyTorch](https://pytorch.org). 6 | 7 | This is an extension of [Transformer XL](https://nn.labml.ai/transformers/xl/index.html) where past memories 8 | are compressed to give a longer attention range. 9 | That is, the furthest $n_{cm} c$ memories are compressed into 10 | $n_{cm}$ memories, where $c$ is the compression rate. 11 | 12 | ## Compression operation 13 | 14 | The compression operation is defined as 15 | $f_c: \mathbb{R}^{nc \times d} \rightarrow \mathbb{R}^{n \times d}$. 16 | The paper introduces multiple choices for $f_c$ and we have only implemented 17 | 1D convolution which seems to give the best results. 18 | Each layer has a separate compression operation $f_c^{(i)}$ where 19 | $i$ is the layer number. 20 | 21 | ## Training compression operation 22 | 23 | Since training compression with BPTT requires maintaining 24 | a very large computational graph (many time steps), the paper proposes 25 | an *auto-encoding loss* and an *attention reconstruction loss*. 
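As a rough illustration of the 1D-convolution compression function $f_c$ described in the previous section (the class name and tensor layout are assumptions, not this repository's exact code):

```python
import torch
import torch.nn as nn


class ConvCompression(nn.Module):
    """Compress `n * c` memories of size `d_model` into `n` memories, where
    `c` is the compression rate (illustrative sketch only)."""

    def __init__(self, d_model: int, compression_rate: int):
        super().__init__()
        # Kernel size and stride both equal to the compression rate `c`
        self.conv = nn.Conv1d(d_model, d_model,
                              kernel_size=compression_rate, stride=compression_rate)

    def forward(self, mem: torch.Tensor) -> torch.Tensor:
        # `mem` has shape `[seq_len, batch_size, d_model]`
        mem = mem.permute(1, 2, 0)   # -> `[batch_size, d_model, seq_len]`
        mem = self.conv(mem)         # -> `[batch_size, d_model, seq_len // c]`
        return mem.permute(2, 0, 1)  # -> `[seq_len // c, batch_size, d_model]`
```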
26 | The auto-encoding loss decodes the original memories from the compressed memories 27 | and calculates the loss. 28 | Attention reconstruction loss computes the multi-headed attention results 29 | on the compressed memory and on uncompressed memory and gets a mean squared error 30 | between them. 31 | We have implemented the latter here since it gives better results. 32 | 33 | This implementation uses pre-layer normalization 34 | while the paper uses post-layer normalization. 35 | Pre-layer norm does the layer norm before FFN[../feedforward.html) and 36 | self-attention, and the pass-through in the residual connection is not normalized. 37 | This is supposed to be more stable in standard transformer setups. 38 | 39 | Here are [the training code](https://nn.labml.ai/transformers/compressive/experiment.html) and a notebook for training a compressive transformer 40 | model on the Tiny Shakespeare dataset. 41 | 42 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/compressive/experiment.ipynb) 43 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/0d9b5338726c11ebb7c80242ac1c0002) 44 | -------------------------------------------------------------------------------- /labml_nn/transformers/label_smoothing_loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Label Smoothing Loss 4 | summary: > 5 | This is an implementation of label smoothing loss, that can be used as 6 | an alternative to cross entropy loss for improved accuracy. 7 | --- 8 | 9 | # Label Smoothing Loss 10 | """ 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | import torch 14 | import torch.nn as nn 15 | 16 | from labml_helpers.module import Module 17 | 18 | 19 | class LabelSmoothingLoss(Module): 20 | def __init__(self, size: int, padding_idx: int, smoothing: float = 0.0): 21 | super().__init__() 22 | self.loss = nn.KLDivLoss(reduction='sum') 23 | self.padding_idx = padding_idx 24 | self.confidence = 1.0 - smoothing 25 | self.smoothing = smoothing 26 | self.size = size 27 | self.true_dist = None 28 | 29 | def forward(self, x: torch.Tensor, target: torch.Tensor): 30 | assert x.shape[1] == self.size 31 | true_dist = x.clone() 32 | true_dist.fill_(self.smoothing / (self.size - 2)) 33 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 34 | true_dist[:, self.padding_idx] = 0 35 | mask = torch.nonzero(target == self.padding_idx, as_tuple=False) 36 | if mask.dim() > 0: 37 | true_dist.index_fill_(0, mask.squeeze(), 0.0) 38 | self.true_dist = true_dist 39 | return self.loss(x, true_dist.detach()) 40 | 41 | 42 | def _test_label_smoothing(): 43 | smooth_loss = LabelSmoothingLoss(5, 0, 0.4) 44 | predict = torch.tensor([[0, 0.2, 0.7, 0.1, 0], 45 | [0, 0.2, 0.7, 0.1, 0], 46 | [0, 0.2, 0.7, 0.1, 0]], dtype=torch.float) 47 | _ = smooth_loss(predict.log(), 48 | torch.tensor([2, 1, 0], dtype=torch.long)) 49 | 50 | # Show the target distributions expected by the system. 
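    # For example, with `size=5`, `padding_idx=0` and `smoothing=0.4`, the confidence is
    # $1 - 0.4 = 0.6$ and each remaining class gets $0.4 / (5 - 2) \approx 0.1333$,
    # so the target row for label 2 is roughly `[0, 0.1333, 0.6, 0.1333, 0.1333]`
    # (the padding column is zeroed, and rows whose label equals the padding index are zeroed entirely).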
51 | plt.imshow(smooth_loss.true_dist) 52 | plt.show() 53 | 54 | smooth_loss = LabelSmoothingLoss(5, 0, 0.1) 55 | 56 | def loss_sample(x): 57 | d = x + 3 * 1 58 | predict2 = torch.tensor([[0, x / d, 1 / d, 1 / d, 1 / d], 59 | ], dtype=torch.float) 60 | # print(predict) 61 | return smooth_loss(predict2.log(), 62 | torch.tensor([1], dtype=torch.long)).item() 63 | 64 | plt.plot(np.arange(1, 100), [loss_sample(x) for x in range(1, 100)]) 65 | plt.show() 66 | 67 | 68 | if __name__ == '__main__': 69 | _test_label_smoothing() 70 | -------------------------------------------------------------------------------- /labml_nn/optimizers/adam_warmup.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Adam optimizer with warm-up 4 | summary: A simple PyTorch implementation/tutorial of Adam optimizer with warm-up. 5 | --- 6 | 7 | # Adam Optimizer with Warmup 8 | 9 | This extends [AMSGrad optimizer](amsgrad.html) and adds a warmup stage. 10 | """ 11 | 12 | from typing import Dict 13 | 14 | from labml_nn.optimizers import WeightDecay 15 | from labml_nn.optimizers.amsgrad import AMSGrad 16 | 17 | 18 | class AdamWarmup(AMSGrad): 19 | """ 20 | ## Adam Optimizer with Warmup 21 | 22 | This class extends from AMSGrad optimizer defined in [`amsgrad.py`](amsgrad.html). 23 | """ 24 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, 25 | weight_decay: WeightDecay = WeightDecay(), 26 | optimized_update: bool = True, 27 | amsgrad=False, warmup=0, defaults=None): 28 | """ 29 | ### Initialize the optimizer 30 | 31 | * `params` is the list of parameters 32 | * `lr` is the learning rate $\alpha$ 33 | * `betas` is a tuple of ($\beta_1$, $\beta_2$) 34 | * `eps` is $\hat{\epsilon}$ or $\epsilon$ based on `optimized_update` 35 | * `weight_decay` is an instance of class `WeightDecay` defined in [`__init__.py`](index.html) 36 | * 'optimized_update' is a flag whether to optimize the bias correction of the second moment 37 | by doing it after adding $\epsilon$ 38 | * `amsgrad` is a flag indicating whether to use AMSGrad or fallback to plain Adam 39 | * `warmup` number of warmup steps 40 | * `defaults` is a dictionary of default for group values. 41 | This is useful when you want to extend the class `AdamWarmup`. 42 | """ 43 | 44 | defaults = {} if defaults is None else defaults 45 | defaults.update(dict(warmup=warmup)) 46 | super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults) 47 | 48 | def get_lr(self, state: Dict[str, any], group: Dict[str, any]): 49 | """ 50 | ### Get learning-rate 51 | 52 | $$\alpha \min \bigg(1, \frac{t}{w}\bigg)$$ 53 | where $w$ is the number of warmup steps. 54 | """ 55 | # If we are in warmup stage 56 | if group['warmup'] > state['step']: 57 | # A linearly increasing learning rate from $0$ to $\alpha$ 58 | return 1e-8 + state['step'] * group['lr'] / group['warmup'] 59 | else: 60 | # Constant learning rate $\alpha$ 61 | return group['lr'] 62 | -------------------------------------------------------------------------------- /labml_nn/normalization/weight_standardization/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: CIFAR10 Experiment to try Weight Standardization and Batch-Channel Normalization 4 | summary: > 5 | This trains is a VGG net that uses weight standardization and batch-channel normalization 6 | to classify CIFAR10 images. 
7 | --- 8 | 9 | # CIFAR10 Experiment to try Weight Standardization and Batch-Channel Normalization 10 | """ 11 | 12 | import torch.nn as nn 13 | 14 | from labml import experiment 15 | from labml.configs import option 16 | from labml_helpers.module import Module 17 | from labml_nn.experiments.cifar10 import CIFAR10Configs 18 | from labml_nn.normalization.batch_channel_norm import BatchChannelNorm 19 | from labml_nn.normalization.weight_standardization.conv2d import Conv2d 20 | 21 | 22 | class Model(Module): 23 | """ 24 | ### Model 25 | 26 | A VGG model that use [Weight Standardization](./index.html) and 27 | [Batch-Channel Normalization](../batch_channel_norm/index.html). 28 | """ 29 | def __init__(self): 30 | super().__init__() 31 | layers = [] 32 | in_channels = 3 33 | for block in [[64, 64], [128, 128], [256, 256, 256], [512, 512, 512], [512, 512, 512]]: 34 | for channels in block: 35 | layers += [Conv2d(in_channels, channels, kernel_size=3, padding=1), 36 | BatchChannelNorm(channels, 32), 37 | nn.ReLU(inplace=True)] 38 | in_channels = channels 39 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 40 | layers += [nn.AvgPool2d(kernel_size=1, stride=1)] 41 | self.layers = nn.Sequential(*layers) 42 | self.fc = nn.Linear(512, 10) 43 | 44 | def __call__(self, x): 45 | x = self.layers(x) 46 | x = x.view(x.shape[0], -1) 47 | return self.fc(x) 48 | 49 | 50 | @option(CIFAR10Configs.model) 51 | def model(c: CIFAR10Configs): 52 | """ 53 | ### Create model 54 | """ 55 | return Model().to(c.device) 56 | 57 | 58 | def main(): 59 | # Create experiment 60 | experiment.create(name='cifar10', comment='weight standardization') 61 | # Create configurations 62 | conf = CIFAR10Configs() 63 | # Load configurations 64 | experiment.configs(conf, { 65 | 'optimizer.optimizer': 'Adam', 66 | 'optimizer.learning_rate': 2.5e-4, 67 | 'train_batch_size': 64, 68 | }) 69 | # Start the experiment and run the training loop 70 | with experiment.start(): 71 | conf.run() 72 | 73 | 74 | # 75 | if __name__ == '__main__': 76 | main() 77 | -------------------------------------------------------------------------------- /labml_nn/resnets/utils/utils.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | 3 | import torch 4 | import torchvision 5 | import torchvision.transforms as transforms 6 | 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | 13 | from sklearn.model_selection import KFold 14 | from torch.utils.data.sampler import SubsetRandomSampler 15 | 16 | 17 | 18 | # Plot the loss of multiple runs together 19 | def PlotLosses(losses, titles, save=None): 20 | fig = plt.figure() 21 | fig.set_size_inches(14, 22) 22 | # Plot results on 3 subgraphs 23 | # subplot integers: 24 | # nrows 25 | # ncols 26 | # index 27 | sublplot_str_start = "" + str(len(losses)) + "1" 28 | 29 | for i in range(len(losses)): 30 | subplot = sublplot_str_start + str(i+1) 31 | loss = losses[i] 32 | title = titles[i] 33 | 34 | ax = plt.subplot(int(subplot)) 35 | ax.plot(range(len(loss)), loss) 36 | ax.set_xlabel("Epoch") 37 | ax.set_title(title) 38 | ax.set_ylabel("Loss") 39 | 40 | # Save Figure 41 | if save: 42 | plt.savefig(save) 43 | else: 44 | plt.show() 45 | 46 | 47 | 48 | def ClassSpecificTestCifar10(net, testdata, device=None): 49 | classes = ('plane', 'car', 'bird', 'cat','deer', 'dog', 'frog', 'horse', 'ship', 'truck') 50 | class_correct = list(0. for i in range(10)) 51 | class_total = list(0. 
for i in range(10)) 52 | with torch.no_grad(): 53 | for data in testdata: 54 | if device: 55 | images, labels = data[0].to(device), data[1].to(device) 56 | else: 57 | images, labels = data 58 | 59 | outputs = net(images) 60 | _, predicted = torch.max(outputs, 1) 61 | c = (predicted == labels).squeeze() 62 | for i in range(4): 63 | label = labels[i] 64 | class_correct[label] += c[i].item() 65 | class_total[label] += 1 66 | 67 | # Print out 68 | for i in range(10): 69 | print('Accuracy of %5s : %2d %%' % ( 70 | classes[i], 100 * class_correct[i] / class_total[i])) 71 | 72 | 73 | 74 | def GetActivation(name="relu"): 75 | if name == "relu": 76 | return nn.ReLU() 77 | elif name == "leakyrelu": 78 | return nn.LeakyReLU() 79 | elif name == "Sigmoid": 80 | return nn.Sigmoid() 81 | elif name == "Tanh": 82 | return nn.Tanh() 83 | elif name == "Identity": 84 | return nn.Identity() 85 | else: 86 | return nn.ReLU() -------------------------------------------------------------------------------- /labml_nn/transformers/positional_encoding.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Fixed Positional Encodings 4 | summary: > 5 | Implementation with explanation of fixed positional encodings as 6 | described in paper Attention is All You Need. 7 | --- 8 | 9 | # Fixed Positional Encodings 10 | 11 | The positional encoding encodes the position along the sequence into 12 | a vector of size `d_model`. 13 | 14 | \begin{align} 15 | PE_{p,2i} &= sin\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg) \\ 16 | PE_{p,2i + 1} &= cos\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg) 17 | \end{align} 18 | 19 | Where $1 \leq 2i, 2i + 1 \leq d_{model}$ 20 | are the feature indexes in the encoding, and $p$ is the position. 
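For example, a quick shape check with the helper defined below (illustrative usage only):

```python
pe = get_positional_encoding(d_model=512, max_len=100)
assert pe.shape == (100, 1, 512)  # `[max_len, 1, d_model]`, ready to be added to embeddings
```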
21 | """ 22 | 23 | import math 24 | 25 | import numpy as np 26 | import torch 27 | import torch.nn as nn 28 | 29 | from labml_helpers.module import Module 30 | 31 | 32 | class PositionalEncoding(Module): 33 | def __init__(self, d_model: int, dropout_prob: float, max_len: int = 5000): 34 | super().__init__() 35 | self.dropout = nn.Dropout(dropout_prob) 36 | 37 | self.register_buffer('positional_encodings', get_positional_encoding(d_model, max_len), False) 38 | 39 | def forward(self, x: torch.Tensor): 40 | pe = self.positional_encodings[:x.shape[0]].detach().requires_grad_(False) 41 | x = x + pe 42 | x = self.dropout(x) 43 | return x 44 | 45 | 46 | def get_positional_encoding(d_model: int, max_len: int = 5000): 47 | # Empty encodings vectors 48 | encodings = torch.zeros(max_len, d_model) 49 | # Position indexes 50 | position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1) 51 | # $2 * i$ 52 | two_i = torch.arange(0, d_model, 2, dtype=torch.float32) 53 | # $10000^{\frac{2i}{d_{model}}$ 54 | div_term = torch.exp(two_i * -(math.log(10000.0) / d_model)) 55 | # $PE_{p,2i} = sin\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg)$ 56 | encodings[:, 0::2] = torch.sin(position * div_term) 57 | # $PE_{p,2i + 1} = cos\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg)$ 58 | encodings[:, 1::2] = torch.cos(position * div_term) 59 | 60 | # Add batch dimension 61 | encodings = encodings.unsqueeze(1).requires_grad_(False) 62 | 63 | return encodings 64 | 65 | 66 | def _test_positional_encoding(): 67 | import matplotlib.pyplot as plt 68 | 69 | plt.figure(figsize=(15, 5)) 70 | pe = get_positional_encoding(20, 100) 71 | plt.plot(np.arange(100), pe[:, 0, 4:8].numpy()) 72 | plt.legend(["dim %d" % p for p in [4, 5, 6, 7]]) 73 | plt.title("Positional encoding") 74 | plt.show() 75 | 76 | 77 | if __name__ == '__main__': 78 | _test_positional_encoding() 79 | -------------------------------------------------------------------------------- /labml_nn/normalization/group_norm/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: CIFAR10 Experiment to try Group Normalization 4 | summary: > 5 | This trains is a simple convolutional neural network that uses group normalization 6 | to classify CIFAR10 images. 
7 | --- 8 | 9 | # CIFAR10 Experiment for Group Normalization 10 | """ 11 | 12 | import torch.nn as nn 13 | 14 | from labml import experiment 15 | from labml.configs import option 16 | from labml_helpers.module import Module 17 | from labml_nn.experiments.cifar10 import CIFAR10Configs 18 | from labml_nn.normalization.group_norm import GroupNorm 19 | 20 | 21 | class Model(Module): 22 | """ 23 | ### VGG model for CIFAR-10 classification 24 | """ 25 | 26 | def __init__(self, groups: int = 32): 27 | super().__init__() 28 | layers = [] 29 | # RGB channels 30 | in_channels = 3 31 | # Number of channels in each layer in each block 32 | for block in [[64, 64], [128, 128], [256, 256, 256], [512, 512, 512], [512, 512, 512]]: 33 | # Convolution, Normalization and Activation layers 34 | for channels in block: 35 | layers += [nn.Conv2d(in_channels, channels, kernel_size=3, padding=1), 36 | GroupNorm(groups, channels), 37 | nn.ReLU(inplace=True)] 38 | in_channels = channels 39 | # Max pooling at end of each block 40 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 41 | 42 | # Create a sequential model with the layers 43 | self.layers = nn.Sequential(*layers) 44 | # Final logits layer 45 | self.fc = nn.Linear(512, 10) 46 | 47 | def __call__(self, x): 48 | # The VGG layers 49 | x = self.layers(x) 50 | # Reshape for classification layer 51 | x = x.view(x.shape[0], -1) 52 | # Final linear layer 53 | return self.fc(x) 54 | 55 | 56 | class Configs(CIFAR10Configs): 57 | # Number of groups 58 | groups: int = 16 59 | 60 | 61 | @option(Configs.model) 62 | def model(c: Configs): 63 | """ 64 | ### Create model 65 | """ 66 | return Model(c.groups).to(c.device) 67 | 68 | 69 | def main(): 70 | # Create experiment 71 | experiment.create(name='cifar10', comment='group norm') 72 | # Create configurations 73 | conf = Configs() 74 | # Load configurations 75 | experiment.configs(conf, { 76 | 'optimizer.optimizer': 'Adam', 77 | 'optimizer.learning_rate': 2.5e-4, 78 | }) 79 | # Start the experiment and run the training loop 80 | with experiment.start(): 81 | conf.run() 82 | 83 | 84 | # 85 | if __name__ == '__main__': 86 | main() 87 | -------------------------------------------------------------------------------- /labml_nn/normalization/batch_norm/mnist.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: MNIST Experiment to try Batch Normalization 4 | summary: > 5 | This trains is a simple convolutional neural network that uses batch normalization 6 | to classify MNIST digits. 7 | --- 8 | 9 | # MNIST Experiment for Batch Normalization 10 | """ 11 | 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import torch.utils.data 15 | 16 | from labml import experiment 17 | from labml.configs import option 18 | from labml_helpers.module import Module 19 | from labml_nn.experiments.mnist import MNISTConfigs 20 | from labml_nn.normalization.batch_norm import BatchNorm 21 | 22 | 23 | class Model(Module): 24 | """ 25 | ### Model definition 26 | """ 27 | 28 | def __init__(self): 29 | super().__init__() 30 | # Note that we omit the bias parameter 31 | self.conv1 = nn.Conv2d(1, 20, 5, 1, bias=False) 32 | # Batch normalization with 20 channels (output of convolution layer). 33 | # The input to this layer will have shape `[batch_size, 20, height(24), width(24)]` 34 | self.bn1 = BatchNorm(20) 35 | # 36 | self.conv2 = nn.Conv2d(20, 50, 5, 1, bias=False) 37 | # Batch normalization with 50 channels. 
38 | # The input to this layer will have shape `[batch_size, 50, height(8), width(8)]` 39 | self.bn2 = BatchNorm(50) 40 | # 41 | self.fc1 = nn.Linear(4 * 4 * 50, 500, bias=False) 42 | # Batch normalization with 500 channels (output of fully connected layer). 43 | # The input to this layer will have shape `[batch_size, 500]` 44 | self.bn3 = BatchNorm(500) 45 | # 46 | self.fc2 = nn.Linear(500, 10) 47 | 48 | def __call__(self, x: torch.Tensor): 49 | x = F.relu(self.bn1(self.conv1(x))) 50 | x = F.max_pool2d(x, 2, 2) 51 | x = F.relu(self.bn2(self.conv2(x))) 52 | x = F.max_pool2d(x, 2, 2) 53 | x = x.view(-1, 4 * 4 * 50) 54 | x = F.relu(self.bn3(self.fc1(x))) 55 | return self.fc2(x) 56 | 57 | 58 | @option(MNISTConfigs.model) 59 | def model(c: MNISTConfigs): 60 | """ 61 | ### Create model 62 | 63 | We use [`MNISTConfigs`](../../experiments/mnist.html#MNISTConfigs) configurations 64 | and set a new function to calculate the model. 65 | """ 66 | return Model().to(c.device) 67 | 68 | 69 | def main(): 70 | # Create experiment 71 | experiment.create(name='mnist_batch_norm') 72 | # Create configurations 73 | conf = MNISTConfigs() 74 | # Load configurations 75 | experiment.configs(conf, {'optimizer.optimizer': 'Adam'}) 76 | # Start the experiment and run the training loop 77 | with experiment.start(): 78 | conf.run() 79 | 80 | 81 | # 82 | if __name__ == '__main__': 83 | main() 84 | -------------------------------------------------------------------------------- /labml_nn/normalization/instance_norm/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: CIFAR10 Experiment to try Instance Normalization 4 | summary: > 5 | This trains a simple convolutional neural network that uses instance normalization 6 | to classify CIFAR10 images. 7 | --- 8 | 9 | # CIFAR10 Experiment for Instance Normalization 10 | 11 | This demonstrates the use of an instance normalization layer in a convolutional 12 | neural network for classification. Note that instance normalization was designed for 13 | style transfer, and this is only a demo. 
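As a rough reference for what the layer computes (ignoring the learnable scale and shift; this is an illustrative sketch, not the `InstanceNorm` class imported below):

```python
import torch


def instance_norm_reference(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # `x` has shape `[batch_size, channels, height, width]`;
    # normalize each channel of each sample over the spatial dimensions
    mean = x.mean(dim=(2, 3), keepdim=True)
    var = x.var(dim=(2, 3), unbiased=False, keepdim=True)
    return (x - mean) / torch.sqrt(var + eps)
```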
14 | """ 15 | 16 | import torch.nn as nn 17 | 18 | from labml import experiment 19 | from labml.configs import option 20 | from labml_helpers.module import Module 21 | from labml_nn.experiments.cifar10 import CIFAR10Configs 22 | from labml_nn.normalization.instance_norm import InstanceNorm 23 | 24 | 25 | class Model(Module): 26 | """ 27 | ### VGG model for CIFAR-10 classification 28 | """ 29 | 30 | def __init__(self): 31 | super().__init__() 32 | layers = [] 33 | # RGB channels 34 | in_channels = 3 35 | # Number of channels in each layer in each block 36 | for block in [[64, 64], [128, 128], [256, 256, 256], [512, 512, 512], [512, 512, 512]]: 37 | # Convolution, Normalization and Activation layers 38 | for channels in block: 39 | layers += [nn.Conv2d(in_channels, channels, kernel_size=3, padding=1), 40 | InstanceNorm(channels), 41 | nn.ReLU(inplace=True)] 42 | in_channels = channels 43 | # Max pooling at end of each block 44 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 45 | 46 | # Create a sequential model with the layers 47 | self.layers = nn.Sequential(*layers) 48 | # Final logits layer 49 | self.fc = nn.Linear(512, 10) 50 | 51 | def __call__(self, x): 52 | # The VGG layers 53 | x = self.layers(x) 54 | # Reshape for classification layer 55 | x = x.view(x.shape[0], -1) 56 | # Final linear layer 57 | return self.fc(x) 58 | 59 | 60 | @option(CIFAR10Configs.model) 61 | def model(c: CIFAR10Configs): 62 | """ 63 | ### Create model 64 | """ 65 | return Model().to(c.device) 66 | 67 | 68 | def main(): 69 | # Create experiment 70 | experiment.create(name='cifar10', comment='instance norm') 71 | # Create configurations 72 | conf = CIFAR10Configs() 73 | # Load configurations 74 | experiment.configs(conf, { 75 | 'optimizer.optimizer': 'Adam', 76 | 'optimizer.learning_rate': 2.5e-4, 77 | }) 78 | # Start the experiment and run the training loop 79 | with experiment.start(): 80 | conf.run() 81 | 82 | 83 | # 84 | if __name__ == '__main__': 85 | main() 86 | -------------------------------------------------------------------------------- /labml_nn/resnets/resnet_net.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | 3 | # Custom classes 4 | from models.mlp import MLP 5 | from utils.train import Trainer 6 | from models.resnet import * 7 | 8 | # GPU Check 9 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 10 | print("Device: " + str(device)) 11 | 12 | #Use different train/test data augmentations 13 | transform_test = transforms.Compose( 14 | [transforms.ToTensor(), 15 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 16 | 17 | transform_train = transforms.Compose([ 18 | transforms.RandomHorizontalFlip(p=1.0), 19 | transforms.RandomRotation(20), 20 | transforms.RandomCrop(32, (2, 2), pad_if_needed=False, padding_mode='constant'), 21 | transforms.ToTensor(), 22 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 23 | 24 | 25 | # Get Cifar 10 Datasets 26 | save='./data/Cifar10' 27 | trainset = torchvision.datasets.CIFAR10(root=save, train=True, download=True, transform=transform_train) 28 | testset = torchvision.datasets.CIFAR10(root=save, train=False, download=True, transform=transform_test) 29 | 30 | # Get Cifar 10 Dataloaders 31 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, 32 | shuffle=True, num_workers=4) 33 | 34 | testloader = torch.utils.data.DataLoader(testset, batch_size=64, 35 | shuffle=False, num_workers=4) 36 | 37 | epochs = 50 38 | 39 | ################################# 40 
| # Create the assignment Resnet (part a) 41 | ################################# 42 | def MyResNet(): 43 | resnet = ResNet(in_features= [32, 32, 3], 44 | num_class=10, 45 | feature_channel_list = [128, 256, 512], 46 | batch_norm= True, 47 | num_stacks=1 48 | ) 49 | 50 | # Create MLP 51 | # Calculate the input shape 52 | s = resnet.GetCurShape() 53 | in_features = s[0]*s[1]*s[2] 54 | 55 | mlp = MLP(in_features, 56 | 10, 57 | [], #512, 1024, 512 58 | [], 59 | use_batch_norm=False, 60 | use_dropout=False, 61 | use_softmax=False, 62 | device=device) 63 | 64 | resnet.AddMLP(mlp) 65 | return resnet 66 | 67 | model = MyResNet() 68 | model.to(device=device) 69 | summary(model, (3, 32,32)) 70 | 71 | # Optimizer 72 | opt = torch.optim.Adam(model.parameters(), lr=0.0005, betas=(0.9, 0.95), weight_decay=1e-8) #0.0005 l2_factor.item() 73 | 74 | # Loss function 75 | cost = nn.CrossEntropyLoss() 76 | 77 | # Create a trainer 78 | trainer = Trainer(model, opt, cost, name="MyResNet", device=device, use_lr_schedule =True) 79 | 80 | # Run training 81 | trainer.Train(trainloader, epochs, testloader=testloader) 82 | 83 | print('done') 84 | -------------------------------------------------------------------------------- /labml_nn/resnets/models/mlp.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | class MLP(nn.Module): 7 | def __init__(self 8 | , in_features 9 | , out_features 10 | , hidden_layers 11 | , actv_func 12 | , pre_module_list=None 13 | , use_dropout=False 14 | , use_batch_norm=False 15 | , use_softmax=True 16 | , device="cpu" 17 | ): 18 | super(MLP, self).__init__() 19 | 20 | self.in_features = in_features 21 | self.out_features = out_features 22 | self.num_hidden_layers = len(hidden_layers) 23 | self.hidden_layers = hidden_layers 24 | self.use_dropout = use_dropout 25 | self.use_batch_norm = use_batch_norm 26 | self.actv_func = actv_func 27 | self.use_softmax = use_softmax 28 | 29 | self.device = device 30 | 31 | # Add on to another model 32 | if pre_module_list: 33 | self.module_list = pre_module_list 34 | else: 35 | self.module_list = nn.ModuleList() 36 | 37 | self.build_() 38 | 39 | # Send to gpu 40 | self.to(self.device) 41 | 42 | def build_(self): 43 | # Activation Functions for Fully connected layers # 44 | # Start with input dimensions 45 | dim = self.in_features 46 | for i in range(self.num_hidden_layers): 47 | # Create a fully connected layer between the last layer 48 | # and the current hidden layer 49 | self.module_list.append(nn.Linear(dim, self.hidden_layers[i])) 50 | # Update the current dimension 51 | dim = self.hidden_layers[i] 52 | 53 | if self.use_batch_norm: 54 | self.module_list.append( nn.BatchNorm1d(dim, affine=True) ) 55 | 56 | # Add the Activation function 57 | self.module_list.append( self.GetActivation(name=self.actv_func[i]) ) 58 | 59 | if self.use_dropout: 60 | self.module_list.append( nn.Dropout(p=0.10) ) 61 | 62 | # Fully connect to output dimensions 63 | if dim != self.out_features: 64 | self.module_list.append( nn.Linear(dim, self.out_features) ) 65 | 66 | 67 | def forward(self, x): 68 | # Flatten the 2d image into 1d 69 | # Also convert into float for FC layer 70 | x = torch.flatten(x.float(), start_dim=1) 71 | 72 | # Apply each layer in the module list 73 | for i in range( len(self.module_list) ): 74 | x = self.module_list[i](x) 75 | 76 | return x 77 | 78 | def GetActivation(self, name="relu"): 79 | if name == "relu": 80 | return nn.ReLU() 81 | elif name == 
"leakyrelu": 82 | return nn.LeakyReLU() 83 | elif name == "Sigmoid": 84 | return nn.Sigmoid() 85 | elif name == "Tanh": 86 | return nn.Tanh() 87 | elif name == "Identity": 88 | return nn.Identity() 89 | else: 90 | return nn.ReLU() -------------------------------------------------------------------------------- /labml_nn/gan/wasserstein/gradient_penalty/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: WGAN-GP experiment with MNIST 4 | summary: This experiment generates MNIST images using convolutional neural network. 5 | --- 6 | 7 | # WGAN-GP experiment with MNIST 8 | """ 9 | 10 | import torch 11 | 12 | from labml import experiment, tracker 13 | # Import configurations from [Wasserstein experiment](../experiment.html) 14 | from labml_nn.gan.wasserstein.experiment import Configs as OriginalConfigs 15 | # 16 | from labml_nn.gan.wasserstein.gradient_penalty import GradientPenalty 17 | 18 | 19 | class Configs(OriginalConfigs): 20 | """ 21 | ## Configuration class 22 | 23 | We extend [original GAN implementation](../../original/experiment.html) and override the discriminator (critic) loss 24 | calculation to include gradient penalty. 25 | """ 26 | 27 | # Gradient penalty coefficient $\lambda$ 28 | gradient_penalty_coefficient: float = 10.0 29 | # 30 | gradient_penalty = GradientPenalty() 31 | 32 | def calc_discriminator_loss(self, data: torch.Tensor): 33 | """ 34 | This overrides the original discriminator loss calculation and 35 | includes gradient penalty. 36 | """ 37 | # Require gradients on $x$ to calculate gradient penalty 38 | data.requires_grad_() 39 | # Sample $z \sim p(z)$ 40 | latent = self.sample_z(data.shape[0]) 41 | # $D(x)$ 42 | f_real = self.discriminator(data) 43 | # $D(G_\theta(z))$ 44 | f_fake = self.discriminator(self.generator(latent).detach()) 45 | # Get discriminator losses 46 | loss_true, loss_false = self.discriminator_loss(f_real, f_fake) 47 | # Calculate gradient penalties in training mode 48 | if self.mode.is_train: 49 | gradient_penalty = self.gradient_penalty(data, f_real) 50 | tracker.add("loss.gp.", gradient_penalty) 51 | loss = loss_true + loss_false + self.gradient_penalty_coefficient * gradient_penalty 52 | # Skip gradient penalty otherwise 53 | else: 54 | loss = loss_true + loss_false 55 | 56 | # Log stuff 57 | tracker.add("loss.discriminator.true.", loss_true) 58 | tracker.add("loss.discriminator.false.", loss_false) 59 | tracker.add("loss.discriminator.", loss) 60 | 61 | return loss 62 | 63 | 64 | def main(): 65 | # Create configs object 66 | conf = Configs() 67 | # Create experiment 68 | experiment.create(name='mnist_wassertein_gp_dcgan') 69 | # Override configurations 70 | experiment.configs(conf, 71 | { 72 | 'discriminator': 'cnn', 73 | 'generator': 'cnn', 74 | 'label_smoothing': 0.01, 75 | 'generator_loss': 'wasserstein', 76 | 'discriminator_loss': 'wasserstein', 77 | 'discriminator_k': 5, 78 | }) 79 | 80 | # Start the experiment and run training loop 81 | with experiment.start(): 82 | conf.run() 83 | 84 | 85 | if __name__ == '__main__': 86 | main() 87 | -------------------------------------------------------------------------------- /labml_nn/gan/wasserstein/gradient_penalty/__init__.py: -------------------------------------------------------------------------------- 1 | r""" 2 | --- 3 | title: Gradient Penalty for Wasserstein GAN (WGAN-GP) 4 | summary: > 5 | An annotated PyTorch implementation/tutorial of 6 | Improved Training of Wasserstein GANs. 
7 | --- 8 | 9 | # Gradient Penalty for Wasserstein GAN (WGAN-GP) 10 | 11 | This is an implementation of 12 | [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028). 13 | 14 | [WGAN](../index.html) suggests clipping weights to enforce Lipschitz constraint 15 | on the discriminator network (critic). 16 | This and other weight constraints like L2 norm clipping, weight normalization, 17 | L1, L2 weight decay have problems: 18 | 19 | 1. Limiting the capacity of the discriminator 20 | 2. Exploding and vanishing gradients (without [Batch Normalization](../../../normalization/batch_norm/index.html)). 21 | 22 | The paper [Improved Training of Wasserstein GANs](https://arxiv.org/abs/1704.00028) 23 | proposal a better way to improve Lipschitz constraint, a gradient penalty. 24 | 25 | $$\mathcal{L}_{GP} = \lambda \underset{\hat{x} \sim \mathbb{P}_{\hat{x}}}{\mathbb{E}} 26 | \Big[ \big(\Vert \nabla_{\hat{x}} D(\hat{x}) \Vert_2 - 1\big)^2 \Big] 27 | $$ 28 | 29 | where $\lambda$ is the penalty weight and 30 | 31 | \begin{align} 32 | x &\sim \mathbb{P}_r \\ 33 | z &\sim p(z) \\ 34 | \epsilon &\sim U[0,1] \\ 35 | \tilde{x} &\leftarrow G_\theta (z) \\ 36 | \hat{x} &\leftarrow \epsilon x + (1 - \epsilon) \tilde{x} 37 | \end{align} 38 | 39 | That is we try to keep the gradient norm $\Vert \nabla_{\hat{x}} D(\hat{x}) \Vert_2$ close to $1$. 40 | 41 | In this implementation we set $\epsilon = 1$. 42 | 43 | Here is the [code for an experiment](experiment.html) that uses gradient penalty. 44 | """ 45 | 46 | import torch 47 | import torch.autograd 48 | 49 | from labml_helpers.module import Module 50 | 51 | 52 | class GradientPenalty(Module): 53 | """ 54 | ## Gradient Penalty 55 | """ 56 | 57 | def __call__(self, x: torch.Tensor, f: torch.Tensor): 58 | """ 59 | * `x` is $x \sim \mathbb{P}_r$ 60 | * `f` is $D(x)$ 61 | 62 | $\hat{x} \leftarrow x$ 63 | since we set $\epsilon = 1$ for this implementation. 64 | """ 65 | 66 | # Get batch size 67 | batch_size = x.shape[0] 68 | 69 | # Calculate gradients of $D(x)$ with respect to $x$. 70 | # `grad_outputs` is set to ones since we want the gradients of $D(x)$, 71 | # and we need to create and retain graph since we have to compute gradients 72 | # with respect to weight on this loss. 73 | gradients, *_ = torch.autograd.grad(outputs=f, 74 | inputs=x, 75 | grad_outputs=f.new_ones(f.shape), 76 | create_graph=True) 77 | 78 | # Reshape gradients to calculate the norm 79 | gradients = gradients.reshape(batch_size, -1) 80 | # Calculate the norm $\Vert \nabla_{\hat{x}} D(\hat{x}) \Vert_2$ 81 | norm = gradients.norm(2, dim=-1) 82 | # Return the loss $\big(\Vert \nabla_{\hat{x}} D(\hat{x}) \Vert_2 - 1\big)^2$ 83 | return torch.mean((norm - 1) ** 2) 84 | -------------------------------------------------------------------------------- /labml_nn/transformers/mlm/readme.md: -------------------------------------------------------------------------------- 1 | # [Masked Language Model (MLM)](https://nn.labml.ai/transformers/mlm/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation of Masked Language Model (MLM) 4 | used to pre-train the BERT model introduced in the paper 5 | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805). 6 | 7 | ## BERT Pretraining 8 | 9 | BERT model is a transformer model. 10 | The paper pre-trains the model using MLM and with next sentence prediction. 11 | We have only implemented MLM here. 
12 | 13 | ### Next sentence prediction 14 | 15 | In *next sentence prediction*, the model is given two sentences `A` and `B` and the model 16 | makes a binary prediction whether `B` is the sentence that follows `A` in the actual text. 17 | The model is fed with actual sentence pairs 50% of the time and random pairs 50% of the time. 18 | This classification is done while applying MLM. *We haven't implemented this here.* 19 | 20 | ## Masked LM 21 | 22 | This masks a percentage of tokens at random and trains the model to predict 23 | the masked tokens. 24 | They **mask 15% of the tokens** by replacing them with a special `[MASK]` token. 25 | 26 | The loss is computed on predicting the masked tokens only. 27 | This causes a problem during fine-tuning and actual usage since there are no `[MASK]` tokens 28 | at that time. 29 | Therefore we might not get any meaningful representations. 30 | 31 | To overcome this **10% of the masked tokens are replaced with the original token**, 32 | and another **10% of the masked tokens are replaced with a random token**. 33 | This trains the model to give representations about the actual token whether or not the 34 | input token at that position is a `[MASK]`. 35 | And replacing with a random token causes it to 36 | give a representation that has information from the context as well; 37 | because it has to use the context to fix randomly replaced tokens. 38 | 39 | ## Training 40 | 41 | MLMs are harder to train than autoregressive models because they have a smaller training signal. 42 | i.e. only a small percentage of predictions are trained per sample. 43 | 44 | Another problem is since the model is bidirectional, any token can see any other token. 45 | This makes the "credit assignment" harder. 46 | Let's say you have the character level model trying to predict `home *s where i want to be`. 47 | At least during the early stages of the training, it'll be super hard to figure out why the 48 | replacement for `*` should be `i`, it could be anything from the whole sentence. 49 | Whilst, in an autoregressive setting the model will only have to use `h` to predict `o` and 50 | `hom` to predict `e` and so on. So the model will initially start predicting with a shorter context first 51 | and then learn to use longer contexts later. 52 | Since MLMs have this problem it's a lot faster to train if you start with a smaller sequence length 53 | initially and then use a longer sequence length later. 54 | 55 | Here is [the training code](https://nn.labml.ai/transformers/mlm/experiment.html) for a simple MLM model. 
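As a minimal sketch of the masking scheme described above (the function name, the `-100` ignore-index convention, and the shapes are illustrative assumptions, not this repository's exact code):

```python
import torch


def mask_tokens(tokens: torch.Tensor, mask_token: int, n_vocab: int,
                masking_prob: float = 0.15):
    """`tokens` is a long tensor of token ids; returns masked inputs and labels."""
    tokens = tokens.clone()
    # Select ~15% of the positions for prediction
    is_selected = torch.rand(tokens.shape) < masking_prob
    # Only selected positions contribute to the loss; mark the rest with -100
    labels = torch.where(is_selected, tokens, torch.full_like(tokens, -100))
    r = torch.rand(tokens.shape)
    # 80% of the selected tokens become `[MASK]`
    tokens[is_selected & (r < 0.8)] = mask_token
    # 10% become a random token; the remaining 10% keep the original token
    replace_random = is_selected & (r >= 0.8) & (r < 0.9)
    random_tokens = torch.randint_like(tokens, n_vocab)
    tokens[replace_random] = random_tokens[replace_random]
    return tokens, labels
```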
56 | 57 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/3a6d22b6c67111ebb03d6764d13a38d1) 58 | -------------------------------------------------------------------------------- /labml_nn/cnn/utils/dataloader.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | 3 | import torch 4 | import torchvision 5 | import torchvision.transforms as transforms 6 | from torch.utils.data import Dataset, random_split 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | 10 | def LoadCifar10DatasetTrain(save, transform=None): 11 | trainset = torchvision.datasets.CIFAR10(root=save, train=True, 12 | download=True, transform=transform) 13 | return trainset 14 | 15 | def LoadCifar10DatasetTest(save, transform): 16 | return torchvision.datasets.CIFAR10(root=save, train=False, 17 | download=False, transform=transform) 18 | 19 | def GetCustTransform(): 20 | transform_train = transforms.Compose([ 21 | transforms.RandomRotation(20), 22 | transforms.RandomCrop(32, (2, 2), pad_if_needed=False, padding_mode='constant'), 23 | transforms.ToTensor(), 24 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 25 | return transform_train 26 | 27 | def Dataloader_train_valid(save, batch_size): 28 | 29 | # See utils/dataloader.py for data augmentations 30 | transform_train_valid = GetCustTransform() 31 | 32 | # Get Cifar 10 Datasets 33 | trainset = LoadCifar10DatasetTrain(save, transform_train_valid) 34 | train_val_abs = int(len(trainset) * 0.8) 35 | train_subset, val_subset = random_split(trainset, [train_val_abs, len(trainset) - train_val_abs]) 36 | 37 | # Get Cifar 10 Dataloaders 38 | trainloader = torch.utils.data.DataLoader(train_subset, batch_size=batch_size, 39 | shuffle=True, num_workers=4) 40 | 41 | valloader = torch.utils.data.DataLoader(val_subset, batch_size=batch_size, 42 | shuffle=True, num_workers=4) 43 | return trainloader, valloader 44 | 45 | def Dataloader_train(save, batch_size): 46 | 47 | # See utils/dataloader.py for data augmentations 48 | transform_train = GetCustTransform() 49 | 50 | # Get Cifar 10 Datasets 51 | trainset = LoadCifar10DatasetTrain(save, transform_train) 52 | # Get Cifar 10 Dataloaders 53 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, 54 | shuffle=True, num_workers=4) 55 | 56 | return trainloader 57 | 58 | def Dataloader_test(save, batch_size): 59 | 60 | # transformation test set 61 | transform_test = transforms.Compose( 62 | [transforms.ToTensor(), 63 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 64 | 65 | # initialize test dataset and dataloader 66 | testset = LoadCifar10DatasetTest(save, transform_test) 67 | testloader = torch.utils.data.DataLoader(testset, batch_size=64, 68 | shuffle=False, num_workers=4) 69 | 70 | return testloader 71 | 72 | def imshow(im): 73 | image = im.cpu().clone().detach().numpy() 74 | image = image.transpose(1, 2, 0) 75 | image = image * np.array((0.5, 0.5, 0.5)) + np.array((0.5, 0.5, 0.5)) # unnormalize 76 | plt.imshow(image) 77 | plt.show() 78 | 79 | def imretrun(im): 80 | image = im.cpu().clone().detach().numpy() 81 | image = image.transpose(1, 2, 0) 82 | image = image * np.array((0.5, 0.5, 0.5)) + np.array((0.5, 0.5, 0.5)) # unnormalize 83 | return image -------------------------------------------------------------------------------- /labml_nn/hypernetworks/experiment.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 
from labml import experiment 4 | from labml.configs import option 5 | from labml.utils.pytorch import get_modules 6 | from labml_helpers.module import Module 7 | 8 | from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs 9 | from labml_nn.hypernetworks.hyper_lstm import HyperLSTM 10 | from labml_nn.lstm import LSTM 11 | 12 | 13 | class AutoregressiveModel(Module): 14 | """ 15 | ## Auto regressive model 16 | """ 17 | 18 | def __init__(self, n_vocab: int, d_model: int, rnn_model: Module): 19 | super().__init__() 20 | # Token embedding module 21 | self.src_embed = nn.Embedding(n_vocab, d_model) 22 | self.lstm = rnn_model 23 | self.generator = nn.Linear(d_model, n_vocab) 24 | 25 | def __call__(self, x: torch.Tensor): 26 | x = self.src_embed(x) 27 | # Embed the tokens (`src`) and run it through the the transformer 28 | res, state = self.lstm(x) 29 | # Generate logits of the next token 30 | return self.generator(res), state 31 | 32 | 33 | class Configs(NLPAutoRegressionConfigs): 34 | """ 35 | ## Configurations 36 | 37 | The default configs can and will be over-ridden when we start the experiment 38 | """ 39 | 40 | model: AutoregressiveModel 41 | rnn_model: Module 42 | 43 | d_model: int = 512 44 | n_rhn: int = 16 45 | n_z: int = 16 46 | 47 | 48 | @option(Configs.model) 49 | def autoregressive_model(c: Configs): 50 | """ 51 | Initialize the auto-regressive model 52 | """ 53 | m = AutoregressiveModel(c.n_tokens, c.d_model, c.rnn_model) 54 | return m.to(c.device) 55 | 56 | 57 | @option(Configs.rnn_model) 58 | def hyper_lstm(c: Configs): 59 | return HyperLSTM(c.d_model, c.d_model, c.n_rhn, c.n_z, 1) 60 | 61 | 62 | @option(Configs.rnn_model) 63 | def lstm(c: Configs): 64 | return LSTM(c.d_model, c.d_model, 1) 65 | 66 | 67 | def main(): 68 | # Create experiment 69 | experiment.create(name="hyper_lstm", comment='') 70 | # Create configs 71 | conf = Configs() 72 | # Load configurations 73 | experiment.configs(conf, 74 | # A dictionary of configurations to override 75 | {'tokenizer': 'character', 76 | 'text': 'tiny_shakespeare', 77 | 'optimizer.learning_rate': 2.5e-4, 78 | 'optimizer.optimizer': 'Adam', 79 | 'prompt': 'It is', 80 | 'prompt_separator': '', 81 | 82 | 'rnn_model': 'hyper_lstm', 83 | 84 | 'train_loader': 'shuffled_train_loader', 85 | 'valid_loader': 'shuffled_valid_loader', 86 | 87 | 'seq_len': 512, 88 | 'epochs': 128, 89 | 'batch_size': 2, 90 | 'inner_iterations': 25}) 91 | 92 | # Set models for saving and loading 93 | experiment.add_pytorch_models(get_modules(conf)) 94 | 95 | # Start the experiment 96 | with experiment.start(): 97 | # `TrainValidConfigs.run` 98 | conf.run() 99 | 100 | 101 | if __name__ == '__main__': 102 | main() 103 | -------------------------------------------------------------------------------- /labml_nn/rl/ppo/gae.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Generalized Advantage Estimation (GAE) 4 | summary: A PyTorch implementation/tutorial of Generalized Advantage Estimation (GAE). 5 | --- 6 | 7 | # Generalized Advantage Estimation (GAE) 8 | 9 | This is a [PyTorch](https://pytorch.org) implementation of paper 10 | [Generalized Advantage Estimation](https://arxiv.org/abs/1506.02438). 11 | 12 | You can find an experiment that uses it [here](experiment.html). 
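A minimal usage sketch (the array shapes are assumptions based on the constructor and `__call__` below; in particular, `values` is assumed to carry one extra column holding the bootstrap value of the state after the last step):

```python
import numpy as np

n_workers, worker_steps = 8, 128
gae = GAE(n_workers, worker_steps, gamma=0.99, lambda_=0.95)

done = np.zeros((n_workers, worker_steps), dtype=bool)
rewards = np.random.randn(n_workers, worker_steps).astype(np.float32)
values = np.random.randn(n_workers, worker_steps + 1).astype(np.float32)

advantages = gae(done, rewards, values)  # shape: (n_workers, worker_steps)
```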
13 | """ 14 | 15 | import numpy as np 16 | 17 | 18 | class GAE: 19 | def __init__(self, n_workers: int, worker_steps: int, gamma: float, lambda_: float): 20 | self.lambda_ = lambda_ 21 | self.gamma = gamma 22 | self.worker_steps = worker_steps 23 | self.n_workers = n_workers 24 | 25 | def __call__(self, done: np.ndarray, rewards: np.ndarray, values: np.ndarray) -> np.ndarray: 26 | """ 27 | ### Calculate advantages 28 | \begin{align} 29 | \hat{A_t^{(1)}} &= r_t + \gamma V(s_{t+1}) - V(s_t) 30 | \\ 31 | \hat{A_t^{(2)}} &= r_t + \gamma r_{t+1} +\gamma^2 V(s_{t+2}) - V(s_t) 32 | \\ 33 | ... 34 | \\ 35 | \hat{A_t^{(\infty)}} &= r_t + \gamma r_{t+1} +\gamma^2 r_{t+2} + ... - V(s_t) 36 | \end{align} 37 | 38 | $\hat{A_t^{(1)}}$ has high bias and low variance, whilst 39 | $\hat{A_t^{(\infty)}}$ is unbiased but has high variance. 40 | 41 | We take a weighted average of $\hat{A_t^{(k)}}$ to balance bias and variance. 42 | This is called Generalized Advantage Estimation. 43 | $$\hat{A_t} = \hat{A_t^{GAE}} = \sum_k w_k \hat{A_t^{(k)}}$$ 44 | We set $w_k = \lambda^{k-1}$, which gives a clean calculation for 45 | $\hat{A_t}$: 46 | 47 | \begin{align} 48 | \delta_t &= r_t + \gamma V(s_{t+1}) - V(s_t) 49 | \\ 50 | \hat{A_t} &= \delta_t + \gamma \lambda \delta_{t+1} + ... + 51 | (\gamma \lambda)^{T - t - 1} \delta_{T - 1} 52 | \\ 53 | &= \delta_t + \gamma \lambda \hat{A_{t+1}} 54 | \end{align} 55 | """ 56 | 57 | # advantages table 58 | advantages = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32) 59 | last_advantage = 0 60 | 61 | # $V(s_{t+1})$ 62 | last_value = values[:, -1] 63 | 64 | for t in reversed(range(self.worker_steps)): 65 | # mask if episode completed after step $t$ 66 | mask = 1.0 - done[:, t] 67 | last_value = last_value * mask 68 | last_advantage = last_advantage * mask 69 | # $\delta_t$ 70 | delta = rewards[:, t] + self.gamma * last_value - values[:, t] 71 | 72 | # $\hat{A_t} = \delta_t + \gamma \lambda \hat{A_{t+1}}$ 73 | last_advantage = delta + self.gamma * self.lambda_ * last_advantage 74 | 75 | # note that we are collecting in reverse order. 76 | # *My initial code was appending to a list and 77 | # I forgot to reverse it later. 78 | # It took me around 4 to 5 hours to find the bug. 79 | # The performance of the model was improving 80 | # slightly during initial runs, 81 | # probably because the samples are similar.* 82 | advantages[:, t] = last_advantage 83 | 84 | last_value = values[:, t] 85 | 86 | return advantages 87 | -------------------------------------------------------------------------------- /labml_nn/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Transformers 4 | summary: > 5 | This is a collection of PyTorch implementations/tutorials of 6 | transformers and related techniques. 7 | --- 8 | 9 | # Transformers 10 | 11 | This module contains [PyTorch](https://pytorch.org/) 12 | implementations and explanations of the original transformer 13 | from the paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762), 14 | and derivatives and enhancements of it. 
15 | 16 | * [Multi-head attention](mha.html) 17 | * [Transformer Encoder and Decoder Models](models.html) 18 | * [Fixed positional encoding](positional_encoding.html) 19 | 20 | ## [Transformer XL](xl/index.html) 21 | This implements Transformer XL model using 22 | [relative multi-head attention](xl/relative_mha.html) 23 | 24 | ## [Compressive Transformer](compressive/index.html) 25 | 26 | This is an implementation of compressive transformer 27 | that extends upon [Transformer XL](xl/index.html) by compressing 28 | oldest memories to give a longer attention span. 29 | 30 | ## [GPT Architecture](gpt/index.html) 31 | 32 | This is an implementation of GPT-2 architecture. 33 | 34 | ## [GLU Variants](glu_variants/simple.html) 35 | 36 | This is an implementation of the paper 37 | [GLU Variants Improve Transformer](https://arxiv.org/abs/2002.05202). 38 | 39 | ## [kNN-LM](knn/index.html) 40 | 41 | This is an implementation of the paper 42 | [Generalization through Memorization: Nearest Neighbor Language Models](https://arxiv.org/abs/1911.00172). 43 | 44 | ## [Feedback Transformer](feedback/index.html) 45 | 46 | This is an implementation of the paper 47 | [Accessing Higher-level Representations in Sequential Transformers with Feedback Memory](https://arxiv.org/abs/2002.09402). 48 | 49 | ## [Switch Transformer](switch/index.html) 50 | 51 | This is a miniature implementation of the paper 52 | [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961). 53 | Our implementation only has a few million parameters and doesn't do model parallel distributed training. 54 | It does single GPU training but we implement the concept of switching as described in the paper. 55 | 56 | ## [Fast Weights Transformer](fast_weights/index.html) 57 | 58 | This is an implementation of the paper 59 | [Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch](https://arxiv.org/abs/2102.11174). 60 | 61 | ## [FNet: Mixing Tokens with Fourier Transforms](fnet/index.html) 62 | 63 | This is an implementation of the paper 64 | [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824). 65 | 66 | ## [Attention Free Transformer](aft/index.html) 67 | 68 | This is an implementation of the paper 69 | [An Attention Free Transformer](https://papers.labml.ai/paper/2105.14103). 70 | 71 | ## [Masked Language Model](mlm/index.html) 72 | 73 | This is an implementation of Masked Language Model used for pre-training in paper 74 | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805). 75 | 76 | ## [Pay Attention to MLPs (gMLP)](gmlp/index.html) 77 | 78 | This is an implementation of the paper 79 | [Pay Attention to MLPs](https://papers.labml.ai/paper/2105.08050). 80 | """ 81 | 82 | from .configs import TransformerConfigs 83 | from .models import TransformerLayer, Encoder, Decoder, Generator, EncoderDecoder 84 | from .mha import MultiHeadAttention 85 | from labml_nn.transformers.xl.relative_mha import RelativeMultiHeadAttention 86 | -------------------------------------------------------------------------------- /labml_nn/optimizers/noam.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Noam optimizer from Attention is All You Need paper 4 | summary: > 5 | This is a tutorial/implementation of Noam optimizer. 6 | Noam optimizer has a warm-up period and then an exponentially decaying learning rate. 
7 | --- 8 | 9 | # Noam Optimizer 10 | 11 | This is the [PyTorch](https://pytorch.org) implementation of optimizer introduced in the paper 12 | [Attention Is All You Need](https://arxiv.org/abs/1706.03762). 13 | """ 14 | from typing import Dict 15 | 16 | from labml_nn.optimizers import WeightDecay 17 | from labml_nn.optimizers.amsgrad import AMSGrad 18 | 19 | 20 | class Noam(AMSGrad): 21 | """ 22 | ## Noam Optimizer 23 | 24 | This class extends from Adam optimizer defined in [`adam.py`](adam.html). 25 | """ 26 | 27 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, 28 | weight_decay: WeightDecay = WeightDecay(), 29 | optimized_update: bool = True, 30 | amsgrad=False, 31 | warmup=0, d_model=512, defaults=None): 32 | """ 33 | ### Initialize the optimizer 34 | 35 | * `params` is the list of parameters 36 | * `lr` is the learning rate $\alpha$ 37 | * `betas` is a tuple of ($\beta_1$, $\beta_2$) 38 | * `eps` is $\hat{\epsilon}$ or $\epsilon$ based on `optimized_update` 39 | * `weight_decay` is an instance of class `WeightDecay` defined in [`__init__.py`](index.html) 40 | * 'optimized_update' is a flag whether to optimize the bias correction of the second moment 41 | by doing it after adding $\epsilon$ 42 | * `amsgrad` is a flag indicating whether to use AMSGrad or fallback to plain Adam 43 | * `warmup` number of warmup steps 44 | * `d_model` model size; i.e. number of dimensions in the transformer 45 | * `defaults` is a dictionary of default for group values. 46 | This is useful when you want to extend the class `AdamWarmup`. 47 | """ 48 | 49 | defaults = {} if defaults is None else defaults 50 | defaults.update(dict(warmup=warmup)) 51 | super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults) 52 | self.d_model = d_model 53 | 54 | def get_lr(self, state: Dict[str, any], group: Dict[str, any]): 55 | """ 56 | ### Get learning-rate 57 | 58 | $$\alpha \frac{1}{\sqrt{d_{model}}} \min \bigg(\frac{1}{\sqrt{t}}, \frac{t}{w^{3/2}}\bigg)$$ 59 | where $w$ is the number of warmup steps. 60 | """ 61 | # $$\min \bigg(\frac{1}{\sqrt{t}}, \frac{t}{w^{3/2}}\bigg)$$ 62 | factor = min(state['step'] ** (-0.5), state['step'] * group['warmup'] ** (-1.5)) 63 | # $$\alpha \frac{1}{\sqrt{d_{model}}} \min \bigg(\frac{1}{\sqrt{t}}, \frac{t}{w^{3/2}}\bigg)$$ 64 | return group['lr'] * self.d_model ** (-0.5) * factor 65 | 66 | 67 | def _test_noam_lr(): 68 | """ 69 | ### Plot learning rate for different warmups and model sizes 70 | 71 | ![Plot of learning rate](noam_lr.png) 72 | """ 73 | import matplotlib.pyplot as plt 74 | import numpy as np 75 | from torch import nn 76 | 77 | model = nn.Linear(10, 10) 78 | opts = [Noam(model.parameters(), d_model=512, warmup=4000, lr=1), 79 | Noam(model.parameters(), d_model=512, warmup=8000, lr=1), 80 | Noam(model.parameters(), d_model=2048, warmup=2000, lr=1)] 81 | plt.plot(np.arange(1, 20000), [[opt.get_lr({'step': i}, opt.defaults) for opt in opts] for i in range(1, 20000)]) 82 | plt.legend(["512:4000", "512:8000", "2048:2000"]) 83 | plt.title("Learning Rate") 84 | plt.show() 85 | 86 | 87 | if __name__ == '__main__': 88 | _test_noam_lr() 89 | -------------------------------------------------------------------------------- /labml_nn/rl/dqn/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Deep Q Network (DQN) Model 4 | summary: Implementation of neural network model for Deep Q Network (DQN). 
5 | --- 6 | 7 | # Deep Q Network (DQN) Model 8 | """ 9 | 10 | import torch 11 | from torch import nn 12 | 13 | from labml_helpers.module import Module 14 | 15 | 16 | class Model(Module): 17 | """ 18 | ## Dueling Network ⚔️ Model for $Q$ Values 19 | 20 | We are using a [dueling network](https://arxiv.org/abs/1511.06581) 21 | to calculate Q-values. 22 | Intuition behind dueling network architecture is that in most states 23 | the action doesn't matter, 24 | and in some states the action is significant. Dueling network allows 25 | this to be represented very well. 26 | 27 | \begin{align} 28 | Q^\pi(s,a) &= V^\pi(s) + A^\pi(s, a) 29 | \\ 30 | \mathop{\mathbb{E}}_{a \sim \pi(s)} 31 | \Big[ 32 | A^\pi(s, a) 33 | \Big] 34 | &= 0 35 | \end{align} 36 | 37 | So we create two networks for $V$ and $A$ and get $Q$ from them. 38 | $$ 39 | Q(s, a) = V(s) + 40 | \Big( 41 | A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a' \in \mathcal{A}} A(s, a') 42 | \Big) 43 | $$ 44 | We share the initial layers of the $V$ and $A$ networks. 45 | """ 46 | 47 | def __init__(self): 48 | super().__init__() 49 | self.conv = nn.Sequential( 50 | # The first convolution layer takes a 51 | # $84\times84$ frame and produces a $20\times20$ frame 52 | nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4), 53 | nn.ReLU(), 54 | 55 | # The second convolution layer takes a 56 | # $20\times20$ frame and produces a $9\times9$ frame 57 | nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2), 58 | nn.ReLU(), 59 | 60 | # The third convolution layer takes a 61 | # $9\times9$ frame and produces a $7\times7$ frame 62 | nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1), 63 | nn.ReLU(), 64 | ) 65 | 66 | # A fully connected layer takes the flattened 67 | # frame from third convolution layer, and outputs 68 | # $512$ features 69 | self.lin = nn.Linear(in_features=7 * 7 * 64, out_features=512) 70 | self.activation = nn.ReLU() 71 | 72 | # This head gives the state value $V$ 73 | self.state_value = nn.Sequential( 74 | nn.Linear(in_features=512, out_features=256), 75 | nn.ReLU(), 76 | nn.Linear(in_features=256, out_features=1), 77 | ) 78 | # This head gives the action value $A$ 79 | self.action_value = nn.Sequential( 80 | nn.Linear(in_features=512, out_features=256), 81 | nn.ReLU(), 82 | nn.Linear(in_features=256, out_features=4), 83 | ) 84 | 85 | def __call__(self, obs: torch.Tensor): 86 | # Convolution 87 | h = self.conv(obs) 88 | # Reshape for linear layers 89 | h = h.reshape((-1, 7 * 7 * 64)) 90 | 91 | # Linear layer 92 | h = self.activation(self.lin(h)) 93 | 94 | # $A$ 95 | action_value = self.action_value(h) 96 | # $V$ 97 | state_value = self.state_value(h) 98 | 99 | # $A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a' \in \mathcal{A}} A(s, a')$ 100 | action_score_centered = action_value - action_value.mean(dim=-1, keepdim=True) 101 | # $Q(s, a) =V(s) + \Big(A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a' \in \mathcal{A}} A(s, a')\Big)$ 102 | q = state_value + action_score_centered 103 | 104 | return q 105 | -------------------------------------------------------------------------------- /labml_nn/transformers/fnet/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: FNet - Mixing Tokens with Fourier Transforms 4 | summary: > 5 | This is an annotated implementation/tutorial the FNet in PyTorch. 
6 | --- 7 | 8 | # FNet: Mixing Tokens with Fourier Transforms 9 | 10 | This is a [PyTorch](https://pytorch.org) implementation of the paper 11 | [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824). 12 | 13 | This paper replaces the [self-attention layer](../mha.html) with two 14 | [Fourier transforms](https://en.wikipedia.org/wiki/Discrete_Fourier_transform) to 15 | *mix* tokens. 16 | This is a $7 \times$ more efficient than self-attention. 17 | The accuracy loss of using this over self-attention is about 92% for 18 | [BERT](https://paperswithcode.com/method/bert) on 19 | [GLUE benchmark](https://paperswithcode.com/dataset/glue). 20 | 21 | ## Mixing tokens with two Fourier transforms 22 | 23 | We apply Fourier transform along the hidden dimension (embedding dimension) 24 | and then along the sequence dimension. 25 | 26 | $$ 27 | \mathcal{R}\big(\mathcal{F}_\text{seq} \big(\mathcal{F}_\text{hidden} (x) \big) \big) 28 | $$ 29 | 30 | where $x$ is the embedding input, $\mathcal{F}$ stands for the fourier transform and 31 | $\mathcal{R}$ stands for the real component in complex numbers. 32 | 33 | This is very simple to implement on PyTorch - just 1 line of code. 34 | The paper suggests using a precomputed DFT matrix and doing matrix multiplication to get the 35 | Fourier transformation. 36 | 37 | Here is [the training code](experiment.html) for using a FNet based model for classifying 38 | [AG News](https://paperswithcode.com/dataset/ag-news). 39 | """ 40 | 41 | from typing import Optional 42 | 43 | import torch 44 | from torch import nn 45 | 46 | 47 | class FNetMix(nn.Module): 48 | """ 49 | ## FNet - Mix tokens 50 | 51 | This module simply implements 52 | $$ 53 | \mathcal{R}\big(\mathcal{F}_\text{seq} \big(\mathcal{F}_\text{hidden} (x) \big) \big) 54 | $$ 55 | 56 | The structure of this module is made similar to a [standard attention module](../mha.html) so that we can simply 57 | replace it. 58 | """ 59 | 60 | def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: Optional[torch.Tensor] = None): 61 | """ 62 | The [normal attention module](../mha.html) can be fed with different token embeddings for 63 | $\text{query}$,$\text{key}$, and $\text{value}$ and a mask. 64 | 65 | We follow the same function signature so that we can replace it directly. 66 | 67 | For FNet mixing, $$x = \text{query} = \text{key} = \text{value}$$ and masking is not possible. 68 | Shape of `query` (and `key` and `value`) is `[seq_len, batch_size, d_model]`. 69 | """ 70 | 71 | # $\text{query}$,$\text{key}$, and $\text{value}$ all should be equal to $x$ for token mixing 72 | assert query is key and key is value 73 | # Token mixing doesn't support masking. i.e. all tokens will see all other token embeddings. 74 | assert mask is None 75 | 76 | # Assign to `x` for clarity 77 | x = query 78 | 79 | # Apply the Fourier transform along the hidden (embedding) dimension 80 | # $$\mathcal{F}_\text{hidden} (x)$$ 81 | # 82 | # The output of the Fourier transform is a tensor of 83 | # [complex numbers](https://pytorch.org/docs/stable/complex_numbers.html). 
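        # (Aside: under the stated assumption that `x` is `[seq_len, batch_size, d_model]`,
        # the two transforms below could equivalently be fused into a single call,
        # `torch.fft.fft2(x, dim=(0, 2)).real`; this is only a note, the code is unchanged.)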
84 | fft_hidden = torch.fft.fft(x, dim=2) 85 | # Apply the Fourier transform along the sequence dimension 86 | # $$\mathcal{F}_\text{seq} \big(\mathcal{F}_\text{hidden} (x) \big)$$ 87 | fft_seq = torch.fft.fft(fft_hidden, dim=0) 88 | 89 | # Get the real component 90 | # $$\mathcal{R}\big(\mathcal{F}_\text{seq} \big(\mathcal{F}_\text{hidden} (x) \big) \big)$$ 91 | return torch.real(fft_seq) 92 | -------------------------------------------------------------------------------- /labml_nn/experiments/mnist.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: MNIST Experiment 4 | summary: > 5 | This is a reusable trainer for MNIST dataset 6 | --- 7 | 8 | # MNIST Experiment 9 | """ 10 | 11 | import torch.nn as nn 12 | import torch.utils.data 13 | from labml_helpers.module import Module 14 | 15 | from labml import tracker 16 | from labml.configs import option 17 | from labml_helpers.datasets.mnist import MNISTConfigs as MNISTDatasetConfigs 18 | from labml_helpers.device import DeviceConfigs 19 | from labml_helpers.metrics.accuracy import Accuracy 20 | from labml_helpers.train_valid import TrainValidConfigs, BatchIndex, hook_model_outputs 21 | from labml_nn.optimizers.configs import OptimizerConfigs 22 | 23 | 24 | class MNISTConfigs(MNISTDatasetConfigs, TrainValidConfigs): 25 | """ 26 | 27 | ## Trainer configurations 28 | 29 | """ 30 | 31 | # Optimizer 32 | optimizer: torch.optim.Adam 33 | # Training device 34 | device: torch.device = DeviceConfigs() 35 | 36 | # Classification model 37 | model: Module 38 | # Number of epochs to train for 39 | epochs: int = 10 40 | 41 | # Number of times to switch between training and validation within an epoch 42 | inner_iterations = 10 43 | 44 | # Accuracy function 45 | accuracy = Accuracy() 46 | # Loss function 47 | loss_func = nn.CrossEntropyLoss() 48 | 49 | def init(self): 50 | """ 51 | ### Initialization 52 | """ 53 | # Set tracker configurations 54 | tracker.set_scalar("loss.*", True) 55 | tracker.set_scalar("accuracy.*", True) 56 | # Add a hook to log module outputs 57 | hook_model_outputs(self.mode, self.model, 'model') 58 | # Add accuracy as a state module. 59 | # The name is probably confusing, since it's meant to store 60 | # states between training and validation for RNNs. 61 | # This will keep the accuracy metric stats separate for training and validation. 62 | self.state_modules = [self.accuracy] 63 | 64 | def step(self, batch: any, batch_idx: BatchIndex): 65 | """ 66 | ### Training or validation step 67 | """ 68 | 69 | # Move data to the device 70 | data, target = batch[0].to(self.device), batch[1].to(self.device) 71 | 72 | # Update global step (number of samples processed) when in training mode 73 | if self.mode.is_train: 74 | tracker.add_global_step(len(data)) 75 | 76 | # Whether to capture model outputs 77 | with self.mode.update(is_log_activations=batch_idx.is_last): 78 | # Get model outputs. 
79 | output = self.model(data) 80 | 81 | # Calculate and log loss 82 | loss = self.loss_func(output, target) 83 | tracker.add("loss.", loss) 84 | 85 | # Calculate and log accuracy 86 | self.accuracy(output, target) 87 | self.accuracy.track() 88 | 89 | # Train the model 90 | if self.mode.is_train: 91 | # Calculate gradients 92 | loss.backward() 93 | # Take optimizer step 94 | self.optimizer.step() 95 | # Log the model parameters and gradients on last batch of every epoch 96 | if batch_idx.is_last: 97 | tracker.add('model', self.model) 98 | # Clear the gradients 99 | self.optimizer.zero_grad() 100 | 101 | # Save the tracked metrics 102 | tracker.save() 103 | 104 | 105 | @option(MNISTConfigs.optimizer) 106 | def _optimizer(c: MNISTConfigs): 107 | """ 108 | ### Default optimizer configurations 109 | """ 110 | opt_conf = OptimizerConfigs() 111 | opt_conf.parameters = c.model.parameters() 112 | opt_conf.optimizer = 'Adam' 113 | return opt_conf 114 | -------------------------------------------------------------------------------- /labml_nn/transformers/gmlp/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Pay Attention to MLPs (gMLP) Experiment 4 | summary: This experiment trains a gMLP based model on Tiny Shakespeare dataset. 5 | --- 6 | 7 | # [Pay Attention to MLPs (gMLP)](index.html) Experiment 8 | 9 | This is an annotated PyTorch experiment to train a [gMLP model](index.html). 10 | The paper also applies a Stochastic Depth regularization where some layers are removed randomly during training. 11 | We have not implemented that here. 12 | 13 | This is based on 14 | [training loop and configurations for a simple transformer auto-regressive NLP task](../basic/autoregressive_experiment.html). 15 | 16 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/01bd941ac74c11eb890c1d9196651a4a) 17 | """ 18 | from labml import experiment 19 | from labml.configs import option 20 | from labml_nn.transformers import TransformerConfigs 21 | from labml_nn.transformers.basic.autoregressive_experiment import Configs as BasicAutoRegressionConfigs 22 | from labml_nn.transformers.gmlp import GMLPBlock 23 | 24 | 25 | class Configs(BasicAutoRegressionConfigs): 26 | """ 27 | ## Configurations 28 | 29 | This inherits from 30 | [training loop and configurations for a simple transformer auto-regressive NLP task](../basic/autoregressive_transformer.html). 
31 | """ 32 | 33 | # Transformer 34 | transformer: TransformerConfigs = 'gMLP' 35 | # gMLP Block 36 | gmlp: GMLPBlock 37 | # `d_ffn` for gMLP projection layer 38 | d_ffn: int = 2048 39 | 40 | 41 | @option(Configs.gmlp, 'gMLP') 42 | def _gmlp_configs(c: Configs): 43 | """ 44 | ### Create a gMLP block 45 | """ 46 | return GMLPBlock(c.d_model, c.d_ffn, c.seq_len) 47 | 48 | 49 | @option(Configs.transformer, 'gMLP') 50 | def _transformer_configs(c: Configs): 51 | """ 52 | ### Transformer configurations 53 | """ 54 | 55 | # We use our 56 | # [configurable transformer implementation](../configs.html#TransformerConfigs) 57 | conf = TransformerConfigs() 58 | # Set the vocabulary sizes for embeddings and generating logits 59 | conf.n_src_vocab = c.n_tokens 60 | conf.n_tgt_vocab = c.n_tokens 61 | # Set model size 62 | conf.d_model = c.d_model 63 | # Replace the encoder layer with a gMLP layer 64 | conf.encoder_layer = c.gmlp 65 | 66 | return conf 67 | 68 | 69 | def main(): 70 | # Create experiment 71 | experiment.create(name="gMLP") 72 | # Create configs 73 | conf = Configs() 74 | # Override configurations 75 | experiment.configs(conf, { 76 | # Use character level tokenizer 77 | 'tokenizer': 'character', 78 | # Prompt separator is blank 79 | 'prompt_separator': '', 80 | # Starting prompt for sampling 81 | 'prompt': 'It is ', 82 | # Use Tiny Shakespeare dataset 83 | 'text': 'tiny_shakespeare', 84 | 85 | # Use a context size of $256$ 86 | 'seq_len': 256, 87 | # Train for $128$ epochs 88 | 'epochs': 128, 89 | # Batch size $32$ 90 | 'batch_size': 32, 91 | # Switch between training and validation for $10$ times 92 | # per epoch 93 | 'inner_iterations': 10, 94 | 95 | # Model size 96 | 'd_model': 512, 97 | 'd_ffn': 2048, 98 | 99 | # Use [Noam optimizer](../../optimizers/noam.html) 100 | 'optimizer.optimizer': 'Noam', 101 | 'optimizer.learning_rate': 1., 102 | }) 103 | 104 | # Set models for saving and loading 105 | experiment.add_pytorch_models({'model': conf.model}) 106 | 107 | # Start the experiment 108 | with experiment.start(): 109 | # Run training 110 | conf.run() 111 | 112 | 113 | # 114 | if __name__ == '__main__': 115 | main() 116 | -------------------------------------------------------------------------------- /labml_nn/transformers/feed_forward.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Position-wise Feed-Forward Network (FFN) 4 | summary: Documented reusable implementation of the position wise feedforward network. 5 | --- 6 | 7 | # Position-wise Feed-Forward Network (FFN) 8 | 9 | This is a [PyTorch](https://pytorch.org) implementation 10 | of position-wise feedforward network used in transformer. 11 | 12 | FFN consists of two fully connected layers. 13 | Number of dimensions in the hidden layer $d_{ff}$, is generally set to around 14 | four times that of the token embedding $d_{model}$. 15 | So it is sometime also called the expand-and-contract network. 16 | 17 | There is an activation at the hidden layer, which is 18 | usually set to ReLU (Rectified Linear Unit) activation, $$\max(0, x)$$ 19 | 20 | That is, the FFN function is, 21 | $$FFN(x, W_1, W_2, b_1, b_2) = \max(0, x W_1 + b_1) W_2 + b_2$$ 22 | where $W_1$, $W_2$, $b_1$ and $b_2$ are learnable parameters. 23 | 24 | Sometimes the 25 | GELU (Gaussian Error Linear Unit) activation is also used instead of ReLU. 
26 | $$x \Phi(x)$$ where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$ 27 | 28 | ### Gated Linear Units 29 | 30 | This is a generic implementation that supports different variants including 31 | [Gated Linear Units](https://arxiv.org/abs/2002.05202) (GLU). 32 | We have also implemented experiments on these: 33 | 34 | * [experiment that uses `labml.configs`](glu_variants/experiment.html) 35 | * [simpler version from scratch](glu_variants/simple.html) 36 | """ 37 | 38 | import torch 39 | from torch import nn as nn 40 | 41 | from labml_helpers.module import Module 42 | 43 | 44 | class FeedForward(Module): 45 | """ 46 | ## FFN module 47 | """ 48 | 49 | def __init__(self, d_model: int, d_ff: int, 50 | dropout: float = 0.1, 51 | activation=nn.ReLU(), 52 | is_gated: bool = False, 53 | bias1: bool = True, 54 | bias2: bool = True, 55 | bias_gate: bool = True): 56 | """ 57 | * `d_model` is the number of features in a token embedding 58 | * `d_ff` is the number of features in the hidden layer of the FFN 59 | * `dropout` is dropout probability for the hidden layer 60 | * `is_gated` specifies whether the hidden layer is gated 61 | * `bias1` specified whether the first fully connected layer should have a learnable bias 62 | * `bias2` specified whether the second fully connected layer should have a learnable bias 63 | * `bias_gate` specified whether the fully connected layer for the gate should have a learnable bias 64 | """ 65 | super().__init__() 66 | # Layer one parameterized by weight $W_1$ and bias $b_1$ 67 | self.layer1 = nn.Linear(d_model, d_ff, bias=bias1) 68 | # Layer one parameterized by weight $W_1$ and bias $b_1$ 69 | self.layer2 = nn.Linear(d_ff, d_model, bias=bias2) 70 | # Hidden layer dropout 71 | self.dropout = nn.Dropout(dropout) 72 | # Activation function $f$ 73 | self.activation = activation 74 | # Whether there is a gate 75 | self.is_gated = is_gated 76 | if is_gated: 77 | # If there is a gate the linear layer to transform inputs to 78 | # be multiplied by the gate, parameterized by weight $V$ and bias $c$ 79 | self.linear_v = nn.Linear(d_model, d_ff, bias=bias_gate) 80 | 81 | def forward(self, x: torch.Tensor): 82 | # $f(x W_1 + b_1)$ 83 | g = self.activation(self.layer1(x)) 84 | # If gated, $f(x W_1 + b_1) \otimes (x V + b) $ 85 | if self.is_gated: 86 | x = g * self.linear_v(x) 87 | # Otherwise 88 | else: 89 | x = g 90 | # Apply dropout 91 | x = self.dropout(x) 92 | # $(f(x W_1 + b_1) \otimes (x V + b)) W_2 + b_2$ or $f(x W_1 + b_1) W_2 + b_2$ 93 | # depending on whether it is gated 94 | return self.layer2(x) 95 | -------------------------------------------------------------------------------- /labml_nn/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | # [labml.ai Annotated PyTorch Paper Implementations](index.html) 3 | 4 | This is a collection of simple PyTorch implementations of 5 | neural networks and related algorithms. 6 | [These implementations](https://github.com/lab-ml/nn) are documented with explanations, 7 | and the [website](index.html) 8 | renders these as side-by-side formatted notes. 9 | We believe these would help you understand these algorithms better. 10 | 11 | We are actively maintaining this repo and adding new 12 | implementations. 
13 | 14 | ## Modules 15 | 16 | #### ✨ [Transformers](transformers/index.html) 17 | 18 | * [Multi-headed attention](transformers/mha.html) 19 | * [Transformer building blocks](transformers/models.html) 20 | * [Transformer XL](transformers/xl/index.html) 21 | * [Relative multi-headed attention](transformers/xl/relative_mha.html) 22 | * [Compressive Transformer](transformers/compressive/index.html) 23 | * [GPT Architecture](transformers/gpt/index.html) 24 | * [GLU Variants](transformers/glu_variants/simple.html) 25 | * [kNN-LM: Generalization through Memorization](transformers/knn/index.html) 26 | * [Feedback Transformer](transformers/feedback/index.html) 27 | * [Switch Transformer](transformers/switch/index.html) 28 | * [Fast Weights Transformer](transformers/fast_weights/index.html) 29 | * [FNet](transformers/fnet/index.html) 30 | * [Attention Free Transformer](transformers/aft/index.html) 31 | * [Masked Language Model](transformers/mlm/index.html) 32 | * [Pay Attention to MLPs (gMLP)](transformers/gmlp/index.html) 33 | 34 | #### ✨ [Recurrent Highway Networks](recurrent_highway_networks/index.html) 35 | 36 | #### ✨ [LSTM](lstm/index.html) 37 | 38 | #### ✨ [HyperNetworks - HyperLSTM](hypernetworks/hyper_lstm.html) 39 | 40 | #### ✨ [Capsule Networks](capsule_networks/index.html) 41 | 42 | #### ✨ [Generative Adversarial Networks](gan/index.html) 43 | * [Original GAN](gan/original/index.html) 44 | * [GAN with deep convolutional network](gan/dcgan/index.html) 45 | * [Cycle GAN](gan/cycle_gan/index.html) 46 | * [Wasserstein GAN](gan/wasserstein/index.html) 47 | * [Wasserstein GAN with Gradient Penalty](gan/wasserstein/gradient_penalty/index.html) 48 | * [Style GAN 2](gan/stylegan/index.html) 49 | 50 | #### ✨ [Sketch RNN](sketch_rnn/index.html) 51 | 52 | #### ✨ [Reinforcement Learning](rl/index.html) 53 | * [Proximal Policy Optimization](rl/ppo/index.html) with 54 | [Generalized Advantage Estimation](rl/ppo/gae.html) 55 | * [Deep Q Networks](rl/dqn/index.html) with 56 | with [Dueling Network](rl/dqn/model.html), 57 | [Prioritized Replay](rl/dqn/replay_buffer.html) 58 | and Double Q Network. 59 | 60 | #### ✨ [Optimizers](optimizers/index.html) 61 | * [Adam](optimizers/adam.html) 62 | * [AMSGrad](optimizers/amsgrad.html) 63 | * [Adam Optimizer with warmup](optimizers/adam_warmup.html) 64 | * [Noam Optimizer](optimizers/noam.html) 65 | * [Rectified Adam Optimizer](optimizers/radam.html) 66 | * [AdaBelief Optimizer](optimizers/ada_belief.html) 67 | 68 | #### ✨ [Normalization Layers](https://nn.labml.ai/normalization/index.html) 69 | * [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html) 70 | * [Layer Normalization](https://nn.labml.ai/normalization/layer_norm/index.html) 71 | * [Instance Normalization](https://nn.labml.ai/normalization/instance_norm/index.html) 72 | * [Group Normalization](https://nn.labml.ai/normalization/group_norm/index.html) 73 | * [Weight Standardization](https://nn.labml.ai/normalization/weight_standardization/index.html) 74 | * [Batch-Channel Normalization](https://nn.labml.ai/normalization/batch_channel_norm/index.html) 75 | 76 | ### Installation 77 | 78 | ```bash 79 | pip install labml-nn 80 | ``` 81 | 82 | ### Citing LabML 83 | 84 | If you use LabML for academic research, please cite the library using the following BibTeX entry. 
85 | 86 | ```bibtex 87 | @misc{labml, 88 | author = {Varuna Jayasiri, Nipun Wijerathne}, 89 | title = {LabML: A library to organize machine learning experiments}, 90 | year = {2020}, 91 | url = {https://nn.labml.ai/}, 92 | } 93 | ``` 94 | """ 95 | -------------------------------------------------------------------------------- /labml_nn/optimizers/adam_warmup_cosine_decay.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Adam optimizer with warm-up and cosine decay 4 | summary: A PyTorch implementation/tutorial of Adam optimizer with warm-up and cosine decay for GPT. 5 | --- 6 | 7 | # Adam Optimizer with Warmup and Cosine Decay 8 | 9 | This extends [AMSGrad optimizer](adam.html) and adds a warmup stage. 10 | """ 11 | import math 12 | from typing import Dict 13 | 14 | from labml_nn.optimizers import WeightDecay 15 | from labml_nn.optimizers.amsgrad import AMSGrad 16 | 17 | 18 | class AdamWarmupCosineDecay(AMSGrad): 19 | """ 20 | 21 | ## Adam Optimizer with Warmup and Cosine Decay 22 | 23 | 24 | This class extends from AMSGrad optimizer defined in [`amsgrad.py`](amsgrad.html). 25 | """ 26 | 27 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, 28 | weight_decay: WeightDecay = WeightDecay(), 29 | optimized_update: bool = True, 30 | amsgrad=False, warmup=0, total_steps=1e10, defaults=None): 31 | """ 32 | ### Initialize the optimizer 33 | 34 | * `params` is the list of parameters 35 | * `lr` is the learning rate $\alpha$ 36 | * `betas` is a tuple of ($\beta_1$, $\beta_2$) 37 | * `eps` is $\hat{\epsilon}$ or $\epsilon$ based on `optimized_update` 38 | * `weight_decay` is an instance of class `WeightDecay` defined in [`__init__.py`](index.html) 39 | * 'optimized_update' is a flag whether to optimize the bias correction of the second moment 40 | by doing it after adding $\epsilon$ 41 | * `amsgrad` is a flag indicating whether to use AMSGrad or fallback to plain Adam 42 | * `warmup` number of warmup steps 43 | * `total_steps` total number of steps. Cosine decay reaches 0 at this, 44 | but stays at 10% of `lr` because we take $\alpha * \max(0.1, decay)$ 45 | * `defaults` is a dictionary of default for group values. 46 | This is useful when you want to extend the class `AdamWarmup`. 47 | """ 48 | 49 | defaults = {} if defaults is None else defaults 50 | defaults.update(dict(warmup=warmup, total_steps=total_steps)) 51 | super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults) 52 | 53 | def get_lr(self, state: Dict[str, any], group: Dict[str, any]): 54 | """ 55 | ### Get learning-rate 56 | 57 | $$\alpha \min \bigg(1, \frac{t}{w}\bigg)$$ 58 | where $w$ is the number of warmup steps. 
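        After the warmup stage, the code below decays the learning rate with a cosine
        schedule that is floored at $0.1 \alpha$. A sketch of that part of the schedule,
        writing $T$ for `total_steps` (this mirrors the `progress` computation in the code
        rather than coming from a separate reference), is

        $$\alpha \max\Bigg(0.1, \frac{1}{2}\bigg(1 + \cos\Big(\pi \frac{t - w}{T - w}\Big)\bigg)\Bigg)$$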
59 | """ 60 | # If we are in warmup stage 61 | if group['warmup'] > state['step']: 62 | # A linearly increasing learning rate from $0$ to $\alpha$ 63 | return 1e-8 + state['step'] * group['lr'] / group['warmup'] 64 | else: 65 | # Constant learning rate $\alpha$ 66 | progress = (state['step'] - group['warmup']) / max(1, group['total_steps'] - group['warmup']) 67 | return group['lr'] * max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress))) 68 | 69 | 70 | def _test_lr(): 71 | """ 72 | ### Plot learning rate for different warmups and model sizes 73 | 74 | ![Plot of learning rate](noam_lr.png) 75 | """ 76 | import matplotlib.pyplot as plt 77 | import numpy as np 78 | from torch import nn 79 | 80 | model = nn.Linear(10, 10) 81 | opt = AdamWarmupCosineDecay(model.parameters(), warmup=5000, lr=1e-4, total_steps=4e6) 82 | steps = 20_000 83 | plt.plot(np.arange(1, steps), [opt.get_lr({'step': i}, opt.defaults) for i in range(1, steps)]) 84 | plt.legend(["5000:4e6", "5000:2e6", "5000:1e6"]) 85 | plt.title("Learning Rate") 86 | plt.show() 87 | 88 | steps = int(6e6) 89 | step_size = 1000 90 | plt.plot(np.arange(1, steps, step_size), [opt.get_lr({'step': i}, opt.defaults) for i in range(1, steps, step_size)]) 91 | plt.legend(["5000:4e6", "5000:2e6", "5000:1e6"]) 92 | plt.title("Learning Rate") 93 | plt.show() 94 | 95 | 96 | if __name__ == '__main__': 97 | _test_lr() 98 | -------------------------------------------------------------------------------- /labml_nn/normalization/weight_standardization/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Weight Standardization 4 | summary: > 5 | A PyTorch implementation/tutorial of Weight Standardization. 6 | --- 7 | 8 | # Weight Standardization 9 | 10 | This is a [PyTorch](https://pytorch.org) implementation of Weight Standardization from the paper 11 | [Micro-Batch Training with Batch-Channel Normalization and Weight Standardization](https://arxiv.org/abs/1903.10520). 12 | We also have an [annotated implementation of Batch-Channel Normalization](../batch_channel_norm/index.html). 13 | 14 | Batch normalization **gives a smooth loss landscape** and 15 | **avoids elimination singularities**. 16 | Elimination singularities are nodes of the network that become 17 | useless (e.g. a ReLU that gives 0 all the time). 18 | 19 | However, batch normalization doesn't work well when the batch size is too small, 20 | which happens when training large networks because of device memory limitations. 21 | The paper introduces Weight Standardization with Batch-Channel Normalization as 22 | a better alternative. 23 | 24 | Weight Standardization: 25 | 1. Normalizes the gradients 26 | 2. Smoothes the landscape (reduced Lipschitz constant) 27 | 3. Avoids elimination singularities 28 | 29 | The Lipschitz constant is the maximum slope a function has between two points. 30 | That is, $L$ is the Lipschitz constant where $L$ is the smallest value that satisfies, 31 | $\forall a,b \in A: \lVert f(a) - f(b) \rVert \le L \lVert a - b \rVert$ 32 | where $f: A \rightarrow \mathbb{R}^m, A \in \mathbb{R}^n$. 33 | 34 | Elimination singularities are avoided because it keeps the statistics of the outputs similar to the 35 | inputs. So as long as the inputs are normally distributed the outputs remain close to normal. 36 | This avoids outputs of nodes from always falling beyond the active range of the activation function 37 | (e.g. always negative input for a ReLU). 
38 | 39 | *[Refer to the paper for proofs](https://arxiv.org/abs/1903.10520)*. 40 | 41 | Here is [the training code](experiment.html) for training 42 | a VGG network that uses weight standardization to classify CIFAR-10 data. 43 | This uses a [2D-Convolution Layer with Weight Standardization](../conv2d.html). 44 | 45 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/weight_standardization/experiment.ipynb) 46 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/f4a783a2a7df11eb921d0242ac1c0002) 47 | [![WandB](https://img.shields.io/badge/wandb-run-yellow)](https://wandb.ai/vpj/cifar10/runs/3flr4k8w) 48 | """ 49 | 50 | import torch 51 | 52 | 53 | def weight_standardization(weight: torch.Tensor, eps: float): 54 | r""" 55 | ## Weight Standardization 56 | 57 | $$\hat{W}_{i,j} = \frac{W_{i,j} - \mu_{W_{i,\cdot}}} {\sigma_{W_{i,\cdot}}}$$ 58 | 59 | where, 60 | 61 | \begin{align} 62 | W &\in \mathbb{R}^{O \times I} \\ 63 | \mu_{W_{i,\cdot}} &= \frac{1}{I} \sum_{j=1}^I W_{i,j} \\ 64 | \sigma_{W_{i,\cdot}} &= \sqrt{\frac{1}{I} \sum_{j=1}^I W^2_{i,j} - \mu^2_{W_{i,\cdot}} + \epsilon} \\ 65 | \end{align} 66 | 67 | for a 2D-convolution layer $O$ is the number of output channels ($O = C_{out}$) 68 | and $I$ is the number of input channels times the kernel size ($I = C_{in} \times k_H \times k_W$) 69 | """ 70 | 71 | # Get $C_{out}$, $C_{in}$ and kernel shape 72 | c_out, c_in, *kernel_shape = weight.shape 73 | # Reshape $W$ to $O \times I$ 74 | weight = weight.view(c_out, -1) 75 | # Calculate 76 | # 77 | # \begin{align} 78 | # \mu_{W_{i,\cdot}} &= \frac{1}{I} \sum_{j=1}^I W_{i,j} \\ 79 | # \sigma^2_{W_{i,\cdot}} &= \frac{1}{I} \sum_{j=1}^I W^2_{i,j} - \mu^2_{W_{i,\cdot}} 80 | # \end{align} 81 | var, mean = torch.var_mean(weight, dim=1, keepdim=True) 82 | # Normalize 83 | # $$\hat{W}_{i,j} = \frac{W_{i,j} - \mu_{W_{i,\cdot}}} {\sigma_{W_{i,\cdot}}}$$ 84 | weight = (weight - mean) / (torch.sqrt(var + eps)) 85 | # Change back to original shape and return 86 | return weight.view(c_out, c_in, *kernel_shape) 87 | -------------------------------------------------------------------------------- /labml_nn/transformers/fast_weights/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Train Fast Weights Transformer 4 | summary: This is training code with notes for a Fast Weights Transformer. 5 | --- 6 | 7 | # Train Fast Weights Transformer 8 | 9 | This trains a fast weights transformer model for auto-regression. 10 | 11 | Here’s a Colab notebook for training a fast weights transformer on Tiny Shakespeare dataset. 
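For intuition about what the attention layers in this model compute, here is a toy sketch of the delta-rule fast weight update described in the paper. The ReLU feature map, sizes, and random inputs are stand-in assumptions; the actual model below uses `DPFP` features and the `FastWeightsAttention` module:

```python
import torch

# Toy, step-by-step sketch of the delta-rule fast weight update
# (assumptions: single head, ReLU feature map instead of DPFP, random inputs).
torch.manual_seed(0)
d_k, d_v = 8, 8
W = torch.zeros(d_v, d_k)                       # fast weight memory
phi = lambda u: torch.relu(u)                   # stand-in feature map

for step in range(16):
    k = phi(torch.randn(d_k)); k = k / (k.sum() + 1e-6)   # normalized key features
    q = phi(torch.randn(d_k)); q = q / (q.sum() + 1e-6)   # normalized query features
    v = torch.randn(d_v)                                   # value to store
    beta = torch.sigmoid(torch.randn(()))                  # write strength

    v_bar = W @ k                               # what the memory currently returns for k
    W = W + beta * torch.outer(v - v_bar, k)    # delta-rule write
    y = W @ q                                   # read-out used as the layer output
```

The memory `W` is only modified in the direction of the current key features, which is what lets the layer edit previously stored associations instead of just accumulating them.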
12 | 13 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/fast_weights/experiment.ipynb) 14 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/928aadc0846c11eb85710242ac1c0002) 15 | """ 16 | 17 | import torch 18 | from torch import nn 19 | 20 | from labml import experiment 21 | from labml.configs import option 22 | from labml.utils.pytorch import get_modules 23 | from labml_helpers.module import Module 24 | from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs 25 | 26 | 27 | class AutoregressiveModel(Module): 28 | """ 29 | ## Auto regressive model 30 | """ 31 | 32 | def __init__(self, n_vocab: int, d_model: int, transformer: Module): 33 | super().__init__() 34 | # Token embedding module 35 | self.src_embed = nn.Embedding(n_vocab, d_model) 36 | self.transformer = transformer 37 | self.generator = nn.Linear(d_model, n_vocab) 38 | 39 | def forward(self, x: torch.Tensor): 40 | # Embed the tokens 41 | x = self.src_embed(x) 42 | # Run it through the the transformer 43 | res = self.transformer(x) 44 | # Generate logits of the next token 45 | return self.generator(res), None 46 | 47 | 48 | class Configs(NLPAutoRegressionConfigs): 49 | """ 50 | ## Configurations 51 | 52 | The default configs can and will be over-ridden when we start the experiment 53 | """ 54 | 55 | model: AutoregressiveModel 56 | 57 | d_model: int = 512 58 | nu: int = 1 59 | heads: int = 8 60 | dropout: float = 0.0 61 | d_ff: int = 2048 62 | n_layers: int = 6 63 | 64 | 65 | @option(Configs.model) 66 | def fast_weights_transformer(c: Configs): 67 | """ 68 | Create [fast weights transformer](index.html). 69 | """ 70 | from labml_nn.transformers.fast_weights import FastWeightsAttentionTransformer, \ 71 | FastWeightsAttentionTransformerLayer, FastWeightsAttention, FeedForward 72 | 73 | from labml_nn.transformers.fast_weights import DPFP 74 | return AutoregressiveModel( 75 | c.n_tokens, c.d_model, 76 | FastWeightsAttentionTransformer( 77 | FastWeightsAttentionTransformerLayer(d_model=c.d_model, 78 | attn=FastWeightsAttention(c.heads, c.d_model, c.dropout, DPFP(nu=c.nu)), 79 | feed_forward=FeedForward(c.d_model, c.d_ff, c.dropout), 80 | dropout_prob=c.dropout), 81 | c.n_layers)).to(c.device) 82 | 83 | 84 | def main(): 85 | # Create experiment 86 | experiment.create(name="fast_weights_transformer") 87 | # Create configs 88 | conf = Configs() 89 | # Load configurations 90 | experiment.configs(conf, 91 | # A dictionary of configurations to override 92 | {'tokenizer': 'character', 93 | 'text': 'tiny_shakespeare', 94 | 'optimizer.learning_rate': 1.0, 95 | 'optimizer.optimizer': 'Noam', 96 | 'prompt': 'It is', 97 | 'prompt_separator': '', 98 | 99 | 'train_loader': 'shuffled_train_loader', 100 | 'valid_loader': 'shuffled_valid_loader', 101 | 102 | 'seq_len': 128, 103 | 'epochs': 128, 104 | 'batch_size': 16, 105 | 'inner_iterations': 25}) 106 | 107 | # Set models for saving and loading 108 | experiment.add_pytorch_models(get_modules(conf)) 109 | 110 | # Start the experiment 111 | with experiment.start(): 112 | # Run the training loop 113 | conf.run() 114 | 115 | 116 | if __name__ == '__main__': 117 | main() 118 | -------------------------------------------------------------------------------- /labml_nn/gan/dcgan/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Deep Convolutional Generative 
Adversarial Networks (DCGAN) 4 | summary: A simple PyTorch implementation/tutorial of Deep Convolutional Generative Adversarial Networks (DCGAN). 5 | --- 6 | 7 | # Deep Convolutional Generative Adversarial Networks (DCGAN) 8 | 9 | This is a [PyTorch](https://pytorch.org) implementation of paper 10 | [Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](https://arxiv.org/abs/1511.06434). 11 | 12 | This implementation is based on the [PyTorch DCGAN Tutorial](https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html). 13 | """ 14 | 15 | import torch.nn as nn 16 | 17 | from labml import experiment 18 | from labml.configs import calculate 19 | from labml_helpers.module import Module 20 | from labml_nn.gan.original.experiment import Configs 21 | 22 | 23 | class Generator(Module): 24 | """ 25 | ### Convolutional Generator Network 26 | 27 | This is similar to the de-convolutional network used for CelebA faces, 28 | but modified for MNIST images. 29 | 30 | 31 | """ 32 | 33 | def __init__(self): 34 | super().__init__() 35 | # The input is $1 \times 1$ with 100 channels 36 | self.layers = nn.Sequential( 37 | # This gives $3 \times 3$ output 38 | nn.ConvTranspose2d(100, 1024, 3, 1, 0, bias=False), 39 | nn.BatchNorm2d(1024), 40 | nn.ReLU(True), 41 | # This gives $7 \times 7$ 42 | nn.ConvTranspose2d(1024, 512, 3, 2, 0, bias=False), 43 | nn.BatchNorm2d(512), 44 | nn.ReLU(True), 45 | # This gives $14 \times 14$ 46 | nn.ConvTranspose2d(512, 256, 4, 2, 1, bias=False), 47 | nn.BatchNorm2d(256), 48 | nn.ReLU(True), 49 | # This gives $28 \times 28$ 50 | nn.ConvTranspose2d(256, 1, 4, 2, 1, bias=False), 51 | nn.Tanh() 52 | ) 53 | 54 | self.apply(_weights_init) 55 | 56 | def __call__(self, x): 57 | # Change from shape `[batch_size, 100]` to `[batch_size, 100, 1, 1]` 58 | x = x.unsqueeze(-1).unsqueeze(-1) 59 | x = self.layers(x) 60 | return x 61 | 62 | 63 | class Discriminator(Module): 64 | """ 65 | ### Convolutional Discriminator Network 66 | """ 67 | 68 | def __init__(self): 69 | super().__init__() 70 | # The input is $28 \times 28$ with one channel 71 | self.layers = nn.Sequential( 72 | # This gives $14 \times 14$ 73 | nn.Conv2d(1, 256, 4, 2, 1, bias=False), 74 | nn.LeakyReLU(0.2, inplace=True), 75 | # This gives $7 \times 7$ 76 | nn.Conv2d(256, 512, 4, 2, 1, bias=False), 77 | nn.BatchNorm2d(512), 78 | nn.LeakyReLU(0.2, inplace=True), 79 | # This gives $3 \times 3$ 80 | nn.Conv2d(512, 1024, 3, 2, 0, bias=False), 81 | nn.BatchNorm2d(1024), 82 | nn.LeakyReLU(0.2, inplace=True), 83 | # This gives $1 \times 1$ 84 | nn.Conv2d(1024, 1, 3, 1, 0, bias=False), 85 | ) 86 | self.apply(_weights_init) 87 | 88 | def forward(self, x): 89 | x = self.layers(x) 90 | return x.view(x.shape[0], -1) 91 | 92 | 93 | def _weights_init(m): 94 | classname = m.__class__.__name__ 95 | if classname.find('Conv') != -1: 96 | nn.init.normal_(m.weight.data, 0.0, 0.02) 97 | elif classname.find('BatchNorm') != -1: 98 | nn.init.normal_(m.weight.data, 1.0, 0.02) 99 | nn.init.constant_(m.bias.data, 0) 100 | 101 | 102 | # We import the [simple gan experiment]((simple_mnist_experiment.html) and change the 103 | # generator and discriminator networks 104 | calculate(Configs.generator, 'cnn', lambda c: Generator().to(c.device)) 105 | calculate(Configs.discriminator, 'cnn', lambda c: Discriminator().to(c.device)) 106 | 107 | 108 | def main(): 109 | conf = Configs() 110 | experiment.create(name='mnist_dcgan') 111 | experiment.configs(conf, 112 | {'discriminator': 'cnn', 113 | 'generator': 'cnn', 114 | 
'label_smoothing': 0.01}) 115 | with experiment.start(): 116 | conf.run() 117 | 118 | 119 | if __name__ == '__main__': 120 | main() 121 | -------------------------------------------------------------------------------- /labml_nn/resnets/utils/train.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import torch 4 | from torch.utils.data import DataLoader, ConcatDataset 5 | # from sklearn.model_selection import KFold 6 | # from torch.utils.data.sampler import SubsetRandomSampler 7 | 8 | import matplotlib.pyplot as plt 9 | from pylab import * 10 | import os 11 | 12 | from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR 13 | 14 | 15 | 16 | class Trainer(): 17 | def __init__(self, net, opt, cost, name="default", lr=0.0005, use_lr_schedule =False , device=None): 18 | self.net = net 19 | self.opt = opt 20 | self.cost = cost 21 | self.device = device 22 | self.epoch = 0 23 | self.start_epoch = 0 24 | self.name = name 25 | 26 | self.lr = lr 27 | self.use_lr_schedule = use_lr_schedule 28 | if self.use_lr_schedule: 29 | self.scheduler = ReduceLROnPlateau( self.opt, 'max', factor=0.1, patience=5, threshold=0.00001, verbose=True) 30 | # self.scheduler = StepLR(self.opt, step_size=15, gamma=0.1) 31 | 32 | # Train loop over epochs. Optinal use testloader to return test accuracy after each epoch 33 | def Train(self, trainloader, epochs, testloader=None): 34 | # Enable Dropout 35 | 36 | # Record loss/accuracies 37 | loss = torch.zeros(epochs) 38 | self.epoch = 0 39 | 40 | # If testloader is used, loss will be the accuracy 41 | for epoch in range(self.start_epoch, self.start_epoch+epochs): 42 | self.epoch = epoch+1 43 | 44 | self.net.train() # Enable Dropout 45 | for data in trainloader: 46 | # Get the inputs; data is a list of [inputs, labels] 47 | if self.device: 48 | images, labels = data[0].to(self.device), data[1].to(self.device) 49 | else: 50 | images, labels = data 51 | 52 | self.opt.zero_grad() 53 | # Forward + backward + optimize 54 | outputs = self.net(images) 55 | epoch_loss = self.cost(outputs, labels) 56 | epoch_loss.backward() 57 | self.opt.step() 58 | 59 | loss[epoch] += epoch_loss.item() 60 | 61 | if testloader: 62 | loss[epoch] = self.Test(testloader) 63 | else: 64 | loss[epoch] /= len(trainloader) 65 | 66 | print("Epoch %d Learning rate %.6f %s: %.3f" % ( 67 | self.epoch, self.opt.param_groups[0]['lr'], "Accuracy" if testloader else "Loss", loss[epoch])) 68 | 69 | #learning rate scheduler 70 | if self.use_lr_schedule: 71 | self.scheduler.step(loss[epoch]) 72 | # self.scheduler.step() 73 | 74 | # Saving best model 75 | if loss[epoch] >= torch.max(loss): 76 | self.save_best_model({ 77 | 'epoch': self.epoch, 78 | 'state_dict': self.net.state_dict(), 79 | 'optimizer': self.opt.state_dict(), 80 | }) 81 | 82 | return loss 83 | 84 | # Testing 85 | def Test(self, testloader, ret="accuracy"): 86 | # Disable Dropout 87 | self.net.eval() 88 | 89 | # Track correct and total 90 | correct = 0.0 91 | total = 0.0 92 | with torch.no_grad(): 93 | for data in testloader: 94 | if self.device: 95 | images, labels = data[0].to(self.device), data[1].to(self.device) 96 | else: 97 | images, labels = data 98 | 99 | outputs = self.net(images) 100 | _, predicted = torch.max(outputs.data, 1) 101 | total += labels.size(0) 102 | correct += (predicted == labels).sum().item() 103 | 104 | return correct / total 105 | 106 | def save_best_model(self, state): 107 | directory = os.path.dirname("./save/%s-best-model/"%(self.name)) 108 | if not os.path.exists(directory): 
109 | os.mkdir(directory) 110 | torch.save(state, "%s/model.pt" %(directory)) 111 | 112 | def save_checkpoint(self, state): 113 | directory = os.path.dirname("./save/%s-checkpoints/"%(self.name)) 114 | if not os.path.exists(directory): 115 | os.mkdir(directory) 116 | torch.save(state, "%s/model_epoch_%s.pt" %(directory, self.epoch)) 117 | # torch.save(state, "./save/checkpoints/model_epoch_%s.pt" % (self.epoch)) 118 | -------------------------------------------------------------------------------- /labml_nn/normalization/batch_norm/readme.md: -------------------------------------------------------------------------------- 1 | # [Batch Normalization](https://nn.labml.ai/normalization/batch_norm/index.html) 2 | 3 | This is a [PyTorch](https://pytorch.org) implementation of Batch Normalization from paper 4 | [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167). 5 | 6 | ### Internal Covariate Shift 7 | 8 | The paper defines *Internal Covariate Shift* as the change in the 9 | distribution of network activations due to the change in 10 | network parameters during training. 11 | For example, let's say there are two layers $l_1$ and $l_2$. 12 | During the beginning of the training $l_1$ outputs (inputs to $l_2$) 13 | could be in distribution $\mathcal{N}(0.5, 1)$. 14 | Then, after some training steps, it could move to $\mathcal{N}(0.6, 1.5)$. 15 | This is *internal covariate shift*. 16 | 17 | Internal covariate shift will adversely affect training speed because the later layers 18 | ($l_2$ in the above example) have to adapt to this shifted distribution. 19 | 20 | By stabilizing the distribution, batch normalization minimizes the internal covariate shift. 21 | 22 | ## Normalization 23 | 24 | It is known that whitening improves training speed and convergence. 25 | *Whitening* is linearly transforming inputs to have zero mean, unit variance, 26 | and be uncorrelated. 27 | 28 | ### Normalizing outside gradient computation doesn't work 29 | 30 | Normalizing outside the gradient computation using pre-computed (detached) 31 | means and variances doesn't work. For instance. (ignoring variance), let 32 | $$\hat{x} = x - \mathbb{E}[x]$$ 33 | where $x = u + b$ and $b$ is a trained bias 34 | and $\mathbb{E}[x]$ is an outside gradient computation (pre-computed constant). 35 | 36 | Note that $\hat{x}$ has no effect on $b$. 37 | Therefore, 38 | $b$ will increase or decrease based 39 | $\frac{\partial{\mathcal{L}}}{\partial x}$, 40 | and keep on growing indefinitely in each training update. 41 | The paper notes that similar explosions happen with variances. 42 | 43 | ### Batch Normalization 44 | 45 | Whitening is computationally expensive because you need to de-correlate and 46 | the gradients must flow through the full whitening calculation. 47 | 48 | The paper introduces a simplified version which they call *Batch Normalization*. 49 | First simplification is that it normalizes each feature independently to have 50 | zero mean and unit variance: 51 | $$\hat{x}^{(k)} = \frac{x^{(k)} - \mathbb{E}[x^{(k)}]}{\sqrt{Var[x^{(k)}]}}$$ 52 | where $x = (x^{(1)} ... x^{(d)})$ is the $d$-dimensional input. 53 | 54 | The second simplification is to use estimates of mean $\mathbb{E}[x^{(k)}]$ 55 | and variance $Var[x^{(k)}]$ from the mini-batch 56 | for normalization; instead of calculating the mean and variance across the whole dataset. 57 | 58 | Normalizing each feature to zero mean and unit variance could affect what the layer 59 | can represent. 
60 | As an example paper illustrates that, if the inputs to a sigmoid are normalized 61 | most of it will be within $[-1, 1]$ range where the sigmoid is linear. 62 | To overcome this each feature is scaled and shifted by two trained parameters 63 | $\gamma^{(k)}$ and $\beta^{(k)}$. 64 | $$y^{(k)} =\gamma^{(k)} \hat{x}^{(k)} + \beta^{(k)}$$ 65 | where $y^{(k)}$ is the output of the batch normalization layer. 66 | 67 | Note that when applying batch normalization after a linear transform 68 | like $Wu + b$ the bias parameter $b$ gets cancelled due to normalization. 69 | So you can and should omit bias parameter in linear transforms right before the 70 | batch normalization. 71 | 72 | Batch normalization also makes the back propagation invariant to the scale of the weights 73 | and empirically it improves generalization, so it has regularization effects too. 74 | 75 | ## Inference 76 | 77 | We need to know $\mathbb{E}[x^{(k)}]$ and $Var[x^{(k)}]$ in order to 78 | perform the normalization. 79 | So during inference, you either need to go through the whole (or part of) dataset 80 | and find the mean and variance, or you can use an estimate calculated during training. 81 | The usual practice is to calculate an exponential moving average of 82 | mean and variance during the training phase and use that for inference. 83 | 84 | Here's [the training code](mnist.html) and a notebook for training 85 | a CNN classifier that uses batch normalization for MNIST dataset. 86 | 87 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/batch_norm/mnist.ipynb) 88 | [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://app.labml.ai/run/011254fe647011ebbb8e0242ac1c0002) 89 | -------------------------------------------------------------------------------- /labml_nn/optimizers/mnist_experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: MNIST example to test the optimizers 4 | summary: This is a simple MNIST example with a CNN model to test the optimizers. 
5 | --- 6 | 7 | # MNIST example to test the optimizers 8 | """ 9 | import torch.nn as nn 10 | import torch.utils.data 11 | from labml_helpers.module import Module 12 | 13 | from labml import experiment, tracker 14 | from labml.configs import option 15 | from labml_helpers.datasets.mnist import MNISTConfigs 16 | from labml_helpers.device import DeviceConfigs 17 | from labml_helpers.metrics.accuracy import Accuracy 18 | from labml_helpers.seed import SeedConfigs 19 | from labml_helpers.train_valid import TrainValidConfigs, BatchIndex, hook_model_outputs 20 | from labml_nn.optimizers.configs import OptimizerConfigs 21 | 22 | 23 | class Model(Module): 24 | """ 25 | ## The model 26 | """ 27 | def __init__(self): 28 | super().__init__() 29 | self.conv1 = nn.Conv2d(1, 20, 5, 1) 30 | self.pool1 = nn.MaxPool2d(2) 31 | self.conv2 = nn.Conv2d(20, 50, 5, 1) 32 | self.pool2 = nn.MaxPool2d(2) 33 | self.fc1 = nn.Linear(16 * 50, 500) 34 | self.fc2 = nn.Linear(500, 10) 35 | self.activation = nn.ReLU() 36 | 37 | def forward(self, x): 38 | x = self.activation(self.conv1(x)) 39 | x = self.pool1(x) 40 | x = self.activation(self.conv2(x)) 41 | x = self.pool2(x) 42 | x = self.activation(self.fc1(x.view(-1, 16 * 50))) 43 | return self.fc2(x) 44 | 45 | 46 | class Configs(MNISTConfigs, TrainValidConfigs): 47 | """ 48 | ## Configurable Experiment Definition 49 | """ 50 | optimizer: torch.optim.Adam 51 | model: nn.Module 52 | set_seed = SeedConfigs() 53 | device: torch.device = DeviceConfigs() 54 | epochs: int = 10 55 | 56 | is_save_models = True 57 | model: nn.Module 58 | inner_iterations = 10 59 | 60 | accuracy_func = Accuracy() 61 | loss_func = nn.CrossEntropyLoss() 62 | 63 | def init(self): 64 | tracker.set_queue("loss.*", 20, True) 65 | tracker.set_scalar("accuracy.*", True) 66 | hook_model_outputs(self.mode, self.model, 'model') 67 | self.state_modules = [self.accuracy_func] 68 | 69 | def step(self, batch: any, batch_idx: BatchIndex): 70 | # Get the batch 71 | data, target = batch[0].to(self.device), batch[1].to(self.device) 72 | 73 | # Add global step if we are in training mode 74 | if self.mode.is_train: 75 | tracker.add_global_step(len(data)) 76 | 77 | # Run the model and specify whether to log the activations 78 | with self.mode.update(is_log_activations=batch_idx.is_last): 79 | output = self.model(data) 80 | 81 | # Calculate the loss 82 | loss = self.loss_func(output, target) 83 | # Calculate the accuracy 84 | self.accuracy_func(output, target) 85 | # Log the loss 86 | tracker.add("loss.", loss) 87 | 88 | # Optimize if we are in training mode 89 | if self.mode.is_train: 90 | # Calculate the gradients 91 | loss.backward() 92 | 93 | # Take optimizer step 94 | self.optimizer.step() 95 | # Log the parameter and gradient L2 norms once per epoch 96 | if batch_idx.is_last: 97 | tracker.add('model', self.model) 98 | tracker.add('optimizer', (self.optimizer, {'model': self.model})) 99 | # Clear the gradients 100 | self.optimizer.zero_grad() 101 | 102 | # Save logs 103 | tracker.save() 104 | 105 | 106 | @option(Configs.model) 107 | def model(c: Configs): 108 | return Model().to(c.device) 109 | 110 | 111 | @option(Configs.optimizer) 112 | def _optimizer(c: Configs): 113 | """ 114 | Create a configurable optimizer. 115 | We can change the optimizer type and hyper-parameters using configurations. 
116 | """ 117 | opt_conf = OptimizerConfigs() 118 | opt_conf.parameters = c.model.parameters() 119 | return opt_conf 120 | 121 | 122 | def main(): 123 | conf = Configs() 124 | conf.inner_iterations = 10 125 | experiment.create(name='mnist_ada_belief') 126 | experiment.configs(conf, {'inner_iterations': 10, 127 | # Specify the optimizer 128 | 'optimizer.optimizer': 'Adam', 129 | 'optimizer.learning_rate': 1.5e-4}) 130 | conf.set_seed.set() 131 | experiment.add_pytorch_models(dict(model=conf.model)) 132 | with experiment.start(): 133 | conf.run() 134 | 135 | 136 | if __name__ == '__main__': 137 | main() 138 | -------------------------------------------------------------------------------- /docs/resnets/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | None 24 | 25 | 26 | 27 | 28 | 29 | 40 | 41 | 42 |
-------------------------------------------------------------------------------- /docs/experiments/index.html: --------------------------------------------------------------------------------
65 | 66 | 68 | 69 | 83 | 123 | 124 | -------------------------------------------------------------------------------- /labml_nn/normalization/instance_norm/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Instance Normalization 4 | summary: > 5 | A PyTorch implementation/tutorial of instance normalization. 6 | --- 7 | 8 | # Instance Normalization 9 | 10 | This is a [PyTorch](https://pytorch.org) implementation of 11 | [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022). 12 | 13 | Instance normalization was introduced to improve [style transfer](https://paperswithcode.com/task/style-transfer). 14 | It is based on the observation that stylization should not depend on the contrast of the content image. 15 | The "contrast normalization" is 16 | 17 | $$y_{t,i,j,k} = \frac{x_{t,i,j,k}}{\sum_{l=1}^H \sum_{m=1}^W x_{t,i,l,m}}$$ 18 | 19 | where $x$ is a batch of images with dimensions image index $t$, 20 | feature channel $i$, and 21 | spatial position $j, k$. 22 | 23 | Since it's hard for a convolutional network to learn "contrast normalization", this paper 24 | introduces instance normalization which does that. 25 | 26 | Here's a [CIFAR 10 classification model](experiment.html) that uses instance normalization. 27 | """ 28 | 29 | import torch 30 | from torch import nn 31 | 32 | from labml_helpers.module import Module 33 | 34 | 35 | class InstanceNorm(Module): 36 | r""" 37 | ## Instance Normalization Layer 38 | 39 | Instance normalization layer $\text{IN}$ normalizes the input $X$ as follows: 40 | 41 | When input $X \in \mathbb{R}^{B \times C \times H \times W}$ is a batch of image representations, 42 | where $B$ is the batch size, $C$ is the number of channels, $H$ is the height and $W$ is the width. 43 | $\gamma \in \mathbb{R}^{C}$ and $\beta \in \mathbb{R}^{C}$. The affine transformation with $gamma$ and 44 | $beta$ are optional. 45 | 46 | $$\text{IN}(X) = \gamma 47 | \frac{X - \underset{H, W}{\mathbb{E}}[X]}{\sqrt{\underset{H, W}{Var}[X] + \epsilon}} 48 | + \beta$$ 49 | """ 50 | 51 | def __init__(self, channels: int, *, 52 | eps: float = 1e-5, affine: bool = True): 53 | """ 54 | * `channels` is the number of features in the input 55 | * `eps` is $\epsilon$, used in $\sqrt{Var[X] + \epsilon}$ for numerical stability 56 | * `affine` is whether to scale and shift the normalized value 57 | """ 58 | super().__init__() 59 | 60 | self.channels = channels 61 | 62 | self.eps = eps 63 | self.affine = affine 64 | # Create parameters for $\gamma$ and $\beta$ for scale and shift 65 | if self.affine: 66 | self.scale = nn.Parameter(torch.ones(channels)) 67 | self.shift = nn.Parameter(torch.zeros(channels)) 68 | 69 | def forward(self, x: torch.Tensor): 70 | """ 71 | `x` is a tensor of shape `[batch_size, channels, *]`. 72 | `*` denotes any number of (possibly 0) dimensions. 73 | For example, in an image (2D) convolution this will be 74 | `[batch_size, channels, height, width]` 75 | """ 76 | # Keep the original shape 77 | x_shape = x.shape 78 | # Get the batch size 79 | batch_size = x_shape[0] 80 | # Sanity check to make sure the number of features is the same 81 | assert self.channels == x.shape[1] 82 | 83 | # Reshape into `[batch_size, channels, n]` 84 | x = x.view(batch_size, self.channels, -1) 85 | 86 | # Calculate the mean across last dimension 87 | # i.e. 
the means for each feature $\mathbb{E}[x_{t,i}]$ 88 | mean = x.mean(dim=[-1], keepdim=True) 89 | # Calculate the mean of the squares across the last dimension; 90 | # i.e. the mean of the squares for each feature $\mathbb{E}[x_{t,i}^2]$ 91 | mean_x2 = (x ** 2).mean(dim=[-1], keepdim=True) 92 | # Variance for each feature $Var[x_{t,i}] = \mathbb{E}[x_{t,i}^2] - \mathbb{E}[x_{t,i}]^2$ 93 | var = mean_x2 - mean ** 2 94 | 95 | # Normalize $$\hat{x}_{t,i} = \frac{x_{t,i} - \mathbb{E}[x_{t,i}]}{\sqrt{Var[x_{t,i}] + \epsilon}}$$ 96 | x_norm = (x - mean) / torch.sqrt(var + self.eps) 97 | x_norm = x_norm.view(batch_size, self.channels, -1) 98 | 99 | # Scale and shift $$y_{t,i} = \gamma_i \hat{x}_{t,i} + \beta_i$$ 100 | if self.affine: 101 | x_norm = self.scale.view(1, -1, 1) * x_norm + self.shift.view(1, -1, 1) 102 | 103 | # Reshape to the original shape and return 104 | return x_norm.view(x_shape) 105 | 106 | 107 | def _test(): 108 | """ 109 | Simple test 110 | """ 111 | from labml.logger import inspect 112 | 113 | x = torch.zeros([2, 6, 2, 4]) 114 | inspect(x.shape) 115 | bn = InstanceNorm(6) 116 | 117 | x = bn(x) 118 | inspect(x.shape) 119 | 120 | 121 | # 122 | if __name__ == '__main__': 123 | _test() 124 |
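A quick way to sanity-check the `InstanceNorm` module above is to compare it with PyTorch's built-in `nn.InstanceNorm2d`. This is a minimal sketch, assuming the `labml-nn` package is installed and importable; right after construction the affine parameters are initialized to $\gamma = 1$ and $\beta = 0$, so the output should match the non-affine built-in layer.

import torch
from torch import nn
from labml_nn.normalization.instance_norm import InstanceNorm

# A random batch of 4 images with 3 channels
x = torch.randn(4, 3, 8, 8)
# The module implemented above (scale = 1, shift = 0 right after construction)
custom = InstanceNorm(3)
# PyTorch's reference implementation, without the affine transformation
builtin = nn.InstanceNorm2d(3, affine=False)
# Both normalize each (sample, channel) plane over its H x W values,
# so the outputs should agree up to numerical precision
assert torch.allclose(custom(x), builtin(x), atol=1e-5)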
-------------------------------------------------------------------------------- /labml_nn/cnn/ray_tune.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import os 5 | import torch 6 | from ray import tune 7 | from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining 8 | from utils.train import Trainer 9 | from models.cnn import GetCNN 10 | 11 | # Check if a GPU is available 12 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 13 | print("Device: " + str(device)) 14 | 15 | # 16 | num_samples = 40 # number of trials to run 17 | max_num_epochs = 25 18 | gpus_per_trial = 1 19 | 20 | # CIFAR-10 dataset location 21 | data_dir = './data/Cifar10' 22 | 23 | """ 24 | Code adapted from the official Ray Tune documentation: 25 | ASHA 26 | https://docs.ray.io/en/master/tune/api_docs/schedulers.html#tune-scheduler-hyperband 27 | 28 | PBT 29 | https://docs.ray.io/en/latest/tune/api_docs/schedulers.html#tune-scheduler-pbt 30 | """ 31 | 32 | """config - a dict of hyperparameter search spaces 33 | 34 | Hyperparameters selected for tuning: 35 | l1 : Number of units in the first fully connected layer 36 | l2 : Number of units in the second fully connected layer 37 | lr : Learning rate 38 | decay : Decay rate for regularization 39 | batch_size : Batch size of the train and test data 40 | """ 41 | config = { 42 | "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)), # e.g. 4, 8, 16, ..., 256 43 | "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)), # e.g. 4, 8, 16, ..., 256 44 | "lr": tune.loguniform(1e-4, 1e-1), # sampled from a log-uniform distribution 45 | "decay": tune.sample_from(lambda _: 10 ** np.random.randint(-7, -3)), # e.g. 1e-7, 1e-6, ..., 1e-4 46 | "batch_size": tune.choice([32, 64, 128, 256]) 47 | } 48 | 49 | # Create the trainer 50 | trainer = Trainer(device=device) 51 | 52 | """ASHA (Asynchronous Successive Halving Algorithm) scheduler 53 | max_t : Maximum number of units per trial (can be time or epochs) 54 | grace_period : Stop a trial after this many units if it is not performing well (can be time or epochs) 55 | reduction_factor : Sets the halving rate 56 | """ 57 | scheduler = ASHAScheduler( 58 | max_t=max_num_epochs, 59 | grace_period=4, 60 | reduction_factor=4) 61 | 62 | 63 | 64 | """Population based training (PBT) scheduler; note that this assignment replaces the ASHA scheduler above, so keep whichever scheduler you want to use 65 | time_attr : Can be time or epochs 66 | metric : Objective of training (loss or accuracy) 67 | perturbation_interval : Perturbations occur after the specified number of units (can be time or epochs) 68 | hyperparam_mutations : Hyperparameters to mutate 69 | """ 70 | scheduler = PopulationBasedTraining( 71 | time_attr="training_iteration", # epochs 72 | metric='loss', # loss is the objective function 73 | mode='min', # minimizing the loss is the objective of training 74 | perturbation_interval=5.0, # perturb every 5 epochs 75 | hyperparam_mutations={ 76 | "lr": [1e-3, 5e-4, 1e-4, 5e-4, 1e-5], # choose from the given learning rates 77 | "batch_size": [64, 128, 256], # choose from the given batch sizes 78 | "decay": tune.uniform(10**-8, 10**-4) # sample from a uniform distribution 79 | } 80 | ) 81 | 82 | result = tune.run( 83 | tune.with_parameters(trainer.Train_ray, data_dir=data_dir), 84 | name="ray_test_basic-CNN", # name for identifying models (checkpoints) 85 | scheduler=scheduler, # select scheduler: PBT or ASHA 86 | resources_per_trial={"cpu": 8, "gpu": gpus_per_trial}, # number of CPUs and GPUs per trial 87 | config=config, # config dict of hyperparameter search spaces 88 | stop={ 89 | "training_iteration": max_num_epochs, # stopping criteria 90 | }, 91 | # metric="loss", # uncomment when using the ASHA scheduler (PBT above already sets its own metric/mode) 92 | # mode="min", # uncomment when using the ASHA scheduler 93 | num_samples=num_samples, 94 | verbose=True, # keep True to check how training progresses 95 | fail_fast=True, # fail on the first error 96 | keep_checkpoints_num=5, # number of checkpoints to keep per trial 97 | 98 | ) 99 | 100 | best_trial = result.get_best_trial("loss", "min", "last") 101 | print("Best configuration: {}".format(best_trial.config)) 102 | print("Best validation loss: {}".format(best_trial.last_result["loss"])) 103 | print("Best validation accuracy: {}".format( 104 | best_trial.last_result["accuracy"])) 105 | 106 | 107 | best_trained_model = GetCNN(best_trial.config["l1"], best_trial.config["l2"]) 108 | best_trained_model.to(device) 109 | checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint") 110 | model_state, optimizer_state = torch.load(checkpoint_path) 111 | best_trained_model.load_state_dict(model_state) 112 | 113 | # Check the accuracy of the best model 114 | test_acc = trainer.Test(best_trained_model, save=data_dir) 115 | print("Best Test accuracy: {}".format(test_acc))
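`utils.train.Trainer.Train_ray` is not shown in this listing, so here is a minimal, self-contained sketch (a toy trainable, not the actual `Trainer`) of the kind of function `tune.run()` drives: Ray calls it once per trial with a sampled `config`, and the per-epoch `tune.report(...)` calls supply the `loss` metric that lets ASHA stop weak trials early and PBT decide when to perturb.

from ray import tune

def toy_trainable(config):
    # Stand-in for a training loop: a scalar shrinks at a rate set by the sampled learning rate
    w = 10.0
    for epoch in range(20):
        w -= config["lr"] * w
        # Report the metric that the scheduler and tune.run() are configured to optimize
        tune.report(loss=abs(w))

analysis = tune.run(
    toy_trainable,
    config={"lr": tune.loguniform(1e-3, 1e-1)},
    num_samples=4,
    metric="loss",
    mode="min",
)
print(analysis.get_best_trial("loss", "min", "last").config)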
-------------------------------------------------------------------------------- /docs/resnets/utils/index.html: --------------------------------------------------------------------------------
-------------------------------------------------------------------------------- /docs/resnets/models/index.html: --------------------------------------------------------------------------------
66 | 67 | 69 | 70 | 84 | 124 | 125 | -------------------------------------------------------------------------------- /labml_nn/transformers/fast_weights/token_wise.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Fast Weight Systems 4 | summary: > 5 | This is an annotated implementation/tutorial of 6 | Linear Transformers Are Secretly Fast Weight Memory Systems in PyTorch. 7 | --- 8 | """ 9 | from typing import Optional 10 | 11 | import torch 12 | from torch import nn 13 | 14 | from labml_helpers.module import Module 15 | from labml_nn.transformers.fast_weights import DPFP 16 | from labml_nn.transformers.feed_forward import FeedForward 17 | from labml_nn.transformers.mha import PrepareForMultiHeadAttention 18 | from labml_nn.utils import clone_module_list 19 | 20 | 21 | class FastWeightsAttention(Module): 22 | def __init__(self, heads: int, d_model: int, dropout_prob: float, phi: DPFP): 23 | super().__init__() 24 | 25 | # Number of features per head 26 | self.d_k = d_model // heads 27 | # 28 | self.heads = heads 29 | 30 | # These transform the `query` multi-headed attention. 31 | self.query = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=False) 32 | # These transform the `key` and `value` for multi-headed attention. 33 | self.key = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=False) 34 | self.value = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=False) 35 | 36 | self.gate = nn.Sequential(PrepareForMultiHeadAttention(d_model, heads, 1, bias=False), 37 | nn.Sigmoid()) 38 | 39 | self.phi = phi 40 | 41 | # Output layer 42 | self.output = nn.Linear(d_model, d_model) 43 | # Dropout 44 | self.dropout = nn.Dropout(dropout_prob) 45 | 46 | def __call__(self, x: torch.Tensor, weights: Optional[torch.Tensor]): 47 | query = self.phi(self.query(x)) 48 | key = self.phi(self.key(x)) 49 | value = self.value(x) 50 | 51 | if weights is None: 52 | weights = key.new_zeros((key.shape[0], key.shape[1], value.shape[2], key.shape[2])) 53 | 54 | value_existing = torch.einsum('bhvk,bhk->bhv', weights, key) 55 | 56 | beta = self.gate(x) 57 | 58 | weights = weights + torch.einsum('bhv,bhk->bhvk', beta * (value - value_existing), key) 59 | 60 | x = torch.einsum('bhvk,bhk->bhv', weights, query) 61 | 62 | # Concatenate multiple heads 63 | x = x.reshape(x.shape[0], -1) 64 | 65 | # Output layer 66 | return self.output(x), weights 67 | 68 | 69 | class FastWeightsAttentionTransformerLayer(Module): 70 | def __init__(self, *, 71 | d_model: int, 72 | attn: FastWeightsAttention, 73 | feed_forward: FeedForward, 74 | dropout_prob: float): 75 | super().__init__() 76 | # Transformer size $d_{model}$ 77 | self.size = d_model 78 | # 79 | self.attn = attn 80 | self.feed_forward = feed_forward 81 | self.dropout = nn.Dropout(dropout_prob) 82 | 83 | # Normalization layers 84 | self.norm_self_attn = nn.LayerNorm([d_model]) 85 | self.norm_ff = nn.LayerNorm([d_model]) 86 | 87 | def __call__(self, x: torch.Tensor, weights: Optional[torch.Tensor]): 88 | attn, weights = self.attn(x, weights) 89 | # Add the self attention results 90 | x = x + self.dropout(attn) 91 | 92 | # Normalize for feed-forward 93 | z = self.norm_ff(x) 94 | # Pass through the feed-forward network 95 | ff = self.feed_forward(z) 96 | # Add the feed-forward results back 97 | x = x + self.dropout(ff) 98 | 99 | # 100 | return x, weights 101 | 102 | 103 | class FastWeightsAttentionTransformer(Module): 104 | def __init__(self, layer: FastWeightsAttentionTransformerLayer, 
n_layers: int): 105 | super().__init__() 106 | # Make copies of the transformer layer 107 | self.layers = clone_module_list(layer, n_layers) 108 | # Final normalization layer 109 | self.norm = nn.LayerNorm([layer.size]) 110 | 111 | def __call__(self, x_seq: torch.Tensor): 112 | # Split the input to a list along the sequence axis 113 | x_seq = torch.unbind(x_seq, dim=0) 114 | # List to store the outputs 115 | res = [] 116 | # For each input step 117 | weights = [None for _ in range(len(self.layers))] 118 | 119 | for x in x_seq: 120 | # Run through each layer 121 | for i, layer in enumerate(self.layers): 122 | # Get layer output 123 | x, weights[i] = layer(x, weights[i]) 124 | 125 | res.append(x) 126 | 127 | # Stack the output tensors 128 | res = torch.stack(res) 129 | # Normalize the output 130 | return self.norm(res) 131 | -------------------------------------------------------------------------------- /docs/transformers/basic/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | None 24 | 25 | 26 | 27 | 28 | 29 | 40 | 41 | 42 |
66 | 67 | 69 | 70 | 84 | 124 | 125 | -------------------------------------------------------------------------------- /labml_nn/transformers/glu_variants/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Gated Linear Units and Variants 4 | summary: > 5 | Train an auto-regressive transformer with Gated Linear Units and variants 6 | for the position-wise feedforward network (FFN). 7 | --- 8 | 9 | # Gated Linear Units and Variants 10 | 11 | This trains a simple [transformer](../../) model for auto-regression. 12 | We try different variants for the [position-wise feedforward network](../feed_forward). 13 | The reusable & configurable are defined in [`configs.py`](configs.html). 14 | """ 15 | 16 | import torch 17 | from labml import experiment 18 | from labml.configs import option 19 | from labml.utils.pytorch import get_modules 20 | from labml_helpers.module import Module 21 | 22 | from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs 23 | from labml_nn.transformers import Encoder, Generator, TransformerConfigs 24 | from labml_nn.transformers.utils import subsequent_mask 25 | 26 | 27 | class AutoregressiveModel(Module): 28 | """ 29 | ## Auto regressive model 30 | """ 31 | 32 | def __init__(self, src_embed: Module, encoder: Encoder, generator: Generator): 33 | super().__init__() 34 | # Token embedding module 35 | self.src_embed = src_embed 36 | # Transformer based encoder 37 | self.encoder = encoder 38 | # Next token generation layer; 39 | # this give logits of the the next token 40 | self.generator = generator 41 | # This will be initialized on the first call 42 | self.src_mask = None 43 | 44 | def forward(self, src: torch.Tensor): 45 | # Create subsequent mask, so that the transformer can only pay attention to past tokens. 46 | if self.src_mask is None or self.src_mask.size(0) != len(src): 47 | self.src_mask = subsequent_mask(len(src)).to(src.device) 48 | # Embed the tokens (`src`) and run it through the the transformer 49 | res = self.encoder(self.src_embed(src), self.src_mask) 50 | # Generate logits of the next token 51 | return self.generator(res), None 52 | 53 | 54 | class Configs(NLPAutoRegressionConfigs): 55 | """ 56 | ## Configurations 57 | 58 | The default configs can and will be over-ridden when we start the experiment 59 | """ 60 | 61 | transformer: TransformerConfigs 62 | model: AutoregressiveModel 63 | 64 | 65 | @option(Configs.model) 66 | def autoregressive_model(c: Configs): 67 | """ 68 | Initialize the auto-regressive model 69 | """ 70 | m = AutoregressiveModel(c.transformer.src_embed, c.transformer.encoder, c.transformer.generator) 71 | return m.to(c.device) 72 | 73 | 74 | @option(Configs.transformer) 75 | def transformer_c(c: Configs): 76 | """ 77 | Initialize the [configurable transformer](../configs.html) encoder for our autoregressive model. 
78 | """ 79 | tc = TransformerConfigs() 80 | tc.n_src_vocab = c.n_tokens 81 | tc.n_tgt_vocab = c.n_tokens 82 | 83 | return tc 84 | 85 | 86 | def main(): 87 | # Create experiment 88 | experiment.create(name="glu_variants") 89 | # Create configs 90 | conf = Configs() 91 | # Load configurations 92 | experiment.configs(conf, 93 | # A dictionary of configurations to override 94 | {'tokenizer': 'character', 95 | 'prompt_separator': '', 96 | 'prompt': 'It is ', 97 | 'text': 'tiny_shakespeare', 98 | 99 | 'optimizer.optimizer': 'Noam', 100 | 'optimizer.learning_rate': 1., 101 | 'optimizer.d_model': 256, 102 | 103 | 'seq_len': 1024, 104 | 'epochs': 128, 105 | 'batch_size': 6, 106 | 'inner_iterations': 10, 107 | 108 | # GLU Variant, one of GLU, Bilinear, ReGLU, GEGLU, SwiGLU 109 | # 110 | # These are defined in the [configurable FFN](../configs.html#FFN) 111 | # implementation 112 | 'transformer.ffn.glu_variant': 'Bilinear', 113 | 114 | # Transformer configurations 115 | 'transformer.d_model': 256, 116 | 'transformer.ffn.d_ff': 1024, 117 | 'transformer.n_heads': 8, 118 | 'transformer.n_layers': 6}) 119 | 120 | # This is needed to initialize models 121 | conf.n_tokens = conf.text.n_tokens 122 | 123 | # Set models for saving and loading 124 | experiment.add_pytorch_models(get_modules(conf)) 125 | 126 | # Start the experiment 127 | with experiment.start(): 128 | # `TrainValidConfigs.run` 129 | conf.run() 130 | 131 | 132 | if __name__ == '__main__': 133 | main() 134 | -------------------------------------------------------------------------------- /labml_nn/transformers/fnet/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: FNet Experiment 4 | summary: This experiment trains a FNet based model on AG News dataset. 5 | --- 6 | 7 | # [FNet](index.html) Experiment 8 | 9 | This is an annotated PyTorch experiment to train a [FNet model](index.html). 10 | 11 | This is based on 12 | [general training loop and configurations for AG News classification task](../../experiments/nlp_classification.html). 13 | """ 14 | 15 | import torch 16 | from torch import nn 17 | 18 | from labml import experiment 19 | from labml.configs import option 20 | from labml_helpers.module import Module 21 | from labml_nn.experiments.nlp_classification import NLPClassificationConfigs 22 | from labml_nn.transformers import Encoder 23 | from labml_nn.transformers import TransformerConfigs 24 | 25 | 26 | class TransformerClassifier(nn.Module): 27 | """ 28 | # Transformer based classifier model 29 | """ 30 | def __init__(self, encoder: Encoder, src_embed: Module, generator: nn.Linear): 31 | """ 32 | * `encoder` is the transformer [Encoder](../models.html#Encoder) 33 | * `src_embed` is the token 34 | [embedding module (with positional encodings)](../models.html#EmbeddingsWithLearnedPositionalEncoding) 35 | * `generator` is the [final fully connected layer](../models.html#Generator) that gives the logits. 36 | """ 37 | super().__init__() 38 | self.src_embed = src_embed 39 | self.encoder = encoder 40 | self.generator = generator 41 | 42 | def forward(self, x: torch.Tensor): 43 | # Get the token embeddings with positional encodings 44 | x = self.src_embed(x) 45 | # Transformer encoder 46 | x = self.encoder(x, None) 47 | # Get logits for classification. 48 | # 49 | # We set the `[CLS]` token at the last position of the sequence. 
50 | # This is extracted by `x[-1]`, where `x` is of 51 | # shape `[seq_len, batch_size, d_model]` 52 | x = self.generator(x[-1]) 53 | 54 | # Return results 55 | # (second value is for state, since our trainer is used with RNNs also) 56 | return x, None 57 | 58 | 59 | class Configs(NLPClassificationConfigs): 60 | """ 61 | ## Configurations 62 | 63 | This inherits from 64 | [`NLPClassificationConfigs`](../../experiments/nlp_classification.html) 65 | """ 66 | 67 | # Classification model 68 | model: TransformerClassifier 69 | # Transformer 70 | transformer: TransformerConfigs 71 | 72 | 73 | @option(Configs.transformer) 74 | def _transformer_configs(c: Configs): 75 | """ 76 | ### Transformer configurations 77 | """ 78 | 79 | # We use our 80 | # [configurable transformer implementation](../configs.html#TransformerConfigs) 81 | conf = TransformerConfigs() 82 | # Set the vocabulary sizes for embeddings and generating logits 83 | conf.n_src_vocab = c.n_tokens 84 | conf.n_tgt_vocab = c.n_tokens 85 | 86 | # 87 | return conf 88 | 89 | 90 | @option(TransformerConfigs.encoder_attn) 91 | def fnet_mix(): 92 | """ 93 | Create `FNetMix` module that can replace the self-attention in 94 | [transformer encoder layer](../models.html#TransformerLayer) 95 | . 96 | """ 97 | from labml_nn.transformers.fnet import FNetMix 98 | return FNetMix() 99 | 100 | 101 | @option(Configs.model) 102 | def _model(c: Configs): 103 | """ 104 | Create classification model 105 | """ 106 | m = TransformerClassifier(c.transformer.encoder, 107 | c.transformer.src_embed, 108 | nn.Linear(c.d_model, c.n_classes)).to(c.device) 109 | 110 | return m 111 | 112 | 113 | def main(): 114 | # Create experiment 115 | experiment.create(name="fnet") 116 | # Create configs 117 | conf = Configs() 118 | # Override configurations 119 | experiment.configs(conf, { 120 | # Use world level tokenizer 121 | 'tokenizer': 'basic_english', 122 | 123 | # Train for $32$ epochs 124 | 'epochs': 32, 125 | # Switch between training and validation for $10$ times 126 | # per epoch 127 | 'inner_iterations': 10, 128 | 129 | # Transformer configurations (same as defaults) 130 | 'transformer.d_model': 512, 131 | 'transformer.ffn.d_ff': 2048, 132 | 'transformer.n_heads': 8, 133 | 'transformer.n_layers': 6, 134 | 135 | # Use [FNet](index.html) instead of self-a 136 | # ttention 137 | 'transformer.encoder_attn': 'fnet_mix', 138 | 139 | # Use [Noam optimizer](../../optimizers/noam.html) 140 | 'optimizer.optimizer': 'Noam', 141 | 'optimizer.learning_rate': 1., 142 | }) 143 | 144 | # Set models for saving and loading 145 | experiment.add_pytorch_models({'model': conf.model}) 146 | 147 | # Start the experiment 148 | with experiment.start(): 149 | # Run training 150 | conf.run() 151 | 152 | 153 | # 154 | if __name__ == '__main__': 155 | main() 156 | -------------------------------------------------------------------------------- /docs/transformers/relative_mha.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | Relative Multi-Headed Attention 25 | 26 | 27 | 28 | 29 | 30 | 41 | 42 | 43 |
66 | 67 | 69 | 70 | 84 | 124 | 125 | -------------------------------------------------------------------------------- /labml_nn/transformers/knn/train_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | --- 3 | title: Train Autoregressive Transformer 4 | summary: This is training code with notes for a basic auto-regressive transformer. 5 | --- 6 | 7 | # Train Autoregressive Transformer 8 | 9 | This trains a simple [transformer](../../) model for auto-regression. 10 | """ 11 | 12 | import torch 13 | from labml import experiment 14 | from labml.configs import option 15 | from labml.utils.pytorch import get_modules 16 | from labml_helpers.module import Module 17 | 18 | from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs 19 | from labml_nn.transformers import Encoder, Generator, TransformerConfigs 20 | from labml_nn.transformers.utils import subsequent_mask 21 | 22 | 23 | class AutoregressiveModel(Module): 24 | """ 25 | ## Auto regressive model 26 | """ 27 | 28 | def __init__(self, src_embed: Module, encoder: Encoder, generator: Generator, *, 29 | is_save_ff_input: bool = False): 30 | super().__init__() 31 | # Token embedding module 32 | self.src_embed = src_embed 33 | # Transformer based encoder 34 | self.encoder = encoder 35 | # Whether the last layer of the encoder should 36 | # save the input to the feed-forward layer. 37 | # This is out $f(c_t)$, the embedding of the context. 38 | self.encoder.layers[-1].is_save_ff_input = is_save_ff_input 39 | # Next token generation layer; 40 | # this give logits of the the next token 41 | self.generator = generator 42 | # This will be initialized on the first call 43 | self.src_mask = None 44 | 45 | @property 46 | def ff_input(self) -> torch.Tensor: 47 | """ 48 | Retrieve saved $f(c_t)$ 49 | """ 50 | return self.encoder.layers[-1].ff_input 51 | 52 | def forward(self, src: torch.Tensor): 53 | # Create subsequent mask, so that the transformer can only pay attention to past tokens. 
54 | if self.src_mask is None or self.src_mask.size(0) != len(src): 55 | self.src_mask = subsequent_mask(len(src)).to(src.device) 56 | # Embed the tokens (`src`) and run it through the the transformer 57 | res = self.encoder(self.src_embed(src), self.src_mask) 58 | # Generate logits of the next token 59 | return self.generator(res), None 60 | 61 | 62 | class Configs(NLPAutoRegressionConfigs): 63 | """ 64 | ## Configurations 65 | 66 | The default configs can and will be over-ridden when we start the experiment 67 | """ 68 | 69 | transformer: TransformerConfigs 70 | model: AutoregressiveModel 71 | 72 | is_save_ff_input = False 73 | 74 | 75 | @option(Configs.model) 76 | def autoregressive_model(c: Configs): 77 | """ 78 | Initialize the auto-regressive model 79 | """ 80 | m = AutoregressiveModel( 81 | # Get the source token embedding layer, encoder and 82 | # final token generator from configurable transformer 83 | src_embed=c.transformer.src_embed, 84 | encoder=c.transformer.encoder, 85 | generator=c.transformer.generator, 86 | # Whether to save $f(c_t)$ 87 | is_save_ff_input=c.is_save_ff_input) 88 | return m.to(c.device) 89 | 90 | 91 | @option(Configs.transformer) 92 | def transformer_c(c: Configs): 93 | """ 94 | Initialize the configurable transformer encoder for our autoregressive model 95 | """ 96 | tc = TransformerConfigs() 97 | tc.n_src_vocab = c.n_tokens 98 | tc.n_tgt_vocab = c.n_tokens 99 | 100 | return tc 101 | 102 | 103 | def main(): 104 | # Create experiment 105 | experiment.create(name="knn_lm") 106 | # Create configs 107 | conf = Configs() 108 | # Load configurations 109 | experiment.configs(conf, 110 | # A dictionary of configurations to override 111 | {'tokenizer': 'character', 112 | 'prompt_separator': '', 113 | 'prompt': 'It is ', 114 | 'text': 'tiny_shakespeare', 115 | 116 | 'optimizer.optimizer': 'Noam', 117 | 'optimizer.learning_rate': 1., 118 | 'optimizer.d_model': 256, 119 | 120 | 'seq_len': 1024, 121 | 'epochs': 128, 122 | 'batch_size': 6, 123 | 'inner_iterations': 10, 124 | 125 | # Transformer configurations 126 | 'transformer.d_model': 256, 127 | 'transformer.ffn.d_ff': 1024, 128 | 'transformer.n_heads': 8, 129 | 'transformer.n_layers': 6}) 130 | 131 | # This is needed to initialize models 132 | conf.n_tokens = conf.text.n_tokens 133 | 134 | # Set models for saving and loading 135 | experiment.add_pytorch_models(get_modules(conf)) 136 | 137 | # Start the experiment 138 | with experiment.start(): 139 | # `TrainValidConfigs.run` 140 | conf.run() 141 | 142 | 143 | if __name__ == '__main__': 144 | main() 145 | --------------------------------------------------------------------------------
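The autoregressive training scripts above build a `subsequent_mask` so that the transformer can only pay attention to past tokens. As a reference point, here is a minimal sketch of such a causal ("subsequent") mask in plain PyTorch; the repository's own `labml_nn.transformers.utils.subsequent_mask` may differ in exact shape and dtype.

import torch

def causal_mask(seq_len: int) -> torch.Tensor:
    # Lower-triangular boolean matrix: position i may attend to positions j <= i
    return torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))

print(causal_mask(4))
# tensor([[ True, False, False, False],
#         [ True,  True, False, False],
#         [ True,  True,  True, False],
#         [ True,  True,  True,  True]])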