├── tests ├── __init__.py ├── utils │ ├── __init__.py │ └── dist_test.py ├── test_decorator │ └── test_no_support.py ├── test_layer │ ├── test_mlp │ │ ├── test_vit_moe_mlp.py │ │ ├── test_transformer_mlp.py │ │ ├── test_vit_mlp.py │ │ └── test_detr_mlp.py │ ├── test_embedding │ │ ├── test_gpt_embedding.py │ │ └── test_vit_embedding.py │ ├── test_head │ │ ├── test_vit_head.py │ │ └── test_gpt_head.py │ ├── test_block │ │ ├── test_gpt_block.py │ │ ├── test_deepnet_block.py │ │ ├── test_vit_block.py │ │ └── test_detr_block.py │ └── test_attention │ │ └── test_transformer_attention.py ├── test_model │ ├── test_gpt.py │ ├── test_deepnet.py │ ├── test_vit.py │ ├── test_detr.py │ └── test_moe.py └── test_dataloader │ └── test_bert_pretrain_dataloader.py ├── version.txt ├── titans ├── dataloader │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ └── rand_augment.py │ ├── cifar10 │ │ ├── __init__.py │ │ └── torchvision_cifar10.py │ ├── imagenet │ │ ├── __init__.py │ │ └── torchvision_imagenet.py │ └── bert │ │ ├── __init__.py │ │ ├── parquet_dataset.py │ │ └── bert_pretrain.py ├── model │ ├── vilt │ │ ├── __init__.py │ │ └── vilt.py │ ├── transformer │ │ ├── __init__.py │ │ └── transformer.py │ ├── detr │ │ ├── __init__.py │ │ └── detr.py │ ├── gpt │ │ ├── __init__.py │ │ └── gpt.py │ ├── vit │ │ ├── __init__.py │ │ └── vit.py │ ├── deepnet │ │ ├── __init__.py │ │ └── deepnet.py │ ├── knowledge_graph_embedding │ │ ├── dataloader │ │ │ ├── __init__.py │ │ │ └── dataloader.py │ │ └── __init__.py │ ├── moe │ │ ├── __init__.py │ │ ├── util.py │ │ ├── widenet.py │ │ ├── vit_moe.py │ │ └── gpt_moe.py │ ├── __init__.py │ └── helper.py ├── loss │ ├── mlm_loss │ │ ├── __init__.py │ │ └── mlm_loss.py │ ├── lm_loss │ │ ├── __init__.py │ │ └── gpt_lmloss.py │ ├── embedding_loss │ │ ├── __init__.py │ │ └── embedding_loss.py │ ├── vocab_cross_entropy │ │ ├── __init__.py │ │ └── vocab_cross_entropy.py │ └── __init__.py ├── layer │ ├── batchnorm │ │ ├── __init__.py │ │ └── frozen_batchnorm_2d.py │ ├── head │ │ ├── __init__.py │ │ ├── gpt_lm_head.py │ │ └── vit_head.py │ ├── embedding │ │ ├── __init__.py │ │ ├── gpt_embedding.py │ │ └── vit_embedding.py │ ├── block │ │ ├── utils.py │ │ ├── __init__.py │ │ ├── transformer_encoder.py │ │ ├── vit_block.py │ │ ├── deepnet_block.py │ │ ├── transformer_decoder.py │ │ ├── detr_block.py │ │ └── gpt_block.py │ ├── mlp │ │ ├── __init__.py │ │ ├── detr_mlp.py │ │ ├── vit_moe_mlp.py │ │ ├── vit_mlp.py │ │ └── transformer_mlp.py │ ├── __init__.py │ ├── attention │ │ ├── __init__.py │ │ ├── vit_moe_attention.py │ │ ├── transformer_attention.py │ │ ├── vit_attention.py │ │ ├── detr_attention.py │ │ └── gpt_attention.py │ └── init_rules.py ├── __init__.py ├── decorator │ ├── __init__.py │ └── no_support.py └── utils │ ├── __init__.py │ ├── utils.py │ ├── context.py │ └── tensor_parallel_data_split.py ├── requirements ├── requirements-test.txt └── requirements.txt ├── MANIFEST.in ├── .style.yapf ├── README.md ├── .pre-commit-config.yaml ├── .github ├── workflows │ ├── release.yml │ ├── release_test.yml │ ├── build.yml │ └── close_inactive.yml └── ISSUE_TEMPLATE │ ├── documentation.yml │ ├── bug-report.yml │ ├── feature_request.yml │ └── proposal.yml ├── setup.py └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 0.0.7 2 | 
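The tree above shows the layout of the Titans package: reusable building blocks under titans/layer and titans/loss, full models under titans/model, dataset helpers under titans/dataloader, and a mirrored tests/ hierarchy that exercises each piece. As a rough orientation sketch only — the import paths and constructor keywords below are copied from the test files later in this listing, while the launch arguments and model sizes are illustrative placeholders:

```python
# Orientation sketch (not a file from this repo): build two of the bundled
# models after initializing a single-process ColossalAI context.
import colossalai

from titans.model.gpt import GPT
from titans.model.vit import VisionTransformer
from titans.loss.lm_loss import GPTLMLoss

# Illustrative single-process launch; the tests below spawn multiple ranks instead.
colossalai.launch(config=dict(), rank=0, world_size=1, host='localhost', port=29500)

gpt = GPT(hidden_size=32, num_heads=4)    # same keywords as tests/test_model/test_gpt.py
vit = VisionTransformer(img_size=224, patch_size=16, in_chans=3, hidden_size=32, num_heads=4)
criterion = GPTLMLoss()                   # shifted next-token loss from titans/loss/lm_loss
```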
-------------------------------------------------------------------------------- /titans/dataloader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /titans/model/vilt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /titans/model/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements/requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .dist_test import * 2 | -------------------------------------------------------------------------------- /titans/model/detr/__init__.py: -------------------------------------------------------------------------------- 1 | from .detr import * 2 | -------------------------------------------------------------------------------- /titans/model/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt import * 2 | -------------------------------------------------------------------------------- /titans/model/vit/__init__.py: -------------------------------------------------------------------------------- 1 | from .vit import * 2 | -------------------------------------------------------------------------------- /titans/model/deepnet/__init__.py: -------------------------------------------------------------------------------- 1 | from .deepnet import * 2 | -------------------------------------------------------------------------------- /titans/model/knowledge_graph_embedding/dataloader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | colossalai -------------------------------------------------------------------------------- /titans/dataloader/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .rand_augment import * 2 | -------------------------------------------------------------------------------- /titans/loss/mlm_loss/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlm_loss import MLM_loss 2 | -------------------------------------------------------------------------------- /titans/loss/lm_loss/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_lmloss import GPTLMLoss 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt README.md 2 | recursive-include requirements *.txt -------------------------------------------------------------------------------- /titans/dataloader/cifar10/__init__.py: -------------------------------------------------------------------------------- 1 | 
from .torchvision_cifar10 import * 2 | -------------------------------------------------------------------------------- /titans/loss/embedding_loss/__init__.py: -------------------------------------------------------------------------------- 1 | from .embedding_loss import embeddingLoss 2 | -------------------------------------------------------------------------------- /titans/layer/batchnorm/__init__.py: -------------------------------------------------------------------------------- 1 | from .frozen_batchnorm_2d import FrozenBatchNorm2d 2 | -------------------------------------------------------------------------------- /titans/model/knowledge_graph_embedding/__init__.py: -------------------------------------------------------------------------------- 1 | from .knowledge_graph_embedding import * 2 | -------------------------------------------------------------------------------- /titans/layer/head/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_lm_head import GPTLMHead 2 | from .vit_head import ViTHead 3 | -------------------------------------------------------------------------------- /titans/dataloader/imagenet/__init__.py: -------------------------------------------------------------------------------- 1 | from .dali_imagenet import * 2 | from .torchvision_imagenet import * -------------------------------------------------------------------------------- /titans/loss/vocab_cross_entropy/__init__.py: -------------------------------------------------------------------------------- 1 | from .vocab_cross_entropy import vocab_parallel_cross_entropy 2 | -------------------------------------------------------------------------------- /titans/layer/embedding/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_embedding import GPTEmbedding 2 | from .vit_embedding import ViTEmbedding 3 | -------------------------------------------------------------------------------- /titans/dataloader/bert/__init__.py: -------------------------------------------------------------------------------- 1 | from .bert_pretrain import get_bert_pretrain_data_loader 2 | 3 | __all__ = ['get_bert_pretrain_data_loader'] -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | based_on_style = google 3 | spaces_before_comment = 4 4 | split_before_logical_operator = true 5 | column_limit = 120 6 | -------------------------------------------------------------------------------- /titans/layer/block/utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from torch import nn 3 | 4 | 5 | def get_clones(module, N): 6 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 7 | -------------------------------------------------------------------------------- /titans/layer/mlp/__init__.py: -------------------------------------------------------------------------------- 1 | from .vit_mlp import ViTMLP 2 | from .vit_moe_mlp import MLPForMoe 3 | from .detr_mlp import DeTrMLP 4 | from .transformer_mlp import TransformerMLP 5 | -------------------------------------------------------------------------------- /titans/model/moe/__init__.py: -------------------------------------------------------------------------------- 1 | from .vit_moe import ViTMoE 2 | from .widenet import Widenet 3 | from .gpt_moe import MOEGPT, prmoe_4b, 
prmoe_16b, prmoe_25b, prmoe_29b, prmoe_31b, prmoe_51b 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ColossalAI-Models 2 | 3 | [![Made with ColossalAI](https://img.shields.io/badge/Made%20with-ColossalAI-blue)](https://github.com/hpcaitech/ColossalAI) 4 | 5 | 6 | Model zoo for ColossalAI 7 | -------------------------------------------------------------------------------- /titans/loss/__init__.py: -------------------------------------------------------------------------------- 1 | from . import embedding_loss 2 | from . import lm_loss 3 | from . import mlm_loss 4 | from . import vocab_cross_entropy 5 | 6 | __all__ = ['embedding_loss', 'lm_loss', 'mlm_loss', 'vocab_cross_entropy'] -------------------------------------------------------------------------------- /titans/__init__.py: -------------------------------------------------------------------------------- 1 | from . import layer 2 | from . import loss 3 | from . import model 4 | from . import utils 5 | from . import decorator 6 | from . import dataloader 7 | 8 | __all__ = ['layer', 'loss', 'model', 'utils', 'decorator', 'dataloader'] 9 | -------------------------------------------------------------------------------- /titans/decorator/__init__.py: -------------------------------------------------------------------------------- 1 | from .no_support import no_support, support_moe_only, support_sp_pp_only, support_tp_pp_only, no_parallel_support 2 | 3 | __all__ = ['no_support', 'support_moe_only', 'support_sp_pp_only', 'support_tp_pp_only', 'no_parallel_support'] -------------------------------------------------------------------------------- /titans/layer/__init__.py: -------------------------------------------------------------------------------- 1 | from . import attention 2 | from . import batchnorm 3 | from . import block 4 | from . import embedding 5 | from . import head 6 | from . import mlp 7 | from .init_rules import init_rules 8 | 9 | __all__ = ['attention', 'batchnorm', 'block', 'embedding', 'head', 'mlp', 'init_rules'] 10 | -------------------------------------------------------------------------------- /titans/model/__init__.py: -------------------------------------------------------------------------------- 1 | # from . import detr 2 | from . import gpt 3 | from . import knowledge_graph_embedding 4 | from . import moe 5 | from . import transformer 6 | from . import vilt 7 | from . 
import vit 8 | 9 | __all__ = ['detr', 'gpt', 'knowledge_graph_embedding', 'moe', 'transformer', 'vilt', 'vit'] 10 | -------------------------------------------------------------------------------- /titans/layer/attention/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_attention import GPTSelfAttention 2 | from .detr_attention import DeTrCrossAttention 3 | from .vit_attention import ViTSelfAttention 4 | from .vit_moe_attention import SelfAttentionForMoe 5 | from .transformer_attention import TransformerSelfAttention, TransformerMultiHeadAttention 6 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/mirrors-yapf 3 | rev: v0.32.0 4 | hooks: 5 | - id: yapf 6 | args: ['--style=.style.yapf', '--parallel', '--in-place'] 7 | - repo: https://github.com/pre-commit/mirrors-clang-format 8 | rev: v13.0.1 9 | hooks: 10 | - id: clang-format 11 | -------------------------------------------------------------------------------- /titans/layer/block/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_block import GPTBlock, MOEGPTBlock 2 | from .vit_block import ViTBlock 3 | from .transformer_encoder import TransformerEncoderLayer, TransformerEncoder 4 | from .transformer_decoder import TransformerDecoderLayer, TransformerDecoder 5 | from .deepnet_block import DeepNetBlock 6 | from .detr_block import DeTrEncoder, DeTrDecoder 7 | 8 | -------------------------------------------------------------------------------- /titans/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import VocabUtility 2 | from .context import barrier_context 3 | from .tensor_parallel_data_split import split_data_3d, split_data_2d, split_data_2p5d, split_data_for_tensor_parallel 4 | 5 | __all__ = [ 6 | 'VocabUtility', 'barrier_context', 'split_data_3d', 'split_data_for_tensor_parallel', 'split_data_2d', 7 | 'split_data_2p5d' 8 | ] 9 | -------------------------------------------------------------------------------- /titans/loss/mlm_loss/mlm_loss.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.loss import * 2 | from torch.nn.modules.loss import _Loss 3 | 4 | 5 | class MLM_loss(_Loss): 6 | 7 | def __init__(self, reduction: bool = True, *args, **kwargs): 8 | super().__init__() 9 | 10 | def itm_mlm_loss(self, output): 11 | total_loss = sum([v for k, v in output.items() if "loss" in k]) 12 | return total_loss 13 | 14 | def forward(self, *args): 15 | return self.itm_mlm_loss(*args) 16 | -------------------------------------------------------------------------------- /titans/loss/lm_loss/gpt_lmloss.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from colossalai import nn as col_nn 4 | 5 | 6 | class GPTLMLoss(nn.Module): 7 | 8 | def __init__(self): 9 | super().__init__() 10 | self.loss = col_nn.CrossEntropyLoss() 11 | 12 | def forward(self, logits, labels): 13 | shift_logits = logits[..., :-1, :].contiguous() 14 | shift_labels = labels[..., 1:].contiguous() 15 | # Flatten the tokens 16 | return self.loss(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) -------------------------------------------------------------------------------- 
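GPTLMLoss above is a next-token language-modeling loss: the logits at position t are scored against the label at position t+1, which is why the last logit and the first label are dropped before flattening. A minimal single-process illustration of that shift, using plain torch.nn.CrossEntropyLoss in place of the tensor-parallel colossalai loss (all shapes and sizes here are made-up example values):

```python
# Stand-alone illustration of the logit/label shift performed by GPTLMLoss above.
import torch
import torch.nn as nn

batch_size, seq_len, vocab_size = 2, 8, 50304                   # illustrative sizes
logits = torch.randn(batch_size, seq_len, vocab_size)           # stand-in for model output
labels = torch.randint(0, vocab_size, (batch_size, seq_len))    # input token ids double as labels

# Position t predicts token t + 1: drop the last logit and the first label.
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()

loss = nn.CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
print(loss.item())
```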
/titans/layer/mlp/detr_mlp.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | from torch import nn 3 | 4 | from colossalai import nn as col_nn 5 | 6 | 7 | class DeTrMLP(nn.Module): 8 | """ Very simple multi-layer perceptron (also called FFN)""" 9 | 10 | def __init__(self, input_dim, hidden_size, output_dim, num_layers): 11 | super().__init__() 12 | self.num_layers = num_layers 13 | h = [hidden_size] * (num_layers - 1) 14 | self.layers = nn.ModuleList(col_nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 15 | 16 | def forward(self, x): 17 | for i, layer in enumerate(self.layers): 18 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 19 | return x 20 | -------------------------------------------------------------------------------- /titans/layer/head/gpt_lm_head.py: -------------------------------------------------------------------------------- 1 | from torch import dtype, nn 2 | 3 | from colossalai import nn as col_nn 4 | 5 | 6 | class GPTLMHead(nn.Module): 7 | 8 | def __init__(self, 9 | hidden_size: int, 10 | vocab_size: int, 11 | embedding_layer=None, 12 | bias: bool = False, 13 | dtype: dtype = None) -> None: 14 | super().__init__() 15 | self.dense = col_nn.Classifier(hidden_size, vocab_size, embedding_layer.word_embedding_weight, bias=bias, dtype=dtype) 16 | 17 | @property 18 | def weight(self): 19 | return self.dense.weight 20 | 21 | def forward(self, x): 22 | # the size of x before dense is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 23 | # the size of x after dense is (BATCH_SIZE, SEQ_LEN, VOCAB_SIZE) 24 | x = self.dense(x) 25 | return x 26 | -------------------------------------------------------------------------------- /titans/utils/utils.py: -------------------------------------------------------------------------------- 1 | from colossalai.nn.layer.utils import divide 2 | 3 | 4 | class VocabUtility: 5 | """Split the vocabulary into `world_size` chunks and return the 6 | first and last index of the vocabulary belonging to the `rank` 7 | partition. Note that indices are in [first, last)""" 8 | 9 | @staticmethod 10 | def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size): 11 | index_f = rank * per_partition_vocab_size 12 | index_l = index_f + per_partition_vocab_size 13 | return index_f, index_l 14 | 15 | @staticmethod 16 | def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size): 17 | per_partition_vocab_size = divide(global_vocab_size, world_size) 18 | return VocabUtility.vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size) 19 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | build-n-publish: 7 | if: github.ref_name == 'main' && github.repository == 'hpcaitech/Titans' && contains(fromJson('["FrankLeeeee", "YuliangLiu0306"]'), github.actor) 8 | name: Build and publish Python 🐍 distributions 📦 to PyPI 9 | runs-on: ubuntu-latest 10 | timeout-minutes: 20 11 | steps: 12 | - uses: actions/checkout@v2 13 | - uses: actions/setup-python@v2 14 | with: 15 | python-version: '3.7.12' 16 | - run: python setup.py sdist build 17 | # publish to PyPI if executed on the main branch 18 | # publish to Test PyPI if executed on the develop branch 19 | - name: Publish package to PyPI 20 | uses: 
pypa/gh-action-pypi-publish@release/v1 21 | with: 22 | user: __token__ 23 | password: ${{ secrets.TITANS_PYPI_TOKEN }} 24 | verbose: true -------------------------------------------------------------------------------- /.github/workflows/release_test.yml: -------------------------------------------------------------------------------- 1 | name: Publish to Test PyPI 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | build-n-publish: 7 | if: github.repository == 'hpcaitech/Titans' && contains(fromJson('["FrankLeeeee", "YuliangLiu0306"]'), github.actor) 8 | name: Build and publish Python 🐍 distributions 📦 to Test PyPI 9 | runs-on: ubuntu-latest 10 | timeout-minutes: 20 11 | steps: 12 | - uses: actions/checkout@v2 13 | - uses: actions/setup-python@v2 14 | with: 15 | python-version: '3.7.12' 16 | - run: python setup.py sdist build 17 | # publish to PyPI if executed on the main branch 18 | # publish to Test PyPI if executed on the develop branch 19 | - name: Publish package to Test PyPI 20 | uses: pypa/gh-action-pypi-publish@release/v1 21 | with: 22 | user: __token__ 23 | password: ${{ secrets.TITANS_TEST_PYPI_TOKEN }} 24 | repository_url: https://test.pypi.org/legacy/ 25 | verbose: true -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to documentation 3 | title: "[DOC] " 4 | labels: [documentation] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/Titans/issues/new). 11 | - type: textarea 12 | attributes: 13 | label: 📚 The doc issue 14 | description: | 15 | **Description** What content in the documentation is an issue? 16 | **Location** Where is the issue location? 17 | **Expectation** What is your expected content about it? 18 | **Screenshots** If applicable, add screenshots to help explain your problem. 19 | **Suggestions** Tell us how we could improve the documentation. 20 | placeholder: | 21 | A clear and concise description of the issue. 22 | validations: 23 | required: true 24 | 25 | - type: markdown 26 | attributes: 27 | value: > 28 | Thanks for contributing 🎉! 29 | -------------------------------------------------------------------------------- /titans/layer/mlp/vit_moe_mlp.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | from colossalai.utils import get_current_device 4 | 5 | 6 | class MLPForMoe(nn.Module): 7 | """FFN composed with two linear layers, also called MLP. 
8 | """ 9 | 10 | def __init__(self, 11 | hidden_size: int, 12 | d_ff: int, 13 | activation=None, 14 | drop_rate: float = 0, 15 | bias: bool = True, 16 | dropout1=None, 17 | dropout2=None): 18 | super().__init__() 19 | dense1 = nn.Linear(hidden_size, d_ff, bias, device=get_current_device()) 20 | act = nn.GELU() if activation is None else activation 21 | dense2 = nn.Linear(d_ff, hidden_size, bias, device=get_current_device()) 22 | drop1 = nn.Dropout(drop_rate) if dropout1 is None else dropout1 23 | drop2 = nn.Dropout(drop_rate) if dropout2 is None else dropout2 24 | 25 | self.ffn = nn.Sequential(dense1, act, drop1, dense2, drop2) 26 | 27 | def forward(self, x): 28 | return self.ffn(x) -------------------------------------------------------------------------------- /tests/test_decorator/test_no_support.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch.nn as nn 4 | import torch.multiprocessing as mp 5 | 6 | from colossalai.utils import free_port 7 | from functools import partial 8 | from titans.decorator import no_support 9 | 10 | CONFIG = dict(parallel=dict(tensor=dict(mode='1d', size=2))) 11 | 12 | 13 | @no_support('tp') 14 | class Net(nn.Module): 15 | 16 | def __init__(self): 17 | super().__init__() 18 | self.linear = nn.Linear(16, 16) 19 | 20 | def forward(self, x): 21 | return self.linear(x) 22 | 23 | 24 | def run_dist(rank, world_size, port): 25 | colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port) 26 | try: 27 | net = Net() 28 | except Exception as e: 29 | assert isinstance(e, AssertionError) 30 | 31 | 32 | def test_no_support(): 33 | world_size = 2 34 | run_func = partial(run_dist, world_size=world_size, port=free_port()) 35 | mp.spawn(run_func, nprocs=world_size) 36 | 37 | 38 | if __name__ == '__main__': 39 | test_no_support() 40 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | pull_request: 5 | types: [synchronize, labeled] 6 | 7 | jobs: 8 | build: 9 | name: Build and Test Colossal-AI 10 | if: | 11 | github.event.pull_request.draft == false && 12 | github.base_ref == 'main' && 13 | github.event.pull_request.base.repo.full_name == 'hpcaitech/Titans' && 14 | contains( github.event.pull_request.labels.*.name, 'Run Build and Test') 15 | runs-on: [self-hosted, gpu] 16 | container: 17 | image: frankleeeee/pytorch-cuda:1.10.1-11.3.0 18 | options: --gpus all --rm 19 | timeout-minutes: 40 20 | steps: 21 | - uses: actions/checkout@v2 22 | with: 23 | ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} 24 | - name: Install Colossal-AI 25 | run: | 26 | pip install colossalai==0.1.4+torch1.10cu11.3 -f https://release.colossalai.org 27 | pip install -v . 
28 | pip install -r requirements/requirements-test.txt 29 | - name: Unit Testing 30 | run: | 31 | mkdir tmp_test 32 | mv tests ./tmp_test 33 | cd ./tmp_test 34 | PYTHONPATH=$PWD pytest tests 35 | -------------------------------------------------------------------------------- /.github/workflows/close_inactive.yml: -------------------------------------------------------------------------------- 1 | name: Close inactive issues 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 * * *" 6 | 7 | jobs: 8 | close-issues: 9 | if: github.event.pull_request.draft == false && github.event.pull_request.base.repo.full_name == 'hpcaitech/Titans' && github.base_ref == 'main' 10 | runs-on: ubuntu-latest 11 | permissions: 12 | issues: write 13 | pull-requests: write 14 | steps: 15 | - uses: actions/stale@v3 16 | with: 17 | days-before-issue-stale: 14 18 | days-before-issue-close: -1 19 | stale-issue-label: "stale" 20 | stale-issue-message: "This issue is stale because it has been open for 14 days with no activity." 21 | # close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." 22 | days-before-pr-stale: 14 23 | days-before-pr-close: -1 24 | stale-pr-message: "This PR is stale because it has been open for 14 days with no activity." 25 | # close-pr-message: "This PR was closed because it has been inactive for 14 days since being marked as stale." 26 | repo-token: ${{ secrets.GITHUB_TOKEN }} -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yml: -------------------------------------------------------------------------------- 1 | name: 🐛 Bug Report 2 | description: Create a report to help us reproduce and fix the bug 3 | title: "[BUG] " 4 | labels: [bug] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/Titans/issues/new). 11 | - type: textarea 12 | attributes: 13 | label: 🐛 Describe the bug 14 | description: | 15 | **Describe the bug** 16 | A clear and concise description of what the bug is. 17 | **To Reproduce** 18 | Steps or code snippet to reproduce the behavior. 19 | **Expected behavior** 20 | A clear and concise description of what you expected to happen. 21 | **Screenshots** 22 | If applicable, add screenshots to help explain your problem. 23 | placeholder: | 24 | A clear and concise description of what the bug is. 25 | validations: 26 | required: true 27 | - type: textarea 28 | attributes: 29 | label: Environment 30 | description: | 31 | Please provide the environment information, eg. CUDA/cuDNN/NCCL/Python/PyTorch version. 32 | 33 | - type: markdown 34 | attributes: 35 | value: > 36 | Thanks for contributing 🎉! 37 | -------------------------------------------------------------------------------- /titans/model/helper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from colossalai.nn.layer import WrappedDropPath as DropPath 4 | from colossalai.nn.layer.utils import CheckpointModule 5 | 6 | 7 | class TransformerLayer(CheckpointModule): 8 | """Transformer layer builder. 
9 | """ 10 | 11 | def __init__(self, 12 | att: nn.Module, 13 | ffn: nn.Module, 14 | norm1: nn.Module, 15 | norm2: nn.Module, 16 | droppath=None, 17 | droppath_rate: float = 0, 18 | checkpoint: bool = False): 19 | super().__init__(checkpoint=checkpoint) 20 | self.att = att 21 | self.ffn = ffn 22 | self.norm1 = norm1 23 | self.norm2 = norm2 24 | self.droppath = DropPath(droppath_rate) if droppath is None else droppath 25 | 26 | def _forward(self, x, y): 27 | x1 = x + self.droppath(self.att(self.norm1(x))) 28 | x2 = self.ffn(self.norm2(x1)) 29 | 30 | if isinstance(x2, tuple): 31 | x, z = x2 32 | y = y + z 33 | else: 34 | x = x2 35 | 36 | x = x1 + self.droppath(x) 37 | return x, y 38 | -------------------------------------------------------------------------------- /titans/model/moe/util.py: -------------------------------------------------------------------------------- 1 | from colossalai.context import ParallelMode 2 | from colossalai.nn.layer import WrappedDropout as Dropout 3 | 4 | 5 | def moe_sa_args(hidden_size: int, 6 | n_heads: int, 7 | d_kv: int, 8 | attention_drop: float = 0, 9 | drop_rate: float = 0, 10 | bias: bool = True): 11 | """This is an example for args in moe self attention, since lots of modules should be 12 | adapted before putting them in experts. 13 | """ 14 | dropout1 = Dropout(attention_drop, mode=ParallelMode.TENSOR) 15 | dropout2 = Dropout(drop_rate, mode=ParallelMode.TENSOR) 16 | return dict(hidden_size=hidden_size, n_heads=n_heads, d_kv=d_kv, bias=bias, dropout1=dropout1, dropout2=dropout2) 17 | 18 | 19 | def moe_mlp_args(hidden_size: int, d_ff: int, drop_rate: float, bias: bool = True): 20 | """This is an example for args of MLP in Experts, since lots of modules should be adapted 21 | before putting them in experts. 22 | """ 23 | dropout1 = Dropout(drop_rate, mode=ParallelMode.TENSOR) 24 | dropout2 = Dropout(drop_rate, mode=ParallelMode.TENSOR) 25 | return dict(hidden_size=hidden_size, d_ff=d_ff, bias=bias, dropout1=dropout1, dropout2=dropout2) 26 | -------------------------------------------------------------------------------- /titans/dataloader/cifar10/torchvision_cifar10.py: -------------------------------------------------------------------------------- 1 | import os 2 | from colossalai.utils import get_dataloader 3 | 4 | from torchvision import transforms 5 | from torchvision.datasets import CIFAR10 6 | 7 | 8 | def build_cifar(batch_size, root, padding=None, pad_if_needed=False, crop=224, resize=224): 9 | transform_train = transforms.Compose([ 10 | transforms.RandomCrop(crop, padding=padding, pad_if_needed=pad_if_needed), 11 | transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10), 12 | transforms.ToTensor(), 13 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 14 | ]) 15 | transform_test = transforms.Compose([ 16 | transforms.Resize(resize), 17 | transforms.ToTensor(), 18 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 19 | ]) 20 | 21 | train_dataset = CIFAR10(root=root, train=True, download=True, transform=transform_train) 22 | test_dataset = CIFAR10(root=root, train=False, transform=transform_test) 23 | train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=batch_size, pin_memory=True) 24 | test_dataloader = get_dataloader(dataset=test_dataset, batch_size=batch_size, pin_memory=True) 25 | return train_dataloader, test_dataloader 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: 
-------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Suggest an idea for this project 3 | title: "[FEATURE] " 4 | labels: [enhancement] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/Titans/issues/new). 11 | - type: textarea 12 | attributes: 13 | label: Describe the feature 14 | description: | 15 | **Is your feature request related to a problem? Please describe.** 16 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 17 | **Describe the solution you'd like** 18 | A clear and concise description of what you want to happen. 19 | **Describe alternatives you've considered** 20 | A clear and concise description of any alternative solutions or features you've considered. 21 | **Screenshots** 22 | If applicable, add screenshots to help explain your problem. 23 | **Suggest a potential alternative/fix** 24 | Tell us how we could improve this project. 25 | placeholder: | 26 | A clear and concise description of your idea. 27 | validations: 28 | required: true 29 | 30 | - type: markdown 31 | attributes: 32 | value: > 33 | Thanks for contributing 🎉! 34 | -------------------------------------------------------------------------------- /tests/test_layer/test_mlp/test_vit_moe_mlp.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | from titans.layer.mlp import MLPForMoe 6 | from titans.utils import split_data_for_tensor_parallel 7 | from colossalai.global_variables import tensor_parallel_env as tp_env 8 | from colossalai.testing import rerun_if_address_is_in_use 9 | from tests.utils import run_with_moe_config 10 | 11 | BATCH_SIZE = 4 12 | SEQ_LENGTH = 16 13 | HIDDEN_SIZE = 32 14 | D_FF = 4 * 32 15 | 16 | 17 | def run_moe_mlp(data, hidden_size, d_ff): 18 | 19 | #build model 20 | model = MLPForMoe(hidden_size=hidden_size, d_ff=d_ff).cuda() 21 | 22 | # forward 23 | out = model(data) 24 | 25 | # backward 26 | out.mean().backward() 27 | 28 | 29 | def run_dist(rank, world_size, port, config): 30 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 31 | 32 | if tp_env.mode == 'sequence': 33 | tp_env.mode = None 34 | 35 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 36 | data = split_data_for_tensor_parallel(data) 37 | run_moe_mlp(data, HIDDEN_SIZE, D_FF) 38 | 39 | 40 | @rerun_if_address_is_in_use() 41 | def test_moe_mlp(): 42 | run_with_moe_config(4, run_func=run_dist) 43 | -------------------------------------------------------------------------------- /tests/test_model/test_gpt.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.model.gpt import GPT 6 | from colossalai.global_variables import tensor_parallel_env as tp_env 7 | from colossalai.testing import rerun_if_address_is_in_use 8 | from tests.utils import run_with_parallel_config 9 | 10 | BATCH_SIZE = 4 11 | SEQ_LENGHT = 16 12 | HIDDEN_SIZE = 32 13 | NUM_HEADS = 4 14 | VOCAB_SIZE = 50304 15 | 16 | 17 | def run_gpt(data, hidden_size, num_heads): 18 | 19 | #build model 20 | model = GPT(hidden_size=hidden_size, num_heads=num_heads).cuda() 21 | 22 | # forward 23 | out = model(data) 24 | 25 | # backward 26 | out.mean().backward() 27 | 28 | 29 | def 
run_dist(rank, world_size, port, config): 30 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 31 | 32 | if tp_env.mode == 'sequence': 33 | tp_env.mode = None 34 | 35 | data = torch.rand(BATCH_SIZE, SEQ_LENGHT) * VOCAB_SIZE 36 | data = data.int().cuda() 37 | run_gpt(data, HIDDEN_SIZE, NUM_HEADS) 38 | 39 | 40 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 41 | @rerun_if_address_is_in_use() 42 | def test_gpt(parallel_config): 43 | run_with_parallel_config(*parallel_config, run_func=run_dist) 44 | -------------------------------------------------------------------------------- /tests/test_model/test_deepnet.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.model.deepnet import DeepNet 6 | from colossalai.global_variables import tensor_parallel_env as tp_env 7 | from colossalai.testing import rerun_if_address_is_in_use 8 | from tests.utils import run_with_parallel_config 9 | 10 | BATCH_SIZE = 4 11 | SEQ_LENGHT = 16 12 | HIDDEN_SIZE = 32 13 | NUM_HEADS = 4 14 | VOCAB_SIZE = 50304 15 | 16 | 17 | def run_deepnet(data, hidden_size, num_heads): 18 | 19 | #build model 20 | model = DeepNet(hidden_size=hidden_size, num_heads=num_heads).cuda() 21 | 22 | # forward 23 | out = model(data) 24 | 25 | # backward 26 | out.mean().backward() 27 | 28 | 29 | def run_dist(rank, world_size, port, config): 30 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 31 | 32 | if tp_env.mode == 'sequence': 33 | tp_env.mode = None 34 | 35 | data = torch.rand(BATCH_SIZE, SEQ_LENGHT) * VOCAB_SIZE 36 | data = data.int().cuda() 37 | run_deepnet(data, HIDDEN_SIZE, NUM_HEADS) 38 | 39 | 40 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 41 | @rerun_if_address_is_in_use() 42 | def test_deepnet(parallel_config): 43 | run_with_parallel_config(*parallel_config, run_func=run_dist) 44 | -------------------------------------------------------------------------------- /tests/test_layer/test_embedding/test_gpt_embedding.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.layer.embedding import GPTEmbedding 6 | from colossalai.global_variables import tensor_parallel_env as tp_env 7 | from colossalai.testing import rerun_if_address_is_in_use 8 | from tests.utils import run_with_parallel_config 9 | 10 | BATCH_SIZE = 4 11 | SEQ_LENGHT = 16 12 | HIDDEN_SIZE = 32 13 | VOCAB_SIZE = 50304 14 | 15 | 16 | def run_gpt_embed(data, hidden_size, vocab_size): 17 | 18 | #build model 19 | model = GPTEmbedding(embedding_dim=hidden_size, vocab_size=vocab_size, max_position_embeddings=1024).cuda() 20 | 21 | # forward 22 | out = model(data) 23 | 24 | # backward 25 | out.mean().backward() 26 | 27 | 28 | def run_dist(rank, world_size, port, config): 29 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 30 | 31 | if tp_env.mode == 'sequence': 32 | tp_env.mode = None 33 | 34 | data = torch.rand(BATCH_SIZE, SEQ_LENGHT) * VOCAB_SIZE 35 | data = data.int().cuda() 36 | run_gpt_embed(data, HIDDEN_SIZE, VOCAB_SIZE) 37 | 38 | 39 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 40 | @rerun_if_address_is_in_use() 41 | def 
test_gpt_embedding(parallel_config): 42 | run_with_parallel_config(*parallel_config, run_func=run_dist) 43 | -------------------------------------------------------------------------------- /titans/layer/init_rules.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from colossalai import nn as col_nn 4 | from torch import nn 5 | 6 | init_rules = dict( 7 | torch=dict( 8 | embed=dict( 9 | weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)), 10 | bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1), 11 | position_embed_initializer=col_nn.init.zeros_(), 12 | ), 13 | transformer=dict( 14 | weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)), 15 | bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1), 16 | ), 17 | head=dict( 18 | weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)), 19 | bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1), 20 | ), 21 | ), 22 | jax=dict( 23 | embed=dict( 24 | weight_initializer=col_nn.init.lecun_normal_(), 25 | bias_initializer=col_nn.init.zeros_(), 26 | position_embed_initializer=col_nn.init.trunc_normal_(std=.02), 27 | ), 28 | transformer=dict( 29 | weight_initializer=col_nn.init.xavier_uniform_(), 30 | bias_initializer=col_nn.init.normal_(std=1e-6), 31 | ), 32 | head=dict( 33 | weight_initializer=col_nn.init.zeros_(), 34 | bias_initializer=col_nn.init.zeros_(), 35 | ), 36 | ), 37 | ) -------------------------------------------------------------------------------- /tests/test_layer/test_head/test_vit_head.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.layer.head import ViTHead 6 | from titans.utils import split_data_for_tensor_parallel 7 | from colossalai.global_variables import tensor_parallel_env as tp_env 8 | from colossalai.testing import rerun_if_address_is_in_use 9 | from tests.utils import run_with_parallel_config 10 | 11 | BATCH_SIZE = 4 12 | MIDDLE_DIM = 80 13 | NUM_CLASSES = 10 14 | HIDDEN_SIZE = 32 15 | 16 | 17 | def run_vit_head(data, hidden_size, num_classes): 18 | 19 | #build model 20 | model = ViTHead(hidden_size=hidden_size, num_classes=num_classes).cuda() 21 | 22 | # forward 23 | out = model(data) 24 | 25 | # backward 26 | out.mean().backward() 27 | 28 | 29 | def run_dist(rank, world_size, port, config): 30 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 31 | 32 | if tp_env.mode == 'sequence': 33 | tp_env.mode = None 34 | 35 | data = torch.rand(BATCH_SIZE, MIDDLE_DIM, HIDDEN_SIZE).cuda() 36 | data = split_data_for_tensor_parallel(data) 37 | run_vit_head(data, HIDDEN_SIZE, NUM_CLASSES) 38 | 39 | 40 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 41 | @rerun_if_address_is_in_use() 42 | def test_vit_head(parallel_config): 43 | run_with_parallel_config(*parallel_config, run_func=run_dist) 44 | -------------------------------------------------------------------------------- /tests/test_layer/test_mlp/test_transformer_mlp.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from titans.layer.mlp import TransformerMLP 7 | from titans.utils import split_data_for_tensor_parallel 8 | from colossalai.global_variables import tensor_parallel_env as tp_env 9 | from colossalai.testing import 
rerun_if_address_is_in_use 10 | from tests.utils import run_with_parallel_config 11 | 12 | BATCH_SIZE = 4 13 | SEQ_LENGTH = 16 14 | HIDDEN_SIZE = 32 15 | 16 | 17 | def run_transformer_mlp(data, hidden_size): 18 | 19 | #build model 20 | model = TransformerMLP(hidden_size=hidden_size, mlp_ratio=4).cuda() 21 | 22 | # forward 23 | out = model(data) 24 | 25 | # backward 26 | out.mean().backward() 27 | 28 | 29 | def run_dist(rank, world_size, port, config): 30 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 31 | 32 | if tp_env.mode == 'sequence': 33 | tp_env.mode = None 34 | 35 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 36 | data = split_data_for_tensor_parallel(data) 37 | run_transformer_mlp(data, HIDDEN_SIZE) 38 | 39 | 40 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 41 | @rerun_if_address_is_in_use() 42 | def test_transformer_mlp(parallel_config): 43 | run_with_parallel_config(*parallel_config, run_func=run_dist) 44 | -------------------------------------------------------------------------------- /tests/test_layer/test_mlp/test_vit_mlp.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from titans.layer.mlp import ViTMLP 7 | from titans.utils import split_data_for_tensor_parallel 8 | from colossalai.global_variables import tensor_parallel_env as tp_env 9 | from colossalai.testing import rerun_if_address_is_in_use 10 | from tests.utils import run_with_parallel_config 11 | 12 | BATCH_SIZE = 4 13 | SEQ_LENGTH = 16 14 | HIDDEN_SIZE = 32 15 | 16 | 17 | def run_vit_mlp(data, hidden_size): 18 | 19 | #build model 20 | model = ViTMLP(hidden_size=hidden_size, mlp_ratio=4, activation=F.gelu, dropout=0.0).cuda() 21 | 22 | # forward 23 | out = model(data) 24 | 25 | # backward 26 | out.mean().backward() 27 | 28 | 29 | def run_dist(rank, world_size, port, config): 30 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 31 | 32 | if tp_env.mode == 'sequence': 33 | tp_env.mode = None 34 | 35 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 36 | data = split_data_for_tensor_parallel(data) 37 | run_vit_mlp(data, HIDDEN_SIZE) 38 | 39 | 40 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 41 | @rerun_if_address_is_in_use() 42 | def test_transformer_mlp(parallel_config): 43 | run_with_parallel_config(*parallel_config, run_func=run_dist) 44 | -------------------------------------------------------------------------------- /tests/test_layer/test_mlp/test_detr_mlp.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from titans.layer.mlp import DeTrMLP 7 | from titans.utils import split_data_for_tensor_parallel 8 | from colossalai.global_variables import tensor_parallel_env as tp_env 9 | from colossalai.testing import rerun_if_address_is_in_use 10 | from tests.utils import run_with_parallel_config 11 | 12 | BATCH_SIZE = 4 13 | SEQ_LENGTH = 16 14 | HIDDEN_SIZE = 32 15 | 16 | 17 | def run_detr_mlp(data, hidden_size): 18 | 19 | #build model 20 | model = DeTrMLP(input_dim=hidden_size, hidden_size=4*hidden_size, output_dim=hidden_size, num_layers=1).cuda() 21 | 22 | # forward 23 | out = model(data) 24 | 25 | # 
backward 26 | out.mean().backward() 27 | 28 | 29 | def run_dist(rank, world_size, port, config): 30 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 31 | 32 | if tp_env.mode == 'sequence': 33 | tp_env.mode = None 34 | 35 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 36 | data = split_data_for_tensor_parallel(data) 37 | run_detr_mlp(data, HIDDEN_SIZE) 38 | 39 | 40 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 41 | @rerun_if_address_is_in_use() 42 | def test_transformer_mlp(parallel_config): 43 | run_with_parallel_config(*parallel_config, run_func=run_dist) 44 | -------------------------------------------------------------------------------- /titans/utils/context.py: -------------------------------------------------------------------------------- 1 | import torch.distributed as dist 2 | from colossalai.context import ParallelMode 3 | from colossalai.core import global_context as gpc 4 | 5 | 6 | class barrier_context(): 7 | """ 8 | This context manager is used to allow one process to execute while blocking all 9 | other processes in the same process group. This is often useful when downloading is required 10 | as we only want to download in one process to prevent file corruption. 11 | 12 | Args: 13 | executor_rank (int): the process rank to execute without blocking, all other processes will be blocked 14 | parallel_mode (ParallelMode): the parallel mode corresponding to a process group 15 | 16 | Usage: 17 | with barrier_context(): 18 | dataset = CIFAR10(root='./data', download=True) 19 | """ 20 | 21 | def __init__(self, executor_rank: int = 0, parallel_mode: ParallelMode = ParallelMode.GLOBAL): 22 | # the class name is lowercase by convention 23 | current_rank = gpc.get_local_rank(parallel_mode=parallel_mode) 24 | self.should_block = current_rank != executor_rank 25 | self.group = gpc.get_group(parallel_mode=parallel_mode) 26 | 27 | def __enter__(self): 28 | if self.should_block: 29 | dist.barrier(group=self.group) 30 | 31 | def __exit__(self, exc_type, exc_value, exc_traceback): 32 | if not self.should_block: 33 | dist.barrier(group=self.group) 34 | -------------------------------------------------------------------------------- /tests/test_layer/test_block/test_gpt_block.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from titans.layer.block import GPTBlock 7 | from titans.utils import split_data_for_tensor_parallel 8 | from colossalai.testing import rerun_if_address_is_in_use 9 | from tests.utils import run_with_parallel_config 10 | from colossalai.global_variables import tensor_parallel_env as tp_env 11 | 12 | BATCH_SIZE = 4 13 | SEQ_LENGTH = 16 14 | NUM_HEADS = 4 15 | HIDDEN_SIZE = 32 16 | 17 | 18 | def run_gpt_block(data, hidden_size, num_heads): 19 | 20 | #build model 21 | model = GPTBlock(hidden_size=hidden_size, num_heads=num_heads, mlp_ratio=4.0, activation=F.gelu).cuda() 22 | 23 | # forward 24 | out, _ = model(data) 25 | 26 | # backward 27 | out.mean().backward() 28 | 29 | 30 | def run_dist(rank, world_size, port, config): 31 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 32 | 33 | if tp_env.mode == 'sequence': 34 | tp_env.mode = None 35 | 36 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 37 | data = split_data_for_tensor_parallel(data) 38 | 
run_gpt_block(data, HIDDEN_SIZE, NUM_HEADS) 39 | 40 | 41 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 42 | @rerun_if_address_is_in_use() 43 | def test_gpt_block(parallel_config): 44 | run_with_parallel_config(*parallel_config, run_func=run_dist) 45 | -------------------------------------------------------------------------------- /tests/test_layer/test_block/test_deepnet_block.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from titans.layer.block import DeepNetBlock 7 | from titans.utils import split_data_for_tensor_parallel 8 | from colossalai.testing import rerun_if_address_is_in_use 9 | from tests.utils import run_with_parallel_config 10 | from colossalai.global_variables import tensor_parallel_env as tp_env 11 | 12 | BATCH_SIZE = 4 13 | SEQ_LENGTH = 16 14 | NUM_HEADS = 4 15 | HIDDEN_SIZE = 32 16 | 17 | 18 | def run_deepnet_block(data, hidden_size, num_heads): 19 | 20 | #build model 21 | model = DeepNetBlock(hidden_size=hidden_size, num_heads=num_heads, mlp_ratio=4.0, activation=F.gelu).cuda() 22 | 23 | # forward 24 | out, _ = model(data) 25 | 26 | # backward 27 | out.mean().backward() 28 | 29 | 30 | def run_dist(rank, world_size, port, config): 31 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 32 | 33 | if tp_env.mode == 'sequence': 34 | tp_env.mode = None 35 | 36 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 37 | data = split_data_for_tensor_parallel(data) 38 | run_deepnet_block(data, HIDDEN_SIZE, NUM_HEADS) 39 | 40 | 41 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 42 | @rerun_if_address_is_in_use() 43 | def test_deepnet_block(parallel_config): 44 | run_with_parallel_config(*parallel_config, run_func=run_dist) 45 | -------------------------------------------------------------------------------- /tests/utils/dist_test.py: -------------------------------------------------------------------------------- 1 | import torch.multiprocessing as mp 2 | from colossalai.utils import free_port 3 | from functools import partial 4 | 5 | 6 | def run_with_parallel_config(world_size, parallel_mode, run_func): 7 | """ 8 | A wrapper function to reuse the same code snippet in layer/model testing. 9 | 10 | Args: 11 | world_size (int): the number of processes to launch 12 | parallel_mode (str): the parallelism method used 13 | run_func (Callable): the function to launch multiple processes, must have world_size, port and config as arguments. 14 | """ 15 | 16 | port = free_port() 17 | 18 | config = dict(parallel=dict(tensor=dict(size=world_size, mode=parallel_mode))) 19 | 20 | if parallel_mode == '2.5d': 21 | config['parallel']['tensor']['depth'] = world_size // 4 22 | 23 | run_func = partial(run_func, world_size=world_size, port=port, config=config) 24 | mp.spawn(run_func, nprocs=world_size) 25 | 26 | 27 | def run_with_moe_config(world_size, run_func): 28 | """ 29 | A wrapper function to reuse the same code snippet in layer/model testing. 30 | 31 | Args: 32 | world_size (int): the number of processes to launch 33 | run_func (Callable): the function to launch multiple processes, must have world_size, port and config as arguments. 
34 | """ 35 | 36 | port = free_port() 37 | 38 | config = dict() 39 | 40 | run_func = partial(run_func, world_size=world_size, port=port, config=config) 41 | mp.spawn(run_func, nprocs=world_size) -------------------------------------------------------------------------------- /titans/dataloader/imagenet/torchvision_imagenet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torchvision 3 | import torch.nn as nn 4 | import torch.optim as optim 5 | from colossalai.utils import get_dataloader 6 | import torchvision.transforms as transforms 7 | 8 | 9 | def build_imagenet(batch_size, root, crop=224, resize=256): 10 | transform_train = transforms.Compose([ 11 | transforms.RandomResizedCrop(crop, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.)), 12 | transforms.RandomHorizontalFlip(), 13 | transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.IMAGENET), 14 | transforms.ToTensor(), 15 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 16 | ]) 17 | transform_test = transforms.Compose([ 18 | transforms.Resize(resize), 19 | transforms.CenterCrop(crop), 20 | transforms.ToTensor(), 21 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 22 | ]) 23 | train_path = os.path.join(root, "train") 24 | test_path = os.path.join(root, "test") 25 | train_dataset = torchvision.datasets.ImageFolder(root=train_path, transform=transform_train) 26 | test_dataset = torchvision.datasets.ImageFolder(root=test_path, transform=transform_test) 27 | train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=batch_size, pin_memory=True) 28 | test_dataloader = get_dataloader(dataset=test_dataset, batch_size=batch_size, pin_memory=True) 29 | return train_dataloader, test_dataloader 30 | -------------------------------------------------------------------------------- /tests/test_layer/test_embedding/test_vit_embedding.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.layer.embedding import ViTEmbedding 6 | from colossalai.global_variables import tensor_parallel_env as tp_env 7 | from colossalai.testing import rerun_if_address_is_in_use 8 | from tests.utils import run_with_parallel_config 9 | 10 | BATCH_SIZE = 4 11 | IMAGE_SIZE = 224 12 | PATCH_SIZE = 16 13 | IN_CHANS = 3 14 | HIDDEN_SIZE = 32 15 | 16 | 17 | def run_vit_embed(data, img_size, patch_size, in_chans, hidden_size): 18 | 19 | #build model 20 | model = ViTEmbedding(img_size=img_size, 21 | patch_size=patch_size, 22 | in_chans=in_chans, 23 | embedding_dim=hidden_size, 24 | dropout=0.0).cuda() 25 | 26 | # forward 27 | out = model(data) 28 | 29 | # backward 30 | out.mean().backward() 31 | 32 | 33 | def run_dist(rank, world_size, port, config): 34 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 35 | 36 | if tp_env.mode == 'sequence': 37 | tp_env.mode = None 38 | 39 | data = torch.rand(BATCH_SIZE, IN_CHANS, IMAGE_SIZE, IMAGE_SIZE).cuda() 40 | run_vit_embed(data, IMAGE_SIZE, PATCH_SIZE, IN_CHANS, HIDDEN_SIZE) 41 | 42 | 43 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 44 | @rerun_if_address_is_in_use() 45 | def test_vit_embedding(parallel_config): 46 | run_with_parallel_config(*parallel_config, run_func=run_dist) 47 | --------------------------------------------------------------------------------
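build_imagenet above and build_cifar in titans/dataloader/cifar10 follow the same pattern: compose torchvision transforms, wrap the datasets with ColossalAI's get_dataloader, and return a (train, test) pair. A hypothetical usage sketch — it assumes colossalai.launch(...) has already set up the process group, and the batch size, root path, and crop/resize values are placeholders — combining build_cifar with the barrier_context helper from titans/utils so that a single rank performs the CIFAR-10 download first:

```python
# Hypothetical usage of the dataloader builders; not taken from the repo.
from titans.dataloader.cifar10 import build_cifar
from titans.utils import barrier_context

with barrier_context():    # let one rank download CIFAR-10 before the others proceed
    train_loader, test_loader = build_cifar(batch_size=128, root='./data', crop=32, resize=32)

for images, labels in train_loader:
    break    # one batch of (image, label) tensors, ready to feed a model such as titans' ViT
```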
/tests/test_model/test_vit.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.model.vit import VisionTransformer 6 | from colossalai.global_variables import tensor_parallel_env as tp_env 7 | from colossalai.testing import rerun_if_address_is_in_use 8 | from tests.utils import run_with_parallel_config 9 | 10 | BATCH_SIZE = 4 11 | IMAGE_SIZE = 224 12 | PATCH_SIZE = 16 13 | NUM_HEADS = 4 14 | IN_CHANS = 3 15 | HIDDEN_SIZE = 32 16 | 17 | 18 | def run_vit(data, img_size, patch_size, in_chans, hidden_size, num_heads): 19 | 20 | #build model 21 | model = VisionTransformer(img_size=img_size, 22 | patch_size=patch_size, 23 | in_chans=in_chans, 24 | hidden_size=hidden_size, 25 | num_heads=num_heads).cuda() 26 | 27 | # forward 28 | out = model(data) 29 | 30 | # backward 31 | out.mean().backward() 32 | 33 | 34 | def run_dist(rank, world_size, port, config): 35 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 36 | 37 | if tp_env.mode == 'sequence': 38 | tp_env.mode = None 39 | 40 | data = torch.rand(BATCH_SIZE, IN_CHANS, IMAGE_SIZE, IMAGE_SIZE).cuda() 41 | run_vit(data, IMAGE_SIZE, PATCH_SIZE, IN_CHANS, HIDDEN_SIZE, NUM_HEADS) 42 | 43 | 44 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 45 | @rerun_if_address_is_in_use() 46 | def test_vit(parallel_config): 47 | run_with_parallel_config(*parallel_config, run_func=run_dist) 48 | -------------------------------------------------------------------------------- /tests/test_layer/test_block/test_vit_block.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | import torch.multiprocessing as mp 5 | import torch.nn.functional as F 6 | 7 | from titans.layer.block import ViTBlock 8 | from titans.utils import split_data_for_tensor_parallel 9 | from colossalai.utils import free_port 10 | from colossalai.nn.layer.utils import divide 11 | from colossalai import nn as col_nn 12 | from functools import partial 13 | from colossalai.global_variables import tensor_parallel_env as tp_env 14 | from colossalai.testing import rerun_if_address_is_in_use 15 | from tests.utils import run_with_parallel_config 16 | 17 | BATCH_SIZE = 4 18 | SEQ_LENGTH = 16 19 | NUM_HEADS = 4 20 | HIDDEN_SIZE = 32 21 | 22 | 23 | def run_vit_block(data, hidden_size, num_heads): 24 | 25 | #build model 26 | model = ViTBlock(hidden_size=hidden_size, num_heads=num_heads, mlp_ratio=4, activation=F.gelu).cuda() 27 | 28 | # forward 29 | out = model(data) 30 | 31 | # backward 32 | out.mean().backward() 33 | 34 | 35 | def run_dist(rank, world_size, port, config): 36 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 37 | 38 | if tp_env.mode == 'sequence': 39 | tp_env.mode = None 40 | 41 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 42 | data = split_data_for_tensor_parallel(data) 43 | run_vit_block(data, HIDDEN_SIZE, NUM_HEADS) 44 | 45 | 46 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 47 | @rerun_if_address_is_in_use() 48 | def test_vit_block(parallel_config): 49 | run_with_parallel_config(*parallel_config, run_func=run_dist) 50 | -------------------------------------------------------------------------------- /titans/layer/batchnorm/frozen_batchnorm_2d.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from typing import Dict 4 | 5 | 6 | class FrozenBatchNorm2d(torch.nn.Module): 7 | """ 8 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 9 | 10 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 11 | without which any other models than torchvision.models.resnet[18,34,50,101] 12 | produce nans. 13 | """ 14 | 15 | def __init__(self, n): 16 | super(FrozenBatchNorm2d, self).__init__() 17 | self.register_buffer("weight", torch.ones(n)) 18 | self.register_buffer("bias", torch.zeros(n)) 19 | self.register_buffer("running_mean", torch.zeros(n)) 20 | self.register_buffer("running_var", torch.ones(n)) 21 | 22 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, 23 | error_msgs): 24 | num_batches_tracked_key = prefix + 'num_batches_tracked' 25 | if num_batches_tracked_key in state_dict: 26 | del state_dict[num_batches_tracked_key] 27 | 28 | super(FrozenBatchNorm2d, self)._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, 29 | unexpected_keys, error_msgs) 30 | 31 | def forward(self, x): 32 | w = self.weight.reshape(1, -1, 1, 1) 33 | b = self.bias.reshape(1, -1, 1, 1) 34 | rv = self.running_var.reshape(1, -1, 1, 1) 35 | rm = self.running_mean.reshape(1, -1, 1, 1) 36 | eps = 1e-5 37 | scale = w * (rv + eps).rsqrt() 38 | bias = b - rm * scale 39 | return x * scale + bias 40 | -------------------------------------------------------------------------------- /titans/layer/block/transformer_encoder.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from colossalai import nn as col_nn 3 | 4 | from titans.layer.attention import TransformerMultiHeadAttention 5 | from titans.layer.mlp import TransformerMLP 6 | from .utils import get_clones 7 | 8 | 9 | class TransformerEncoderLayer(nn.Module): 10 | 11 | def __init__(self, hidden_size, nhead, dim_feedforward=2048, dropout=0.1): 12 | super().__init__() 13 | self.selfAttn = TransformerMultiHeadAttention(hidden_size, dim_feedforward, nhead, dropout) 14 | self.feedForward = TransformerMLP(hidden_size, dim_feedforward, dropout) 15 | 16 | self.norm_1 = col_nn.LayerNorm(hidden_size) 17 | self.norm_2 = col_nn.LayerNorm(hidden_size) 18 | self.dropout_1 = col_nn.Dropout(dropout) 19 | self.dropout_2 = col_nn.Dropout(dropout) 20 | 21 | def forward(self, x): 22 | x1 = self.norm_1(x) 23 | x = x + self.dropout_1(self.selfAttn(x1, x1, x1)) 24 | x2 = self.norm_2(x) 25 | out = x + self.dropout_2(self.feedForward(x2)) 26 | return out 27 | 28 | 29 | class TransformerEncoder(nn.Module): 30 | 31 | def __init__(self, encoder_layer, num_layers, norm=None): 32 | super().__init__() 33 | self.layers = get_clones(encoder_layer, num_layers) 34 | self.num_layers = num_layers 35 | self.norm = norm 36 | 37 | def forward(self, src, pos): 38 | output = src if pos is None else (src + pos) 39 | output = output.transpose(0, 1) 40 | 41 | for layer in self.layers: 42 | output = layer(output) 43 | 44 | if self.norm is not None: 45 | output = self.norm(output) 46 | 47 | return output 48 | -------------------------------------------------------------------------------- /tests/test_dataloader/test_bert_pretrain_dataloader.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import os 3 | import pytest 4 | import torch.multiprocessing as mp 5 | from 
colossalai.context.parallel_mode import ParallelMode 6 | from colossalai.utils import free_port 7 | from colossalai.core import global_context as gpc 8 | from functools import partial 9 | 10 | try: 11 | from titans.dataloader.bert import get_bert_pretrain_data_loader 12 | except: 13 | # to bypass pytest 14 | get_bert_pretrain_data_loader = None 15 | 16 | 17 | def load_data(rank, world_size, port): 18 | CONFIG = dict( 19 | parallel=dict( 20 | tensor=dict(size=2, mode='1d') 21 | ) 22 | ) 23 | DATA_PATH = os.environ['PARQUET_PATH'] 24 | 25 | colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, port=port, host='localhost') 26 | 27 | dataloader = get_bert_pretrain_data_loader( 28 | path=DATA_PATH, 29 | vocab_file='bert-large-uncased', 30 | local_rank=rank, 31 | process_group=gpc.get_group(ParallelMode.DATA), 32 | data_loader_kwargs={ 33 | 'batch_size': 16, 34 | # 'num_workers': 4, 35 | # 'persistent_workers': True, 36 | # 'pin_memory': True, 37 | }, 38 | ) 39 | 40 | for _ in dataloader: 41 | break 42 | 43 | gpc.destroy() 44 | 45 | 46 | @pytest.mark.skip('This test should be manually invoked as the dataset is too large') 47 | def test_bert_pretrain_dataloader(): 48 | world_size = 4 49 | port = free_port() 50 | run_func = partial(load_data, world_size=world_size, port=port) 51 | mp.spawn(run_func, nprocs=world_size) 52 | 53 | 54 | if __name__ == '__main__': 55 | test_bert_pretrain_dataloader() 56 | -------------------------------------------------------------------------------- /tests/test_model/test_detr.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.model.detr import DeTr 6 | from colossalai.global_variables import tensor_parallel_env as tp_env 7 | from colossalai.testing import rerun_if_address_is_in_use 8 | from tests.utils import run_with_parallel_config 9 | 10 | BATCH_SIZE = 1 11 | HEIGHT = 800 12 | WIDTH = 1200 13 | PATCH_SIZE = 16 14 | NUM_HEADS = 4 15 | IN_CHANS = 3 16 | HIDDEN_SIZE = 256 17 | NUM_ENCODER_LAYER = 6 18 | NUM_DECODER_LAYER = 6 19 | 20 | 21 | def run_detr(data, img_size, patch_size, in_chans, hidden_size, num_heads, num_encoder_layer, num_decoder_layer): 22 | 23 | #build model 24 | model = DeTr(img_size=img_size, 25 | patch_size=patch_size, 26 | in_chans=in_chans, 27 | hidden_size=hidden_size, 28 | num_heads=num_heads, 29 | num_encoder_layer=num_encoder_layer, 30 | num_decoder_layer=num_decoder_layer).cuda() 31 | 32 | # forward 33 | out = model(data) 34 | 35 | # backward 36 | out.mean().backward() 37 | 38 | 39 | def run_dist(rank, world_size, port, config): 40 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 41 | 42 | if tp_env.mode == 'sequence': 43 | tp_env.mode = None 44 | 45 | data = torch.rand(BATCH_SIZE, IN_CHANS, HEIGHT, WIDTH).cuda() 46 | run_detr(data, 224, PATCH_SIZE, IN_CHANS, HIDDEN_SIZE, NUM_HEADS, NUM_ENCODER_LAYER, NUM_DECODER_LAYER) 47 | 48 | 49 | @pytest.mark.parametrize('parallel_config', [(4, '1d')]) 50 | @rerun_if_address_is_in_use() 51 | def test_detr(parallel_config): 52 | run_with_parallel_config(*parallel_config, run_func=run_dist) 53 | -------------------------------------------------------------------------------- /tests/test_layer/test_head/test_gpt_head.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.layer.embedding import GPTEmbedding 6 | from 
titans.layer.head import GPTLMHead 7 | from titans.utils import split_data_for_tensor_parallel 8 | from colossalai.global_variables import tensor_parallel_env as tp_env 9 | from colossalai.testing import rerun_if_address_is_in_use 10 | from tests.utils import run_with_parallel_config 11 | 12 | BATCH_SIZE = 4 13 | SEQ_LENGTH = 256 14 | VOCAB_SIZE = 50304 15 | HIDDEN_SIZE = 32 16 | 17 | 18 | def run_gpt_head(data, hidden_size, vocab_size): 19 | 20 | #build model 21 | embedding_layer = GPTEmbedding(embedding_dim=hidden_size, vocab_size=vocab_size, 22 | max_position_embeddings=1024).cuda() 23 | model = GPTLMHead(hidden_size=hidden_size, vocab_size=vocab_size, embedding_layer=embedding_layer).cuda() 24 | 25 | # forward 26 | out = model(data) 27 | 28 | # backward 29 | out.mean().backward() 30 | 31 | 32 | def run_dist(rank, world_size, port, config): 33 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 34 | 35 | if tp_env.mode == 'sequence': 36 | tp_env.mode = None 37 | 38 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 39 | data = split_data_for_tensor_parallel(data) 40 | run_gpt_head(data, HIDDEN_SIZE, VOCAB_SIZE) 41 | 42 | 43 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 44 | @rerun_if_address_is_in_use() 45 | def test_gpt_head(parallel_config): 46 | run_with_parallel_config(*parallel_config, run_func=run_dist) 47 | 48 | 49 | if __name__ == "__main__": 50 | test_gpt_head((4, '1d')) 51 | -------------------------------------------------------------------------------- /titans/layer/head/vit_head.py: -------------------------------------------------------------------------------- 1 | from torch import dtype, nn 2 | 3 | from colossalai import nn as col_nn 4 | from ..init_rules import init_rules 5 | 6 | 7 | class ViTHead(nn.Module): 8 | 9 | def __init__(self, 10 | hidden_size: int, 11 | num_classes: int, 12 | representation_size: int = None, 13 | dtype: dtype = None, 14 | bias: bool = True, 15 | init_method: str = 'torch'): 16 | super().__init__() 17 | if representation_size: 18 | self.representation = col_nn.Linear(hidden_size, 19 | representation_size, 20 | bias=bias, 21 | dtype=dtype, 22 | **init_rules[init_method]['head']) 23 | else: 24 | self.representation = None 25 | representation_size = hidden_size 26 | 27 | self.dense = col_nn.Classifier(representation_size, 28 | num_classes, 29 | dtype=dtype, 30 | bias=bias, 31 | **init_rules[init_method]['head']) 32 | 33 | def forward(self, x): 34 | # the size of x is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 35 | x = x[:, 0] 36 | # the size of x is (BATCH_SIZE, HIDDEN_SIZE) 37 | if self.representation is not None: 38 | x = self.representation(x) 39 | # the size of x after representation is (BATCH_SIZE, REPRESENTATION_SIZE) 40 | x = self.dense(x) 41 | # the size of x after dense is (BATCH_SIZE, NUM_CLASSES) 42 | return x 43 | -------------------------------------------------------------------------------- /titans/layer/mlp/vit_mlp.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | from torch import dtype, nn 4 | 5 | from colossalai import nn as col_nn 6 | from ..init_rules import init_rules 7 | 8 | 9 | class ViTMLP(nn.Module): 10 | 11 | def __init__(self, 12 | hidden_size: int, 13 | mlp_ratio: int, 14 | activation: Callable, 15 | dropout: float, 16 | dtype: dtype = None, 17 | bias: bool = True, 18 | init_method: str = 'torch'): 19 | super().__init__() 20 | 
self.dense_1 = col_nn.Linear(hidden_size, 21 | mlp_ratio * hidden_size, 22 | dtype=dtype, 23 | bias=bias, 24 | **init_rules[init_method]['transformer']) 25 | self.activation = activation 26 | self.dropout_1 = col_nn.Dropout(dropout) 27 | self.dense_2 = col_nn.Linear(mlp_ratio * hidden_size, 28 | hidden_size, 29 | dtype=dtype, 30 | bias=bias, 31 | **init_rules[init_method]['transformer']) 32 | self.dropout_2 = col_nn.Dropout(dropout) 33 | 34 | def forward(self, x): 35 | # the size of x before dense_1 is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 36 | # the size of x after dense_1 is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE*mlp_ratio) 37 | x = self.dense_1(x) 38 | x = self.activation(x) 39 | x = self.dropout_1(x) 40 | # the size of x after dense_2 is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 41 | x = self.dense_2(x) 42 | x = self.dropout_2(x) 43 | return x 44 | -------------------------------------------------------------------------------- /titans/layer/attention/vit_moe_attention.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from colossalai.utils import get_current_device 7 | 8 | 9 | class SelfAttentionForMoe(nn.Module): 10 | """Standard ViT self attention. 11 | """ 12 | 13 | def __init__(self, 14 | hidden_size: int, 15 | n_heads: int, 16 | d_kv: int, 17 | attention_drop: float = 0, 18 | drop_rate: float = 0, 19 | bias: bool = True, 20 | dropout1=None, 21 | dropout2=None): 22 | super().__init__() 23 | self.n_heads = n_heads 24 | self.d_kv = d_kv 25 | self.scale = 1.0 / math.sqrt(self.d_kv) 26 | 27 | self.dense1 = nn.Linear(hidden_size, 3 * n_heads * d_kv, bias, device=get_current_device()) 28 | self.softmax = nn.Softmax(dim=-1) 29 | self.atten_drop = nn.Dropout(attention_drop) if dropout1 is None else dropout1 30 | self.dense2 = nn.Linear(n_heads * d_kv, hidden_size, device=get_current_device()) 31 | self.dropout = nn.Dropout(drop_rate) if dropout2 is None else dropout2 32 | 33 | def forward(self, x): 34 | qkv = self.dense1(x) 35 | new_shape = qkv.shape[:2] + (3, self.n_heads, self.d_kv) 36 | qkv = qkv.view(*new_shape) 37 | qkv = qkv.permute(2, 0, 3, 1, 4) 38 | q, k, v = qkv[:] 39 | 40 | x = torch.matmul(q, k.transpose(-2, -1)) * self.scale 41 | x = self.atten_drop(self.softmax(x)) 42 | 43 | x = torch.matmul(x, v) 44 | x = x.transpose(1, 2) 45 | new_shape = x.shape[:2] + (self.n_heads * self.d_kv,) 46 | x = x.reshape(*new_shape) 47 | x = self.dense2(x) 48 | x = self.dropout(x) 49 | 50 | return x 51 | -------------------------------------------------------------------------------- /titans/layer/attention/transformer_attention.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch import nn 5 | from colossalai import nn as col_nn 6 | from titans.decorator import no_support 7 | 8 | 9 | @no_support(['sp']) 10 | class TransformerSelfAttention(nn.Module): 11 | 12 | def __init__( 13 | self, 14 | dropout, 15 | ): 16 | super(TransformerSelfAttention, self).__init__() 17 | self.dropout = col_nn.Dropout(dropout) 18 | 19 | def forward(self, queries, keys, values): 20 | d = queries.shape[-1] 21 | scores = torch.matmul(queries, keys.transpose(-1, -2)) / math.sqrt(d) 22 | attention_weights = torch.softmax(scores, dim=2) 23 | return torch.matmul(self.dropout(attention_weights), values) 24 | 25 | 26 | @no_support(['sp']) 27 | class TransformerMultiHeadAttention(nn.Module): 28 | 29 | def __init__(self, hidden_size, num_hiddens, num_heads, 
                 dropout, bias=False):
30 |         super(TransformerMultiHeadAttention, self).__init__()
31 |         self.num_heads = num_heads
32 |         self.attention = TransformerSelfAttention(dropout)
33 |         self.W_q = col_nn.Linear(hidden_size, num_hiddens, bias=bias)
34 |         self.W_k = col_nn.Linear(hidden_size, num_hiddens, bias=bias)
35 |         self.W_v = col_nn.Linear(hidden_size, num_hiddens, bias=bias)
36 |         self.W_o = col_nn.Linear(num_hiddens, hidden_size, bias=bias)
37 | 
38 |     def forward(self, queries, keys, values):
39 |         queries = transpose_qkv(self.W_q(queries), self.num_heads)
40 |         keys = transpose_qkv(self.W_k(keys), self.num_heads)
41 |         values = transpose_qkv(self.W_v(values), self.num_heads)
42 | 
43 |         output = self.attention(queries, keys, values)
44 |         output_concat = transpose_output(output, self.num_heads)
45 |         return self.W_o(output_concat)
46 | 
47 | 
48 | # helper functions used by TransformerMultiHeadAttention above
49 | def transpose_qkv(X, num_heads):
50 |     """Split the last dimension into heads and fold the heads into the batch dimension."""
51 |     X = X.reshape(X.shape[0], X.shape[1], num_heads, -1)
52 |     X = X.permute(0, 2, 1, 3)
53 |     return X.reshape(-1, X.shape[2], X.shape[3])
54 | 
55 | 
56 | def transpose_output(X, num_heads):
57 |     """Reverse transpose_qkv: merge the per-head outputs back into the hidden dimension."""
58 |     X = X.reshape(-1, num_heads, X.shape[1], X.shape[2])
59 |     X = X.permute(0, 2, 1, 3)
60 |     return X.reshape(X.shape[0], X.shape[1], -1)
61 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/proposal.yml:
--------------------------------------------------------------------------------
1 | name: 💥 Proposal
2 | description: Propose a non-trivial change to Titans
3 | title: "[PROPOSAL] "
4 | labels: [enhancement]
5 | 
6 | body:
7 |   - type: markdown
8 |     attributes:
9 |       value: |
10 |         Common reasons for proposals include:
11 | 
12 |         - Altering the infrastructure;
13 |         - Bumping a critical dependency's major version;
14 |         - A significant improvement in user-friendliness;
15 |         - Significant refactor;
16 |         - ...
17 | 
18 |         Please note that this is not the template for feature requests or bug reports; using it for those could make us identify the issue wrongly and close it without doing anything.
19 | 
20 |         We give you maximum freedom to write an elaborate proposal illustrating why you think the change is beneficial for us, and what steps we should take to turn it into reality.
21 | 
22 | 
23 |   - type: textarea
24 |     attributes:
25 |       label: Proposal
26 |       description: A clear and concise description of what the proposal is.
27 |     validations:
28 |       required: true
29 | 
30 |   - type: checkboxes
31 |     attributes:
32 |       label: Self-service
33 |       description: |
34 |         If you feel like you could contribute to this issue, please check the box below. This would tell us and other people looking for contributions that someone's working on it.
35 |         If you do check this box, please send a pull request within 7 days after a maintainer's approval so we can still delegate this to someone else.
36 | 
37 |         Proposals usually involve significant code changes, so please reach consensus with the maintainers before rushing to implement it.
38 |         This ensures that you don't waste your time and we don't waste ours reading large diffs.
39 |       options:
40 |         - label: I'd be willing to do some initial work on this proposal myself.
41 | 
42 | 
43 |   - type: markdown
44 |     attributes:
45 |       value: >
46 |         Thanks for contributing 🎉!
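[Editor's example] Referring back to TransformerMultiHeadAttention in titans/layer/attention/transformer_attention.py above, here is a minimal self-attention usage sketch; it is not part of the repository. It assumes colossalai.launch(...) has already been called (the col_nn.Linear projections expect an initialized parallel context) and that num_hiddens is divisible by num_heads.

import torch
from titans.layer.attention import TransformerMultiHeadAttention

BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE, NUM_HEADS = 4, 16, 32, 4

# num_hiddens is the total projection width shared across the NUM_HEADS heads
attn = TransformerMultiHeadAttention(HIDDEN_SIZE, num_hiddens=HIDDEN_SIZE, num_heads=NUM_HEADS, dropout=0.1).cuda()

x = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda()
out = attn(x, x, x)    # self-attention: the same tensor serves as queries, keys and values
assert out.shape == (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)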
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import setup, find_packages 4 | 5 | # ninja build does not work unless include_dirs are abs path 6 | this_dir = os.path.dirname(os.path.abspath(__file__)) 7 | 8 | 9 | def fetch_requirements(path): 10 | with open(path, 'r') as fd: 11 | return [r.strip() for r in fd.readlines()] 12 | 13 | 14 | def fetch_readme(): 15 | with open('README.md', encoding='utf-8') as f: 16 | return f.read() 17 | 18 | 19 | def get_version(): 20 | with open('version.txt') as f: 21 | return f.read().strip() 22 | 23 | 24 | setup( 25 | name='titans', 26 | version=get_version(), 27 | packages=find_packages(exclude=( 28 | 'build', 29 | 'docker', 30 | 'tests', 31 | 'docs', 32 | 'examples', 33 | '*.egg-info', 34 | )), 35 | description='A collection of deep learning components built with Colossal-AI', 36 | long_description=fetch_readme(), 37 | long_description_content_type='text/markdown', 38 | license='Apache Software License 2.0', 39 | url='https://www.colossalai.org', 40 | project_urls={ 41 | 'Forum': 'https://github.com/hpcaitech/Titans/discussions', 42 | 'Bug Tracker': 'https://github.com/hpcaitech/Titans/issues', 43 | 'Examples': 'https://github.com/hpcaitech/ColossalAI-Examples', 44 | 'Documentation': 'http://colossalai.readthedocs.io', 45 | 'Github': 'https://github.com/hpcaitech/Titans', 46 | }, 47 | install_requires=fetch_requirements('requirements/requirements.txt'), 48 | python_requires='>=3.6', 49 | classifiers=[ 50 | 'Programming Language :: Python :: 3', 51 | 'License :: OSI Approved :: Apache Software License', 52 | 'Environment :: GPU :: NVIDIA CUDA', 53 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 54 | 'Topic :: System :: Distributed Computing', 55 | ], 56 | ) 57 | -------------------------------------------------------------------------------- /titans/loss/embedding_loss/embedding_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from colossalai.core import global_context as gpc 5 | from colossalai.context import ParallelMode 6 | 7 | 8 | class embeddingLoss(nn.Module): 9 | 10 | def forward(self, train_iterator, args, model): 11 | 12 | positive_sample, negative_sample, subsampling_weight, mode = next(train_iterator) 13 | mode = mode[0] 14 | if args.cuda: 15 | positive_sample = positive_sample.cuda() 16 | negative_sample = negative_sample.cuda() 17 | subsampling_weight = subsampling_weight.cuda() 18 | negative_score = model((positive_sample, negative_sample), mode=mode) 19 | 20 | if args.negative_adversarial_sampling: 21 | #In self-adversarial sampling, we do not apply back-propagation on the sampling weight 22 | negative_score = (F.softmax(negative_score * args.adversarial_temperature, dim=1).detach() * 23 | F.logsigmoid(-negative_score)).sum(dim=1) 24 | else: 25 | negative_score = F.logsigmoid(-negative_score).mean(dim=1) 26 | 27 | positive_score = model(positive_sample) 28 | 29 | positive_score = F.logsigmoid(positive_score).squeeze(dim=1) 30 | 31 | if args.uni_weight: 32 | positive_sample_loss = -positive_score.mean() 33 | negative_sample_loss = -negative_score.mean() 34 | else: 35 | positive_sample_loss = -(subsampling_weight * positive_score).sum() / subsampling_weight.sum() 36 | negative_sample_loss = -(subsampling_weight * negative_score).sum() / 
subsampling_weight.sum() 37 | 38 | loss = (positive_sample_loss + negative_sample_loss) / 2 39 | 40 | torch.distributed.all_reduce(loss, group=gpc.get_group(ParallelMode.GLOBAL)) 41 | 42 | return loss, positive_sample_loss, negative_sample_loss 43 | -------------------------------------------------------------------------------- /titans/model/transformer/transformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch import nn 4 | from colossalai import nn as col_nn 5 | from titans.layer.block import TransformerEncoderLayer, TransformerEncoder, \ 6 | TransformerDecoderLayer, TransformerDecoder 7 | 8 | 9 | class Transformer(nn.Module): 10 | 11 | def __init__(self, 12 | hidden_size=512, 13 | nhead=8, 14 | num_encoder_layers=6, 15 | num_decoder_layers=6, 16 | dim_feedforward=2048, 17 | dropout=0.1, 18 | return_intermediate_dec=False): 19 | super().__init__() 20 | 21 | encoder_layer = TransformerEncoderLayer(hidden_size, nhead, dim_feedforward, dropout) 22 | encoder_norm = col_nn.LayerNorm(hidden_size) 23 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) 24 | 25 | decoder_layer = TransformerDecoderLayer(hidden_size, nhead, dim_feedforward, dropout) 26 | decoder_norm = col_nn.LayerNorm(hidden_size) 27 | self.decoder = TransformerDecoder(decoder_layer, 28 | num_decoder_layers, 29 | decoder_norm, 30 | return_intermediate=return_intermediate_dec) 31 | 32 | self.hidden_size = hidden_size 33 | self.nhead = nhead 34 | 35 | def forward(self, src, mask, query_embed, pos_embed): 36 | bs, c, h, w = src.shape 37 | src = src.flatten(2).permute(2, 0, 1) 38 | pos_embed = pos_embed.flatten(2).permute(2, 0, 1) 39 | query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) 40 | # mask = mask.flatten(1) 41 | 42 | tgt = torch.zeros_like(query_embed) 43 | memory = self.encoder(src, pos=pos_embed) 44 | 45 | hs = self.decoder(tgt, memory, pos=pos_embed, query_pos=query_embed) 46 | 47 | return hs.transpose(1, 2) 48 | -------------------------------------------------------------------------------- /titans/layer/embedding/gpt_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import dtype, nn 3 | 4 | from colossalai import nn as col_nn 5 | from colossalai.utils import get_current_device 6 | 7 | 8 | class GPTEmbedding(nn.Module): 9 | 10 | def __init__(self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | max_position_embeddings: int, 14 | num_tokentypes: int = 0, 15 | padding_idx: int = None, 16 | dropout: float = 0., 17 | dtype: dtype = None) -> None: 18 | super().__init__() 19 | self.word_embeddings = col_nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx, dtype=dtype) 20 | self.position_embeddings = col_nn.Embedding(max_position_embeddings, embedding_dim, dtype=dtype) 21 | if num_tokentypes > 0: 22 | self.tokentype_embeddings = col_nn.Embedding(num_tokentypes, embedding_dim, dtype=dtype) 23 | else: 24 | self.tokentype_embeddings = None 25 | self.dropout = col_nn.Dropout(dropout) 26 | 27 | @property 28 | def word_embedding_weight(self): 29 | return self.word_embeddings.weight 30 | 31 | def forward(self, input_ids, position_ids=None, tokentype_ids=None): 32 | seq_length = input_ids.size(1) 33 | if position_ids is None: 34 | bs = input_ids.size(0) 35 | position_ids = torch.arange(seq_length, dtype=torch.long, device=get_current_device()).unsqueeze(0) 36 | position_ids = position_ids.repeat(bs, 1) 37 | # the size of input_ids is 
(BATCH_SIZE, SEQ_LEN) 38 | # the size of x after word_embeddings is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 39 | x = self.word_embeddings(input_ids) + self.position_embeddings(position_ids) 40 | if self.tokentype_embeddings is not None and tokentype_ids is not None: 41 | x = x + self.tokentype_embeddings(tokentype_ids) 42 | x = self.dropout(x) 43 | 44 | return x -------------------------------------------------------------------------------- /tests/test_layer/test_block/test_detr_block.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from titans.layer.block import DeTrEncoder, DeTrDecoder 7 | from titans.utils import split_data_for_tensor_parallel 8 | from functools import partial 9 | from colossalai.global_variables import tensor_parallel_env as tp_env 10 | from colossalai.testing import rerun_if_address_is_in_use 11 | from tests.utils import run_with_parallel_config 12 | 13 | BATCH_SIZE = 4 14 | SEQ_LENGTH = 16 15 | NUM_HEADS = 4 16 | HIDDEN_SIZE = 32 17 | 18 | 19 | def run_detr_encoder(data, hidden_size, num_heads): 20 | 21 | #build model 22 | model = DeTrEncoder(hidden_size=hidden_size, num_heads=num_heads, mlp_ratio=4, activation=F.gelu).cuda() 23 | 24 | # forward 25 | out = model(data) 26 | 27 | # backward 28 | out.mean().backward() 29 | 30 | 31 | def run_detr_decoder(data, memory, hidden_size, num_heads): 32 | 33 | #build model 34 | model = DeTrDecoder(hidden_size=hidden_size, num_heads=num_heads, mlp_ratio=4, activation=F.gelu).cuda() 35 | 36 | # forward 37 | out = model(data, memory) 38 | 39 | # backward 40 | out.mean().backward() 41 | 42 | 43 | def run_dist(rank, world_size, port, config): 44 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 45 | 46 | if tp_env.mode == 'sequence': 47 | tp_env.mode = None 48 | 49 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 50 | data = split_data_for_tensor_parallel(data) 51 | memory = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 52 | memory = split_data_for_tensor_parallel(memory) 53 | run_detr_encoder(data, HIDDEN_SIZE, NUM_HEADS) 54 | run_detr_decoder(data, memory, HIDDEN_SIZE, NUM_HEADS) 55 | 56 | 57 | 58 | @pytest.mark.parametrize('parallel_config', [(4, '1d')]) 59 | @rerun_if_address_is_in_use() 60 | def test_detr_block(parallel_config): 61 | run_with_parallel_config(*parallel_config, run_func=run_dist) 62 | -------------------------------------------------------------------------------- /titans/layer/embedding/vit_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import dtype, nn 3 | 4 | from colossalai import nn as col_nn 5 | from ..init_rules import init_rules 6 | 7 | 8 | class ViTEmbedding(nn.Module): 9 | """ 10 | Construct the patch embeddings. 11 | 12 | Args: 13 | img_size(int): The size of images. 14 | patch_size(int): The size of patches. 15 | in_chans(int): The size of input channels. 16 | embedding_dim(int): The embedding size of patches. 17 | dropout(float): The ratio used to construct dropout modules, which indicates the percentage of parameters should be casted to zero. 18 | dtype (:class:`torch.dtype`): The dtype of parameters, defaults to None. 19 | flatten(bool): If set to ``False``, the patches will not be flatten, defaults to ``True``. 20 | init_method(str): The initializing method used in layers, defaults to `torch`. 
21 | """ 22 | 23 | def __init__(self, 24 | img_size: int, 25 | patch_size: int, 26 | in_chans: int, 27 | embedding_dim: int, 28 | dropout: float, 29 | dtype: dtype = None, 30 | flatten: bool = True, 31 | init_method: str = 'torch'): 32 | super().__init__() 33 | self.patch_embed = col_nn.PatchEmbedding(img_size, 34 | patch_size, 35 | in_chans, 36 | embedding_dim, 37 | dtype=dtype, 38 | flatten=flatten, 39 | **init_rules[init_method]['embed']) 40 | self.dropout = col_nn.Dropout(dropout) 41 | 42 | def forward(self, x): 43 | # the size of x before embed is (BATCH_SIZE, IN_CHAN, IMAGE_SIZE, IMAGE_SIZE) 44 | # the size of x after embedding is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 45 | x = self.patch_embed(x) 46 | x = self.dropout(x) 47 | return x 48 | -------------------------------------------------------------------------------- /titans/layer/mlp/transformer_mlp.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | from colossalai import nn as col_nn 5 | from typing import Callable 6 | from torch import Tensor 7 | from torch import dtype 8 | 9 | 10 | class TransformerMLP(nn.Module): 11 | """ 12 | The MLP module in the Transformer Architecture. 13 | 14 | Args: 15 | hidden_size (int): the dimension of the linear layer. 16 | mlp_ratio (int): the multiplication factor of the linear dimension, default is 4. 17 | activation (Callable): the activation function, default is None which will use GeLU. 18 | dropout_prob (float): the probability of dropout, default is 0. 19 | dtype (torch.dtype): the data type for model parameters, default is None. 20 | bias (bool): whether the linear layers have bias, default is True. 21 | """ 22 | 23 | def __init__(self, 24 | hidden_size: int, 25 | mlp_ratio: int = 4, 26 | activation: Callable = None, 27 | dropout_prob: float = 0.0, 28 | dtype: dtype = None, 29 | bias: bool = True): 30 | super().__init__() 31 | intermediate_dim = int(hidden_size * mlp_ratio) 32 | 33 | # int linear layers 34 | self.linear_1 = col_nn.Linear(hidden_size, intermediate_dim, dtype=dtype, bias=bias) 35 | self.linear_2 = col_nn.Linear(intermediate_dim, hidden_size, dtype=dtype, bias=bias) 36 | 37 | # int activation function 38 | if activation: 39 | self.activation = activation 40 | else: 41 | self.activation = F.gelu 42 | 43 | # init dropout 44 | if dropout_prob > 0: 45 | self.dropout = col_nn.Dropout(dropout_prob) 46 | else: 47 | self.dropout = None 48 | 49 | def forward(self, x: Tensor) -> Tensor: 50 | # the size of x is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 51 | # the size of intermediate_activate is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE*mlp_ratio) 52 | intermediate_activate = self.linear_1(x) 53 | intermediate_activate = self.activation(intermediate_activate) 54 | # the size of output is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 55 | output = self.linear_2(intermediate_activate) 56 | 57 | if self.dropout: 58 | output = self.dropout(output) 59 | 60 | return output 61 | -------------------------------------------------------------------------------- /titans/utils/tensor_parallel_data_split.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from colossalai.core import global_context as gpc 4 | from colossalai.context import ParallelMode 5 | from colossalai.global_variables import tensor_parallel_env as tp_env 6 | from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env 7 | from colossalai.constants import 
INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D 8 | 9 | 10 | def split_data_2d(x: Tensor) -> Tensor: 11 | """ 12 | 2D tensor parallel requries splitting the data in the first dimension and last dimension 13 | """ 14 | j = gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW) 15 | i = gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL) 16 | x = torch.chunk(x, tp_env.summa_dim, dim=0)[i] 17 | x = torch.chunk(x, tp_env.summa_dim, dim=-1)[j] 18 | return x 19 | 20 | 21 | def split_data_2p5d(x: Tensor) -> Tensor: 22 | """ 23 | 2.5D tensor parallel requries splitting the data in the first dimension and last dimension just like 2D 24 | """ 25 | i = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL) 26 | j = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW) 27 | x = torch.chunk(x, tp_env.tesseract_dim, dim=0)[i] 28 | x = torch.chunk(x, tp_env.tesseract_dim, dim=-1)[j] 29 | return x 30 | 31 | 32 | def split_data_3d(x: Tensor) -> Tensor: 33 | """ 34 | 2.5D tensor parallel requries splitting the data in the first dimension twice and last dimension once 35 | """ 36 | input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D) 37 | weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D) 38 | output_parallel_mode = get_parallel_mode_from_env(OUTPUT_GROUP_3D) 39 | 40 | j = gpc.get_local_rank(input_parallel_mode) 41 | i = gpc.get_local_rank(weight_parallel_mode) 42 | k = gpc.get_local_rank(output_parallel_mode) 43 | 44 | x = torch.chunk(x, tp_env.depth_3d, dim=0)[i] 45 | x = torch.chunk(x, tp_env.depth_3d, dim=-1)[k] 46 | x = torch.chunk(x, tp_env.depth_3d, dim=0)[j] 47 | return x 48 | 49 | 50 | def split_data_for_tensor_parallel(x: Tensor) -> Tensor: 51 | """ 52 | Split the data based on the tensor parallel environment 53 | """ 54 | 55 | if tp_env.mode == '2d': 56 | return split_data_2d(x) 57 | elif tp_env.mode == '2.5d': 58 | return split_data_2p5d(x) 59 | elif tp_env.mode == '3d': 60 | return split_data_3d(x) 61 | else: 62 | return x 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /titans/layer/block/vit_block.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Callable 3 | 4 | import torch 5 | from colossalai import nn as col_nn 6 | from colossalai.nn.layer.utils import CheckpointModule 7 | from torch import dtype, nn 8 | 9 | from titans.layer.attention import ViTSelfAttention 10 | from titans.layer.mlp import ViTMLP 11 | from titans.decorator import support_tp_pp_only 12 | 13 | 14 | @support_tp_pp_only() 15 | class ViTBlock(CheckpointModule): 16 | 17 | def __init__(self, 18 | hidden_size: int, 19 | num_heads: int, 20 | mlp_ratio: int, 21 | activation: Callable, 22 | attention_dropout: float = 0., 23 | dropout: float = 0., 24 | drop_path: float = 0., 25 | layernorm_epsilon: float = 1e-6, 26 | dtype: dtype = None, 27 | bias: bool = True, 28 | checkpoint: bool = False, 29 | init_method: str = 'torch'): 30 | super().__init__(checkpoint) 31 | self.norm1 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 32 | self.attn = ViTSelfAttention(hidden_size=hidden_size, 33 | num_heads=num_heads, 34 | attention_dropout=attention_dropout, 35 | dropout=dropout, 36 | bias=bias, 37 | dtype=dtype, 38 | init_method=init_method) 39 | self.drop_path = col_nn.DropPath(drop_path) if drop_path > 0. 
else nn.Identity() 40 | self.norm2 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 41 | self.mlp = ViTMLP(hidden_size=hidden_size, 42 | mlp_ratio=mlp_ratio, 43 | activation=activation, 44 | dropout=dropout, 45 | dtype=dtype, 46 | bias=bias, 47 | init_method=init_method) 48 | 49 | def _forward(self, x): 50 | # the size of x is (BATCH_SZIE, SEQ_LEN, HIDDEN_SIZE) 51 | x = x + self.drop_path(self.attn(self.norm1(x))) 52 | # the size of x after attn is (BATCH_SZIE, SEQ_LEN, HIDDEN_SIZE) 53 | x = x + self.drop_path(self.mlp(self.norm2(x))) 54 | # the size of x after mlp is (BATCH_SZIE, SEQ_LEN, HIDDEN_SIZE) 55 | return x 56 | -------------------------------------------------------------------------------- /tests/test_model/test_moe.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import torch 3 | 4 | from colossalai.context import MOE_CONTEXT 5 | from titans.model.moe import MOEGPT, ViTMoE, Widenet 6 | from colossalai.global_variables import tensor_parallel_env as tp_env 7 | from colossalai.testing import rerun_if_address_is_in_use 8 | from tests.utils import run_with_moe_config 9 | 10 | NUM_EXPERTS = 64 11 | BATCH_SIZE = 4 12 | IMAGE_SIZE = 224 13 | PATCH_SIZE = 16 14 | NUM_HEADS = 4 15 | IN_CHANS = 3 16 | HIDDEN_SIZE = 32 17 | 18 | SEQ_LENGHT = 16 19 | VOCAB_SIZE = 50304 20 | 21 | 22 | def run_moe_gpt(data, num_experts, hidden_size, num_heads): 23 | # build model 24 | model = MOEGPT(num_experts=num_experts, hidden_size=hidden_size, num_heads=num_heads).cuda() 25 | 26 | # forward 27 | out = model(data) 28 | 29 | # backward 30 | out.mean().backward() 31 | 32 | 33 | def run_vit_moe(data, num_experts, img_size, patch_size, in_chans, hidden_size, num_heads): 34 | # build model 35 | model = ViTMoE(num_experts=num_experts, 36 | img_size=img_size, 37 | patch_size=patch_size, 38 | in_chans=in_chans, 39 | hidden_size=hidden_size, 40 | num_heads=num_heads).cuda() 41 | 42 | # forward 43 | out = model(data) 44 | 45 | # backward 46 | out.mean().backward() 47 | 48 | 49 | def run_widenet(data, num_experts, img_size, patch_size, in_chans, hidden_size, num_heads): 50 | # build model 51 | model = Widenet(num_experts=num_experts, 52 | img_size=img_size, 53 | patch_size=patch_size, 54 | in_chans=in_chans, 55 | hidden_size=hidden_size, 56 | num_heads=num_heads).cuda() 57 | 58 | # forward 59 | out = model(data) 60 | 61 | # backward 62 | out.mean().backward() 63 | 64 | 65 | def run_dist(rank, world_size, port, config): 66 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 67 | 68 | if tp_env.mode == 'sequence': 69 | tp_env.mode = None 70 | MOE_CONTEXT.setup(42) 71 | language_data = torch.rand(BATCH_SIZE, SEQ_LENGHT) * VOCAB_SIZE 72 | language_data = language_data.int().cuda() 73 | run_moe_gpt(language_data, NUM_EXPERTS, HIDDEN_SIZE, NUM_HEADS) 74 | 75 | image_data = torch.rand(BATCH_SIZE, IN_CHANS, IMAGE_SIZE, IMAGE_SIZE).cuda() 76 | run_vit_moe(image_data, NUM_EXPERTS, IMAGE_SIZE, PATCH_SIZE, IN_CHANS, HIDDEN_SIZE, NUM_HEADS) 77 | run_widenet(image_data, NUM_EXPERTS, IMAGE_SIZE, PATCH_SIZE, IN_CHANS, HIDDEN_SIZE, NUM_HEADS) 78 | 79 | 80 | @rerun_if_address_is_in_use() 81 | def test_moe(): 82 | run_with_moe_config(4, run_func=run_dist) 83 | -------------------------------------------------------------------------------- /titans/layer/attention/vit_attention.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import 
torch 4 | from torch import dtype, nn 5 | 6 | from colossalai import nn as col_nn 7 | from ..init_rules import init_rules 8 | from titans.decorator import no_support 9 | 10 | 11 | @no_support(['sp']) 12 | class ViTSelfAttention(nn.Module): 13 | 14 | def __init__(self, 15 | hidden_size: int, 16 | num_heads: int, 17 | attention_dropout: float, 18 | dropout: float, 19 | bias: bool = True, 20 | dtype: dtype = None, 21 | init_method: str = 'torch'): 22 | super().__init__() 23 | self.attention_head_size = hidden_size // num_heads 24 | self.query_key_value = col_nn.Linear(hidden_size, 25 | 3 * hidden_size, 26 | dtype=dtype, 27 | bias=bias, 28 | **init_rules[init_method]['transformer']) 29 | self.attention_dropout = col_nn.Dropout(attention_dropout) 30 | self.dense = col_nn.Linear(hidden_size, hidden_size, dtype=dtype, bias=True, **init_rules[init_method]['transformer']) 31 | self.dropout = col_nn.Dropout(dropout) 32 | self.softmax = nn.Softmax(dim=-1) 33 | 34 | def forward(self, x): 35 | # the size of x is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 36 | # the size of qkv is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE*3) 37 | qkv = self.query_key_value(x) 38 | all_head_size = qkv.shape[-1] // 3 39 | num_attention_heads = all_head_size // self.attention_head_size 40 | new_qkv_shape = qkv.shape[:-1] + \ 41 | (num_attention_heads, 3 * self.attention_head_size) 42 | qkv = qkv.view(new_qkv_shape) 43 | qkv = qkv.permute((0, 2, 1, 3)) 44 | # the size of q is (BATCH_SZIE, NUM_HEADS, SEQ_LEN, HIDDEN_SIZE//NUM_HEADS) 45 | q, k, v = torch.chunk(qkv, 3, dim=-1) 46 | # the size of x is (BATCH_SIZE, NUM_HEADS, SEQ_LEN, SEQ_LEN) 47 | x = torch.matmul(q, k.transpose(-1, -2)) 48 | x = x / math.sqrt(self.attention_head_size) 49 | x = self.softmax(x) 50 | x = self.attention_dropout(x) 51 | 52 | # the size of x after matmul is (BATCH_SZIE, NUM_HEADS, SEQ_LEN, HIDDEN_SIZE//NUM_HEADS) 53 | x = torch.matmul(x, v) 54 | x = x.transpose(1, 2) 55 | new_context_layer_shape = x.size()[:-2] + (all_head_size,) 56 | # the size of x after reshape is (BATCH_SZIE, SEQ_LEN, HIDDEN_SIZE) 57 | x = x.reshape(new_context_layer_shape) 58 | # the size of x after dense is (BATCH_SZIE, SEQ_LEN, HIDDEN_SIZE) 59 | x = self.dense(x) 60 | x = self.dropout(x) 61 | 62 | return x 63 | -------------------------------------------------------------------------------- /titans/layer/block/deepnet_block.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn as nn, Tensor 3 | from torch import dtype 4 | from typing import Callable 5 | from colossalai import nn as col_nn 6 | from colossalai.core import global_context as gpc 7 | from colossalai.utils.activation_checkpoint import checkpoint 8 | from colossalai.nn.layer.utils import CheckpointModule 9 | from colossalai.nn.layer.base_layer import ParallelLayer 10 | from colossalai import kernel 11 | from titans.decorator import support_tp_pp_only 12 | from titans.layer.attention import GPTSelfAttention 13 | from titans.layer.mlp import TransformerMLP 14 | 15 | 16 | @support_tp_pp_only() 17 | class DeepNetBlock(CheckpointModule): 18 | 19 | def __init__(self, 20 | hidden_size: int, 21 | num_heads: int, 22 | mlp_ratio: float, 23 | activation: Callable, 24 | attention_dropout: float = 0., 25 | dropout: float = 0., 26 | alpha: float = 1.0, 27 | layernorm_epsilon: float = 1e-5, 28 | dtype: dtype = None, 29 | bias: bool = True, 30 | fuse_scale_mask_softmax: bool = False, 31 | checkpoint: bool = False, 32 | activation_offload: bool = False): 33 | 
super().__init__(checkpoint, activation_offload) 34 | self.norm1 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 35 | self.attn = GPTSelfAttention(hidden_size=hidden_size, 36 | num_heads=num_heads, 37 | attention_dropout=attention_dropout, 38 | dropout=dropout, 39 | bias=bias, 40 | fuse_scale_mask_softmax=fuse_scale_mask_softmax, 41 | dtype=dtype) 42 | self.alpha = alpha 43 | self.norm2 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 44 | self.mlp = TransformerMLP(hidden_size=hidden_size, 45 | mlp_ratio=mlp_ratio, 46 | activation=activation, 47 | dropout_prob=dropout, 48 | dtype=dtype, 49 | bias=bias) 50 | 51 | def _forward(self, x, attention_mask=None): 52 | if attention_mask is not None and attention_mask.dtype != x.dtype: 53 | attention_mask = attention_mask.to(x.dtype) 54 | 55 | residual = x 56 | x = residual * self.alpha + self.attn(x, attention_mask) 57 | x = self.norm1(x) 58 | 59 | residual = x 60 | x = residual * self.alpha + self.mlp(x) 61 | x = self.norm2(x) 62 | 63 | return x, attention_mask 64 | -------------------------------------------------------------------------------- /tests/test_layer/test_attention/test_transformer_attention.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.layer.attention import TransformerSelfAttention, GPTSelfAttention, ViTSelfAttention 6 | from titans.utils import split_data_for_tensor_parallel 7 | from colossalai.nn.layer.utils import divide 8 | from colossalai.testing import rerun_if_address_is_in_use 9 | from colossalai import nn as col_nn 10 | from colossalai.global_variables import tensor_parallel_env as tp_env 11 | from tests.utils import run_with_parallel_config 12 | 13 | BATCH_SIZE = 4 14 | SEQ_LENGTH = 16 15 | NUM_HEADS = 4 16 | HIDDEN_SIZE = 32 17 | 18 | 19 | def run_transformer_attention(data, hidden_size, num_heads): 20 | 21 | #build model 22 | model = TransformerSelfAttention(dropout=0.0).cuda() 23 | 24 | #process data 25 | query_key_value = col_nn.Linear(hidden_size, 3 * hidden_size) 26 | qkv = query_key_value(data) 27 | all_head_size = qkv.shape[-1] // 3 28 | attention_head_size = divide(hidden_size, num_heads) 29 | num_attention_heads = divide(all_head_size, attention_head_size) 30 | new_qkv_shape = qkv.shape[:-1] + \ 31 | (num_attention_heads, 3 * attention_head_size) 32 | qkv = qkv.view(new_qkv_shape) 33 | qkv = qkv.permute((0, 2, 1, 3)) 34 | q, k, v = torch.chunk(qkv, 3, dim=-1) 35 | 36 | # forward 37 | out = model(q, k, v) 38 | 39 | # backward 40 | out.mean().backward() 41 | 42 | 43 | def run_gpt_attention(data, hidden_size, num_heads): 44 | 45 | #build model 46 | model = GPTSelfAttention(hidden_size=hidden_size, num_heads=num_heads, attention_dropout=0.0, dropout=0.0).cuda() 47 | 48 | # forward 49 | out = model(data) 50 | 51 | # backward 52 | out.mean().backward() 53 | 54 | 55 | def run_vit_attention(data, hidden_size, num_heads): 56 | 57 | #build model 58 | model = ViTSelfAttention(hidden_size=hidden_size, num_heads=num_heads, attention_dropout=0.0, dropout=0.0).cuda() 59 | 60 | # forward 61 | out = model(data) 62 | 63 | # backward 64 | out.mean().backward() 65 | 66 | 67 | def run_dist(rank, world_size, port, config): 68 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 69 | 70 | if tp_env.mode == 'sequence': 71 | tp_env.mode = None 72 | 73 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, 
HIDDEN_SIZE).cuda() 74 | data = split_data_for_tensor_parallel(data) 75 | run_gpt_attention(data, HIDDEN_SIZE, NUM_HEADS) 76 | run_vit_attention(data, HIDDEN_SIZE, NUM_HEADS) 77 | run_transformer_attention(data, HIDDEN_SIZE, NUM_HEADS) 78 | 79 | 80 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 81 | @rerun_if_address_is_in_use() 82 | def test_transformer_attention(parallel_config): 83 | run_with_parallel_config(*parallel_config, run_func=run_dist) 84 | -------------------------------------------------------------------------------- /titans/layer/attention/detr_attention.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch import dtype, nn 5 | 6 | from colossalai import nn as col_nn 7 | from ..init_rules import init_rules 8 | from titans.decorator import no_support 9 | # This part need to work together with the col_nn.Linear (row, col) in order to better parallelize. 10 | 11 | @no_support(['sp']) 12 | class DeTrCrossAttention(nn.Module): 13 | 14 | def __init__(self, 15 | hidden_size: int, 16 | num_heads: int, 17 | attention_dropout: float, 18 | dropout: float, 19 | bias: bool = True, 20 | dtype: dtype = None, 21 | init_method: str = 'torch'): 22 | super().__init__() 23 | self.attention_head_size = hidden_size // num_heads 24 | self.query = col_nn.Linear1D_Col(hidden_size, 25 | hidden_size, 26 | dtype=dtype, 27 | bias=bias, 28 | ) 29 | self.key_value = col_nn.Linear1D_Col(hidden_size, 30 | 2 * hidden_size, 31 | dtype=dtype, 32 | bias=bias, 33 | ) 34 | self.attention_dropout = col_nn.Dropout(attention_dropout) 35 | self.dense = col_nn.Linear1D_Row(hidden_size, hidden_size, dtype=dtype, bias=True) 36 | self.dropout = col_nn.Dropout(dropout) 37 | self.softmax = nn.Softmax(dim=-1) 38 | 39 | def forward(self, x, memory): 40 | q = self.query(x) 41 | kv = self.key_value(memory) 42 | all_head_size = kv.shape[-1] // 2 43 | num_attention_heads = all_head_size // self.attention_head_size 44 | 45 | new_q_shape = q.shape[:-1] + (num_attention_heads, self.attention_head_size) 46 | q = q.view(new_q_shape) 47 | q = q.permute((0, 2, 1, 3)) 48 | q = q.permute((2, 3, 0, 1)) # ? 49 | 50 | new_kv_shape = kv.shape[:-1] + (num_attention_heads, 2 * self.attention_head_size) 51 | kv = kv.view(new_kv_shape) 52 | kv = kv.permute((0, 2, 1, 3)) 53 | k, v = torch.chunk(kv, 2, dim=-1) 54 | k = k.permute((2, 3, 0, 1)) # ? 55 | v = v.permute((2, 3, 0, 1)) # ? 
56 | 
57 |         x = torch.matmul(q, k.transpose(-1, -2))
58 |         x = x / math.sqrt(self.attention_head_size)
59 |         x = self.softmax(x)
60 |         x = self.attention_dropout(x)
61 | 
62 |         x = torch.matmul(x, v)
63 |         x = x.transpose(1, 2)
64 |         new_context_layer_shape = x.size()[:-2] + (all_head_size,)
65 |         x = x.reshape(new_context_layer_shape)
66 |         x = x.transpose(0, 1)
67 | 
68 |         x = self.dense(x)
69 |         x = self.dropout(x)
70 | 
71 |         return x
72 | 
--------------------------------------------------------------------------------
/titans/layer/block/transformer_decoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 | from colossalai import nn as col_nn
5 | from titans.layer.attention import TransformerMultiHeadAttention
6 | from .utils import get_clones
7 | 
8 | class TransformerDecoderLayer(nn.Module):
9 | 
10 |     def __init__(self, hidden_size, nhead, dim_feedforward=2048, dropout=0.1):
11 |         super().__init__()
12 |         self.selfAttn = TransformerMultiHeadAttention(hidden_size, dim_feedforward, nhead, dropout)
13 | 
14 |         self.linear_1 = col_nn.Linear(hidden_size, dim_feedforward)
15 |         self.linear_2 = col_nn.Linear(dim_feedforward, hidden_size)
16 |         self.norm_1 = col_nn.LayerNorm(hidden_size)
17 |         self.norm_2 = col_nn.LayerNorm(hidden_size)
18 |         self.norm_3 = col_nn.LayerNorm(hidden_size)
19 |         self.dropout_1 = col_nn.Dropout(dropout)
20 |         self.dropout_2 = col_nn.Dropout(dropout)
21 |         self.dropout_3 = col_nn.Dropout(dropout)
22 |         self.dropout_4 = col_nn.Dropout(dropout)
23 | 
24 |     def with_pos_embed(self, tensor, pos):
25 |         return tensor if pos is None else tensor + pos
26 | 
27 |     def forward(self, tgt, memory, pos, query_pos):
28 |         tgt = tgt.transpose(0, 1)
29 |         query_pos = query_pos.transpose(0, 1)
30 |         pos = pos.transpose(0, 1)
31 | 
32 |         q = k = self.with_pos_embed(tgt, query_pos)
33 | 
34 |         tgt2 = self.selfAttn(q, k, tgt)
35 | 
36 |         tgt = tgt + self.dropout_1(tgt2)
37 |         tgt = self.norm_1(tgt)
38 |         tgt2 = self.selfAttn(q, self.with_pos_embed(memory, pos), memory)
39 |         tgt = tgt + self.dropout_2(tgt2)
40 |         tgt = self.norm_2(tgt)
41 |         tgt2 = self.linear_2(self.dropout_3(F.relu(self.linear_1(tgt))))
42 |         tgt = tgt + self.dropout_4(tgt2)
43 |         tgt = self.norm_3(tgt)
44 |         return tgt
45 | 
46 | 
47 | def transpose_qkv(X, num_heads):
48 |     X = X.reshape(X.shape[0], X.shape[1], num_heads, -1)
49 |     X = X.permute(0, 2, 1, 3)
50 |     return X.reshape(-1, X.shape[2], X.shape[3])
51 | 
52 | 
53 | def transpose_output(X, num_heads):
54 |     X = X.reshape(-1, num_heads, X.shape[1], X.shape[2])
55 |     X = X.permute(0, 2, 1, 3)
56 |     return X.reshape(X.shape[0], X.shape[1], -1)
57 | 
58 | 
59 | class TransformerDecoder(nn.Module):
60 | 
61 |     def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
62 |         super().__init__()
63 |         self.layers = get_clones(decoder_layer, num_layers)
64 |         self.num_layers = num_layers
65 |         self.norm = norm
66 |         self.return_intermediate = return_intermediate
67 | 
68 |     def forward(self, tgt, memory, pos, query_pos):
69 |         intermediate = []
70 | 
71 |         for layer in self.layers:
72 |             tgt = layer(tgt, memory, pos=pos, query_pos=query_pos).transpose(0, 1)
73 | 
74 |             if self.return_intermediate:
75 |                 intermediate.append(self.norm(tgt))
76 | 
77 |         return torch.stack(intermediate) if self.return_intermediate else tgt.unsqueeze(0)
78 | 
--------------------------------------------------------------------------------
/titans/decorator/no_support.py:
--------------------------------------------------------------------------------
1 | from typing import List, Union, Callable 2 | from
colossalai.global_variables import tensor_parallel_env as tp_env 3 | from colossalai.context.moe_context import MOE_CONTEXT 4 | from colossalai.core import global_context as gpc 5 | from colossalai.context import ParallelMode 6 | 7 | SUPPORTED_MODES = ['tp', 'pp', 'sp', 'moe'] 8 | 9 | 10 | def no_support(modes: Union[str, List[str]]): 11 | """ 12 | A decorator to indicate the forbidden parallel modes for the module. 13 | 14 | Args: 15 | modes (Union[str, List[str]]): the mode can only be tp (tensor parallel), 16 | pp (pipeline parallel), sp (sequence parallel), and moe (mixture-of-experts). 17 | 18 | Usage: 19 | # if this model does not support tensor parallel version 20 | @no_support('tp') 21 | class SomeModule(torch.nn.Module): 22 | ... 23 | 24 | # if this model does not support tp and pp 25 | @no_support(['tp', 'pp']) 26 | class SomeModule(torch.nn.Module): 27 | ... 28 | """ 29 | 30 | if isinstance(modes, str): 31 | assert modes in SUPPORTED_MODES, f'expected mode to be tp, pp, sp or moe, but got {modes}' 32 | modes = [modes] 33 | elif isinstance(modes, (tuple, list)): 34 | for mode in modes: 35 | assert mode in SUPPORTED_MODES, f'expected mode to be tp, pp, sp or moe, but got {mode}' 36 | else: 37 | raise TypeError(f'expected modes to be of type str or list, but got {type(modes)}') 38 | 39 | def _wrap_callable(callable_: Callable): 40 | assert hasattr(callable_, '__init__'), 'the wrapped callable must be a class' 41 | origin_init = callable_.__init__ 42 | class_name = callable_.__name__ 43 | 44 | def new_init(*args, **kwargs): 45 | if tp_env.mode is not None: 46 | assert 'tp' not in modes, f'{class_name} does not support tensor parallel implementation' 47 | 48 | if MOE_CONTEXT.is_initialized: 49 | assert 'moe' not in modes, f'{class_name} does not support MOE implementation' 50 | 51 | if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1: 52 | assert 'pp' not in modes, f'{class_name} does not support pipeline parallel implementation' 53 | 54 | if gpc.is_initialized(ParallelMode.SEQUENCE) and gpc.get_world_size(ParallelMode.SEQUENCE) > 1: 55 | assert 'sp' not in modes, f'{class_name} does not support sequence parallel implementation' 56 | 57 | origin_init(*args, **kwargs) 58 | 59 | callable_.__init__ = new_init 60 | 61 | return callable_ 62 | 63 | return _wrap_callable 64 | 65 | 66 | def support_tp_pp_only(): 67 | return no_support(['moe', 'sp']) 68 | 69 | 70 | def support_sp_pp_only(): 71 | return no_support(['moe', 'tp']) 72 | 73 | 74 | def support_moe_only(): 75 | return no_support(['tp', 'sp', 'pp']) 76 | 77 | 78 | def no_parallel_support(): 79 | return no_support(['tp', 'pp', 'sp', 'moe']) 80 | -------------------------------------------------------------------------------- /titans/model/moe/widenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from colossalai.context import ParallelMode 5 | from colossalai.nn.layer import VanillaPatchEmbedding, VanillaClassifier, \ 6 | WrappedDropout as Dropout, WrappedDropPath as DropPath 7 | from colossalai.nn.layer.moe import build_ffn_experts, MoeLayer, Top2Router, NormalNoiseGenerator 8 | from .util import moe_sa_args 9 | from ..helper import TransformerLayer 10 | from colossalai.context.moe_context import MOE_CONTEXT 11 | 12 | from titans.layer.attention import SelfAttentionForMoe 13 | 14 | 15 | class Widenet(nn.Module): 16 | 17 | def __init__(self, 18 | num_experts: int, 19 |
capacity_factor_train: float = 1.25, 20 | capacity_factor_eval: float = 2.0, 21 | drop_tks: bool = True, 22 | img_size: int = 224, 23 | patch_size: int = 16, 24 | in_chans: int = 3, 25 | num_classes: int = 1000, 26 | depth: int = 12, 27 | hidden_size: int = 768, 28 | num_heads: int = 12, 29 | d_kv: int = 64, 30 | d_ff: int = 4096, 31 | attention_drop: float = 0., 32 | drop_rate: float = 0.1, 33 | drop_path: float = 0.): 34 | super().__init__() 35 | 36 | self.embedding = VanillaPatchEmbedding(img_size=img_size, 37 | patch_size=patch_size, 38 | in_chans=in_chans, 39 | embed_size=hidden_size) 40 | self.embed_dropout = Dropout(p=drop_rate, mode=ParallelMode.TENSOR) 41 | 42 | shared_sa = SelfAttentionForMoe(**moe_sa_args( 43 | hidden_size=hidden_size, n_heads=num_heads, d_kv=d_kv, attention_drop=attention_drop, drop_rate=drop_rate)) 44 | 45 | noisy_func = NormalNoiseGenerator(num_experts) 46 | shared_router = Top2Router(capacity_factor_train=capacity_factor_train, 47 | capacity_factor_eval=capacity_factor_eval, 48 | noisy_func=noisy_func, 49 | drop_tks=drop_tks) 50 | shared_experts = build_ffn_experts(num_experts, hidden_size, d_ff, drop_rate=drop_rate) 51 | 52 | # stochastic depth decay rule 53 | dpr = [x.item() for x in torch.linspace(0, drop_path, depth)] 54 | blocks = [ 55 | TransformerLayer(att=shared_sa, 56 | ffn=MoeLayer(dim_model=hidden_size, 57 | num_experts=num_experts, 58 | router=shared_router, 59 | experts=shared_experts), 60 | norm1=nn.LayerNorm(hidden_size, eps=1e-6), 61 | norm2=nn.LayerNorm(hidden_size, eps=1e-6), 62 | droppath=DropPath(p=dpr[i], mode=ParallelMode.TENSOR)) for i in range(depth) 63 | ] 64 | 65 | self.blocks = nn.ModuleList(blocks) 66 | self.norm = nn.LayerNorm(hidden_size, eps=1e-6) 67 | self.linear = VanillaClassifier(in_features=hidden_size, num_classes=num_classes) 68 | nn.init.zeros_(self.linear.weight) 69 | nn.init.zeros_(self.linear.bias) 70 | 71 | def forward(self, x): 72 | MOE_CONTEXT.reset_loss() 73 | 74 | x = self.embedding(x) 75 | x = self.embed_dropout(x) 76 | 77 | y = 0 78 | for block in self.blocks: 79 | x, y = block(x, y) 80 | 81 | x = self.norm(x) 82 | x = torch.mean(x, dim=1) 83 | x = self.linear(x) 84 | 85 | MOE_CONTEXT.add_loss(y) 86 | return x 87 | -------------------------------------------------------------------------------- /titans/layer/attention/gpt_attention.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch import dtype, nn 5 | 6 | from colossalai import nn as col_nn 7 | from colossalai.nn.layer.utils import divide 8 | from colossalai.utils import get_current_device 9 | from titans.decorator import no_support 10 | 11 | 12 | @no_support(['sp']) 13 | class GPTSelfAttention(nn.Module): 14 | 15 | def __init__(self, 16 | hidden_size: int, 17 | num_heads: int, 18 | attention_dropout: float, 19 | dropout: float, 20 | bias: bool = True, 21 | fuse_scale_mask_softmax: bool = False, 22 | dtype: dtype = None) -> None: 23 | super().__init__() 24 | self.fuse_scale_mask_softmax = fuse_scale_mask_softmax 25 | self.attention_head_size = divide(hidden_size, num_heads) 26 | self.query_key_value = col_nn.Linear(hidden_size, 3 * hidden_size, dtype=dtype, bias=bias) 27 | if fuse_scale_mask_softmax: 28 | from colossalai.kernel import FusedScaleMaskSoftmax 29 | from colossalai.kernel.cuda_native.scaled_softmax import \ 30 | AttnMaskType 31 | self.softmax = FusedScaleMaskSoftmax(input_in_fp16=True, 32 | input_in_bf16=False, 33 | attn_mask_type=AttnMaskType.causal, 34 | 
scaled_masked_softmax_fusion=True, 35 | mask_func=None, 36 | softmax_in_fp32=True, 37 | scale=math.sqrt(self.attention_head_size)) 38 | else: 39 | self.softmax = nn.Softmax(dim=-1) 40 | self.attention_dropout = col_nn.Dropout(attention_dropout) 41 | self.dense = col_nn.Linear(hidden_size, hidden_size, dtype=dtype, bias=True) 42 | self.dropout = col_nn.Dropout(dropout) 43 | 44 | def forward(self, x, attention_mask=None): 45 | # the size of x is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 46 | # the size of qkv is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE*3) 47 | qkv = self.query_key_value(x) 48 | all_head_size = qkv.shape[-1] // 3 49 | num_attention_heads = divide(all_head_size, self.attention_head_size) 50 | new_qkv_shape = qkv.shape[:-1] + \ 51 | (num_attention_heads, 3 * self.attention_head_size) 52 | qkv = qkv.view(new_qkv_shape) 53 | qkv = qkv.permute((0, 2, 1, 3)) 54 | # the size of q is (BATCH_SZIE, NUM_HEADS, SEQ_LEN, HIDDEN_SIZE//NUM_HEADS) 55 | q, k, v = torch.chunk(qkv, 3, dim=-1) 56 | # the size of x after matmul is (BATCH_SIZE, NUM_HEADS, SEQ_LEN, SEQ_LEN) 57 | x = torch.matmul(q, k.transpose(-1, -2)) 58 | 59 | if self.fuse_scale_mask_softmax: 60 | x = self.softmax(x, attention_mask) 61 | else: 62 | x = x / math.sqrt(self.attention_head_size) 63 | # causal mask 64 | q_len, k_len = q.size(-2), k.size(-2) 65 | causal_mask = torch.tril(torch.ones((q_len, k_len), dtype=torch.uint8, 66 | device=get_current_device())).view(1, 1, q_len, k_len).bool() 67 | x = torch.where(causal_mask, x, torch.tensor(-1e4, dtype=x.dtype, device=get_current_device())) 68 | if attention_mask is not None: 69 | x = x + attention_mask 70 | x = self.softmax(x) 71 | 72 | x = self.attention_dropout(x) 73 | 74 | # the size of x after matmul is (BATCH_SZIE, NUM_HEADS, SEQ_LEN, HIDDEN_SIZE//NUM_HEADS) 75 | x = torch.matmul(x, v) 76 | x = x.transpose(1, 2) 77 | new_context_layer_shape = x.size()[:-2] + (all_head_size,) 78 | # the size of x after reshape is (BATCH_SZIE, SEQ_LEN, HIDDEN_SIZE) 79 | x = x.reshape(new_context_layer_shape) 80 | # the size of x after dense is (BATCH_SZIE, SEQ_LEN, HIDDEN_SIZE) 81 | x = self.dense(x) 82 | x = self.dropout(x) 83 | 84 | return x 85 | -------------------------------------------------------------------------------- /titans/loss/vocab_cross_entropy/vocab_cross_entropy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn as nn, Tensor, distributed as dist 3 | from torch.nn import functional as F 4 | import torch.nn.init as init 5 | from torch.nn.parameter import Parameter 6 | 7 | from colossalai.context import ParallelMode 8 | from colossalai.core import global_context as gpc 9 | 10 | from titans.utils import VocabUtility 11 | 12 | 13 | class vocab_parallel_cross_entropy(nn.Module): 14 | 15 | def __init__(self): 16 | super().__init__() 17 | 18 | def forward(self, vocab_parallel_logits, target): 19 | """Helper function for the cross entropy.""" 20 | vocab_parallel_logits = vocab_parallel_logits[..., :-1, :].contiguous() 21 | target = target[..., 1:].contiguous() 22 | return _VocabParallelCrossEntropy.apply(vocab_parallel_logits.view(-1, vocab_parallel_logits.size(-1)), 23 | target.view(-1)) 24 | 25 | 26 | class _VocabParallelCrossEntropy(torch.autograd.Function): 27 | 28 | @staticmethod 29 | def forward(ctx, vocab_parallel_logits, target): 30 | 31 | # Maximum value along vocab dimension across all GPUs. 
32 | logits_max = torch.max(vocab_parallel_logits, dim=-1)[0] 33 | torch.distributed.all_reduce(logits_max, 34 | op=torch.distributed.ReduceOp.MAX, 35 | group=gpc.get_group(ParallelMode.PARALLEL_1D)) 36 | # Subtract the maximum value. 37 | vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1)) 38 | 39 | # Get the partition's vocab indecies 40 | get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size 41 | partition_vocab_size = vocab_parallel_logits.size()[-1] 42 | rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D) 43 | world_size = gpc.tensor_parallel_size 44 | vocab_start_index, vocab_end_index = get_vocab_range(partition_vocab_size, rank, world_size) 45 | 46 | # Create a mask of valid vocab ids (1 means it needs to be masked). 47 | target_mask = (target < vocab_start_index) | (target >= vocab_end_index) 48 | masked_target = target.clone() - vocab_start_index 49 | masked_target[target_mask] = 0 50 | 51 | # Get predicted-logits = logits[target]. 52 | # For Simplicity, we convert logits to a 2-D tensor with size 53 | # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. 54 | logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size) 55 | masked_target_1d = masked_target.view(-1) 56 | arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device) 57 | predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] 58 | predicted_logits_1d = predicted_logits_1d.clone().contiguous() 59 | predicted_logits = predicted_logits_1d.view_as(target) 60 | predicted_logits[target_mask] = 0.0 61 | # All reduce is needed to get the chunks from other GPUs. 62 | torch.distributed.all_reduce(predicted_logits, 63 | op=torch.distributed.ReduceOp.SUM, 64 | group=gpc.get_group(ParallelMode.PARALLEL_1D)) 65 | 66 | # Sum of exponential of logits along vocab dimension across all GPUs. 67 | exp_logits = vocab_parallel_logits 68 | torch.exp(vocab_parallel_logits, out=exp_logits) 69 | sum_exp_logits = exp_logits.sum(dim=-1) 70 | torch.distributed.all_reduce(sum_exp_logits, 71 | op=torch.distributed.ReduceOp.SUM, 72 | group=gpc.get_group(ParallelMode.PARALLEL_1D)) 73 | 74 | # Loss = log(sum(exp(logits))) - predicted-logit. 75 | loss = torch.log(sum_exp_logits) - predicted_logits 76 | loss = loss.mean() 77 | # Store softmax, target-mask and masked-target for backward pass. 78 | exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) 79 | ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) 80 | return loss 81 | 82 | @staticmethod 83 | def backward(ctx, grad_output): 84 | 85 | # Retreive tensors from the forward path. 86 | softmax, target_mask, masked_target_1d = ctx.saved_tensors 87 | 88 | # All the inputs have softmax as thier gradient. 89 | grad_input = softmax 90 | # For simplicity, work with the 2D gradient. 91 | partition_vocab_size = softmax.size()[-1] 92 | grad_2d = grad_input.view(-1, partition_vocab_size) 93 | 94 | # Add the gradient from matching classes. 95 | arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) 96 | grad_2d[arange_1d, masked_target_1d] -= (1.0 - target_mask.view(-1).float()) 97 | 98 | # Finally elementwise multiplication with the output gradients. 
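# At this point grad_input holds softmax(logits) with 1 subtracted at the
# locally-owned target entries, i.e. the usual cross-entropy gradient
#     d(loss)/d(logit_i) = softmax_i - 1[i == target],
# restricted to this rank's vocabulary shard; the multiplication by
# grad_output below completes the chain rule.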
99 | grad_input.mul_(grad_output.unsqueeze(dim=-1)) 100 | 101 | return grad_input, None 102 | -------------------------------------------------------------------------------- /titans/layer/block/detr_block.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Callable 3 | 4 | import torch 5 | from colossalai import nn as col_nn 6 | from colossalai.nn.layer.utils import CheckpointModule 7 | from torch import dtype, nn 8 | 9 | from titans.layer.attention import ViTSelfAttention, DeTrCrossAttention 10 | from titans.layer.mlp import ViTMLP 11 | from titans.decorator import support_tp_pp_only 12 | 13 | 14 | @support_tp_pp_only() 15 | class DeTrEncoder(CheckpointModule): 16 | 17 | def __init__(self, 18 | hidden_size: int, 19 | num_heads: int, 20 | mlp_ratio: int, 21 | activation: Callable, 22 | attention_dropout: float = 0., 23 | dropout: float = 0., 24 | drop_path: float = 0., 25 | layernorm_epsilon: float = 1e-6, 26 | dtype: dtype = None, 27 | bias: bool = True, 28 | checkpoint: bool = False, 29 | init_method: str = 'torch'): 30 | super().__init__(checkpoint) 31 | self.norm1 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 32 | self.attn = ViTSelfAttention(hidden_size=hidden_size, 33 | num_heads=num_heads, 34 | attention_dropout=attention_dropout, 35 | dropout=dropout, 36 | bias=bias, 37 | dtype=dtype, 38 | init_method=init_method) 39 | self.drop_path = col_nn.DropPath(drop_path) if drop_path > 0. else nn.Identity() 40 | self.norm2 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 41 | self.mlp = ViTMLP(hidden_size=hidden_size, 42 | mlp_ratio=mlp_ratio, 43 | activation=activation, 44 | dropout=dropout, 45 | dtype=dtype, 46 | bias=bias, 47 | init_method=init_method) 48 | 49 | def _forward(self, x): 50 | x = x + self.drop_path(self.norm1(self.attn(x))) 51 | x = x + self.drop_path(self.norm2(self.mlp(x))) 52 | return x 53 | 54 | 55 | @support_tp_pp_only() 56 | class DeTrDecoder(CheckpointModule): 57 | 58 | def __init__(self, 59 | hidden_size: int, 60 | num_heads: int, 61 | mlp_ratio: int, 62 | activation: Callable, 63 | attention_dropout: float = 0., 64 | dropout: float = 0., 65 | drop_path: float = 0., 66 | layernorm_epsilon: float = 1e-6, 67 | dtype: dtype = None, 68 | bias: bool = True, 69 | checkpoint: bool = False, 70 | init_method: str = 'torch'): 71 | super().__init__(checkpoint) 72 | self.norm1 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 73 | self.norm2 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 74 | self.norm3 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 75 | 76 | self.attn1 = ViTSelfAttention(hidden_size=hidden_size, 77 | num_heads=num_heads, 78 | attention_dropout=attention_dropout, 79 | dropout=dropout, 80 | bias=bias, 81 | dtype=dtype, 82 | init_method=init_method) 83 | 84 | self.attn2 = DeTrCrossAttention(hidden_size=hidden_size, 85 | num_heads=num_heads, 86 | attention_dropout=attention_dropout, 87 | dropout=dropout, 88 | bias=bias, 89 | dtype=dtype, 90 | init_method=init_method) 91 | 92 | self.drop_path = col_nn.DropPath(drop_path) if drop_path > 0. 
else nn.Identity() 93 | 94 | self.mlp = ViTMLP(hidden_size=hidden_size, 95 | mlp_ratio=mlp_ratio, 96 | activation=activation, 97 | dropout=dropout, 98 | dtype=dtype, 99 | bias=bias, 100 | init_method=init_method) 101 | 102 | def _forward(self, x, memory): 103 | x = x + self.drop_path(self.norm1(self.attn1(x))) 104 | x = x + self.drop_path(self.norm2(self.attn2(x, memory))) 105 | x = x + self.drop_path(self.mlp(self.norm3(x))) 106 | return x 107 | -------------------------------------------------------------------------------- /titans/model/moe/vit_moe.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from colossalai.context import ParallelMode 5 | from colossalai.nn.layer import VanillaPatchEmbedding, VanillaClassifier, \ 6 | WrappedDropout as Dropout, WrappedDropPath as DropPath 7 | from colossalai.nn.layer.moe import build_ffn_experts, MoeModule 8 | from .util import moe_sa_args, moe_mlp_args 9 | from ..helper import TransformerLayer 10 | from colossalai.context.moe_context import MOE_CONTEXT 11 | 12 | from typing import List 13 | from titans.layer.mlp import MLPForMoe 14 | from titans.layer.attention import SelfAttentionForMoe 15 | 16 | 17 | class ViTMoE(nn.Module): 18 | 19 | def __init__(self, 20 | num_experts: int or List[int], 21 | use_residual: bool = False, 22 | capacity_factor_train: float = 1.25, 23 | capacity_factor_eval: float = 2.0, 24 | drop_tks: bool = True, 25 | img_size: int = 224, 26 | patch_size: int = 16, 27 | in_chans: int = 3, 28 | num_classes: int = 1000, 29 | depth: int = 12, 30 | hidden_size: int = 768, 31 | num_heads: int = 12, 32 | d_kv: int = 64, 33 | d_ff: int = 3072, 34 | attention_drop: float = 0., 35 | drop_rate: float = 0.1, 36 | drop_path: float = 0., 37 | checkpoint: bool = False): 38 | super().__init__() 39 | 40 | assert depth % 2 == 0, "The number of layers should be even right now" 41 | 42 | if isinstance(num_experts, list): 43 | assert len(num_experts) == depth // 2, \ 44 | "The length of num_experts should equal to the number of MOE layers" 45 | num_experts_list = num_experts 46 | else: 47 | num_experts_list = [num_experts] * (depth // 2) 48 | 49 | self.embedding = VanillaPatchEmbedding(img_size=img_size, 50 | patch_size=patch_size, 51 | in_chans=in_chans, 52 | embed_size=hidden_size) 53 | self.embed_dropout = Dropout(p=drop_rate, mode=ParallelMode.TENSOR) 54 | 55 | # stochastic depth decay rule 56 | dpr = [x.item() for x in torch.linspace(0, drop_path, depth)] 57 | blocks = [] 58 | for i in range(depth): 59 | sa = SelfAttentionForMoe(**moe_sa_args(hidden_size=hidden_size, 60 | n_heads=num_heads, 61 | d_kv=d_kv, 62 | attention_drop=attention_drop, 63 | drop_rate=drop_rate)) 64 | 65 | if i % 2 == 0: 66 | ffn = MLPForMoe(**moe_mlp_args(hidden_size=hidden_size, d_ff=d_ff, drop_rate=drop_rate)) 67 | else: 68 | num_experts = num_experts_list[i // 2] 69 | experts = build_ffn_experts(num_experts, hidden_size, d_ff, drop_rate=drop_rate) 70 | ffn = MoeModule(dim_model=hidden_size, 71 | num_experts=num_experts, 72 | top_k=1 if use_residual else 2, 73 | capacity_factor_train=capacity_factor_train, 74 | capacity_factor_eval=capacity_factor_eval, 75 | noisy_policy='Jitter' if use_residual else 'Gaussian', 76 | drop_tks=drop_tks, 77 | use_residual=use_residual, 78 | expert_instance=experts, 79 | expert_cls=MLPForMoe, 80 | **moe_mlp_args(hidden_size=hidden_size, d_ff=d_ff, drop_rate=drop_rate)) 81 | 82 | layer = TransformerLayer(att=sa, 83 | ffn=ffn, 84 | 
norm1=nn.LayerNorm(hidden_size, eps=1e-6), 85 | norm2=nn.LayerNorm(hidden_size, eps=1e-6), 86 | droppath=DropPath(p=dpr[i], mode=ParallelMode.TENSOR), 87 | checkpoint=checkpoint) 88 | blocks.append(layer) 89 | 90 | self.blocks = nn.ModuleList(blocks) 91 | self.norm = nn.LayerNorm(hidden_size, eps=1e-6) 92 | self.linear = VanillaClassifier(in_features=hidden_size, num_classes=num_classes) 93 | nn.init.zeros_(self.linear.weight) 94 | nn.init.zeros_(self.linear.bias) 95 | 96 | def forward(self, x): 97 | MOE_CONTEXT.reset_loss() 98 | 99 | x = self.embedding(x) 100 | x = self.embed_dropout(x) 101 | 102 | y = 0 103 | for block in self.blocks: 104 | x, y = block(x, y) 105 | 106 | x = self.norm(x) 107 | x = torch.mean(x, dim=1) 108 | x = self.linear(x) 109 | 110 | MOE_CONTEXT.add_loss(y) 111 | return x 112 | -------------------------------------------------------------------------------- /titans/model/detr/detr.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Callable 3 | 4 | import torch 5 | from colossalai import nn as col_nn 6 | from colossalai.nn.layer.utils import CheckpointModule 7 | from torch import dtype, nn 8 | from torchvision.models import resnet50 9 | 10 | from titans.layer.embedding import ViTEmbedding 11 | # from titans.layer.head import DeTrHead 12 | from titans.layer.mlp import DeTrMLP 13 | from titans.layer.block import DeTrEncoder, DeTrDecoder 14 | from titans.decorator import no_support 15 | 16 | __all__ = [ 17 | 'DeTr', 18 | 'detr_1', 19 | ] 20 | 21 | 22 | @no_support(['sp', 'moe']) 23 | class DeTr(nn.Module): 24 | 25 | def __init__(self, 26 | img_size: int = 224, 27 | patch_size: int = 16, 28 | in_chans: int = 3, 29 | num_classes: int = 91, 30 | num_encoder_layer: int = 6, 31 | num_decoder_layer: int = 6, 32 | num_heads: int = 12, 33 | num_queries: int = 100, 34 | hidden_size: int = 256, 35 | mlp_ratio: int = 4, 36 | attention_dropout: float = 0., 37 | dropout: float = 0.1, 38 | drop_path: float = 0., 39 | layernorm_epsilon: float = 1e-6, 40 | activation: Callable = nn.functional.gelu, 41 | representation_size: int = None, 42 | dtype: dtype = None, 43 | bias: bool = True, 44 | checkpoint: bool = False, 45 | init_method: str = 'torch'): 46 | super().__init__() 47 | 48 | # self.embed = ViTEmbedding(img_size=img_size, 49 | # patch_size=patch_size, 50 | # in_chans=in_chans, 51 | # embedding_dim=hidden_size, 52 | # dropout=dropout, 53 | # dtype=dtype, 54 | # init_method=init_method) 55 | 56 | self.backbone = nn.Sequential(*list(resnet50(pretrained=True).children())[:-2]) 57 | self.conv = nn.Conv2d(2048, hidden_size, 1) 58 | 59 | # stochastic depth decay rule 60 | dpr1 = [x.item() for x in torch.linspace(0, drop_path, num_encoder_layer)] 61 | self.blocks1 = nn.ModuleList([ 62 | DeTrEncoder( 63 | hidden_size=hidden_size, 64 | num_heads=num_heads, 65 | mlp_ratio=mlp_ratio, 66 | attention_dropout=attention_dropout, 67 | dropout=dropout, 68 | drop_path=dpr1[i], 69 | activation=activation, 70 | dtype=dtype, 71 | bias=bias, 72 | checkpoint=checkpoint, 73 | init_method=init_method, 74 | ) for i in range(num_encoder_layer) 75 | ]) 76 | 77 | dpr2 = [x.item() for x in torch.linspace(0, drop_path, num_decoder_layer)] 78 | self.blocks2 = nn.ModuleList([ 79 | DeTrDecoder( 80 | hidden_size=hidden_size, 81 | num_heads=num_heads, 82 | mlp_ratio=mlp_ratio, 83 | attention_dropout=attention_dropout, 84 | dropout=dropout, 85 | drop_path=dpr2[i], 86 | activation=activation, 87 | dtype=dtype, 88 | bias=bias, 89 | 
checkpoint=checkpoint, 90 | init_method=init_method, 91 | ) for i in range(num_decoder_layer) 92 | ]) 93 | 94 | self.norm = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 95 | 96 | self.class_embed = nn.Linear(hidden_size, num_classes + 1) 97 | self.bbox_embed = DeTrMLP(hidden_size, hidden_size, 4, 3) 98 | self.query_embed = nn.Embedding(num_queries, hidden_size) 99 | 100 | self.query_pos = nn.Parameter(torch.rand(100, hidden_size)) 101 | self.row_embed = nn.Parameter(torch.rand(50, hidden_size // 2)) 102 | self.col_embed = nn.Parameter(torch.rand(50, hidden_size // 2)) 103 | 104 | def forward(self, x): 105 | x = self.backbone(x) 106 | h = self.conv(x) 107 | H, W = h.shape[-2:] 108 | pos = torch.cat([ 109 | self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1), 110 | self.row_embed[:H].unsqueeze(1).repeat(1, W, 1), 111 | ], dim=-1).flatten(0, 1).unsqueeze(1) 112 | 113 | memory = pos + h.flatten(2).permute(2, 0, 1) 114 | for block in self.blocks1: 115 | memory = block(memory) 116 | x = self.query_pos.unsqueeze(1) 117 | for block in self.blocks2: 118 | x = block(x, memory) 119 | 120 | x = self.norm(x) 121 | outputs_class = self.class_embed(x) 122 | outputs_coord = self.bbox_embed(x).sigmoid() 123 | out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]} 124 | # if self.aux_loss: 125 | # out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord) 126 | # return out # not dict 127 | return outputs_class # temp 128 | 129 | 130 | 131 | 132 | def _create_detr_model(**model_kwargs): 133 | model = DeTr(**model_kwargs) 134 | return model 135 | 136 | 137 | def detr_1(**kwargs): 138 | model_kwargs = dict(img_size=32, patch_size=4, hidden_size=256, num_heads=4, mlp_ratio=2, num_classes=10, **kwargs) 139 | return _create_detr_model(**model_kwargs) 140 | 141 | -------------------------------------------------------------------------------- /titans/layer/block/gpt_block.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | from torch import dtype 4 | 5 | from colossalai import nn as col_nn 6 | from colossalai.nn.layer.utils import CheckpointModule 7 | from colossalai.nn.layer import MoeModule 8 | 9 | from titans.layer.attention import GPTSelfAttention 10 | 11 | from titans.decorator import support_tp_pp_only 12 | from titans.layer.mlp import TransformerMLP 13 | 14 | 15 | class GPTBlock(CheckpointModule): 16 | 17 | def __init__(self, 18 | hidden_size: int, 19 | num_heads: int, 20 | mlp_ratio: float, 21 | activation: Callable, 22 | attention_dropout: float = 0., 23 | dropout: float = 0., 24 | layernorm_epsilon: float = 1e-5, 25 | dtype: dtype = None, 26 | bias: bool = True, 27 | apply_post_layernorm: bool = False, 28 | fuse_scale_mask_softmax: bool = False, 29 | checkpoint: bool = False, 30 | activation_offload: bool = False): 31 | super().__init__(checkpoint, activation_offload) 32 | self.apply_post_layernorm = apply_post_layernorm 33 | self.norm1 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 34 | self.attn = GPTSelfAttention(hidden_size=hidden_size, 35 | num_heads=num_heads, 36 | attention_dropout=attention_dropout, 37 | dropout=dropout, 38 | bias=bias, 39 | fuse_scale_mask_softmax=fuse_scale_mask_softmax, 40 | dtype=dtype) 41 | self.norm2 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 42 | self.mlp =
TransformerMLP(hidden_size=hidden_size, 43 | mlp_ratio=mlp_ratio, 44 | activation=activation, 45 | dropout_prob=dropout, 46 | dtype=dtype, 47 | bias=bias) 48 | 49 | def _forward(self, x, attention_mask=None): 50 | if attention_mask is not None and attention_mask.dtype != x.dtype: 51 | attention_mask = attention_mask.to(x.dtype) 52 | if not self.apply_post_layernorm: 53 | residual = x 54 | x = self.norm1(x) 55 | if self.apply_post_layernorm: 56 | residual = x 57 | x = residual + self.attn(x, attention_mask) 58 | 59 | if not self.apply_post_layernorm: 60 | residual = x 61 | x = self.norm2(x) 62 | if self.apply_post_layernorm: 63 | residual = x 64 | x = residual + self.mlp(x) 65 | 66 | return x, attention_mask 67 | 68 | 69 | class MOEGPTBlock(CheckpointModule): 70 | 71 | def __init__(self, 72 | num_experts: int, 73 | hidden_size: int, 74 | num_heads: int, 75 | mlp_ratio: float, 76 | activation: Callable, 77 | capacity_factor_train: float = 1.0, 78 | capacity_factor_eval: float = 1.0, 79 | use_residual: bool = False, 80 | attention_dropout: float = 0., 81 | dropout: float = 0., 82 | layernorm_epsilon: float = 1e-5, 83 | dtype: dtype = None, 84 | bias: bool = True, 85 | apply_post_layernorm: bool = False, 86 | fuse_scale_mask_softmax: bool = False, 87 | checkpoint: bool = False): 88 | super().__init__(checkpoint) 89 | self.apply_post_layernorm = apply_post_layernorm 90 | self.norm1 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 91 | self.attn = GPTSelfAttention(hidden_size=hidden_size, 92 | num_heads=num_heads, 93 | attention_dropout=attention_dropout, 94 | dropout=dropout, 95 | bias=bias, 96 | fuse_scale_mask_softmax=fuse_scale_mask_softmax, 97 | dtype=dtype) 98 | self.norm2 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 99 | 100 | mpl_factory_dict = dict(hidden_size=hidden_size, 101 | mlp_ratio=mlp_ratio, 102 | activation=activation, 103 | dtype=dtype, 104 | bias=bias) 105 | 106 | self.mlp = MoeModule(dim_model=hidden_size, 107 | num_experts=num_experts, 108 | top_k=1, 109 | capacity_factor_train=capacity_factor_train, 110 | capacity_factor_eval=capacity_factor_eval, 111 | noisy_policy='Jitter', 112 | use_residual=use_residual, 113 | expert_cls=TransformerMLP, 114 | **mpl_factory_dict) 115 | 116 | def _forward(self, x, y, attention_mask=None): 117 | if not self.apply_post_layernorm: 118 | residual = x 119 | x = self.norm1(x) 120 | if self.apply_post_layernorm: 121 | residual = x 122 | x = residual + self.attn(x, attention_mask) 123 | 124 | if not self.apply_post_layernorm: 125 | residual = x 126 | x = self.norm2(x) 127 | if self.apply_post_layernorm: 128 | residual = x 129 | x, z = self.mlp(x) 130 | 131 | x = residual + x 132 | y = y + z 133 | 134 | return x, y, attention_mask 135 | -------------------------------------------------------------------------------- /titans/model/vilt/vilt.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | import torch 3 | from colossalai import nn as col_nn 4 | from torch import dtype, nn 5 | 6 | from titans.layer.embedding import ViTEmbedding 7 | from titans.layer.block import ViTBlock 8 | from utils import heads, objectives 9 | import torch.nn.functional as F 10 | from colossalai.nn.layer.colossalai_layer import LayerNorm 11 | from transformers.models.bert.modeling_bert import BertConfig, BertEmbeddings 12 | 13 | 14 | class ViLT(nn.Module): 15 | 16 | def __init__( 17 | self, 18 | config, 19 | img_size: int = 384, 20 
| patch_size: int = 16, 21 | in_chans: int = 3, 22 | num_classes: int = 1000, 23 | depth: int = 12, 24 | num_heads: int = 12, 25 | hidden_size: int = 768, 26 | mlp_ratio: int = 4, 27 | attention_dropout: float = 0., 28 | dropout: float = 0.1, 29 | dropout_prob=0.1, 30 | drop_path: float = 0., 31 | init_std=0.02, 32 | layernorm_epsilon: float = 1e-6, 33 | activation: Callable = nn.functional.gelu, 34 | representation_size: int = None, 35 | convert_fp16_to_fp32_in_softmax=False, 36 | dtype: dtype = None, 37 | bias: bool = True, 38 | checkpoint: bool = False, 39 | init_method: str = 'torch', 40 | first_stage=True, 41 | last_stage=True, 42 | start_idx=0, 43 | end_idx=None, 44 | ): 45 | 46 | super().__init__() 47 | max_sequence_length = config["max_text_len"] 48 | num_layers = config["num_layers"] 49 | vocab_size = config["vocab_size"] 50 | self.vocab_size = vocab_size 51 | hidden_size = config["hidden_size"] 52 | self.first_stage = first_stage 53 | self.last_stage = last_stage 54 | self.init_std = init_std 55 | self.num_layers = num_layers 56 | 57 | bert_config = BertConfig( 58 | vocab_size=vocab_size, 59 | hidden_size=hidden_size, 60 | num_hidden_layers=num_layers, 61 | num_attention_heads=num_heads, 62 | intermediate_size=hidden_size * mlp_ratio, 63 | max_position_embeddings=max_sequence_length, 64 | hidden_dropout_prob=dropout, 65 | attention_probs_dropout_prob=dropout, 66 | ) 67 | 68 | self.pooler = heads.Pooler(hidden_size) 69 | self.token_type_embeddings = nn.Embedding(2, hidden_size) 70 | self.token_type_embeddings.apply(objectives.init_weights) 71 | self.text_embedding = BertEmbeddings(bert_config) 72 | self.vis_embedding = ViTEmbedding(img_size=img_size, 73 | patch_size=patch_size, 74 | in_chans=in_chans, 75 | embedding_dim=hidden_size, 76 | dropout=dropout, 77 | dtype=dtype, 78 | init_method=init_method) 79 | 80 | # stochastic depth decay rule 81 | dpr = [x.item() for x in torch.linspace(0, drop_path, depth)] 82 | blocks = [ 83 | ViTBlock( 84 | hidden_size=hidden_size, 85 | num_heads=num_heads, 86 | mlp_ratio=mlp_ratio, 87 | attention_dropout=attention_dropout, 88 | dropout=dropout, 89 | drop_path=dpr[i], 90 | activation=activation, 91 | dtype=dtype, 92 | bias=bias, 93 | checkpoint=checkpoint, 94 | init_method=init_method, 95 | ) for i in range(depth) 96 | ] 97 | norm = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 98 | 99 | if self.last_stage: 100 | self.mlm_score = heads.MLMHead(bert_config) 101 | self.mlm_score.apply(objectives.init_weights) 102 | 103 | self.layer_norm = LayerNorm(hidden_size) 104 | 105 | layers = [] 106 | layers.extend(blocks) 107 | layers.extend([norm]) 108 | self.layers = nn.Sequential(*layers) 109 | # self.layers = build_pipeline_model(self.layers, num_chunks=1, verbose=True) 110 | 111 | def infer(self, x, image_token_type_idx=1): 112 | do_mlm = "_mlm" 113 | if f"image_{image_token_type_idx - 1}" in x: 114 | imgkey = f"image_{image_token_type_idx - 1}" 115 | else: 116 | imgkey = "image" 117 | img = x[imgkey] 118 | text_ids = x[f"text_ids{do_mlm}"] 119 | text_labels = x[f"text_labels{do_mlm}"] 120 | image_embeds = self.vis_embedding(img) 121 | text_embeds = self.text_embedding(text_ids) 122 | co_embeds = torch.cat([text_embeds, image_embeds], dim=1) 123 | x = co_embeds 124 | x = self.layers(x) 125 | text_feats, image_feats = ( 126 | x[:, :text_embeds.shape[1]], 127 | x[:, text_embeds.shape[1]:], 128 | ) 129 | cls_feats = self.pooler(x) 130 | ret = { 131 | "text_feats": text_feats, 132 | "image_feats": image_feats, 133 | 
"cls_feats": cls_feats, 134 | "raw_cls_feats": x[:, 0], 135 | "text_labels": text_labels, 136 | "text_ids": text_ids, 137 | } 138 | return ret 139 | 140 | def forward(self, x): 141 | ret = dict() 142 | ret.update(self.compute_mlm(x)) 143 | return ret 144 | 145 | def compute_mlm(self, batch): 146 | infer = self.infer(batch) 147 | mlm_logits = self.mlm_score(infer["text_feats"]) 148 | mlm_labels = infer["text_labels"] 149 | 150 | mlm_loss = F.cross_entropy( 151 | mlm_logits.view(-1, self.vocab_size), 152 | mlm_labels.view(-1), 153 | ignore_index=-100, 154 | ) 155 | 156 | ret = { 157 | "mlm_loss": mlm_loss, 158 | "mlm_logits": mlm_logits, 159 | "mlm_labels": mlm_labels, 160 | "mlm_ids": infer["text_ids"], 161 | } 162 | 163 | return ret 164 | 165 | 166 | def get_current_device(): 167 | ''' 168 | Returns the index of a currently selected device (gpu/cpu). 169 | ''' 170 | if torch.cuda.is_available(): 171 | return torch.cuda.current_device() 172 | else: 173 | return 'cpu' 174 | -------------------------------------------------------------------------------- /titans/model/deepnet/deepnet.py: -------------------------------------------------------------------------------- 1 | from colossalai.context.parallel_mode import ParallelMode 2 | from typing import Callable 3 | import math 4 | from torch import dtype 5 | import torch.nn as nn 6 | import torch 7 | from colossalai import nn as col_nn 8 | from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper 9 | from colossalai.core import global_context as gpc 10 | import inspect 11 | from colossalai.pipeline.utils import partition_uniform 12 | from colossalai import kernel 13 | from colossalai.logging import get_dist_logger 14 | from titans.decorator import support_tp_pp_only 15 | from titans.layer.block import DeepNetBlock 16 | from titans.layer.embedding import GPTEmbedding 17 | from titans.layer.head import GPTLMHead 18 | from titans.layer.block import GPTBlock 19 | from titans.loss.lm_loss import GPTLMLoss 20 | 21 | __all__ = ['DeepNet', 'deepnet_small'] 22 | 23 | 24 | @support_tp_pp_only() 25 | class DeepNet(nn.Module): 26 | """The decoder-only DeepNet model is modified from the GPT model. 27 | 28 | Args: 29 | vocab_size(int): The size of dictionary, defaults to 50304. 30 | max_position_embeddings(int): The max value of positional embeddings, defaults to 1024. 31 | dim(int): Hidden size of the transformer blocks, defaults to 768. 32 | num_heads(int): The number of heads in transformer blocks, defaults to 12. 33 | depth(int): The number of transformer layers, defaults to 12. 34 | mlp_ratio(float): The ratio used in mlp layer, defaults to 4.0. 35 | dropout(float): The ratio used to construct dropout modules, which indicates the percentage of parameters should be casted to zero, defaults to 0.1. 36 | embedding_dropout(float): The ratio used to construct embedding dropout modules, which indicates the percentage of parameters should be casted to zero, defaults to 0.1. 37 | attention_dropout(float): The ratio used to construct attention dropout modules, which indicates the percentage of parameters should be casted to zero, defaults to 0.1. 38 | layernorm_epsilon(float): The argument used to construct layernorm modules, defaults to 1e-5. 39 | activation(Callable): The activation function used in model, defaults to nn.functional.gelu. 40 | padding_idx(int): The length to be padded for each batch, defaults to None. 41 | dtype (:class:`torch.dtype`): The dtype of parameters, defaults to None. 
42 | bias (bool): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``. 43 | fuse_scale_mask_softmax(bool): If set to "True", FuseScaleMaskSoftmax will be used in self-attention layer, defaults to ``False``. 44 | checkpoint(bool): If set to "True", checkpoint feature will be activated to save memory, defaults to ``False``. 45 | activation_offload(bool): If set to "True", offload feature will be activated during checkpointing, defaults to ``False``. 46 | """ 47 | 48 | def __init__(self, 49 | vocab_size: int = 50304, 50 | max_position_embeddings: int = 1024, 51 | hidden_size: int = 768, 52 | num_heads: int = 12, 53 | depth: int = 12, 54 | mlp_ratio: float = 4.0, 55 | dropout: float = 0.1, 56 | embedding_dropout: float = 0.1, 57 | attention_dropout: float = 0.1, 58 | layernorm_epsilon: float = 1e-5, 59 | activation: Callable = nn.functional.gelu, 60 | padding_idx: int = None, 61 | dtype: dtype = None, 62 | bias: bool = True, 63 | fuse_scale_mask_softmax: bool = False, 64 | checkpoint: bool = False, 65 | activation_offload: bool = False) -> None: 66 | super().__init__() 67 | self.embed = GPTEmbedding(embedding_dim=hidden_size, 68 | vocab_size=vocab_size, 69 | max_position_embeddings=max_position_embeddings, 70 | padding_idx=padding_idx, 71 | dropout=embedding_dropout, 72 | dtype=dtype) 73 | alpha = math.sqrt(2 * depth) 74 | self.blocks = nn.ModuleList([ 75 | DeepNetBlock(hidden_size=hidden_size, 76 | num_heads=num_heads, 77 | mlp_ratio=mlp_ratio, 78 | activation=activation, 79 | attention_dropout=attention_dropout, 80 | dropout=dropout, 81 | alpha=alpha, 82 | layernorm_epsilon=layernorm_epsilon, 83 | dtype=dtype, 84 | bias=bias, 85 | fuse_scale_mask_softmax=fuse_scale_mask_softmax, 86 | checkpoint=checkpoint, 87 | activation_offload=activation_offload) for _ in range(depth) 88 | ]) 89 | 90 | self.norm = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 91 | 92 | self.head = GPTLMHead(hidden_size=hidden_size, vocab_size=vocab_size, embedding_layer=self.embed, dtype=dtype) 93 | 94 | def forward(self, input_ids, attention_mask=None): 95 | 96 | # the size of input_ids is (BATCH_SIZE, SEQ_LEN) 97 | x = self.embed(input_ids) 98 | # the size of x after embed layer is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 99 | 100 | # We create a 3D attention mask from a 2D tensor mask. 
101 | # Sizes are [batch_size, 1, 1, to_seq_length] 102 | # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] 103 | # Adapted from huggingface 104 | if attention_mask is not None: 105 | batch_size = input_ids.shape[0] 106 | attention_mask = attention_mask.view(batch_size, -1) 107 | attention_mask = col_nn.partition_batch(attention_mask) 108 | attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) 109 | attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility 110 | attention_mask = (1.0 - attention_mask) * -10000.0 111 | 112 | # the size of x in blocks is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 113 | for block in self.blocks: 114 | x, attention_mask = block(x, attention_mask) 115 | 116 | x = self.head(self.norm(x)) 117 | # the size of x is (BATCH_SIZE, SEQ_LEN, VOCAB_SIZE) 118 | return x 119 | 120 | 121 | def _create_deepnet_model(**model_kwargs): 122 | model = DeepNet(**model_kwargs) 123 | return model 124 | 125 | 126 | def deepnet_small(**kwargs): 127 | model_kwargs = dict(hidden_size=768, depth=12, num_heads=12, **kwargs) 128 | return _create_deepnet_model(**model_kwargs) 129 | -------------------------------------------------------------------------------- /titans/dataloader/utils/rand_augment.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torchvision.transforms.functional as TF 4 | 5 | _MAX_LEVEL = 10 6 | 7 | _HPARAMS = { 8 | 'cutout_const': 40, 9 | 'translate_const': 40, 10 | } 11 | 12 | _FILL = tuple([128, 128, 128]) 13 | # RGB 14 | 15 | 16 | def blend(image0, image1, factor): 17 | # blend image0 with image1 18 | # we only use this function in the 'color' function 19 | if factor == 0.0: 20 | return image0 21 | if factor == 1.0: 22 | return image1 23 | image0 = image0.type(torch.float32) 24 | image1 = image1.type(torch.float32) 25 | scaled = (image1 - image0) * factor 26 | image = image0 + scaled 27 | 28 | if factor > 0.0 and factor < 1.0: 29 | return image.type(torch.uint8) 30 | 31 | image = torch.clamp(image, 0, 255).type(torch.uint8) 32 | return image 33 | 34 | 35 | def autocontrast(image): 36 | image = TF.autocontrast(image) 37 | return image 38 | 39 | 40 | def equalize(image): 41 | image = TF.equalize(image) 42 | return image 43 | 44 | 45 | def rotate(image, degree, fill=_FILL): 46 | image = TF.rotate(image, angle=degree, fill=fill) 47 | return image 48 | 49 | 50 | def posterize(image, bits): 51 | image = TF.posterize(image, bits) 52 | return image 53 | 54 | 55 | def sharpness(image, factor): 56 | image = TF.adjust_sharpness(image, sharpness_factor=factor) 57 | return image 58 | 59 | 60 | def contrast(image, factor): 61 | image = TF.adjust_contrast(image, factor) 62 | return image 63 | 64 | 65 | def brightness(image, factor): 66 | image = TF.adjust_brightness(image, factor) 67 | return image 68 | 69 | 70 | def invert(image): 71 | return 255 - image 72 | 73 | 74 | def solarize(image, threshold=128): 75 | return torch.where(image < threshold, image, 255 - image) 76 | 77 | 78 | def solarize_add(image, addition=0, threshold=128): 79 | add_image = image.long() + addition 80 | add_image = torch.clamp(add_image, 0, 255).type(torch.uint8) 81 | return torch.where(image < threshold, add_image, image) 82 | 83 | 84 | def color(image, factor): 85 | new_image = TF.rgb_to_grayscale(image, num_output_channels=3) 86 | return blend(new_image, image, factor=factor) 87 | 88 | 89 | def shear_x(image, level, fill=_FILL): 90 | image = TF.affine(image, 0, [0, 0], 1.0, [level, 0],
fill=fill) 91 | return image 92 | 93 | 94 | def shear_y(image, level, fill=_FILL): 95 | image = TF.affine(image, 0, [0, 0], 1.0, [0, level], fill=fill) 96 | return image 97 | 98 | 99 | def translate_x(image, level, fill=_FILL): 100 | image = TF.affine(image, 0, [level, 0], 1.0, [0, 0], fill=fill) 101 | return image 102 | 103 | 104 | def translate_y(image, level, fill=_FILL): 105 | image = TF.affine(image, 0, [0, level], 1.0, [0, 0], fill=fill) 106 | return image 107 | 108 | 109 | def cutout(image, pad_size, fill=_FILL): 110 | b, c, h, w = image.shape 111 | mask = torch.ones((b, c, h, w), dtype=torch.uint8).cuda() 112 | y = np.random.randint(pad_size, h - pad_size) 113 | x = np.random.randint(pad_size, w - pad_size) 114 | for i in range(c): 115 | mask[:, i, (y - pad_size):(y + pad_size), (x - pad_size):(x + pad_size)] = fill[i] 116 | image = torch.where(mask == 1, image, mask) 117 | return image 118 | 119 | 120 | def _randomly_negate_tensor(level): 121 | # With 50% prob turn the tensor negative. 122 | flip = np.random.randint(0, 2) 123 | final_level = -level if flip else level 124 | return final_level 125 | 126 | 127 | def _rotate_level_to_arg(level): 128 | level = (level / _MAX_LEVEL) * 30. 129 | level = _randomly_negate_tensor(level) 130 | return level 131 | 132 | 133 | def _shear_level_to_arg(level): 134 | level = (level / _MAX_LEVEL) * 0.3 135 | # Flip level to negative with 50% chance. 136 | level = _randomly_negate_tensor(level) 137 | return level 138 | 139 | 140 | def _translate_level_to_arg(level, translate_const): 141 | level = (level / _MAX_LEVEL) * float(translate_const) 142 | # Flip level to negative with 50% chance. 143 | level = _randomly_negate_tensor(level) 144 | return level 145 | 146 | 147 | def level(hparams): 148 | return { 149 | 'AutoContrast': lambda level: None, 150 | 'Equalize': lambda level: None, 151 | 'Invert': lambda level: None, 152 | 'Rotate': _rotate_level_to_arg, 153 | 'Posterize': lambda level: (int((level / _MAX_LEVEL) * 4)), 154 | 'Solarize': lambda level: (int((level / _MAX_LEVEL) * 200)), 155 | 'SolarizeAdd': lambda level: (int((level / _MAX_LEVEL) * 110)), 156 | 'Color': lambda level: ((level / _MAX_LEVEL) * 1.8 + 0.1), 157 | 'Contrast': lambda level: ((level / _MAX_LEVEL) * 1.8 + 0.1), 158 | 'Brightness': lambda level: ((level / _MAX_LEVEL) * 1.8 + 0.1), 159 | 'Sharpness': lambda level: ((level / _MAX_LEVEL) * 1.8 + 0.1), 160 | 'ShearX': _shear_level_to_arg, 161 | 'ShearY': _shear_level_to_arg, 162 | 'Cutout': lambda level: (int((level / _MAX_LEVEL) * hparams['cutout_const'])), 163 | 'TranslateX': lambda level: _translate_level_to_arg(level, hparams['translate_const']), 164 | 'TranslateY': lambda level: _translate_level_to_arg(level, hparams['translate_const']), 165 | } 166 | 167 | 168 | AUGMENTS = { 169 | 'AutoContrast': autocontrast, 170 | 'Equalize': equalize, 171 | 'Invert': invert, 172 | 'Rotate': rotate, 173 | 'Posterize': posterize, 174 | 'Solarize': solarize, 175 | 'SolarizeAdd': solarize_add, 176 | 'Color': color, 177 | 'Contrast': contrast, 178 | 'Brightness': brightness, 179 | 'Sharpness': sharpness, 180 | 'ShearX': shear_x, 181 | 'ShearY': shear_y, 182 | 'TranslateX': translate_x, 183 | 'TranslateY': translate_y, 184 | 'Cutout': cutout, 185 | } 186 | 187 | 188 | def RandAugment(image, num_layers=2, magnitude=_MAX_LEVEL, augments=AUGMENTS): 189 | """Random Augment for images, followed google randaug and the paper(https://arxiv.org/abs/2106.10270) 190 | :param image: the input image, in tensor format with shape of C, H, W 191 | :type image: 
uint8 Tensor 192 | :num_layers: how many layers will the randaug do, default=2 193 | :type num_layers: int 194 | :param magnitude: the magnitude of random augment, default=10 195 | :type magnitude: int 196 | """ 197 | if np.random.random() < 0.5: 198 | return image 199 | Choice_Augment = np.random.choice(a=list(augments.keys()), size=num_layers, replace=False) 200 | magnitude = float(magnitude) 201 | for i in range(num_layers): 202 | arg = level(_HPARAMS)[Choice_Augment[i]](magnitude) 203 | if arg is None: 204 | image = augments[Choice_Augment[i]](image) 205 | else: 206 | image = augments[Choice_Augment[i]](image, arg) 207 | return image 208 | -------------------------------------------------------------------------------- /titans/model/knowledge_graph_embedding/dataloader/dataloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import numpy as np 8 | import torch 9 | from torch.utils.data import Dataset 10 | 11 | 12 | class TrainDataset(Dataset): 13 | 14 | def __init__(self, triples, nentity, nrelation, negative_sample_size, mode): 15 | self.len = len(triples) 16 | self.triples = triples 17 | self.triple_set = set(triples) 18 | self.nentity = nentity 19 | self.nrelation = nrelation 20 | self.negative_sample_size = negative_sample_size 21 | self.mode = mode 22 | self.count = self.count_frequency(triples) 23 | self.true_head, self.true_tail = self.get_true_head_and_tail(self.triples) 24 | 25 | def __len__(self): 26 | return self.len 27 | 28 | def __getitem__(self, idx): 29 | positive_sample = self.triples[idx] 30 | 31 | head, relation, tail = positive_sample 32 | 33 | subsampling_weight = self.count[(head, relation)] + self.count[(tail, -relation - 1)] 34 | subsampling_weight = torch.sqrt(1 / torch.Tensor([subsampling_weight])) 35 | 36 | negative_sample_list = [] 37 | negative_sample_size = 0 38 | 39 | while negative_sample_size < self.negative_sample_size: 40 | negative_sample = np.random.randint(self.nentity, size=self.negative_sample_size * 2) 41 | if self.mode == 'head-batch': 42 | mask = np.in1d(negative_sample, self.true_head[(relation, tail)], assume_unique=True, invert=True) 43 | elif self.mode == 'tail-batch': 44 | mask = np.in1d(negative_sample, self.true_tail[(head, relation)], assume_unique=True, invert=True) 45 | else: 46 | raise ValueError('Training batch mode %s not supported' % self.mode) 47 | negative_sample = negative_sample[mask] 48 | negative_sample_list.append(negative_sample) 49 | negative_sample_size += negative_sample.size 50 | 51 | negative_sample = np.concatenate(negative_sample_list)[:self.negative_sample_size] 52 | 53 | negative_sample = torch.LongTensor(negative_sample) 54 | 55 | positive_sample = torch.LongTensor(positive_sample) 56 | 57 | return positive_sample, negative_sample, subsampling_weight, self.mode 58 | 59 | @staticmethod 60 | def collate_fn(data): 61 | positive_sample = torch.stack([_[0] for _ in data], dim=0) 62 | negative_sample = torch.stack([_[1] for _ in data], dim=0) 63 | subsample_weight = torch.cat([_[2] for _ in data], dim=0) 64 | mode = data[0][3] 65 | return positive_sample, negative_sample, subsample_weight, mode 66 | 67 | @staticmethod 68 | def count_frequency(triples, start=4): 69 | ''' 70 | Get frequency of a partial triple like (head, relation) or (relation, tail) 71 | The frequency will be used for subsampling like word2vec 72 | ''' 73 | 
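# The counts returned here drive the subsampling weight computed in __getitem__:
# for a positive triple (h, r, t) the weight is
#     w = 1 / sqrt(count(h, r) + count(t, -r - 1)),
# so frequently occurring (entity, relation) pairs are down-weighted in the loss.
# `start` sets a pair's count at its first occurrence; each later occurrence adds 1.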
count = {} 74 | for head, relation, tail in triples: 75 | if (head, relation) not in count: 76 | count[(head, relation)] = start 77 | else: 78 | count[(head, relation)] += 1 79 | 80 | if (tail, -relation - 1) not in count: 81 | count[(tail, -relation - 1)] = start 82 | else: 83 | count[(tail, -relation - 1)] += 1 84 | return count 85 | 86 | @staticmethod 87 | def get_true_head_and_tail(triples): 88 | ''' 89 | Build a dictionary of true triples that will 90 | be used to filter these true triples for negative sampling 91 | ''' 92 | 93 | true_head = {} 94 | true_tail = {} 95 | 96 | for head, relation, tail in triples: 97 | if (head, relation) not in true_tail: 98 | true_tail[(head, relation)] = [] 99 | true_tail[(head, relation)].append(tail) 100 | if (relation, tail) not in true_head: 101 | true_head[(relation, tail)] = [] 102 | true_head[(relation, tail)].append(head) 103 | 104 | for relation, tail in true_head: 105 | true_head[(relation, tail)] = np.array(list(set(true_head[(relation, tail)]))) 106 | for head, relation in true_tail: 107 | true_tail[(head, relation)] = np.array(list(set(true_tail[(head, relation)]))) 108 | 109 | return true_head, true_tail 110 | 111 | 112 | class TestDataset(Dataset): 113 | 114 | def __init__(self, triples, all_true_triples, nentity, nrelation, mode): 115 | self.len = len(triples) 116 | self.triple_set = set(all_true_triples) 117 | self.triples = triples 118 | self.nentity = nentity 119 | self.nrelation = nrelation 120 | self.mode = mode 121 | 122 | def __len__(self): 123 | return self.len 124 | 125 | def __getitem__(self, idx): 126 | head, relation, tail = self.triples[idx] 127 | 128 | if self.mode == 'head-batch': 129 | tmp = [(0, rand_head) if (rand_head, relation, tail) not in self.triple_set else (-1, head) 130 | for rand_head in range(self.nentity)] 131 | tmp[head] = (0, head) 132 | elif self.mode == 'tail-batch': 133 | tmp = [(0, rand_tail) if (head, relation, rand_tail) not in self.triple_set else (-1, tail) 134 | for rand_tail in range(self.nentity)] 135 | tmp[tail] = (0, tail) 136 | else: 137 | raise ValueError('negative batch mode %s not supported' % self.mode) 138 | 139 | tmp = torch.LongTensor(tmp) 140 | filter_bias = tmp[:, 0].float() 141 | negative_sample = tmp[:, 1] 142 | 143 | positive_sample = torch.LongTensor((head, relation, tail)) 144 | 145 | return positive_sample, negative_sample, filter_bias, self.mode 146 | 147 | @staticmethod 148 | def collate_fn(data): 149 | positive_sample = torch.stack([_[0] for _ in data], dim=0) 150 | negative_sample = torch.stack([_[1] for _ in data], dim=0) 151 | filter_bias = torch.stack([_[2] for _ in data], dim=0) 152 | mode = data[0][3] 153 | return positive_sample, negative_sample, filter_bias, mode 154 | 155 | 156 | class BidirectionalOneShotIterator(object): 157 | 158 | def __init__(self, dataloader_head, dataloader_tail): 159 | self.iterator_head = self.one_shot_iterator(dataloader_head) 160 | self.iterator_tail = self.one_shot_iterator(dataloader_tail) 161 | self.step = 0 162 | 163 | def __next__(self): 164 | self.step += 1 165 | if self.step % 2 == 0: 166 | data = next(self.iterator_head) 167 | else: 168 | data = next(self.iterator_tail) 169 | return data 170 | 171 | @staticmethod 172 | def one_shot_iterator(dataloader): 173 | ''' 174 | Transform a PyTorch Dataloader into python iterator 175 | ''' 176 | while True: 177 | for data in dataloader: 178 | yield data 179 | -------------------------------------------------------------------------------- /titans/dataloader/bert/parquet_dataset.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import random 5 | import torch 6 | 7 | import torch.distributed as dist 8 | from torch.utils.data import IterableDataset 9 | from torch.utils.data import get_worker_info 10 | 11 | from lddl.types import File 12 | from lddl.utils import get_num_samples_of_parquet 13 | from lddl.random import sample 14 | from lddl.torch.datasets import ShuffleBuffer 15 | from lddl.torch.datasets import ParquetDataset as PD 16 | 17 | 18 | class ParquetDataset(PD, IterableDataset): 19 | 20 | def __init__( 21 | self, 22 | file_paths, 23 | transform=lambda x: x, 24 | shuffle_buffer_size=16384, 25 | shuffle_buffer_warmup_factor=16, 26 | base_seed=12345, 27 | logger=None, 28 | start_epoch=0, 29 | process_group=None 30 | ): 31 | # we do not want to init the original PD as it is overridden by this function 32 | # we only init with IterabledDataset 33 | IterableDataset.__init__(self) 34 | self._transform = transform 35 | self._shuffle_buffer_size = shuffle_buffer_size 36 | self._shuffle_buffer_warmup_factor = shuffle_buffer_warmup_factor 37 | self._base_seed = base_seed 38 | 39 | self._rank = dist.get_rank(group=process_group) 40 | self._world_size = dist.get_world_size(group=process_group) 41 | self._process_group = process_group 42 | 43 | self._epoch = start_epoch - 1 44 | 45 | self._logger = logger 46 | 47 | assert len(file_paths) % self._world_size == 0 48 | self._files = self._get_files(file_paths) 49 | max_num_samples_per_file = max((f.num_samples for f in self._files)) 50 | min_num_samples_per_file = min((f.num_samples for f in self._files)) 51 | assert min_num_samples_per_file + 1 == max_num_samples_per_file 52 | self._num_samples_per_file = min_num_samples_per_file 53 | total_num_samples = sum((f.num_samples for f in self._files)) 54 | num_samples_lost = (total_num_samples - 55 | self._num_samples_per_file * len(self._files)) 56 | self._logger.to('node').warning('lost {}/{}={}% samples in total'.format( 57 | num_samples_lost, 58 | total_num_samples, 59 | num_samples_lost / total_num_samples * 100, 60 | )) 61 | 62 | self._world_rng_state = None 63 | self._worker_rng_state = None 64 | 65 | def _get_files(self, file_paths): 66 | all_files_num_samples = torch.zeros((len(file_paths),), dtype=torch.long) 67 | if self._world_size > 1 and torch.distributed.get_backend() == 'nccl': 68 | all_files_num_samples = all_files_num_samples.to('cuda') 69 | # Figure out how many samples in each file. 70 | num_samples_cache = {} # Map dirname to the dict of {basename: num_samples} 71 | 72 | for idx in range(self._rank, len(file_paths), self._world_size): 73 | fp = file_paths[idx] 74 | dn = os.path.dirname(fp) 75 | bn = os.path.basename(fp) 76 | # Load the num_samples cache file if it exists. 77 | if dn not in num_samples_cache: 78 | nsfp = os.path.join(dn, '.num_samples.json') 79 | try: 80 | with open(nsfp, 'r') as nsf: 81 | num_samples_cache[dn] = json.load(nsf) 82 | except Exception as e: 83 | self._logger.to('rank').warning('failed to load {}: {}'.format( 84 | nsfp, e)) 85 | # Mark that the num_samples cache file doesn't exist for this 86 | # directory. 87 | num_samples_cache[dn] = None 88 | if num_samples_cache[dn] is not None and bn in num_samples_cache[dn]: 89 | all_files_num_samples[idx] = num_samples_cache[dn][bn] 90 | else: 91 | # Find out num_samples by loading the parquet table. 
92 | all_files_num_samples[idx] = get_num_samples_of_parquet(fp) 93 | if self._world_size > 1: 94 | # Sync. accross all ranks. 95 | torch.distributed.all_reduce( 96 | all_files_num_samples, 97 | op=torch.distributed.ReduceOp.SUM, 98 | group=self._process_group 99 | ) 100 | all_files_num_samples = all_files_num_samples.tolist() 101 | return [File(fp, ns) for fp, ns in zip(file_paths, all_files_num_samples)] 102 | 103 | def __len__(self): 104 | """ This function only returns how many samples per rank will be yielded 105 | by this dataset. 106 | 107 | Note that, len(dataloader), where dataloader is a PyTorch DataLoader 108 | wrapping this dataset, does NOT return the accurate number of batches. This 109 | is because, when (num_samples_per_file * num_files_per_worker) is not 110 | divisible by batch_size, each worker is going to generate a partial batch 111 | at the very end. 112 | 113 | However, PyTorch DataLoader's __len__ only divide the number returned from 114 | this function by batch_size, which would be smaller than the actual number 115 | of batches by at most (num_workers - 1). 116 | 117 | We need to patch PyTorch DataLoader function for this function to behave 118 | correctly. 119 | """ 120 | return self._num_samples_per_file * len(self._files) // self._world_size 121 | 122 | @property 123 | def num_samples_per_file(self): 124 | return self._num_samples_per_file 125 | 126 | @property 127 | def num_files_per_rank(self): 128 | return len(self._files) // self._world_size 129 | 130 | def _decode_record_batch(self, b): 131 | raise NotImplementedError('ParquetDataset is an abstract/interface class!') 132 | 133 | def _world_identical_sample(self, population, k, counts=None): 134 | s, self._world_rng_state = sample( 135 | population, 136 | k, 137 | rng_state=self._world_rng_state, 138 | ) 139 | return s 140 | 141 | def _init_worker(self): 142 | worker_info = get_worker_info() 143 | if worker_info is None: 144 | num_workers_per_rank = 1 145 | worker_rank = 0 146 | else: 147 | num_workers_per_rank = worker_info.num_workers 148 | worker_rank = worker_info.id 149 | assert (len(self._files) % (self._world_size * num_workers_per_rank) == 0) 150 | self._logger.init_for_worker(worker_rank) 151 | return worker_rank, num_workers_per_rank 152 | 153 | def _init_rng_states(self, worker_rank, num_workers_per_rank): 154 | orig_rng_state = random.getstate() 155 | 156 | random.seed(self._base_seed + self._epoch) 157 | self._world_rng_state = random.getstate() 158 | 159 | random.seed(self._base_seed + 160 | (self._epoch * self._world_size + self._rank) * 161 | num_workers_per_rank + worker_rank) 162 | self._worker_rng_state = random.getstate() 163 | 164 | random.setstate(orig_rng_state) 165 | 166 | def __iter__(self): 167 | self._epoch += 1 168 | 169 | worker_rank, num_workers_per_rank = self._init_worker() 170 | self._init_rng_states(worker_rank, num_workers_per_rank) 171 | 172 | files = self._world_identical_sample(self._files, k=len(self._files)) 173 | rank_files = files[self._rank::self._world_size] 174 | worker_files = rank_files[worker_rank::num_workers_per_rank] 175 | 176 | sb = ShuffleBuffer( 177 | worker_files, 178 | self._num_samples_per_file * len(worker_files), 179 | lambda b: self._decode_record_batch(b), 180 | self._shuffle_buffer_size, 181 | self._shuffle_buffer_warmup_factor, 182 | self._logger, 183 | self._worker_rng_state, 184 | ) 185 | for sample in iter(sb): 186 | yield self._transform(sample) -------------------------------------------------------------------------------- 
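The ParquetDataset above is an abstract base: _decode_record_batch raises NotImplementedError and is supplied by subclasses such as BertPretrainDataset later in this repository. What the class itself provides is deterministic sharding: every rank shuffles the full shard list with the same epoch-dependent seed, each rank then keeps every world_size-th file, and each DataLoader worker keeps every num_workers-th file of that slice, so each shard is read by exactly one worker per epoch. Below is a minimal, self-contained sketch of just that partitioning arithmetic; it substitutes plain random.Random for lddl.random.sample, and the function name partition_files is illustrative rather than part of the library.

import random


def partition_files(file_paths, epoch, rank, world_size, worker_rank, num_workers_per_rank, base_seed=12345):
    # ParquetDataset asserts len(file_paths) % (world_size * num_workers_per_rank) == 0,
    # so every worker ends up with the same number of shards.
    assert len(file_paths) % (world_size * num_workers_per_rank) == 0
    # World-identical shuffle: every rank seeds with (base_seed + epoch), so the
    # permutation is the same everywhere (mirrors _init_rng_states / _world_identical_sample).
    files = list(file_paths)
    random.Random(base_seed + epoch).shuffle(files)
    # Stride by rank, then by worker (mirrors __iter__):
    #   rank_files = files[self._rank::self._world_size]
    #   worker_files = rank_files[worker_rank::num_workers_per_rank]
    rank_files = files[rank::world_size]
    return rank_files[worker_rank::num_workers_per_rank]


if __name__ == '__main__':
    shards = ['shard_{:03d}.parquet'.format(i) for i in range(16)]
    # With 2 ranks x 2 workers, the 16 shards split into 4 disjoint groups of 4.
    for rank in range(2):
        for worker in range(2):
            print(rank, worker, partition_files(shards, epoch=0, rank=rank, world_size=2,
                                                worker_rank=worker, num_workers_per_rank=2))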
/titans/model/moe/gpt_moe.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List 2 | from torch import dtype, nn 3 | from colossalai import nn as col_nn 4 | from colossalai.nn.layer import MoeModule 5 | from colossalai.context import MOE_CONTEXT 6 | from colossalai.logging import get_dist_logger 7 | from colossalai.nn.layer.utils import CheckpointModule, divide 8 | 9 | from titans.layer.embedding import GPTEmbedding 10 | from titans.layer.block import GPTBlock, MOEGPTBlock 11 | from titans.layer.head import GPTLMHead 12 | 13 | 14 | class MOEGPT(nn.Module): 15 | 16 | def __init__(self, 17 | num_experts: int or List[int], 18 | use_residual: bool = False, 19 | capacity_factor_train: float = 1.0, 20 | capacity_factor_eval: float = 1.0, 21 | vocab_size: int = 50304, 22 | max_position_embeddings: int = 1024, 23 | hidden_size: int = 768, 24 | num_heads: int = 12, 25 | depth: int = 12, 26 | mlp_ratio: float = 4.0, 27 | dropout: float = 0.1, 28 | embedding_dropout: float = 0.1, 29 | attention_dropout: float = 0.1, 30 | layernorm_epsilon: float = 1e-5, 31 | activation: Callable = nn.functional.gelu, 32 | padding_idx: int = None, 33 | dtype: dtype = None, 34 | bias: bool = True, 35 | apply_post_layernorm: bool = False, 36 | fuse_scale_mask_softmax: bool = False, 37 | checkpoint: bool = False) -> None: 38 | super().__init__() 39 | 40 | half_depth = divide(depth, 2) 41 | if isinstance(num_experts, list): 42 | assert len(num_experts) == half_depth, \ 43 | "The length of num_experts should equal to the number of MOE layers" 44 | num_experts_list = num_experts 45 | else: 46 | num_experts_list = [num_experts] * half_depth 47 | 48 | self.embed = GPTEmbedding(embedding_dim=hidden_size, 49 | vocab_size=vocab_size, 50 | max_position_embeddings=max_position_embeddings, 51 | padding_idx=padding_idx, 52 | dropout=embedding_dropout, 53 | dtype=dtype) 54 | 55 | block_list = [] 56 | block_factory_dict = dict(hidden_size=hidden_size, 57 | num_heads=num_heads, 58 | mlp_ratio=mlp_ratio, 59 | activation=activation, 60 | attention_dropout=attention_dropout, 61 | dropout=dropout, 62 | layernorm_epsilon=layernorm_epsilon, 63 | dtype=dtype, 64 | bias=bias, 65 | apply_post_layernorm=apply_post_layernorm, 66 | fuse_scale_mask_softmax=fuse_scale_mask_softmax, 67 | checkpoint=checkpoint) 68 | 69 | for i in range(depth): 70 | 71 | if i % 2 == 0: 72 | block_module = GPTBlock(**block_factory_dict) 73 | else: 74 | num_experts = num_experts_list[i // 2] 75 | block_module = MOEGPTBlock(num_experts=num_experts, 76 | capacity_factor_train=capacity_factor_train, 77 | capacity_factor_eval=capacity_factor_eval, 78 | use_residual=use_residual, 79 | **block_factory_dict) 80 | 81 | block_list.append(block_module) 82 | 83 | self.blocks = nn.ModuleList(block_list) 84 | 85 | self.norm = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 86 | 87 | self.head = GPTLMHead(hidden_size=hidden_size, vocab_size=vocab_size, embedding_layer=self.embed, dtype=dtype) 88 | 89 | def forward(self, input_ids, attention_mask=None): 90 | MOE_CONTEXT.reset_loss() 91 | x = self.embed(input_ids) 92 | 93 | # We create a 3D attention mask from a 2D tensor mask. 
94 | # Sizes are [batch_size, 1, 1, to_seq_length] 95 | # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] 96 | # Adapted from huggingface 97 | if attention_mask is not None: 98 | batch_size = input_ids.shape[0] 99 | attention_mask = attention_mask.view(batch_size, -1) 100 | attention_mask = col_nn.partition_batch(attention_mask) 101 | attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) 102 | attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility 103 | attention_mask = (1.0 - attention_mask) * -10000.0 104 | 105 | y = 0 106 | for block in self.blocks: 107 | if isinstance(block, GPTBlock): 108 | x, attention_mask = block(x, attention_mask) 109 | else: 110 | x, y, attention_mask = block(x, y, attention_mask) 111 | 112 | x = self.head(self.norm(x)) 113 | MOE_CONTEXT.add_loss(y) 114 | return x 115 | 116 | 117 | def _create_moegpt_model(**model_kwargs): 118 | model = MOEGPT(**model_kwargs) 119 | return model 120 | 121 | 122 | def _prmoe_check_sanity(kwargs_dict): 123 | logger = get_dist_logger() 124 | if not kwargs_dict.pop('use_residual', False): 125 | logger.warning( 126 | "If you want to use PR-MOE, please set 'use_residual' to True. " 127 | "Otherwise, we'll force 'use_residual' to True.", 128 | ranks=[0]) 129 | 130 | 131 | def prmoe_4b(**kwargs): 132 | _prmoe_check_sanity(kwargs) 133 | model_kwargs = dict(num_experts=[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64], 134 | use_residual=True, 135 | hidden_size=1024, 136 | depth=24, 137 | num_heads=16, 138 | **kwargs) 139 | return _create_moegpt_model(**model_kwargs) 140 | 141 | 142 | def prmoe_16b(**kwargs): 143 | _prmoe_check_sanity(kwargs) 144 | model_kwargs = dict(num_experts=[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64], 145 | use_residual=True, 146 | hidden_size=2048, 147 | depth=24, 148 | num_heads=16, 149 | **kwargs) 150 | return _create_moegpt_model(**model_kwargs) 151 | 152 | 153 | def prmoe_25b(**kwargs): 154 | _prmoe_check_sanity(kwargs) 155 | model_kwargs = dict(num_experts=[32, 32, 32, 32, 32, 32, 64, 64, 64, 64, 128, 128], 156 | use_residual=True, 157 | hidden_size=2048, 158 | depth=24, 159 | num_heads=16, 160 | **kwargs) 161 | return _create_moegpt_model(**model_kwargs) 162 | 163 | 164 | def prmoe_29b(**kwargs): 165 | _prmoe_check_sanity(kwargs) 166 | model_kwargs = dict(num_experts=[32, 32, 48, 64, 64, 64, 64, 64, 64, 64, 128, 128], 167 | use_residual=True, 168 | hidden_size=2048, 169 | depth=24, 170 | num_heads=16, 171 | **kwargs) 172 | return _create_moegpt_model(**model_kwargs) 173 | 174 | 175 | def prmoe_31b(**kwargs): 176 | _prmoe_check_sanity(kwargs) 177 | model_kwargs = dict(num_experts=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 128, 128], 178 | use_residual=True, 179 | hidden_size=2048, 180 | depth=24, 181 | num_heads=16, 182 | **kwargs) 183 | return _create_moegpt_model(**model_kwargs) 184 | 185 | 186 | def prmoe_51b(**kwargs): 187 | _prmoe_check_sanity(kwargs) 188 | model_kwargs = dict(num_experts=[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64], 189 | use_residual=True, 190 | hidden_size=3072, 191 | depth=32, 192 | num_heads=24, 193 | **kwargs) 194 | return _create_moegpt_model(**model_kwargs) 195 | -------------------------------------------------------------------------------- /titans/model/vit/vit.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Callable 3 | 4 | import torch 5 | from colossalai import nn as col_nn 6 | from colossalai.nn.layer.utils import 
CheckpointModule 7 | from torch import dtype, nn 8 | 9 | from titans.layer.embedding import ViTEmbedding 10 | from titans.layer.head import ViTHead 11 | from titans.layer.block import ViTBlock 12 | from titans.decorator import no_support 13 | 14 | __all__ = [ 15 | 'VisionTransformer', 16 | 'vit_lite_depth7_patch4_32', 17 | 'vit_tiny_patch4_32', 18 | 'vit_tiny_patch16_224', 19 | 'vit_tiny_patch16_384', 20 | 'vit_small_patch16_224', 21 | 'vit_small_patch16_384', 22 | 'vit_small_patch32_224', 23 | 'vit_small_patch32_384', 24 | 'vit_base_patch16_224', 25 | 'vit_base_patch16_384', 26 | 'vit_base_patch32_224', 27 | 'vit_base_patch32_384', 28 | 'vit_large_patch16_224', 29 | 'vit_large_patch16_384', 30 | 'vit_large_patch32_224', 31 | 'vit_large_patch32_384', 32 | ] 33 | 34 | 35 | @no_support(['sp', 'moe']) 36 | class VisionTransformer(nn.Module): 37 | """ 38 | The ViT transformer model with an image classification head on top (a linear layer on top of the final hidden state of 39 | the [CLS] token), e.g. for ImageNet. 40 | 41 | Args: 42 | img_size(int): The size of the input images, defaults to 224. 43 | patch_size(int): The size of each image patch, defaults to 16. 44 | in_chans(int): The number of input channels, defaults to 3. 45 | num_classes(int): The number of target classes, defaults to 1000. 46 | depth(int): The number of transformer layers, defaults to 12. 47 | num_heads(int): The number of attention heads in the transformer blocks, defaults to 12. 48 | hidden_size(int): Hidden size of the transformer blocks, defaults to 768. 49 | mlp_ratio(int): The ratio of the MLP hidden dimension to the hidden size, defaults to 4. 50 | attention_dropout(float): The ratio used to construct attention dropout modules, which gives the probability of an element being zeroed, defaults to 0. 51 | dropout(float): The ratio used to construct dropout modules, which gives the probability of an element being zeroed, defaults to 0.1. 52 | drop_path(float): The maximum rate used to construct drop_path (stochastic depth) modules, which gives the probability of a residual branch being dropped; the per-block rate grows linearly up to this value, defaults to 0. 53 | layernorm_epsilon(float): The epsilon used to construct layernorm modules, defaults to 1e-6. 54 | activation(Callable): The activation function used in the model, defaults to nn.functional.gelu. 55 | representation_size(int): The size of the intermediate representation in the head layer, defaults to None. 56 | dtype (:class:`torch.dtype`): The dtype of parameters, defaults to None. 57 | bias (bool): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``. 58 | checkpoint(bool): If set to ``True``, activation checkpointing is enabled to save memory, defaults to ``False``. 59 | init_method(str): The parameter initialization method used in the layers, defaults to `torch`.
60 | """ 61 | 62 | def __init__(self, 63 | img_size: int = 224, 64 | patch_size: int = 16, 65 | in_chans: int = 3, 66 | num_classes: int = 1000, 67 | depth: int = 12, 68 | num_heads: int = 12, 69 | hidden_size: int = 768, 70 | mlp_ratio: int = 4, 71 | attention_dropout: float = 0., 72 | dropout: float = 0.1, 73 | drop_path: float = 0., 74 | layernorm_epsilon: float = 1e-6, 75 | activation: Callable = nn.functional.gelu, 76 | representation_size: int = None, 77 | dtype: dtype = None, 78 | bias: bool = True, 79 | checkpoint: bool = False, 80 | init_method: str = 'torch'): 81 | super().__init__() 82 | 83 | self.embed = ViTEmbedding(img_size=img_size, 84 | patch_size=patch_size, 85 | in_chans=in_chans, 86 | embedding_dim=hidden_size, 87 | dropout=dropout, 88 | dtype=dtype, 89 | init_method=init_method) 90 | 91 | # stochastic depth decay rule 92 | dpr = [x.item() for x in torch.linspace(0, drop_path, depth)] 93 | self.blocks = nn.ModuleList([ 94 | ViTBlock( 95 | hidden_size=hidden_size, 96 | num_heads=num_heads, 97 | mlp_ratio=mlp_ratio, 98 | attention_dropout=attention_dropout, 99 | dropout=dropout, 100 | drop_path=dpr[i], 101 | activation=activation, 102 | dtype=dtype, 103 | bias=bias, 104 | checkpoint=checkpoint, 105 | init_method=init_method, 106 | ) for i in range(depth) 107 | ]) 108 | 109 | self.norm = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 110 | 111 | self.head = ViTHead(hidden_size=hidden_size, 112 | num_classes=num_classes, 113 | representation_size=representation_size, 114 | dtype=dtype, 115 | bias=bias, 116 | init_method=init_method) 117 | 118 | def forward(self, x): 119 | # the size of x is (BATCH_SIZE, IN_CHAN, IMAGE_SIZE, IMAGE_SIZE) 120 | x = self.embed(x) 121 | # the size of x after embed layer is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 122 | for block in self.blocks: 123 | x = block(x) 124 | # the size of x after block is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 125 | x = self.head(self.norm(x)) 126 | # the size of x is (BATCH_SIZE, NUM_CLASSES) 127 | return x 128 | 129 | 130 | def _create_vit_model(**model_kwargs): 131 | model = VisionTransformer(**model_kwargs) 132 | return model 133 | 134 | 135 | def vit_lite_depth7_patch4_32(**kwargs): 136 | model_kwargs = dict(img_size=32, patch_size=4, hidden_size=256, depth=7, num_heads=4, mlp_ratio=2, num_classes=10, **kwargs) 137 | return _create_vit_model(**model_kwargs) 138 | 139 | 140 | def vit_tiny_patch4_32(**kwargs): 141 | model_kwargs = dict(img_size=32, patch_size=4, hidden_size=512, depth=6, num_heads=8, mlp_ratio=1, num_classes=10, **kwargs) 142 | return _create_vit_model(**model_kwargs) 143 | 144 | 145 | def vit_tiny_patch16_224(**kwargs): 146 | model_kwargs = dict(img_size=224, patch_size=16, hidden_size=192, depth=12, num_heads=3, mlp_ratio=4, **kwargs) 147 | return _create_vit_model(**model_kwargs) 148 | 149 | 150 | def vit_tiny_patch16_384(**kwargs): 151 | model_kwargs = dict(img_size=384, patch_size=16, hidden_size=192, depth=12, num_heads=3, mlp_ratio=4, **kwargs) 152 | return _create_vit_model(**model_kwargs) 153 | 154 | 155 | def vit_small_patch16_224(**kwargs): 156 | model_kwargs = dict(img_size=224, patch_size=16, hidden_size=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs) 157 | return _create_vit_model(**model_kwargs) 158 | 159 | 160 | def vit_small_patch16_384(**kwargs): 161 | model_kwargs = dict(img_size=384, patch_size=16, hidden_size=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs) 162 | return _create_vit_model(**model_kwargs) 163 | 164 | 165 | def 
vit_small_patch32_224(**kwargs): 166 | model_kwargs = dict(img_size=224, patch_size=32, hidden_size=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs) 167 | return _create_vit_model(**model_kwargs) 168 | 169 | 170 | def vit_small_patch32_384(**kwargs): 171 | model_kwargs = dict(img_size=384, patch_size=32, hidden_size=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs) 172 | return _create_vit_model(**model_kwargs) 173 | 174 | 175 | def vit_base_patch16_224(**kwargs): 176 | model_kwargs = dict(img_size=224, patch_size=16, hidden_size=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs) 177 | return _create_vit_model(**model_kwargs) 178 | 179 | 180 | def vit_base_patch16_384(**kwargs): 181 | model_kwargs = dict(img_size=384, patch_size=16, hidden_size=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs) 182 | return _create_vit_model(**model_kwargs) 183 | 184 | 185 | def vit_base_patch32_224(**kwargs): 186 | model_kwargs = dict(img_size=224, patch_size=32, hidden_size=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs) 187 | return _create_vit_model(**model_kwargs) 188 | 189 | 190 | def vit_base_patch32_384(**kwargs): 191 | model_kwargs = dict(img_size=384, patch_size=32, hidden_size=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs) 192 | return _create_vit_model(**model_kwargs) 193 | 194 | 195 | def vit_large_patch16_224(**kwargs): 196 | model_kwargs = dict(img_size=224, patch_size=16, hidden_size=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs) 197 | return _create_vit_model(**model_kwargs) 198 | 199 | 200 | def vit_large_patch16_384(**kwargs): 201 | model_kwargs = dict(img_size=384, patch_size=16, hidden_size=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs) 202 | return _create_vit_model(**model_kwargs) 203 | 204 | 205 | def vit_large_patch32_224(**kwargs): 206 | model_kwargs = dict(img_size=224, patch_size=32, hidden_size=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs) 207 | return _create_vit_model(**model_kwargs) 208 | 209 | 210 | def vit_large_patch32_384(**kwargs): 211 | model_kwargs = dict(img_size=384, patch_size=32, hidden_size=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs) 212 | return _create_vit_model(**model_kwargs) 213 | -------------------------------------------------------------------------------- /titans/model/gpt/gpt.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Callable 3 | 4 | import torch 5 | from colossalai import nn as col_nn 6 | from colossalai.pipeline.utils import partition_uniform 7 | from colossalai.context import ParallelMode 8 | from colossalai.core import global_context as gpc 9 | from colossalai.logging import get_dist_logger 10 | from colossalai.nn.layer.utils import CheckpointModule, divide 11 | from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper 12 | from colossalai.utils import get_current_device 13 | from torch import dtype, nn 14 | 15 | from titans.layer.embedding import GPTEmbedding 16 | from titans.layer.head import GPTLMHead 17 | from titans.layer.block import GPTBlock 18 | from titans.loss.lm_loss import GPTLMLoss 19 | from titans.decorator import no_support 20 | 21 | __all__ = ['GPT', 'GPTLMLoss', 'gpt2_small', 'gpt2_medium', 'gpt2_large', 'gpt2_xl', 'gpt2_8B', 'gpt3'] 22 | 23 | 24 | @no_support(['sp', 'moe']) 25 | class GPT(nn.Module): 26 | """ 27 | The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input 28 | embeddings). 
29 | 30 | Args: 31 | vocab_size(int): The size of the vocabulary, defaults to 50304. 32 | max_position_embeddings(int): The maximum number of position embeddings (i.e. the maximum sequence length), defaults to 1024. 33 | hidden_size(int): Hidden size of the transformer blocks, defaults to 768. 34 | num_heads(int): The number of attention heads in the transformer blocks, defaults to 12. 35 | depth(int): The number of transformer layers, defaults to 12. 36 | mlp_ratio(float): The ratio of the MLP hidden dimension to the hidden size, defaults to 4.0. 37 | dropout(float): The ratio used to construct dropout modules, which gives the probability of an element being zeroed, defaults to 0.1. 38 | embedding_dropout(float): The ratio used to construct the embedding dropout module, which gives the probability of an element being zeroed, defaults to 0.1. 39 | attention_dropout(float): The ratio used to construct attention dropout modules, which gives the probability of an element being zeroed, defaults to 0.1. 40 | layernorm_epsilon(float): The epsilon used to construct layernorm modules, defaults to 1e-5. 41 | activation(Callable): The activation function used in the model, defaults to nn.functional.gelu. 42 | padding_idx(int): The index of the padding token in the embedding, defaults to None. 43 | dtype (:class:`torch.dtype`): The dtype of parameters, defaults to None. 44 | bias (bool): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``. 45 | apply_post_layernorm(bool): If set to ``True``, the residual value is taken after the layernorm modules (post-layernorm), defaults to ``False``. 46 | fuse_scale_mask_softmax(bool): If set to ``True``, the fused scale-mask-softmax kernel is used in the self-attention layer, defaults to ``False``. 47 | checkpoint(bool): If set to ``True``, activation checkpointing is enabled to save memory, defaults to ``False``. 48 | activation_offload(bool): If set to ``True``, activations are offloaded during checkpointing to further save memory, defaults to ``False``.
49 | """ 50 | 51 | def __init__(self, 52 | vocab_size: int = 50304, 53 | max_position_embeddings: int = 1024, 54 | hidden_size: int = 768, 55 | num_heads: int = 12, 56 | depth: int = 12, 57 | mlp_ratio: float = 4.0, 58 | dropout: float = 0.1, 59 | embedding_dropout: float = 0.1, 60 | attention_dropout: float = 0.1, 61 | layernorm_epsilon: float = 1e-5, 62 | activation: Callable = nn.functional.gelu, 63 | padding_idx: int = None, 64 | dtype: dtype = None, 65 | bias: bool = True, 66 | apply_post_layernorm: bool = False, 67 | fuse_scale_mask_softmax: bool = False, 68 | checkpoint: bool = False, 69 | activation_offload: bool = False) -> None: 70 | super().__init__() 71 | self.embed = GPTEmbedding(embedding_dim=hidden_size, 72 | vocab_size=vocab_size, 73 | max_position_embeddings=max_position_embeddings, 74 | padding_idx=padding_idx, 75 | dropout=embedding_dropout, 76 | dtype=dtype) 77 | self.blocks = nn.ModuleList([ 78 | GPTBlock(hidden_size=hidden_size, 79 | num_heads=num_heads, 80 | mlp_ratio=mlp_ratio, 81 | activation=activation, 82 | attention_dropout=attention_dropout, 83 | dropout=dropout, 84 | layernorm_epsilon=layernorm_epsilon, 85 | dtype=dtype, 86 | bias=bias, 87 | apply_post_layernorm=apply_post_layernorm, 88 | fuse_scale_mask_softmax=fuse_scale_mask_softmax, 89 | checkpoint=checkpoint, 90 | activation_offload=activation_offload) for _ in range(depth) 91 | ]) 92 | 93 | self.norm = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 94 | 95 | self.head = GPTLMHead( 96 | hidden_size=hidden_size, 97 | vocab_size=vocab_size, 98 | embedding_layer=self.embed, 99 | # word_embeeding_weight=self.embed.word_embedding_weight, 100 | dtype=dtype) 101 | 102 | def forward(self, input_ids, attention_mask=None): 103 | 104 | # the size of input_ids is (BATCH_SIZE, SEQ_LEN) 105 | x = self.embed(input_ids) 106 | # the size of x after embed layer is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 107 | 108 | # We create a 3D attention mask from a 2D tensor mask. 
109 | # Sizes are [batch_size, 1, 1, to_seq_length] 110 | # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] 111 | # Adapted from huggingface 112 | if attention_mask is not None: 113 | batch_size = input_ids.shape[0] 114 | attention_mask = attention_mask.view(batch_size, -1) 115 | attention_mask = col_nn.partition_batch(attention_mask) 116 | attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) 117 | attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility 118 | attention_mask = (1.0 - attention_mask) * -10000.0 119 | 120 | # the size of x in blocks is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 121 | for block in self.blocks: 122 | x, attention_mask = block(x, attention_mask) 123 | 124 | x = self.head(self.norm(x)) 125 | # the size of x is (BATCH_SIZE, SEQ_LEN, VOCAB_SIZE) 126 | 127 | return x 128 | 129 | 130 | def _create_gpt_model(**model_kwargs): 131 | model = GPT(**model_kwargs) 132 | return model 133 | 134 | 135 | def gpt2_small(**kwargs): 136 | model_kwargs = dict(hidden_size=768, depth=12, num_heads=12, **kwargs) 137 | return _create_gpt_model(**model_kwargs) 138 | 139 | 140 | def gpt2_medium(**kwargs): 141 | model_kwargs = dict(hidden_size=1024, depth=24, num_heads=8, **kwargs) 142 | return _create_gpt_model(**model_kwargs) 143 | 144 | 145 | def gpt2_large(**kwargs): 146 | model_kwargs = dict(hidden_size=1536, depth=36, num_heads=12, **kwargs) 147 | return _create_gpt_model(**model_kwargs) 148 | 149 | 150 | def gpt2_xl(**kwargs): 151 | model_kwargs = dict(hidden_size=1600, depth=48, num_heads=16, **kwargs) 152 | return _create_gpt_model(**model_kwargs) 153 | 154 | 155 | def gpt2_2B(**kwargs): 156 | model_kwargs = dict(hidden_size=2048, depth=40, num_heads=16, **kwargs) 157 | return _create_gpt_model(**model_kwargs) 158 | 159 | 160 | def gpt2_3B(**kwargs): 161 | model_kwargs = dict(hidden_size=2304, depth=48, num_heads=16, **kwargs) 162 | return _create_gpt_model(**model_kwargs) 163 | 164 | 165 | def gpt2_4B(**kwargs): 166 | model_kwargs = dict(hidden_size=2304, depth=64, num_heads=16, **kwargs) 167 | return _create_gpt_model(**model_kwargs) 168 | 169 | 170 | def gpt2_6B(**kwargs): 171 | model_kwargs = dict(hidden_size=4096, depth=30, num_heads=16, **kwargs) 172 | return _create_gpt_model(**model_kwargs) 173 | 174 | 175 | def gpt2_8B(**kwargs): 176 | model_kwargs = dict(hidden_size=3072, depth=72, num_heads=24, **kwargs) 177 | return _create_gpt_model(**model_kwargs) 178 | 179 | 180 | def gpt2_12B(**kwargs): 181 | model_kwargs = dict(hidden_size=4096, depth=60, num_heads=16, **kwargs) 182 | return _create_gpt_model(**model_kwargs) 183 | 184 | 185 | def gpt2_15B(**kwargs): 186 | model_kwargs = dict(hidden_size=4096, depth=78, num_heads=16, **kwargs) 187 | return _create_gpt_model(**model_kwargs) 188 | 189 | 190 | def gpt2_18B(**kwargs): 191 | model_kwargs = dict(hidden_size=4096, depth=90, num_heads=16, **kwargs) 192 | return _create_gpt_model(**model_kwargs) 193 | 194 | 195 | def gpt2_20B(**kwargs): 196 | model_kwargs = dict(hidden_size=8192, depth=25, num_heads=16, **kwargs) 197 | return _create_gpt_model(**model_kwargs) 198 | 199 | 200 | def gpt2_24B(**kwargs): 201 | model_kwargs = dict(hidden_size=8192, depth=30, num_heads=16, **kwargs) 202 | return _create_gpt_model(**model_kwargs) 203 | 204 | 205 | def gpt2_28B(**kwargs): 206 | model_kwargs = dict(hidden_size=8192, depth=35, num_heads=16, **kwargs) 207 | return _create_gpt_model(**model_kwargs) 208 | 209 | 210 | def gpt2_32B(**kwargs): 211 | model_kwargs = dict(hidden_size=8192, depth=40, 
num_heads=16, **kwargs) 212 | return _create_gpt_model(**model_kwargs) 213 | 214 | 215 | def gpt2_36B(**kwargs): 216 | model_kwargs = dict(hidden_size=8192, depth=45, num_heads=16, **kwargs) 217 | return _create_gpt_model(**model_kwargs) 218 | 219 | 220 | def gpt2_40B(**kwargs): 221 | model_kwargs = dict(hidden_size=8192, depth=50, num_heads=16, **kwargs) 222 | return _create_gpt_model(**model_kwargs) 223 | 224 | 225 | def gpt3(**kwargs): 226 | model_kwargs = dict(hidden_size=12288, depth=96, num_heads=96, **kwargs) 227 | return _create_gpt_model(**model_kwargs) 228 | -------------------------------------------------------------------------------- /titans/dataloader/bert/bert_pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is adapted from 3 | """ 4 | 5 | import logging 6 | import os 7 | import torch 8 | import transformers 9 | import torch.distributed as dist 10 | 11 | try: 12 | from lddl.utils import get_all_parquets_under, get_all_bin_ids, get_file_paths_for_bin_id 13 | from lddl.torch.dataloader import DataLoader 14 | from .parquet_dataset import ParquetDataset 15 | from lddl.torch.bert import _decode_record_batch, _to_encoded_inputs, _mask_tokens, BertPretrainBinned 16 | from lddl.torch.log import DatasetLogger 17 | from lddl.torch.utils import get_node_rank, get_nproc_per_node 18 | except ImportError: 19 | raise ImportError('lddl is required for BERT pretraining but not found, ' 20 | 'you can install lddl by pip install git+https://github.com/NVIDIA/DeepLearningExamples.git#subdirectory=Tools/lddl') 21 | 22 | 23 | class BertPretrainDataset(ParquetDataset): 24 | 25 | def _decode_record_batch(self, b): 26 | return _decode_record_batch(b) 27 | 28 | 29 | def get_bert_pretrain_data_loader( 30 | path, 31 | shuffle_buffer_size=16384, 32 | shuffle_buffer_warmup_factor=16, 33 | tokenizer_class=transformers.BertTokenizerFast, 34 | vocab_file=None, 35 | tokenizer_kwargs={}, 36 | data_loader_class=DataLoader, 37 | data_loader_kwargs={}, 38 | mlm_probability=0.15, 39 | base_seed=12345, 40 | log_dir=None, 41 | log_level=logging.INFO, 42 | return_raw_samples=False, 43 | start_epoch=0, 44 | sequence_length_alignment=8, 45 | ignore_index=-1, 46 | process_group=None, 47 | ): 48 | """Gets a PyTorch DataLoader for the BERT pretraining task. 49 | 50 | The LDDL DataLoader can be used in the same way as a normal PyTorch 51 | DataLoader. The 'persistent_workers' attribute will always be enabled. 52 | 53 | The LDDL DataLoader streams samples from disk into memory, and uses a shuffle 54 | buffer to perform shuffling: at each iteration, a random sample from the 55 | shuffle buffer is popped, and a new sample is pushed into the shuffle buffer 56 | at this vacant location. 57 | 58 | Args: 59 | path: A string of the path pointing to the directory that contains the 60 | pretraining dataset in the format of balanced parquet shards. 61 | process_group: The torch.distributed process group over which the shards are 62 | distributed; the rank of the current pretraining process is derived from it. 63 | shuffle_buffer_size: The size of the shuffle buffer. 64 | shuffle_buffer_warmup_factor: At the beginning, the shuffle buffer is empty. 65 | Therefore, in order to fill the shuffle buffer, at each iteration, more 66 | samples need to be pushed into the shuffle buffer than being popped out 67 | of. This factor indicates how many samples are pushed into the shuffle 68 | buffer per 1 sample being popped out of the shuffle buffer, until the 69 | shuffle buffer is full.
70 | tokenizer_class: The HuggingFace tokenizer class for BERT pretraining. 71 | vocab_file: The path to a vocab file, or the name of a pretrained model 72 | registered on huggingface.co (e.g., 'bert-large-uncased') of which the 73 | vocab file is downloaded. 74 | tokenizer_kwargs: The arguments to the tokenizer class. 75 | data_loader_class: The class of the DataLoader. 76 | data_loader_kwargs: The arguments to the DataLoader class. 77 | mlm_probability: The probability for masking tokens in the masked language 78 | modeling task (in BERT pretraining). 79 | base_seed: A base seed value on which other seeds used in the DataLoader are 80 | based. 81 | log_dir: The path to a directory to store the logs from the LDDL DataLoader. 82 | log_level: The logging verbose level. 83 | return_raw_samples: If True, returns the raw string pairs instead of token 84 | indices. 85 | start_epoch: The epoch number to start from. An epoch is defined as going 86 | through every sample in a dataset once. 87 | sequence_length_alignment: To get the input tensors of token indices, each 88 | sequence in a batch will only be padded to the longest sequence in this 89 | batch. However, certain hardware features might prefer the shapes of the 90 | input tensors to meet certain conditions. For example, it's better for the 91 | Tensor Core on NVIDIA GPUs if the dimensions of the input tensors are 92 | divisible by 8. Therefore, this argument is an alignment factor such that 93 | the sequences in a batch will be padded to the first sequence length 94 | larger than the longest sequence in this batch and also divisible by this 95 | alignment factor. 96 | ignore_index: The label value for the unmasked tokens in the language 97 | modeling task (in BERT pretraining). 98 | 99 | Returns: 100 | A PyTorch DataLoader that, in each iteration, yield: 101 | - If return_raw_samples is False, a dict of 5 key-value pairs which are the 102 | necessary input for BERT pretraining: 103 | { 104 | 'input_ids': a torch.Tensor of size [batch_size, sequence_length], 105 | 'token_type_ids': a torch.Tensor of size [batch_size, sequence_length], 106 | 'attention_mask': a torch.Tensor of size [batch_size, sequence_length], 107 | 'labels': a torch.Tensor of size [batch_size, sequence_length], 108 | 'next_sentence_labels': a torch.Tensor of size [batch_size], 109 | } 110 | - If return_raw_samples is True, a list of the following lists: 111 | [ 112 | strings of the first sequences in the sequence pairs, 113 | strings of the second sequences in the sequence pairs, 114 | bools that indicate whether the second sequences are the next sequences 115 | for the first sequences, 116 | numpy.ndarrays of positions of the masked tokens for the masked language 117 | modeling task (only exists if static masking is enabled), 118 | strings of space-seperated labels of the masked tokens for the masked 119 | language modeling task (only exists if static masking is enabled), 120 | ] 121 | 122 | Examples: 123 | train_dataloader = lddl.torch.get_bert_pretrain_data_loader( 124 | input_dir, 125 | local_rank=local_rank, 126 | vocab_file=vocab_file, 127 | data_loader_kwargs={ 128 | 'batch_size': batch_size, 129 | 'num_workers': num_workers, 130 | 'pin_memory': True, 131 | }, 132 | log_level=logging.WARNING, 133 | start_epoch=start_epoch, 134 | ) 135 | 136 | for epoch in range(start_epoch, start_epoch + epochs): 137 | for i, batch in enumerate(train_dataloader): 138 | prediction_scores, seq_relationship_score = model( 139 | input_ids=batch['input_ids'].to(device), 140 | 
token_type_ids=batch['token_type_ids'].to(device), 141 | attention_mask=batch['attention_mask'].to(device), 142 | ) 143 | loss = criterion( 144 | prediction_scores, 145 | seq_relationship_score, 146 | batch['labels'].to(device), 147 | batch['next_sentence_labels'].to(device), 148 | ) 149 | ... 150 | """ 151 | assert isinstance(path, str) 152 | assert isinstance(shuffle_buffer_size, int) and shuffle_buffer_size > 0 153 | assert (isinstance(shuffle_buffer_warmup_factor, int) and 154 | shuffle_buffer_warmup_factor > 0) 155 | assert tokenizer_class in { 156 | transformers.BertTokenizerFast, transformers.BertTokenizer 157 | } 158 | assert isinstance(vocab_file, str) 159 | assert isinstance(tokenizer_kwargs, dict) 160 | assert data_loader_class in {DataLoader} 161 | assert isinstance(data_loader_kwargs, dict) 162 | assert isinstance(mlm_probability, (int, float)) and 0 <= mlm_probability <= 1 163 | assert isinstance(base_seed, int) 164 | assert log_dir is None or isinstance(log_dir, str) 165 | assert log_level in { 166 | logging.NOTSET, logging.DEBUG, logging.INFO, logging.WARNING, 167 | logging.ERROR, logging.CRITICAL 168 | } 169 | assert isinstance(return_raw_samples, bool) 170 | assert isinstance(start_epoch, int) 171 | 172 | local_rank = dist.get_rank(process_group) 173 | 174 | if os.path.isfile(vocab_file): 175 | tokenizer = tokenizer_class(vocab_file, **tokenizer_kwargs) 176 | else: 177 | tokenizer = tokenizer_class.from_pretrained(vocab_file, **tokenizer_kwargs) 178 | 179 | def _batch_preprocess(batch): 180 | with torch.no_grad(): 181 | encoded_inputs = _to_encoded_inputs( 182 | batch, 183 | tokenizer, 184 | sequence_length_alignment=sequence_length_alignment, 185 | ignore_index=ignore_index, 186 | ) 187 | if 'special_tokens_mask' in encoded_inputs: # Dynamic masking. 188 | special_tokens_mask = encoded_inputs.pop('special_tokens_mask', None) 189 | (encoded_inputs['input_ids'], encoded_inputs['labels']) = _mask_tokens( 190 | encoded_inputs['input_ids'], 191 | special_tokens_mask=special_tokens_mask, 192 | tokenizer=tokenizer, 193 | mlm_probability=mlm_probability, 194 | ignore_index=ignore_index, 195 | ) 196 | return encoded_inputs 197 | 198 | logger = DatasetLogger( 199 | log_dir=log_dir, 200 | node_rank=get_node_rank(nproc_per_node=get_nproc_per_node(local_rank)), 201 | local_rank=local_rank, 202 | log_level=log_level, 203 | ) 204 | 205 | dataset_kwargs = { 206 | 'shuffle_buffer_size': shuffle_buffer_size, 207 | 'shuffle_buffer_warmup_factor': shuffle_buffer_warmup_factor, 208 | 'base_seed': base_seed, 209 | 'logger': logger, 210 | 'start_epoch': start_epoch, 211 | 'process_group': process_group 212 | } 213 | 214 | extra_collate = data_loader_kwargs.get('collate_fn', lambda x: x) 215 | if not return_raw_samples: 216 | data_loader_kwargs['collate_fn'] = lambda batch: extra_collate( 217 | _batch_preprocess(batch)) 218 | 219 | # Find all the parquet file paths and figure out whether it is binned or 220 | # un-binned. 
221 | all_file_paths = get_all_parquets_under(path) 222 | bin_ids = get_all_bin_ids(all_file_paths) 223 | if len(bin_ids) > 0: 224 | data_loader = BertPretrainBinned( 225 | [ 226 | data_loader_class( 227 | BertPretrainDataset( 228 | get_file_paths_for_bin_id(all_file_paths, bin_id), 229 | **dataset_kwargs, 230 | ), 231 | **data_loader_kwargs, 232 | ) for bin_id in bin_ids 233 | ], 234 | base_seed=base_seed, 235 | start_epoch=start_epoch, 236 | logger=logger, 237 | ) 238 | else: # un-binned 239 | data_loader = data_loader_class( 240 | BertPretrainDataset(all_file_paths, **dataset_kwargs), 241 | **data_loader_kwargs, 242 | ) 243 | 244 | return data_loader 245 | --------------------------------------------------------------------------------
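A hedged usage sketch of get_bert_pretrain_data_loader as defined above: unlike the upstream lddl example quoted in its docstring, this variant takes a process_group argument instead of local_rank and derives the rank from it. The import path, shard directory, vocab name, batch size and the torchrun/NCCL launch are illustrative assumptions, not taken from this repository.

import logging

import torch.distributed as dist

from titans.dataloader.bert.bert_pretrain import get_bert_pretrain_data_loader  # assumed import path

# Assumes a torchrun-style launch; the NCCL backend implies one GPU per process.
dist.init_process_group(backend='nccl')

train_dataloader = get_bert_pretrain_data_loader(
    '/path/to/balanced/parquet/shards',   # output of the lddl preprocessor (illustrative path)
    vocab_file='bert-large-uncased',      # a vocab file path or a pretrained tokenizer name
    data_loader_kwargs={
        'batch_size': 32,
        'num_workers': 4,                 # shard count must be divisible by world_size * num_workers
        'pin_memory': True,
    },
    log_level=logging.WARNING,
    start_epoch=0,
    process_group=None,                   # None uses the default (world) process group
)

for epoch in range(2):
    for batch in train_dataloader:
        # Each batch is a dict with 'input_ids', 'token_type_ids', 'attention_mask',
        # 'labels' and 'next_sentence_labels', as documented in the docstring above.
        pass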