├── tests ├── __init__.py ├── utils │ ├── __init__.py │ └── dist_test.py ├── test_decorator │ └── test_no_support.py ├── test_layer │ ├── test_mlp │ │ ├── test_vit_moe_mlp.py │ │ ├── test_transformer_mlp.py │ │ ├── test_vit_mlp.py │ │ └── test_detr_mlp.py │ ├── test_embedding │ │ ├── test_gpt_embedding.py │ │ └── test_vit_embedding.py │ ├── test_head │ │ ├── test_vit_head.py │ │ └── test_gpt_head.py │ ├── test_block │ │ ├── test_gpt_block.py │ │ ├── test_deepnet_block.py │ │ ├── test_vit_block.py │ │ └── test_detr_block.py │ └── test_attention │ │ └── test_transformer_attention.py ├── test_model │ ├── test_gpt.py │ ├── test_deepnet.py │ ├── test_vit.py │ ├── test_detr.py │ └── test_moe.py └── test_dataloader │ └── test_bert_pretrain_dataloader.py ├── version.txt ├── titans ├── dataloader │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ └── rand_augment.py │ ├── cifar10 │ │ ├── __init__.py │ │ └── torchvision_cifar10.py │ ├── imagenet │ │ ├── __init__.py │ │ └── torchvision_imagenet.py │ └── bert │ │ ├── __init__.py │ │ ├── parquet_dataset.py │ │ └── bert_pretrain.py ├── model │ ├── vilt │ │ ├── __init__.py │ │ └── vilt.py │ ├── transformer │ │ ├── __init__.py │ │ └── transformer.py │ ├── detr │ │ ├── __init__.py │ │ └── detr.py │ ├── gpt │ │ ├── __init__.py │ │ └── gpt.py │ ├── vit │ │ ├── __init__.py │ │ └── vit.py │ ├── deepnet │ │ ├── __init__.py │ │ └── deepnet.py │ ├── knowledge_graph_embedding │ │ ├── dataloader │ │ │ ├── __init__.py │ │ │ └── dataloader.py │ │ └── __init__.py │ ├── moe │ │ ├── __init__.py │ │ ├── util.py │ │ ├── widenet.py │ │ ├── vit_moe.py │ │ └── gpt_moe.py │ ├── __init__.py │ └── helper.py ├── loss │ ├── mlm_loss │ │ ├── __init__.py │ │ └── mlm_loss.py │ ├── lm_loss │ │ ├── __init__.py │ │ └── gpt_lmloss.py │ ├── embedding_loss │ │ ├── __init__.py │ │ └── embedding_loss.py │ ├── vocab_cross_entropy │ │ ├── __init__.py │ │ └── vocab_cross_entropy.py │ └── __init__.py ├── layer │ ├── batchnorm │ │ ├── __init__.py │ │ └── frozen_batchnorm_2d.py │ ├── head │ │ ├── __init__.py │ │ ├── gpt_lm_head.py │ │ └── vit_head.py │ ├── embedding │ │ ├── __init__.py │ │ ├── gpt_embedding.py │ │ └── vit_embedding.py │ ├── block │ │ ├── utils.py │ │ ├── __init__.py │ │ ├── transformer_encoder.py │ │ ├── vit_block.py │ │ ├── deepnet_block.py │ │ ├── transformer_decoder.py │ │ ├── detr_block.py │ │ └── gpt_block.py │ ├── mlp │ │ ├── __init__.py │ │ ├── detr_mlp.py │ │ ├── vit_moe_mlp.py │ │ ├── vit_mlp.py │ │ └── transformer_mlp.py │ ├── __init__.py │ ├── attention │ │ ├── __init__.py │ │ ├── vit_moe_attention.py │ │ ├── transformer_attention.py │ │ ├── vit_attention.py │ │ ├── detr_attention.py │ │ └── gpt_attention.py │ └── init_rules.py ├── __init__.py ├── decorator │ ├── __init__.py │ └── no_support.py └── utils │ ├── __init__.py │ ├── utils.py │ ├── context.py │ └── tensor_parallel_data_split.py ├── requirements ├── requirements-test.txt └── requirements.txt ├── MANIFEST.in ├── .style.yapf ├── README.md ├── .pre-commit-config.yaml ├── .github ├── workflows │ ├── release.yml │ ├── release_test.yml │ ├── build.yml │ └── close_inactive.yml └── ISSUE_TEMPLATE │ ├── documentation.yml │ ├── bug-report.yml │ ├── feature_request.yml │ └── proposal.yml ├── setup.py └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 0.0.7 2 | 
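The tree above shows the layout of the Titans package: reusable building blocks under titans/layer and titans/loss, full models under titans/model, dataset helpers under titans/dataloader, and a mirrored tests/ hierarchy that exercises each piece. As a rough orientation sketch only — the import paths and constructor keywords below are copied from the test files later in this listing, while the launch arguments and model sizes are illustrative placeholders:

```python
# Orientation sketch (not a file from this repo): build two of the bundled
# models after initializing a single-process ColossalAI context.
import colossalai

from titans.model.gpt import GPT
from titans.model.vit import VisionTransformer
from titans.loss.lm_loss import GPTLMLoss

# Illustrative single-process launch; the tests below spawn multiple ranks instead.
colossalai.launch(config=dict(), rank=0, world_size=1, host='localhost', port=29500)

gpt = GPT(hidden_size=32, num_heads=4)    # same keywords as tests/test_model/test_gpt.py
vit = VisionTransformer(img_size=224, patch_size=16, in_chans=3, hidden_size=32, num_heads=4)
criterion = GPTLMLoss()                   # shifted next-token loss from titans/loss/lm_loss
```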
-------------------------------------------------------------------------------- /titans/dataloader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /titans/model/vilt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /titans/model/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements/requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .dist_test import * 2 | -------------------------------------------------------------------------------- /titans/model/detr/__init__.py: -------------------------------------------------------------------------------- 1 | from .detr import * 2 | -------------------------------------------------------------------------------- /titans/model/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt import * 2 | -------------------------------------------------------------------------------- /titans/model/vit/__init__.py: -------------------------------------------------------------------------------- 1 | from .vit import * 2 | -------------------------------------------------------------------------------- /titans/model/deepnet/__init__.py: -------------------------------------------------------------------------------- 1 | from .deepnet import * 2 | -------------------------------------------------------------------------------- /titans/model/knowledge_graph_embedding/dataloader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | colossalai -------------------------------------------------------------------------------- /titans/dataloader/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .rand_augment import * 2 | -------------------------------------------------------------------------------- /titans/loss/mlm_loss/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlm_loss import MLM_loss 2 | -------------------------------------------------------------------------------- /titans/loss/lm_loss/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_lmloss import GPTLMLoss 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt README.md 2 | recursive-include requirements *.txt -------------------------------------------------------------------------------- /titans/dataloader/cifar10/__init__.py: -------------------------------------------------------------------------------- 1 | 
from .torchvision_cifar10 import * 2 | -------------------------------------------------------------------------------- /titans/loss/embedding_loss/__init__.py: -------------------------------------------------------------------------------- 1 | from .embedding_loss import embeddingLoss 2 | -------------------------------------------------------------------------------- /titans/layer/batchnorm/__init__.py: -------------------------------------------------------------------------------- 1 | from .frozen_batchnorm_2d import FrozenBatchNorm2d 2 | -------------------------------------------------------------------------------- /titans/model/knowledge_graph_embedding/__init__.py: -------------------------------------------------------------------------------- 1 | from .knowledge_graph_embedding import * 2 | -------------------------------------------------------------------------------- /titans/layer/head/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_lm_head import GPTLMHead 2 | from .vit_head import ViTHead 3 | -------------------------------------------------------------------------------- /titans/dataloader/imagenet/__init__.py: -------------------------------------------------------------------------------- 1 | from .dali_imagenet import * 2 | from .torchvision_imagenet import * -------------------------------------------------------------------------------- /titans/loss/vocab_cross_entropy/__init__.py: -------------------------------------------------------------------------------- 1 | from .vocab_cross_entropy import vocab_parallel_cross_entropy 2 | -------------------------------------------------------------------------------- /titans/layer/embedding/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_embedding import GPTEmbedding 2 | from .vit_embedding import ViTEmbedding 3 | -------------------------------------------------------------------------------- /titans/dataloader/bert/__init__.py: -------------------------------------------------------------------------------- 1 | from .bert_pretrain import get_bert_pretrain_data_loader 2 | 3 | __all__ = ['get_bert_pretrain_data_loader'] -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | based_on_style = google 3 | spaces_before_comment = 4 4 | split_before_logical_operator = true 5 | column_limit = 120 6 | -------------------------------------------------------------------------------- /titans/layer/block/utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from torch import nn 3 | 4 | 5 | def get_clones(module, N): 6 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 7 | -------------------------------------------------------------------------------- /titans/layer/mlp/__init__.py: -------------------------------------------------------------------------------- 1 | from .vit_mlp import ViTMLP 2 | from .vit_moe_mlp import MLPForMoe 3 | from .detr_mlp import DeTrMLP 4 | from .transformer_mlp import TransformerMLP 5 | -------------------------------------------------------------------------------- /titans/model/moe/__init__.py: -------------------------------------------------------------------------------- 1 | from .vit_moe import ViTMoE 2 | from .widenet import Widenet 3 | from .gpt_moe import MOEGPT, prmoe_4b, 
prmoe_16b, prmoe_25b, prmoe_29b, prmoe_31b, prmoe_51b 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ColossalAI-Models 2 | 3 | [![Made with ColossalAI](https://img.shields.io/badge/Made%20with-ColossalAI-blue)](https://github.com/hpcaitech/ColossalAI) 4 | 5 | 6 | Model zoo for ColossalAI 7 | -------------------------------------------------------------------------------- /titans/loss/__init__.py: -------------------------------------------------------------------------------- 1 | from . import embedding_loss 2 | from . import lm_loss 3 | from . import mlm_loss 4 | from . import vocab_cross_entropy 5 | 6 | __all__ = ['embedding_loss', 'lm_loss', 'mlm_loss', 'vocab_cross_entropy'] -------------------------------------------------------------------------------- /titans/__init__.py: -------------------------------------------------------------------------------- 1 | from . import layer 2 | from . import loss 3 | from . import model 4 | from . import utils 5 | from . import decorator 6 | from . import dataloader 7 | 8 | __all__ = ['layer', 'loss', 'model', 'utils', 'decorator', 'dataloader'] 9 | -------------------------------------------------------------------------------- /titans/decorator/__init__.py: -------------------------------------------------------------------------------- 1 | from .no_support import no_support, support_moe_only, support_sp_pp_only, support_tp_pp_only, no_parallel_support 2 | 3 | __all__ = ['no_support', 'support_moe_only', 'support_sp_pp_only', 'support_tp_pp_only', 'no_parallel_support'] -------------------------------------------------------------------------------- /titans/layer/__init__.py: -------------------------------------------------------------------------------- 1 | from . import attention 2 | from . import batchnorm 3 | from . import block 4 | from . import embedding 5 | from . import head 6 | from . import mlp 7 | from .init_rules import init_rules 8 | 9 | __all__ = ['attention', 'batchnorm', 'block', 'embedding', 'head', 'mlp', 'init_rules'] 10 | -------------------------------------------------------------------------------- /titans/model/__init__.py: -------------------------------------------------------------------------------- 1 | # from . import detr 2 | from . import gpt 3 | from . import knowledge_graph_embedding 4 | from . import moe 5 | from . import transformer 6 | from . import vilt 7 | from . 
import vit 8 | 9 | __all__ = ['detr', 'gpt', 'knowledge_graph_embedding', 'moe', 'transformer', 'vilt', 'vit'] 10 | -------------------------------------------------------------------------------- /titans/layer/attention/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_attention import GPTSelfAttention 2 | from .detr_attention import DeTrCrossAttention 3 | from .vit_attention import ViTSelfAttention 4 | from .vit_moe_attention import SelfAttentionForMoe 5 | from .transformer_attention import TransformerSelfAttention, TransformerMultiHeadAttention 6 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/mirrors-yapf 3 | rev: v0.32.0 4 | hooks: 5 | - id: yapf 6 | args: ['--style=.style.yapf', '--parallel', '--in-place'] 7 | - repo: https://github.com/pre-commit/mirrors-clang-format 8 | rev: v13.0.1 9 | hooks: 10 | - id: clang-format 11 | -------------------------------------------------------------------------------- /titans/layer/block/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_block import GPTBlock, MOEGPTBlock 2 | from .vit_block import ViTBlock 3 | from .transformer_encoder import TransformerEncoderLayer, TransformerEncoder 4 | from .transformer_decoder import TransformerDecoderLayer, TransformerDecoder 5 | from .deepnet_block import DeepNetBlock 6 | from .detr_block import DeTrEncoder, DeTrDecoder 7 | 8 | -------------------------------------------------------------------------------- /titans/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import VocabUtility 2 | from .context import barrier_context 3 | from .tensor_parallel_data_split import split_data_3d, split_data_2d, split_data_2p5d, split_data_for_tensor_parallel 4 | 5 | __all__ = [ 6 | 'VocabUtility', 'barrier_context', 'split_data_3d', 'split_data_for_tensor_parallel', 'split_data_2d', 7 | 'split_data_2p5d' 8 | ] 9 | -------------------------------------------------------------------------------- /titans/loss/mlm_loss/mlm_loss.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.loss import * 2 | from torch.nn.modules.loss import _Loss 3 | 4 | 5 | class MLM_loss(_Loss): 6 | 7 | def __init__(self, reduction: bool = True, *args, **kwargs): 8 | super().__init__() 9 | 10 | def itm_mlm_loss(self, output): 11 | total_loss = sum([v for k, v in output.items() if "loss" in k]) 12 | return total_loss 13 | 14 | def forward(self, *args): 15 | return self.itm_mlm_loss(*args) 16 | -------------------------------------------------------------------------------- /titans/loss/lm_loss/gpt_lmloss.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from colossalai import nn as col_nn 4 | 5 | 6 | class GPTLMLoss(nn.Module): 7 | 8 | def __init__(self): 9 | super().__init__() 10 | self.loss = col_nn.CrossEntropyLoss() 11 | 12 | def forward(self, logits, labels): 13 | shift_logits = logits[..., :-1, :].contiguous() 14 | shift_labels = labels[..., 1:].contiguous() 15 | # Flatten the tokens 16 | return self.loss(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) -------------------------------------------------------------------------------- 
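GPTLMLoss above is a next-token language-modeling loss: the logits at position t are scored against the label at position t+1, which is why the last logit and the first label are dropped before flattening. A minimal single-process illustration of that shift, using plain torch.nn.CrossEntropyLoss in place of the tensor-parallel colossalai loss (all shapes and sizes here are made-up example values):

```python
# Stand-alone illustration of the logit/label shift performed by GPTLMLoss above.
import torch
import torch.nn as nn

batch_size, seq_len, vocab_size = 2, 8, 50304                   # illustrative sizes
logits = torch.randn(batch_size, seq_len, vocab_size)           # stand-in for model output
labels = torch.randint(0, vocab_size, (batch_size, seq_len))    # input token ids double as labels

# Position t predicts token t + 1: drop the last logit and the first label.
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()

loss = nn.CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
print(loss.item())
```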
/titans/layer/mlp/detr_mlp.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | from torch import nn 3 | 4 | from colossalai import nn as col_nn 5 | 6 | 7 | class DeTrMLP(nn.Module): 8 | """ Very simple multi-layer perceptron (also called FFN)""" 9 | 10 | def __init__(self, input_dim, hidden_size, output_dim, num_layers): 11 | super().__init__() 12 | self.num_layers = num_layers 13 | h = [hidden_size] * (num_layers - 1) 14 | self.layers = nn.ModuleList(col_nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 15 | 16 | def forward(self, x): 17 | for i, layer in enumerate(self.layers): 18 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 19 | return x 20 | -------------------------------------------------------------------------------- /titans/layer/head/gpt_lm_head.py: -------------------------------------------------------------------------------- 1 | from torch import dtype, nn 2 | 3 | from colossalai import nn as col_nn 4 | 5 | 6 | class GPTLMHead(nn.Module): 7 | 8 | def __init__(self, 9 | hidden_size: int, 10 | vocab_size: int, 11 | embedding_layer=None, 12 | bias: bool = False, 13 | dtype: dtype = None) -> None: 14 | super().__init__() 15 | self.dense = col_nn.Classifier(hidden_size, vocab_size, embedding_layer.word_embedding_weight, bias=bias, dtype=dtype) 16 | 17 | @property 18 | def weight(self): 19 | return self.dense.weight 20 | 21 | def forward(self, x): 22 | # the size of x before dense is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 23 | # the size of x after dense is (BATCH_SIZE, SEQ_LEN, VOCAB_SIZE) 24 | x = self.dense(x) 25 | return x 26 | -------------------------------------------------------------------------------- /titans/utils/utils.py: -------------------------------------------------------------------------------- 1 | from colossalai.nn.layer.utils import divide 2 | 3 | 4 | class VocabUtility: 5 | """Split the vocabulary into `world_size` chunks and return the 6 | first and last index of the vocabulary belonging to the `rank` 7 | partition. Note that indices are in [first, last)""" 8 | 9 | @staticmethod 10 | def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size): 11 | index_f = rank * per_partition_vocab_size 12 | index_l = index_f + per_partition_vocab_size 13 | return index_f, index_l 14 | 15 | @staticmethod 16 | def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size): 17 | per_partition_vocab_size = divide(global_vocab_size, world_size) 18 | return VocabUtility.vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size) 19 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | build-n-publish: 7 | if: github.ref_name == 'main' && github.repository == 'hpcaitech/Titans' && contains(fromJson('["FrankLeeeee", "YuliangLiu0306"]'), github.actor) 8 | name: Build and publish Python 🐍 distributions 📦 to PyPI 9 | runs-on: ubuntu-latest 10 | timeout-minutes: 20 11 | steps: 12 | - uses: actions/checkout@v2 13 | - uses: actions/setup-python@v2 14 | with: 15 | python-version: '3.7.12' 16 | - run: python setup.py sdist build 17 | # publish to PyPI if executed on the main branch 18 | # publish to Test PyPI if executed on the develop branch 19 | - name: Publish package to PyPI 20 | uses: 
pypa/gh-action-pypi-publish@release/v1 21 | with: 22 | user: __token__ 23 | password: ${{ secrets.TITANS_PYPI_TOKEN }} 24 | verbose: true -------------------------------------------------------------------------------- /.github/workflows/release_test.yml: -------------------------------------------------------------------------------- 1 | name: Publish to Test PyPI 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | build-n-publish: 7 | if: github.repository == 'hpcaitech/Titans' && contains(fromJson('["FrankLeeeee", "YuliangLiu0306"]'), github.actor) 8 | name: Build and publish Python 🐍 distributions 📦 to Test PyPI 9 | runs-on: ubuntu-latest 10 | timeout-minutes: 20 11 | steps: 12 | - uses: actions/checkout@v2 13 | - uses: actions/setup-python@v2 14 | with: 15 | python-version: '3.7.12' 16 | - run: python setup.py sdist build 17 | # publish to PyPI if executed on the main branch 18 | # publish to Test PyPI if executed on the develop branch 19 | - name: Publish package to Test PyPI 20 | uses: pypa/gh-action-pypi-publish@release/v1 21 | with: 22 | user: __token__ 23 | password: ${{ secrets.TITANS_TEST_PYPI_TOKEN }} 24 | repository_url: https://test.pypi.org/legacy/ 25 | verbose: true -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to documentation 3 | title: "[DOC] " 4 | labels: [documentation] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/Titans/issues/new). 11 | - type: textarea 12 | attributes: 13 | label: 📚 The doc issue 14 | description: | 15 | **Description** What content in the documentation is an issue? 16 | **Location** Where is the issue location? 17 | **Expectation** What is your expected content about it? 18 | **Screenshots** If applicable, add screenshots to help explain your problem. 19 | **Suggestions** Tell us how we could improve the documentation. 20 | placeholder: | 21 | A clear and concise description of the issue. 22 | validations: 23 | required: true 24 | 25 | - type: markdown 26 | attributes: 27 | value: > 28 | Thanks for contributing 🎉! 29 | -------------------------------------------------------------------------------- /titans/layer/mlp/vit_moe_mlp.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | from colossalai.utils import get_current_device 4 | 5 | 6 | class MLPForMoe(nn.Module): 7 | """FFN composed with two linear layers, also called MLP. 
8 | """ 9 | 10 | def __init__(self, 11 | hidden_size: int, 12 | d_ff: int, 13 | activation=None, 14 | drop_rate: float = 0, 15 | bias: bool = True, 16 | dropout1=None, 17 | dropout2=None): 18 | super().__init__() 19 | dense1 = nn.Linear(hidden_size, d_ff, bias, device=get_current_device()) 20 | act = nn.GELU() if activation is None else activation 21 | dense2 = nn.Linear(d_ff, hidden_size, bias, device=get_current_device()) 22 | drop1 = nn.Dropout(drop_rate) if dropout1 is None else dropout1 23 | drop2 = nn.Dropout(drop_rate) if dropout2 is None else dropout2 24 | 25 | self.ffn = nn.Sequential(dense1, act, drop1, dense2, drop2) 26 | 27 | def forward(self, x): 28 | return self.ffn(x) -------------------------------------------------------------------------------- /tests/test_decorator/test_no_support.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch.nn as nn 4 | import torch.multiprocessing as mp 5 | 6 | from colossalai.utils import free_port 7 | from functools import partial 8 | from titans.decorator import no_support 9 | 10 | CONFIG = dict(parallel=dict(tensor=dict(mode='1d', size=2))) 11 | 12 | 13 | @no_support('tp') 14 | class Net(nn.Module): 15 | 16 | def __init__(self): 17 | super().__init__() 18 | self.linear = nn.Linear(16, 16) 19 | 20 | def forward(self, x): 21 | return self.linear(x) 22 | 23 | 24 | def run_dist(rank, world_size, port): 25 | colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port) 26 | try: 27 | net = Net() 28 | except Exception as e: 29 | assert isinstance(e, AssertionError) 30 | 31 | 32 | def test_no_support(): 33 | world_size = 2 34 | run_func = partial(run_dist, world_size=world_size, port=free_port()) 35 | mp.spawn(run_func, nprocs=world_size) 36 | 37 | 38 | if __name__ == '__main__': 39 | test_no_support() 40 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | pull_request: 5 | types: [synchronize, labeled] 6 | 7 | jobs: 8 | build: 9 | name: Build and Test Colossal-AI 10 | if: | 11 | github.event.pull_request.draft == false && 12 | github.base_ref == 'main' && 13 | github.event.pull_request.base.repo.full_name == 'hpcaitech/Titans' && 14 | contains( github.event.pull_request.labels.*.name, 'Run Build and Test') 15 | runs-on: [self-hosted, gpu] 16 | container: 17 | image: frankleeeee/pytorch-cuda:1.10.1-11.3.0 18 | options: --gpus all --rm 19 | timeout-minutes: 40 20 | steps: 21 | - uses: actions/checkout@v2 22 | with: 23 | ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} 24 | - name: Install Colossal-AI 25 | run: | 26 | pip install colossalai==0.1.4+torch1.10cu11.3 -f https://release.colossalai.org 27 | pip install -v . 
28 | pip install -r requirements/requirements-test.txt 29 | - name: Unit Testing 30 | run: | 31 | mkdir tmp_test 32 | mv tests ./tmp_test 33 | cd ./tmp_test 34 | PYTHONPATH=$PWD pytest tests 35 | -------------------------------------------------------------------------------- /.github/workflows/close_inactive.yml: -------------------------------------------------------------------------------- 1 | name: Close inactive issues 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 * * *" 6 | 7 | jobs: 8 | close-issues: 9 | if: github.event.pull_request.draft == false && github.event.pull_request.base.repo.full_name == 'hpcaitech/Titans' && github.base_ref == 'main' 10 | runs-on: ubuntu-latest 11 | permissions: 12 | issues: write 13 | pull-requests: write 14 | steps: 15 | - uses: actions/stale@v3 16 | with: 17 | days-before-issue-stale: 14 18 | days-before-issue-close: -1 19 | stale-issue-label: "stale" 20 | stale-issue-message: "This issue is stale because it has been open for 14 days with no activity." 21 | # close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." 22 | days-before-pr-stale: 14 23 | days-before-pr-close: -1 24 | stale-pr-message: "This PR is stale because it has been open for 14 days with no activity." 25 | # close-pr-message: "This PR was closed because it has been inactive for 14 days since being marked as stale." 26 | repo-token: ${{ secrets.GITHUB_TOKEN }} -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yml: -------------------------------------------------------------------------------- 1 | name: 🐛 Bug Report 2 | description: Create a report to help us reproduce and fix the bug 3 | title: "[BUG] " 4 | labels: [bug] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/Titans/issues/new). 11 | - type: textarea 12 | attributes: 13 | label: 🐛 Describe the bug 14 | description: | 15 | **Describe the bug** 16 | A clear and concise description of what the bug is. 17 | **To Reproduce** 18 | Steps or code snippet to reproduce the behavior. 19 | **Expected behavior** 20 | A clear and concise description of what you expected to happen. 21 | **Screenshots** 22 | If applicable, add screenshots to help explain your problem. 23 | placeholder: | 24 | A clear and concise description of what the bug is. 25 | validations: 26 | required: true 27 | - type: textarea 28 | attributes: 29 | label: Environment 30 | description: | 31 | Please provide the environment information, eg. CUDA/cuDNN/NCCL/Python/PyTorch version. 32 | 33 | - type: markdown 34 | attributes: 35 | value: > 36 | Thanks for contributing 🎉! 37 | -------------------------------------------------------------------------------- /titans/model/helper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from colossalai.nn.layer import WrappedDropPath as DropPath 4 | from colossalai.nn.layer.utils import CheckpointModule 5 | 6 | 7 | class TransformerLayer(CheckpointModule): 8 | """Transformer layer builder. 
9 | """ 10 | 11 | def __init__(self, 12 | att: nn.Module, 13 | ffn: nn.Module, 14 | norm1: nn.Module, 15 | norm2: nn.Module, 16 | droppath=None, 17 | droppath_rate: float = 0, 18 | checkpoint: bool = False): 19 | super().__init__(checkpoint=checkpoint) 20 | self.att = att 21 | self.ffn = ffn 22 | self.norm1 = norm1 23 | self.norm2 = norm2 24 | self.droppath = DropPath(droppath_rate) if droppath is None else droppath 25 | 26 | def _forward(self, x, y): 27 | x1 = x + self.droppath(self.att(self.norm1(x))) 28 | x2 = self.ffn(self.norm2(x1)) 29 | 30 | if isinstance(x2, tuple): 31 | x, z = x2 32 | y = y + z 33 | else: 34 | x = x2 35 | 36 | x = x1 + self.droppath(x) 37 | return x, y 38 | -------------------------------------------------------------------------------- /titans/model/moe/util.py: -------------------------------------------------------------------------------- 1 | from colossalai.context import ParallelMode 2 | from colossalai.nn.layer import WrappedDropout as Dropout 3 | 4 | 5 | def moe_sa_args(hidden_size: int, 6 | n_heads: int, 7 | d_kv: int, 8 | attention_drop: float = 0, 9 | drop_rate: float = 0, 10 | bias: bool = True): 11 | """This is an example for args in moe self attention, since lots of modules should be 12 | adapted before putting them in experts. 13 | """ 14 | dropout1 = Dropout(attention_drop, mode=ParallelMode.TENSOR) 15 | dropout2 = Dropout(drop_rate, mode=ParallelMode.TENSOR) 16 | return dict(hidden_size=hidden_size, n_heads=n_heads, d_kv=d_kv, bias=bias, dropout1=dropout1, dropout2=dropout2) 17 | 18 | 19 | def moe_mlp_args(hidden_size: int, d_ff: int, drop_rate: float, bias: bool = True): 20 | """This is an example for args of MLP in Experts, since lots of modules should be adapted 21 | before putting them in experts. 22 | """ 23 | dropout1 = Dropout(drop_rate, mode=ParallelMode.TENSOR) 24 | dropout2 = Dropout(drop_rate, mode=ParallelMode.TENSOR) 25 | return dict(hidden_size=hidden_size, d_ff=d_ff, bias=bias, dropout1=dropout1, dropout2=dropout2) 26 | -------------------------------------------------------------------------------- /titans/dataloader/cifar10/torchvision_cifar10.py: -------------------------------------------------------------------------------- 1 | import os 2 | from colossalai.utils import get_dataloader 3 | 4 | from torchvision import transforms 5 | from torchvision.datasets import CIFAR10 6 | 7 | 8 | def build_cifar(batch_size, root, padding=None, pad_if_needed=False, crop=224, resize=224): 9 | transform_train = transforms.Compose([ 10 | transforms.RandomCrop(crop, padding=padding, pad_if_needed=pad_if_needed), 11 | transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10), 12 | transforms.ToTensor(), 13 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 14 | ]) 15 | transform_test = transforms.Compose([ 16 | transforms.Resize(resize), 17 | transforms.ToTensor(), 18 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 19 | ]) 20 | 21 | train_dataset = CIFAR10(root=root, train=True, download=True, transform=transform_train) 22 | test_dataset = CIFAR10(root=root, train=False, transform=transform_test) 23 | train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=batch_size, pin_memory=True) 24 | test_dataloader = get_dataloader(dataset=test_dataset, batch_size=batch_size, pin_memory=True) 25 | return train_dataloader, test_dataloader 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: 
-------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Suggest an idea for this project 3 | title: "[FEATURE] " 4 | labels: [enhancement] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: > 10 | #### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/Titans/issues/new). 11 | - type: textarea 12 | attributes: 13 | label: Describe the feature 14 | description: | 15 | **Is your feature request related to a problem? Please describe.** 16 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 17 | **Describe the solution you'd like** 18 | A clear and concise description of what you want to happen. 19 | **Describe alternatives you've considered** 20 | A clear and concise description of any alternative solutions or features you've considered. 21 | **Screenshots** 22 | If applicable, add screenshots to help explain your problem. 23 | **Suggest a potential alternative/fix** 24 | Tell us how we could improve this project. 25 | placeholder: | 26 | A clear and concise description of your idea. 27 | validations: 28 | required: true 29 | 30 | - type: markdown 31 | attributes: 32 | value: > 33 | Thanks for contributing 🎉! 34 | -------------------------------------------------------------------------------- /tests/test_layer/test_mlp/test_vit_moe_mlp.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | from titans.layer.mlp import MLPForMoe 6 | from titans.utils import split_data_for_tensor_parallel 7 | from colossalai.global_variables import tensor_parallel_env as tp_env 8 | from colossalai.testing import rerun_if_address_is_in_use 9 | from tests.utils import run_with_moe_config 10 | 11 | BATCH_SIZE = 4 12 | SEQ_LENGTH = 16 13 | HIDDEN_SIZE = 32 14 | D_FF = 4 * 32 15 | 16 | 17 | def run_moe_mlp(data, hidden_size, d_ff): 18 | 19 | #build model 20 | model = MLPForMoe(hidden_size=hidden_size, d_ff=d_ff).cuda() 21 | 22 | # forward 23 | out = model(data) 24 | 25 | # backward 26 | out.mean().backward() 27 | 28 | 29 | def run_dist(rank, world_size, port, config): 30 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 31 | 32 | if tp_env.mode == 'sequence': 33 | tp_env.mode = None 34 | 35 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 36 | data = split_data_for_tensor_parallel(data) 37 | run_moe_mlp(data, HIDDEN_SIZE, D_FF) 38 | 39 | 40 | @rerun_if_address_is_in_use() 41 | def test_moe_mlp(): 42 | run_with_moe_config(4, run_func=run_dist) 43 | -------------------------------------------------------------------------------- /tests/test_model/test_gpt.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.model.gpt import GPT 6 | from colossalai.global_variables import tensor_parallel_env as tp_env 7 | from colossalai.testing import rerun_if_address_is_in_use 8 | from tests.utils import run_with_parallel_config 9 | 10 | BATCH_SIZE = 4 11 | SEQ_LENGHT = 16 12 | HIDDEN_SIZE = 32 13 | NUM_HEADS = 4 14 | VOCAB_SIZE = 50304 15 | 16 | 17 | def run_gpt(data, hidden_size, num_heads): 18 | 19 | #build model 20 | model = GPT(hidden_size=hidden_size, num_heads=num_heads).cuda() 21 | 22 | # forward 23 | out = model(data) 24 | 25 | # backward 26 | out.mean().backward() 27 | 28 | 29 | def 
run_dist(rank, world_size, port, config): 30 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 31 | 32 | if tp_env.mode == 'sequence': 33 | tp_env.mode = None 34 | 35 | data = torch.rand(BATCH_SIZE, SEQ_LENGHT) * VOCAB_SIZE 36 | data = data.int().cuda() 37 | run_gpt(data, HIDDEN_SIZE, NUM_HEADS) 38 | 39 | 40 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 41 | @rerun_if_address_is_in_use() 42 | def test_gpt(parallel_config): 43 | run_with_parallel_config(*parallel_config, run_func=run_dist) 44 | -------------------------------------------------------------------------------- /tests/test_model/test_deepnet.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.model.deepnet import DeepNet 6 | from colossalai.global_variables import tensor_parallel_env as tp_env 7 | from colossalai.testing import rerun_if_address_is_in_use 8 | from tests.utils import run_with_parallel_config 9 | 10 | BATCH_SIZE = 4 11 | SEQ_LENGHT = 16 12 | HIDDEN_SIZE = 32 13 | NUM_HEADS = 4 14 | VOCAB_SIZE = 50304 15 | 16 | 17 | def run_deepnet(data, hidden_size, num_heads): 18 | 19 | #build model 20 | model = DeepNet(hidden_size=hidden_size, num_heads=num_heads).cuda() 21 | 22 | # forward 23 | out = model(data) 24 | 25 | # backward 26 | out.mean().backward() 27 | 28 | 29 | def run_dist(rank, world_size, port, config): 30 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 31 | 32 | if tp_env.mode == 'sequence': 33 | tp_env.mode = None 34 | 35 | data = torch.rand(BATCH_SIZE, SEQ_LENGHT) * VOCAB_SIZE 36 | data = data.int().cuda() 37 | run_deepnet(data, HIDDEN_SIZE, NUM_HEADS) 38 | 39 | 40 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 41 | @rerun_if_address_is_in_use() 42 | def test_deepnet(parallel_config): 43 | run_with_parallel_config(*parallel_config, run_func=run_dist) 44 | -------------------------------------------------------------------------------- /tests/test_layer/test_embedding/test_gpt_embedding.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.layer.embedding import GPTEmbedding 6 | from colossalai.global_variables import tensor_parallel_env as tp_env 7 | from colossalai.testing import rerun_if_address_is_in_use 8 | from tests.utils import run_with_parallel_config 9 | 10 | BATCH_SIZE = 4 11 | SEQ_LENGHT = 16 12 | HIDDEN_SIZE = 32 13 | VOCAB_SIZE = 50304 14 | 15 | 16 | def run_gpt_embed(data, hidden_size, vocab_size): 17 | 18 | #build model 19 | model = GPTEmbedding(embedding_dim=hidden_size, vocab_size=vocab_size, max_position_embeddings=1024).cuda() 20 | 21 | # forward 22 | out = model(data) 23 | 24 | # backward 25 | out.mean().backward() 26 | 27 | 28 | def run_dist(rank, world_size, port, config): 29 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 30 | 31 | if tp_env.mode == 'sequence': 32 | tp_env.mode = None 33 | 34 | data = torch.rand(BATCH_SIZE, SEQ_LENGHT) * VOCAB_SIZE 35 | data = data.int().cuda() 36 | run_gpt_embed(data, HIDDEN_SIZE, VOCAB_SIZE) 37 | 38 | 39 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 40 | @rerun_if_address_is_in_use() 41 | def 
test_gpt_embedding(parallel_config): 42 | run_with_parallel_config(*parallel_config, run_func=run_dist) 43 | -------------------------------------------------------------------------------- /titans/layer/init_rules.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from colossalai import nn as col_nn 4 | from torch import nn 5 | 6 | init_rules = dict( 7 | torch=dict( 8 | embed=dict( 9 | weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)), 10 | bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1), 11 | position_embed_initializer=col_nn.init.zeros_(), 12 | ), 13 | transformer=dict( 14 | weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)), 15 | bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1), 16 | ), 17 | head=dict( 18 | weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)), 19 | bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1), 20 | ), 21 | ), 22 | jax=dict( 23 | embed=dict( 24 | weight_initializer=col_nn.init.lecun_normal_(), 25 | bias_initializer=col_nn.init.zeros_(), 26 | position_embed_initializer=col_nn.init.trunc_normal_(std=.02), 27 | ), 28 | transformer=dict( 29 | weight_initializer=col_nn.init.xavier_uniform_(), 30 | bias_initializer=col_nn.init.normal_(std=1e-6), 31 | ), 32 | head=dict( 33 | weight_initializer=col_nn.init.zeros_(), 34 | bias_initializer=col_nn.init.zeros_(), 35 | ), 36 | ), 37 | ) -------------------------------------------------------------------------------- /tests/test_layer/test_head/test_vit_head.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.layer.head import ViTHead 6 | from titans.utils import split_data_for_tensor_parallel 7 | from colossalai.global_variables import tensor_parallel_env as tp_env 8 | from colossalai.testing import rerun_if_address_is_in_use 9 | from tests.utils import run_with_parallel_config 10 | 11 | BATCH_SIZE = 4 12 | MIDDLE_DIM = 80 13 | NUM_CLASSES = 10 14 | HIDDEN_SIZE = 32 15 | 16 | 17 | def run_vit_head(data, hidden_size, num_classes): 18 | 19 | #build model 20 | model = ViTHead(hidden_size=hidden_size, num_classes=num_classes).cuda() 21 | 22 | # forward 23 | out = model(data) 24 | 25 | # backward 26 | out.mean().backward() 27 | 28 | 29 | def run_dist(rank, world_size, port, config): 30 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 31 | 32 | if tp_env.mode == 'sequence': 33 | tp_env.mode = None 34 | 35 | data = torch.rand(BATCH_SIZE, MIDDLE_DIM, HIDDEN_SIZE).cuda() 36 | data = split_data_for_tensor_parallel(data) 37 | run_vit_head(data, HIDDEN_SIZE, NUM_CLASSES) 38 | 39 | 40 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 41 | @rerun_if_address_is_in_use() 42 | def test_vit_head(parallel_config): 43 | run_with_parallel_config(*parallel_config, run_func=run_dist) 44 | -------------------------------------------------------------------------------- /tests/test_layer/test_mlp/test_transformer_mlp.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from titans.layer.mlp import TransformerMLP 7 | from titans.utils import split_data_for_tensor_parallel 8 | from colossalai.global_variables import tensor_parallel_env as tp_env 9 | from colossalai.testing import 
rerun_if_address_is_in_use 10 | from tests.utils import run_with_parallel_config 11 | 12 | BATCH_SIZE = 4 13 | SEQ_LENGTH = 16 14 | HIDDEN_SIZE = 32 15 | 16 | 17 | def run_transformer_mlp(data, hidden_size): 18 | 19 | #build model 20 | model = TransformerMLP(hidden_size=hidden_size, mlp_ratio=4).cuda() 21 | 22 | # forward 23 | out = model(data) 24 | 25 | # backward 26 | out.mean().backward() 27 | 28 | 29 | def run_dist(rank, world_size, port, config): 30 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 31 | 32 | if tp_env.mode == 'sequence': 33 | tp_env.mode = None 34 | 35 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 36 | data = split_data_for_tensor_parallel(data) 37 | run_transformer_mlp(data, HIDDEN_SIZE) 38 | 39 | 40 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 41 | @rerun_if_address_is_in_use() 42 | def test_transformer_mlp(parallel_config): 43 | run_with_parallel_config(*parallel_config, run_func=run_dist) 44 | -------------------------------------------------------------------------------- /tests/test_layer/test_mlp/test_vit_mlp.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from titans.layer.mlp import ViTMLP 7 | from titans.utils import split_data_for_tensor_parallel 8 | from colossalai.global_variables import tensor_parallel_env as tp_env 9 | from colossalai.testing import rerun_if_address_is_in_use 10 | from tests.utils import run_with_parallel_config 11 | 12 | BATCH_SIZE = 4 13 | SEQ_LENGTH = 16 14 | HIDDEN_SIZE = 32 15 | 16 | 17 | def run_vit_mlp(data, hidden_size): 18 | 19 | #build model 20 | model = ViTMLP(hidden_size=hidden_size, mlp_ratio=4, activation=F.gelu, dropout=0.0).cuda() 21 | 22 | # forward 23 | out = model(data) 24 | 25 | # backward 26 | out.mean().backward() 27 | 28 | 29 | def run_dist(rank, world_size, port, config): 30 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 31 | 32 | if tp_env.mode == 'sequence': 33 | tp_env.mode = None 34 | 35 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 36 | data = split_data_for_tensor_parallel(data) 37 | run_vit_mlp(data, HIDDEN_SIZE) 38 | 39 | 40 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 41 | @rerun_if_address_is_in_use() 42 | def test_transformer_mlp(parallel_config): 43 | run_with_parallel_config(*parallel_config, run_func=run_dist) 44 | -------------------------------------------------------------------------------- /tests/test_layer/test_mlp/test_detr_mlp.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from titans.layer.mlp import DeTrMLP 7 | from titans.utils import split_data_for_tensor_parallel 8 | from colossalai.global_variables import tensor_parallel_env as tp_env 9 | from colossalai.testing import rerun_if_address_is_in_use 10 | from tests.utils import run_with_parallel_config 11 | 12 | BATCH_SIZE = 4 13 | SEQ_LENGTH = 16 14 | HIDDEN_SIZE = 32 15 | 16 | 17 | def run_detr_mlp(data, hidden_size): 18 | 19 | #build model 20 | model = DeTrMLP(input_dim=hidden_size, hidden_size=4*hidden_size, output_dim=hidden_size, num_layers=1).cuda() 21 | 22 | # forward 23 | out = model(data) 24 | 25 | # 
backward 26 | out.mean().backward() 27 | 28 | 29 | def run_dist(rank, world_size, port, config): 30 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 31 | 32 | if tp_env.mode == 'sequence': 33 | tp_env.mode = None 34 | 35 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 36 | data = split_data_for_tensor_parallel(data) 37 | run_detr_mlp(data, HIDDEN_SIZE) 38 | 39 | 40 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 41 | @rerun_if_address_is_in_use() 42 | def test_transformer_mlp(parallel_config): 43 | run_with_parallel_config(*parallel_config, run_func=run_dist) 44 | -------------------------------------------------------------------------------- /titans/utils/context.py: -------------------------------------------------------------------------------- 1 | import torch.distributed as dist 2 | from colossalai.context import ParallelMode 3 | from colossalai.core import global_context as gpc 4 | 5 | 6 | class barrier_context(): 7 | """ 8 | This context manager is used to allow one process to execute while blocking all 9 | other processes in the same process group. This is often useful when downloading is required 10 | as we only want to download in one process to prevent file corruption. 11 | 12 | Args: 13 | executor_rank (int): the process rank to execute without blocking, all other processes will be blocked 14 | parallel_mode (ParallelMode): the parallel mode corresponding to a process group 15 | 16 | Usage: 17 | with barrier_context(): 18 | dataset = CIFAR10(root='./data', download=True) 19 | """ 20 | 21 | def __init__(self, executor_rank: int = 0, parallel_mode: ParallelMode = ParallelMode.GLOBAL): 22 | # the class name is lowercase by convention 23 | current_rank = gpc.get_local_rank(parallel_mode=parallel_mode) 24 | self.should_block = current_rank != executor_rank 25 | self.group = gpc.get_group(parallel_mode=parallel_mode) 26 | 27 | def __enter__(self): 28 | if self.should_block: 29 | dist.barrier(group=self.group) 30 | 31 | def __exit__(self, exc_type, exc_value, exc_traceback): 32 | if not self.should_block: 33 | dist.barrier(group=self.group) 34 | -------------------------------------------------------------------------------- /tests/test_layer/test_block/test_gpt_block.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from titans.layer.block import GPTBlock 7 | from titans.utils import split_data_for_tensor_parallel 8 | from colossalai.testing import rerun_if_address_is_in_use 9 | from tests.utils import run_with_parallel_config 10 | from colossalai.global_variables import tensor_parallel_env as tp_env 11 | 12 | BATCH_SIZE = 4 13 | SEQ_LENGTH = 16 14 | NUM_HEADS = 4 15 | HIDDEN_SIZE = 32 16 | 17 | 18 | def run_gpt_block(data, hidden_size, num_heads): 19 | 20 | #build model 21 | model = GPTBlock(hidden_size=hidden_size, num_heads=num_heads, mlp_ratio=4.0, activation=F.gelu).cuda() 22 | 23 | # forward 24 | out, _ = model(data) 25 | 26 | # backward 27 | out.mean().backward() 28 | 29 | 30 | def run_dist(rank, world_size, port, config): 31 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 32 | 33 | if tp_env.mode == 'sequence': 34 | tp_env.mode = None 35 | 36 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 37 | data = split_data_for_tensor_parallel(data) 38 | 
run_gpt_block(data, HIDDEN_SIZE, NUM_HEADS) 39 | 40 | 41 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 42 | @rerun_if_address_is_in_use() 43 | def test_gpt_block(parallel_config): 44 | run_with_parallel_config(*parallel_config, run_func=run_dist) 45 | -------------------------------------------------------------------------------- /tests/test_layer/test_block/test_deepnet_block.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from titans.layer.block import DeepNetBlock 7 | from titans.utils import split_data_for_tensor_parallel 8 | from colossalai.testing import rerun_if_address_is_in_use 9 | from tests.utils import run_with_parallel_config 10 | from colossalai.global_variables import tensor_parallel_env as tp_env 11 | 12 | BATCH_SIZE = 4 13 | SEQ_LENGTH = 16 14 | NUM_HEADS = 4 15 | HIDDEN_SIZE = 32 16 | 17 | 18 | def run_deepnet_block(data, hidden_size, num_heads): 19 | 20 | #build model 21 | model = DeepNetBlock(hidden_size=hidden_size, num_heads=num_heads, mlp_ratio=4.0, activation=F.gelu).cuda() 22 | 23 | # forward 24 | out, _ = model(data) 25 | 26 | # backward 27 | out.mean().backward() 28 | 29 | 30 | def run_dist(rank, world_size, port, config): 31 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 32 | 33 | if tp_env.mode == 'sequence': 34 | tp_env.mode = None 35 | 36 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 37 | data = split_data_for_tensor_parallel(data) 38 | run_deepnet_block(data, HIDDEN_SIZE, NUM_HEADS) 39 | 40 | 41 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 42 | @rerun_if_address_is_in_use() 43 | def test_deepnet_block(parallel_config): 44 | run_with_parallel_config(*parallel_config, run_func=run_dist) 45 | -------------------------------------------------------------------------------- /tests/utils/dist_test.py: -------------------------------------------------------------------------------- 1 | import torch.multiprocessing as mp 2 | from colossalai.utils import free_port 3 | from functools import partial 4 | 5 | 6 | def run_with_parallel_config(world_size, parallel_mode, run_func): 7 | """ 8 | A wrapper function to reuse the same code snippet in layer/model testing. 9 | 10 | Args: 11 | world_size (int): the number of processes to launch 12 | parallel_mode (str): the parallelism method used 13 | run_func (Callable): the function to launch multiple processes, must have world_size, port and config as arguments. 14 | """ 15 | 16 | port = free_port() 17 | 18 | config = dict(parallel=dict(tensor=dict(size=world_size, mode=parallel_mode))) 19 | 20 | if parallel_mode == '2.5d': 21 | config['parallel']['tensor']['depth'] = world_size // 4 22 | 23 | run_func = partial(run_func, world_size=world_size, port=port, config=config) 24 | mp.spawn(run_func, nprocs=world_size) 25 | 26 | 27 | def run_with_moe_config(world_size, run_func): 28 | """ 29 | A wrapper function to reuse the same code snippet in layer/model testing. 30 | 31 | Args: 32 | world_size (int): the number of processes to launch 33 | run_func (Callable): the function to launch multiple processes, must have world_size, port and config as arguments. 
34 | """ 35 | 36 | port = free_port() 37 | 38 | config = dict() 39 | 40 | run_func = partial(run_func, world_size=world_size, port=port, config=config) 41 | mp.spawn(run_func, nprocs=world_size) -------------------------------------------------------------------------------- /titans/dataloader/imagenet/torchvision_imagenet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torchvision 3 | import torch.nn as nn 4 | import torch.optim as optim 5 | from colossalai.utils import get_dataloader 6 | import torchvision.transforms as transforms 7 | 8 | 9 | def build_imagenet(batch_size, root, crop=224, resize=256): 10 | transform_train = transforms.Compose([ 11 | transforms.RandomResizedCrop(crop, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.)), 12 | transforms.RandomHorizontalFlip(), 13 | transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.IMAGENET), 14 | transforms.ToTensor(), 15 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 16 | ]) 17 | transform_test = transforms.Compose([ 18 | transforms.Resize(resize), 19 | transforms.CenterCrop(crop), 20 | transforms.ToTensor(), 21 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 22 | ]) 23 | train_path = os.path.join(root, "train") 24 | test_path = os.path.join(root, "test") 25 | train_dataset = torchvision.datasets.ImageFolder(root=train_path, transform=transform_train) 26 | test_dataset = torchvision.datasets.ImageFolder(root=test_path, transform=transform_test) 27 | train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=batch_size, pin_memory=True) 28 | test_dataloader = get_dataloader(dataset=test_dataset, batch_size=batch_size, pin_memory=True) 29 | return train_dataloader, test_dataloader 30 | -------------------------------------------------------------------------------- /tests/test_layer/test_embedding/test_vit_embedding.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.layer.embedding import ViTEmbedding 6 | from colossalai.global_variables import tensor_parallel_env as tp_env 7 | from colossalai.testing import rerun_if_address_is_in_use 8 | from tests.utils import run_with_parallel_config 9 | 10 | BATCH_SIZE = 4 11 | IMAGE_SIZE = 224 12 | PATCH_SIZE = 16 13 | IN_CHANS = 3 14 | HIDDEN_SIZE = 32 15 | 16 | 17 | def run_vit_embed(data, img_size, patch_size, in_chans, hidden_size): 18 | 19 | #build model 20 | model = ViTEmbedding(img_size=img_size, 21 | patch_size=patch_size, 22 | in_chans=in_chans, 23 | embedding_dim=hidden_size, 24 | dropout=0.0).cuda() 25 | 26 | # forward 27 | out = model(data) 28 | 29 | # backward 30 | out.mean().backward() 31 | 32 | 33 | def run_dist(rank, world_size, port, config): 34 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 35 | 36 | if tp_env.mode == 'sequence': 37 | tp_env.mode = None 38 | 39 | data = torch.rand(BATCH_SIZE, IN_CHANS, IMAGE_SIZE, IMAGE_SIZE).cuda() 40 | run_vit_embed(data, IMAGE_SIZE, PATCH_SIZE, IN_CHANS, HIDDEN_SIZE) 41 | 42 | 43 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 44 | @rerun_if_address_is_in_use() 45 | def test_vit_embedding(parallel_config): 46 | run_with_parallel_config(*parallel_config, run_func=run_dist) 47 | --------------------------------------------------------------------------------
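build_imagenet above and build_cifar in titans/dataloader/cifar10 follow the same pattern: compose torchvision transforms, wrap the datasets with ColossalAI's get_dataloader, and return a (train, test) pair. A hypothetical usage sketch — it assumes colossalai.launch(...) has already set up the process group, and the batch size, root path, and crop/resize values are placeholders — combining build_cifar with the barrier_context helper from titans/utils so that a single rank performs the CIFAR-10 download first:

```python
# Hypothetical usage of the dataloader builders; not taken from the repo.
from titans.dataloader.cifar10 import build_cifar
from titans.utils import barrier_context

with barrier_context():    # let one rank download CIFAR-10 before the others proceed
    train_loader, test_loader = build_cifar(batch_size=128, root='./data', crop=32, resize=32)

for images, labels in train_loader:
    break    # one batch of (image, label) tensors, ready to feed a model such as titans' ViT
```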
/tests/test_model/test_vit.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.model.vit import VisionTransformer 6 | from colossalai.global_variables import tensor_parallel_env as tp_env 7 | from colossalai.testing import rerun_if_address_is_in_use 8 | from tests.utils import run_with_parallel_config 9 | 10 | BATCH_SIZE = 4 11 | IMAGE_SIZE = 224 12 | PATCH_SIZE = 16 13 | NUM_HEADS = 4 14 | IN_CHANS = 3 15 | HIDDEN_SIZE = 32 16 | 17 | 18 | def run_vit(data, img_size, patch_size, in_chans, hidden_size, num_heads): 19 | 20 | #build model 21 | model = VisionTransformer(img_size=img_size, 22 | patch_size=patch_size, 23 | in_chans=in_chans, 24 | hidden_size=hidden_size, 25 | num_heads=num_heads).cuda() 26 | 27 | # forward 28 | out = model(data) 29 | 30 | # backward 31 | out.mean().backward() 32 | 33 | 34 | def run_dist(rank, world_size, port, config): 35 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 36 | 37 | if tp_env.mode == 'sequence': 38 | tp_env.mode = None 39 | 40 | data = torch.rand(BATCH_SIZE, IN_CHANS, IMAGE_SIZE, IMAGE_SIZE).cuda() 41 | run_vit(data, IMAGE_SIZE, PATCH_SIZE, IN_CHANS, HIDDEN_SIZE, NUM_HEADS) 42 | 43 | 44 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 45 | @rerun_if_address_is_in_use() 46 | def test_vit(parallel_config): 47 | run_with_parallel_config(*parallel_config, run_func=run_dist) 48 | -------------------------------------------------------------------------------- /tests/test_layer/test_block/test_vit_block.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | import torch.multiprocessing as mp 5 | import torch.nn.functional as F 6 | 7 | from titans.layer.block import ViTBlock 8 | from titans.utils import split_data_for_tensor_parallel 9 | from colossalai.utils import free_port 10 | from colossalai.nn.layer.utils import divide 11 | from colossalai import nn as col_nn 12 | from functools import partial 13 | from colossalai.global_variables import tensor_parallel_env as tp_env 14 | from colossalai.testing import rerun_if_address_is_in_use 15 | from tests.utils import run_with_parallel_config 16 | 17 | BATCH_SIZE = 4 18 | SEQ_LENGTH = 16 19 | NUM_HEADS = 4 20 | HIDDEN_SIZE = 32 21 | 22 | 23 | def run_vit_block(data, hidden_size, num_heads): 24 | 25 | #build model 26 | model = ViTBlock(hidden_size=hidden_size, num_heads=num_heads, mlp_ratio=4, activation=F.gelu).cuda() 27 | 28 | # forward 29 | out = model(data) 30 | 31 | # backward 32 | out.mean().backward() 33 | 34 | 35 | def run_dist(rank, world_size, port, config): 36 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 37 | 38 | if tp_env.mode == 'sequence': 39 | tp_env.mode = None 40 | 41 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 42 | data = split_data_for_tensor_parallel(data) 43 | run_vit_block(data, HIDDEN_SIZE, NUM_HEADS) 44 | 45 | 46 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 47 | @rerun_if_address_is_in_use() 48 | def test_vit_block(parallel_config): 49 | run_with_parallel_config(*parallel_config, run_func=run_dist) 50 | -------------------------------------------------------------------------------- /titans/layer/batchnorm/frozen_batchnorm_2d.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from typing import Dict 4 | 5 | 6 | class FrozenBatchNorm2d(torch.nn.Module): 7 | """ 8 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 9 | 10 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 11 | without which any other models than torchvision.models.resnet[18,34,50,101] 12 | produce nans. 13 | """ 14 | 15 | def __init__(self, n): 16 | super(FrozenBatchNorm2d, self).__init__() 17 | self.register_buffer("weight", torch.ones(n)) 18 | self.register_buffer("bias", torch.zeros(n)) 19 | self.register_buffer("running_mean", torch.zeros(n)) 20 | self.register_buffer("running_var", torch.ones(n)) 21 | 22 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, 23 | error_msgs): 24 | num_batches_tracked_key = prefix + 'num_batches_tracked' 25 | if num_batches_tracked_key in state_dict: 26 | del state_dict[num_batches_tracked_key] 27 | 28 | super(FrozenBatchNorm2d, self)._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, 29 | unexpected_keys, error_msgs) 30 | 31 | def forward(self, x): 32 | w = self.weight.reshape(1, -1, 1, 1) 33 | b = self.bias.reshape(1, -1, 1, 1) 34 | rv = self.running_var.reshape(1, -1, 1, 1) 35 | rm = self.running_mean.reshape(1, -1, 1, 1) 36 | eps = 1e-5 37 | scale = w * (rv + eps).rsqrt() 38 | bias = b - rm * scale 39 | return x * scale + bias 40 | -------------------------------------------------------------------------------- /titans/layer/block/transformer_encoder.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from colossalai import nn as col_nn 3 | 4 | from titans.layer.attention import TransformerMultiHeadAttention 5 | from titans.layer.mlp import TransformerMLP 6 | from .utils import get_clones 7 | 8 | 9 | class TransformerEncoderLayer(nn.Module): 10 | 11 | def __init__(self, hidden_size, nhead, dim_feedforward=2048, dropout=0.1): 12 | super().__init__() 13 | self.selfAttn = TransformerMultiHeadAttention(hidden_size, dim_feedforward, nhead, dropout) 14 | self.feedForward = TransformerMLP(hidden_size, dim_feedforward, dropout) 15 | 16 | self.norm_1 = col_nn.LayerNorm(hidden_size) 17 | self.norm_2 = col_nn.LayerNorm(hidden_size) 18 | self.dropout_1 = col_nn.Dropout(dropout) 19 | self.dropout_2 = col_nn.Dropout(dropout) 20 | 21 | def forward(self, x): 22 | x1 = self.norm_1(x) 23 | x = x + self.dropout_1(self.selfAttn(x1, x1, x1)) 24 | x2 = self.norm_2(x) 25 | out = x + self.dropout_2(self.feedForward(x2)) 26 | return out 27 | 28 | 29 | class TransformerEncoder(nn.Module): 30 | 31 | def __init__(self, encoder_layer, num_layers, norm=None): 32 | super().__init__() 33 | self.layers = get_clones(encoder_layer, num_layers) 34 | self.num_layers = num_layers 35 | self.norm = norm 36 | 37 | def forward(self, src, pos): 38 | output = src if pos is None else (src + pos) 39 | output = output.transpose(0, 1) 40 | 41 | for layer in self.layers: 42 | output = layer(output) 43 | 44 | if self.norm is not None: 45 | output = self.norm(output) 46 | 47 | return output 48 | -------------------------------------------------------------------------------- /tests/test_dataloader/test_bert_pretrain_dataloader.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import os 3 | import pytest 4 | import torch.multiprocessing as mp 5 | from 
colossalai.context.parallel_mode import ParallelMode 6 | from colossalai.utils import free_port 7 | from colossalai.core import global_context as gpc 8 | from functools import partial 9 | 10 | try: 11 | from titans.dataloader.bert import get_bert_pretrain_data_loader 12 | except: 13 | # to bypass pytest 14 | get_bert_pretrain_data_loader = None 15 | 16 | 17 | def load_data(rank, world_size, port): 18 | CONFIG = dict( 19 | parallel=dict( 20 | tensor=dict(size=2, mode='1d') 21 | ) 22 | ) 23 | DATA_PATH = os.environ['PARQUET_PATH'] 24 | 25 | colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, port=port, host='localhost') 26 | 27 | dataloader = get_bert_pretrain_data_loader( 28 | path=DATA_PATH, 29 | vocab_file='bert-large-uncased', 30 | local_rank=rank, 31 | process_group=gpc.get_group(ParallelMode.DATA), 32 | data_loader_kwargs={ 33 | 'batch_size': 16, 34 | # 'num_workers': 4, 35 | # 'persistent_workers': True, 36 | # 'pin_memory': True, 37 | }, 38 | ) 39 | 40 | for _ in dataloader: 41 | break 42 | 43 | gpc.destroy() 44 | 45 | 46 | @pytest.mark.skip('This test should be manually invoked as the dataset is too large') 47 | def test_bert_pretrain_dataloader(): 48 | world_size = 4 49 | port = free_port() 50 | run_func = partial(load_data, world_size=world_size, port=port) 51 | mp.spawn(run_func, nprocs=world_size) 52 | 53 | 54 | if __name__ == '__main__': 55 | test_bert_pretrain_dataloader() 56 | -------------------------------------------------------------------------------- /tests/test_model/test_detr.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.model.detr import DeTr 6 | from colossalai.global_variables import tensor_parallel_env as tp_env 7 | from colossalai.testing import rerun_if_address_is_in_use 8 | from tests.utils import run_with_parallel_config 9 | 10 | BATCH_SIZE = 1 11 | HEIGHT = 800 12 | WIDTH = 1200 13 | PATCH_SIZE = 16 14 | NUM_HEADS = 4 15 | IN_CHANS = 3 16 | HIDDEN_SIZE = 256 17 | NUM_ENCODER_LAYER = 6 18 | NUM_DECODER_LAYER = 6 19 | 20 | 21 | def run_detr(data, img_size, patch_size, in_chans, hidden_size, num_heads, num_encoder_layer, num_decoder_layer): 22 | 23 | #build model 24 | model = DeTr(img_size=img_size, 25 | patch_size=patch_size, 26 | in_chans=in_chans, 27 | hidden_size=hidden_size, 28 | num_heads=num_heads, 29 | num_encoder_layer=num_encoder_layer, 30 | num_decoder_layer=num_decoder_layer).cuda() 31 | 32 | # forward 33 | out = model(data) 34 | 35 | # backward 36 | out.mean().backward() 37 | 38 | 39 | def run_dist(rank, world_size, port, config): 40 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 41 | 42 | if tp_env.mode == 'sequence': 43 | tp_env.mode = None 44 | 45 | data = torch.rand(BATCH_SIZE, IN_CHANS, HEIGHT, WIDTH).cuda() 46 | run_detr(data, 224, PATCH_SIZE, IN_CHANS, HIDDEN_SIZE, NUM_HEADS, NUM_ENCODER_LAYER, NUM_DECODER_LAYER) 47 | 48 | 49 | @pytest.mark.parametrize('parallel_config', [(4, '1d')]) 50 | @rerun_if_address_is_in_use() 51 | def test_detr(parallel_config): 52 | run_with_parallel_config(*parallel_config, run_func=run_dist) 53 | -------------------------------------------------------------------------------- /tests/test_layer/test_head/test_gpt_head.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.layer.embedding import GPTEmbedding 6 | from 
titans.layer.head import GPTLMHead 7 | from titans.utils import split_data_for_tensor_parallel 8 | from colossalai.global_variables import tensor_parallel_env as tp_env 9 | from colossalai.testing import rerun_if_address_is_in_use 10 | from tests.utils import run_with_parallel_config 11 | 12 | BATCH_SIZE = 4 13 | SEQ_LENGTH = 256 14 | VOCAB_SIZE = 50304 15 | HIDDEN_SIZE = 32 16 | 17 | 18 | def run_gpt_head(data, hidden_size, vocab_size): 19 | 20 | #build model 21 | embedding_layer = GPTEmbedding(embedding_dim=hidden_size, vocab_size=vocab_size, 22 | max_position_embeddings=1024).cuda() 23 | model = GPTLMHead(hidden_size=hidden_size, vocab_size=vocab_size, embedding_layer=embedding_layer).cuda() 24 | 25 | # forward 26 | out = model(data) 27 | 28 | # backward 29 | out.mean().backward() 30 | 31 | 32 | def run_dist(rank, world_size, port, config): 33 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 34 | 35 | if tp_env.mode == 'sequence': 36 | tp_env.mode = None 37 | 38 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 39 | data = split_data_for_tensor_parallel(data) 40 | run_gpt_head(data, HIDDEN_SIZE, VOCAB_SIZE) 41 | 42 | 43 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 44 | @rerun_if_address_is_in_use() 45 | def test_gpt_head(parallel_config): 46 | run_with_parallel_config(*parallel_config, run_func=run_dist) 47 | 48 | 49 | if __name__ == "__main__": 50 | test_gpt_head((4, '1d')) 51 | -------------------------------------------------------------------------------- /titans/layer/head/vit_head.py: -------------------------------------------------------------------------------- 1 | from torch import dtype, nn 2 | 3 | from colossalai import nn as col_nn 4 | from ..init_rules import init_rules 5 | 6 | 7 | class ViTHead(nn.Module): 8 | 9 | def __init__(self, 10 | hidden_size: int, 11 | num_classes: int, 12 | representation_size: int = None, 13 | dtype: dtype = None, 14 | bias: bool = True, 15 | init_method: str = 'torch'): 16 | super().__init__() 17 | if representation_size: 18 | self.representation = col_nn.Linear(hidden_size, 19 | representation_size, 20 | bias=bias, 21 | dtype=dtype, 22 | **init_rules[init_method]['head']) 23 | else: 24 | self.representation = None 25 | representation_size = hidden_size 26 | 27 | self.dense = col_nn.Classifier(representation_size, 28 | num_classes, 29 | dtype=dtype, 30 | bias=bias, 31 | **init_rules[init_method]['head']) 32 | 33 | def forward(self, x): 34 | # the size of x is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 35 | x = x[:, 0] 36 | # the size of x is (BATCH_SIZE, HIDDEN_SIZE) 37 | if self.representation is not None: 38 | x = self.representation(x) 39 | # the size of x after representation is (BATCH_SIZE, REPRESENTATION_SIZE) 40 | x = self.dense(x) 41 | # the size of x after dense is (BATCH_SIZE, NUM_CLASSES) 42 | return x 43 | -------------------------------------------------------------------------------- /titans/layer/mlp/vit_mlp.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | from torch import dtype, nn 4 | 5 | from colossalai import nn as col_nn 6 | from ..init_rules import init_rules 7 | 8 | 9 | class ViTMLP(nn.Module): 10 | 11 | def __init__(self, 12 | hidden_size: int, 13 | mlp_ratio: int, 14 | activation: Callable, 15 | dropout: float, 16 | dtype: dtype = None, 17 | bias: bool = True, 18 | init_method: str = 'torch'): 19 | super().__init__() 20 | 
self.dense_1 = col_nn.Linear(hidden_size, 21 | mlp_ratio * hidden_size, 22 | dtype=dtype, 23 | bias=bias, 24 | **init_rules[init_method]['transformer']) 25 | self.activation = activation 26 | self.dropout_1 = col_nn.Dropout(dropout) 27 | self.dense_2 = col_nn.Linear(mlp_ratio * hidden_size, 28 | hidden_size, 29 | dtype=dtype, 30 | bias=bias, 31 | **init_rules[init_method]['transformer']) 32 | self.dropout_2 = col_nn.Dropout(dropout) 33 | 34 | def forward(self, x): 35 | # the size of x before dense_1 is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 36 | # the size of x after dense_1 is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE*mlp_ratio) 37 | x = self.dense_1(x) 38 | x = self.activation(x) 39 | x = self.dropout_1(x) 40 | # the size of x after dense_2 is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 41 | x = self.dense_2(x) 42 | x = self.dropout_2(x) 43 | return x 44 | -------------------------------------------------------------------------------- /titans/layer/attention/vit_moe_attention.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from colossalai.utils import get_current_device 7 | 8 | 9 | class SelfAttentionForMoe(nn.Module): 10 | """Standard ViT self attention. 11 | """ 12 | 13 | def __init__(self, 14 | hidden_size: int, 15 | n_heads: int, 16 | d_kv: int, 17 | attention_drop: float = 0, 18 | drop_rate: float = 0, 19 | bias: bool = True, 20 | dropout1=None, 21 | dropout2=None): 22 | super().__init__() 23 | self.n_heads = n_heads 24 | self.d_kv = d_kv 25 | self.scale = 1.0 / math.sqrt(self.d_kv) 26 | 27 | self.dense1 = nn.Linear(hidden_size, 3 * n_heads * d_kv, bias, device=get_current_device()) 28 | self.softmax = nn.Softmax(dim=-1) 29 | self.atten_drop = nn.Dropout(attention_drop) if dropout1 is None else dropout1 30 | self.dense2 = nn.Linear(n_heads * d_kv, hidden_size, device=get_current_device()) 31 | self.dropout = nn.Dropout(drop_rate) if dropout2 is None else dropout2 32 | 33 | def forward(self, x): 34 | qkv = self.dense1(x) 35 | new_shape = qkv.shape[:2] + (3, self.n_heads, self.d_kv) 36 | qkv = qkv.view(*new_shape) 37 | qkv = qkv.permute(2, 0, 3, 1, 4) 38 | q, k, v = qkv[:] 39 | 40 | x = torch.matmul(q, k.transpose(-2, -1)) * self.scale 41 | x = self.atten_drop(self.softmax(x)) 42 | 43 | x = torch.matmul(x, v) 44 | x = x.transpose(1, 2) 45 | new_shape = x.shape[:2] + (self.n_heads * self.d_kv,) 46 | x = x.reshape(*new_shape) 47 | x = self.dense2(x) 48 | x = self.dropout(x) 49 | 50 | return x 51 | -------------------------------------------------------------------------------- /titans/layer/attention/transformer_attention.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch import nn 5 | from colossalai import nn as col_nn 6 | from titans.decorator import no_support 7 | 8 | 9 | @no_support(['sp']) 10 | class TransformerSelfAttention(nn.Module): 11 | 12 | def __init__( 13 | self, 14 | dropout, 15 | ): 16 | super(TransformerSelfAttention, self).__init__() 17 | self.dropout = col_nn.Dropout(dropout) 18 | 19 | def forward(self, queries, keys, values): 20 | d = queries.shape[-1] 21 | scores = torch.matmul(queries, keys.transpose(-1, -2)) / math.sqrt(d) 22 | attention_weights = torch.softmax(scores, dim=2) 23 | return torch.matmul(self.dropout(attention_weights), values) 24 | 25 | 26 | @no_support(['sp']) 27 | class TransformerMultiHeadAttention(nn.Module): 28 | 29 | def __init__(self, hidden_size, num_hiddens, num_heads, 
                 dropout, bias=False):
30 |         super(TransformerMultiHeadAttention, self).__init__()
31 |         self.num_heads = num_heads
32 |         self.attention = TransformerSelfAttention(dropout)
33 |         self.W_q = col_nn.Linear(hidden_size, num_hiddens, bias=bias)
34 |         self.W_k = col_nn.Linear(hidden_size, num_hiddens, bias=bias)
35 |         self.W_v = col_nn.Linear(hidden_size, num_hiddens, bias=bias)
36 |         self.W_o = col_nn.Linear(num_hiddens, hidden_size, bias=bias)
37 | 
38 |     def forward(self, queries, keys, values):
39 |         queries = transpose_qkv(self.W_q(queries), self.num_heads)
40 |         keys = transpose_qkv(self.W_k(keys), self.num_heads)
41 |         values = transpose_qkv(self.W_v(values), self.num_heads)
42 | 
43 |         output = self.attention(queries, keys, values)
44 |         output_concat = transpose_output(output, self.num_heads)
45 |         return self.W_o(output_concat)
46 | 
47 | 
48 | # helper functions used by TransformerMultiHeadAttention above
49 | def transpose_qkv(X, num_heads):
50 |     """Split the last dimension into heads and fold the heads into the batch dimension."""
51 |     X = X.reshape(X.shape[0], X.shape[1], num_heads, -1)
52 |     X = X.permute(0, 2, 1, 3)
53 |     return X.reshape(-1, X.shape[2], X.shape[3])
54 | 
55 | 
56 | def transpose_output(X, num_heads):
57 |     """Reverse transpose_qkv: merge the per-head outputs back into the hidden dimension."""
58 |     X = X.reshape(-1, num_heads, X.shape[1], X.shape[2])
59 |     X = X.permute(0, 2, 1, 3)
60 |     return X.reshape(X.shape[0], X.shape[1], -1)
61 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/proposal.yml:
--------------------------------------------------------------------------------
1 | name: 💥 Proposal
2 | description: Propose a non-trivial change to Titans
3 | title: "[PROPOSAL] "
4 | labels: [enhancement]
5 | 
6 | body:
7 |   - type: markdown
8 |     attributes:
9 |       value: |
10 |         Common reasons for proposals include:
11 | 
12 |         - Altering the infrastructure;
13 |         - Bumping a critical dependency's major version;
14 |         - A significant improvement in user-friendliness;
15 |         - Significant refactor;
16 |         - ...
17 | 
18 |         Please note that this is not the template for feature requests or bug reports; using it for those could make us identify the issue wrongly and close it without doing anything.
19 | 
20 |         We give you maximum freedom to write an elaborate proposal illustrating why you think the change is beneficial for us, and what steps we should take to turn it into reality.
21 | 
22 | 
23 |   - type: textarea
24 |     attributes:
25 |       label: Proposal
26 |       description: A clear and concise description of what the proposal is.
27 |     validations:
28 |       required: true
29 | 
30 |   - type: checkboxes
31 |     attributes:
32 |       label: Self-service
33 |       description: |
34 |         If you feel like you could contribute to this issue, please check the box below. This would tell us and other people looking for contributions that someone's working on it.
35 |         If you do check this box, please send a pull request within 7 days after a maintainer's approval so we can still delegate this to someone else.
36 | 
37 |         Proposals usually involve significant code changes, so please reach consensus with the maintainers before rushing to implement it.
38 |         This ensures that you don't waste your time and we don't waste ours reading large diffs.
39 |       options:
40 |         - label: I'd be willing to do some initial work on this proposal myself.
41 | 
42 | 
43 |   - type: markdown
44 |     attributes:
45 |       value: >
46 |         Thanks for contributing 🎉!
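[Editor's example] Referring back to TransformerMultiHeadAttention in titans/layer/attention/transformer_attention.py above, here is a minimal self-attention usage sketch; it is not part of the repository. It assumes colossalai.launch(...) has already been called (the col_nn.Linear projections expect an initialized parallel context) and that num_hiddens is divisible by num_heads.

import torch
from titans.layer.attention import TransformerMultiHeadAttention

BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE, NUM_HEADS = 4, 16, 32, 4

# num_hiddens is the total projection width shared across the NUM_HEADS heads
attn = TransformerMultiHeadAttention(HIDDEN_SIZE, num_hiddens=HIDDEN_SIZE, num_heads=NUM_HEADS, dropout=0.1).cuda()

x = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda()
out = attn(x, x, x)    # self-attention: the same tensor serves as queries, keys and values
assert out.shape == (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)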
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import setup, find_packages 4 | 5 | # ninja build does not work unless include_dirs are abs path 6 | this_dir = os.path.dirname(os.path.abspath(__file__)) 7 | 8 | 9 | def fetch_requirements(path): 10 | with open(path, 'r') as fd: 11 | return [r.strip() for r in fd.readlines()] 12 | 13 | 14 | def fetch_readme(): 15 | with open('README.md', encoding='utf-8') as f: 16 | return f.read() 17 | 18 | 19 | def get_version(): 20 | with open('version.txt') as f: 21 | return f.read().strip() 22 | 23 | 24 | setup( 25 | name='titans', 26 | version=get_version(), 27 | packages=find_packages(exclude=( 28 | 'build', 29 | 'docker', 30 | 'tests', 31 | 'docs', 32 | 'examples', 33 | '*.egg-info', 34 | )), 35 | description='A collection of deep learning components built with Colossal-AI', 36 | long_description=fetch_readme(), 37 | long_description_content_type='text/markdown', 38 | license='Apache Software License 2.0', 39 | url='https://www.colossalai.org', 40 | project_urls={ 41 | 'Forum': 'https://github.com/hpcaitech/Titans/discussions', 42 | 'Bug Tracker': 'https://github.com/hpcaitech/Titans/issues', 43 | 'Examples': 'https://github.com/hpcaitech/ColossalAI-Examples', 44 | 'Documentation': 'http://colossalai.readthedocs.io', 45 | 'Github': 'https://github.com/hpcaitech/Titans', 46 | }, 47 | install_requires=fetch_requirements('requirements/requirements.txt'), 48 | python_requires='>=3.6', 49 | classifiers=[ 50 | 'Programming Language :: Python :: 3', 51 | 'License :: OSI Approved :: Apache Software License', 52 | 'Environment :: GPU :: NVIDIA CUDA', 53 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 54 | 'Topic :: System :: Distributed Computing', 55 | ], 56 | ) 57 | -------------------------------------------------------------------------------- /titans/loss/embedding_loss/embedding_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from colossalai.core import global_context as gpc 5 | from colossalai.context import ParallelMode 6 | 7 | 8 | class embeddingLoss(nn.Module): 9 | 10 | def forward(self, train_iterator, args, model): 11 | 12 | positive_sample, negative_sample, subsampling_weight, mode = next(train_iterator) 13 | mode = mode[0] 14 | if args.cuda: 15 | positive_sample = positive_sample.cuda() 16 | negative_sample = negative_sample.cuda() 17 | subsampling_weight = subsampling_weight.cuda() 18 | negative_score = model((positive_sample, negative_sample), mode=mode) 19 | 20 | if args.negative_adversarial_sampling: 21 | #In self-adversarial sampling, we do not apply back-propagation on the sampling weight 22 | negative_score = (F.softmax(negative_score * args.adversarial_temperature, dim=1).detach() * 23 | F.logsigmoid(-negative_score)).sum(dim=1) 24 | else: 25 | negative_score = F.logsigmoid(-negative_score).mean(dim=1) 26 | 27 | positive_score = model(positive_sample) 28 | 29 | positive_score = F.logsigmoid(positive_score).squeeze(dim=1) 30 | 31 | if args.uni_weight: 32 | positive_sample_loss = -positive_score.mean() 33 | negative_sample_loss = -negative_score.mean() 34 | else: 35 | positive_sample_loss = -(subsampling_weight * positive_score).sum() / subsampling_weight.sum() 36 | negative_sample_loss = -(subsampling_weight * negative_score).sum() / 
subsampling_weight.sum() 37 | 38 | loss = (positive_sample_loss + negative_sample_loss) / 2 39 | 40 | torch.distributed.all_reduce(loss, group=gpc.get_group(ParallelMode.GLOBAL)) 41 | 42 | return loss, positive_sample_loss, negative_sample_loss 43 | -------------------------------------------------------------------------------- /titans/model/transformer/transformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch import nn 4 | from colossalai import nn as col_nn 5 | from titans.layer.block import TransformerEncoderLayer, TransformerEncoder, \ 6 | TransformerDecoderLayer, TransformerDecoder 7 | 8 | 9 | class Transformer(nn.Module): 10 | 11 | def __init__(self, 12 | hidden_size=512, 13 | nhead=8, 14 | num_encoder_layers=6, 15 | num_decoder_layers=6, 16 | dim_feedforward=2048, 17 | dropout=0.1, 18 | return_intermediate_dec=False): 19 | super().__init__() 20 | 21 | encoder_layer = TransformerEncoderLayer(hidden_size, nhead, dim_feedforward, dropout) 22 | encoder_norm = col_nn.LayerNorm(hidden_size) 23 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) 24 | 25 | decoder_layer = TransformerDecoderLayer(hidden_size, nhead, dim_feedforward, dropout) 26 | decoder_norm = col_nn.LayerNorm(hidden_size) 27 | self.decoder = TransformerDecoder(decoder_layer, 28 | num_decoder_layers, 29 | decoder_norm, 30 | return_intermediate=return_intermediate_dec) 31 | 32 | self.hidden_size = hidden_size 33 | self.nhead = nhead 34 | 35 | def forward(self, src, mask, query_embed, pos_embed): 36 | bs, c, h, w = src.shape 37 | src = src.flatten(2).permute(2, 0, 1) 38 | pos_embed = pos_embed.flatten(2).permute(2, 0, 1) 39 | query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) 40 | # mask = mask.flatten(1) 41 | 42 | tgt = torch.zeros_like(query_embed) 43 | memory = self.encoder(src, pos=pos_embed) 44 | 45 | hs = self.decoder(tgt, memory, pos=pos_embed, query_pos=query_embed) 46 | 47 | return hs.transpose(1, 2) 48 | -------------------------------------------------------------------------------- /titans/layer/embedding/gpt_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import dtype, nn 3 | 4 | from colossalai import nn as col_nn 5 | from colossalai.utils import get_current_device 6 | 7 | 8 | class GPTEmbedding(nn.Module): 9 | 10 | def __init__(self, 11 | embedding_dim: int, 12 | vocab_size: int, 13 | max_position_embeddings: int, 14 | num_tokentypes: int = 0, 15 | padding_idx: int = None, 16 | dropout: float = 0., 17 | dtype: dtype = None) -> None: 18 | super().__init__() 19 | self.word_embeddings = col_nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx, dtype=dtype) 20 | self.position_embeddings = col_nn.Embedding(max_position_embeddings, embedding_dim, dtype=dtype) 21 | if num_tokentypes > 0: 22 | self.tokentype_embeddings = col_nn.Embedding(num_tokentypes, embedding_dim, dtype=dtype) 23 | else: 24 | self.tokentype_embeddings = None 25 | self.dropout = col_nn.Dropout(dropout) 26 | 27 | @property 28 | def word_embedding_weight(self): 29 | return self.word_embeddings.weight 30 | 31 | def forward(self, input_ids, position_ids=None, tokentype_ids=None): 32 | seq_length = input_ids.size(1) 33 | if position_ids is None: 34 | bs = input_ids.size(0) 35 | position_ids = torch.arange(seq_length, dtype=torch.long, device=get_current_device()).unsqueeze(0) 36 | position_ids = position_ids.repeat(bs, 1) 37 | # the size of input_ids is 
(BATCH_SIZE, SEQ_LEN) 38 | # the size of x after word_embeddings is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 39 | x = self.word_embeddings(input_ids) + self.position_embeddings(position_ids) 40 | if self.tokentype_embeddings is not None and tokentype_ids is not None: 41 | x = x + self.tokentype_embeddings(tokentype_ids) 42 | x = self.dropout(x) 43 | 44 | return x -------------------------------------------------------------------------------- /tests/test_layer/test_block/test_detr_block.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from titans.layer.block import DeTrEncoder, DeTrDecoder 7 | from titans.utils import split_data_for_tensor_parallel 8 | from functools import partial 9 | from colossalai.global_variables import tensor_parallel_env as tp_env 10 | from colossalai.testing import rerun_if_address_is_in_use 11 | from tests.utils import run_with_parallel_config 12 | 13 | BATCH_SIZE = 4 14 | SEQ_LENGTH = 16 15 | NUM_HEADS = 4 16 | HIDDEN_SIZE = 32 17 | 18 | 19 | def run_detr_encoder(data, hidden_size, num_heads): 20 | 21 | #build model 22 | model = DeTrEncoder(hidden_size=hidden_size, num_heads=num_heads, mlp_ratio=4, activation=F.gelu).cuda() 23 | 24 | # forward 25 | out = model(data) 26 | 27 | # backward 28 | out.mean().backward() 29 | 30 | 31 | def run_detr_decoder(data, memory, hidden_size, num_heads): 32 | 33 | #build model 34 | model = DeTrDecoder(hidden_size=hidden_size, num_heads=num_heads, mlp_ratio=4, activation=F.gelu).cuda() 35 | 36 | # forward 37 | out = model(data, memory) 38 | 39 | # backward 40 | out.mean().backward() 41 | 42 | 43 | def run_dist(rank, world_size, port, config): 44 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 45 | 46 | if tp_env.mode == 'sequence': 47 | tp_env.mode = None 48 | 49 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 50 | data = split_data_for_tensor_parallel(data) 51 | memory = torch.rand(BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE).cuda() 52 | memory = split_data_for_tensor_parallel(memory) 53 | run_detr_encoder(data, HIDDEN_SIZE, NUM_HEADS) 54 | run_detr_decoder(data, memory, HIDDEN_SIZE, NUM_HEADS) 55 | 56 | 57 | 58 | @pytest.mark.parametrize('parallel_config', [(4, '1d')]) 59 | @rerun_if_address_is_in_use() 60 | def test_detr_block(parallel_config): 61 | run_with_parallel_config(*parallel_config, run_func=run_dist) 62 | -------------------------------------------------------------------------------- /titans/layer/embedding/vit_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import dtype, nn 3 | 4 | from colossalai import nn as col_nn 5 | from ..init_rules import init_rules 6 | 7 | 8 | class ViTEmbedding(nn.Module): 9 | """ 10 | Construct the patch embeddings. 11 | 12 | Args: 13 | img_size(int): The size of images. 14 | patch_size(int): The size of patches. 15 | in_chans(int): The size of input channels. 16 | embedding_dim(int): The embedding size of patches. 17 | dropout(float): The ratio used to construct dropout modules, which indicates the percentage of parameters should be casted to zero. 18 | dtype (:class:`torch.dtype`): The dtype of parameters, defaults to None. 19 | flatten(bool): If set to ``False``, the patches will not be flatten, defaults to ``True``. 20 | init_method(str): The initializing method used in layers, defaults to `torch`. 
21 | """ 22 | 23 | def __init__(self, 24 | img_size: int, 25 | patch_size: int, 26 | in_chans: int, 27 | embedding_dim: int, 28 | dropout: float, 29 | dtype: dtype = None, 30 | flatten: bool = True, 31 | init_method: str = 'torch'): 32 | super().__init__() 33 | self.patch_embed = col_nn.PatchEmbedding(img_size, 34 | patch_size, 35 | in_chans, 36 | embedding_dim, 37 | dtype=dtype, 38 | flatten=flatten, 39 | **init_rules[init_method]['embed']) 40 | self.dropout = col_nn.Dropout(dropout) 41 | 42 | def forward(self, x): 43 | # the size of x before embed is (BATCH_SIZE, IN_CHAN, IMAGE_SIZE, IMAGE_SIZE) 44 | # the size of x after embedding is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 45 | x = self.patch_embed(x) 46 | x = self.dropout(x) 47 | return x 48 | -------------------------------------------------------------------------------- /titans/layer/mlp/transformer_mlp.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | from colossalai import nn as col_nn 5 | from typing import Callable 6 | from torch import Tensor 7 | from torch import dtype 8 | 9 | 10 | class TransformerMLP(nn.Module): 11 | """ 12 | The MLP module in the Transformer Architecture. 13 | 14 | Args: 15 | hidden_size (int): the dimension of the linear layer. 16 | mlp_ratio (int): the multiplication factor of the linear dimension, default is 4. 17 | activation (Callable): the activation function, default is None which will use GeLU. 18 | dropout_prob (float): the probability of dropout, default is 0. 19 | dtype (torch.dtype): the data type for model parameters, default is None. 20 | bias (bool): whether the linear layers have bias, default is True. 21 | """ 22 | 23 | def __init__(self, 24 | hidden_size: int, 25 | mlp_ratio: int = 4, 26 | activation: Callable = None, 27 | dropout_prob: float = 0.0, 28 | dtype: dtype = None, 29 | bias: bool = True): 30 | super().__init__() 31 | intermediate_dim = int(hidden_size * mlp_ratio) 32 | 33 | # int linear layers 34 | self.linear_1 = col_nn.Linear(hidden_size, intermediate_dim, dtype=dtype, bias=bias) 35 | self.linear_2 = col_nn.Linear(intermediate_dim, hidden_size, dtype=dtype, bias=bias) 36 | 37 | # int activation function 38 | if activation: 39 | self.activation = activation 40 | else: 41 | self.activation = F.gelu 42 | 43 | # init dropout 44 | if dropout_prob > 0: 45 | self.dropout = col_nn.Dropout(dropout_prob) 46 | else: 47 | self.dropout = None 48 | 49 | def forward(self, x: Tensor) -> Tensor: 50 | # the size of x is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 51 | # the size of intermediate_activate is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE*mlp_ratio) 52 | intermediate_activate = self.linear_1(x) 53 | intermediate_activate = self.activation(intermediate_activate) 54 | # the size of output is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 55 | output = self.linear_2(intermediate_activate) 56 | 57 | if self.dropout: 58 | output = self.dropout(output) 59 | 60 | return output 61 | -------------------------------------------------------------------------------- /titans/utils/tensor_parallel_data_split.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from colossalai.core import global_context as gpc 4 | from colossalai.context import ParallelMode 5 | from colossalai.global_variables import tensor_parallel_env as tp_env 6 | from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env 7 | from colossalai.constants import 
INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D 8 | 9 | 10 | def split_data_2d(x: Tensor) -> Tensor: 11 | """ 12 | 2D tensor parallel requries splitting the data in the first dimension and last dimension 13 | """ 14 | j = gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW) 15 | i = gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL) 16 | x = torch.chunk(x, tp_env.summa_dim, dim=0)[i] 17 | x = torch.chunk(x, tp_env.summa_dim, dim=-1)[j] 18 | return x 19 | 20 | 21 | def split_data_2p5d(x: Tensor) -> Tensor: 22 | """ 23 | 2.5D tensor parallel requries splitting the data in the first dimension and last dimension just like 2D 24 | """ 25 | i = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL) 26 | j = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW) 27 | x = torch.chunk(x, tp_env.tesseract_dim, dim=0)[i] 28 | x = torch.chunk(x, tp_env.tesseract_dim, dim=-1)[j] 29 | return x 30 | 31 | 32 | def split_data_3d(x: Tensor) -> Tensor: 33 | """ 34 | 2.5D tensor parallel requries splitting the data in the first dimension twice and last dimension once 35 | """ 36 | input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D) 37 | weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D) 38 | output_parallel_mode = get_parallel_mode_from_env(OUTPUT_GROUP_3D) 39 | 40 | j = gpc.get_local_rank(input_parallel_mode) 41 | i = gpc.get_local_rank(weight_parallel_mode) 42 | k = gpc.get_local_rank(output_parallel_mode) 43 | 44 | x = torch.chunk(x, tp_env.depth_3d, dim=0)[i] 45 | x = torch.chunk(x, tp_env.depth_3d, dim=-1)[k] 46 | x = torch.chunk(x, tp_env.depth_3d, dim=0)[j] 47 | return x 48 | 49 | 50 | def split_data_for_tensor_parallel(x: Tensor) -> Tensor: 51 | """ 52 | Split the data based on the tensor parallel environment 53 | """ 54 | 55 | if tp_env.mode == '2d': 56 | return split_data_2d(x) 57 | elif tp_env.mode == '2.5d': 58 | return split_data_2p5d(x) 59 | elif tp_env.mode == '3d': 60 | return split_data_3d(x) 61 | else: 62 | return x 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /titans/layer/block/vit_block.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Callable 3 | 4 | import torch 5 | from colossalai import nn as col_nn 6 | from colossalai.nn.layer.utils import CheckpointModule 7 | from torch import dtype, nn 8 | 9 | from titans.layer.attention import ViTSelfAttention 10 | from titans.layer.mlp import ViTMLP 11 | from titans.decorator import support_tp_pp_only 12 | 13 | 14 | @support_tp_pp_only() 15 | class ViTBlock(CheckpointModule): 16 | 17 | def __init__(self, 18 | hidden_size: int, 19 | num_heads: int, 20 | mlp_ratio: int, 21 | activation: Callable, 22 | attention_dropout: float = 0., 23 | dropout: float = 0., 24 | drop_path: float = 0., 25 | layernorm_epsilon: float = 1e-6, 26 | dtype: dtype = None, 27 | bias: bool = True, 28 | checkpoint: bool = False, 29 | init_method: str = 'torch'): 30 | super().__init__(checkpoint) 31 | self.norm1 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 32 | self.attn = ViTSelfAttention(hidden_size=hidden_size, 33 | num_heads=num_heads, 34 | attention_dropout=attention_dropout, 35 | dropout=dropout, 36 | bias=bias, 37 | dtype=dtype, 38 | init_method=init_method) 39 | self.drop_path = col_nn.DropPath(drop_path) if drop_path > 0. 
else nn.Identity() 40 | self.norm2 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 41 | self.mlp = ViTMLP(hidden_size=hidden_size, 42 | mlp_ratio=mlp_ratio, 43 | activation=activation, 44 | dropout=dropout, 45 | dtype=dtype, 46 | bias=bias, 47 | init_method=init_method) 48 | 49 | def _forward(self, x): 50 | # the size of x is (BATCH_SZIE, SEQ_LEN, HIDDEN_SIZE) 51 | x = x + self.drop_path(self.attn(self.norm1(x))) 52 | # the size of x after attn is (BATCH_SZIE, SEQ_LEN, HIDDEN_SIZE) 53 | x = x + self.drop_path(self.mlp(self.norm2(x))) 54 | # the size of x after mlp is (BATCH_SZIE, SEQ_LEN, HIDDEN_SIZE) 55 | return x 56 | -------------------------------------------------------------------------------- /tests/test_model/test_moe.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import torch 3 | 4 | from colossalai.context import MOE_CONTEXT 5 | from titans.model.moe import MOEGPT, ViTMoE, Widenet 6 | from colossalai.global_variables import tensor_parallel_env as tp_env 7 | from colossalai.testing import rerun_if_address_is_in_use 8 | from tests.utils import run_with_moe_config 9 | 10 | NUM_EXPERTS = 64 11 | BATCH_SIZE = 4 12 | IMAGE_SIZE = 224 13 | PATCH_SIZE = 16 14 | NUM_HEADS = 4 15 | IN_CHANS = 3 16 | HIDDEN_SIZE = 32 17 | 18 | SEQ_LENGHT = 16 19 | VOCAB_SIZE = 50304 20 | 21 | 22 | def run_moe_gpt(data, num_experts, hidden_size, num_heads): 23 | # build model 24 | model = MOEGPT(num_experts=num_experts, hidden_size=hidden_size, num_heads=num_heads).cuda() 25 | 26 | # forward 27 | out = model(data) 28 | 29 | # backward 30 | out.mean().backward() 31 | 32 | 33 | def run_vit_moe(data, num_experts, img_size, patch_size, in_chans, hidden_size, num_heads): 34 | # build model 35 | model = ViTMoE(num_experts=num_experts, 36 | img_size=img_size, 37 | patch_size=patch_size, 38 | in_chans=in_chans, 39 | hidden_size=hidden_size, 40 | num_heads=num_heads).cuda() 41 | 42 | # forward 43 | out = model(data) 44 | 45 | # backward 46 | out.mean().backward() 47 | 48 | 49 | def run_widenet(data, num_experts, img_size, patch_size, in_chans, hidden_size, num_heads): 50 | # build model 51 | model = Widenet(num_experts=num_experts, 52 | img_size=img_size, 53 | patch_size=patch_size, 54 | in_chans=in_chans, 55 | hidden_size=hidden_size, 56 | num_heads=num_heads).cuda() 57 | 58 | # forward 59 | out = model(data) 60 | 61 | # backward 62 | out.mean().backward() 63 | 64 | 65 | def run_dist(rank, world_size, port, config): 66 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 67 | 68 | if tp_env.mode == 'sequence': 69 | tp_env.mode = None 70 | MOE_CONTEXT.setup(42) 71 | language_data = torch.rand(BATCH_SIZE, SEQ_LENGHT) * VOCAB_SIZE 72 | language_data = language_data.int().cuda() 73 | run_moe_gpt(language_data, NUM_EXPERTS, HIDDEN_SIZE, NUM_HEADS) 74 | 75 | image_data = torch.rand(BATCH_SIZE, IN_CHANS, IMAGE_SIZE, IMAGE_SIZE).cuda() 76 | run_vit_moe(image_data, NUM_EXPERTS, IMAGE_SIZE, PATCH_SIZE, IN_CHANS, HIDDEN_SIZE, NUM_HEADS) 77 | run_widenet(image_data, NUM_EXPERTS, IMAGE_SIZE, PATCH_SIZE, IN_CHANS, HIDDEN_SIZE, NUM_HEADS) 78 | 79 | 80 | @rerun_if_address_is_in_use() 81 | def test_moe(): 82 | run_with_moe_config(4, run_func=run_dist) 83 | -------------------------------------------------------------------------------- /titans/layer/attention/vit_attention.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import 
torch 4 | from torch import dtype, nn 5 | 6 | from colossalai import nn as col_nn 7 | from ..init_rules import init_rules 8 | from titans.decorator import no_support 9 | 10 | 11 | @no_support(['sp']) 12 | class ViTSelfAttention(nn.Module): 13 | 14 | def __init__(self, 15 | hidden_size: int, 16 | num_heads: int, 17 | attention_dropout: float, 18 | dropout: float, 19 | bias: bool = True, 20 | dtype: dtype = None, 21 | init_method: str = 'torch'): 22 | super().__init__() 23 | self.attention_head_size = hidden_size // num_heads 24 | self.query_key_value = col_nn.Linear(hidden_size, 25 | 3 * hidden_size, 26 | dtype=dtype, 27 | bias=bias, 28 | **init_rules[init_method]['transformer']) 29 | self.attention_dropout = col_nn.Dropout(attention_dropout) 30 | self.dense = col_nn.Linear(hidden_size, hidden_size, dtype=dtype, bias=True, **init_rules[init_method]['transformer']) 31 | self.dropout = col_nn.Dropout(dropout) 32 | self.softmax = nn.Softmax(dim=-1) 33 | 34 | def forward(self, x): 35 | # the size of x is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 36 | # the size of qkv is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE*3) 37 | qkv = self.query_key_value(x) 38 | all_head_size = qkv.shape[-1] // 3 39 | num_attention_heads = all_head_size // self.attention_head_size 40 | new_qkv_shape = qkv.shape[:-1] + \ 41 | (num_attention_heads, 3 * self.attention_head_size) 42 | qkv = qkv.view(new_qkv_shape) 43 | qkv = qkv.permute((0, 2, 1, 3)) 44 | # the size of q is (BATCH_SZIE, NUM_HEADS, SEQ_LEN, HIDDEN_SIZE//NUM_HEADS) 45 | q, k, v = torch.chunk(qkv, 3, dim=-1) 46 | # the size of x is (BATCH_SIZE, NUM_HEADS, SEQ_LEN, SEQ_LEN) 47 | x = torch.matmul(q, k.transpose(-1, -2)) 48 | x = x / math.sqrt(self.attention_head_size) 49 | x = self.softmax(x) 50 | x = self.attention_dropout(x) 51 | 52 | # the size of x after matmul is (BATCH_SZIE, NUM_HEADS, SEQ_LEN, HIDDEN_SIZE//NUM_HEADS) 53 | x = torch.matmul(x, v) 54 | x = x.transpose(1, 2) 55 | new_context_layer_shape = x.size()[:-2] + (all_head_size,) 56 | # the size of x after reshape is (BATCH_SZIE, SEQ_LEN, HIDDEN_SIZE) 57 | x = x.reshape(new_context_layer_shape) 58 | # the size of x after dense is (BATCH_SZIE, SEQ_LEN, HIDDEN_SIZE) 59 | x = self.dense(x) 60 | x = self.dropout(x) 61 | 62 | return x 63 | -------------------------------------------------------------------------------- /titans/layer/block/deepnet_block.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn as nn, Tensor 3 | from torch import dtype 4 | from typing import Callable 5 | from colossalai import nn as col_nn 6 | from colossalai.core import global_context as gpc 7 | from colossalai.utils.activation_checkpoint import checkpoint 8 | from colossalai.nn.layer.utils import CheckpointModule 9 | from colossalai.nn.layer.base_layer import ParallelLayer 10 | from colossalai import kernel 11 | from titans.decorator import support_tp_pp_only 12 | from titans.layer.attention import GPTSelfAttention 13 | from titans.layer.mlp import TransformerMLP 14 | 15 | 16 | @support_tp_pp_only() 17 | class DeepNetBlock(CheckpointModule): 18 | 19 | def __init__(self, 20 | hidden_size: int, 21 | num_heads: int, 22 | mlp_ratio: float, 23 | activation: Callable, 24 | attention_dropout: float = 0., 25 | dropout: float = 0., 26 | alpha: float = 1.0, 27 | layernorm_epsilon: float = 1e-5, 28 | dtype: dtype = None, 29 | bias: bool = True, 30 | fuse_scale_mask_softmax: bool = False, 31 | checkpoint: bool = False, 32 | activation_offload: bool = False): 33 | 
super().__init__(checkpoint, activation_offload) 34 | self.norm1 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 35 | self.attn = GPTSelfAttention(hidden_size=hidden_size, 36 | num_heads=num_heads, 37 | attention_dropout=attention_dropout, 38 | dropout=dropout, 39 | bias=bias, 40 | fuse_scale_mask_softmax=fuse_scale_mask_softmax, 41 | dtype=dtype) 42 | self.alpha = alpha 43 | self.norm2 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 44 | self.mlp = TransformerMLP(hidden_size=hidden_size, 45 | mlp_ratio=mlp_ratio, 46 | activation=activation, 47 | dropout_prob=dropout, 48 | dtype=dtype, 49 | bias=bias) 50 | 51 | def _forward(self, x, attention_mask=None): 52 | if attention_mask is not None and attention_mask.dtype != x.dtype: 53 | attention_mask = attention_mask.to(x.dtype) 54 | 55 | residual = x 56 | x = residual * self.alpha + self.attn(x, attention_mask) 57 | x = self.norm1(x) 58 | 59 | residual = x 60 | x = residual * self.alpha + self.mlp(x) 61 | x = self.norm2(x) 62 | 63 | return x, attention_mask 64 | -------------------------------------------------------------------------------- /tests/test_layer/test_attention/test_transformer_attention.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import pytest 3 | import torch 4 | 5 | from titans.layer.attention import TransformerSelfAttention, GPTSelfAttention, ViTSelfAttention 6 | from titans.utils import split_data_for_tensor_parallel 7 | from colossalai.nn.layer.utils import divide 8 | from colossalai.testing import rerun_if_address_is_in_use 9 | from colossalai import nn as col_nn 10 | from colossalai.global_variables import tensor_parallel_env as tp_env 11 | from tests.utils import run_with_parallel_config 12 | 13 | BATCH_SIZE = 4 14 | SEQ_LENGTH = 16 15 | NUM_HEADS = 4 16 | HIDDEN_SIZE = 32 17 | 18 | 19 | def run_transformer_attention(data, hidden_size, num_heads): 20 | 21 | #build model 22 | model = TransformerSelfAttention(dropout=0.0).cuda() 23 | 24 | #process data 25 | query_key_value = col_nn.Linear(hidden_size, 3 * hidden_size) 26 | qkv = query_key_value(data) 27 | all_head_size = qkv.shape[-1] // 3 28 | attention_head_size = divide(hidden_size, num_heads) 29 | num_attention_heads = divide(all_head_size, attention_head_size) 30 | new_qkv_shape = qkv.shape[:-1] + \ 31 | (num_attention_heads, 3 * attention_head_size) 32 | qkv = qkv.view(new_qkv_shape) 33 | qkv = qkv.permute((0, 2, 1, 3)) 34 | q, k, v = torch.chunk(qkv, 3, dim=-1) 35 | 36 | # forward 37 | out = model(q, k, v) 38 | 39 | # backward 40 | out.mean().backward() 41 | 42 | 43 | def run_gpt_attention(data, hidden_size, num_heads): 44 | 45 | #build model 46 | model = GPTSelfAttention(hidden_size=hidden_size, num_heads=num_heads, attention_dropout=0.0, dropout=0.0).cuda() 47 | 48 | # forward 49 | out = model(data) 50 | 51 | # backward 52 | out.mean().backward() 53 | 54 | 55 | def run_vit_attention(data, hidden_size, num_heads): 56 | 57 | #build model 58 | model = ViTSelfAttention(hidden_size=hidden_size, num_heads=num_heads, attention_dropout=0.0, dropout=0.0).cuda() 59 | 60 | # forward 61 | out = model(data) 62 | 63 | # backward 64 | out.mean().backward() 65 | 66 | 67 | def run_dist(rank, world_size, port, config): 68 | colossalai.launch(config=config, rank=rank, world_size=world_size, port=port, host='localhost') 69 | 70 | if tp_env.mode == 'sequence': 71 | tp_env.mode = None 72 | 73 | data = torch.rand(BATCH_SIZE, SEQ_LENGTH, 
HIDDEN_SIZE).cuda() 74 | data = split_data_for_tensor_parallel(data) 75 | run_gpt_attention(data, HIDDEN_SIZE, NUM_HEADS) 76 | run_vit_attention(data, HIDDEN_SIZE, NUM_HEADS) 77 | run_transformer_attention(data, HIDDEN_SIZE, NUM_HEADS) 78 | 79 | 80 | @pytest.mark.parametrize('parallel_config', [(4, '1d'), (4, '2d'), (4, '2.5d'), (8, '2.5d'), (8, '3d')]) 81 | @rerun_if_address_is_in_use() 82 | def test_transformer_attention(parallel_config): 83 | run_with_parallel_config(*parallel_config, run_func=run_dist) 84 | -------------------------------------------------------------------------------- /titans/layer/attention/detr_attention.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch import dtype, nn 5 | 6 | from colossalai import nn as col_nn 7 | from ..init_rules import init_rules 8 | from titans.decorator import no_support 9 | # This part need to work together with the col_nn.Linear (row, col) in order to better parallelize. 10 | 11 | @no_support(['sp']) 12 | class DeTrCrossAttention(nn.Module): 13 | 14 | def __init__(self, 15 | hidden_size: int, 16 | num_heads: int, 17 | attention_dropout: float, 18 | dropout: float, 19 | bias: bool = True, 20 | dtype: dtype = None, 21 | init_method: str = 'torch'): 22 | super().__init__() 23 | self.attention_head_size = hidden_size // num_heads 24 | self.query = col_nn.Linear1D_Col(hidden_size, 25 | hidden_size, 26 | dtype=dtype, 27 | bias=bias, 28 | ) 29 | self.key_value = col_nn.Linear1D_Col(hidden_size, 30 | 2 * hidden_size, 31 | dtype=dtype, 32 | bias=bias, 33 | ) 34 | self.attention_dropout = col_nn.Dropout(attention_dropout) 35 | self.dense = col_nn.Linear1D_Row(hidden_size, hidden_size, dtype=dtype, bias=True) 36 | self.dropout = col_nn.Dropout(dropout) 37 | self.softmax = nn.Softmax(dim=-1) 38 | 39 | def forward(self, x, memory): 40 | q = self.query(x) 41 | kv = self.key_value(memory) 42 | all_head_size = kv.shape[-1] // 2 43 | num_attention_heads = all_head_size // self.attention_head_size 44 | 45 | new_q_shape = q.shape[:-1] + (num_attention_heads, self.attention_head_size) 46 | q = q.view(new_q_shape) 47 | q = q.permute((0, 2, 1, 3)) 48 | q = q.permute((2, 3, 0, 1)) # ? 49 | 50 | new_kv_shape = kv.shape[:-1] + (num_attention_heads, 2 * self.attention_head_size) 51 | kv = kv.view(new_kv_shape) 52 | kv = kv.permute((0, 2, 1, 3)) 53 | k, v = torch.chunk(kv, 2, dim=-1) 54 | k = k.permute((2, 3, 0, 1)) # ? 55 | v = v.permute((2, 3, 0, 1)) # ? 
56 | 
57 |         x = torch.matmul(q, k.transpose(-1, -2))
58 |         x = x / math.sqrt(self.attention_head_size)
59 |         x = self.softmax(x)
60 |         x = self.attention_dropout(x)
61 | 
62 |         x = torch.matmul(x, v)
63 |         x = x.transpose(1, 2)
64 |         new_context_layer_shape = x.size()[:-2] + (all_head_size,)
65 |         x = x.reshape(new_context_layer_shape)
66 |         x = x.transpose(0, 1)
67 | 
68 |         x = self.dense(x)
69 |         x = self.dropout(x)
70 | 
71 |         return x
72 | 
--------------------------------------------------------------------------------
/titans/layer/block/transformer_decoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 | from colossalai import nn as col_nn
5 | from titans.layer.attention import TransformerMultiHeadAttention
6 | from .utils import get_clones
7 | 
8 | class TransformerDecoderLayer(nn.Module):
9 | 
10 |     def __init__(self, hidden_size, nhead, dim_feedforward=2048, dropout=0.1):
11 |         super().__init__()
12 |         self.selfAttn = TransformerMultiHeadAttention(hidden_size, dim_feedforward, nhead, dropout)
13 | 
14 |         self.linear_1 = col_nn.Linear(hidden_size, dim_feedforward)
15 |         self.linear_2 = col_nn.Linear(dim_feedforward, hidden_size)
16 |         self.norm_1 = col_nn.LayerNorm(hidden_size)
17 |         self.norm_2 = col_nn.LayerNorm(hidden_size)
18 |         self.norm_3 = col_nn.LayerNorm(hidden_size)
19 |         self.dropout_1 = col_nn.Dropout(dropout)
20 |         self.dropout_2 = col_nn.Dropout(dropout)
21 |         self.dropout_3 = col_nn.Dropout(dropout)
22 |         self.dropout_4 = col_nn.Dropout(dropout)
23 | 
24 |     def with_pos_embed(self, tensor, pos):
25 |         return tensor if pos is None else tensor + pos
26 | 
27 |     def forward(self, tgt, memory, pos, query_pos):
28 |         tgt = tgt.transpose(0, 1)
29 |         query_pos = query_pos.transpose(0, 1)
30 |         pos = pos.transpose(0, 1)
31 | 
32 |         q = k = self.with_pos_embed(tgt, query_pos)
33 | 
34 |         tgt2 = self.selfAttn(q, k, tgt)
35 | 
36 |         tgt = tgt + self.dropout_1(tgt2)
37 |         tgt = self.norm_1(tgt)
38 |         tgt2 = self.selfAttn(q, self.with_pos_embed(memory, pos), memory)
39 |         tgt = tgt + self.dropout_2(tgt2)
40 |         tgt = self.norm_2(tgt)
41 |         tgt2 = self.linear_2(self.dropout_3(F.relu(self.linear_1(tgt))))
42 |         tgt = tgt + self.dropout_4(tgt2)
43 |         tgt = self.norm_3(tgt)
44 |         return tgt
45 | 
46 | 
47 | def transpose_qkv(X, num_heads):
48 |     X = X.reshape(X.shape[0], X.shape[1], num_heads, -1)
49 |     X = X.permute(0, 2, 1, 3)
50 |     return X.reshape(-1, X.shape[2], X.shape[3])
51 | 
52 | 
53 | def transpose_output(X, num_heads):
54 |     X = X.reshape(-1, num_heads, X.shape[1], X.shape[2])
55 |     X = X.permute(0, 2, 1, 3)
56 |     return X.reshape(X.shape[0], X.shape[1], -1)
57 | 
58 | 
59 | class TransformerDecoder(nn.Module):
60 | 
61 |     def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
62 |         super().__init__()
63 |         self.layers = get_clones(decoder_layer, num_layers)
64 |         self.num_layers = num_layers
65 |         self.norm = norm
66 |         self.return_intermediate = return_intermediate
67 | 
68 |     def forward(self, tgt, memory, pos, query_pos):
69 |         intermediate = []
70 | 
71 |         for layer in self.layers:
72 |             tgt = layer(tgt, memory, pos=pos, query_pos=query_pos).transpose(0, 1)
73 | 
74 |             if self.return_intermediate:
75 |                 intermediate.append(self.norm(tgt))
76 | 
77 |         return torch.stack(intermediate) if self.return_intermediate else tgt.unsqueeze(0)
78 | 
--------------------------------------------------------------------------------
/titans/decorator/no_support.py:
--------------------------------------------------------------------------------
1 | from typing import List, Union, Callable 2 | from
colossalai.global_variables import tensor_parallel_env as tp_env 3 | from colossalai.context.moe_context import MOE_CONTEXT 4 | from colossalai.core import global_context as gpc 5 | from colossalai.context import ParallelMode 6 | 7 | SUPPORTED_MODES = ['tp', 'pp', 'sp', 'moe'] 8 | 9 | 10 | def no_support(modes: Union[str, List[str]]): 11 | """ 12 | A decorator to indicate the forbidden parallel modes for the module. 13 | 14 | Args: 15 | modes (Union[str, List[str]]): the mode can only be tp (tensor parallel), 16 | pp (pipeline parallel), sp (sequence parallel), and moe (mixture-of-experts). 17 | 18 | Usage: 19 | # if this model does not support tensor parallel version 20 | @no_support('tp') 21 | class SomeModule(torch.nn.Module): 22 | ... 23 | 24 | # if this model does not support tp and pp 25 | @no_support(['tp', 'pp']) 26 | class SomeModule(torch.nn.Module): 27 | ... 28 | """ 29 | 30 | if isinstance(modes, str): 31 | assert modes in SUPPORTED_MODES, f'expected mode to be tp, pp, sp or moe, but got {modes}' 32 | modes = [modes] 33 | elif isinstance(modes, (tuple, list)): 34 | for mode in modes: 35 | assert mode in SUPPORTED_MODES, f'expected mode to be tp, pp, sp or moe, but got {mode}' 36 | else: 37 | raise TypeError(f'expected modes to be of type str or list, but got {type(modes)}') 38 | 39 | def _wrap_callable(callable_: Callable): 40 | assert hasattr(callable_, '__init__'), 'the wrapped callable must be a class' 41 | origin_init = callable_.__init__ 42 | class_name = callable_.__name__ 43 | 44 | def new_init(*args, **kwargs): 45 | if tp_env.mode is not None: 46 | assert 'tp' not in modes, f'{class_name} does not support tensor parallel implementation' 47 | 48 | if MOE_CONTEXT.is_initialized: 49 | assert 'moe' not in modes, f'{class_name} does not support MOE implementation' 50 | 51 | if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1: 52 | assert 'pp' not in modes, f'{class_name} does not support pipeline parallel implementation' 53 | 54 | if gpc.is_initialized(ParallelMode.SEQUENCE) and gpc.get_world_size(ParallelMode.SEQUENCE) > 1: 55 | assert 'sp' not in modes, f'{class_name} does not support sequence parallel implementation' 56 | 57 | origin_init(*args, **kwargs) 58 | 59 | callable_.__init__ = new_init 60 | 61 | return callable_ 62 | 63 | return _wrap_callable 64 | 65 | 66 | def support_tp_pp_only(): 67 | return no_support(['moe', 'sp']) 68 | 69 | 70 | def support_sp_pp_only(): 71 | return no_support(['moe', 'tp']) 72 | 73 | 74 | def support_moe_only(): 75 | return no_support(['tp', 'sp', 'pp']) 76 | 77 | 78 | def no_parallel_support(): 79 | return no_support(['tp', 'pp', 'sp', 'moe']) 80 | -------------------------------------------------------------------------------- /titans/model/moe/widenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from colossalai.context import ParallelMode 5 | from colossalai.nn.layer import VanillaPatchEmbedding, VanillaClassifier, \ 6 | WrappedDropout as Dropout, WrappedDropPath as DropPath 7 | from colossalai.nn.layer.moe import build_ffn_experts, MoeLayer, Top2Router, NormalNoiseGenerator 8 | from .util import moe_sa_args 9 | from ..helper import TransformerLayer 10 | from colossalai.context.moe_context import MOE_CONTEXT 11 | 12 | from titans.layer.attention import SelfAttentionForMoe 13 | 14 | 15 | class Widenet(nn.Module): 16 | 17 | def __init__(self, 18 | num_experts: int, 19 |
capacity_factor_train: float = 1.25, 20 | capacity_factor_eval: float = 2.0, 21 | drop_tks: bool = True, 22 | img_size: int = 224, 23 | patch_size: int = 16, 24 | in_chans: int = 3, 25 | num_classes: int = 1000, 26 | depth: int = 12, 27 | hidden_size: int = 768, 28 | num_heads: int = 12, 29 | d_kv: int = 64, 30 | d_ff: int = 4096, 31 | attention_drop: float = 0., 32 | drop_rate: float = 0.1, 33 | drop_path: float = 0.): 34 | super().__init__() 35 | 36 | self.embedding = VanillaPatchEmbedding(img_size=img_size, 37 | patch_size=patch_size, 38 | in_chans=in_chans, 39 | embed_size=hidden_size) 40 | self.embed_dropout = Dropout(p=drop_rate, mode=ParallelMode.TENSOR) 41 | 42 | shared_sa = SelfAttentionForMoe(**moe_sa_args( 43 | hidden_size=hidden_size, n_heads=num_heads, d_kv=d_kv, attention_drop=attention_drop, drop_rate=drop_rate)) 44 | 45 | noisy_func = NormalNoiseGenerator(num_experts) 46 | shared_router = Top2Router(capacity_factor_train=capacity_factor_train, 47 | capacity_factor_eval=capacity_factor_eval, 48 | noisy_func=noisy_func, 49 | drop_tks=drop_tks) 50 | shared_experts = build_ffn_experts(num_experts, hidden_size, d_ff, drop_rate=drop_rate) 51 | 52 | # stochastic depth decay rule 53 | dpr = [x.item() for x in torch.linspace(0, drop_path, depth)] 54 | blocks = [ 55 | TransformerLayer(att=shared_sa, 56 | ffn=MoeLayer(dim_model=hidden_size, 57 | num_experts=num_experts, 58 | router=shared_router, 59 | experts=shared_experts), 60 | norm1=nn.LayerNorm(hidden_size, eps=1e-6), 61 | norm2=nn.LayerNorm(hidden_size, eps=1e-6), 62 | droppath=DropPath(p=dpr[i], mode=ParallelMode.TENSOR)) for i in range(depth) 63 | ] 64 | 65 | self.blocks = nn.ModuleList(blocks) 66 | self.norm = nn.LayerNorm(hidden_size, eps=1e-6) 67 | self.linear = VanillaClassifier(in_features=hidden_size, num_classes=num_classes) 68 | nn.init.zeros_(self.linear.weight) 69 | nn.init.zeros_(self.linear.bias) 70 | 71 | def forward(self, x): 72 | MOE_CONTEXT.reset_loss() 73 | 74 | x = self.embedding(x) 75 | x = self.embed_dropout(x) 76 | 77 | y = 0 78 | for block in self.blocks: 79 | x, y = block(x, y) 80 | 81 | x = self.norm(x) 82 | x = torch.mean(x, dim=1) 83 | x = self.linear(x) 84 | 85 | MOE_CONTEXT.add_loss(y) 86 | return x 87 | -------------------------------------------------------------------------------- /titans/layer/attention/gpt_attention.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch import dtype, nn 5 | 6 | from colossalai import nn as col_nn 7 | from colossalai.nn.layer.utils import divide 8 | from colossalai.utils import get_current_device 9 | from titans.decorator import no_support 10 | 11 | 12 | @no_support(['sp']) 13 | class GPTSelfAttention(nn.Module): 14 | 15 | def __init__(self, 16 | hidden_size: int, 17 | num_heads: int, 18 | attention_dropout: float, 19 | dropout: float, 20 | bias: bool = True, 21 | fuse_scale_mask_softmax: bool = False, 22 | dtype: dtype = None) -> None: 23 | super().__init__() 24 | self.fuse_scale_mask_softmax = fuse_scale_mask_softmax 25 | self.attention_head_size = divide(hidden_size, num_heads) 26 | self.query_key_value = col_nn.Linear(hidden_size, 3 * hidden_size, dtype=dtype, bias=bias) 27 | if fuse_scale_mask_softmax: 28 | from colossalai.kernel import FusedScaleMaskSoftmax 29 | from colossalai.kernel.cuda_native.scaled_softmax import \ 30 | AttnMaskType 31 | self.softmax = FusedScaleMaskSoftmax(input_in_fp16=True, 32 | input_in_bf16=False, 33 | attn_mask_type=AttnMaskType.causal, 34 | 
scaled_masked_softmax_fusion=True, 35 | mask_func=None, 36 | softmax_in_fp32=True, 37 | scale=math.sqrt(self.attention_head_size)) 38 | else: 39 | self.softmax = nn.Softmax(dim=-1) 40 | self.attention_dropout = col_nn.Dropout(attention_dropout) 41 | self.dense = col_nn.Linear(hidden_size, hidden_size, dtype=dtype, bias=True) 42 | self.dropout = col_nn.Dropout(dropout) 43 | 44 | def forward(self, x, attention_mask=None): 45 | # the size of x is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 46 | # the size of qkv is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE*3) 47 | qkv = self.query_key_value(x) 48 | all_head_size = qkv.shape[-1] // 3 49 | num_attention_heads = divide(all_head_size, self.attention_head_size) 50 | new_qkv_shape = qkv.shape[:-1] + \ 51 | (num_attention_heads, 3 * self.attention_head_size) 52 | qkv = qkv.view(new_qkv_shape) 53 | qkv = qkv.permute((0, 2, 1, 3)) 54 | # the size of q is (BATCH_SZIE, NUM_HEADS, SEQ_LEN, HIDDEN_SIZE//NUM_HEADS) 55 | q, k, v = torch.chunk(qkv, 3, dim=-1) 56 | # the size of x after matmul is (BATCH_SIZE, NUM_HEADS, SEQ_LEN, SEQ_LEN) 57 | x = torch.matmul(q, k.transpose(-1, -2)) 58 | 59 | if self.fuse_scale_mask_softmax: 60 | x = self.softmax(x, attention_mask) 61 | else: 62 | x = x / math.sqrt(self.attention_head_size) 63 | # causal mask 64 | q_len, k_len = q.size(-2), k.size(-2) 65 | causal_mask = torch.tril(torch.ones((q_len, k_len), dtype=torch.uint8, 66 | device=get_current_device())).view(1, 1, q_len, k_len).bool() 67 | x = torch.where(causal_mask, x, torch.tensor(-1e4, dtype=x.dtype, device=get_current_device())) 68 | if attention_mask is not None: 69 | x = x + attention_mask 70 | x = self.softmax(x) 71 | 72 | x = self.attention_dropout(x) 73 | 74 | # the size of x after matmul is (BATCH_SZIE, NUM_HEADS, SEQ_LEN, HIDDEN_SIZE//NUM_HEADS) 75 | x = torch.matmul(x, v) 76 | x = x.transpose(1, 2) 77 | new_context_layer_shape = x.size()[:-2] + (all_head_size,) 78 | # the size of x after reshape is (BATCH_SZIE, SEQ_LEN, HIDDEN_SIZE) 79 | x = x.reshape(new_context_layer_shape) 80 | # the size of x after dense is (BATCH_SZIE, SEQ_LEN, HIDDEN_SIZE) 81 | x = self.dense(x) 82 | x = self.dropout(x) 83 | 84 | return x 85 | -------------------------------------------------------------------------------- /titans/loss/vocab_cross_entropy/vocab_cross_entropy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn as nn, Tensor, distributed as dist 3 | from torch.nn import functional as F 4 | import torch.nn.init as init 5 | from torch.nn.parameter import Parameter 6 | 7 | from colossalai.context import ParallelMode 8 | from colossalai.core import global_context as gpc 9 | 10 | from titans.utils import VocabUtility 11 | 12 | 13 | class vocab_parallel_cross_entropy(nn.Module): 14 | 15 | def __init__(self): 16 | super().__init__() 17 | 18 | def forward(self, vocab_parallel_logits, target): 19 | """Helper function for the cross entropy.""" 20 | vocab_parallel_logits = vocab_parallel_logits[..., :-1, :].contiguous() 21 | target = target[..., 1:].contiguous() 22 | return _VocabParallelCrossEntropy.apply(vocab_parallel_logits.view(-1, vocab_parallel_logits.size(-1)), 23 | target.view(-1)) 24 | 25 | 26 | class _VocabParallelCrossEntropy(torch.autograd.Function): 27 | 28 | @staticmethod 29 | def forward(ctx, vocab_parallel_logits, target): 30 | 31 | # Maximum value along vocab dimension across all GPUs. 
32 | logits_max = torch.max(vocab_parallel_logits, dim=-1)[0] 33 | torch.distributed.all_reduce(logits_max, 34 | op=torch.distributed.ReduceOp.MAX, 35 | group=gpc.get_group(ParallelMode.PARALLEL_1D)) 36 | # Subtract the maximum value. 37 | vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1)) 38 | 39 | # Get the partition's vocab indecies 40 | get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size 41 | partition_vocab_size = vocab_parallel_logits.size()[-1] 42 | rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D) 43 | world_size = gpc.tensor_parallel_size 44 | vocab_start_index, vocab_end_index = get_vocab_range(partition_vocab_size, rank, world_size) 45 | 46 | # Create a mask of valid vocab ids (1 means it needs to be masked). 47 | target_mask = (target < vocab_start_index) | (target >= vocab_end_index) 48 | masked_target = target.clone() - vocab_start_index 49 | masked_target[target_mask] = 0 50 | 51 | # Get predicted-logits = logits[target]. 52 | # For Simplicity, we convert logits to a 2-D tensor with size 53 | # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. 54 | logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size) 55 | masked_target_1d = masked_target.view(-1) 56 | arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device) 57 | predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] 58 | predicted_logits_1d = predicted_logits_1d.clone().contiguous() 59 | predicted_logits = predicted_logits_1d.view_as(target) 60 | predicted_logits[target_mask] = 0.0 61 | # All reduce is needed to get the chunks from other GPUs. 62 | torch.distributed.all_reduce(predicted_logits, 63 | op=torch.distributed.ReduceOp.SUM, 64 | group=gpc.get_group(ParallelMode.PARALLEL_1D)) 65 | 66 | # Sum of exponential of logits along vocab dimension across all GPUs. 67 | exp_logits = vocab_parallel_logits 68 | torch.exp(vocab_parallel_logits, out=exp_logits) 69 | sum_exp_logits = exp_logits.sum(dim=-1) 70 | torch.distributed.all_reduce(sum_exp_logits, 71 | op=torch.distributed.ReduceOp.SUM, 72 | group=gpc.get_group(ParallelMode.PARALLEL_1D)) 73 | 74 | # Loss = log(sum(exp(logits))) - predicted-logit. 75 | loss = torch.log(sum_exp_logits) - predicted_logits 76 | loss = loss.mean() 77 | # Store softmax, target-mask and masked-target for backward pass. 78 | exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) 79 | ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) 80 | return loss 81 | 82 | @staticmethod 83 | def backward(ctx, grad_output): 84 | 85 | # Retreive tensors from the forward path. 86 | softmax, target_mask, masked_target_1d = ctx.saved_tensors 87 | 88 | # All the inputs have softmax as thier gradient. 89 | grad_input = softmax 90 | # For simplicity, work with the 2D gradient. 91 | partition_vocab_size = softmax.size()[-1] 92 | grad_2d = grad_input.view(-1, partition_vocab_size) 93 | 94 | # Add the gradient from matching classes. 95 | arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) 96 | grad_2d[arange_1d, masked_target_1d] -= (1.0 - target_mask.view(-1).float()) 97 | 98 | # Finally elementwise multiplication with the output gradients. 
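# At this point grad_input holds softmax(logits) with 1 subtracted at the
# locally-owned target entries, i.e. the usual cross-entropy gradient
#     d(loss)/d(logit_i) = softmax_i - 1[i == target],
# restricted to this rank's vocabulary shard; the multiplication by
# grad_output below completes the chain rule.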
99 | grad_input.mul_(grad_output.unsqueeze(dim=-1)) 100 | 101 | return grad_input, None 102 | -------------------------------------------------------------------------------- /titans/layer/block/detr_block.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Callable 3 | 4 | import torch 5 | from colossalai import nn as col_nn 6 | from colossalai.nn.layer.utils import CheckpointModule 7 | from torch import dtype, nn 8 | 9 | from titans.layer.attention import ViTSelfAttention, DeTrCrossAttention 10 | from titans.layer.mlp import ViTMLP 11 | from titans.decorator import support_tp_pp_only 12 | 13 | 14 | @support_tp_pp_only() 15 | class DeTrEncoder(CheckpointModule): 16 | 17 | def __init__(self, 18 | hidden_size: int, 19 | num_heads: int, 20 | mlp_ratio: int, 21 | activation: Callable, 22 | attention_dropout: float = 0., 23 | dropout: float = 0., 24 | drop_path: float = 0., 25 | layernorm_epsilon: float = 1e-6, 26 | dtype: dtype = None, 27 | bias: bool = True, 28 | checkpoint: bool = False, 29 | init_method: str = 'torch'): 30 | super().__init__(checkpoint) 31 | self.norm1 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 32 | self.attn = ViTSelfAttention(hidden_size=hidden_size, 33 | num_heads=num_heads, 34 | attention_dropout=attention_dropout, 35 | dropout=dropout, 36 | bias=bias, 37 | dtype=dtype, 38 | init_method=init_method) 39 | self.drop_path = col_nn.DropPath(drop_path) if drop_path > 0. else nn.Identity() 40 | self.norm2 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 41 | self.mlp = ViTMLP(hidden_size=hidden_size, 42 | mlp_ratio=mlp_ratio, 43 | activation=activation, 44 | dropout=dropout, 45 | dtype=dtype, 46 | bias=bias, 47 | init_method=init_method) 48 | 49 | def _forward(self, x): 50 | x = x + self.drop_path(self.norm1(self.attn(x))) 51 | x = x + self.drop_path(self.norm2(self.mlp(x))) 52 | return x 53 | 54 | 55 | @support_tp_pp_only() 56 | class DeTrDecoder(CheckpointModule): 57 | 58 | def __init__(self, 59 | hidden_size: int, 60 | num_heads: int, 61 | mlp_ratio: int, 62 | activation: Callable, 63 | attention_dropout: float = 0., 64 | dropout: float = 0., 65 | drop_path: float = 0., 66 | layernorm_epsilon: float = 1e-6, 67 | dtype: dtype = None, 68 | bias: bool = True, 69 | checkpoint: bool = False, 70 | init_method: str = 'torch'): 71 | super().__init__(checkpoint) 72 | self.norm1 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 73 | self.norm2 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 74 | self.norm3 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 75 | 76 | self.attn1 = ViTSelfAttention(hidden_size=hidden_size, 77 | num_heads=num_heads, 78 | attention_dropout=attention_dropout, 79 | dropout=dropout, 80 | bias=bias, 81 | dtype=dtype, 82 | init_method=init_method) 83 | 84 | self.attn2 = DeTrCrossAttention(hidden_size=hidden_size, 85 | num_heads=num_heads, 86 | attention_dropout=attention_dropout, 87 | dropout=dropout, 88 | bias=bias, 89 | dtype=dtype, 90 | init_method=init_method) 91 | 92 | self.drop_path = col_nn.DropPath(drop_path) if drop_path > 0. 
else nn.Identity() 93 | 94 | self.mlp = ViTMLP(hidden_size=hidden_size, 95 | mlp_ratio=mlp_ratio, 96 | activation=activation, 97 | dropout=dropout, 98 | dtype=dtype, 99 | bias=bias, 100 | init_method=init_method) 101 | 102 | def _forward(self, x, memory): 103 | x = x + self.drop_path(self.norm1(self.attn1(x))) 104 | x = x + self.drop_path(self.norm2(self.attn2(x, memory))) 105 | x = x + self.drop_path(self.mlp(self.norm3(x))) 106 | return x 107 | -------------------------------------------------------------------------------- /titans/model/moe/vit_moe.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from colossalai.context import ParallelMode 5 | from colossalai.nn.layer import VanillaPatchEmbedding, VanillaClassifier, \ 6 | WrappedDropout as Dropout, WrappedDropPath as DropPath 7 | from colossalai.nn.layer.moe import build_ffn_experts, MoeModule 8 | from .util import moe_sa_args, moe_mlp_args 9 | from ..helper import TransformerLayer 10 | from colossalai.context.moe_context import MOE_CONTEXT 11 | 12 | from typing import List 13 | from titans.layer.mlp import MLPForMoe 14 | from titans.layer.attention import SelfAttentionForMoe 15 | 16 | 17 | class ViTMoE(nn.Module): 18 | 19 | def __init__(self, 20 | num_experts: int or List[int], 21 | use_residual: bool = False, 22 | capacity_factor_train: float = 1.25, 23 | capacity_factor_eval: float = 2.0, 24 | drop_tks: bool = True, 25 | img_size: int = 224, 26 | patch_size: int = 16, 27 | in_chans: int = 3, 28 | num_classes: int = 1000, 29 | depth: int = 12, 30 | hidden_size: int = 768, 31 | num_heads: int = 12, 32 | d_kv: int = 64, 33 | d_ff: int = 3072, 34 | attention_drop: float = 0., 35 | drop_rate: float = 0.1, 36 | drop_path: float = 0., 37 | checkpoint: bool = False): 38 | super().__init__() 39 | 40 | assert depth % 2 == 0, "The number of layers should be even right now" 41 | 42 | if isinstance(num_experts, list): 43 | assert len(num_experts) == depth // 2, \ 44 | "The length of num_experts should equal to the number of MOE layers" 45 | num_experts_list = num_experts 46 | else: 47 | num_experts_list = [num_experts] * (depth // 2) 48 | 49 | self.embedding = VanillaPatchEmbedding(img_size=img_size, 50 | patch_size=patch_size, 51 | in_chans=in_chans, 52 | embed_size=hidden_size) 53 | self.embed_dropout = Dropout(p=drop_rate, mode=ParallelMode.TENSOR) 54 | 55 | # stochastic depth decay rule 56 | dpr = [x.item() for x in torch.linspace(0, drop_path, depth)] 57 | blocks = [] 58 | for i in range(depth): 59 | sa = SelfAttentionForMoe(**moe_sa_args(hidden_size=hidden_size, 60 | n_heads=num_heads, 61 | d_kv=d_kv, 62 | attention_drop=attention_drop, 63 | drop_rate=drop_rate)) 64 | 65 | if i % 2 == 0: 66 | ffn = MLPForMoe(**moe_mlp_args(hidden_size=hidden_size, d_ff=d_ff, drop_rate=drop_rate)) 67 | else: 68 | num_experts = num_experts_list[i // 2] 69 | experts = build_ffn_experts(num_experts, hidden_size, d_ff, drop_rate=drop_rate) 70 | ffn = MoeModule(dim_model=hidden_size, 71 | num_experts=num_experts, 72 | top_k=1 if use_residual else 2, 73 | capacity_factor_train=capacity_factor_train, 74 | capacity_factor_eval=capacity_factor_eval, 75 | noisy_policy='Jitter' if use_residual else 'Gaussian', 76 | drop_tks=drop_tks, 77 | use_residual=use_residual, 78 | expert_instance=experts, 79 | expert_cls=MLPForMoe, 80 | **moe_mlp_args(hidden_size=hidden_size, d_ff=d_ff, drop_rate=drop_rate)) 81 | 82 | layer = TransformerLayer(att=sa, 83 | ffn=ffn, 84 | 
norm1=nn.LayerNorm(hidden_size, eps=1e-6), 85 | norm2=nn.LayerNorm(hidden_size, eps=1e-6), 86 | droppath=DropPath(p=dpr[i], mode=ParallelMode.TENSOR), 87 | checkpoint=checkpoint) 88 | blocks.append(layer) 89 | 90 | self.blocks = nn.ModuleList(blocks) 91 | self.norm = nn.LayerNorm(hidden_size, eps=1e-6) 92 | self.linear = VanillaClassifier(in_features=hidden_size, num_classes=num_classes) 93 | nn.init.zeros_(self.linear.weight) 94 | nn.init.zeros_(self.linear.bias) 95 | 96 | def forward(self, x): 97 | MOE_CONTEXT.reset_loss() 98 | 99 | x = self.embedding(x) 100 | x = self.embed_dropout(x) 101 | 102 | y = 0 103 | for block in self.blocks: 104 | x, y = block(x, y) 105 | 106 | x = self.norm(x) 107 | x = torch.mean(x, dim=1) 108 | x = self.linear(x) 109 | 110 | MOE_CONTEXT.add_loss(y) 111 | return x 112 | -------------------------------------------------------------------------------- /titans/model/detr/detr.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Callable 3 | 4 | import torch 5 | from colossalai import nn as col_nn 6 | from colossalai.nn.layer.utils import CheckpointModule 7 | from torch import dtype, nn 8 | from torchvision.models import resnet50 9 | 10 | from titans.layer.embedding import ViTEmbedding 11 | # from titans.layer.head import DeTrHead 12 | from titans.layer.mlp import DeTrMLP 13 | from titans.layer.block import DeTrEncoder, DeTrDecoder 14 | from titans.decorator import no_support 15 | 16 | __all__ = [ 17 | 'DeTr', 18 | 'detr_1', 19 | ] 20 | 21 | 22 | @no_support(['sp', 'moe']) 23 | class DeTr(nn.Module): 24 | 25 | def __init__(self, 26 | img_size: int = 224, 27 | patch_size: int = 16, 28 | in_chans: int = 3, 29 | num_classes: int = 91, 30 | num_encoder_layer: int = 6, 31 | num_decoder_layer: int = 6, 32 | num_heads: int = 12, 33 | num_queries: int = 100, 34 | hidden_size: int = 256, 35 | mlp_ratio: int = 4, 36 | attention_dropout: float = 0., 37 | dropout: float = 0.1, 38 | drop_path: float = 0., 39 | layernorm_epsilon: float = 1e-6, 40 | activation: Callable = nn.functional.gelu, 41 | representation_size: int = None, 42 | dtype: dtype = None, 43 | bias: bool = True, 44 | checkpoint: bool = False, 45 | init_method: str = 'torch'): 46 | super().__init__() 47 | 48 | # self.embed = ViTEmbedding(img_size=img_size, 49 | # patch_size=patch_size, 50 | # in_chans=in_chans, 51 | # embedding_dim=hidden_size, 52 | # dropout=dropout, 53 | # dtype=dtype, 54 | # init_method=init_method) 55 | 56 | self.backbone = nn.Sequential(*list(resnet50(pretrained=True).children())[:-2]) 57 | self.conv = nn.Conv2d(2048, hidden_size, 1) 58 | 59 | # stochastic depth decay rule 60 | dpr1 = [x.item() for x in torch.linspace(0, drop_path, num_encoder_layer)] 61 | self.blocks1 = nn.ModuleList([ 62 | DeTrEncoder( 63 | hidden_size=hidden_size, 64 | num_heads=num_heads, 65 | mlp_ratio=mlp_ratio, 66 | attention_dropout=attention_dropout, 67 | dropout=dropout, 68 | drop_path=dpr1[i], 69 | activation=activation, 70 | dtype=dtype, 71 | bias=bias, 72 | checkpoint=checkpoint, 73 | init_method=init_method, 74 | ) for i in range(num_encoder_layer) 75 | ]) 76 | 77 | dpr2 = [x.item() for x in torch.linspace(0, drop_path, num_decoder_layer)] 78 | self.blocks2 = nn.ModuleList([ 79 | DeTrDecoder( 80 | hidden_size=hidden_size, 81 | num_heads=num_heads, 82 | mlp_ratio=mlp_ratio, 83 | attention_dropout=attention_dropout, 84 | dropout=dropout, 85 | drop_path=dpr2[i], 86 | activation=activation, 87 | dtype=dtype, 88 | bias=bias, 89 | 
checkpoint=checkpoint, 90 | init_method=init_method, 91 | ) for i in range(num_decoder_layer) 92 | ]) 93 | 94 | self.norm = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 95 | 96 | self.class_embed = nn.Linear(hidden_size, num_classes + 1) 97 | self.bbox_embed = DeTrMLP(hidden_size, hidden_size, 4, 3) 98 | self.query_embed = nn.Embedding(num_queries, hidden_size) 99 | 100 | self.query_pos = nn.Parameter(torch.rand(100, hidden_size)) 101 | self.row_embed = nn.Parameter(torch.rand(50, hidden_size // 2)) 102 | self.col_embed = nn.Parameter(torch.rand(50, hidden_size // 2)) 103 | 104 | def forward(self, x): 105 | x = self.backbone(x) 106 | h = self.conv(x) 107 | H, W = h.shape[-2:] 108 | pos = torch.cat([ 109 | self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1), 110 | self.row_embed[:H].unsqueeze(1).repeat(1, W, 1), 111 | ], dim=-1).flatten(0, 1).unsqueeze(1) 112 | 113 | memory = pos + h.flatten(2).permute(2, 0, 1) 114 | for block in self.blocks1: 115 | memory = block(memory) 116 | x = self.query_pos.unsqueeze(1) 117 | for block in self.blocks2: 118 | x = block(x, memory) 119 | 120 | x = self.norm(x) 121 | outputs_class = self.class_embed(x) 122 | outputs_coord = self.bbox_embed(x).sigmoid() 123 | out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]} 124 | # if self.aux_loss: 125 | # out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord) 126 | # return out # not dict 127 | return outputs_class # temp 128 | 129 | 130 | 131 | 132 | def _create_detr_model(**model_kwargs): 133 | model = DeTr(**model_kwargs) 134 | return model 135 | 136 | 137 | def detr_1(**kwargs): 138 | model_kwargs = dict(img_size=32, patch_size=4, hidden_size=256, num_heads=4, mlp_ratio=2, num_classes=10, **kwargs) 139 | return _create_detr_model(**model_kwargs) 140 | 141 | -------------------------------------------------------------------------------- /titans/layer/block/gpt_block.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | from torch import dtype 4 | 5 | from colossalai import nn as col_nn 6 | from colossalai.nn.layer.utils import CheckpointModule 7 | from colossalai.nn.layer import MoeModule 8 | 9 | from titans.layer.attention import GPTSelfAttention 10 | 11 | from titans.decorator import support_tp_pp_only 12 | from titans.layer.mlp import TransformerMLP 13 | 14 | 15 | class GPTBlock(CheckpointModule): 16 | 17 | def __init__(self, 18 | hidden_size: int, 19 | num_heads: int, 20 | mlp_ratio: float, 21 | activation: Callable, 22 | attention_dropout: float = 0., 23 | dropout: float = 0., 24 | layernorm_epsilon: float = 1e-5, 25 | dtype: dtype = None, 26 | bias: bool = True, 27 | apply_post_layernorm: bool = False, 28 | fuse_scale_mask_softmax: bool = False, 29 | checkpoint: bool = False, 30 | activation_offload: bool = False): 31 | super().__init__(checkpoint, activation_offload) 32 | self.apply_post_layernorm = apply_post_layernorm 33 | self.norm1 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 34 | self.attn = GPTSelfAttention(hidden_size=hidden_size, 35 | num_heads=num_heads, 36 | attention_dropout=attention_dropout, 37 | dropout=dropout, 38 | bias=bias, 39 | fuse_scale_mask_softmax=fuse_scale_mask_softmax, 40 | dtype=dtype) 41 | self.norm2 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 42 | self.mlp =
TransformerMLP(hidden_size=hidden_size, 43 | mlp_ratio=mlp_ratio, 44 | activation=activation, 45 | dropout_prob=dropout, 46 | dtype=dtype, 47 | bias=bias) 48 | 49 | def _forward(self, x, attention_mask=None): 50 | if attention_mask is not None and attention_mask.dtype != x.dtype: 51 | attention_mask = attention_mask.to(x.dtype) 52 | if not self.apply_post_layernorm: 53 | residual = x 54 | x = self.norm1(x) 55 | if self.apply_post_layernorm: 56 | residual = x 57 | x = residual + self.attn(x, attention_mask) 58 | 59 | if not self.apply_post_layernorm: 60 | residual = x 61 | x = self.norm2(x) 62 | if self.apply_post_layernorm: 63 | residual = x 64 | x = residual + self.mlp(x) 65 | 66 | return x, attention_mask 67 | 68 | 69 | class MOEGPTBlock(CheckpointModule): 70 | 71 | def __init__(self, 72 | num_experts: int, 73 | hidden_size: int, 74 | num_heads: int, 75 | mlp_ratio: float, 76 | activation: Callable, 77 | capacity_factor_train: float = 1.0, 78 | capacity_factor_eval: float = 1.0, 79 | use_residual: bool = False, 80 | attention_dropout: float = 0., 81 | dropout: float = 0., 82 | layernorm_epsilon: float = 1e-5, 83 | dtype: dtype = None, 84 | bias: bool = True, 85 | apply_post_layernorm: bool = False, 86 | fuse_scale_mask_softmax: bool = False, 87 | checkpoint: bool = False): 88 | super().__init__(checkpoint) 89 | self.apply_post_layernorm = apply_post_layernorm 90 | self.norm1 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 91 | self.attn = GPTSelfAttention(hidden_size=hidden_size, 92 | num_heads=num_heads, 93 | attention_dropout=attention_dropout, 94 | dropout=dropout, 95 | bias=bias, 96 | fuse_scale_mask_softmax=fuse_scale_mask_softmax, 97 | dtype=dtype) 98 | self.norm2 = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 99 | 100 | mpl_factory_dict = dict(hidden_size=hidden_size, 101 | mlp_ratio=mlp_ratio, 102 | activation=activation, 103 | dtype=dtype, 104 | bias=bias) 105 | 106 | self.mlp = MoeModule(dim_model=hidden_size, 107 | num_experts=num_experts, 108 | top_k=1, 109 | capacity_factor_train=capacity_factor_train, 110 | capacity_factor_eval=capacity_factor_eval, 111 | noisy_policy='Jitter', 112 | use_residual=use_residual, 113 | expert_cls=TransformerMLP, 114 | **mpl_factory_dict) 115 | 116 | def _forward(self, x, y, attention_mask=None): 117 | if not self.apply_post_layernorm: 118 | residual = x 119 | x = self.norm1(x) 120 | if self.apply_post_layernorm: 121 | residual = x 122 | x = residual + self.attn(x, attention_mask) 123 | 124 | if not self.apply_post_layernorm: 125 | residual = x 126 | x = self.norm2(x) 127 | if self.apply_post_layernorm: 128 | residual = x 129 | x, z = self.mlp(x) 130 | 131 | x = residual + x 132 | y = y + z 133 | 134 | return x, y, attention_mask 135 | -------------------------------------------------------------------------------- /titans/model/vilt/vilt.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | import torch 3 | from colossalai import nn as col_nn 4 | from torch import dtype, nn 5 | 6 | from titans.layer.embedding import ViTEmbedding 7 | from titans.layer.block import ViTBlock 8 | from utils import heads, objectives 9 | import torch.nn.functional as F 10 | from colossalai.nn.layer.colossalai_layer import LayerNorm 11 | from transformers.models.bert.modeling_bert import BertConfig, BertEmbeddings 12 | 13 | 14 | class ViLT(nn.Module): 15 | 16 | def __init__( 17 | self, 18 | config, 19 | img_size: int = 384, 20 
| patch_size: int = 16, 21 | in_chans: int = 3, 22 | num_classes: int = 1000, 23 | depth: int = 12, 24 | num_heads: int = 12, 25 | hidden_size: int = 768, 26 | mlp_ratio: int = 4, 27 | attention_dropout: float = 0., 28 | dropout: float = 0.1, 29 | dropout_prob=0.1, 30 | drop_path: float = 0., 31 | init_std=0.02, 32 | layernorm_epsilon: float = 1e-6, 33 | activation: Callable = nn.functional.gelu, 34 | representation_size: int = None, 35 | convert_fp16_to_fp32_in_softmax=False, 36 | dtype: dtype = None, 37 | bias: bool = True, 38 | checkpoint: bool = False, 39 | init_method: str = 'torch', 40 | first_stage=True, 41 | last_stage=True, 42 | start_idx=0, 43 | end_idx=None, 44 | ): 45 | 46 | super().__init__() 47 | max_sequence_length = config["max_text_len"] 48 | num_layers = config["num_layers"] 49 | vocab_size = config["vocab_size"] 50 | self.vocab_size = vocab_size 51 | hidden_size = config["hidden_size"] 52 | self.first_stage = first_stage 53 | self.last_stage = last_stage 54 | self.init_std = init_std 55 | self.num_layers = num_layers 56 | 57 | bert_config = BertConfig( 58 | vocab_size=vocab_size, 59 | hidden_size=hidden_size, 60 | num_hidden_layers=num_layers, 61 | num_attention_heads=num_heads, 62 | intermediate_size=hidden_size * mlp_ratio, 63 | max_position_embeddings=max_sequence_length, 64 | hidden_dropout_prob=dropout, 65 | attention_probs_dropout_prob=dropout, 66 | ) 67 | 68 | self.pooler = heads.Pooler(hidden_size) 69 | self.token_type_embeddings = nn.Embedding(2, hidden_size) 70 | self.token_type_embeddings.apply(objectives.init_weights) 71 | self.text_embedding = BertEmbeddings(bert_config) 72 | self.vis_embedding = ViTEmbedding(img_size=img_size, 73 | patch_size=patch_size, 74 | in_chans=in_chans, 75 | embedding_dim=hidden_size, 76 | dropout=dropout, 77 | dtype=dtype, 78 | init_method=init_method) 79 | 80 | # stochastic depth decay rule 81 | dpr = [x.item() for x in torch.linspace(0, drop_path, depth)] 82 | blocks = [ 83 | ViTBlock( 84 | hidden_size=hidden_size, 85 | num_heads=num_heads, 86 | mlp_ratio=mlp_ratio, 87 | attention_dropout=attention_dropout, 88 | dropout=dropout, 89 | drop_path=dpr[i], 90 | activation=activation, 91 | dtype=dtype, 92 | bias=bias, 93 | checkpoint=checkpoint, 94 | init_method=init_method, 95 | ) for i in range(depth) 96 | ] 97 | norm = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 98 | 99 | if self.last_stage: 100 | self.mlm_score = heads.MLMHead(bert_config) 101 | self.mlm_score.apply(objectives.init_weights) 102 | 103 | self.layer_norm = LayerNorm(hidden_size) 104 | 105 | layers = [] 106 | layers.extend(blocks) 107 | layers.extend([norm]) 108 | self.layers = nn.Sequential(*layers) 109 | # self.layers = build_pipeline_model(self.layers, num_chunks=1, verbose=True) 110 | 111 | def infer(self, x, image_token_type_idx=1): 112 | do_mlm = "_mlm" 113 | if f"image_{image_token_type_idx - 1}" in x: 114 | imgkey = f"image_{image_token_type_idx - 1}" 115 | else: 116 | imgkey = "image" 117 | img = x[imgkey] 118 | text_ids = x[f"text_ids{do_mlm}"] 119 | text_labels = x[f"text_labels{do_mlm}"] 120 | image_embeds = self.vis_embedding(img) 121 | text_embeds = self.text_embedding(text_ids) 122 | co_embeds = torch.cat([text_embeds, image_embeds], dim=1) 123 | x = co_embeds 124 | x = self.layers(x) 125 | text_feats, image_feats = ( 126 | x[:, :text_embeds.shape[1]], 127 | x[:, text_embeds.shape[1]:], 128 | ) 129 | cls_feats = self.pooler(x) 130 | ret = { 131 | "text_feats": text_feats, 132 | "image_feats": image_feats, 133 | 
"cls_feats": cls_feats, 134 | "raw_cls_feats": x[:, 0], 135 | "text_labels": text_labels, 136 | "text_ids": text_ids, 137 | } 138 | return ret 139 | 140 | def forward(self, x): 141 | ret = dict() 142 | ret.update(self.compute_mlm(x)) 143 | return ret 144 | 145 | def compute_mlm(self, batch): 146 | infer = self.infer(batch) 147 | mlm_logits = self.mlm_score(infer["text_feats"]) 148 | mlm_labels = infer["text_labels"] 149 | 150 | mlm_loss = F.cross_entropy( 151 | mlm_logits.view(-1, self.vocab_size), 152 | mlm_labels.view(-1), 153 | ignore_index=-100, 154 | ) 155 | 156 | ret = { 157 | "mlm_loss": mlm_loss, 158 | "mlm_logits": mlm_logits, 159 | "mlm_labels": mlm_labels, 160 | "mlm_ids": infer["text_ids"], 161 | } 162 | 163 | return ret 164 | 165 | 166 | def get_current_device(): 167 | ''' 168 | Returns the index of a currently selected device (gpu/cpu). 169 | ''' 170 | if torch.cuda.is_available(): 171 | return torch.cuda.current_device() 172 | else: 173 | return 'cpu' 174 | -------------------------------------------------------------------------------- /titans/model/deepnet/deepnet.py: -------------------------------------------------------------------------------- 1 | from colossalai.context.parallel_mode import ParallelMode 2 | from typing import Callable 3 | import math 4 | from torch import dtype 5 | import torch.nn as nn 6 | import torch 7 | from colossalai import nn as col_nn 8 | from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper 9 | from colossalai.core import global_context as gpc 10 | import inspect 11 | from colossalai.pipeline.utils import partition_uniform 12 | from colossalai import kernel 13 | from colossalai.logging import get_dist_logger 14 | from titans.decorator import support_tp_pp_only 15 | from titans.layer.block import DeepNetBlock 16 | from titans.layer.embedding import GPTEmbedding 17 | from titans.layer.head import GPTLMHead 18 | from titans.layer.block import GPTBlock 19 | from titans.loss.lm_loss import GPTLMLoss 20 | 21 | __all__ = ['DeepNet', 'deepnet_small'] 22 | 23 | 24 | @support_tp_pp_only() 25 | class DeepNet(nn.Module): 26 | """The decoder-only DeepNet model is modified from the GPT model. 27 | 28 | Args: 29 | vocab_size(int): The size of dictionary, defaults to 50304. 30 | max_position_embeddings(int): The max value of positional embeddings, defaults to 1024. 31 | dim(int): Hidden size of the transformer blocks, defaults to 768. 32 | num_heads(int): The number of heads in transformer blocks, defaults to 12. 33 | depth(int): The number of transformer layers, defaults to 12. 34 | mlp_ratio(float): The ratio used in mlp layer, defaults to 4.0. 35 | dropout(float): The ratio used to construct dropout modules, which indicates the percentage of parameters should be casted to zero, defaults to 0.1. 36 | embedding_dropout(float): The ratio used to construct embedding dropout modules, which indicates the percentage of parameters should be casted to zero, defaults to 0.1. 37 | attention_dropout(float): The ratio used to construct attention dropout modules, which indicates the percentage of parameters should be casted to zero, defaults to 0.1. 38 | layernorm_epsilon(float): The argument used to construct layernorm modules, defaults to 1e-5. 39 | activation(Callable): The activation function used in model, defaults to nn.functional.gelu. 40 | padding_idx(int): The length to be padded for each batch, defaults to None. 41 | dtype (:class:`torch.dtype`): The dtype of parameters, defaults to None. 
42 | bias (bool): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``. 43 | fuse_scale_mask_softmax(bool): If set to "True", FuseScaleMaskSoftmax will be used in self-attention layer, defaults to ``False``. 44 | checkpoint(bool): If set to "True", checkpoint feature will be activated to save memory, defaults to ``False``. 45 | activation_offload(bool): If set to "True", offload feature will be activated during checkpointing, defaults to ``False``. 46 | """ 47 | 48 | def __init__(self, 49 | vocab_size: int = 50304, 50 | max_position_embeddings: int = 1024, 51 | hidden_size: int = 768, 52 | num_heads: int = 12, 53 | depth: int = 12, 54 | mlp_ratio: float = 4.0, 55 | dropout: float = 0.1, 56 | embedding_dropout: float = 0.1, 57 | attention_dropout: float = 0.1, 58 | layernorm_epsilon: float = 1e-5, 59 | activation: Callable = nn.functional.gelu, 60 | padding_idx: int = None, 61 | dtype: dtype = None, 62 | bias: bool = True, 63 | fuse_scale_mask_softmax: bool = False, 64 | checkpoint: bool = False, 65 | activation_offload: bool = False) -> None: 66 | super().__init__() 67 | self.embed = GPTEmbedding(embedding_dim=hidden_size, 68 | vocab_size=vocab_size, 69 | max_position_embeddings=max_position_embeddings, 70 | padding_idx=padding_idx, 71 | dropout=embedding_dropout, 72 | dtype=dtype) 73 | alpha = math.sqrt(2 * depth) 74 | self.blocks = nn.ModuleList([ 75 | DeepNetBlock(hidden_size=hidden_size, 76 | num_heads=num_heads, 77 | mlp_ratio=mlp_ratio, 78 | activation=activation, 79 | attention_dropout=attention_dropout, 80 | dropout=dropout, 81 | alpha=alpha, 82 | layernorm_epsilon=layernorm_epsilon, 83 | dtype=dtype, 84 | bias=bias, 85 | fuse_scale_mask_softmax=fuse_scale_mask_softmax, 86 | checkpoint=checkpoint, 87 | activation_offload=activation_offload) for _ in range(depth) 88 | ]) 89 | 90 | self.norm = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 91 | 92 | self.head = GPTLMHead(hidden_size=hidden_size, vocab_size=vocab_size, embedding_layer=self.embed, dtype=dtype) 93 | 94 | def forward(self, input_ids, attention_mask=None): 95 | 96 | # the size of input_ids is (BATCH_SIZE, SEQ_LEN) 97 | x = self.embed(input_ids) 98 | # the size of x after embed layer is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 99 | 100 | # We create a 3D attention mask from a 2D tensor mask. 
101 | # Sizes are [batch_size, 1, 1, to_seq_length] 102 | # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] 103 | # Adapted from huggingface 104 | if attention_mask is not None: 105 | batch_size = input_ids.shape[0] 106 | attention_mask = attention_mask.view(batch_size, -1) 107 | attention_mask = col_nn.partition_batch(attention_mask) 108 | attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) 109 | attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility 110 | attention_mask = (1.0 - attention_mask) * -10000.0 111 | 112 | # the size of x in blocks is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 113 | for block in self.blocks: 114 | x, attention_mask = block(x, attention_mask) 115 | 116 | x = self.head(self.norm(x)) 117 | # the size of x is (BATCH_SIZE, SEQ_LEN, VOCAB_SIZE) 118 | return x 119 | 120 | 121 | def _create_deepnet_model(**model_kwargs): 122 | model = DeepNet(**model_kwargs) 123 | return model 124 | 125 | 126 | def deepnet_small(**kwargs): 127 | model_kwargs = dict(hidden_size=768, depth=12, num_heads=12, **kwargs) 128 | return _create_deepnet_model(**model_kwargs) 129 | -------------------------------------------------------------------------------- /titans/dataloader/utils/rand_augment.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torchvision.transforms.functional as TF 4 | 5 | _MAX_LEVEL = 10 6 | 7 | _HPARAMS = { 8 | 'cutout_const': 40, 9 | 'translate_const': 40, 10 | } 11 | 12 | _FILL = tuple([128, 128, 128]) 13 | # RGB 14 | 15 | 16 | def blend(image0, image1, factor): 17 | # blend image0 with image1 18 | # we only use this function in the 'color' function 19 | if factor == 0.0: 20 | return image0 21 | if factor == 1.0: 22 | return image1 23 | image0 = image0.type(torch.float32) 24 | image1 = image1.type(torch.float32) 25 | scaled = (image1 - image0) * factor 26 | image = image0 + scaled 27 | 28 | if factor > 0.0 and factor < 1.0: 29 | return image.type(torch.uint8) 30 | 31 | image = torch.clamp(image, 0, 255).type(torch.uint8) 32 | return image 33 | 34 | 35 | def autocontrast(image): 36 | image = TF.autocontrast(image) 37 | return image 38 | 39 | 40 | def equalize(image): 41 | image = TF.equalize(image) 42 | return image 43 | 44 | 45 | def rotate(image, degree, fill=_FILL): 46 | image = TF.rotate(image, angle=degree, fill=fill) 47 | return image 48 | 49 | 50 | def posterize(image, bits): 51 | image = TF.posterize(image, bits) 52 | return image 53 | 54 | 55 | def sharpness(image, factor): 56 | image = TF.adjust_sharpness(image, sharpness_factor=factor) 57 | return image 58 | 59 | 60 | def contrast(image, factor): 61 | image = TF.adjust_contrast(image, factor) 62 | return image 63 | 64 | 65 | def brightness(image, factor): 66 | image = TF.adjust_brightness(image, factor) 67 | return image 68 | 69 | 70 | def invert(image): 71 | return 255 - image 72 | 73 | 74 | def solarize(image, threshold=128): 75 | return torch.where(image < threshold, image, 255 - image) 76 | 77 | 78 | def solarize_add(image, addition=0, threshold=128): 79 | add_image = image.long() + addition 80 | add_image = torch.clamp(add_image, 0, 255).type(torch.uint8) 81 | return torch.where(image < threshold, add_image, image) 82 | 83 | 84 | def color(image, factor): 85 | new_image = TF.rgb_to_grayscale(image, num_output_channels=3) 86 | return blend(new_image, image, factor=factor) 87 | 88 | 89 | def shear_x(image, level, fill=_FILL): 90 | image = TF.affine(image, 0, [0, 0], 1.0, [level, 0],
fill=fill) 91 | return image 92 | 93 | 94 | def shear_y(image, level, fill=_FILL): 95 | image = TF.affine(image, 0, [0, 0], 1.0, [0, level], fill=fill) 96 | return image 97 | 98 | 99 | def translate_x(image, level, fill=_FILL): 100 | image = TF.affine(image, 0, [level, 0], 1.0, [0, 0], fill=fill) 101 | return image 102 | 103 | 104 | def translate_y(image, level, fill=_FILL): 105 | image = TF.affine(image, 0, [0, level], 1.0, [0, 0], fill=fill) 106 | return image 107 | 108 | 109 | def cutout(image, pad_size, fill=_FILL): 110 | b, c, h, w = image.shape 111 | mask = torch.ones((b, c, h, w), dtype=torch.uint8).cuda() 112 | y = np.random.randint(pad_size, h - pad_size) 113 | x = np.random.randint(pad_size, w - pad_size) 114 | for i in range(c): 115 | mask[:, i, (y - pad_size):(y + pad_size), (x - pad_size):(x + pad_size)] = fill[i] 116 | image = torch.where(mask == 1, image, mask) 117 | return image 118 | 119 | 120 | def _randomly_negate_tensor(level): 121 | # With 50% prob turn the tensor negative. 122 | flip = np.random.randint(0, 2) 123 | final_level = -level if flip else level 124 | return final_level 125 | 126 | 127 | def _rotate_level_to_arg(level): 128 | level = (level / _MAX_LEVEL) * 30. 129 | level = _randomly_negate_tensor(level) 130 | return level 131 | 132 | 133 | def _shear_level_to_arg(level): 134 | level = (level / _MAX_LEVEL) * 0.3 135 | # Flip level to negative with 50% chance. 136 | level = _randomly_negate_tensor(level) 137 | return level 138 | 139 | 140 | def _translate_level_to_arg(level, translate_const): 141 | level = (level / _MAX_LEVEL) * float(translate_const) 142 | # Flip level to negative with 50% chance. 143 | level = _randomly_negate_tensor(level) 144 | return level 145 | 146 | 147 | def level(hparams): 148 | return { 149 | 'AutoContrast': lambda level: None, 150 | 'Equalize': lambda level: None, 151 | 'Invert': lambda level: None, 152 | 'Rotate': _rotate_level_to_arg, 153 | 'Posterize': lambda level: (int((level / _MAX_LEVEL) * 4)), 154 | 'Solarize': lambda level: (int((level / _MAX_LEVEL) * 200)), 155 | 'SolarizeAdd': lambda level: (int((level / _MAX_LEVEL) * 110)), 156 | 'Color': lambda level: ((level / _MAX_LEVEL) * 1.8 + 0.1), 157 | 'Contrast': lambda level: ((level / _MAX_LEVEL) * 1.8 + 0.1), 158 | 'Brightness': lambda level: ((level / _MAX_LEVEL) * 1.8 + 0.1), 159 | 'Sharpness': lambda level: ((level / _MAX_LEVEL) * 1.8 + 0.1), 160 | 'ShearX': _shear_level_to_arg, 161 | 'ShearY': _shear_level_to_arg, 162 | 'Cutout': lambda level: (int((level / _MAX_LEVEL) * hparams['cutout_const'])), 163 | 'TranslateX': lambda level: _translate_level_to_arg(level, hparams['translate_const']), 164 | 'TranslateY': lambda level: _translate_level_to_arg(level, hparams['translate_const']), 165 | } 166 | 167 | 168 | AUGMENTS = { 169 | 'AutoContrast': autocontrast, 170 | 'Equalize': equalize, 171 | 'Invert': invert, 172 | 'Rotate': rotate, 173 | 'Posterize': posterize, 174 | 'Solarize': solarize, 175 | 'SolarizeAdd': solarize_add, 176 | 'Color': color, 177 | 'Contrast': contrast, 178 | 'Brightness': brightness, 179 | 'Sharpness': sharpness, 180 | 'ShearX': shear_x, 181 | 'ShearY': shear_y, 182 | 'TranslateX': translate_x, 183 | 'TranslateY': translate_y, 184 | 'Cutout': cutout, 185 | } 186 | 187 | 188 | def RandAugment(image, num_layers=2, magnitude=_MAX_LEVEL, augments=AUGMENTS): 189 | """Random Augment for images, followed google randaug and the paper(https://arxiv.org/abs/2106.10270) 190 | :param image: the input image, in tensor format with shape of C, H, W 191 | :type image: 
uint8 Tensor 192 | :num_layers: how many layers will the randaug do, default=2 193 | :type num_layers: int 194 | :param magnitude: the magnitude of random augment, default=10 195 | :type magnitude: int 196 | """ 197 | if np.random.random() < 0.5: 198 | return image 199 | Choice_Augment = np.random.choice(a=list(augments.keys()), size=num_layers, replace=False) 200 | magnitude = float(magnitude) 201 | for i in range(num_layers): 202 | arg = level(_HPARAMS)[Choice_Augment[i]](magnitude) 203 | if arg is None: 204 | image = augments[Choice_Augment[i]](image) 205 | else: 206 | image = augments[Choice_Augment[i]](image, arg) 207 | return image 208 | -------------------------------------------------------------------------------- /titans/model/knowledge_graph_embedding/dataloader/dataloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import numpy as np 8 | import torch 9 | from torch.utils.data import Dataset 10 | 11 | 12 | class TrainDataset(Dataset): 13 | 14 | def __init__(self, triples, nentity, nrelation, negative_sample_size, mode): 15 | self.len = len(triples) 16 | self.triples = triples 17 | self.triple_set = set(triples) 18 | self.nentity = nentity 19 | self.nrelation = nrelation 20 | self.negative_sample_size = negative_sample_size 21 | self.mode = mode 22 | self.count = self.count_frequency(triples) 23 | self.true_head, self.true_tail = self.get_true_head_and_tail(self.triples) 24 | 25 | def __len__(self): 26 | return self.len 27 | 28 | def __getitem__(self, idx): 29 | positive_sample = self.triples[idx] 30 | 31 | head, relation, tail = positive_sample 32 | 33 | subsampling_weight = self.count[(head, relation)] + self.count[(tail, -relation - 1)] 34 | subsampling_weight = torch.sqrt(1 / torch.Tensor([subsampling_weight])) 35 | 36 | negative_sample_list = [] 37 | negative_sample_size = 0 38 | 39 | while negative_sample_size < self.negative_sample_size: 40 | negative_sample = np.random.randint(self.nentity, size=self.negative_sample_size * 2) 41 | if self.mode == 'head-batch': 42 | mask = np.in1d(negative_sample, self.true_head[(relation, tail)], assume_unique=True, invert=True) 43 | elif self.mode == 'tail-batch': 44 | mask = np.in1d(negative_sample, self.true_tail[(head, relation)], assume_unique=True, invert=True) 45 | else: 46 | raise ValueError('Training batch mode %s not supported' % self.mode) 47 | negative_sample = negative_sample[mask] 48 | negative_sample_list.append(negative_sample) 49 | negative_sample_size += negative_sample.size 50 | 51 | negative_sample = np.concatenate(negative_sample_list)[:self.negative_sample_size] 52 | 53 | negative_sample = torch.LongTensor(negative_sample) 54 | 55 | positive_sample = torch.LongTensor(positive_sample) 56 | 57 | return positive_sample, negative_sample, subsampling_weight, self.mode 58 | 59 | @staticmethod 60 | def collate_fn(data): 61 | positive_sample = torch.stack([_[0] for _ in data], dim=0) 62 | negative_sample = torch.stack([_[1] for _ in data], dim=0) 63 | subsample_weight = torch.cat([_[2] for _ in data], dim=0) 64 | mode = data[0][3] 65 | return positive_sample, negative_sample, subsample_weight, mode 66 | 67 | @staticmethod 68 | def count_frequency(triples, start=4): 69 | ''' 70 | Get frequency of a partial triple like (head, relation) or (relation, tail) 71 | The frequency will be used for subsampling like word2vec 72 | ''' 73 | 
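# The counts returned here drive the subsampling weight computed in __getitem__:
# for a positive triple (h, r, t) the weight is
#     w = 1 / sqrt(count(h, r) + count(t, -r - 1)),
# so frequently occurring (entity, relation) pairs are down-weighted in the loss.
# `start` sets a pair's count at its first occurrence; each later occurrence adds 1.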
count = {} 74 | for head, relation, tail in triples: 75 | if (head, relation) not in count: 76 | count[(head, relation)] = start 77 | else: 78 | count[(head, relation)] += 1 79 | 80 | if (tail, -relation - 1) not in count: 81 | count[(tail, -relation - 1)] = start 82 | else: 83 | count[(tail, -relation - 1)] += 1 84 | return count 85 | 86 | @staticmethod 87 | def get_true_head_and_tail(triples): 88 | ''' 89 | Build a dictionary of true triples that will 90 | be used to filter these true triples for negative sampling 91 | ''' 92 | 93 | true_head = {} 94 | true_tail = {} 95 | 96 | for head, relation, tail in triples: 97 | if (head, relation) not in true_tail: 98 | true_tail[(head, relation)] = [] 99 | true_tail[(head, relation)].append(tail) 100 | if (relation, tail) not in true_head: 101 | true_head[(relation, tail)] = [] 102 | true_head[(relation, tail)].append(head) 103 | 104 | for relation, tail in true_head: 105 | true_head[(relation, tail)] = np.array(list(set(true_head[(relation, tail)]))) 106 | for head, relation in true_tail: 107 | true_tail[(head, relation)] = np.array(list(set(true_tail[(head, relation)]))) 108 | 109 | return true_head, true_tail 110 | 111 | 112 | class TestDataset(Dataset): 113 | 114 | def __init__(self, triples, all_true_triples, nentity, nrelation, mode): 115 | self.len = len(triples) 116 | self.triple_set = set(all_true_triples) 117 | self.triples = triples 118 | self.nentity = nentity 119 | self.nrelation = nrelation 120 | self.mode = mode 121 | 122 | def __len__(self): 123 | return self.len 124 | 125 | def __getitem__(self, idx): 126 | head, relation, tail = self.triples[idx] 127 | 128 | if self.mode == 'head-batch': 129 | tmp = [(0, rand_head) if (rand_head, relation, tail) not in self.triple_set else (-1, head) 130 | for rand_head in range(self.nentity)] 131 | tmp[head] = (0, head) 132 | elif self.mode == 'tail-batch': 133 | tmp = [(0, rand_tail) if (head, relation, rand_tail) not in self.triple_set else (-1, tail) 134 | for rand_tail in range(self.nentity)] 135 | tmp[tail] = (0, tail) 136 | else: 137 | raise ValueError('negative batch mode %s not supported' % self.mode) 138 | 139 | tmp = torch.LongTensor(tmp) 140 | filter_bias = tmp[:, 0].float() 141 | negative_sample = tmp[:, 1] 142 | 143 | positive_sample = torch.LongTensor((head, relation, tail)) 144 | 145 | return positive_sample, negative_sample, filter_bias, self.mode 146 | 147 | @staticmethod 148 | def collate_fn(data): 149 | positive_sample = torch.stack([_[0] for _ in data], dim=0) 150 | negative_sample = torch.stack([_[1] for _ in data], dim=0) 151 | filter_bias = torch.stack([_[2] for _ in data], dim=0) 152 | mode = data[0][3] 153 | return positive_sample, negative_sample, filter_bias, mode 154 | 155 | 156 | class BidirectionalOneShotIterator(object): 157 | 158 | def __init__(self, dataloader_head, dataloader_tail): 159 | self.iterator_head = self.one_shot_iterator(dataloader_head) 160 | self.iterator_tail = self.one_shot_iterator(dataloader_tail) 161 | self.step = 0 162 | 163 | def __next__(self): 164 | self.step += 1 165 | if self.step % 2 == 0: 166 | data = next(self.iterator_head) 167 | else: 168 | data = next(self.iterator_tail) 169 | return data 170 | 171 | @staticmethod 172 | def one_shot_iterator(dataloader): 173 | ''' 174 | Transform a PyTorch Dataloader into python iterator 175 | ''' 176 | while True: 177 | for data in dataloader: 178 | yield data 179 | -------------------------------------------------------------------------------- /titans/dataloader/bert/parquet_dataset.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import random 5 | import torch 6 | 7 | import torch.distributed as dist 8 | from torch.utils.data import IterableDataset 9 | from torch.utils.data import get_worker_info 10 | 11 | from lddl.types import File 12 | from lddl.utils import get_num_samples_of_parquet 13 | from lddl.random import sample 14 | from lddl.torch.datasets import ShuffleBuffer 15 | from lddl.torch.datasets import ParquetDataset as PD 16 | 17 | 18 | class ParquetDataset(PD, IterableDataset): 19 | 20 | def __init__( 21 | self, 22 | file_paths, 23 | transform=lambda x: x, 24 | shuffle_buffer_size=16384, 25 | shuffle_buffer_warmup_factor=16, 26 | base_seed=12345, 27 | logger=None, 28 | start_epoch=0, 29 | process_group=None 30 | ): 31 | # we do not want to init the original PD as it is overridden by this function 32 | # we only init with IterabledDataset 33 | IterableDataset.__init__(self) 34 | self._transform = transform 35 | self._shuffle_buffer_size = shuffle_buffer_size 36 | self._shuffle_buffer_warmup_factor = shuffle_buffer_warmup_factor 37 | self._base_seed = base_seed 38 | 39 | self._rank = dist.get_rank(group=process_group) 40 | self._world_size = dist.get_world_size(group=process_group) 41 | self._process_group = process_group 42 | 43 | self._epoch = start_epoch - 1 44 | 45 | self._logger = logger 46 | 47 | assert len(file_paths) % self._world_size == 0 48 | self._files = self._get_files(file_paths) 49 | max_num_samples_per_file = max((f.num_samples for f in self._files)) 50 | min_num_samples_per_file = min((f.num_samples for f in self._files)) 51 | assert min_num_samples_per_file + 1 == max_num_samples_per_file 52 | self._num_samples_per_file = min_num_samples_per_file 53 | total_num_samples = sum((f.num_samples for f in self._files)) 54 | num_samples_lost = (total_num_samples - 55 | self._num_samples_per_file * len(self._files)) 56 | self._logger.to('node').warning('lost {}/{}={}% samples in total'.format( 57 | num_samples_lost, 58 | total_num_samples, 59 | num_samples_lost / total_num_samples * 100, 60 | )) 61 | 62 | self._world_rng_state = None 63 | self._worker_rng_state = None 64 | 65 | def _get_files(self, file_paths): 66 | all_files_num_samples = torch.zeros((len(file_paths),), dtype=torch.long) 67 | if self._world_size > 1 and torch.distributed.get_backend() == 'nccl': 68 | all_files_num_samples = all_files_num_samples.to('cuda') 69 | # Figure out how many samples in each file. 70 | num_samples_cache = {} # Map dirname to the dict of {basename: num_samples} 71 | 72 | for idx in range(self._rank, len(file_paths), self._world_size): 73 | fp = file_paths[idx] 74 | dn = os.path.dirname(fp) 75 | bn = os.path.basename(fp) 76 | # Load the num_samples cache file if it exists. 77 | if dn not in num_samples_cache: 78 | nsfp = os.path.join(dn, '.num_samples.json') 79 | try: 80 | with open(nsfp, 'r') as nsf: 81 | num_samples_cache[dn] = json.load(nsf) 82 | except Exception as e: 83 | self._logger.to('rank').warning('failed to load {}: {}'.format( 84 | nsfp, e)) 85 | # Mark that the num_samples cache file doesn't exist for this 86 | # directory. 87 | num_samples_cache[dn] = None 88 | if num_samples_cache[dn] is not None and bn in num_samples_cache[dn]: 89 | all_files_num_samples[idx] = num_samples_cache[dn][bn] 90 | else: 91 | # Find out num_samples by loading the parquet table. 
92 | all_files_num_samples[idx] = get_num_samples_of_parquet(fp) 93 | if self._world_size > 1: 94 | # Sync. accross all ranks. 95 | torch.distributed.all_reduce( 96 | all_files_num_samples, 97 | op=torch.distributed.ReduceOp.SUM, 98 | group=self._process_group 99 | ) 100 | all_files_num_samples = all_files_num_samples.tolist() 101 | return [File(fp, ns) for fp, ns in zip(file_paths, all_files_num_samples)] 102 | 103 | def __len__(self): 104 | """ This function only returns how many samples per rank will be yielded 105 | by this dataset. 106 | 107 | Note that, len(dataloader), where dataloader is a PyTorch DataLoader 108 | wrapping this dataset, does NOT return the accurate number of batches. This 109 | is because, when (num_samples_per_file * num_files_per_worker) is not 110 | divisible by batch_size, each worker is going to generate a partial batch 111 | at the very end. 112 | 113 | However, PyTorch DataLoader's __len__ only divide the number returned from 114 | this function by batch_size, which would be smaller than the actual number 115 | of batches by at most (num_workers - 1). 116 | 117 | We need to patch PyTorch DataLoader function for this function to behave 118 | correctly. 119 | """ 120 | return self._num_samples_per_file * len(self._files) // self._world_size 121 | 122 | @property 123 | def num_samples_per_file(self): 124 | return self._num_samples_per_file 125 | 126 | @property 127 | def num_files_per_rank(self): 128 | return len(self._files) // self._world_size 129 | 130 | def _decode_record_batch(self, b): 131 | raise NotImplementedError('ParquetDataset is an abstract/interface class!') 132 | 133 | def _world_identical_sample(self, population, k, counts=None): 134 | s, self._world_rng_state = sample( 135 | population, 136 | k, 137 | rng_state=self._world_rng_state, 138 | ) 139 | return s 140 | 141 | def _init_worker(self): 142 | worker_info = get_worker_info() 143 | if worker_info is None: 144 | num_workers_per_rank = 1 145 | worker_rank = 0 146 | else: 147 | num_workers_per_rank = worker_info.num_workers 148 | worker_rank = worker_info.id 149 | assert (len(self._files) % (self._world_size * num_workers_per_rank) == 0) 150 | self._logger.init_for_worker(worker_rank) 151 | return worker_rank, num_workers_per_rank 152 | 153 | def _init_rng_states(self, worker_rank, num_workers_per_rank): 154 | orig_rng_state = random.getstate() 155 | 156 | random.seed(self._base_seed + self._epoch) 157 | self._world_rng_state = random.getstate() 158 | 159 | random.seed(self._base_seed + 160 | (self._epoch * self._world_size + self._rank) * 161 | num_workers_per_rank + worker_rank) 162 | self._worker_rng_state = random.getstate() 163 | 164 | random.setstate(orig_rng_state) 165 | 166 | def __iter__(self): 167 | self._epoch += 1 168 | 169 | worker_rank, num_workers_per_rank = self._init_worker() 170 | self._init_rng_states(worker_rank, num_workers_per_rank) 171 | 172 | files = self._world_identical_sample(self._files, k=len(self._files)) 173 | rank_files = files[self._rank::self._world_size] 174 | worker_files = rank_files[worker_rank::num_workers_per_rank] 175 | 176 | sb = ShuffleBuffer( 177 | worker_files, 178 | self._num_samples_per_file * len(worker_files), 179 | lambda b: self._decode_record_batch(b), 180 | self._shuffle_buffer_size, 181 | self._shuffle_buffer_warmup_factor, 182 | self._logger, 183 | self._worker_rng_state, 184 | ) 185 | for sample in iter(sb): 186 | yield self._transform(sample) -------------------------------------------------------------------------------- 
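The ParquetDataset above is an abstract base: _decode_record_batch raises NotImplementedError and is supplied by subclasses such as BertPretrainDataset later in this repository. What the class itself provides is deterministic sharding: every rank shuffles the full shard list with the same epoch-dependent seed, each rank then keeps every world_size-th file, and each DataLoader worker keeps every num_workers-th file of that slice, so each shard is read by exactly one worker per epoch. Below is a minimal, self-contained sketch of just that partitioning arithmetic; it substitutes plain random.Random for lddl.random.sample, and the function name partition_files is illustrative rather than part of the library.

import random


def partition_files(file_paths, epoch, rank, world_size, worker_rank, num_workers_per_rank, base_seed=12345):
    # ParquetDataset asserts len(file_paths) % (world_size * num_workers_per_rank) == 0,
    # so every worker ends up with the same number of shards.
    assert len(file_paths) % (world_size * num_workers_per_rank) == 0
    # World-identical shuffle: every rank seeds with (base_seed + epoch), so the
    # permutation is the same everywhere (mirrors _init_rng_states / _world_identical_sample).
    files = list(file_paths)
    random.Random(base_seed + epoch).shuffle(files)
    # Stride by rank, then by worker (mirrors __iter__):
    #   rank_files = files[self._rank::self._world_size]
    #   worker_files = rank_files[worker_rank::num_workers_per_rank]
    rank_files = files[rank::world_size]
    return rank_files[worker_rank::num_workers_per_rank]


if __name__ == '__main__':
    shards = ['shard_{:03d}.parquet'.format(i) for i in range(16)]
    # With 2 ranks x 2 workers, the 16 shards split into 4 disjoint groups of 4.
    for rank in range(2):
        for worker in range(2):
            print(rank, worker, partition_files(shards, epoch=0, rank=rank, world_size=2,
                                                worker_rank=worker, num_workers_per_rank=2))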
/titans/model/moe/gpt_moe.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List 2 | from torch import dtype, nn 3 | from colossalai import nn as col_nn 4 | from colossalai.nn.layer import MoeModule 5 | from colossalai.context import MOE_CONTEXT 6 | from colossalai.logging import get_dist_logger 7 | from colossalai.nn.layer.utils import CheckpointModule, divide 8 | 9 | from titans.layer.embedding import GPTEmbedding 10 | from titans.layer.block import GPTBlock, MOEGPTBlock 11 | from titans.layer.head import GPTLMHead 12 | 13 | 14 | class MOEGPT(nn.Module): 15 | 16 | def __init__(self, 17 | num_experts: int or List[int], 18 | use_residual: bool = False, 19 | capacity_factor_train: float = 1.0, 20 | capacity_factor_eval: float = 1.0, 21 | vocab_size: int = 50304, 22 | max_position_embeddings: int = 1024, 23 | hidden_size: int = 768, 24 | num_heads: int = 12, 25 | depth: int = 12, 26 | mlp_ratio: float = 4.0, 27 | dropout: float = 0.1, 28 | embedding_dropout: float = 0.1, 29 | attention_dropout: float = 0.1, 30 | layernorm_epsilon: float = 1e-5, 31 | activation: Callable = nn.functional.gelu, 32 | padding_idx: int = None, 33 | dtype: dtype = None, 34 | bias: bool = True, 35 | apply_post_layernorm: bool = False, 36 | fuse_scale_mask_softmax: bool = False, 37 | checkpoint: bool = False) -> None: 38 | super().__init__() 39 | 40 | half_depth = divide(depth, 2) 41 | if isinstance(num_experts, list): 42 | assert len(num_experts) == half_depth, \ 43 | "The length of num_experts should equal to the number of MOE layers" 44 | num_experts_list = num_experts 45 | else: 46 | num_experts_list = [num_experts] * half_depth 47 | 48 | self.embed = GPTEmbedding(embedding_dim=hidden_size, 49 | vocab_size=vocab_size, 50 | max_position_embeddings=max_position_embeddings, 51 | padding_idx=padding_idx, 52 | dropout=embedding_dropout, 53 | dtype=dtype) 54 | 55 | block_list = [] 56 | block_factory_dict = dict(hidden_size=hidden_size, 57 | num_heads=num_heads, 58 | mlp_ratio=mlp_ratio, 59 | activation=activation, 60 | attention_dropout=attention_dropout, 61 | dropout=dropout, 62 | layernorm_epsilon=layernorm_epsilon, 63 | dtype=dtype, 64 | bias=bias, 65 | apply_post_layernorm=apply_post_layernorm, 66 | fuse_scale_mask_softmax=fuse_scale_mask_softmax, 67 | checkpoint=checkpoint) 68 | 69 | for i in range(depth): 70 | 71 | if i % 2 == 0: 72 | block_module = GPTBlock(**block_factory_dict) 73 | else: 74 | num_experts = num_experts_list[i // 2] 75 | block_module = MOEGPTBlock(num_experts=num_experts, 76 | capacity_factor_train=capacity_factor_train, 77 | capacity_factor_eval=capacity_factor_eval, 78 | use_residual=use_residual, 79 | **block_factory_dict) 80 | 81 | block_list.append(block_module) 82 | 83 | self.blocks = nn.ModuleList(block_list) 84 | 85 | self.norm = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 86 | 87 | self.head = GPTLMHead(hidden_size=hidden_size, vocab_size=vocab_size, embedding_layer=self.embed, dtype=dtype) 88 | 89 | def forward(self, input_ids, attention_mask=None): 90 | MOE_CONTEXT.reset_loss() 91 | x = self.embed(input_ids) 92 | 93 | # We create a 3D attention mask from a 2D tensor mask. 
94 | # Sizes are [batch_size, 1, 1, to_seq_length] 95 | # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] 96 | # Adapted from huggingface 97 | if attention_mask is not None: 98 | batch_size = input_ids.shape[0] 99 | attention_mask = attention_mask.view(batch_size, -1) 100 | attention_mask = col_nn.partition_batch(attention_mask) 101 | attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) 102 | attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility 103 | attention_mask = (1.0 - attention_mask) * -10000.0 104 | 105 | y = 0 106 | for block in self.blocks: 107 | if isinstance(block, GPTBlock): 108 | x, attention_mask = block(x, attention_mask) 109 | else: 110 | x, y, attention_mask = block(x, y, attention_mask) 111 | 112 | x = self.head(self.norm(x)) 113 | MOE_CONTEXT.add_loss(y) 114 | return x 115 | 116 | 117 | def _create_moegpt_model(**model_kwargs): 118 | model = MOEGPT(**model_kwargs) 119 | return model 120 | 121 | 122 | def _prmoe_check_sanity(kwargs_dict): 123 | logger = get_dist_logger() 124 | if not kwargs_dict.pop('use_residual', False): 125 | logger.warning( 126 | "If you want to use PR-MOE, please set 'use_residual' to True. " 127 | "Otherwise, we'll force 'use_residual' to True.", 128 | ranks=[0]) 129 | 130 | 131 | def prmoe_4b(**kwargs): 132 | _prmoe_check_sanity(kwargs) 133 | model_kwargs = dict(num_experts=[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64], 134 | use_residual=True, 135 | hidden_size=1024, 136 | depth=24, 137 | num_heads=16, 138 | **kwargs) 139 | return _create_moegpt_model(**model_kwargs) 140 | 141 | 142 | def prmoe_16b(**kwargs): 143 | _prmoe_check_sanity(kwargs) 144 | model_kwargs = dict(num_experts=[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64], 145 | use_residual=True, 146 | hidden_size=2048, 147 | depth=24, 148 | num_heads=16, 149 | **kwargs) 150 | return _create_moegpt_model(**model_kwargs) 151 | 152 | 153 | def prmoe_25b(**kwargs): 154 | _prmoe_check_sanity(kwargs) 155 | model_kwargs = dict(num_experts=[32, 32, 32, 32, 32, 32, 64, 64, 64, 64, 128, 128], 156 | use_residual=True, 157 | hidden_size=2048, 158 | depth=24, 159 | num_heads=16, 160 | **kwargs) 161 | return _create_moegpt_model(**model_kwargs) 162 | 163 | 164 | def prmoe_29b(**kwargs): 165 | _prmoe_check_sanity(kwargs) 166 | model_kwargs = dict(num_experts=[32, 32, 48, 64, 64, 64, 64, 64, 64, 64, 128, 128], 167 | use_residual=True, 168 | hidden_size=2048, 169 | depth=24, 170 | num_heads=16, 171 | **kwargs) 172 | return _create_moegpt_model(**model_kwargs) 173 | 174 | 175 | def prmoe_31b(**kwargs): 176 | _prmoe_check_sanity(kwargs) 177 | model_kwargs = dict(num_experts=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 128, 128], 178 | use_residual=True, 179 | hidden_size=2048, 180 | depth=24, 181 | num_heads=16, 182 | **kwargs) 183 | return _create_moegpt_model(**model_kwargs) 184 | 185 | 186 | def prmoe_51b(**kwargs): 187 | _prmoe_check_sanity(kwargs) 188 | model_kwargs = dict(num_experts=[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64], 189 | use_residual=True, 190 | hidden_size=3072, 191 | depth=32, 192 | num_heads=24, 193 | **kwargs) 194 | return _create_moegpt_model(**model_kwargs) 195 | -------------------------------------------------------------------------------- /titans/model/vit/vit.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Callable 3 | 4 | import torch 5 | from colossalai import nn as col_nn 6 | from colossalai.nn.layer.utils import 
CheckpointModule 7 | from torch import dtype, nn 8 | 9 | from titans.layer.embedding import ViTEmbedding 10 | from titans.layer.head import ViTHead 11 | from titans.layer.block import ViTBlock 12 | from titans.decorator import no_support 13 | 14 | __all__ = [ 15 | 'VisionTransformer', 16 | 'vit_lite_depth7_patch4_32', 17 | 'vit_tiny_patch4_32', 18 | 'vit_tiny_patch16_224', 19 | 'vit_tiny_patch16_384', 20 | 'vit_small_patch16_224', 21 | 'vit_small_patch16_384', 22 | 'vit_small_patch32_224', 23 | 'vit_small_patch32_384', 24 | 'vit_base_patch16_224', 25 | 'vit_base_patch16_384', 26 | 'vit_base_patch32_224', 27 | 'vit_base_patch32_384', 28 | 'vit_large_patch16_224', 29 | 'vit_large_patch16_384', 30 | 'vit_large_patch32_224', 31 | 'vit_large_patch32_384', 32 | ] 33 | 34 | 35 | @no_support(['sp', 'moe']) 36 | class VisionTransformer(nn.Module): 37 | """ 38 | The ViT transformer model with an image classification head on top (a linear layer on top of the final hidden state of 39 | the [CLS] token), e.g. for ImageNet. 40 | 41 | Args: 42 | img_size(int): The size of the input images, defaults to 224. 43 | patch_size(int): The size of each image patch, defaults to 16. 44 | in_chans(int): The number of input channels, defaults to 3. 45 | num_classes(int): The number of target classes, defaults to 1000. 46 | depth(int): The number of transformer layers, defaults to 12. 47 | num_heads(int): The number of attention heads in the transformer blocks, defaults to 12. 48 | hidden_size(int): Hidden size of the transformer blocks, defaults to 768. 49 | mlp_ratio(int): The ratio of the MLP hidden dimension to the hidden size, defaults to 4. 50 | attention_dropout(float): The ratio used to construct attention dropout modules, which gives the probability of an element being zeroed, defaults to 0. 51 | dropout(float): The ratio used to construct dropout modules, which gives the probability of an element being zeroed, defaults to 0.1. 52 | drop_path(float): The maximum rate used to construct drop_path (stochastic depth) modules, which gives the probability of a residual branch being dropped; the per-block rate grows linearly up to this value, defaults to 0. 53 | layernorm_epsilon(float): The epsilon used to construct layernorm modules, defaults to 1e-6. 54 | activation(Callable): The activation function used in the model, defaults to nn.functional.gelu. 55 | representation_size(int): The size of the intermediate representation in the head layer, defaults to None. 56 | dtype (:class:`torch.dtype`): The dtype of parameters, defaults to None. 57 | bias (bool): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``. 58 | checkpoint(bool): If set to ``True``, activation checkpointing is enabled to save memory, defaults to ``False``. 59 | init_method(str): The parameter initialization method used in the layers, defaults to `torch`.
60 | """ 61 | 62 | def __init__(self, 63 | img_size: int = 224, 64 | patch_size: int = 16, 65 | in_chans: int = 3, 66 | num_classes: int = 1000, 67 | depth: int = 12, 68 | num_heads: int = 12, 69 | hidden_size: int = 768, 70 | mlp_ratio: int = 4, 71 | attention_dropout: float = 0., 72 | dropout: float = 0.1, 73 | drop_path: float = 0., 74 | layernorm_epsilon: float = 1e-6, 75 | activation: Callable = nn.functional.gelu, 76 | representation_size: int = None, 77 | dtype: dtype = None, 78 | bias: bool = True, 79 | checkpoint: bool = False, 80 | init_method: str = 'torch'): 81 | super().__init__() 82 | 83 | self.embed = ViTEmbedding(img_size=img_size, 84 | patch_size=patch_size, 85 | in_chans=in_chans, 86 | embedding_dim=hidden_size, 87 | dropout=dropout, 88 | dtype=dtype, 89 | init_method=init_method) 90 | 91 | # stochastic depth decay rule 92 | dpr = [x.item() for x in torch.linspace(0, drop_path, depth)] 93 | self.blocks = nn.ModuleList([ 94 | ViTBlock( 95 | hidden_size=hidden_size, 96 | num_heads=num_heads, 97 | mlp_ratio=mlp_ratio, 98 | attention_dropout=attention_dropout, 99 | dropout=dropout, 100 | drop_path=dpr[i], 101 | activation=activation, 102 | dtype=dtype, 103 | bias=bias, 104 | checkpoint=checkpoint, 105 | init_method=init_method, 106 | ) for i in range(depth) 107 | ]) 108 | 109 | self.norm = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 110 | 111 | self.head = ViTHead(hidden_size=hidden_size, 112 | num_classes=num_classes, 113 | representation_size=representation_size, 114 | dtype=dtype, 115 | bias=bias, 116 | init_method=init_method) 117 | 118 | def forward(self, x): 119 | # the size of x is (BATCH_SIZE, IN_CHAN, IMAGE_SIZE, IMAGE_SIZE) 120 | x = self.embed(x) 121 | # the size of x after embed layer is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 122 | for block in self.blocks: 123 | x = block(x) 124 | # the size of x after block is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 125 | x = self.head(self.norm(x)) 126 | # the size of x is (BATCH_SIZE, NUM_CLASSES) 127 | return x 128 | 129 | 130 | def _create_vit_model(**model_kwargs): 131 | model = VisionTransformer(**model_kwargs) 132 | return model 133 | 134 | 135 | def vit_lite_depth7_patch4_32(**kwargs): 136 | model_kwargs = dict(img_size=32, patch_size=4, hidden_size=256, depth=7, num_heads=4, mlp_ratio=2, num_classes=10, **kwargs) 137 | return _create_vit_model(**model_kwargs) 138 | 139 | 140 | def vit_tiny_patch4_32(**kwargs): 141 | model_kwargs = dict(img_size=32, patch_size=4, hidden_size=512, depth=6, num_heads=8, mlp_ratio=1, num_classes=10, **kwargs) 142 | return _create_vit_model(**model_kwargs) 143 | 144 | 145 | def vit_tiny_patch16_224(**kwargs): 146 | model_kwargs = dict(img_size=224, patch_size=16, hidden_size=192, depth=12, num_heads=3, mlp_ratio=4, **kwargs) 147 | return _create_vit_model(**model_kwargs) 148 | 149 | 150 | def vit_tiny_patch16_384(**kwargs): 151 | model_kwargs = dict(img_size=384, patch_size=16, hidden_size=192, depth=12, num_heads=3, mlp_ratio=4, **kwargs) 152 | return _create_vit_model(**model_kwargs) 153 | 154 | 155 | def vit_small_patch16_224(**kwargs): 156 | model_kwargs = dict(img_size=224, patch_size=16, hidden_size=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs) 157 | return _create_vit_model(**model_kwargs) 158 | 159 | 160 | def vit_small_patch16_384(**kwargs): 161 | model_kwargs = dict(img_size=384, patch_size=16, hidden_size=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs) 162 | return _create_vit_model(**model_kwargs) 163 | 164 | 165 | def 
vit_small_patch32_224(**kwargs): 166 | model_kwargs = dict(img_size=224, patch_size=32, hidden_size=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs) 167 | return _create_vit_model(**model_kwargs) 168 | 169 | 170 | def vit_small_patch32_384(**kwargs): 171 | model_kwargs = dict(img_size=384, patch_size=32, hidden_size=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs) 172 | return _create_vit_model(**model_kwargs) 173 | 174 | 175 | def vit_base_patch16_224(**kwargs): 176 | model_kwargs = dict(img_size=224, patch_size=16, hidden_size=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs) 177 | return _create_vit_model(**model_kwargs) 178 | 179 | 180 | def vit_base_patch16_384(**kwargs): 181 | model_kwargs = dict(img_size=384, patch_size=16, hidden_size=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs) 182 | return _create_vit_model(**model_kwargs) 183 | 184 | 185 | def vit_base_patch32_224(**kwargs): 186 | model_kwargs = dict(img_size=224, patch_size=32, hidden_size=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs) 187 | return _create_vit_model(**model_kwargs) 188 | 189 | 190 | def vit_base_patch32_384(**kwargs): 191 | model_kwargs = dict(img_size=384, patch_size=32, hidden_size=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs) 192 | return _create_vit_model(**model_kwargs) 193 | 194 | 195 | def vit_large_patch16_224(**kwargs): 196 | model_kwargs = dict(img_size=224, patch_size=16, hidden_size=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs) 197 | return _create_vit_model(**model_kwargs) 198 | 199 | 200 | def vit_large_patch16_384(**kwargs): 201 | model_kwargs = dict(img_size=384, patch_size=16, hidden_size=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs) 202 | return _create_vit_model(**model_kwargs) 203 | 204 | 205 | def vit_large_patch32_224(**kwargs): 206 | model_kwargs = dict(img_size=224, patch_size=32, hidden_size=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs) 207 | return _create_vit_model(**model_kwargs) 208 | 209 | 210 | def vit_large_patch32_384(**kwargs): 211 | model_kwargs = dict(img_size=384, patch_size=32, hidden_size=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs) 212 | return _create_vit_model(**model_kwargs) 213 | -------------------------------------------------------------------------------- /titans/model/gpt/gpt.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Callable 3 | 4 | import torch 5 | from colossalai import nn as col_nn 6 | from colossalai.pipeline.utils import partition_uniform 7 | from colossalai.context import ParallelMode 8 | from colossalai.core import global_context as gpc 9 | from colossalai.logging import get_dist_logger 10 | from colossalai.nn.layer.utils import CheckpointModule, divide 11 | from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper 12 | from colossalai.utils import get_current_device 13 | from torch import dtype, nn 14 | 15 | from titans.layer.embedding import GPTEmbedding 16 | from titans.layer.head import GPTLMHead 17 | from titans.layer.block import GPTBlock 18 | from titans.loss.lm_loss import GPTLMLoss 19 | from titans.decorator import no_support 20 | 21 | __all__ = ['GPT', 'GPTLMLoss', 'gpt2_small', 'gpt2_medium', 'gpt2_large', 'gpt2_xl', 'gpt2_8B', 'gpt3'] 22 | 23 | 24 | @no_support(['sp', 'moe']) 25 | class GPT(nn.Module): 26 | """ 27 | The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input 28 | embeddings). 
29 | 30 | Args: 31 | vocab_size(int): The size of the vocabulary, defaults to 50304. 32 | max_position_embeddings(int): The maximum number of position embeddings (i.e. the maximum sequence length), defaults to 1024. 33 | hidden_size(int): Hidden size of the transformer blocks, defaults to 768. 34 | num_heads(int): The number of attention heads in the transformer blocks, defaults to 12. 35 | depth(int): The number of transformer layers, defaults to 12. 36 | mlp_ratio(float): The ratio of the MLP hidden dimension to the hidden size, defaults to 4.0. 37 | dropout(float): The ratio used to construct dropout modules, which gives the probability of an element being zeroed, defaults to 0.1. 38 | embedding_dropout(float): The ratio used to construct the embedding dropout module, which gives the probability of an element being zeroed, defaults to 0.1. 39 | attention_dropout(float): The ratio used to construct attention dropout modules, which gives the probability of an element being zeroed, defaults to 0.1. 40 | layernorm_epsilon(float): The epsilon used to construct layernorm modules, defaults to 1e-5. 41 | activation(Callable): The activation function used in the model, defaults to nn.functional.gelu. 42 | padding_idx(int): The index of the padding token in the embedding, defaults to None. 43 | dtype (:class:`torch.dtype`): The dtype of parameters, defaults to None. 44 | bias (bool): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``. 45 | apply_post_layernorm(bool): If set to ``True``, the residual value is taken after the layernorm modules (post-layernorm), defaults to ``False``. 46 | fuse_scale_mask_softmax(bool): If set to ``True``, the fused scale-mask-softmax kernel is used in the self-attention layer, defaults to ``False``. 47 | checkpoint(bool): If set to ``True``, activation checkpointing is enabled to save memory, defaults to ``False``. 48 | activation_offload(bool): If set to ``True``, activations are offloaded during checkpointing to further save memory, defaults to ``False``.
49 | """ 50 | 51 | def __init__(self, 52 | vocab_size: int = 50304, 53 | max_position_embeddings: int = 1024, 54 | hidden_size: int = 768, 55 | num_heads: int = 12, 56 | depth: int = 12, 57 | mlp_ratio: float = 4.0, 58 | dropout: float = 0.1, 59 | embedding_dropout: float = 0.1, 60 | attention_dropout: float = 0.1, 61 | layernorm_epsilon: float = 1e-5, 62 | activation: Callable = nn.functional.gelu, 63 | padding_idx: int = None, 64 | dtype: dtype = None, 65 | bias: bool = True, 66 | apply_post_layernorm: bool = False, 67 | fuse_scale_mask_softmax: bool = False, 68 | checkpoint: bool = False, 69 | activation_offload: bool = False) -> None: 70 | super().__init__() 71 | self.embed = GPTEmbedding(embedding_dim=hidden_size, 72 | vocab_size=vocab_size, 73 | max_position_embeddings=max_position_embeddings, 74 | padding_idx=padding_idx, 75 | dropout=embedding_dropout, 76 | dtype=dtype) 77 | self.blocks = nn.ModuleList([ 78 | GPTBlock(hidden_size=hidden_size, 79 | num_heads=num_heads, 80 | mlp_ratio=mlp_ratio, 81 | activation=activation, 82 | attention_dropout=attention_dropout, 83 | dropout=dropout, 84 | layernorm_epsilon=layernorm_epsilon, 85 | dtype=dtype, 86 | bias=bias, 87 | apply_post_layernorm=apply_post_layernorm, 88 | fuse_scale_mask_softmax=fuse_scale_mask_softmax, 89 | checkpoint=checkpoint, 90 | activation_offload=activation_offload) for _ in range(depth) 91 | ]) 92 | 93 | self.norm = col_nn.LayerNorm(normalized_shape=hidden_size, eps=layernorm_epsilon, dtype=dtype) 94 | 95 | self.head = GPTLMHead( 96 | hidden_size=hidden_size, 97 | vocab_size=vocab_size, 98 | embedding_layer=self.embed, 99 | # word_embeeding_weight=self.embed.word_embedding_weight, 100 | dtype=dtype) 101 | 102 | def forward(self, input_ids, attention_mask=None): 103 | 104 | # the size of input_ids is (BATCH_SIZE, SEQ_LEN) 105 | x = self.embed(input_ids) 106 | # the size of x after embed layer is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 107 | 108 | # We create a 3D attention mask from a 2D tensor mask. 
109 | # Sizes are [batch_size, 1, 1, to_seq_length] 110 | # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] 111 | # Adapted from huggingface 112 | if attention_mask is not None: 113 | batch_size = input_ids.shape[0] 114 | attention_mask = attention_mask.view(batch_size, -1) 115 | attention_mask = col_nn.partition_batch(attention_mask) 116 | attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) 117 | attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility 118 | attention_mask = (1.0 - attention_mask) * -10000.0 119 | 120 | # the size of x in blocks is (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE) 121 | for block in self.blocks: 122 | x, attention_mask = block(x, attention_mask) 123 | 124 | x = self.head(self.norm(x)) 125 | # the size of x is (BATCH_SIZE, SEQ_LEN, VOCAB_SIZE) 126 | 127 | return x 128 | 129 | 130 | def _create_gpt_model(**model_kwargs): 131 | model = GPT(**model_kwargs) 132 | return model 133 | 134 | 135 | def gpt2_small(**kwargs): 136 | model_kwargs = dict(hidden_size=768, depth=12, num_heads=12, **kwargs) 137 | return _create_gpt_model(**model_kwargs) 138 | 139 | 140 | def gpt2_medium(**kwargs): 141 | model_kwargs = dict(hidden_size=1024, depth=24, num_heads=8, **kwargs) 142 | return _create_gpt_model(**model_kwargs) 143 | 144 | 145 | def gpt2_large(**kwargs): 146 | model_kwargs = dict(hidden_size=1536, depth=36, num_heads=12, **kwargs) 147 | return _create_gpt_model(**model_kwargs) 148 | 149 | 150 | def gpt2_xl(**kwargs): 151 | model_kwargs = dict(hidden_size=1600, depth=48, num_heads=16, **kwargs) 152 | return _create_gpt_model(**model_kwargs) 153 | 154 | 155 | def gpt2_2B(**kwargs): 156 | model_kwargs = dict(hidden_size=2048, depth=40, num_heads=16, **kwargs) 157 | return _create_gpt_model(**model_kwargs) 158 | 159 | 160 | def gpt2_3B(**kwargs): 161 | model_kwargs = dict(hidden_size=2304, depth=48, num_heads=16, **kwargs) 162 | return _create_gpt_model(**model_kwargs) 163 | 164 | 165 | def gpt2_4B(**kwargs): 166 | model_kwargs = dict(hidden_size=2304, depth=64, num_heads=16, **kwargs) 167 | return _create_gpt_model(**model_kwargs) 168 | 169 | 170 | def gpt2_6B(**kwargs): 171 | model_kwargs = dict(hidden_size=4096, depth=30, num_heads=16, **kwargs) 172 | return _create_gpt_model(**model_kwargs) 173 | 174 | 175 | def gpt2_8B(**kwargs): 176 | model_kwargs = dict(hidden_size=3072, depth=72, num_heads=24, **kwargs) 177 | return _create_gpt_model(**model_kwargs) 178 | 179 | 180 | def gpt2_12B(**kwargs): 181 | model_kwargs = dict(hidden_size=4096, depth=60, num_heads=16, **kwargs) 182 | return _create_gpt_model(**model_kwargs) 183 | 184 | 185 | def gpt2_15B(**kwargs): 186 | model_kwargs = dict(hidden_size=4096, depth=78, num_heads=16, **kwargs) 187 | return _create_gpt_model(**model_kwargs) 188 | 189 | 190 | def gpt2_18B(**kwargs): 191 | model_kwargs = dict(hidden_size=4096, depth=90, num_heads=16, **kwargs) 192 | return _create_gpt_model(**model_kwargs) 193 | 194 | 195 | def gpt2_20B(**kwargs): 196 | model_kwargs = dict(hidden_size=8192, depth=25, num_heads=16, **kwargs) 197 | return _create_gpt_model(**model_kwargs) 198 | 199 | 200 | def gpt2_24B(**kwargs): 201 | model_kwargs = dict(hidden_size=8192, depth=30, num_heads=16, **kwargs) 202 | return _create_gpt_model(**model_kwargs) 203 | 204 | 205 | def gpt2_28B(**kwargs): 206 | model_kwargs = dict(hidden_size=8192, depth=35, num_heads=16, **kwargs) 207 | return _create_gpt_model(**model_kwargs) 208 | 209 | 210 | def gpt2_32B(**kwargs): 211 | model_kwargs = dict(hidden_size=8192, depth=40, 
num_heads=16, **kwargs) 212 | return _create_gpt_model(**model_kwargs) 213 | 214 | 215 | def gpt2_36B(**kwargs): 216 | model_kwargs = dict(hidden_size=8192, depth=45, num_heads=16, **kwargs) 217 | return _create_gpt_model(**model_kwargs) 218 | 219 | 220 | def gpt2_40B(**kwargs): 221 | model_kwargs = dict(hidden_size=8192, depth=50, num_heads=16, **kwargs) 222 | return _create_gpt_model(**model_kwargs) 223 | 224 | 225 | def gpt3(**kwargs): 226 | model_kwargs = dict(hidden_size=12288, depth=96, num_heads=96, **kwargs) 227 | return _create_gpt_model(**model_kwargs) 228 | -------------------------------------------------------------------------------- /titans/dataloader/bert/bert_pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is adapted from 3 | """ 4 | 5 | import logging 6 | import os 7 | import torch 8 | import transformers 9 | import torch.distributed as dist 10 | 11 | try: 12 | from lddl.utils import get_all_parquets_under, get_all_bin_ids, get_file_paths_for_bin_id 13 | from lddl.torch.dataloader import DataLoader 14 | from .parquet_dataset import ParquetDataset 15 | from lddl.torch.bert import _decode_record_batch, _to_encoded_inputs, _mask_tokens, BertPretrainBinned 16 | from lddl.torch.log import DatasetLogger 17 | from lddl.torch.utils import get_node_rank, get_nproc_per_node 18 | except ImportError: 19 | raise ImportError('lddl is required for BERT pretraining but not found, ' 20 | 'you can install lddl by pip install git+https://github.com/NVIDIA/DeepLearningExamples.git#subdirectory=Tools/lddl') 21 | 22 | 23 | class BertPretrainDataset(ParquetDataset): 24 | 25 | def _decode_record_batch(self, b): 26 | return _decode_record_batch(b) 27 | 28 | 29 | def get_bert_pretrain_data_loader( 30 | path, 31 | shuffle_buffer_size=16384, 32 | shuffle_buffer_warmup_factor=16, 33 | tokenizer_class=transformers.BertTokenizerFast, 34 | vocab_file=None, 35 | tokenizer_kwargs={}, 36 | data_loader_class=DataLoader, 37 | data_loader_kwargs={}, 38 | mlm_probability=0.15, 39 | base_seed=12345, 40 | log_dir=None, 41 | log_level=logging.INFO, 42 | return_raw_samples=False, 43 | start_epoch=0, 44 | sequence_length_alignment=8, 45 | ignore_index=-1, 46 | process_group=None, 47 | ): 48 | """Gets a PyTorch DataLoader for the BERT pretraining task. 49 | 50 | The LDDL DataLoader can be used in the same way as a normal PyTorch 51 | DataLoader. The 'persistent_workers' attribute will always be enabled. 52 | 53 | The LDDL DataLoader streams samples from disk into memory, and uses a shuffle 54 | buffer to perform shuffling: at each iteration, a random sample from the 55 | shuffle buffer is popped, and a new sample is pushed into the shuffle buffer 56 | at this vacant location. 57 | 58 | Args: 59 | path: A string of the path pointing to the directory that contains the 60 | pretraining dataset in the format of balanced parquet shards. 61 | process_group: The torch.distributed process group over which the shards are 62 | distributed; the rank of the current pretraining process is derived from it. 63 | shuffle_buffer_size: The size of the shuffle buffer. 64 | shuffle_buffer_warmup_factor: At the beginning, the shuffle buffer is empty. 65 | Therefore, in order to fill the shuffle buffer, at each iteration, more 66 | samples need to be pushed into the shuffle buffer than being popped out 67 | of. This factor indicates how many samples are pushed into the shuffle 68 | buffer per 1 sample being popped out of the shuffle buffer, until the 69 | shuffle buffer is full.
70 | tokenizer_class: The HuggingFace tokenizer class for BERT pretraining. 71 | vocab_file: The path to a vocab file, or the name of a pretrained model 72 | registered on huggingface.co (e.g., 'bert-large-uncased') of which the 73 | vocab file is downloaded. 74 | tokenizer_kwargs: The arguments to the tokenizer class. 75 | data_loader_class: The class of the DataLoader. 76 | data_loader_kwargs: The arguments to the DataLoader class. 77 | mlm_probability: The probability for masking tokens in the masked language 78 | modeling task (in BERT pretraining). 79 | base_seed: A base seed value on which other seeds used in the DataLoader are 80 | based. 81 | log_dir: The path to a directory to store the logs from the LDDL DataLoader. 82 | log_level: The logging verbose level. 83 | return_raw_samples: If True, returns the raw string pairs instead of token 84 | indices. 85 | start_epoch: The epoch number to start from. An epoch is defined as going 86 | through every sample in a dataset once. 87 | sequence_length_alignment: To get the input tensors of token indices, each 88 | sequence in a batch will only be padded to the longest sequence in this 89 | batch. However, certain hardware features might prefer the shapes of the 90 | input tensors to meet certain conditions. For example, it's better for the 91 | Tensor Core on NVIDIA GPUs if the dimensions of the input tensors are 92 | divisible by 8. Therefore, this argument is an alignment factor such that 93 | the sequences in a batch will be padded to the first sequence length 94 | larger than the longest sequence in this batch and also divisible by this 95 | alignment factor. 96 | ignore_index: The label value for the unmasked tokens in the language 97 | modeling task (in BERT pretraining). 98 | 99 | Returns: 100 | A PyTorch DataLoader that, in each iteration, yield: 101 | - If return_raw_samples is False, a dict of 5 key-value pairs which are the 102 | necessary input for BERT pretraining: 103 | { 104 | 'input_ids': a torch.Tensor of size [batch_size, sequence_length], 105 | 'token_type_ids': a torch.Tensor of size [batch_size, sequence_length], 106 | 'attention_mask': a torch.Tensor of size [batch_size, sequence_length], 107 | 'labels': a torch.Tensor of size [batch_size, sequence_length], 108 | 'next_sentence_labels': a torch.Tensor of size [batch_size], 109 | } 110 | - If return_raw_samples is True, a list of the following lists: 111 | [ 112 | strings of the first sequences in the sequence pairs, 113 | strings of the second sequences in the sequence pairs, 114 | bools that indicate whether the second sequences are the next sequences 115 | for the first sequences, 116 | numpy.ndarrays of positions of the masked tokens for the masked language 117 | modeling task (only exists if static masking is enabled), 118 | strings of space-seperated labels of the masked tokens for the masked 119 | language modeling task (only exists if static masking is enabled), 120 | ] 121 | 122 | Examples: 123 | train_dataloader = lddl.torch.get_bert_pretrain_data_loader( 124 | input_dir, 125 | local_rank=local_rank, 126 | vocab_file=vocab_file, 127 | data_loader_kwargs={ 128 | 'batch_size': batch_size, 129 | 'num_workers': num_workers, 130 | 'pin_memory': True, 131 | }, 132 | log_level=logging.WARNING, 133 | start_epoch=start_epoch, 134 | ) 135 | 136 | for epoch in range(start_epoch, start_epoch + epochs): 137 | for i, batch in enumerate(train_dataloader): 138 | prediction_scores, seq_relationship_score = model( 139 | input_ids=batch['input_ids'].to(device), 140 | 
token_type_ids=batch['token_type_ids'].to(device), 141 | attention_mask=batch['attention_mask'].to(device), 142 | ) 143 | loss = criterion( 144 | prediction_scores, 145 | seq_relationship_score, 146 | batch['labels'].to(device), 147 | batch['next_sentence_labels'].to(device), 148 | ) 149 | ... 150 | """ 151 | assert isinstance(path, str) 152 | assert isinstance(shuffle_buffer_size, int) and shuffle_buffer_size > 0 153 | assert (isinstance(shuffle_buffer_warmup_factor, int) and 154 | shuffle_buffer_warmup_factor > 0) 155 | assert tokenizer_class in { 156 | transformers.BertTokenizerFast, transformers.BertTokenizer 157 | } 158 | assert isinstance(vocab_file, str) 159 | assert isinstance(tokenizer_kwargs, dict) 160 | assert data_loader_class in {DataLoader} 161 | assert isinstance(data_loader_kwargs, dict) 162 | assert isinstance(mlm_probability, (int, float)) and 0 <= mlm_probability <= 1 163 | assert isinstance(base_seed, int) 164 | assert log_dir is None or isinstance(log_dir, str) 165 | assert log_level in { 166 | logging.NOTSET, logging.DEBUG, logging.INFO, logging.WARNING, 167 | logging.ERROR, logging.CRITICAL 168 | } 169 | assert isinstance(return_raw_samples, bool) 170 | assert isinstance(start_epoch, int) 171 | 172 | local_rank = dist.get_rank(process_group) 173 | 174 | if os.path.isfile(vocab_file): 175 | tokenizer = tokenizer_class(vocab_file, **tokenizer_kwargs) 176 | else: 177 | tokenizer = tokenizer_class.from_pretrained(vocab_file, **tokenizer_kwargs) 178 | 179 | def _batch_preprocess(batch): 180 | with torch.no_grad(): 181 | encoded_inputs = _to_encoded_inputs( 182 | batch, 183 | tokenizer, 184 | sequence_length_alignment=sequence_length_alignment, 185 | ignore_index=ignore_index, 186 | ) 187 | if 'special_tokens_mask' in encoded_inputs: # Dynamic masking. 188 | special_tokens_mask = encoded_inputs.pop('special_tokens_mask', None) 189 | (encoded_inputs['input_ids'], encoded_inputs['labels']) = _mask_tokens( 190 | encoded_inputs['input_ids'], 191 | special_tokens_mask=special_tokens_mask, 192 | tokenizer=tokenizer, 193 | mlm_probability=mlm_probability, 194 | ignore_index=ignore_index, 195 | ) 196 | return encoded_inputs 197 | 198 | logger = DatasetLogger( 199 | log_dir=log_dir, 200 | node_rank=get_node_rank(nproc_per_node=get_nproc_per_node(local_rank)), 201 | local_rank=local_rank, 202 | log_level=log_level, 203 | ) 204 | 205 | dataset_kwargs = { 206 | 'shuffle_buffer_size': shuffle_buffer_size, 207 | 'shuffle_buffer_warmup_factor': shuffle_buffer_warmup_factor, 208 | 'base_seed': base_seed, 209 | 'logger': logger, 210 | 'start_epoch': start_epoch, 211 | 'process_group': process_group 212 | } 213 | 214 | extra_collate = data_loader_kwargs.get('collate_fn', lambda x: x) 215 | if not return_raw_samples: 216 | data_loader_kwargs['collate_fn'] = lambda batch: extra_collate( 217 | _batch_preprocess(batch)) 218 | 219 | # Find all the parquet file paths and figure out whether it is binned or 220 | # un-binned. 
221 | all_file_paths = get_all_parquets_under(path) 222 | bin_ids = get_all_bin_ids(all_file_paths) 223 | if len(bin_ids) > 0: 224 | data_loader = BertPretrainBinned( 225 | [ 226 | data_loader_class( 227 | BertPretrainDataset( 228 | get_file_paths_for_bin_id(all_file_paths, bin_id), 229 | **dataset_kwargs, 230 | ), 231 | **data_loader_kwargs, 232 | ) for bin_id in bin_ids 233 | ], 234 | base_seed=base_seed, 235 | start_epoch=start_epoch, 236 | logger=logger, 237 | ) 238 | else: # un-binned 239 | data_loader = data_loader_class( 240 | BertPretrainDataset(all_file_paths, **dataset_kwargs), 241 | **data_loader_kwargs, 242 | ) 243 | 244 | return data_loader 245 | --------------------------------------------------------------------------------
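A hedged usage sketch of get_bert_pretrain_data_loader as defined above: unlike the upstream lddl example quoted in its docstring, this variant takes a process_group argument instead of local_rank and derives the rank from it. The import path, shard directory, vocab name, batch size and the torchrun/NCCL launch are illustrative assumptions, not taken from this repository.

import logging

import torch.distributed as dist

from titans.dataloader.bert.bert_pretrain import get_bert_pretrain_data_loader  # assumed import path

# Assumes a torchrun-style launch; the NCCL backend implies one GPU per process.
dist.init_process_group(backend='nccl')

train_dataloader = get_bert_pretrain_data_loader(
    '/path/to/balanced/parquet/shards',   # output of the lddl preprocessor (illustrative path)
    vocab_file='bert-large-uncased',      # a vocab file path or a pretrained tokenizer name
    data_loader_kwargs={
        'batch_size': 32,
        'num_workers': 4,                 # shard count must be divisible by world_size * num_workers
        'pin_memory': True,
    },
    log_level=logging.WARNING,
    start_epoch=0,
    process_group=None,                   # None uses the default (world) process group
)

for epoch in range(2):
    for batch in train_dataloader:
        # Each batch is a dict with 'input_ids', 'token_type_ids', 'attention_mask',
        # 'labels' and 'next_sentence_labels', as documented in the docstring above.
        pass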