├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.yml │ ├── config.yml │ ├── documentation.yml │ └── feature_request.yml └── workflows │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .style.yapf ├── LICENSE ├── README.md ├── benchmark └── zero │ ├── README.md │ ├── colossalai_utils │ ├── gpt2_config.json │ ├── gpt2_config_v1.json │ ├── utils.py │ └── vit_config.json │ ├── common │ ├── gpt2.py │ ├── train.py │ ├── utils.py │ └── vit.py │ ├── deepspeed_utils │ ├── gpt2_config.json │ ├── utils.py │ └── vit_config.json │ ├── fairscale_utils │ ├── gpt2_config.json │ ├── utils.py │ └── vit_config.json │ ├── patrickstar_utils │ ├── gpt2_config.json │ ├── utils.py │ └── vit_config.json │ ├── requirement.txt │ ├── run.py │ └── torch_utils │ ├── gpt2_config.json │ ├── utils.py │ └── vit_config.json ├── features ├── amp │ ├── README.md │ ├── config │ │ ├── config_AMP_apex.py │ │ ├── config_AMP_naive.py │ │ ├── config_AMP_torch.py │ │ └── config_fp32.py │ ├── requirements.txt │ ├── scripts │ │ └── train_slurm.sh │ └── train.py ├── colotensor │ ├── README.md │ ├── gpt_megatron.py │ └── requirements.txt ├── gradient_accumulation │ ├── README.md │ ├── config.py │ ├── requirements.txt │ └── train.py ├── gradient_clipping │ ├── README.md │ ├── config.py │ ├── requirements.txt │ └── train.py ├── pipeline_parallel │ ├── .init │ ├── README.md │ ├── requirements.txt │ ├── resnet.py │ └── rpc │ │ ├── gpt │ │ ├── 1f1b.py │ │ ├── README.md │ │ ├── baseline.py │ │ └── dataset │ │ │ ├── webtext.py │ │ │ └── yuan.py │ │ ├── opt │ │ ├── 1f1b.py │ │ ├── README.md │ │ └── check │ │ │ └── opt_avail.py │ │ └── resnet │ │ ├── 1f1b.py │ │ ├── README.md │ │ ├── chimera.py │ │ └── fill_drain.py ├── tensor_parallel │ ├── README.md │ ├── configs │ │ ├── tp_1d.py │ │ ├── tp_2d.py │ │ ├── tp_2p5d.py │ │ └── tp_3d.py │ ├── requirements.txt │ └── run.py └── zero │ ├── README.md │ ├── requirements.txt │ ├── train.py │ └── train_v2.py ├── image ├── detr-debug │ ├── README.md │ ├── configs │ │ └── detr_1d.py │ ├── datasets │ │ ├── __init__.py │ │ ├── coco.py │ │ ├── coco_eval.py │ │ └── transforms.py │ ├── engine.py │ ├── models │ │ ├── __init__.py │ │ ├── backbone.py │ │ ├── detr.py │ │ ├── matcher.py │ │ ├── position_encoding.py │ │ └── transformer.py │ ├── requirements.txt │ ├── results │ │ ├── log.txt │ │ └── loss_curve.jpg │ ├── run_train.py │ └── util │ │ ├── __init__.py │ │ ├── box_ops.py │ │ ├── misc.py │ │ └── plot_utils.py ├── detr │ ├── README.md │ ├── config.py │ ├── datasets │ │ ├── __init__.py │ │ ├── coco.py │ │ ├── coco_eval.py │ │ └── transforms.py │ ├── engine.py │ ├── main.py │ ├── models │ │ ├── __init__.py │ │ ├── backbone.py │ │ ├── detr.py │ │ ├── matcher.py │ │ ├── position_encoding.py │ │ ├── segmentation.py │ │ └── transformer.py │ ├── requirements.txt │ └── util │ │ ├── __init__.py │ │ ├── box_ops.py │ │ ├── misc.py │ │ └── plot_utils.py ├── diffusion │ ├── LICENSE │ ├── README.md │ └── requirements.txt ├── mae │ ├── .gitignore │ ├── README.md │ ├── config │ │ ├── pretrain.py │ │ └── pretrain_1d_tp2.py │ ├── main_pretrain.py │ ├── models_mae_tp.py │ ├── requirements.txt │ └── util │ │ ├── crop.py │ │ ├── misc.py │ │ └── pos_embed.py ├── mlpmixer │ ├── README.md │ ├── colossalAI_mlpmixer.py │ ├── configs │ │ └── MlpMixer_vanilla.py │ ├── requirements.txt │ ├── train_data.py │ └── train_pipline.py ├── moe │ ├── README.md │ ├── config.py │ ├── requirements.txt │ └── train.py ├── resnet │ ├── README.md │ ├── auto_parallel │ │ ├── README.md │ │ └── auto_parallel_demo.py │ ├── config.py │ ├── 
requirements.txt │ ├── resnet.py │ └── train.py ├── simclr │ ├── NT_Xentloss.py │ ├── README.md │ ├── augmentation.py │ ├── config.py │ ├── le_config.py │ ├── models │ │ ├── Backbone.py │ │ ├── linear_eval.py │ │ └── simclr.py │ ├── myhooks.py │ ├── requirements.txt │ ├── results │ │ ├── embedding.npz │ │ ├── linear_eval_acc.png │ │ ├── linear_eval_loss.png │ │ ├── ssl_loss.png │ │ ├── test_tsne.png │ │ └── train_tsne.png │ ├── train.sh │ ├── train_linear.py │ ├── train_simclr.py │ └── visualization.py ├── vilt │ ├── .gitignore │ ├── README.md │ ├── configs.py │ ├── models │ │ └── vilt.py │ ├── prepare_dataset.sh │ ├── requirements.txt │ ├── run.py │ ├── run.sh │ ├── schedule.py │ └── utils │ │ ├── base_dataset.py │ │ ├── config.py │ │ ├── dataloader.py │ │ ├── datamodule_base.py │ │ ├── heads.py │ │ ├── makearrow.py │ │ ├── objectives.py │ │ ├── transforms │ │ ├── __init__.py │ │ ├── pixelbert.py │ │ ├── randaug.py │ │ └── utils.py │ │ └── write_coco_karpathy.py └── vision_transformer │ ├── colo_vit │ ├── README.md │ ├── configs │ │ └── vit_1d_tp2.py │ ├── requirements.txt │ ├── run.sh │ ├── test_vit.py │ ├── train.py │ ├── utils │ │ ├── dummy_data_generator.py │ │ └── util.py │ └── vit.py │ ├── data_parallel │ ├── README.md │ ├── config.py │ ├── mixup.py │ ├── myhooks.py │ ├── requirements.txt │ ├── results │ │ ├── acc.jpeg │ │ └── loss.jpeg │ ├── scripts │ │ └── train_slurm.sh │ ├── train.py │ └── train_with_cifar10.py │ └── hybrid_parallel │ ├── README.md │ ├── configs │ ├── vit_1d_tp2_pp2.py │ ├── vit_1d_tp4_pp16.py │ ├── vit_2d_tp4_pp16.py │ ├── vit_2p5d_tp4_pp16.py │ ├── vit_3d_tp8_pp8.py │ └── vit_pipeline.py │ ├── model │ ├── __init__.py │ └── vit.py │ ├── requirements.txt │ ├── train_with_cifar10.py │ ├── train_with_engine.py │ └── train_with_trainer.py ├── language ├── DeepNet │ ├── README.md │ ├── dataset │ │ └── webtext.py │ ├── decoder_configs │ │ └── deepnet_pp1d.py │ ├── requirements.txt │ └── train_deepnet_decoder.py ├── bert │ ├── colotensor │ │ ├── README.md │ │ ├── configs │ │ │ └── bert_base_tp1d.py │ │ ├── dataset │ │ │ ├── __init__.py │ │ │ └── wikitext.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ └── hfmodel.py │ │ ├── requirements.txt │ │ └── train.py │ ├── hybrid_parallel │ │ ├── README.md │ │ ├── colossalai_utils │ │ │ ├── bert_config_pp.json │ │ │ ├── bert_config_tp1d.json │ │ │ ├── bert_config_tp1dpp.json │ │ │ ├── bert_config_tp2d.json │ │ │ ├── bert_config_tp2p5d.json │ │ │ ├── bert_config_tp3d.json │ │ │ ├── bert_config_zero.json │ │ │ ├── bert_config_zerotppp.json │ │ │ ├── model_zoo │ │ │ │ ├── __init__.py │ │ │ │ └── colo_bert.py │ │ │ ├── requirement.txt │ │ │ └── utils.py │ │ ├── common │ │ │ ├── helper.py │ │ │ └── train.py │ │ ├── requirements.txt │ │ └── run.py │ ├── preprocessing │ │ ├── .gitignore │ │ ├── README.md │ │ ├── pretrain_preprocess.sh │ │ └── requirements.txt │ ├── requirements.txt │ ├── sequene_parallel │ │ ├── README.md │ │ ├── config.py │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── bert_helper.py │ │ │ ├── datasets │ │ │ │ ├── Makefile │ │ │ │ ├── __init__.py │ │ │ │ ├── bert_dataset.py │ │ │ │ ├── blendable_dataset.py │ │ │ │ ├── builder.py │ │ │ │ ├── data_samplers.py │ │ │ │ ├── dataset_utils.py │ │ │ │ ├── helpers.cpp │ │ │ │ ├── ict_dataset.py │ │ │ │ ├── indexed_dataset.py │ │ │ │ └── test │ │ │ │ │ ├── test_indexed_dataset.py │ │ │ │ │ └── test_preprocess_data.sh │ │ │ └── tokenizer │ │ │ │ ├── __init__.py │ │ │ │ ├── bert_tokenization.py │ │ │ │ └── tokenizer.py │ │ ├── loss_func │ │ │ ├── __init__.py │ │ │ ├── bert_loss.py │ │ 
│ ├── cross_entropy.py │ │ │ └── utils.py │ │ ├── lr_scheduler │ │ │ ├── __init__.py │ │ │ └── annealing_lr.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── bert.py │ │ │ └── layers │ │ │ │ ├── __init__.py │ │ │ │ ├── bert_layer.py │ │ │ │ ├── dropout.py │ │ │ │ ├── embedding.py │ │ │ │ ├── head.py │ │ │ │ ├── init_method.py │ │ │ │ ├── linear.py │ │ │ │ ├── mlp.py │ │ │ │ ├── pooler.py │ │ │ │ └── preprocess.py │ │ ├── requirements.txt │ │ └── train.py │ └── zero │ │ ├── .gitignore │ │ ├── README.md │ │ ├── configs │ │ ├── bert_base.json │ │ ├── colossalai_amp.py │ │ └── colossalai_zero.py │ │ ├── finetuning │ │ └── glue │ │ │ ├── __init__.py │ │ │ ├── arguments.py │ │ │ ├── data.py │ │ │ ├── main.py │ │ │ ├── metrics.py │ │ │ ├── processors.py │ │ │ └── utils.py │ │ ├── pretraining │ │ ├── arguments.py │ │ ├── loss.py │ │ ├── pretrain_utils.py │ │ └── run_pretraining.py │ │ ├── requirements.txt │ │ └── scripts │ │ ├── download_finetune_dataset.sh │ │ ├── run_finetune_glue.sh │ │ └── run_pretrain.sh ├── gpt │ ├── README.md │ ├── dataset │ │ ├── webtext.py │ │ └── yuan.py │ ├── gpt2_configs │ │ ├── gpt2_1d.py │ │ ├── gpt2_2d.py │ │ ├── gpt2_2p5d.py │ │ ├── gpt2_3d.py │ │ ├── gpt2_pp.py │ │ ├── gpt2_pp1d.py │ │ ├── gpt2_vanilla.py │ │ ├── gpt2_zero3.py │ │ └── gpt2_zero3_pp1d.py │ ├── gpt3_configs │ │ ├── gpt3_pp1d.py │ │ ├── gpt3_pp1d_min.py │ │ ├── gpt3_pp2d.py │ │ └── gpt3_pp2p5d.py │ ├── model │ │ ├── __init__.py │ │ ├── embed.py │ │ ├── gpt1d.py │ │ └── pipeline_gpt1d.py │ ├── requirements.txt │ ├── tools │ │ ├── LSH │ │ │ └── cMinhash.cpp │ │ ├── Megatron │ │ │ ├── __init__.py │ │ │ ├── blacklist_urls.py │ │ │ ├── cleanup_dataset.py │ │ │ ├── cleanup_fix_dataset.py │ │ │ ├── find_duplicates.py │ │ │ ├── gpt2_tokenization.py │ │ │ ├── group_duplicate_url.py │ │ │ ├── remove_group_duplicates.py │ │ │ └── tokenizer.py │ │ └── download │ │ │ ├── download.py │ │ │ ├── download_old.py │ │ │ ├── filter.py │ │ │ ├── get_urls.py │ │ │ ├── scrapers.py │ │ │ └── utils.py │ └── train_gpt.py ├── knowledge_graph_embedding │ ├── README.md │ ├── config.py │ ├── dataloader │ │ └── dataloader.py │ ├── requirements.txt │ └── train.py ├── opt │ ├── README.md │ ├── benchmark.sh │ ├── colossalai_zero.py │ ├── requirements.txt │ ├── run_clm.py │ ├── run_clm.sh │ └── utils.py └── roberta │ ├── README.md │ ├── configs │ ├── colossalai_ddp.py │ └── colossalai_zero.py │ ├── preprocessing │ ├── Makefile │ ├── README.md │ ├── get_mask.py │ ├── mask.cpp │ ├── sentence_split.py │ └── tokenize_mask.py │ ├── pretraining │ ├── README.md │ ├── arguments.py │ ├── bert_dataset_provider.py │ ├── evaluation.py │ ├── hostfile │ ├── loss.py │ ├── model │ │ ├── bert.py │ │ └── deberta_v2.py │ ├── nvidia_bert_dataset_provider.py │ ├── pretrain_utils.py │ ├── run_pretrain.sh │ ├── run_pretrain_resume.sh │ ├── run_pretraining.py │ └── utils │ │ ├── WandbLog.py │ │ ├── exp_util.py │ │ ├── global_vars.py │ │ └── logger.py │ └── requirements.txt ├── requirements.txt └── utils └── checkpoint ├── load.py ├── readme.md ├── save_engine.py └── save_trainer.py /.github/ISSUE_TEMPLATE/bug-report.yml: -------------------------------------------------------------------------------- 1 | name: 🐛 Bug Report 2 | description: Create a report to help us reproduce and fix the bug 3 | 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: > 8 | #### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/ColossalAI/issues/new). 
9 | - type: textarea 10 | attributes: 11 | label: 🐛 Describe the bug 12 | description: | 13 | **Describe the bug** 14 | A clear and concise description of what the bug is. 15 | **To Reproduce** 16 | Steps or code snippet to reproduce the behavior. 17 | **Expected behavior** 18 | A clear and concise description of what you expected to happen. 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | placeholder: | 22 | A clear and concise description of what the bug is. 23 | validations: 24 | required: true 25 | - type: textarea 26 | attributes: 27 | label: Environment 28 | description: | 29 | Please provide the environment information, eg. CUDA/cuDNN/NCCL/Python/PyTorch version. 30 | 31 | - type: markdown 32 | attributes: 33 | value: > 34 | Thanks for contributing 🎉! 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: "😊 Discussions" 4 | url: https://github.com/hpcaitech/ColossalAI/discussions 5 | about: Ask questions and discuss with other Colossal-AI community members in our forum 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://www.colossalai.org/ 3 | 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: > 8 | #### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/ColossalAI/issues/new). 9 | - type: textarea 10 | attributes: 11 | label: 📚 The doc issue 12 | description: | 13 | **Description** What content in [Documentation](https://www.colossalai.org/) is an issue? 14 | **Location** Where is the issue location? 15 | **Expectation** What is your expected content about it? 16 | **Screenshots** If applicable, add screenshots to help explain your problem. 17 | **Suggestions** Tell us how we could improve the documentation. 18 | placeholder: | 19 | A clear and concise description of the issue. 20 | validations: 21 | required: true 22 | 23 | - type: markdown 24 | attributes: 25 | value: > 26 | Thanks for contributing 🎉! 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Suggest an idea for this project 3 | 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: > 8 | #### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/ColossalAI/issues/new). 9 | - type: textarea 10 | attributes: 11 | label: Describe the feature 12 | description: | 13 | **Is your feature request related to a problem? Please describe.** 14 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 15 | **Describe the solution you'd like** 16 | A clear and concise description of what you want to happen. 17 | **Describe alternatives you've considered** 18 | A clear and concise description of any alternative solutions or features you've considered. 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | **Suggest a potential alternative/fix** 22 | Tell us how we could improve this project. 
      placeholder: |
        A clear and concise description of your idea.
    validations:
      required: true

  - type: markdown
    attributes:
      value: >
        Thanks for contributing 🎉!
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
repos:
  - repo: https://github.com/pre-commit/mirrors-yapf
    rev: v0.32.0
    hooks:
      - id: yapf
        args: ['--style=.style.yapf', '--parallel', '--in-place']
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v13.0.1
    hooks:
      - id: clang-format
--------------------------------------------------------------------------------
/.style.yapf:
--------------------------------------------------------------------------------
[style]
based_on_style = google
spaces_before_comment = 4
split_before_logical_operator = true
column_limit = 120
--------------------------------------------------------------------------------
/benchmark/zero/README.md:
--------------------------------------------------------------------------------
# GPT2 ZeRO Benchmark
GPT2 ZeRO benchmark with data parallelism to evaluate Colossal-AI, DeepSpeed, FairScale and PatrickStar.

## Requirements
```
CUDA>=11.3
torch>=1.10.0
deepspeed>=0.5.8
fairscale>=0.4.5
patrickstar>=0.4.6
nvidia-dali>=1.8.0
```

## Setup
1. Install dependencies if you do not have them
```
pip install -r requirement.txt
```
2. Also, clone PatrickStar from GitHub
```
git clone https://github.com/Tencent/PatrickStar.git
```
3. Install PatrickStar
```
cd PatrickStar
pip install .
```
4. Add the root directory to `PYTHONPATH`
```
export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
```

## GPT Usage

1. Prepare datasets and tokenizers from HuggingFace Hub if necessary (e.g. we provide an example of training `wikitext-2`).

2.
Run benchmark with one of the systems to evaluate 46 | ``` 47 | DATA=/PATH/TO/DATASET LOG=/PATH/TO/LOG torchrun --nproc_per_node=NUM_GPUS run.py --config=CONFIG_FILE 48 | ``` 49 | -------------------------------------------------------------------------------- /benchmark/zero/colossalai_utils/gpt2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "gpt2_10b" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 1, 8 | "steps_per_epoch": 3 9 | }, 10 | "fp16": { 11 | "initial_scale": 32768, 12 | "min_scale": 1, 13 | "growth_factor": 2.0, 14 | "backoff_factor": 0.5, 15 | "growth_interval": 1000 16 | }, 17 | "gradient_clipping": 0.0, 18 | "zero": { 19 | "reduce_scatter_bucket_size_mb": 25, 20 | "fp32_reduce_scatter": false, 21 | "offload_config": { 22 | "device": "cpu" 23 | }, 24 | "reuse_fp16_shard": true, 25 | "version": 2 26 | }, 27 | "use_mem_monitor": true 28 | } -------------------------------------------------------------------------------- /benchmark/zero/colossalai_utils/gpt2_config_v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "gpt2_small" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 3, 8 | "steps_per_epoch": 10 9 | }, 10 | "fp16": { 11 | "initial_scale": 32768, 12 | "min_scale": 1, 13 | "growth_factor": 2.0, 14 | "backoff_factor": 0.5, 15 | "growth_interval": 1000 16 | }, 17 | "gradient_clipping": 0.0, 18 | "zero": { 19 | "mixed_precision": true, 20 | "reshard_after_forward": false, 21 | "offload_config": { 22 | "device": "cpu" 23 | }, 24 | "version": 1 25 | }, 26 | "use_mem_monitor": true 27 | } 28 | -------------------------------------------------------------------------------- /benchmark/zero/colossalai_utils/vit_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "vit_h" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 4, 8 | "steps_per_epoch": 10 9 | }, 10 | "fp16": { 11 | "initial_scale": 32768, 12 | "min_scale": 1, 13 | "growth_factor": 2.0, 14 | "backoff_factor": 0.5, 15 | "growth_interval": 1000 16 | }, 17 | "gradient_clipping": 1.0, 18 | "zero": { 19 | "reduce_scatter_bucket_size_mb": 25, 20 | "fp32_reduce_scatter": false, 21 | "offload_config": { 22 | "device": "cpu" 23 | }, 24 | "shard_param": true 25 | }, 26 | "use_mem_monitor": true 27 | } -------------------------------------------------------------------------------- /benchmark/zero/deepspeed_utils/gpt2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "deepspeed", 3 | "model": { 4 | "type": "gpt2_10b" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 20, 8 | "num_epochs": 2, 9 | "steps_per_epoch": 10, 10 | "synthetic": true 11 | }, 12 | "train_batch_size": 40, 13 | "steps_per_print": 2147483647, 14 | "zero_optimization": { 15 | "stage": 3, 16 | "offload_optimizer": { 17 | "device": "cpu", 18 | "pin_memory": true, 19 | "buffer_count": 4, 20 | "fast_init": false 21 | }, 22 | "offload_param": { 23 | "device": "cpu", 24 | "pin_memory": true, 25 | "buffer_count": 5, 26 | "buffer_size": 1e8, 27 | "max_in_cpu": 1e9 28 | }, 29 | "allgather_partitions": true, 30 | "allgather_bucket_size": 5e8, 31 | "overlap_comm": true, 32 | "reduce_scatter": true, 33 | "reduce_bucket_size": 5e8, 34 | "contiguous_gradients": true, 35 | "stage3_max_live_parameters": 1e9, 36 
| "stage3_max_reuse_distance": 1e9, 37 | "stage3_prefetch_bucket_size": 5e8, 38 | "stage3_param_persistence_threshold": 1e6 39 | }, 40 | "gradient_clipping": 1.0, 41 | "fp16": { 42 | "enabled": true, 43 | "loss_scale": 0, 44 | "initial_scale_power": 5, 45 | "loss_scale_window": 1000, 46 | "hysteresis": 2, 47 | "min_loss_scale": 1 48 | }, 49 | "use_mem_monitor": true 50 | } -------------------------------------------------------------------------------- /benchmark/zero/deepspeed_utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from zero.common.utils import CONFIG, get_gpu_memory_mb, print_log 3 | 4 | 5 | def init_w_ds(builder): 6 | import deepspeed 7 | 8 | config = CONFIG.copy() 9 | 10 | deepspeed.init_distributed() 11 | 12 | if CONFIG.get('gpu_mem_fraction', None) is not None: 13 | torch.cuda.set_per_process_memory_fraction(CONFIG['gpu_mem_fraction']) 14 | print_log(f'Set max GPU mem: {get_gpu_memory_mb() * CONFIG["gpu_mem_fraction"]:.2f} MB') 15 | 16 | build_data, build_model, build_loss, build_optimizer, build_scheduler = builder() 17 | 18 | train_data, test_data = build_data() 19 | 20 | with deepspeed.zero.Init(config_dict_or_path=config): 21 | model = build_model() 22 | 23 | criterion = build_loss() 24 | 25 | optimizer = build_optimizer(model.parameters()) 26 | 27 | lr_scheduler = build_scheduler(len(train_data), optimizer) 28 | 29 | model, optimizer, _, lr_scheduler = deepspeed.initialize(model=model, 30 | optimizer=optimizer, 31 | lr_scheduler=lr_scheduler, 32 | config=config) 33 | 34 | return model, train_data, test_data, criterion, optimizer, None, lr_scheduler 35 | -------------------------------------------------------------------------------- /benchmark/zero/deepspeed_utils/vit_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "deepspeed", 3 | "model": { 4 | "type": "vit_h" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 4, 8 | "steps_per_epoch": 10 9 | }, 10 | "train_batch_size": 32, 11 | "steps_per_print": 2147483647, 12 | "zero_optimization": { 13 | "stage": 3, 14 | "offload_optimizer": { 15 | "device": "cpu", 16 | "pin_memory": true, 17 | "buffer_count": 4, 18 | "fast_init": false 19 | }, 20 | "offload_param": { 21 | "device": "cpu", 22 | "pin_memory": true, 23 | "buffer_count": 5, 24 | "buffer_size": 1e8, 25 | "max_in_cpu": 1e9 26 | }, 27 | "allgather_partitions": true, 28 | "allgather_bucket_size": 5e8, 29 | "overlap_comm": true, 30 | "reduce_scatter": true, 31 | "reduce_bucket_size": 5e8, 32 | "contiguous_gradients": true, 33 | "stage3_max_live_parameters": 1e9, 34 | "stage3_max_reuse_distance": 1e9, 35 | "stage3_prefetch_bucket_size": 5e8, 36 | "stage3_param_persistence_threshold": 1e6 37 | }, 38 | "gradient_clipping": 1.0, 39 | "fp16": { 40 | "enabled": true, 41 | "loss_scale": 0, 42 | "initial_scale_power": 15, 43 | "loss_scale_window": 1000, 44 | "hysteresis": 2, 45 | "min_loss_scale": 1 46 | }, 47 | "use_mem_monitor": true 48 | } -------------------------------------------------------------------------------- /benchmark/zero/fairscale_utils/gpt2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "fairscale", 3 | "model": { 4 | "type": "gpt2_10b" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 1, 8 | "num_epochs": 2, 9 | "steps_per_epoch": 10, 10 | "synthetic": true 11 | }, 12 | "fp16": { 13 | "enabled": true, 14 | "init_scale": 32768, 15 | "growth_factor": 2.0, 16 | 
"backoff_factor": 0.5, 17 | "growth_interval": 1000 18 | }, 19 | "gradient_clipping": 1.0, 20 | "fsdp": { 21 | "reshard_after_forward": true, 22 | "mixed_precision": true, 23 | "fp32_reduce_scatter": false, 24 | "flatten_parameters": true, 25 | "move_params_to_cpu": true, 26 | "bucket_cap_mb": 25, 27 | "clear_autocast_cache": false, 28 | "force_input_to_fp32": false, 29 | "state_dict_on_rank_0_only": false 30 | }, 31 | "use_mem_monitor": true 32 | } -------------------------------------------------------------------------------- /benchmark/zero/fairscale_utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from zero.common.utils import CONFIG, get_gpu_memory_mb, print_log 5 | from torch.distributed import init_process_group 6 | 7 | 8 | def init_w_fs(builder): 9 | from fairscale.nn.checkpoint import checkpoint_wrapper 10 | from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP 11 | from fairscale.optim.grad_scaler import ShardedGradScaler 12 | 13 | rank = int(os.environ['RANK']) 14 | world_size = int(os.environ['WORLD_SIZE']) 15 | host = os.environ['MASTER_ADDR'] 16 | port = int(os.environ['MASTER_PORT']) 17 | init_process_group(rank=rank, world_size=world_size, init_method=f'tcp://{host}:{port}', backend='nccl') 18 | 19 | torch.cuda.set_device(rank) 20 | if CONFIG.get('gpu_mem_fraction', None) is not None: 21 | torch.cuda.set_per_process_memory_fraction(CONFIG['gpu_mem_fraction']) 22 | print_log(f'Set max GPU mem: {get_gpu_memory_mb() * CONFIG["gpu_mem_fraction"]:.2f} MB') 23 | 24 | build_data, build_model, build_loss, build_optimizer, build_scheduler = builder() 25 | 26 | train_data, test_data = build_data() 27 | 28 | assert 'fsdp' in CONFIG 29 | use_checkpoint = CONFIG['model'].get('checkpoint') 30 | CONFIG['model']['checkpoint'] = False 31 | model = build_model() 32 | if use_checkpoint: 33 | model = checkpoint_wrapper(model) 34 | model = FSDP(model, **CONFIG['fsdp']) 35 | 36 | criterion = build_loss() 37 | 38 | optimizer = build_optimizer(model.parameters()) 39 | 40 | scaler = ShardedGradScaler(**CONFIG['fp16']) if 'fp16' in CONFIG else None 41 | 42 | lr_scheduler = build_scheduler(len(train_data), optimizer) 43 | 44 | return model, train_data, test_data, criterion, optimizer, scaler, lr_scheduler 45 | -------------------------------------------------------------------------------- /benchmark/zero/fairscale_utils/vit_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "fairscale", 3 | "model": { 4 | "type": "vit_h", 5 | "checkpoint": false 6 | }, 7 | "hyperparameter": { 8 | "batch_size": 4 9 | }, 10 | "fp16": { 11 | "enabled": true, 12 | "init_scale": 32768, 13 | "growth_factor": 2.0, 14 | "backoff_factor": 0.5, 15 | "growth_interval": 1000 16 | }, 17 | "gradient_clipping": 1.0, 18 | "fsdp": { 19 | "reshard_after_forward": true, 20 | "mixed_precision": true, 21 | "fp32_reduce_scatter": false, 22 | "flatten_parameters": true, 23 | "move_params_to_cpu": true, 24 | "bucket_cap_mb": 25, 25 | "clear_autocast_cache": false, 26 | "force_input_to_fp32": false, 27 | "state_dict_on_rank_0_only": false 28 | }, 29 | "use_mem_monitor": true 30 | } -------------------------------------------------------------------------------- /benchmark/zero/patrickstar_utils/gpt2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "patrickstar", 3 | "model": { 4 | "type": "gpt2_10b" 5 | }, 6 | 
"hyperparameter": { 7 | "batch_size": 8, 8 | "num_epochs": 2, 9 | "steps_per_epoch": 10, 10 | "synthetic": true 11 | }, 12 | "optimizer": { 13 | "type": "AdamW", 14 | "params": { 15 | "lr": 0.0015, 16 | "weight_decay": 0.01, 17 | "use_hybrid_adam": true 18 | } 19 | }, 20 | "fp16": { 21 | "enabled": true, 22 | "loss_scale": 0, 23 | "initial_scale_power": 15, 24 | "loss_scale_window": 1000, 25 | "hysteresis": 2, 26 | "min_loss_scale": 1 27 | }, 28 | "default_chunk_size": 1073741824, 29 | "release_after_init": true, 30 | "gradient_clipping": 1.0, 31 | "use_cpu_embedding": false, 32 | "use_mem_monitor": true 33 | } -------------------------------------------------------------------------------- /benchmark/zero/patrickstar_utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from zero.common.utils import CONFIG, get_gpu_memory_mb, print_log 5 | from torch.distributed import init_process_group 6 | 7 | 8 | def init_w_ps(builder): 9 | from patrickstar.runtime import initialize_engine 10 | 11 | config = CONFIG.copy() 12 | 13 | rank = int(os.environ['RANK']) 14 | world_size = int(os.environ['WORLD_SIZE']) 15 | host = os.environ['MASTER_ADDR'] 16 | port = int(os.environ['MASTER_PORT']) 17 | init_process_group(rank=rank, world_size=world_size, init_method=f'tcp://{host}:{port}', backend='nccl') 18 | 19 | torch.cuda.set_device(rank) 20 | if CONFIG.get('gpu_mem_fraction', None) is not None: 21 | torch.cuda.set_per_process_memory_fraction(CONFIG['gpu_mem_fraction']) 22 | print_log(f'Set max GPU mem: {get_gpu_memory_mb() * CONFIG["gpu_mem_fraction"]:.2f} MB') 23 | 24 | build_data, build_model, build_loss, _, build_scheduler = builder() 25 | 26 | train_data, test_data = build_data() 27 | 28 | criterion = build_loss() 29 | 30 | model, optimizer = initialize_engine(model_func=build_model, local_rank=rank, config=config) 31 | 32 | lr_scheduler = build_scheduler(len(train_data), optimizer) 33 | 34 | return model, train_data, test_data, criterion, optimizer, None, lr_scheduler 35 | -------------------------------------------------------------------------------- /benchmark/zero/patrickstar_utils/vit_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "patrickstar", 3 | "model": { 4 | "type": "vit_h" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 4 8 | }, 9 | "optimizer": { 10 | "type": "AdamW", 11 | "params": { 12 | "lr": 0.0015, 13 | "weight_decay": 0.01, 14 | "use_hybrid_adam": true 15 | } 16 | }, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0, 20 | "initial_scale_power": 15, 21 | "loss_scale_window": 1000, 22 | "hysteresis": 2, 23 | "min_loss_scale": 1 24 | }, 25 | "default_chunk_size": 67108864, 26 | "release_after_init": true, 27 | "gradient_clipping": 1.0, 28 | "use_cpu_embedding": false, 29 | "use_mem_monitor": true 30 | } -------------------------------------------------------------------------------- /benchmark/zero/requirement.txt: -------------------------------------------------------------------------------- 1 | 2 | torch>=1.10 -f https://download.pytorch.org/whl/cu113/torch_stable.html 3 | torchvision -f https://download.pytorch.org/whl/cu113/torch_stable.html 4 | transformers 5 | datasets 6 | colossalai 7 | deepspeed 8 | fairscale 9 | rich 10 | nvidia-dali-cuda110 --extra-index-url https://developer.download.nvidia.com/compute/redist -------------------------------------------------------------------------------- /benchmark/zero/run.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | from zero.colossalai_utils.utils import init_w_col 4 | from zero.common.gpt2 import gpt2_builder 5 | from zero.common.train import train 6 | from zero.common.utils import CONFIG, load_config, print_log 7 | from zero.common.vit import vit_builder 8 | from zero.deepspeed_utils.utils import init_w_ds 9 | from zero.fairscale_utils.utils import init_w_fs 10 | from zero.patrickstar_utils.utils import init_w_ps 11 | from zero.torch_utils.utils import init_w_torch 12 | 13 | _zero_method = { 14 | 'fairscale': init_w_fs, 15 | 'colossalai': init_w_col, 16 | 'torch': init_w_torch, 17 | 'patrickstar': init_w_ps, 18 | 'deepspeed': init_w_ds 19 | } 20 | 21 | _builder = { 22 | 'gpt2': gpt2_builder, 23 | 'vit': vit_builder, 24 | } 25 | 26 | 27 | def run_zero(): 28 | method = CONFIG['method'] 29 | assert method in ['colossalai', 'deepspeed', 'fairscale', 'patrickstar', 'torch'], f'No support for {method}.' 30 | 31 | model = CONFIG['model']['type'] 32 | model_type = model.split('_')[0] 33 | assert model_type in ['gpt2', 'vit'], f'No support for {model}.' 34 | 35 | train(*_zero_method[method](_builder[model_type])) 36 | 37 | 38 | if __name__ == '__main__': 39 | load_config() 40 | 41 | CONFIG['log_path'] = os.environ.get('LOG', '.') 42 | os.makedirs(CONFIG['log_path'], exist_ok=True) 43 | 44 | print_log(f'Initializing {CONFIG["method"]} ...') 45 | 46 | run_zero() 47 | -------------------------------------------------------------------------------- /benchmark/zero/torch_utils/gpt2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "torch", 3 | "model": { 4 | "type": "gpt2_10b" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 1, 8 | "num_epochs": 2, 9 | "steps_per_epoch": 10, 10 | "synthetic": true 11 | }, 12 | "fp16": { 13 | "enabled": true, 14 | "init_scale": 32768, 15 | "growth_factor": 2.0, 16 | "backoff_factor": 0.5, 17 | "growth_interval": 1000 18 | }, 19 | "gradient_clipping": 1.0, 20 | "use_mem_monitor": true 21 | } 22 | -------------------------------------------------------------------------------- /benchmark/zero/torch_utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from zero.common.utils import CONFIG, get_gpu_memory_mb, get_model_size, print_log 5 | from torch.distributed import init_process_group 6 | from torch.nn.parallel import DistributedDataParallel as DDP 7 | 8 | 9 | def init_w_torch(builder): 10 | rank = int(os.environ['RANK']) 11 | world_size = int(os.environ['WORLD_SIZE']) 12 | host = os.environ['MASTER_ADDR'] 13 | port = int(os.environ['MASTER_PORT']) 14 | init_process_group(rank=rank, world_size=world_size, init_method=f'tcp://{host}:{port}', backend='nccl') 15 | 16 | torch.cuda.set_device(rank) 17 | if CONFIG.get('gpu_mem_fraction', None) is not None: 18 | torch.cuda.set_per_process_memory_fraction(CONFIG['gpu_mem_fraction']) 19 | print_log(f'Set max GPU mem: {get_gpu_memory_mb() * CONFIG["gpu_mem_fraction"]:.2f} MB') 20 | 21 | build_data, build_model, build_loss, build_optimizer, build_scheduler = builder() 22 | 23 | train_data, test_data = build_data() 24 | 25 | model = build_model().to(rank) 26 | if 'numel' not in CONFIG['model']: 27 | CONFIG['model']['numel'] = get_model_size(model) 28 | model = DDP(model) 29 | 30 | criterion = build_loss() 31 | 32 | optimizer = build_optimizer(model.parameters()) 33 | 34 | scaler = 
torch.cuda.amp.GradScaler(**CONFIG['fp16']) if 'fp16' in CONFIG else None

    lr_scheduler = build_scheduler(len(train_data), optimizer)

    return model, train_data, test_data, criterion, optimizer, scaler, lr_scheduler
--------------------------------------------------------------------------------
/benchmark/zero/torch_utils/vit_config.json:
--------------------------------------------------------------------------------
{
    "method": "torch",
    "model": {
        "type": "vit_h"
    },
    "hyperparameter": {
        "batch_size": 4
    },
    "fp16": {
        "enabled": true,
        "init_scale": 32768,
        "growth_factor": 2.0,
        "backoff_factor": 0.5,
        "growth_interval": 1000
    },
    "gradient_clipping": 1.0,
    "use_mem_monitor": true
}
--------------------------------------------------------------------------------
/features/amp/config/config_AMP_apex.py:
--------------------------------------------------------------------------------
from colossalai.amp import AMP_TYPE

# ViT Base
BATCH_SIZE = 128
DROP_RATE = 0.1
NUM_EPOCHS = 2

fp16 = dict(
    mode=AMP_TYPE.APEX,
)

clip_grad_norm = 1.0
--------------------------------------------------------------------------------
/features/amp/config/config_AMP_naive.py:
--------------------------------------------------------------------------------
from colossalai.amp import AMP_TYPE

# ViT Base
BATCH_SIZE = 128
DROP_RATE = 0.1
NUM_EPOCHS = 2

fp16 = dict(
    mode=AMP_TYPE.NAIVE,
)

clip_grad_norm = 1.0
--------------------------------------------------------------------------------
/features/amp/config/config_AMP_torch.py:
--------------------------------------------------------------------------------
from colossalai.amp import AMP_TYPE

# ViT Base
BATCH_SIZE = 128
DROP_RATE = 0.1
NUM_EPOCHS = 2

fp16 = dict(
    mode=AMP_TYPE.TORCH,
)

clip_grad_norm = 1.0
--------------------------------------------------------------------------------
/features/amp/config/config_fp32.py:
--------------------------------------------------------------------------------
from colossalai.amp import AMP_TYPE

# ViT Base
BATCH_SIZE = 128
DROP_RATE = 0.1
NUM_EPOCHS = 2

clip_grad_norm = 1.0
--------------------------------------------------------------------------------
/features/amp/requirements.txt:
--------------------------------------------------------------------------------
colossalai
torch >= 1.8.1
--------------------------------------------------------------------------------
/features/amp/scripts/train_slurm.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

python train.py --host $HOST --config ./config/config_AMP_naive.py --port 29500
--------------------------------------------------------------------------------
/features/colotensor/README.md:
--------------------------------------------------------------------------------
# Use tensor model parallelism via ColoTensor

## Introduction

This is an example for the tutorial **Parallelize Your Training like Megatron-LM via ColoTensor**.
It shows how to adapt your model to tensor model parallelism.
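As a warm-up, the following toy, single-process sketch (plain PyTorch, not the ColoTensor API that `gpt_megatron.py` uses) shows the idea behind Megatron-style column parallelism: each worker holds a slice of a linear layer's weight and computes a slice of the output.

```python
import torch

torch.manual_seed(0)

# A full linear layer and a batch of activations.
linear = torch.nn.Linear(8, 16, bias=False)
x = torch.randn(4, 8)

# Column parallelism: two "workers" each hold half of the output
# columns of the weight matrix (the weight is stored as (out, in)).
w0, w1 = linear.weight.chunk(2, dim=0)

# Each worker computes its shard independently; concatenating the
# shards recovers the full output (an all-gather in real TP).
y_sharded = torch.cat([x @ w0.t(), x @ w1.t()], dim=-1)
print(torch.allclose(y_sharded, linear(x)))  # True
```

ColoTensor automates exactly this kind of sharding, plus the required communication, for the layers of a real model.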
Use the command below to run the example.

```bash
colossalai run --nproc_per_node <num_gpus> gpt_megatron.py
```
--------------------------------------------------------------------------------
/features/colotensor/requirements.txt:
--------------------------------------------------------------------------------
colossalai
torch >= 1.8.1
--------------------------------------------------------------------------------
/features/gradient_accumulation/README.md:
--------------------------------------------------------------------------------
# Gradient Accumulation

## Prepare Dataset

We use the CIFAR10 dataset in this example. The dataset will be downloaded to `./data` by default.
If you wish to use a customized directory for the dataset, you can set the environment variable `DATA` via the following command.

```bash
export DATA=/path/to/data
```

## Verify Gradient Accumulation

To verify gradient accumulation, we can just check the change of parameter values. When gradient accumulation is set, parameters are only updated on the last step of each accumulation cycle (every `gradient_accumulation = 4` iterations in this example).

```bash
colossalai run --nproc_per_node 1 train.py
```
--------------------------------------------------------------------------------
/features/gradient_accumulation/config.py:
--------------------------------------------------------------------------------
from colossalai.amp import AMP_TYPE

BATCH_SIZE = 128
NUM_EPOCHS = 200

gradient_accumulation = 4
--------------------------------------------------------------------------------
/features/gradient_accumulation/requirements.txt:
--------------------------------------------------------------------------------
colossalai
torch >= 1.8.1
--------------------------------------------------------------------------------
/features/gradient_clipping/README.md:
--------------------------------------------------------------------------------
# Gradient Clipping

## Usage

To use gradient clipping, you can just add the following line to your configuration file, where the value is the maximum norm of the gradients (this example's `config.py` uses `2.0`).

```python
gradient_clipping = 2.0
```

## Prepare Dataset

We use the CIFAR10 dataset in this example. The dataset will be downloaded to `./data` by default.
If you wish to use a customized directory for the dataset, you can set the environment variable `DATA` via the following command.

```bash
export DATA=/path/to/data
```

## Verify Gradient Clipping

To verify gradient clipping, we can just check the change of parameter values.
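The effect of clipping can also be seen in isolation with plain PyTorch, independent of Colossal-AI. A minimal standalone sketch using `torch.nn.utils.clip_grad_norm_`, which is what norm-based clipping boils down to:

```python
import torch

model = torch.nn.Linear(10, 10)

# Produce deliberately large gradients.
loss = 1000 * model(torch.randn(4, 10)).sum()
loss.backward()

# Global L2 norm over all parameter gradients.
total_norm = lambda ps: torch.norm(torch.stack([p.grad.norm() for p in ps]))
print(f'before: {total_norm(model.parameters()).item():.2f}')

# Rescale gradients so that their global L2 norm is at most 2.0.
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)
print(f'after:  {total_norm(model.parameters()).item():.2f}')  # ~2.0
```

To run the actual example: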
```bash
colossalai run --nproc_per_node 1 train.py
```
--------------------------------------------------------------------------------
/features/gradient_clipping/config.py:
--------------------------------------------------------------------------------
from colossalai.amp import AMP_TYPE

BATCH_SIZE = 128
NUM_EPOCHS = 200

gradient_clipping = 2.0
--------------------------------------------------------------------------------
/features/gradient_clipping/requirements.txt:
--------------------------------------------------------------------------------
colossalai
torch >= 1.8.1
--------------------------------------------------------------------------------
/features/gradient_clipping/train.py:
--------------------------------------------------------------------------------
from pathlib import Path
from colossalai.logging import get_dist_logger
import colossalai
import torch
import os
from colossalai.core import global_context as gpc
from colossalai.utils import get_dataloader
from torchvision import transforms
from colossalai.nn.lr_scheduler import CosineAnnealingLR
from torchvision.datasets import CIFAR10
from torchvision.models import resnet34


def main():
    colossalai.launch_from_torch(config='./config.py')

    logger = get_dist_logger()

    # build resnet
    model = resnet34(num_classes=10)

    # build dataloaders
    train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')),
                            download=True,
                            transform=transforms.Compose([
                                transforms.RandomCrop(size=32, padding=4),
                                transforms.RandomHorizontalFlip(),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
                            ]))

    train_dataloader = get_dataloader(
        dataset=train_dataset,
        shuffle=True,
        batch_size=gpc.config.BATCH_SIZE,
        num_workers=1,
        pin_memory=True,
    )

    # build criterion
    criterion = torch.nn.CrossEntropyLoss()

    # optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)

    # lr_scheduler
    lr_scheduler = CosineAnnealingLR(optimizer, total_steps=gpc.config.NUM_EPOCHS)

    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(
        model,
        optimizer,
        criterion,
        train_dataloader,
    )

    # verify gradient clipping
    engine.train()
    for idx, (img, label) in enumerate(train_dataloader):
        img = img.cuda()
        label = label.cuda()

        engine.zero_grad()
        output = engine(img)
        train_loss = engine.criterion(output, label)
        engine.backward(train_loss)
        engine.step()
        lr_scheduler.step()

        ele_1st = next(model.parameters()).flatten()[0]
        logger.info(f'iteration {idx}, loss: {train_loss}, 1st element of parameters: {ele_1st.item()}')

        # only run for 4 iterations
        if idx == 3:
            break


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/features/pipeline_parallel/.init:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/features/pipeline_parallel/.init
--------------------------------------------------------------------------------
/features/pipeline_parallel/README.md:
--------------------------------------------------------------------------------
# Train ResNet50 on CIFAR10 with pipeline

## Requirements

To use pipeline parallel training, you should install colossalai from the **latest** main branch.

## How to run

We use `colossalai.launch_from_torch` as an example here. Before running, you should `export DATA=/path/to/cifar`.

If you are training on a single node with multiple GPUs:
```shell
colossalai run --nproc_per_node <num_gpus> resnet.py
```
--------------------------------------------------------------------------------
/features/pipeline_parallel/requirements.txt:
--------------------------------------------------------------------------------
colossalai
torch >= 1.8.1
--------------------------------------------------------------------------------
/features/pipeline_parallel/rpc/gpt/dataset/webtext.py:
--------------------------------------------------------------------------------
import json
import os

import torch
from colossalai.registry import DATASETS
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer


@DATASETS.register_module
class WebtextDataset(Dataset):

    def __init__(self, path, seq_len=1024) -> None:
        super().__init__()
        root = os.path.dirname(path)
        encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt')
        if os.path.isfile(encoded_data_cache_path):
            seq_len_, data, attention_mask = torch.load(encoded_data_cache_path)
            if seq_len_ == seq_len:
                self.data = data
                self.attention_mask = attention_mask
                return
        raw_data = []
        with open(path) as f:
            for line in f.readlines():
                raw_data.append(json.loads(line)['text'])
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        tokenizer.pad_token = tokenizer.unk_token
        encoded_data = tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt')
        self.data = encoded_data['input_ids']
        self.attention_mask = encoded_data['attention_mask']
        torch.save((seq_len, self.data, self.attention_mask), encoded_data_cache_path)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return {'input_ids': self.data[index],
                'attention_mask': self.attention_mask[index]}, self.data[index]
--------------------------------------------------------------------------------
/features/pipeline_parallel/rpc/opt/README.md:
--------------------------------------------------------------------------------
# Example

Example of training OPT-125m through different PP strategies.

## run non-interleaved 1F1B

```bash
python3 1f1b.py --world_size=4 --num_microbatches=8 --device="cuda" --batch_size=16 --epoch=20 --master_port=29011
```

> for a customized world_size, please adjust the partition strategy
--------------------------------------------------------------------------------
/features/pipeline_parallel/rpc/opt/check/opt_avail.py:
--------------------------------------------------------------------------------
from transformers import GPT2Tokenizer, OPTForCausalLM

model = OPTForCausalLM.from_pretrained("facebook/opt-125m")
tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-125m")

prompt = "Hey, are you conscious? Can you talk to me?"
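# Tokenize the prompt, then greedily generate a short continuation
# to confirm that the OPT-125m checkpoint loads and runs.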
inputs = tokenizer(prompt, return_tensors="pt")

generate_ids = model.generate(inputs.input_ids, max_length=30)
print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
--------------------------------------------------------------------------------
/features/pipeline_parallel/rpc/resnet/README.md:
--------------------------------------------------------------------------------
# Example

Example of training ResNet on CIFAR10 through different PP strategies.

## import data

```bash
export DATA=/path/cifar-10
```

## run Fill Drain
```bash
python3 fill_drain.py --epoch=1 --world_size=2 --batch_size=512 --chunk=1 --optimizer="SGD" --device="cuda" --num_microbatches=4
```

> for a customized world_size, please adjust the partition strategy


## run 1F1B

```bash
python3 1f1b.py --epoch=1 --world_size=2 --batch_size=512 --chunk=1 --optimizer="SGD" --device="cuda" --num_microbatches=4
```

> for a customized world_size, please adjust the partition strategy

## run Chimera
Chimera is not stable; the program may hang at some iteration.
```bash
python3 chimera.py --world_size=2 --epoch=1 --batch_size=128 --chunk=1 --optimizer="SGD" --device="cuda" --num_microbatches=4
```

> for a customized world_size, please adjust the partition strategy

## help
run `python3 1f1b.py --help` for the available configuration of the pipeline:

```
-h, --help            show this help message and exit
--epoch EPOCH
--world_size WORLD_SIZE
--batch_size BATCH_SIZE
--dp_degree DP_DEGREE
--tp_degree TP_DEGREE
--num_microbatches NUM_MICROBATCHES
--chunk CHUNK
--use_checkpoint
--optimizer {SGD,Adam,RMSprop}
--device {cpu,cuda}
--master_addr MASTER_ADDR
--master_port MASTER_PORT
--num_worker_threads NUM_WORKER_THREADS
```

`chunk` means the number of virtual pipeline stages on each card. If `chunk==1`, there is only one virtual stage on each card, equivalent to the **non-interleaved** mode.

If `chunk>1` (`chunk=2`, for example), there are two virtual stages on each card, equivalent to the **interleaved** mode.

As a result, the actual number of pipeline stages (denoted by `actual_stage_num`) is $\text{chunk} \times \text{world\_size}$.

It is recommended not to set `chunk>2`: too much communication payload on one card may cause `torch.distributed.rpc` to fail, depending on your hardware.

In the demo of ResNet, please set `world_size=2, chunk=1`, because the current partition strategy only supports this configuration.
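To see how the non-interleaved 1F1B schedule referred to above orders work, here is a small standalone sketch (an illustration of the schedule only, not the RPC implementation in `1f1b.py`) that prints the forward/backward sequence each stage executes:

```python
def one_f_one_b(num_stages: int, num_microbatches: int, stage: int):
    """Operation order of one stage under non-interleaved 1F1B:
    warm-up forwards, alternating forward/backward in steady state,
    then the remaining cool-down backwards."""
    warmup = min(num_stages - stage - 1, num_microbatches)
    steady = num_microbatches - warmup
    order = [f'F{i}' for i in range(warmup)]
    for i in range(steady):
        order += [f'F{warmup + i}', f'B{i}']
    order += [f'B{steady + i}' for i in range(warmup)]
    return order


for s in range(4):
    print(f'stage {s}:', ' '.join(one_f_one_b(4, 8, s)))
```

Every microbatch is forwarded and backwarded exactly once per stage; only the warm-up/cool-down lengths differ across stages, which is what keeps at most `num_stages` microbatches in flight and bounds activation memory.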
--------------------------------------------------------------------------------
/features/tensor_parallel/README.md:
--------------------------------------------------------------------------------
# Tensor Parallelism

## Usage

To use tensor parallelism, there are several steps to follow:

1. define `parallel` in your configuration file. Set `mode` for `tensor` to `1d`, `2d`, `2.5d` or `3d`.
2. construct your model and replace `torch.nn.Linear` with `colossalai.nn.Linear`.
3. split the input data accordingly.

## Reference

If you wish to understand how tensor parallelism works exactly, you may refer to our [documentation](https://colossalai.org/docs/features/1D_tensor_parallel).

## How to run

In this example, we constructed a simple MLP model for demonstration purposes. You can execute the following commands to run the demo.

```shell
# run 1D tensor parallelism on 4 GPUs
colossalai run --nproc_per_node=4 run.py --config ./configs/tp_1d.py

# run 2D tensor parallelism on 4 GPUs
colossalai run --nproc_per_node=4 run.py --config ./configs/tp_2d.py

# run 2.5D tensor parallelism on 8 GPUs
colossalai run --nproc_per_node=8 run.py --config ./configs/tp_2p5d.py

# run 3D tensor parallelism on 8 GPUs
colossalai run --nproc_per_node=8 run.py --config ./configs/tp_3d.py
```
--------------------------------------------------------------------------------
/features/tensor_parallel/configs/tp_1d.py:
--------------------------------------------------------------------------------
parallel = dict(
    data=1,
    pipeline=1,
    tensor=dict(size=2, mode='1d'),
)
--------------------------------------------------------------------------------
/features/tensor_parallel/configs/tp_2d.py:
--------------------------------------------------------------------------------
parallel = dict(
    data=1,
    pipeline=1,
    tensor=dict(size=4, mode='2d'),
)
--------------------------------------------------------------------------------
/features/tensor_parallel/configs/tp_2p5d.py:
--------------------------------------------------------------------------------
parallel = dict(
    data=1,
    pipeline=1,
    tensor=dict(size=8, mode='2.5d', depth=2),
)
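# Note: 2.5D ("Tesseract") tensor parallelism arranges the tensor parallel
# group as size = depth * q ** 2 for an integer q; here 8 = 2 * 2 ** 2, so q = 2.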
--------------------------------------------------------------------------------
/features/tensor_parallel/configs/tp_3d.py:
--------------------------------------------------------------------------------
parallel = dict(
    data=1,
    pipeline=1,
    tensor=dict(size=8, mode='3d'),
)
--------------------------------------------------------------------------------
/features/tensor_parallel/requirements.txt:
--------------------------------------------------------------------------------
colossalai
torch >= 1.8.1
--------------------------------------------------------------------------------
/features/tensor_parallel/run.py:
--------------------------------------------------------------------------------
import colossalai
import colossalai.nn as col_nn
import torch
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.utils import get_current_device, print_rank_0
from colossalai.global_variables import tensor_parallel_env as tp_env


class MLP(torch.nn.Module):

    def __init__(self, dim: int = 256):
        super().__init__()
        intermediate_dim = dim * 4
        self.dense_1 = col_nn.Linear(dim, intermediate_dim)
        print_rank_0(f'Weight of the first linear layer: {self.dense_1.weight.shape}')
        self.activation = torch.nn.GELU()
        self.dense_2 = col_nn.Linear(intermediate_dim, dim)
        print_rank_0(f'Weight of the second linear layer: {self.dense_2.weight.shape}')
        self.dropout = col_nn.Dropout(0.1)

    def forward(self, x):
        x = self.dense_1(x)
        print_rank_0(f'Output of the first linear layer: {x.shape}')
        x = self.activation(x)
        x = self.dense_2(x)
        print_rank_0(f'Output of the second linear layer: {x.shape}')
        x = self.dropout(x)
        return x


def main():
    colossalai.logging.disable_existing_loggers()
    parser = colossalai.get_default_parser()
    args = parser.parse_args()
    colossalai.launch_from_torch(config=args.config)

    m = MLP()

    x = torch.randn((16, 256), device=get_current_device())
    torch.distributed.broadcast(x, src=0)

    # partition input
    if tp_env.mode == '1d':
        pass
    elif tp_env.mode == '2d':
        x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL)]
        x = torch.chunk(x, 2, dim=-1)[gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW)]
    elif tp_env.mode == '2.5d':
        x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_DEP)]
        x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)]
        x = torch.chunk(x, 2, dim=-1)[gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW)]
    elif tp_env.mode == '3d':
        x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)]
        x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)]
        x = torch.chunk(x, 2, dim=-1)[gpc.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)]
    print_rank_0(f'Input: {x.shape}')

    x = m(x)

    gpc.destroy()


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/features/zero/README.md:
--------------------------------------------------------------------------------
# ZeRO

This tutorial works for ColossalAI v0.1.10.

## Prepare Model

In this example, we use Hugging Face `transformers`. You have to install `transformers` before running this example. We will take `GPT2 Medium` as an example here.

```shell
# install huggingface transformers
pip install transformers
```

## Prepare Data

This example is intended to show you how to use `ZeRO`. For simplicity, we just use randomly generated data here.

## Run with ZeRO

We just use a naive training loop in this example. `Engine` and `Trainer` are not used.

```shell
colossalai run --nproc_per_node=1 train.py
```
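For intuition about what ZeRO saves: with mixed-precision Adam, every parameter costs roughly 2 bytes (fp16 weights) + 2 bytes (fp16 gradients) + 12 bytes (fp32 master weights plus the two Adam moments), and the ZeRO stages partition these states across the data-parallel ranks. A back-of-the-envelope sketch following the ZeRO paper's accounting (the GPT2 Medium parameter count is approximate):

```python
# Per-GPU memory for model states under ZeRO (estimates, not measurements).
GB = 1024 ** 3


def model_state_gb(num_params: int, num_gpus: int, stage: int) -> float:
    params, grads, optim = 2 * num_params, 2 * num_params, 12 * num_params
    if stage >= 1:    # ZeRO-1 shards the optimizer states
        optim /= num_gpus
    if stage >= 2:    # ZeRO-2 also shards the gradients
        grads /= num_gpus
    if stage >= 3:    # ZeRO-3 also shards the parameters
        params /= num_gpus
    return (params + grads + optim) / GB


n = 355_000_000    # GPT2 Medium, approximately
for stage in (0, 1, 2, 3):
    print(f'ZeRO-{stage}, 8 GPUs: {model_state_gb(n, 8, stage):.2f} GB/GPU')
```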
16 | ``` 17 | 18 | ## How to run 19 | ``` 20 | $ DATA=/path/to/data/ python -m torch.distributed.launch --nproc_per_node=nproc_per_node 21 | --master_addr MASTER_ADDR 22 | --master_port MASTER_PORT 23 | run_train.py 24 | --config=CONFIG_FILE 25 | --world_size=WORLD_SIZE 26 | --rank=RANK 27 | --local_rank=LOCAL_RANK 28 | ``` 29 | 30 | ## Cite us 31 | ``` 32 | @article{bian2021colossal, 33 | title={Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training}, 34 | author={Bian, Zhengda and Liu, Hongxin and Wang, Boxiang and Huang, Haichen and Li, Yongbin and Wang, Chuanrui and Cui, Fan and You, Yang}, 35 | journal={arXiv preprint arXiv:2110.14883}, 36 | year={2021} 37 | } 38 | ``` -------------------------------------------------------------------------------- /image/detr-debug/configs/detr_1d.py: -------------------------------------------------------------------------------- 1 | BATCH_SIZE = 4 2 | LEARNING_RATE = 2e-3 3 | WEIGHT_DECAY = 3e-2 4 | 5 | # pipeline config 6 | parallel = dict(pipeline=2,) 7 | NUM_MICRO_BATCHES = parallel['pipeline'] 8 | 9 | # tensor config 10 | #TENSOR_PARALLEL_SIZE = 2 11 | #TENSOR_PARALLEL_MODE = '1d' 12 | 13 | NUM_EPOCHS = 800 14 | WARMUP_EPOCHS = 40 15 | clip_max_norm = 2. 16 | 17 | seed = 77 18 | 19 | LOG_PATH = f"./detr_1d_ai2d_tp2_bs{BATCH_SIZE}_lr{LEARNING_RATE}/" 20 | 21 | 22 | find_unused_parameters = True 23 | 24 | coco_path = '/data/huxin/xjtuhx/projects/ai2d-detection-baselines/111/data_dir/ai2d/' 25 | pre_norm = False 26 | save_ckpt_freq = 50 27 | lr_backbone = 1e-5 28 | device = 'cuda' 29 | lr_drop = 200 30 | backbone = 'resnet34' 31 | dilation = None 32 | position_embedding = 'sine' 33 | enc_layers = 2 34 | dec_layers = 2 35 | dim_feedforward = 512 36 | hidden_dim = 256 37 | dropout = 0.1 38 | nheads = 1 39 | num_queries = 100 40 | masks = False 41 | set_cost_class = 1 42 | set_cost_bbox = 5 43 | set_cost_giou = 2 44 | mask_loss_coef = 1 45 | dice_loss_coef = 1 46 | bbox_loss_coef = 5 47 | giou_loss_coef = 2 48 | eos_coef = 0.1 49 | dataset_file = 'ai2d' 50 | remove_difficult = True 51 | output_dir = '/data/huxin/xjtuhx/projects/ai2d-detection-baselines/111/output_test/' 52 | resume = '' 53 | start_epoch = 0 54 | eval = False 55 | num_workers = 2 56 | world_size = 1 57 | dist_url = 'env://' 58 | distributed = True 59 | aux_loss = False 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /image/detr-debug/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | import torch.utils.data 3 | import torchvision 4 | # from util.params import opt 5 | 6 | from .coco import build as build_coco 7 | 8 | def get_coco_api_from_dataset(dataset): 9 | for _ in range(10): 10 | if isinstance(dataset, torch.utils.data.Subset): 11 | dataset = dataset.dataset 12 | if isinstance(dataset, torchvision.datasets.CocoDetection): 13 | return dataset.coco 14 | 15 | 16 | def build_dataset(image_set, args): 17 | if args.dataset_file == 'coco': 18 | return build_coco(image_set, args) 19 | if args.dataset_file == 'ai2d': 20 | return build_coco(image_set, args) 21 | 22 | raise ValueError(f'dataset {args.dataset_file} not supported') -------------------------------------------------------------------------------- /image/detr-debug/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .detr import DETR 2 | 3 | 4 | def build_model(backbone, transformer, num_classes): 5 | model = DETR( 6 | backbone, 7 | transformer, 8 | num_classes=num_classes, 9 | num_queries=50, 10 | aux_loss=False, 11 | ) 12 | 13 | return model -------------------------------------------------------------------------------- /image/detr-debug/models/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Various positional encodings for the transformer. 4 | """ 5 | import math 6 | import torch 7 | from torch import nn 8 | from colossalai.registry import LAYERS, MODELS 9 | 10 | @LAYERS.register_module 11 | class PositionEmbeddingSine(nn.Module): 12 | """ 13 | This is a more standard version of the position embedding, very similar to the one 14 | used by the Attention is all you need paper, generalized to work on images. 
15 | """ 16 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 17 | super().__init__() 18 | self.num_pos_feats = num_pos_feats 19 | self.temperature = temperature 20 | self.normalize = normalize 21 | if scale is not None and normalize is False: 22 | raise ValueError("normalize should be True if scale is passed") 23 | if scale is None: 24 | scale = 2 * math.pi 25 | self.scale = scale 26 | 27 | def forward(self, tensor_list): 28 | x = tensor_list.tensors 29 | mask = tensor_list.mask 30 | assert mask is not None 31 | not_mask = ~mask 32 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 33 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 34 | if self.normalize: 35 | eps = 1e-6 36 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 37 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 38 | 39 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 40 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 41 | 42 | pos_x = x_embed[:, :, :, None] / dim_t 43 | pos_y = y_embed[:, :, :, None] / dim_t 44 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 45 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 46 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 47 | return pos 48 | 49 | 50 | def build_position_encoding(args): 51 | N_steps = args.hidden_dim // 2 52 | # if args.position_embedding in ('v2', 'sine'): 53 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 54 | # elif args.position_embedding in ('v3', 'learned'): 55 | # position_embedding = PositionEmbeddingLearned(N_steps) 56 | # else: 57 | # raise ValueError(f"not supported {args.position_embedding}") 58 | 59 | return position_embedding -------------------------------------------------------------------------------- /image/detr-debug/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | -e git+ssh://git@github.com/hpcaitech/ColossalAI.git@7d15ec7fe20b07180f5dc3f4b580e2cba37c5b9e#egg=colossalai 3 | absl-py==1.0.0 4 | cachetools==4.2.4 5 | certifi==2021.10.8 6 | charset-normalizer==2.0.10 7 | colossalai 8 | einops==0.4.0 9 | google-auth==2.3.3 10 | google-auth-oauthlib==0.4.6 11 | grpcio==1.43.0 12 | idna==3.3 13 | importlib-metadata==4.10.1 14 | Markdown==3.3.6 15 | numpy==1.21.5 16 | nvidia-dali-cuda102==1.6.0 17 | oauthlib==3.1.1 18 | packaging==21.3 19 | Pillow==9.0.0 20 | pip==21.2.2 21 | protobuf==3.19.3 22 | psutil==5.9.0 23 | pyasn1==0.4.8 24 | pyasn1-modules==0.2.8 25 | pyparsing==3.0.7 26 | requests==2.27.1 27 | requests-oauthlib==1.3.0 28 | rsa==4.8 29 | setuptools==58.0.4 30 | six==1.16.0 31 | tensorboard==2.8.0 32 | tensorboard-data-server==0.6.1 33 | tensorboard-plugin-wit==1.8.1 34 | tensorboardX==2.4.1 35 | timm==0.5.4 36 | torch==1.10.1 37 | torchvision==0.11.2 38 | tqdm==4.62.3 39 | typing_extensions==4.0.1 40 | urllib3==1.26.8 41 | Werkzeug==2.0.2 42 | wheel==0.37.1 43 | zipp==3.7.0 -------------------------------------------------------------------------------- /image/detr-debug/results/loss_curve.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/detr-debug/results/loss_curve.jpg -------------------------------------------------------------------------------- 
/image/detr-debug/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/detr-debug/util/__init__.py -------------------------------------------------------------------------------- /image/detr/README.md: --------------------------------------------------------------------------------
1 | # DEtection TRansformer (DETR) on Colossal-AI
2 | 
3 | ## Requirement
4 | 
5 | You should install colossalai from the **latest** main branch.
6 | 
7 | ---
8 | 
9 | ## How to run
10 | 
11 | On a single server, you can directly use torch.distributed to start pre-training on multiple GPUs in parallel. In Colossal-AI, we provide several launch methods to initialize the distributed backend. You can use `colossalai.launch` and `colossalai.get_default_parser` to pass the parameters via the command line. If you happen to use launchers such as SLURM, OpenMPI, or the PyTorch launch utility, you can use the matching `colossalai.launch_from_*` method, which reads the rank and world size directly from the environment variables for convenience.
12 | 
13 | Before running, you should `export DATA=/path/to/coco`.
14 | 
15 | In your terminal (the default `config.py` uses a tensor parallel size of 4, hence 4 processes):
16 | ```shell
17 | colossalai run --nproc_per_node 4 main.py --config config.py
18 | ```
19 | 
20 | ---
21 | 
22 | 
23 | ## Details
24 | `config.py`
25 | 
26 | Contains the configuration for DETR.
27 | 
28 | `main.py`
29 | 
30 | The engine is called through this file to start the training process using Colossal-AI.
31 | 
32 | `engine.py`
33 | 
34 | Implements the training and evaluation procedures for DETR.
35 | 
36 | `./datasets`
37 | 
38 | Dataset preprocessing.
39 | 
40 | `./models`
41 | 
42 | Model specifications of the DETR model, containing the Transformer and backbone implementations.
43 | 
44 | `./util`
45 | 
46 | Utilities used in DETR.
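
For reference, the snippet below is a minimal sketch of how an entry script like `main.py` can initialize the distributed backend. It mirrors the `get_default_parser`/`launch_from_torch` pattern used in the other examples in this repository; it is not a verbatim excerpt of `main.py`.

```python
# minimal launch sketch, assuming the config path is passed via --config
import colossalai


def main():
    parser = colossalai.get_default_parser()
    args = parser.parse_args()
    # reads rank and world size from the environment set up by the launcher
    colossalai.launch_from_torch(config=args.config)
    # ... build the model, dataloaders, and engine described above, then train ...


if __name__ == '__main__':
    main()
```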
-------------------------------------------------------------------------------- /image/detr/config.py: -------------------------------------------------------------------------------- 1 | BATCH_SIZE = 2 2 | LEARNING_RATE = 1e-4 3 | WEIGHT_DECAY = 1e-4 4 | 5 | TENSOR_PARALLEL_SIZE = 4 6 | TENSOR_PARALLEL_MODE = '1d' 7 | 8 | NUM_EPOCHS = 300 9 | lr_drop = 200 10 | clip_max_norm = 0.1 11 | 12 | # gradient_clipping = 0.1 13 | 14 | parallel = dict( 15 | pipeline=1, 16 | tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), 17 | ) 18 | 19 | cudnn_benchmark = False 20 | 21 | seed = 42 22 | 23 | LOG_PATH = f"./detr_{TENSOR_PARALLEL_MODE}_coco_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/" 24 | 25 | 26 | # find_unused_parameters = True 27 | 28 | coco_path = '/data/scratch/coco' 29 | save_ckpt_freq = 50 30 | lr_backbone = 1e-5 31 | device = 'cuda' 32 | lr_drop = 200 33 | backbone = 'resnet50' 34 | dilation = False 35 | position_embedding = 'sine' 36 | enc_layers = 6 37 | dec_layers = 6 38 | dim_feedforward = 2048 39 | hidden_dim = 256 40 | dropout = 0.1 41 | nheads = 8 42 | num_queries = 100 43 | masks = False 44 | set_cost_class = 1 45 | set_cost_bbox = 5 46 | set_cost_giou = 2 47 | mask_loss_coef = 1 48 | dice_loss_coef = 1 49 | bbox_loss_coef = 5 50 | giou_loss_coef = 2 51 | eos_coef = 0.1 52 | dataset_file = 'coco' 53 | remove_difficult = False 54 | output_dir = '' 55 | resume = '' 56 | start_epoch = 0 57 | eval = False 58 | num_workers = 2 59 | dist_url = 'env://' 60 | distributed = True 61 | aux_loss = True 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /image/detr/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved
2 | import torch.utils.data
3 | import torchvision
4 | 
5 | from .coco import build as build_coco
6 | 
7 | 
8 | def get_coco_api_from_dataset(dataset):
9 | for _ in range(10):
10 | # if isinstance(dataset, torchvision.datasets.CocoDetection):
11 | # break
12 | if isinstance(dataset, torch.utils.data.Subset):
13 | dataset = dataset.dataset
14 | if isinstance(dataset, torchvision.datasets.CocoDetection):
15 | return dataset.coco
16 | 
17 | 
18 | def build_dataset(image_set, args):
19 | if args.dataset_file == 'coco':
20 | return build_coco(image_set, args)
21 | if args.dataset_file == 'coco_panoptic':
22 | # to avoid making panopticapi required for coco
23 | from .coco_panoptic import build as build_coco_panoptic
24 | return build_coco_panoptic(image_set, args)
25 | raise ValueError(f'dataset {args.dataset_file} not supported')
26 | 
-------------------------------------------------------------------------------- /image/detr/models/__init__.py: --------------------------------------------------------------------------------
1 | from .detr import build
2 | 
3 | 
4 | def build_model(args):
5 | return build(args)
6 | 
-------------------------------------------------------------------------------- /image/detr/requirements.txt: --------------------------------------------------------------------------------
1 | colossalai
2 | torch >= 1.8.1
3 | 
-------------------------------------------------------------------------------- /image/detr/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/detr/util/__init__.py -------------------------------------------------------------------------------- /image/diffusion/requirements.txt: --------------------------------------------------------------------------------
1 | colossalai
2 | torch >= 1.8.1
3 | 
-------------------------------------------------------------------------------- /image/mae/.gitignore: --------------------------------------------------------------------------------
1 | ./data
2 | ./output -------------------------------------------------------------------------------- /image/mae/README.md: --------------------------------------------------------------------------------
1 | # Pretrain MAE on ImageNet 1000 (mini)
2 | 
3 | Colossal-AI implementation of MAE, [arXiv](https://arxiv.org/abs/2111.06377).
4 | 
5 | As an example, we just cover the pretrain phase with the ImageNet 1000
6 | (mini) dataset. Helpers under the subdir [util/](./util/) are from
7 | [facebookresearch/deit](https://github.com/facebookresearch/deit),
8 | under Apache License 2.0.
9 | 
10 | ## Prepare Dataset
11 | 
12 | In the script, we use the ImageNet 1000 (mini) dataset hosted on
13 | [Kaggle](https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000/discussion).
14 | 
15 | Download and extract the dataset, then set the environment
16 | variable `DATA`, or soft-link the data to the default location `{config_dir}/data`.
17 | 
18 | ```bash
19 | # example
20 | export DATA=/path/to/imagenet-mini/
21 | 
22 | # or link to default place
23 | ln -s /path/to/imagenet-mini/ ./data
24 | ```
25 | 
26 | ## Run single-GPU training
27 | 
28 | This example is developed and tested under PyTorch 1.10; use `torchrun`
29 | to run it:
30 | 
31 | ```bash
32 | torchrun --standalone --nnodes=1 --nproc_per_node 1 main_pretrain.py
33 | ```
34 | 
35 | It reads [./config/pretrain.py](./config/pretrain.py) as the startup
36 | configuration; feel free to check it if you want to fine-tune the model
37 | or get some insight.
38 | 
39 | By default, pretraining generates a series of checkpoints, named
40 | `./output/checkpoint-{epoch}.pth`.
41 | 
42 | 
43 | ## Run multi-GPU training
44 | 
45 | To run multi-GPU training on a single node, just change the `--nproc_per_node`
46 | parameter. For example, if `--nproc_per_node=4`, 4 GPUs on this machine will be
47 | used for training. However, to make sure the model converges well, you should
48 | adjust your batch size and learning rate accordingly.
49 | 
50 | 
51 | ## Tensor Parallel
52 | 
53 | The model in [models_mae_tp.py](./models_mae_tp.py) is modified to support 1D tensor parallelism.
54 | You can read about 1D tensor parallelism in [this documentation](https://www.colossalai.org/docs/features/1D_tensor_parallel).
55 | [./config/pretrain_1d_tp2.py](./config/pretrain_1d_tp2.py) is the 1D parallel configuration.
56 | 
57 | Pass the file path with the `--config` flag:
58 | 
59 | ```bash
60 | torchrun --standalone --nnodes 1 --nproc_per_node 2 main_pretrain.py --config ./config/pretrain_1d_tp2.py
61 | ```
62 | 
63 | We can also increase data parallelism by increasing `--nproc_per_node`:
64 | 
65 | ```bash
66 | torchrun --standalone --nnodes 1 --nproc_per_node 4 main_pretrain.py --config ./config/pretrain_1d_tp2.py
67 | ```
68 | 
69 | This will result in `data parallel size: 2, pipeline parallel size: 1, tensor parallel size: 2`.
70 | 
71 | 
72 | 
-------------------------------------------------------------------------------- /image/mae/config/pretrain.py: --------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | 
4 | from colossalai.amp import AMP_TYPE
5 | from torchvision import transforms
6 | 
7 | import util.misc as misc
8 | from util.crop import RandomResizedCrop
9 | 
10 | # ==== Colossal-AI Configuration ====
11 | 
12 | gradient_accumulation = 1
13 | fp16 = dict(mode=AMP_TYPE.TORCH)
14 | 
15 | # ==== Model Configuration ====
16 | #
17 | # Variable Naming Convention:
18 | #
19 | # 1. `THIS_WILL_BE_DIRECTLY_ACCESSED_BY_MAIN`: All capital.
20 | # eg: VERBOSE, LEARNING_RATE
21 | #
22 | # 2. `_THIS_WILL_BE_USED_TO_GENERATE_(1)`: Begin with underscore.
23 | # eg: _BASE_LEARNING_RATE
24 | #
25 | # 3. `this_is_a_simple_helper`: Snake case.
26 | # eg: eff_batch_size
27 | 
28 | # toggle more logging
29 | VERBOSE = False
30 | DEBUG = False
31 | 
32 | NUM_EPOCHS = 800
33 | # epochs to warmup LR
34 | WARMUP_EPOCHS = 40 if NUM_EPOCHS > 40 else 0
35 | 
36 | # Interval to save a checkpoint
37 | CHECKPOINT_INTERVAL = 20
38 | 
39 | # Batch size per GPU (effective batch size is BATCH_SIZE * gradient_accumulation * number of GPUs)
40 | BATCH_SIZE = 4
41 | 
42 | # Place to save pretrained model
43 | OUTPUT_DIR = Path(__file__).parent.parent / "output"
44 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
45 | 
46 | # Masking ratio (percentage of removed patches).
47 | MASK_RATIO = 0.75
48 | 
49 | # learning rate (absolute lr); comment this out to derive it from _BASE_LEARNING_RATE below
50 | LEARNING_RATE = 0.01
51 | # lower lr bound for cyclic schedulers that hit 0
52 | MINIMUM_LEARNING_RATE = 0
53 | # base learning rate: absolute_lr = base_lr * total_batch_size / 256
54 | _BASE_LEARNING_RATE = 1e-3
55 | try:
56 | LEARNING_RATE
57 | except NameError:
58 | eff_batch_size = BATCH_SIZE * gradient_accumulation * misc.get_world_size()
59 | LEARNING_RATE = _BASE_LEARNING_RATE * eff_batch_size / 256
60 | 
61 | WEIGHT_DECAY = 0.5
62 | 
63 | # Use (per-patch) normalized pixels as targets for computing loss
64 | NORM_PIX_LOSS = True
65 | 
66 | # resume from checkpoint
67 | RESUME = False
68 | if RESUME:
69 | RESUME_ADDRESS = ""
70 | 
71 | TRANSFORM_TRAIN = transforms.Compose(
72 | [
73 | RandomResizedCrop(224, interpolation=3),
74 | transforms.RandomHorizontalFlip(),
75 | transforms.ToTensor(),
76 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
77 | ]
78 | )
79 | 
80 | TRANSFORM_VAL = transforms.Compose(
81 | [
82 | transforms.Resize(256, interpolation=3),
83 | transforms.CenterCrop(224),
84 | transforms.ToTensor(),
85 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
86 | ]
87 | )
88 | 
89 | # ==== Dynamic Configuration ====
90 | 
91 | try:
92 | DATAPATH = Path(os.environ["DATA"])
93 | except KeyError:
94 | DATAPATH = Path(__file__).parent.parent / "data"
95 | 
-------------------------------------------------------------------------------- /image/mae/requirements.txt: --------------------------------------------------------------------------------
1 | colossalai
2 | torch >= 1.8.1
3 | 
-------------------------------------------------------------------------------- /image/mae/util/crop.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | 
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | import math
8 | 
9 | import torch
10 | 
11 | from torchvision import transforms
12 | from torchvision.transforms import functional as F
13 | 
14 | 
15 | class RandomResizedCrop(transforms.RandomResizedCrop):
16 | """
17 | RandomResizedCrop for matching TF/TPU implementation: no for-loop is used.
18 | This may lead to results different from torchvision's version.
19 | Following BYOL's TF code:
20 | https://github.com/deepmind/deepmind-research/blob/master/byol/utils/dataset.py#L206
21 | """
22 | 
23 | @staticmethod
24 | def get_params(img, scale, ratio):
25 | width, height = F.get_image_size(img)
26 | area = height * width
27 | 
28 | target_area = area * torch.empty(1).uniform_(scale[0], scale[1]).item()
29 | log_ratio = torch.log(torch.tensor(ratio))
30 | aspect_ratio = torch.exp(
31 | torch.empty(1).uniform_(log_ratio[0], log_ratio[1])
32 | ).item()
33 | 
34 | w = int(round(math.sqrt(target_area * aspect_ratio)))
35 | h = int(round(math.sqrt(target_area / aspect_ratio)))
36 | 
37 | w = min(w, width)
38 | h = min(h, height)
39 | 
40 | i = torch.randint(0, height - h + 1, size=(1,)).item()
41 | j = torch.randint(0, width - w + 1, size=(1,)).item()
42 | 
43 | return i, j, h, w
44 | 
-------------------------------------------------------------------------------- /image/mlpmixer/README.md: --------------------------------------------------------------------------------
1 | # ColossalAI_MlpMixer
2 | This project is the reproduction of the MlpMixer model with the ColossalAI tool.
3 | 
4 | # Result
5 | 
6 | | Task | Model | Training Time | Top-1 Accuracy |
7 | | ------------- |:-------------:| -----:| -----:|
8 | | CIFAR10 |ColossalAI_MlpMixer | ~ 30 min | ~ 89.42% |
9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 
20 | ## Environment setup
21 | ```
22 | git clone https://github.com/hpcaitech/ColossalAI.git
23 | cd ColossalAI
24 | # install dependency
25 | pip install -r requirements/requirements.txt
26 | 
27 | # install colossalai
28 | pip install .
29 | ```
30 | 
31 | ## Usage
32 | 
33 | To start training, use the following command to run each worker:
34 | ```
35 | $ DATA=/path/to/dataset python train_data.py --world_size=WORLD_SIZE \
36 | --rank=RANK \
37 | --local_rank=LOCAL_RANK \
38 | --host=MASTER_IP_ADDRESS \
39 | --port=MASTER_PORT \
40 | --config=CONFIG_FILE
41 | ```
42 | It is also recommended to start training with `torchrun` as:
43 | 
44 | ```
45 | $ DATA=/path/to/dataset torchrun --nproc_per_node=NUM_GPUS_PER_NODE \
46 | --nnodes=NUM_NODES \
47 | --node_rank=NODE_RANK \
48 | --master_addr=MASTER_IP_ADDRESS \
49 | --master_port=MASTER_PORT \
50 | train_data.py --config=CONFIG_FILE
51 | ```
52 | For the pipeline parallelism, use the following command to run each worker:
53 | 
54 | ```
55 | $ DATA=/path/to/dataset torchrun --nproc_per_node=NUM_GPUS_PER_NODE \
56 | --nnodes=NUM_NODES \
57 | --node_rank=NODE_RANK \
58 | --master_addr=MASTER_IP_ADDRESS \
59 | --master_port=MASTER_PORT \
60 | train_pipline.py
61 | ```
62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 
70 | ## Cite us
71 | ```
72 | @article{bian2021colossal,
73 | title={Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training},
74 | author={Bian, Zhengda and Liu, Hongxin and Wang, Boxiang and Huang, Haichen and Li, Yongbin and Wang, Chuanrui and Cui, Fan and You, Yang},
75 | journal={arXiv preprint arXiv:2110.14883},
76 | year={2021}
77 | }
78 | ```
79 | 
-------------------------------------------------------------------------------- /image/mlpmixer/configs/MlpMixer_vanilla.py: --------------------------------------------------------------------------------
1 | BATCH_SIZE = 512
2 | LEARNING_RATE = 2e-3
3 | WEIGHT_DECAY = 3e-2
4 | 
5 | TENSOR_PARALLEL_SIZE = 1
6 | TENSOR_PARALLEL_MODE = None
7 | 
8 | NUM_EPOCHS = 200
9 | WARMUP_EPOCHS = 40
10 | 
11 | parallel = dict(
12 | pipeline=1,
13 | tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
14 | )
15 | 
16 | seed = 42
17 | 
18 | LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"
19 | 
-------------------------------------------------------------------------------- /image/mlpmixer/requirements.txt: --------------------------------------------------------------------------------
1 | torch>=1.8
2 | torchvision>=0.9
3 | numpy
4 | tqdm
5 | psutil
6 | tensorboard
7 | packaging
8 | colossalai -------------------------------------------------------------------------------- /image/moe/README.md: --------------------------------------------------------------------------------
1 | # Overview
2 | 
3 | MoE is a technique to enlarge neural networks while keeping the training throughput roughly constant.
4 | It is designed to improve the performance of our models without an additional time penalty. Our old
5 | version of MoE parallelism caused moderate computation overhead and additional memory usage, but
6 | we are happy to announce that the recently enabled CUDA kernels have solved these problems. There
7 | are only two things you need to be concerned about. One is the additional communication time, which
8 | depends heavily on the topology and bandwidth of the network in the running environment. The other is extra memory usage,
9 | since MoE gives us a larger model. We will continuously maintain and optimize our MoE system,
10 | and we welcome any issue that can help us improve it.
11 | 
12 | At present, we have provided WideNet and ViT-MoE in our model zoo (more information about WideNet can be
13 | found [here](https://arxiv.org/abs/2107.11817)). We now support a recent technique proposed by Microsoft, PR-MoE.
14 | See [this paper](https://arxiv.org/abs/2201.05596) to learn more about PR-MoE.
15 | Directly use ViT-MoE from our model zoo, or use MoeModule in your own model, to exploit PR-MoE.
16 | 
17 | Here is a simple example of how to run ViT-MoE Lite6 with PR-MoE on CIFAR10.
18 | 
19 | # How to run
20 | 
21 | Before running this training script, you must set an environment variable called `DATA` pointing to where you place
22 | (or want to place) the CIFAR10 data.
23 | 
24 | ```shell
25 | export DATA=
26 | ```
27 | 
28 | On a single server, you can directly use torchrun to start pre-training on multiple GPUs in parallel.
29 | If you use the script here to train, just follow the instruction below in your terminal. `nproc_per_node` is the
30 | number of processes, which commonly equals the number of GPUs.
31 | 
32 | ```shell
33 | torchrun --nnodes=1 --nproc_per_node=8 train.py \
34 | --config ./config.py
35 | ```
36 | 
37 | If you want to use multiple servers, please check our document about environment initialization.
38 | 
39 | Make sure to initialize the MoE running environment with `moe_set_seed` before building the model.
40 | 
41 | # Result
42 | 
43 | The best evaluation accuracy while training ViT-MoE Lite6 on CIFAR10 from scratch is 90.66%, which is better than the average
44 | performance of training ViT Lite7. The result can be improved by data augmentations such as Mixup and RandAug.
45 | We will offer those training scripts soon.
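
To make the routing idea concrete, here is a toy, single-device top-1 gating layer written in plain PyTorch. It only illustrates the expert-routing concept; it is not Colossal-AI's `MoeModule` or its PR-MoE implementation, and the class name, shapes, and expert architecture are our own assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class ToyTop1MoE(nn.Module):
    """A toy mixture-of-experts layer with top-1 gating (illustration only)."""

    def __init__(self, dim: int, num_experts: int = 4):
        super().__init__()
        # the gate scores each token against every expert
        self.gate = nn.Linear(dim, num_experts)
        # each expert is a small feed-forward block
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))
            for _ in range(num_experts)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (num_tokens, dim)
        scores = F.softmax(self.gate(x), dim=-1)    # (num_tokens, num_experts)
        weight, expert_idx = scores.max(dim=-1)     # top-1 routing decision per token
        out = torch.zeros_like(x)
        for i, expert in enumerate(self.experts):
            mask = expert_idx == i                  # tokens routed to expert i
            if mask.any():
                out[mask] = weight[mask, None] * expert(x[mask])
        return out


if __name__ == '__main__':
    moe = ToyTop1MoE(dim=64)
    tokens = torch.randn(16, 64)
    print(moe(tokens).shape)  # torch.Size([16, 64])
```

In a real MoE system, the per-expert dispatch is fused into CUDA kernels and experts can live on different devices, which is where the extra communication time discussed above comes from.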
-------------------------------------------------------------------------------- /image/moe/config.py: --------------------------------------------------------------------------------
1 | BATCH_SIZE = 512
2 | LEARNING_RATE = 2e-3
3 | WEIGHT_DECAY = 3e-2
4 | 
5 | NUM_EPOCHS = 200
6 | WARMUP_EPOCHS = 40
7 | 
8 | parallel = dict()
9 | max_ep_size = 1 # all experts are replicated in the case that the user only has 1 GPU
10 | clip_grad_norm = 1.0 # enable gradient clipping and set it to 1.0
11 | 
12 | LOG_PATH = "./cifar10_moe"
13 | 
-------------------------------------------------------------------------------- /image/moe/requirements.txt: --------------------------------------------------------------------------------
1 | colossalai
2 | torch >= 1.8.1
3 | 
-------------------------------------------------------------------------------- /image/resnet/README.md: --------------------------------------------------------------------------------
1 | # Train ResNet on CIFAR10
2 | 
3 | ## Prepare Dataset
4 | 
5 | We use the CIFAR10 dataset in this example. The dataset will be downloaded to `./data` by default.
6 | If you wish to use a customized directory for the dataset, you can set the environment variable `DATA` via the following command.
7 | 
8 | ```bash
9 | export DATA=/path/to/data
10 | ```
11 | 
12 | 
13 | ## Run single-GPU training
14 | 
15 | We provide two examples of training ResNet-18 on the CIFAR10 dataset. You can choose other ResNet models in `resnet.py` as well.
16 | You can change the value of `nproc_per_node` to adjust the number of GPUs used for training.
17 | When `nproc_per_node` is changed, you may need to adjust the learning rate and batch size in `config.py` accordingly.
18 | Normally we follow the linear scaling rule, i.e. `new_global_batch_size / new_learning_rate = old_global_batch_size / old_learning_rate`; for example, scaling the global batch size from 128 to 512 means scaling the learning rate up by the same 4x factor.
19 | 
20 | ```bash
21 | # with engine
22 | colossalai run --nproc_per_node 1 train.py
23 | 
24 | # with trainer
25 | colossalai run --nproc_per_node 1 train.py --use_trainer
26 | ```
27 | 
28 | ## Experiment Results
29 | 
30 | | model | dataset | Testing Accuracy |
31 | | - | - | - |
32 | | ResNet18 | CIFAR10 | 95.2% |
33 | 
-------------------------------------------------------------------------------- /image/resnet/auto_parallel/README.md: --------------------------------------------------------------------------------
1 | # Train ResNet on CIFAR10 with auto_parallel
2 | 
3 | ## Prepare Dataset
4 | 
5 | We use the CIFAR10 dataset in this example. The dataset will be downloaded to `./data` by default.
6 | If you wish to use a customized directory for the dataset, you can set the environment variable `DATA` via the following command.
7 | 8 | ```bash 9 | export DATA=/path/to/data 10 | ``` 11 | 12 | 13 | ## Run on 2*2 device mesh 14 | 15 | ```bash 16 | colossalai run --nproc_per_node 4 auto_parallel_demo.py 17 | ``` -------------------------------------------------------------------------------- /image/resnet/config.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | BATCH_SIZE = 128 4 | NUM_EPOCHS = 200 5 | 6 | CONFIG = dict(fp16=dict(mode=AMP_TYPE.TORCH)) 7 | -------------------------------------------------------------------------------- /image/resnet/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /image/simclr/NT_Xentloss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from colossalai.registry import LOSSES 5 | from torch.nn.modules.linear import Linear 6 | 7 | @LOSSES.register_module 8 | class NT_Xentloss(nn.Module): 9 | def __init__(self, temperature=0.5): 10 | super().__init__() 11 | self.temperature = temperature 12 | 13 | def forward(self, z1, z2, label): 14 | z1 = F.normalize(z1, dim=1) 15 | z2 = F.normalize(z2, dim=1) 16 | N, Z = z1.shape 17 | device = z1.device 18 | representations = torch.cat([z1, z2], dim=0) 19 | similarity_matrix = F.cosine_similarity(representations.unsqueeze(1), representations.unsqueeze(0), dim=-1) 20 | l_pos = torch.diag(similarity_matrix, N) 21 | r_pos = torch.diag(similarity_matrix, -N) 22 | positives = torch.cat([l_pos, r_pos]).view(2 * N, 1) 23 | diag = torch.eye(2*N, dtype=torch.bool, device=device) 24 | diag[N:,:N] = diag[:N,N:] = diag[:N,:N] 25 | 26 | negatives = similarity_matrix[~diag].view(2*N, -1) 27 | 28 | logits = torch.cat([positives, negatives], dim=1) 29 | logits /= self.temperature 30 | 31 | labels = torch.zeros(2*N, device=device, dtype=torch.int64) 32 | 33 | loss = F.cross_entropy(logits, labels, reduction='sum') 34 | return loss / (2 * N) 35 | 36 | 37 | if __name__=='__main__': 38 | criterion = NT_Xentloss() 39 | net = Linear(256,512) 40 | output = [net(torch.randn(512,256)), net(torch.randn(512,256))] 41 | label = [torch.randn(512)] 42 | loss = criterion(*output, *label) 43 | print(loss) 44 | 45 | -------------------------------------------------------------------------------- /image/simclr/augmentation.py: -------------------------------------------------------------------------------- 1 | from torchvision.transforms import transforms 2 | 3 | class SimCLRTransform(): 4 | def __init__(self): 5 | self.transform = transforms.Compose([ 6 | transforms.RandomResizedCrop(size=32, scale=(0.2, 1.0)), 7 | transforms.RandomHorizontalFlip(), 8 | transforms.RandomApply([transforms.ColorJitter(0.8, 0.8, 0.8, 0.2)], p=0.8), 9 | transforms.RandomGrayscale(p=0.2), 10 | transforms.RandomApply([transforms.GaussianBlur(kernel_size=32//20*2+1, sigma=(0.1, 2.0))], p=0.5), 11 | transforms.ToTensor(), 12 | transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]) 13 | ]) 14 | 15 | def __call__(self, x): 16 | x1 = self.transform(x) 17 | x2 = self.transform(x) 18 | return x1, x2 19 | 20 | 21 | class LeTransform(): 22 | def __init__(self): 23 | self.transform = transforms.Compose([ 24 | transforms.RandomResizedCrop(size=32, scale=(0.2, 1.0)), 25 | transforms.RandomHorizontalFlip(), 26 | 
transforms.ToTensor(), 27 | transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]) 28 | ]) 29 | 30 | def __call__(self, x): 31 | x = self.transform(x) 32 | return x -------------------------------------------------------------------------------- /image/simclr/config.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | 4 | LOG_NAME = 'cifar-simclr' 5 | 6 | BATCH_SIZE = 512 7 | NUM_EPOCHS = 801 8 | LEARNING_RATE = 0.03 * BATCH_SIZE / 256 9 | WEIGHT_DECAY = 0.0005 10 | MOMENTUM = 0.9 11 | 12 | 13 | fp16 = dict( 14 | mode=AMP_TYPE.TORCH, 15 | ) 16 | 17 | dataset = dict( 18 | root='./dataset', 19 | ) 20 | 21 | gradient_accumulation = 2 22 | clip_grad_norm = 1.0 23 | -------------------------------------------------------------------------------- /image/simclr/le_config.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | 4 | LOG_NAME = 'cifar-simclr' 5 | EPOCH = 800 6 | 7 | BATCH_SIZE = 512 8 | NUM_EPOCHS = 51 9 | LEARNING_RATE = 0.03*BATCH_SIZE/256 10 | WEIGHT_DECAY = 0.0005 11 | MOMENTUM = 0.9 12 | 13 | 14 | fp16 = dict( 15 | mode=AMP_TYPE.TORCH, 16 | ) 17 | 18 | dataset = dict( 19 | root='./dataset', 20 | ) 21 | 22 | gradient_accumulation = 1 23 | clip_grad_norm = 1.0 24 | -------------------------------------------------------------------------------- /image/simclr/models/linear_eval.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .Backbone import backbone 5 | 6 | class Linear_eval(nn.Module): 7 | 8 | def __init__(self, model='resnet18', class_num=10, **kwargs): 9 | super().__init__() 10 | 11 | self.backbone = backbone(model, **kwargs) 12 | self.backbone.requires_grad_(False) 13 | self.fc = nn.Linear(self.backbone.output_dim, class_num) 14 | 15 | def forward(self, x): 16 | 17 | out = self.backbone(x) 18 | out = self.fc(out) 19 | return out 20 | -------------------------------------------------------------------------------- /image/simclr/models/simclr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .Backbone import backbone 5 | 6 | class projection_MLP(nn.Module): 7 | def __init__(self, in_dim, out_dim=256): 8 | super().__init__() 9 | hidden_dim = in_dim 10 | self.layer1 = nn.Sequential( 11 | nn.Linear(in_dim, hidden_dim), 12 | nn.ReLU(inplace=True) 13 | ) 14 | self.layer2 = nn.Linear(hidden_dim, out_dim) 15 | def forward(self, x): 16 | x = self.layer1(x) 17 | x = self.layer2(x) 18 | return x 19 | 20 | class SimCLR(nn.Module): 21 | 22 | def __init__(self, model='resnet18', **kwargs): 23 | super().__init__() 24 | 25 | self.backbone = backbone(model, **kwargs) 26 | self.projector = projection_MLP(self.backbone.output_dim) 27 | self.encoder = nn.Sequential( 28 | self.backbone, 29 | self.projector 30 | ) 31 | 32 | def forward(self, x1, x2): 33 | 34 | z1 = self.encoder(x1) 35 | z2 = self.encoder(x2) 36 | return z1, z2 -------------------------------------------------------------------------------- /image/simclr/myhooks.py: -------------------------------------------------------------------------------- 1 | from colossalai.trainer.hooks import BaseHook 2 | from colossalai.core import global_context as gpc 3 | from colossalai.context import ParallelMode 4 | from 
colossalai.logging import get_dist_logger 5 | 6 | 7 | class TotalBatchsizeHook(BaseHook): 8 | def __init__(self, priority: int = 2) -> None: 9 | super().__init__(priority) 10 | self.logger = get_dist_logger() 11 | 12 | def before_train(self, trainer): 13 | total_batch_size = gpc.config.BATCH_SIZE * \ 14 | gpc.config.gradient_accumulation * gpc.get_world_size(ParallelMode.DATA) 15 | self.logger.info(f'Total batch size = {total_batch_size}', ranks=[0]) -------------------------------------------------------------------------------- /image/simclr/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /image/simclr/results/embedding.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/simclr/results/embedding.npz -------------------------------------------------------------------------------- /image/simclr/results/linear_eval_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/simclr/results/linear_eval_acc.png -------------------------------------------------------------------------------- /image/simclr/results/linear_eval_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/simclr/results/linear_eval_loss.png -------------------------------------------------------------------------------- /image/simclr/results/ssl_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/simclr/results/ssl_loss.png -------------------------------------------------------------------------------- /image/simclr/results/test_tsne.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/simclr/results/test_tsne.png -------------------------------------------------------------------------------- /image/simclr/results/train_tsne.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/simclr/results/train_tsne.png -------------------------------------------------------------------------------- /image/simclr/train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | ## phase 1: self-supervised training 4 | python -m torch.distributed.launch --nproc_per_node 1 --master_addr localhost --master_port 29500 train_simclr.py 5 | 6 | ## phase 2: linear evaluation 7 | python -m torch.distributed.launch --nproc_per_node 1 --master_addr localhost --master_port 29500 train_linear.py -------------------------------------------------------------------------------- /image/vilt/.gitignore: -------------------------------------------------------------------------------- 1 | ckpt/ 2 | logs/ 
-------------------------------------------------------------------------------- /image/vilt/README.md: --------------------------------------------------------------------------------
1 | # Train ViLT on COCO dataset with Colossal-AI
2 | 
3 | Colossal-AI implementation for the ICML 2021 (long talk) paper: "[ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334)"
4 | 
5 | ---
6 | 9 | 10 | 11 | 
12 | ## Prepare Environment
13 | ```bash
14 | pip install -r requirements.txt
15 | ```
16 | 
17 | ## Prepare Dataset
18 | In this example, we use the COCO Captions (COCO) dataset.
19 | 
20 | ```bash
21 | bash prepare_dataset.sh
22 | ```
23 | 
24 | ## Train Masked Language Modeling (MLM) Models
25 | 
26 | ```bash
27 | bash run.sh
28 | 
29 | # e.g.
30 | 
31 | bash run.sh /vilt_data 4
32 | ```
33 | 
34 | 
35 | ## Citation
36 | If you use any part of this code and pretrained weights for your own purpose, please cite the original [paper](https://arxiv.org/abs/2102.03334).
37 | ```
38 | @InProceedings{pmlr-v139-kim21k,
39 | title = {ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision},
40 | author = {Kim, Wonjae and Son, Bokyung and Kim, Ildoo},
41 | booktitle = {Proceedings of the 38th International Conference on Machine Learning},
42 | pages = {5583--5594},
43 | year = {2021},
44 | editor = {Meila, Marina and Zhang, Tong},
45 | volume = {139},
46 | series = {Proceedings of Machine Learning Research},
47 | month = {18--24 Jul},
48 | publisher = {PMLR},
49 | pdf = {http://proceedings.mlr.press/v139/kim21k/kim21k.pdf},
50 | url = {http://proceedings.mlr.press/v139/kim21k.html},
51 | abstract = {Vision-and-Language Pre-training (VLP) has improved performance on various joint vision-and-language downstream tasks. Current approaches to VLP heavily rely on image feature extraction processes, most of which involve region supervision (e.g., object detection) and the convolutional architecture (e.g., ResNet). Although disregarded in the literature, we find it problematic in terms of both (1) efficiency/speed, that simply extracting input features requires much more computation than the multimodal interaction steps; and (2) expressive power, as it is upper bounded to the expressive power of the visual embedder and its predefined visual vocabulary. In this paper, we present a minimal VLP model, Vision-and-Language Transformer (ViLT), monolithic in the sense that the processing of visual inputs is drastically simplified to just the same convolution-free manner that we process textual inputs. We show that ViLT is up to tens of times faster than previous VLP models, yet with competitive or better downstream task performance.
Our code and pre-trained weights are available at https://github.com/dandelin/vilt.} 52 | } 53 | ``` 54 | 55 | 56 | -------------------------------------------------------------------------------- /image/vilt/configs.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | BATCH_SIZE = 256 4 | DROP_RATE = 0.1 5 | NUM_EPOCHS = 10 6 | 7 | fp16 = dict( 8 | mode=AMP_TYPE.TORCH, 9 | ) 10 | 11 | gradient_accumulation = 16 12 | gradient_clipping = 1.0 13 | 14 | parallel = dict( 15 | tensor=dict(size=2, mode='1d'), 16 | ) 17 | num_epochs = 10 18 | 19 | # config logging path 20 | logging = dict( 21 | root_path='./logs' 22 | ) -------------------------------------------------------------------------------- /image/vilt/prepare_dataset.sh: -------------------------------------------------------------------------------- 1 | WORKSPACE=$(pwd) 2 | 3 | RAW_ROOT=$1 4 | 5 | ARROW_ROOT=$RAW_ROOT/arrow 6 | 7 | if [ -z $RAW_ROOT ] 8 | then 9 | echo "Usage: $0 " 10 | exit 1 11 | fi 12 | 13 | if [ ! -e $ARROW_ROOT ] 14 | then 15 | mkdir $ARROW_ROOT 16 | fi 17 | 18 | if [ -e $RAW_ROOT ] 19 | then 20 | cd $RAW_ROOT 21 | else 22 | mkdir $RAW_ROOT 23 | fi 24 | 25 | 26 | 27 | 28 | # download all files 29 | wget http://images.cocodataset.org/zips/train2014.zip 30 | wget http://images.cocodataset.org/zips/val2014.zip 31 | wget https://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip 32 | 33 | # unzip all files 34 | unzip train2014.zip -d $RAW_ROOT/train2014 35 | unzip val2014.zip -d $RAW_ROOT/val2014 36 | unzip caption_datasets.zip -d $RAW_ROOT/karpathy 37 | 38 | # remove all files 39 | rm train2014.zip 40 | rm val2014.zip 41 | rm caption_datasets.zip 42 | 43 | # converting the dataset 44 | cd $WORKSPACE 45 | python utils/makearrow.py $RAW_ROOT $ARROW_ROOT 46 | -------------------------------------------------------------------------------- /image/vilt/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.2.1 2 | Pillow==8.2.0 3 | tqdm==4.56.0 4 | ipdb==0.13.4 5 | numpy==1.19.5 6 | einops==0.3.0 7 | pyarrow==2.0.0 8 | sacred==0.8.2 9 | pandas==1.1.5 10 | colossalai 11 | git+https://github.com/rwightman/pytorch-image-models.git -------------------------------------------------------------------------------- /image/vilt/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORK_DIR=$(pwd) 4 | DATA_ROOT=$1/arrow 5 | NUM_GPUS=$2 6 | 7 | if [ -z $DATA_ROOT ] || [ -z $NUM_GPUS ] 8 | then 9 | echo "Usage: $0 " 10 | exit 1 11 | fi 12 | 13 | cd $WORK_DIR 14 | 15 | if ! 
[ -x "$(command -v mpirun)" ] 16 | then 17 | torchrun --nproc_per_node $NUM_GPUS --master_addr localhost --master_port 11455 run.py 18 | else 19 | mpirun -np $NUM_GPUS python run.py with data_root=$DATA_ROOT num_gpus=$NUM_GPUS num_nodes=1 task_mlm_itm_s step200k per_gpu_batchsize=96 20 | fi 21 | -------------------------------------------------------------------------------- /image/vilt/schedule.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import torch 3 | 4 | class viltSchedule(colossalai.engine.schedule.NonPipelineSchedule): 5 | @staticmethod 6 | def _call_engine_criterion(engine, outputs, labels): 7 | # assert isinstance(outputs, (torch.Tensor, list, tuple) 8 | # ), f'Expect output of model is (torch.Tensor, list, tuple), got {type(outputs)}' 9 | if isinstance(outputs, torch.Tensor): 10 | outputs = (outputs, ) 11 | if isinstance(labels, torch.Tensor): 12 | return engine.criterion(*outputs, labels) 13 | else: 14 | return engine.criterion(outputs) 15 | 16 | def __init__(self,batch_data_process_func) -> None: 17 | super().__init__(batch_data_process_func) 18 | @staticmethod 19 | def _call_engine(engine, inputs): 20 | return engine(inputs) 21 | -------------------------------------------------------------------------------- /image/vilt/utils/heads.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from transformers.models.bert.modeling_bert import BertPredictionHeadTransform 6 | 7 | 8 | class Pooler(nn.Module): 9 | def __init__(self, hidden_size): 10 | super().__init__() 11 | self.dense = nn.Linear(hidden_size, hidden_size) 12 | self.activation = nn.Tanh() 13 | 14 | def forward(self, hidden_states): 15 | first_token_tensor = hidden_states[:, 0] 16 | pooled_output = self.dense(first_token_tensor) 17 | pooled_output = self.activation(pooled_output) 18 | return pooled_output 19 | 20 | 21 | class ITMHead(nn.Module): 22 | def __init__(self, hidden_size): 23 | super().__init__() 24 | self.fc = nn.Linear(hidden_size, 2) 25 | 26 | def forward(self, x): 27 | x = self.fc(x) 28 | return x 29 | 30 | 31 | class MLMHead(nn.Module): 32 | def __init__(self, config, weight=None): 33 | super().__init__() 34 | self.transform = BertPredictionHeadTransform(config) 35 | self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 36 | self.bias = nn.Parameter(torch.zeros(config.vocab_size)) 37 | if weight is not None: 38 | self.decoder.weight = weight 39 | 40 | def forward(self, x): 41 | x = self.transform(x) 42 | x = self.decoder(x) + self.bias 43 | return x 44 | 45 | 46 | class MPPHead(nn.Module): 47 | def __init__(self, config): 48 | super().__init__() 49 | self.transform = BertPredictionHeadTransform(config) 50 | self.decoder = nn.Linear(config.hidden_size, 256 * 3) 51 | 52 | def forward(self, x): 53 | x = self.transform(x) 54 | x = self.decoder(x) 55 | return x 56 | -------------------------------------------------------------------------------- /image/vilt/utils/makearrow.py: -------------------------------------------------------------------------------- 1 | from write_coco_karpathy import make_arrow 2 | import sys 3 | 4 | make_arrow(sys.argv[1], sys.argv[2]) -------------------------------------------------------------------------------- /image/vilt/utils/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .pixelbert import ( 2 | 
pixelbert_transform, 3 | pixelbert_transform_randaug, 4 | ) 5 | 6 | _transforms = { 7 | "pixelbert": pixelbert_transform, 8 | "pixelbert_randaug": pixelbert_transform_randaug, 9 | } 10 | 11 | 12 | def keys_to_transforms(keys: list, size=224): 13 | return [_transforms[key](size=size) for key in keys] 14 | -------------------------------------------------------------------------------- /image/vilt/utils/transforms/pixelbert.py: -------------------------------------------------------------------------------- 1 | from .utils import ( 2 | inception_normalize, 3 | MinMaxResize, 4 | ) 5 | from torchvision import transforms 6 | from .randaug import RandAugment 7 | 8 | 9 | def pixelbert_transform(size=800): 10 | longer = int((1333 / 800) * size) 11 | return transforms.Compose( 12 | [ 13 | MinMaxResize(shorter=size, longer=longer), 14 | transforms.ToTensor(), 15 | inception_normalize, 16 | ] 17 | ) 18 | 19 | 20 | def pixelbert_transform_randaug(size=800): 21 | longer = int((1333 / 800) * size) 22 | trs = transforms.Compose( 23 | [ 24 | MinMaxResize(shorter=size, longer=longer), 25 | transforms.ToTensor(), 26 | inception_normalize, 27 | ] 28 | ) 29 | trs.transforms.insert(0, RandAugment(2, 9)) 30 | return trs 31 | -------------------------------------------------------------------------------- /image/vilt/utils/transforms/utils.py: -------------------------------------------------------------------------------- 1 | from torchvision import transforms 2 | from PIL import Image 3 | 4 | 5 | class MinMaxResize: 6 | def __init__(self, shorter=800, longer=1333): 7 | self.min = shorter 8 | self.max = longer 9 | 10 | def __call__(self, x): 11 | w, h = x.size 12 | scale = self.min / min(w, h) 13 | if h < w: 14 | newh, neww = self.min, scale * w 15 | else: 16 | newh, neww = scale * h, self.min 17 | 18 | if max(newh, neww) > self.max: 19 | scale = self.max / max(newh, neww) 20 | newh = newh * scale 21 | neww = neww * scale 22 | 23 | newh, neww = int(newh + 0.5), int(neww + 0.5) 24 | newh, neww = newh // 32 * 32, neww // 32 * 32 25 | 26 | return x.resize((neww, newh), resample=Image.BICUBIC) 27 | 28 | 29 | class UnNormalize(object): 30 | def __init__(self, mean, std): 31 | self.mean = mean 32 | self.std = std 33 | 34 | def __call__(self, tensor): 35 | """ 36 | Args: 37 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 38 | Returns: 39 | Tensor: Normalized image. 
40 | """
41 | for t, m, s in zip(tensor, self.mean, self.std):
42 | t.mul_(s).add_(m)
43 | # The normalize code -> t.sub_(m).div_(s)
44 | return tensor
45 | 
46 | 
47 | # This is a simple max-entropy normalization, as performed in the Inception paper
48 | inception_normalize = transforms.Compose(
49 | [transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])]
50 | )
51 | 
52 | # ViT uses simple non-biased inception normalization
53 | # https://github.com/google-research/vision_transformer/blob/master/vit_jax/input_pipeline.py#L132
54 | inception_unnormalize = transforms.Compose(
55 | [UnNormalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])]
56 | )
57 | 
-------------------------------------------------------------------------------- /image/vilt/utils/write_coco_karpathy.py: --------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import pandas as pd
4 | import pyarrow as pa
5 | import random
6 | 
7 | from tqdm import tqdm
8 | from glob import glob
9 | from collections import defaultdict
10 | 
11 | 
12 | def path2rest(path, iid2captions, iid2split):
13 | name = path.split("/")[-1]
14 | with open(path, "rb") as fp:
15 | binary = fp.read()
16 | captions = iid2captions[name]
17 | split = iid2split[name]
18 | return [binary, captions, name, split]
19 | 
20 | 
21 | def make_arrow(root, dataset_root):
22 | with open(f"{root}/karpathy/dataset_coco.json", "r") as fp:
23 | captions = json.load(fp)
24 | 
25 | captions = captions["images"]
26 | 
27 | iid2captions = defaultdict(list)
28 | iid2split = dict()
29 | 
30 | for cap in tqdm(captions):
31 | filename = cap["filename"]
32 | iid2split[filename] = cap["split"]
33 | for c in cap["sentences"]:
34 | iid2captions[filename].append(c["raw"])
35 | 
36 | paths = list(glob(f"{root}/train2014/*.jpg")) + list(glob(f"{root}/val2014/*.jpg"))
37 | random.shuffle(paths)
38 | caption_paths = [path for path in paths if path.split("/")[-1] in iid2captions]
39 | 
40 | if len(paths) == len(caption_paths):
41 | print("all images have caption annotations")
42 | else:
43 | print("not all images have caption annotations")
44 | print(
45 | len(paths), len(caption_paths), len(iid2captions),
46 | )
47 | 
48 | bs = [path2rest(path, iid2captions, iid2split) for path in tqdm(caption_paths)]
49 | 
50 | for split in ["train", "val", "restval", "test"]:
51 | batches = [b for b in bs if b[-1] == split]
52 | 
53 | dataframe = pd.DataFrame(
54 | batches, columns=["image", "caption", "image_id", "split"],
55 | )
56 | 
57 | table = pa.Table.from_pandas(dataframe)
58 | os.makedirs(dataset_root, exist_ok=True)
59 | with pa.OSFile(
60 | f"{dataset_root}/coco_caption_karpathy_{split}.arrow", "wb"
61 | ) as sink:
62 | with pa.RecordBatchFileWriter(sink, table.schema) as writer:
63 | writer.write_table(table)
64 | 
-------------------------------------------------------------------------------- /image/vision_transformer/colo_vit/README.md: --------------------------------------------------------------------------------
1 | # Vision Transformer with ColoTensor
2 | 
3 | # Overview
4 | 
5 | In this example, we will run Vision Transformer with ColoTensor.
6 | 
7 | We use the **ViTForImageClassification** model from Hugging Face [Link](https://huggingface.co/docs/transformers/model_doc/vit) for the unit test.
8 | You can change the world size or decide whether to use DDP in our code.
9 | 
10 | We use the **vision_transformer** model from timm [Link](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) for the training example.
11 | 
12 | (2022/6/28) The default configuration now supports 2DP+2TP with gradient accumulation and checkpoint support. Zero is not supported at present.
13 | 
14 | # Requirement
15 | 
16 | You should install colossalai from the main branch at commit 561e904.
17 | 
18 | ## Unit test
19 | To run the unit test, you should install pytest and transformers with:
20 | ```shell
21 | pip install pytest transformers
22 | ```
23 | 
24 | ## Training example
25 | To run the training example with ViT-S, you should install **NVIDIA DALI** from [Link](https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html) for dataloader support.
26 | You also need to install timm and titans for model/dataloader support with:
27 | ```shell
28 | pip install timm titans
29 | ```
30 | 
31 | ### Data preparation
32 | You can download the ImageNet dataset from the [ImageNet official website](https://www.image-net.org/download.php). You should get the raw images after downloading the dataset. As we use **NVIDIA DALI** to read data, we use the TFRecords dataset instead of the raw ImageNet dataset, which offers better IO speed. If you don't have a TFRecords dataset, follow [imagenet-tools](https://github.com/ver217/imagenet-tools) to build one.
33 | 
34 | Before you start training, you need to set the environment variable `DATA` so that the script knows where to fetch the data for the DALI dataloader.
35 | ```shell
36 | export DATA=/path/to/ILSVRC2012
37 | ```
38 | 
39 | 
40 | 
41 | # How to run
42 | 
43 | ## Unit test
44 | In your terminal:
45 | ```shell
46 | pytest test_vit.py
47 | ```
48 | 
49 | This will evaluate models with different **world_size** and **use_ddp** settings.
50 | 
51 | ## Training example
52 | Modify the settings in run.sh according to your environment.
53 | For example, if you set `--nproc_per_node=8` in `run.sh` and `TP_WORLD_SIZE=2` in your config file,
54 | the data parallel size will be automatically calculated as 4.
55 | Thus, the parallel strategy is set to 4DP+2TP.
56 | 
57 | Then, in your terminal:
58 | ```shell
59 | sh run.sh
60 | ```
61 | 
62 | This will start ViT-S training with ImageNet.
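
As a quick sanity check of that arithmetic, the data-parallel size is simply the total process count divided by the tensor-parallel size. A toy calculation (the variable names below are ours, not from the repo):

```python
# toy arithmetic, not repo code: how the parallel strategy decomposes
nproc_per_node = 8    # set in run.sh via --nproc_per_node
tp_world_size = 2     # TP_WORLD_SIZE in the config file
dp_size = nproc_per_node // tp_world_size
print(f"{dp_size}DP + {tp_world_size}TP")  # -> 4DP + 2TP
```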
-------------------------------------------------------------------------------- /image/vision_transformer/colo_vit/configs/vit_1d_tp2.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | # hyperparameters 4 | # BATCH_SIZE is as per GPU 5 | # global batch size = BATCH_SIZE x data parallel size 6 | BATCH_SIZE = 256 7 | LEARNING_RATE = 3e-3 8 | WEIGHT_DECAY = 0.3 9 | NUM_EPOCHS = 300 10 | WARMUP_EPOCHS = 32 11 | 12 | # model config 13 | IMG_SIZE = 224 14 | PATCH_SIZE = 16 15 | HIDDEN_SIZE = 384 16 | DEPTH = 12 17 | NUM_HEADS = 6 18 | MLP_RATIO = 4 19 | NUM_CLASSES = 1000 20 | CHECKPOINT = False 21 | SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1 # add 1 for cls token 22 | 23 | USE_DDP = True 24 | TP_WORLD_SIZE = 2 25 | TP_TYPE = 'row' 26 | parallel = dict(tensor=dict(mode="1d", size=TP_WORLD_SIZE),) 27 | 28 | fp16 = dict(mode=AMP_TYPE.NAIVE) 29 | clip_grad_norm = 1.0 30 | gradient_accumulation = 8 31 | 32 | LOG_PATH = "./log" 33 | -------------------------------------------------------------------------------- /image/vision_transformer/colo_vit/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /image/vision_transformer/colo_vit/run.sh: -------------------------------------------------------------------------------- 1 | export DATA=/data/scratch/imagenet/tf_records 2 | export OMP_NUM_THREADS=4 3 | 4 | # resume 5 | # CUDA_VISIBLE_DEVICES=4,5,6,7 colossalai run \ 6 | # --nproc_per_node 4 train.py \ 7 | # --config configs/vit_1d_tp2.py \ 8 | # --resume_from checkpoint/epoch_10 \ 9 | # --master_port 29598 | tee ./out 2>&1 10 | 11 | # train 12 | CUDA_VISIBLE_DEVICES=4,5,6,7 colossalai run \ 13 | --nproc_per_node 4 train.py \ 14 | --config configs/vit_1d_tp2.py \ 15 | --master_port 29598 | tee ./out 2>&1 -------------------------------------------------------------------------------- /image/vision_transformer/colo_vit/utils/dummy_data_generator.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class DummyDataGenerator(ABC): 5 | 6 | def __init__(self, length=10): 7 | self.length = length 8 | 9 | @abstractmethod 10 | def generate(self): 11 | pass 12 | 13 | def __iter__(self): 14 | self.step = 0 15 | return self 16 | 17 | def __next__(self): 18 | if self.step < self.length: 19 | self.step += 1 20 | return self.generate() 21 | else: 22 | raise StopIteration 23 | 24 | def __len__(self): 25 | return self.length 26 | -------------------------------------------------------------------------------- /image/vision_transformer/colo_vit/utils/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import torch 5 | import torch.distributed as dist 6 | from colossalai.core import global_context as gpc 7 | from colossalai.context import ParallelMode 8 | 9 | 10 | def set_seed(seed): 11 | random.seed(seed) 12 | os.environ['PYTHONHASHSEED'] = str(seed) 13 | np.random.seed(seed) 14 | torch.manual_seed(seed) 15 | torch.cuda.manual_seed(seed) 16 | torch.backends.cudnn.deterministic = True 17 | 18 | 19 | def check_equal(A, B): 20 | assert torch.allclose(A, B, rtol=1e-3, atol=1e-1) == True 21 | 22 | 23 | def replace_parameter_add_grad(layer, weight=None, bias=None): 24 | if weight is not None: 25 | 
delattr(layer, 'weight') 26 | setattr(layer, 'weight', weight) 27 | layer.weight.requires_grad = True 28 | if bias is not None: 29 | delattr(layer, 'bias') 30 | setattr(layer, 'bias', bias) 31 | layer.bias.requires_grad = True 32 | 33 | 34 | def broadcast_tensor_chunk(tensor, chunk_size=1, local_rank=0): 35 | dist.broadcast(tensor, src=0) 36 | tensor_chunk = torch.chunk(tensor, chunk_size, dim=-1)[local_rank] 37 | return tensor_chunk.clone() 38 | 39 | 40 | def tensor_equal(A, B): 41 | return torch.allclose(A, B, rtol=1e-3, atol=1e-1) 42 | 43 | 44 | def tensor_shard_equal(tensor: torch.Tensor, shard: torch.Tensor): 45 | assert tensor.ndim == shard.ndim 46 | if tensor.shape == shard.shape: 47 | return tensor_equal(tensor, shard) 48 | else: 49 | dims_not_eq = torch.nonzero(torch.tensor(tensor.shape) != torch.tensor(shard.shape)) 50 | if dims_not_eq.numel() == 1: 51 | # 1D shard 52 | dim = dims_not_eq.item() 53 | world_size = gpc.get_world_size(ParallelMode.PARALLEL_1D) 54 | rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D) 55 | return tensor_equal(tensor.chunk(world_size, dim)[rank], shard) 56 | else: 57 | raise NotImplementedError 58 | -------------------------------------------------------------------------------- /image/vision_transformer/colo_vit/vit.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import ViTForImageClassification, ViTConfig 4 | from utils.dummy_data_generator import DummyDataGenerator 5 | from colossalai.utils.cuda import get_current_device 6 | 7 | 8 | class DummyDataLoader(DummyDataGenerator): 9 | batch_size = 4 10 | channel = 3 11 | category = 8 12 | image_size = 224 13 | 14 | def generate(self): 15 | image_dict = {} 16 | image_dict['pixel_values'] = torch.rand(DummyDataLoader.batch_size, 17 | DummyDataLoader.channel, 18 | DummyDataLoader.image_size, 19 | DummyDataLoader.image_size, 20 | device=get_current_device()) * 2 - 1 21 | image_dict['label'] = torch.randint(DummyDataLoader.category, (DummyDataLoader.batch_size,), 22 | dtype=torch.int64, 23 | device=get_current_device()) 24 | return image_dict 25 | 26 | 27 | class ViTCVModel(nn.Module): 28 | 29 | def __init__(self, 30 | hidden_size=768, 31 | num_hidden_layers=12, 32 | num_attention_heads=12, 33 | image_size=224, 34 | patch_size=16, 35 | num_channels=3, 36 | num_labels=8, 37 | checkpoint=False): 38 | super().__init__() 39 | self.checkpoint = checkpoint 40 | self.model = ViTForImageClassification( 41 | ViTConfig(hidden_size=hidden_size, 42 | num_hidden_layers=num_hidden_layers, 43 | num_attention_heads=num_attention_heads, 44 | image_size=image_size, 45 | patch_size=patch_size, 46 | num_channels=num_channels, 47 | num_labels=num_labels)) 48 | if checkpoint: 49 | self.model.gradient_checkpointing_enable() 50 | 51 | def forward(self, pixel_values): 52 | return self.model(pixel_values=pixel_values) 53 | 54 | 55 | def vit_base_s(checkpoint=True): 56 | return ViTCVModel(checkpoint=checkpoint) 57 | 58 | 59 | def vit_base_micro(checkpoint=True): 60 | return ViTCVModel(hidden_size=32, num_hidden_layers=2, num_attention_heads=4, checkpoint=checkpoint) 61 | 62 | 63 | def get_training_components(): 64 | trainloader = DummyDataLoader() 65 | testloader = DummyDataLoader() 66 | return vit_base_micro, trainloader, testloader, torch.optim.Adam, torch.nn.functional.cross_entropy 67 | -------------------------------------------------------------------------------- /image/vision_transformer/data_parallel/config.py: 
-------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | # ViT Base 4 | BATCH_SIZE = 256 5 | DROP_RATE = 0.1 6 | NUM_EPOCHS = 2 7 | 8 | fp16 = dict( 9 | mode=AMP_TYPE.TORCH, 10 | ) 11 | 12 | gradient_accumulation = 16 13 | clip_grad_norm = 1.0 14 | 15 | dali = dict( 16 | gpu_aug=True, 17 | mixup_alpha=0.2 18 | ) 19 | -------------------------------------------------------------------------------- /image/vision_transformer/data_parallel/mixup.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from colossalai.registry import LOSSES 3 | import torch 4 | 5 | 6 | @LOSSES.register_module 7 | class MixupLoss(nn.Module): 8 | def __init__(self, loss_fn_cls): 9 | super().__init__() 10 | self.loss_fn = loss_fn_cls() 11 | 12 | def forward(self, inputs, targets_a, targets_b, lam): 13 | return lam * self.loss_fn(inputs, targets_a) + (1 - lam) * self.loss_fn(inputs, targets_b) 14 | 15 | 16 | class MixupAccuracy(nn.Module): 17 | def forward(self, logits, targets): 18 | targets = targets['targets_a'] 19 | preds = torch.argmax(logits, dim=-1) 20 | correct = torch.sum(targets == preds) 21 | return correct 22 | -------------------------------------------------------------------------------- /image/vision_transformer/data_parallel/myhooks.py: -------------------------------------------------------------------------------- 1 | from colossalai.trainer.hooks import BaseHook 2 | from colossalai.core import global_context as gpc 3 | from colossalai.context import ParallelMode 4 | from colossalai.logging import get_dist_logger 5 | 6 | 7 | class TotalBatchsizeHook(BaseHook): 8 | def __init__(self, priority: int = 2) -> None: 9 | super().__init__(priority) 10 | self.logger = get_dist_logger() 11 | 12 | def before_train(self, trainer): 13 | total_batch_size = gpc.config.BATCH_SIZE * \ 14 | gpc.config.gradient_accumulation * gpc.get_world_size(ParallelMode.DATA) 15 | self.logger.info(f'Total batch size = {total_batch_size}', ranks=[0]) 16 | -------------------------------------------------------------------------------- /image/vision_transformer/data_parallel/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /image/vision_transformer/data_parallel/results/acc.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/vision_transformer/data_parallel/results/acc.jpeg -------------------------------------------------------------------------------- /image/vision_transformer/data_parallel/results/loss.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/vision_transformer/data_parallel/results/loss.jpeg -------------------------------------------------------------------------------- /image/vision_transformer/data_parallel/scripts/train_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python train.py --host $HOST --config ./config.py --port 29500 -------------------------------------------------------------------------------- 
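For clarity, here is a self-contained usage sketch for the `MixupLoss` wrapper defined in `mixup.py` above. The tensors are random stand-ins; in a real run, `targets_a`, `targets_b` and `lam` come from a mixup dataloader:

```python
# Usage sketch for MixupLoss (see mixup.py above); inputs are stand-ins.
import torch
import torch.nn as nn
from mixup import MixupLoss  # assumes this script sits next to mixup.py

criterion = MixupLoss(nn.CrossEntropyLoss)
logits = torch.randn(8, 10, requires_grad=True)  # model outputs: 8 samples, 10 classes
targets_a = torch.randint(0, 10, (8,))           # labels of the original images
targets_b = torch.randint(0, 10, (8,))           # labels of the mixed-in images
lam = 0.7                                        # mixing coefficient, typically ~ Beta(alpha, alpha)

loss = criterion(logits, targets_a, targets_b, lam)
loss.backward()
```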
/image/vision_transformer/data_parallel/train_with_cifar10.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import colossalai 4 | import torch 5 | from colossalai.context import ParallelMode 6 | from colossalai.core import global_context as gpc 7 | from colossalai.logging import disable_existing_loggers, get_dist_logger 8 | from colossalai.utils import get_dataloader 9 | from colossalai.nn.lr_scheduler import LinearWarmupLR 10 | from colossalai.nn.metric import Accuracy 11 | from colossalai.trainer import Trainer, hooks 12 | from timm.models import vit_base_patch16_224 13 | 14 | from titans.dataloader.cifar10 import build_cifar 15 | 16 | 17 | def main(): 18 | # initialize distributed setting 19 | parser = colossalai.get_default_parser() 20 | args = parser.parse_args() 21 | disable_existing_loggers() 22 | 23 | # launch from torch 24 | colossalai.launch_from_torch(config=args.config) 25 | 26 | # get logger 27 | logger = get_dist_logger() 28 | logger.info("initialized distributed environment", ranks=[0]) 29 | 30 | # build model 31 | model = vit_base_patch16_224(drop_rate=0.1, num_classes=10) 32 | 33 | # build dataloader 34 | root = os.environ.get('DATA', './data') 35 | train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE, root, pad_if_needed=True) 36 | 37 | # build optimizer 38 | optimizer = colossalai.nn.Lamb(model.parameters(), lr=1.8e-2, weight_decay=0.1) 39 | 40 | # build loss 41 | criterion = torch.nn.CrossEntropyLoss() 42 | 43 | # lr_scheduler 44 | lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=gpc.config.NUM_EPOCHS) 45 | 46 | engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model, optimizer, criterion, train_dataloader, 47 | test_dataloader) 48 | logger.info("initialized colossalai components", ranks=[0]) 49 | 50 | # build trainer 51 | trainer = Trainer(engine=engine, logger=logger) 52 | 53 | # build hooks 54 | hook_list = [ 55 | hooks.LossHook(), 56 | hooks.AccuracyHook(accuracy_func=Accuracy()), 57 | hooks.LogMetricByEpochHook(logger), 58 | hooks.LRSchedulerHook(lr_scheduler, by_epoch=True), 59 | ] 60 | 61 | # start training 62 | trainer.fit(train_dataloader=train_dataloader, 63 | test_dataloader=test_dataloader, 64 | epochs=gpc.config.NUM_EPOCHS, 65 | hooks=hook_list, 66 | display_progress=True, 67 | test_interval=1) 68 | 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /image/vision_transformer/hybrid_parallel/configs/vit_1d_tp2_pp2.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | # hyperparameters 4 | # BATCH_SIZE is as per GPU 5 | # global batch size = BATCH_SIZE x data parallel size 6 | BATCH_SIZE = 256 7 | LEARNING_RATE = 3e-3 8 | WEIGHT_DECAY = 0.3 9 | NUM_EPOCHS = 10 10 | WARMUP_EPOCHS = 3 11 | 12 | # model config 13 | IMG_SIZE = 224 14 | PATCH_SIZE = 16 15 | HIDDEN_SIZE = 512 16 | DEPTH = 4 17 | NUM_HEADS = 4 18 | MLP_RATIO = 2 19 | NUM_CLASSES = 1000 20 | CHECKPOINT = False 21 | SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1 # add 1 for cls token 22 | 23 | # parallel setting 24 | TENSOR_PARALLEL_SIZE = 2 25 | TENSOR_PARALLEL_MODE = '1d' 26 | 27 | parallel = dict( 28 | pipeline=2, 29 | tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), 30 | ) 31 | 32 | fp16 = dict(mode=AMP_TYPE.NAIVE) 33 | clip_grad_norm = 1.0 34 | 35 | # pipeline config 36 | NUM_MICRO_BATCHES = 
parallel['pipeline'] 37 | -------------------------------------------------------------------------------- /image/vision_transformer/hybrid_parallel/configs/vit_1d_tp4_pp16.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | 4 | # hyperparameters 5 | # BATCH_SIZE is as per GPU 6 | # global batch size = BATCH_SIZE x data parallel size 7 | BATCH_SIZE = 4096 8 | LEARNING_RATE = 3e-3 9 | WEIGHT_DECAY = 0.3 10 | NUM_EPOCHS = 300 11 | WARMUP_EPOCHS = 32 12 | 13 | # model config 14 | IMG_SIZE = 224 15 | PATCH_SIZE = 16 16 | HIDDEN_SIZE = 4096 17 | DEPTH = 32 18 | NUM_HEADS = 64 19 | MLP_RATIO = 4 20 | NUM_CLASSES = 1000 21 | CHECKPOINT = True 22 | SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token 23 | 24 | # parallel setting 25 | TENSOR_PARALLEL_SIZE = 4 26 | TENSOR_PARALLEL_MODE = '1d' 27 | 28 | parallel = dict( 29 | pipeline=16, 30 | tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), 31 | ) 32 | 33 | fp16 = dict(mode=AMP_TYPE.NAIVE) 34 | clip_grad_norm = 1.0 35 | 36 | 37 | # pipeline config 38 | NUM_MICRO_BATCHES = parallel['pipeline'] 39 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LENGTH, HIDDEN_SIZE) 40 | -------------------------------------------------------------------------------- /image/vision_transformer/hybrid_parallel/configs/vit_2d_tp4_pp16.py: -------------------------------------------------------------------------------- 1 | import math 2 | from colossalai.amp import AMP_TYPE 3 | 4 | 5 | 6 | # hyperparameters 7 | # BATCH_SIZE is as per GPU 8 | # global batch size = BATCH_SIZE x data parallel size 9 | BATCH_SIZE = 8192 10 | LEARNING_RATE = 3e-3 11 | WEIGHT_DECAY = 0.3 12 | NUM_EPOCHS = 300 13 | WARMUP_EPOCHS = 32 14 | 15 | # model config 16 | IMG_SIZE = 224 17 | PATCH_SIZE = 16 18 | HIDDEN_SIZE = 4096 19 | DEPTH = 32 20 | NUM_HEADS = 64 21 | MLP_RATIO = 4 22 | NUM_CLASSES = 1000 23 | CHECKPOINT = True 24 | SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token 25 | 26 | # parallel setting 27 | TENSOR_PARALLEL_SIZE = 4 28 | TENSOR_PARALLEL_MODE = '2d' 29 | 30 | parallel = dict( 31 | pipeline=16, 32 | tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), 33 | ) 34 | 35 | fp16 = dict(mode=AMP_TYPE.NAIVE) 36 | clip_grad_norm = 1.0 37 | 38 | # pipeline config 39 | NUM_MICRO_BATCHES = parallel['pipeline'] 40 | SUMMA_DIM = int(math.sqrt(TENSOR_PARALLEL_SIZE)) 41 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES // SUMMA_DIM, 42 | SEQ_LENGTH, 43 | HIDDEN_SIZE // SUMMA_DIM) 44 | -------------------------------------------------------------------------------- /image/vision_transformer/hybrid_parallel/configs/vit_2p5d_tp4_pp16.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | from colossalai.amp import AMP_TYPE 3 | import math 4 | 5 | # hyperparameters 6 | # BATCH_SIZE is as per GPU 7 | # global batch size = BATCH_SIZE x data parallel size 8 | BATCH_SIZE = 7168 9 | LEARNING_RATE = 3e-3 10 | WEIGHT_DECAY = 0.3 11 | NUM_EPOCHS = 300 12 | WARMUP_EPOCHS = 32 13 | 14 | # model config 15 | IMG_SIZE = 224 16 | PATCH_SIZE = 16 17 | HIDDEN_SIZE = 4096 18 | DEPTH = 32 19 | NUM_HEADS = 64 20 | MLP_RATIO = 4 21 | NUM_CLASSES = 1000 22 | CHECKPOINT = True 23 | SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token 24 | 25 | # parallel setting 26 | TENSOR_PARALLEL_SIZE = 4 27 | TESSERACT_DEPTH = 1 28 | TENSOR_PARALLEL_MODE = '2.5d' 29 | 30 | parallel = 
dict( 31 | pipeline=16, 32 | tensor=dict( 33 | mode=TENSOR_PARALLEL_MODE, 34 | size=TENSOR_PARALLEL_SIZE, 35 | depth=TESSERACT_DEPTH 36 | ), 37 | ) 38 | 39 | fp16 = dict(mode=AMP_TYPE.NAIVE) 40 | clip_grad_norm = 1.0 41 | 42 | # pipeline config 43 | NUM_MICRO_BATCHES = parallel['pipeline'] 44 | SUMMA_DIM = int(math.sqrt(TENSOR_PARALLEL_SIZE // TESSERACT_DEPTH)) 45 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES // SUMMA_DIM, 46 | SEQ_LENGTH, 47 | HIDDEN_SIZE // SUMMA_DIM) 48 | -------------------------------------------------------------------------------- /image/vision_transformer/hybrid_parallel/configs/vit_3d_tp8_pp8.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | # hyperparameters 4 | # BATCH_SIZE is as per GPU 5 | # global batch size = BATCH_SIZE x data parallel size 6 | BATCH_SIZE = 1536 7 | LEARNING_RATE = 3e-3 8 | WEIGHT_DECAY = 0.3 9 | NUM_EPOCHS = 300 10 | WARMUP_EPOCHS = 32 11 | 12 | # model config 13 | IMG_SIZE = 224 14 | PATCH_SIZE = 16 15 | HIDDEN_SIZE = 4096 16 | DEPTH = 32 17 | NUM_HEADS = 64 18 | MLP_RATIO = 4 19 | NUM_CLASSES = 1000 20 | CHECKPOINT = True 21 | SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token 22 | 23 | # parallel setting 24 | TENSOR_PARALLEL_SIZE = 8 25 | TENSOR_PARALLEL_MODE = '3d' 26 | 27 | 28 | 29 | 30 | parallel = dict( 31 | pipeline=8, 32 | tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), 33 | ) 34 | 35 | fp16 = dict(mode=AMP_TYPE.NAIVE) 36 | clip_grad_norm = 1.0 37 | 38 | # pipeline config 39 | NUM_MICRO_BATCHES = parallel['pipeline'] 40 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES // 4, 41 | SEQ_LENGTH, 42 | HIDDEN_SIZE // 2) 43 | -------------------------------------------------------------------------------- /image/vision_transformer/hybrid_parallel/configs/vit_pipeline.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | # hyperparameters 4 | # BATCH_SIZE is as per GPU 5 | # global batch size = BATCH_SIZE x data parallel size 6 | BATCH_SIZE = 2048 7 | LEARNING_RATE = 3e-3 8 | WEIGHT_DECAY = 0.3 9 | NUM_EPOCHS = 10 10 | WARMUP_EPOCHS = 3 11 | 12 | # model config 13 | IMG_SIZE = 224 14 | PATCH_SIZE = 16 15 | HIDDEN_SIZE = 512 16 | DEPTH = 4 17 | NUM_HEADS = 4 18 | MLP_RATIO = 2 19 | NUM_CLASSES = 1000 20 | CHECKPOINT = False 21 | SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1 # add 1 for cls token 22 | 23 | # parallel setting 24 | parallel = dict(pipeline=2,) 25 | 26 | fp16 = dict(mode=AMP_TYPE.NAIVE) 27 | clip_grad_norm = 1.0 28 | 29 | # pipeline config 30 | NUM_MICRO_BATCHES = parallel['pipeline'] 31 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LENGTH, HIDDEN_SIZE) 32 | -------------------------------------------------------------------------------- /image/vision_transformer/hybrid_parallel/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .vit import * -------------------------------------------------------------------------------- /image/vision_transformer/hybrid_parallel/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/DeepNet/README.md: -------------------------------------------------------------------------------- 1 | # 
[DeepNet](https://arxiv.org/pdf/2203.00555.pdf): An Implementation based on [Colossal-AI](https://www.colossalai.org/) 2 | 3 | ## Overview 4 | 5 |

6 | 7 |

8 | 9 | This is the re-implementation of the DeepNet model from the paper [DeepNet: Scaling Transformers to 1,000 Layers](https://arxiv.org/pdf/2203.00555.pdf). 10 | 11 | DeepNet can scale transformer models to 1,000 layers by applying DeepNorm. This Colossal-AI based implementation supports data parallelism, pipeline parallelism and 1D tensor parallelism for training. 12 | 13 | ## How to prepare datasets 14 | 15 | ### Decoder-only DeepNet 16 | The decoder-only DeepNet model is modified from the GPT model. In this example, we use the WebText dataset for training. The dataset is prepared in the same way as in the [Colossal-AI based GPT example](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/gpt). 17 | 18 | ## Requirements 19 | 20 | To use pipeline parallel training, you should install colossalai from the **latest** main branch. 21 | 22 | ## How to run 23 | 24 | ### Decoder-only DeepNet 25 | 26 | ```Bash 27 | #!/usr/bin/env sh 28 | export DATA=/path/to/train_data.json 29 | 30 | colossalai run --nproc_per_node=<num_gpus> train_deepnet_decoder.py --config=decoder_configs/deepnet_pp1d.py 31 | ``` 32 | 33 | 34 | Please replace `DATA` and `<num_gpus>` with the path to your dataset and the number of GPUs respectively. 35 | You can also modify the config file `decoder_configs/deepnet_pp1d.py` to further change parallel settings, training hyperparameters and model details. 36 | 37 | ## Features 38 | 39 | - [x] Decoder-only DeepNet 40 | - [ ] Encoder-Decoder DeepNet 41 | -------------------------------------------------------------------------------- /language/DeepNet/dataset/webtext.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import torch 5 | from colossalai.registry import DATASETS 6 | from torch.utils.data import Dataset 7 | from transformers import GPT2Tokenizer 8 | 9 | 10 | @DATASETS.register_module 11 | class WebtextDataset(Dataset): 12 | def __init__(self, path, seq_len=1024) -> None: 13 | super().__init__() 14 | root = os.path.dirname(path) 15 | encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt') 16 | if os.path.isfile(encoded_data_cache_path): 17 | seq_len_, data, attention_mask = torch.load(encoded_data_cache_path) 18 | if seq_len_ == seq_len: 19 | self.data = data 20 | self.attention_mask = attention_mask 21 | return 22 | raw_data = [] 23 | with open(path) as f: 24 | for line in f.readlines(): 25 | raw_data.append(json.loads(line)['text']) 26 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 27 | tokenizer.pad_token = tokenizer.unk_token 28 | encoded_data = tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt') 29 | self.data = encoded_data['input_ids'] 30 | self.attention_mask = encoded_data['attention_mask'] 31 | torch.save((seq_len, self.data, self.attention_mask), encoded_data_cache_path) 32 | 33 | def __len__(self): 34 | return len(self.data) 35 | 36 | def __getitem__(self, index): 37 | return {'input_ids': self.data[index], 38 | 'attention_mask': self.attention_mask[index]}, self.data[index] -------------------------------------------------------------------------------- /language/DeepNet/decoder_configs/deepnet_pp1d.py: -------------------------------------------------------------------------------- 1 | from torch.optim import Adam 2 | from colossalai.amp import AMP_TYPE 3 | import torch 4 | from titans.model.deepnet import deepnet_small 5 | from titans.loss.lm_loss import GPTLMLoss 6 | 7 | BATCH_SIZE = 8 8 | NUM_EPOCHS = 2 9 | SEQ_LEN = 1024 10 | 11 | 
NUM_MICRO_BATCHES = 1 12 | HIDDEN_SIZE = 768 13 | PIPELINE = 2 14 | TENSOR_PARALLEL = 2 15 | MODE = '1d' 16 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LEN, HIDDEN_SIZE) 17 | 18 | fp16 = dict(mode=AMP_TYPE.NAIVE) 19 | 20 | parallel = dict(pipeline=PIPELINE, tensor=dict(mode=MODE, size=TENSOR_PARALLEL)) 21 | 22 | optimizer = dict( 23 | type=Adam, 24 | lr=0.00015, 25 | weight_decay=1e-2, 26 | ) 27 | 28 | model = dict( 29 | type=deepnet_small, 30 | checkpoint=True, 31 | dtype=torch.half, 32 | ) 33 | 34 | loss = dict(type=GPTLMLoss,) 35 | -------------------------------------------------------------------------------- /language/DeepNet/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch 3 | transformers -------------------------------------------------------------------------------- /language/bert/colotensor/README.md: -------------------------------------------------------------------------------- 1 | [WIP] 2 | 3 | -------------------------------------------------------------------------------- /language/bert/colotensor/configs/bert_base_tp1d.py: -------------------------------------------------------------------------------- 1 | SEQ_LENGTH = 512 2 | BATCH_SIZE = 8 3 | NUM_EPOCHS = 10 4 | WARMUP_EPOCHS = 1 5 | 6 | parallel = dict( 7 | tensor=dict(mode="1d", size=4), 8 | ) 9 | 10 | model = dict( 11 | type="bert_base", 12 | ) -------------------------------------------------------------------------------- /language/bert/colotensor/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from language.bert.colotensor.dataset.wikitext import build_data_from_wikitext 3 | from colossalai.core import global_context as gpc 4 | 5 | _datasets = { 6 | "wikitext": build_data_from_wikitext, 7 | } 8 | 9 | def build_data(**args): 10 | if hasattr(gpc.config, "dataset"): 11 | assert ( 12 | gpc.config.dataset in _datasets.keys() 13 | ), f"Invalid dataset name. 
dataset should be in {_datasets.keys()} or use default wikitext" 14 | builder = _datasets[gpc.config.dataset] 15 | else: 16 | builder = _datasets["wikitext"] 17 | return builder(**args) 18 | 19 | 20 | __all__ = ["build_data"] -------------------------------------------------------------------------------- /language/bert/colotensor/dataset/wikitext.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import numpy as np 4 | import copy 5 | 6 | from itertools import chain 7 | from datasets import load_from_disk, set_progress_bar_enabled 8 | 9 | from torch.utils.data import DataLoader, DistributedSampler 10 | from torch.distributed import get_world_size 11 | 12 | from transformers import BertTokenizer, default_data_collator 13 | from colossalai.logging import get_dist_logger 14 | 15 | 16 | def build_data_from_wikitext(dataset_path: str, tokenizer_path: str, seq_len: int = 512, batch_size: int = 8): 17 | logger = get_dist_logger("build_data_from_wikitext") 18 | logger.info("Building Wikitext-2 ...", ranks=[0]) 19 | world_size = get_world_size() 20 | 21 | set_progress_bar_enabled(False) 22 | dataset = load_from_disk(dataset_path) 23 | 24 | tokenizer = BertTokenizer(vocab_file=tokenizer_path + "/vocab.txt") 25 | 26 | def tokenize(examples): 27 | seq_length = seq_len 28 | examples = tokenizer(examples["text"]) 29 | concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} 30 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 31 | if total_length >= seq_length: 32 | total_length = (total_length // seq_length) * seq_length 33 | 34 | result = { 35 | k: [t[i : i + seq_len] for i in range(0, total_length, seq_length)] 36 | for k, t in concatenated_examples.items() 37 | } 38 | 39 | return result 40 | 41 | tokenized_dataset = dataset.map( 42 | tokenize, batched=True, num_proc=16, load_from_cache_file=False, keep_in_memory=True, remove_columns="text" 43 | ) 44 | 45 | def seed_worker(worker_id): # worker_init_fn is called with the worker id 46 | worker_seed = 1024 47 | np.random.seed(worker_seed) 48 | torch.manual_seed(worker_seed) 49 | random.seed(worker_seed) 50 | 51 | train_sampler = DistributedSampler(tokenized_dataset["train"], shuffle=True) if world_size > 1 else None 52 | train_data = DataLoader( 53 | tokenized_dataset["train"], 54 | shuffle=(train_sampler is None), 55 | sampler=train_sampler, 56 | drop_last=True, 57 | collate_fn=default_data_collator, 58 | worker_init_fn=seed_worker, 59 | batch_size=batch_size, 60 | pin_memory=True, 61 | ) 62 | test_sampler = DistributedSampler(tokenized_dataset["validation"], shuffle=False) if world_size > 1 else None 63 | test_data = DataLoader( 64 | tokenized_dataset["validation"], 65 | sampler=test_sampler, 66 | drop_last=True, 67 | collate_fn=default_data_collator, 68 | worker_init_fn=seed_worker, 69 | batch_size=batch_size, 70 | pin_memory=True, 71 | ) 72 | 73 | return train_data, test_data -------------------------------------------------------------------------------- /language/bert/colotensor/model/__init__.py: -------------------------------------------------------------------------------- 1 | from language.bert.colotensor.model.hfmodel import ModelFromHF 2 | from colossalai.core import global_context as gpc 3 | from transformers import BertConfig, BertForMaskedLM 4 | 5 | _bert_base = dict( 6 | seq_length=512, 7 | vocab_size=50304, 8 | hidden_size=768, 9 | num_heads=12, 10 | depth=12, 11 | ff_size=3072, 12 | checkpoint=False, 13 | evaluation='ppl', 14 | ) 15 | 16 | _bert_large = dict( 17 | 
seq_length=512, 18 | vocab_size=50304, 19 | hidden_size=1024, 20 | num_heads=16, 21 | depth=24, 22 | ff_size=3072, 23 | checkpoint=False, 24 | evaluation='ppl', 25 | ) 26 | 27 | _bert_configurations = dict( 28 | bert=_bert_base, 29 | bert_base=_bert_base, 30 | bert_large=_bert_large 31 | ) 32 | 33 | def build_model(): 34 | model_cfg = _bert_configurations[gpc.config.model.type] 35 | bert_cfg = BertConfig(vocab_size=model_cfg['vocab_size'], 36 | hidden_size=model_cfg['hidden_size'], 37 | num_hidden_layers=model_cfg['depth'], 38 | num_attention_heads=model_cfg['num_heads'], 39 | intermediate_size=model_cfg['ff_size'], 40 | max_position_embeddings=model_cfg['seq_length'], 41 | use_cache=not gpc.config.model.get('checkpoint', False)) 42 | 43 | model = ModelFromHF(bert_cfg, BertForMaskedLM) 44 | 45 | return model 46 | 47 | __all__ = ["build_model"] -------------------------------------------------------------------------------- /language/bert/colotensor/model/hfmodel.py: -------------------------------------------------------------------------------- 1 | from colossalai.core import global_context as gpc 2 | import torch 3 | 4 | class ModelFromHF(torch.nn.Module): 5 | def __init__(self, config, model_cls): 6 | super().__init__() 7 | self.module = model_cls(config) 8 | if gpc.config.model.get('checkpoint'): 9 | self.module.apply(self.set_checkpointing) 10 | 11 | def set_checkpointing(self, module): 12 | if hasattr(module, 'gradient_checkpointing'): 13 | module.gradient_checkpointing = True 14 | 15 | def forward(self, *args, **kwargs): 16 | output = self.module(*args, **kwargs) 17 | return output.logits -------------------------------------------------------------------------------- /language/bert/colotensor/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/README.md: -------------------------------------------------------------------------------- 1 | # Bert 2 | 3 | ![Still In Progress](https://img.shields.io/badge/-Still%20In%20Progress-orange) 4 | 5 | Bert Benchmark with data parallel, tensor parallel(tp), pipeline parallel(pp) and ZeRO. 6 | 7 | ## Setup 8 | 1. Install dependencies if you do not have them 9 | ``` 10 | pip install -r requirement.txt 11 | ``` 12 | 13 | 2. Add root dir into PYTHONPATH 14 | ``` 15 | export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH 16 | ``` 17 | 18 | ## Bert Usage 19 | 20 | 1. Prepare datasets and tokenizers from HuggingFace Hub if necessary (e.g. we provide an example of training `wikitext-2`). 21 | 22 | 2. 
Run benchmark with one of the systems to evaluate 23 | ``` 24 | DATA=/PATH/TO/DATASET TOKENIZER=/PATH/TO/TOKENIZER LOG=/PATH/TO/LOG torchrun --nproc_per_node=NUM_GPUS run.py --config=CONFIG_FILE 25 | ``` -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/bert_config_pp.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "bert_base" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 8, 8 | "num_epochs": 20, 9 | "steps_per_epoch": 10 10 | }, 11 | "gradient_clipping": 1.0, 12 | "parallel": { 13 | "pipeline": 4, 14 | "tensor": { 15 | "mode": "1d", 16 | "size": 1 17 | } 18 | }, 19 | "use_mem_monitor": true 20 | } 21 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/bert_config_tp1d.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "bert_base" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 8, 8 | "num_epochs": 10, 9 | "steps_per_epoch": 10 10 | }, 11 | "gradient_clipping": 1.0, 12 | "parallel": { 13 | "pipeline": 1, 14 | "tensor": { 15 | "mode": "1d", 16 | "size": 2 17 | } 18 | }, 19 | "use_mem_monitor": true 20 | } 21 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/bert_config_tp1dpp.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "bert_base" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 8, 8 | "num_epochs": 20, 9 | "steps_per_epoch": 10 10 | }, 11 | "gradient_clipping": 1.0, 12 | "parallel": { 13 | "pipeline": 2, 14 | "tensor": { 15 | "mode": "1d", 16 | "size": 2 17 | } 18 | }, 19 | "use_mem_monitor": true 20 | } 21 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/bert_config_tp2d.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "bert_base" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 8, 8 | "num_epochs": 20, 9 | "steps_per_epoch": 10 10 | }, 11 | "gradient_clipping": 1.0, 12 | "parallel": { 13 | "pipeline": 1, 14 | "tensor": { 15 | "mode": "2d", 16 | "size": 4 17 | } 18 | }, 19 | "use_mem_monitor": true 20 | } 21 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/bert_config_tp2p5d.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "bert_base" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 8, 8 | "num_epochs": 20, 9 | "steps_per_epoch": 10 10 | }, 11 | "gradient_clipping": 1.0, 12 | "parallel": { 13 | "pipeline": 1, 14 | "tensor": { 15 | "mode": "2.5d", 16 | "size": 8, 17 | "depth": 2 18 | } 19 | }, 20 | "use_mem_monitor": true 21 | } 22 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/bert_config_tp3d.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "bert_base" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 8, 8 | 
"num_epochs": 20, 9 | "steps_per_epoch": 10 10 | }, 11 | "gradient_clipping": 1.0, 12 | "parallel": { 13 | "pipeline": 1, 14 | "tensor": { 15 | "mode": "3d", 16 | "size": 8 17 | } 18 | }, 19 | "use_mem_monitor": true 20 | } 21 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/bert_config_zero.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "bert_base" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 8, 8 | "num_epochs": 20, 9 | "steps_per_epoch": 10 10 | }, 11 | "gradient_clipping": 1.0, 12 | "zero": { 13 | "model_config": { 14 | "offload_config": { 15 | "device": "cpu" 16 | } 17 | }, 18 | "optimizer_config": { 19 | "cpu_offload": true, 20 | "initial_scale": 256, 21 | "min_scale": 1, 22 | "growth_factor": 2.0, 23 | "backoff_factor": 0.5, 24 | "growth_interval": 1000 25 | } 26 | }, 27 | "use_mem_monitor": true 28 | } 29 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/bert_config_zerotppp.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "bert_base" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 8, 8 | "num_epochs": 100, 9 | "steps_per_epoch": 10 10 | }, 11 | "gradient_clipping": 1.0, 12 | "zero": { 13 | "model_config": { 14 | "offload_config": { 15 | "device": "cpu" 16 | } 17 | }, 18 | "optimizer_config": { 19 | "cpu_offload": true, 20 | "initial_scale": 256, 21 | "min_scale": 1, 22 | "growth_factor": 2.0, 23 | "backoff_factor": 0.5, 24 | "growth_interval": 1000 25 | } 26 | }, 27 | "parallel": { 28 | "pipeline": 1, 29 | "tensor": { 30 | "mode": "1d", 31 | "size": 2 32 | } 33 | }, 34 | "use_mem_monitor": true 35 | } 36 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/model_zoo/__init__.py: -------------------------------------------------------------------------------- 1 | from .colo_bert import create_colo_bert_pipeline_model, ColoBertForMaskedLM, ColoBertMaskedLMLoss 2 | 3 | __all__ = ['create_colo_bert_pipeline_model', 'ColoBertForMaskedLM', 'ColoBertMaskedLMLoss'] 4 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/requirement.txt: -------------------------------------------------------------------------------- 1 | 2 | torch>=1.10 -f https://download.pytorch.org/whl/cu113/torch_stable.html 3 | torchvision -f https://download.pytorch.org/whl/cu113/torch_stable.html 4 | transformers 5 | datasets 6 | colossalai 7 | rich -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from zero.common.utils import CONFIG, print_log 3 | from torch.cuda import max_memory_allocated, reset_peak_memory_stats 4 | from torch.distributed import get_rank 5 | 6 | 7 | def init_w_col(builder): 8 | import colossalai 9 | from colossalai.core import global_context as gpc 10 | from colossalai.nn.optimizer import CPUAdam 11 | from colossalai.zero.init_ctx import ZeroInitContext 12 | from colossalai.zero.shard_utils import (BucketTensorShardStrategy) 13 | 14 | from colossalai.utils.memory_utils.utils import 
colo_set_process_memory_fraction 15 | colo_set_process_memory_fraction(0.2) 16 | 17 | colossalai.launch_from_torch(config=CONFIG) 18 | 19 | build_data, build_model, build_loss, optimizer_class, build_scheduler = builder() 20 | 21 | print_log('Building data') 22 | train_data, test_data = build_data() 23 | 24 | use_zero = "zero" in gpc.config 25 | if use_zero: 26 | cpu_offload = gpc.config.zero.model_config.offload_config.device == 'cpu' 27 | else: 28 | cpu_offload = None 29 | 30 | rank = get_rank() 31 | reset_peak_memory_stats(rank) 32 | 33 | print_log('Building model') 34 | if use_zero: 35 | shard_strategy = BucketTensorShardStrategy() 36 | with ZeroInitContext(target_device=torch.cuda.current_device(), shard_strategy=shard_strategy, 37 | shard_param=True): 38 | model = build_model() 39 | gpc.config.zero.model_config['shard_strategy'] = shard_strategy 40 | 41 | else: 42 | model = build_model() 43 | 44 | criterion = build_loss() 45 | 46 | print_log(f'Peak Memory = {max_memory_allocated(rank) / (1024 * 1024)} M') 47 | reset_peak_memory_stats(rank) 48 | 49 | optimizer_kwargs = {} 50 | if use_zero and cpu_offload: 51 | optimizer_class = CPUAdam 52 | optimizer_kwargs = { 53 | 'lr': CONFIG['hyperparameter']['learning_rate'], 54 | 'weight_decay': CONFIG['hyperparameter']['weight_decay'] 55 | } 56 | 57 | optimizer = optimizer_class(model.parameters()) 58 | 59 | lr_scheduler = build_scheduler(len(train_data), optimizer) 60 | print_log(f'Peak Memory = {max_memory_allocated(rank) / (1024 * 1024)} M') 61 | 62 | engine, train_data, test_data, lr_scheduler = colossalai.initialize(model, optimizer, criterion, train_data, 63 | test_data, lr_scheduler) 64 | model = engine 65 | criterion = engine.criterion 66 | optimizer = engine 67 | 68 | return model, train_data, test_data, criterion, optimizer, None, lr_scheduler 69 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from bert.common.helper import bert_builder 4 | from bert.colossalai_utils.utils import init_w_col 5 | from bert.common.train import train 6 | from zero.common.utils import CONFIG, load_config, print_log 7 | from zero.torch_utils.utils import init_w_torch 8 | 9 | _method = { 10 | 'torch': init_w_torch, 11 | 'colossalai': init_w_col, 12 | } 13 | 14 | _builder = { 15 | 'bert': bert_builder, 16 | } 17 | 18 | 19 | def run_bert(): 20 | method = CONFIG['method'] 21 | 22 | model = CONFIG['model']['type'] 23 | model_type = model.split('_')[0] 24 | 25 | train(*_method[method](_builder[model_type])) 26 | 27 | 28 | if __name__ == '__main__': 29 | load_config() 30 | 31 | CONFIG['log_path'] = os.environ.get('LOG', '.') 32 | os.makedirs(CONFIG['log_path'], exist_ok=True) 33 | 34 | print_log(f'Initializing {CONFIG["method"]} ...') 35 | 36 | run_bert() 37 | -------------------------------------------------------------------------------- /language/bert/preprocessing/.gitignore: -------------------------------------------------------------------------------- 1 | pretrain/ 2 | wikipedia/ -------------------------------------------------------------------------------- /language/bert/preprocessing/requirements.txt: 
-------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/bert/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/config.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | DATA_PATH = '' 4 | VOCAB_FILE_PATH = '' 5 | 6 | # hyper-parameters 7 | TRAIN_ITERS = 1000000 8 | DECAY_ITERS = 990000 9 | WARMUP_FRACTION = 0.01 10 | GLOBAL_BATCH_SIZE = 32 # dp world size * sentences per GPU 11 | EVAL_ITERS = 10 12 | EVAL_INTERVAL = 10 13 | LR = 0.0001 14 | MIN_LR = 1e-05 15 | WEIGHT_DECAY = 0.01 16 | SEQ_LENGTH = 512 17 | 18 | # BERT config 19 | DEPTH = 12 20 | NUM_ATTENTION_HEADS = 12 21 | HIDDEN_SIZE = 768 22 | 23 | # model config 24 | ADD_BINARY_HEAD = False 25 | 26 | # random seed 27 | SEED = 1234 28 | 29 | # pipeline config 30 | # only enabled when pipeline > 1 31 | NUM_MICRO_BATCHES = 4 32 | 33 | # colossalai config 34 | parallel = dict(pipeline=1, tensor=dict(size=4, mode='sequence')) 35 | 36 | fp16 = dict(mode=AMP_TYPE.NAIVE, verbose=True) 37 | 38 | clip_grad_norm = 1.0 39 | 40 | gradient_handler = [dict(type='SequenceParallelGradientHandler')] 41 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/data/datasets/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/data/datasets/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Blendable dataset.""" 17 | 18 | import time 19 | 20 | import numpy as np 21 | import torch 22 | 23 | 24 | class BlendableDataset(torch.utils.data.Dataset): 25 | 26 | def __init__(self, datasets, weights): 27 | 28 | self.datasets = datasets 29 | num_datasets = len(datasets) 30 | assert num_datasets == len(weights) 31 | 32 | self.size = 0 33 | for dataset in self.datasets: 34 | self.size += len(dataset) 35 | 36 | # Normalize weights. 37 | weights = np.array(weights, dtype=np.float64) 38 | sum_weights = np.sum(weights) 39 | assert sum_weights > 0.0 40 | weights /= sum_weights 41 | 42 | # Build indices. 43 | start_time = time.time() 44 | assert num_datasets < 255 45 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 46 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 47 | 48 | from . import helpers 49 | helpers.build_blending_indices(self.dataset_index, 50 | self.dataset_sample_index, 51 | weights, num_datasets, self.size, 52 | torch.distributed.get_rank() == 0) 53 | print('> elapsed time for building blendable dataset indices: ' 54 | '{:.2f} (sec)'.format(time.time() - start_time)) 55 | 56 | def __len__(self): 57 | return self.size 58 | 59 | def __getitem__(self, idx): 60 | dataset_idx = self.dataset_index[idx] 61 | sample_idx = self.dataset_sample_index[idx] 62 | return self.datasets[dataset_idx][sample_idx] 63 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/data/datasets/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/data/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | 19 | 20 | _TOKENIZER = None 21 | _PADDED_VOCAB_SIZE = -1 22 | 23 | 24 | def initialize_tokenizer(vocab_file, tokenizer_type, vocab_extra_ids=0): 25 | tokenizer, padded_vocab_size = build_tokenizer(vocab_file, tokenizer_type, vocab_extra_ids) 26 | global _TOKENIZER, _PADDED_VOCAB_SIZE 27 | _TOKENIZER = tokenizer 28 | _PADDED_VOCAB_SIZE = padded_vocab_size 29 | 30 | 31 | def get_tokenizer(): 32 | global _TOKENIZER 33 | return _TOKENIZER 34 | 35 | 36 | def get_padded_vocab_size(): 37 | global _PADDED_VOCAB_SIZE 38 | return _PADDED_VOCAB_SIZE 39 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/loss_func/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/language/bert/sequene_parallel/loss_func/__init__.py -------------------------------------------------------------------------------- /language/bert/sequene_parallel/loss_func/bert_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from colossalai.core import global_context as gpc 4 | from colossalai.context import ParallelMode 5 | from colossalai.logging import get_dist_logger 6 | import torch.nn.functional as F 7 | import torch.distributed as dist 8 | from .cross_entropy import vocab_cross_entropy 9 | 10 | 11 | class BertLoss(nn.Module): 12 | 13 | def forward(self, 14 | lm_loss, 15 | sop_logits, 16 | loss_mask, 17 | sentence_order): 18 | lm_loss_ = lm_loss.float() 19 | loss_mask = loss_mask.float() 20 | loss_mask_sum = loss_mask.sum() 21 | lm_loss = torch.sum( 22 | lm_loss_.view(-1) * loss_mask.reshape(-1)) 23 | 24 | lm_loss /= loss_mask_sum 25 | 26 | torch.distributed.all_reduce( 27 | lm_loss, 28 | group=gpc.get_group(ParallelMode.SEQUENCE) 29 | ) 30 | 31 | if sop_logits is not None: 32 | sop_loss = F.cross_entropy(sop_logits.view(-1, 2).float(), 33 | sentence_order.view(-1), 34 | ignore_index=-1) 35 | sop_loss = sop_loss.float() 36 | loss = lm_loss + sop_loss * gpc.get_world_size(ParallelMode.SEQUENCE) 37 | else: 38 | sop_loss = None 39 | loss = lm_loss 40 | 41 | return loss 42 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/loss_func/utils.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | 5 | def ensure_divisibility(numerator, denominator): 6 | """Ensure that numerator is divisible by the denominator.""" 7 | assert numerator % denominator == 0, '{} is not divisible by {}'.format( 8 | numerator, denominator) 9 | 10 | 11 | def divide(numerator, denominator): 12 | """Ensure that numerator is divisible by the denominator and return 13 | the division value.""" 14 | ensure_divisibility(numerator, denominator) 15 | return numerator // denominator 16 | 17 | 18 | def split_tensor_along_last_dim(tensor, num_partitions, 19 | contiguous_split_chunks=False): 20 | """Split a tensor along its last dimension. 21 | Arguments: 22 | tensor: input tensor. 23 | num_partitions: number of partitions to split the tensor 24 | contiguous_split_chunks: If True, make each chunk contiguous 25 | in memory. 26 | """ 27 | # Get the size and dimension. 28 | last_dim = tensor.dim() - 1 29 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 30 | # Split. 
31 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 32 | # Note: torch.split does not create contiguous tensors by default. 33 | if contiguous_split_chunks: 34 | return tuple(chunk.contiguous() for chunk in tensor_list) 35 | 36 | return tensor_list 37 | 38 | 39 | class VocabUtility: 40 | """Split the vocabulary into `world_size` chunks and return the 41 | first and last index of the vocabulary belonging to the `rank` 42 | partition. Note that indices are in [first, last).""" 43 | 44 | @staticmethod 45 | def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, 46 | rank, world_size): 47 | index_f = rank * per_partition_vocab_size 48 | index_l = index_f + per_partition_vocab_size 49 | return index_f, index_l 50 | 51 | @staticmethod 52 | def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size): 53 | per_partition_vocab_size = divide(global_vocab_size, world_size) 54 | return VocabUtility.vocab_range_from_per_partition_vocab_size( 55 | per_partition_vocab_size, rank, world_size) 56 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/lr_scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | from .annealing_lr import AnnealingLR 2 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .embedding import VocabEmbedding, Embedding 2 | from .bert_layer import BertLayer 3 | from .head import BertDualHead 4 | from .preprocess import PreProcessor 5 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/layers/dropout.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def bias_dropout_add(x, bias, residual, prob, training): 4 | # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor 5 | out = torch.nn.functional.dropout(x + bias, p=prob, training=training) 6 | out = residual + out 7 | return out 8 | 9 | 10 | def get_bias_dropout_add(training): 11 | def _bias_dropout_add(x, bias, residual, prob): 12 | return bias_dropout_add(x, bias, residual, prob, training) 13 | return _bias_dropout_add -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/layers/head.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from .pooler import Pooler 6 | from .linear import Linear 7 | from .embedding import VocabEmbedding 8 | from colossalai.core import global_context as gpc 9 | from colossalai.context import ParallelMode 10 | from colossalai.kernel import LayerNorm 11 | from loss_func.cross_entropy import vocab_cross_entropy 12 | 13 | 14 | class BertLMHead(nn.Module): 15 | """Masked LM head for Bert. 16 | Arguments: 17 | vocab_size: vocabulary size 18 | hidden_size: hidden size 19 | 20 | """ 21 | 22 | def __init__(self, 23 | vocab_size, 24 | hidden_size, 25 | ): 26 | 27 | 
super(BertLMHead, self).__init__() 28 | self.bias = torch.nn.Parameter(torch.zeros(vocab_size)) 29 | 30 | self.dense = Linear(hidden_size, hidden_size) 31 | self.layernorm = LayerNorm(hidden_size) 32 | self.gelu = torch.nn.functional.gelu 33 | 34 | def forward(self, hidden_states, word_embeddings_weight, lm_labels): 35 | hidden_states = self.dense(hidden_states) 36 | hidden_states = self.gelu(hidden_states) 37 | hidden_states = self.layernorm(hidden_states) 38 | 39 | output = F.linear(hidden_states, word_embeddings_weight, self.bias) 40 | lm_loss = vocab_cross_entropy(output, lm_labels) 41 | 42 | return lm_loss 43 | 44 | 45 | class BertBinaryHead(nn.Module): 46 | 47 | def __init__(self, hidden_size): 48 | super().__init__() 49 | self.pooler = Pooler(hidden_size) 50 | self.dense = Linear(hidden_size, 2) 51 | 52 | def forward(self, hidden_states): 53 | if gpc.get_local_rank(ParallelMode.SEQUENCE) == 0: 54 | output = self.pooler(hidden_states) 55 | output = self.dense(output) 56 | else: 57 | output = None 58 | return output 59 | 60 | 61 | class BertDualHead(nn.Module): 62 | 63 | def __init__(self, hidden_size, vocab_size, add_binary_head): 64 | super().__init__() 65 | self.lm_head = BertLMHead(vocab_size, hidden_size) 66 | self.add_binary_head = add_binary_head 67 | if add_binary_head: 68 | self.binary_head = BertBinaryHead(hidden_size) 69 | else: 70 | self.binary_head = None 71 | 72 | def forward(self, hidden_states, word_embeddings_weight, lm_labels): 73 | if self.add_binary_head: 74 | binary_output = self.binary_head(hidden_states) 75 | else: 76 | binary_output = None 77 | lm_loss = self.lm_head(hidden_states, word_embeddings_weight, lm_labels) 78 | return lm_loss, binary_output 79 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/layers/init_method.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | 4 | def init_normal(tensor, sigma): 5 | """Init method based on N(0, sigma).""" 6 | torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 7 | 8 | 9 | def output_init_normal(tensor, sigma, num_layers): 10 | """Init method based on N(0, sigma/sqrt(2*num_layers)).""" 11 | std = sigma / math.sqrt(2.0 * num_layers) 12 | torch.nn.init.normal_(tensor, mean=0.0, std=std) 13 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/layers/linear.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import Parameter 4 | import torch.nn.functional as F 5 | import torch.nn.init as init 6 | 7 | 8 | class Linear(nn.Module): 9 | """Linear layer, defined as Y = XA + b. 10 | This is a simplified, non-parallel version of a column-parallel 11 | linear layer (the parallel version splits A along its second 12 | dimension as A = [A_1, ..., A_p]). 13 | Arguments: 14 | input_size: first dimension of matrix A. 15 | output_size: second dimension of matrix A. 16 | bias: If true, add bias. The bias is always initialized to zero. 17 | skip_bias_add: This was added to enable performance optimizations 18 | where bias can be fused with other elementwise operations. 19 | We skip adding bias here but instead return it. 20 | 21 | 22 | 23 | 24 | 
25 | """ 26 | 27 | def __init__(self, 28 | input_size, 29 | output_size, 30 | bias=True, 31 | skip_bias_add=False): 32 | super(Linear, self).__init__() 33 | 34 | # Keep input parameters 35 | self.input_size = input_size 36 | self.output_size = output_size 37 | self.skip_bias_add = skip_bias_add 38 | 39 | self.weight = Parameter(torch.empty(self.output_size, 40 | self.input_size, 41 | )) 42 | init.normal_(self.weight) 43 | if bias: 44 | self.bias = Parameter(torch.empty(self.output_size)) 45 | # Always initialize bias to zero. 46 | with torch.no_grad(): 47 | self.bias.zero_() 48 | else: 49 | self.register_parameter('bias', None) 50 | 51 | def forward(self, input_): 52 | # Matrix multiply. 53 | bias = self.bias if not self.skip_bias_add else None 54 | output = F.linear(input_, self.weight, bias) 55 | 56 | if self.skip_bias_add: 57 | return output, self.bias 58 | else: 59 | return output 60 | 61 | def __repr__(self): 62 | return f'Linear(in_features={self.input_size}, out_features={self.output_size}, ' + \ 63 | f'bias={self.bias is not None}, skip_bias_add={self.skip_bias_add})' 64 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/layers/mlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .linear import Linear 6 | from colossalai.kernel.jit import bias_gelu_impl 7 | 8 | 9 | class TransformerMLP(nn.Module): 10 | """MLP. 11 | MLP will take the input with h hidden state, project it to 4*h 12 | hidden dimension, perform nonlinear transformation, and project the 13 | state back into h hidden dimension. At the end, dropout is also 14 | applied. 15 | """ 16 | 17 | def __init__(self, hidden_size, mlp_ratio, fuse_gelu=True): 18 | super(TransformerMLP, self).__init__() 19 | 20 | # Project to 4h. 21 | self.dense_h_to_4h = Linear( 22 | hidden_size, 23 | int(hidden_size*mlp_ratio), 24 | skip_bias_add=True) 25 | 26 | self.bias_gelu_fusion = fuse_gelu 27 | self.activation_func = F.gelu 28 | 29 | # Project back to h. 30 | self.dense_4h_to_h = Linear( 31 | int(hidden_size*mlp_ratio), 32 | hidden_size, 33 | skip_bias_add=True) 34 | 35 | def forward(self, hidden_states): 36 | # hidden states should be in the shape of [s, b, h] 37 | # it will be projects into [s, b, 4h] 38 | # and projected back to [s, b, h] 39 | intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) 40 | 41 | if self.bias_gelu_fusion: 42 | intermediate_parallel = \ 43 | bias_gelu_impl(intermediate_parallel, bias_parallel) 44 | else: 45 | intermediate_parallel = \ 46 | self.activation_func(intermediate_parallel + bias_parallel) 47 | 48 | # [s, b, h] 49 | output, output_bias = self.dense_4h_to_h(intermediate_parallel) 50 | return output, output_bias 51 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/layers/pooler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .linear import Linear 4 | 5 | 6 | class Pooler(nn.Module): 7 | """Pooler layer. 8 | 9 | Pool hidden states of a specific token (for example start of the 10 | sequence) and add a linear transformation followed by a tanh. 11 | 12 | Arguments: 13 | hidden_size: hidden size 14 | init_method: weight initialization method for the linear layer. 15 | bias is set to zero. 
16 | """ 17 | 18 | def __init__(self, hidden_size): 19 | super(Pooler, self).__init__() 20 | self.dense = Linear(hidden_size, hidden_size) 21 | 22 | def forward(self, hidden_states, sequence_index=0): 23 | # hidden_states: [b, s, h] 24 | # sequence_index: index of the token to pool. 25 | pooled = hidden_states[:, sequence_index, :] 26 | pooled = self.dense(pooled) 27 | pooled = torch.tanh(pooled) 28 | return pooled 29 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/layers/preprocess.py: -------------------------------------------------------------------------------- 1 | from colossalai.context.parallel_mode import ParallelMode 2 | import torch 3 | import torch.nn as nn 4 | from colossalai.core import global_context as gpc 5 | 6 | 7 | class PreProcessor(nn.Module): 8 | 9 | def __init__(self, sub_seq_length): 10 | super().__init__() 11 | self.sub_seq_length = sub_seq_length 12 | 13 | def bert_position_ids(self, token_ids): 14 | # Create position ids 15 | seq_length = token_ids.size(1) 16 | local_rank = gpc.get_local_rank(ParallelMode.SEQUENCE) 17 | position_ids = torch.arange(seq_length*local_rank, 18 | seq_length * (local_rank+1), 19 | dtype=torch.long, 20 | device=token_ids.device) 21 | position_ids = position_ids.unsqueeze(0).expand_as(token_ids) 22 | 23 | return position_ids 24 | 25 | def bert_extended_attention_mask(self, attention_mask): 26 | local_rank = gpc.get_local_rank(ParallelMode.SEQUENCE) 27 | start_index = local_rank * self.sub_seq_length 28 | end_index = (local_rank + 1) * self.sub_seq_length 29 | 30 | # We create a 3D attention mask from a 2D tensor mask. 31 | # [b, 1, s] 32 | attention_mask_b1s = attention_mask.unsqueeze(1) 33 | # [b, s, 1] 34 | attention_mask_bs1 = attention_mask.unsqueeze(2) 35 | # [b, s/D, s] 36 | attention_mask_bss = attention_mask_b1s * attention_mask_bs1 37 | 38 | attention_mask_bss = attention_mask_bss[:, start_index:end_index, :] 39 | 40 | # [b, 1, s/D, s] 41 | extended_attention_mask = attention_mask_bss.unsqueeze(1) 42 | 43 | # Convert attention mask to binary: 44 | extended_attention_mask = (extended_attention_mask < 0.5) 45 | 46 | return extended_attention_mask 47 | 48 | def forward(self, input_ids=None, attention_mask=None): 49 | if attention_mask is not None: 50 | extended_attention_mask = self.bert_extended_attention_mask(attention_mask) 51 | else: 52 | extended_attention_mask = None 53 | 54 | if input_ids is not None: 55 | position_ids = self.bert_position_ids(input_ids) 56 | else: 57 | position_ids = None 58 | return position_ids, extended_attention_mask 59 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/bert/zero/.gitignore: -------------------------------------------------------------------------------- 1 | download/ 2 | pretrain/ -------------------------------------------------------------------------------- /language/bert/zero/README.md: -------------------------------------------------------------------------------- 1 | ## Train BERT with ZeRO 2 | 3 | ![Still In Progress](https://img.shields.io/badge/-Still%20In%20Progress-orange) 4 | 5 | ### About ZeRO 6 | 7 | Zero redundancy optimizer is a memory-optimization method for large-scale model training. 
8 | It shards tensors in optimizer states, gradients, and parameters so that large models can be accommodated by limited GPU memory. 9 | Offloading techniques are integrated to further utilize the CPU memory space. 10 | Colossal-AI has an optimized ZeRO module equipped with our unique chunk mechanism to maximize memory utilization and achieve higher training throughput. 11 | More details can be found in our [documentation](https://www.colossalai.org/docs/features/zero_redundancy_and_zero_offload). 12 | 13 | ## Pretraining 14 | 15 | ### Data Preparation 16 | 17 | You need to follow the [documentation](../preprocessing/README.md) in the `preprocessing` folder to preprocess the Wikipedia dataset. 18 | You should obtain a `wikipedia` folder. Use a symbolic link to link it to the current directory (e.g. `ln -s ../preprocessing/pretrain ./pretrain_data`). 19 | 20 | ### Execute Pretraining 21 | 22 | Use the command below to start pretraining. If you want to do multi-node training, you can refer to the [documentation on how to launch multi-node training](https://www.colossalai.org/docs/basics/launch_colossalai). 23 | 24 | ```bash 25 | bash ./scripts/run_pretrain.sh 26 | ``` 27 | 28 | ## Fine-tuning 29 | 30 | In this repository, we provide fine-tuning examples for different downstream tasks. Each section comes with step-by-step instructions to fine-tune the pretrained BERT model. 31 | 32 | ### GLUE 33 | 34 | 1. Prepare the dataset 35 | 36 | Execute the command below. This will create a `download` folder in the current directory. This folder contains the downstream task datasets. 37 | 38 | ```bash 39 | bash ./scripts/download_finetune_dataset.sh 40 | ``` 41 | 42 | 2. Fine-tuning 43 | 44 | Run the fine-tuning script. This script uses 1 GPU only by default. If you wish to use more GPUs, adjust the batch size per GPU accordingly. 45 | The SOTA results are reproduced with a global batch size of 128.
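For instance, with 8 GPUs a per-GPU batch size of 16 keeps the global batch size at 128 (8 × 16 = 128, assuming no gradient accumulation).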
46 | 47 | ```bash 48 | bash ./scripts/run_finetune_glue.sh 49 | ``` 50 | 51 | Reproduced results: 52 | 53 | | Metric | Value | 54 | | - | - | 55 | | F1 | 89.1 | 56 | | Accuracy | 84.31 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /language/bert/zero/configs/bert_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 768, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 3072, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 12, 10 | "num_hidden_layers": 12, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } -------------------------------------------------------------------------------- /language/bert/zero/configs/colossalai_amp.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | fp16 = dict(mode=AMP_TYPE.TORCH) 4 | 5 | seed = 2 6 | -------------------------------------------------------------------------------- /language/bert/zero/configs/colossalai_zero.py: -------------------------------------------------------------------------------- 1 | from colossalai.zero.shard_utils import TensorShardStrategy 2 | 3 | zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(), 4 | reduce_scatter_bucket_size_mb=25, 5 | fp32_reduce_scatter=False, 6 | tensor_placement_policy="cuda", 7 | gradient_predivide_factor=1.0, 8 | reuse_fp16_shard=True), 9 | optimizer_config=dict(gpu_margin_mem_ratio=0.8, 10 | initial_scale=2**5, 11 | min_scale=1, 12 | growth_factor=2, 13 | backoff_factor=0.5, 14 | growth_interval=1000, 15 | hysteresis=2, 16 | max_scale=2**32)) 17 | -------------------------------------------------------------------------------- /language/bert/zero/finetuning/glue/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/language/bert/zero/finetuning/glue/__init__.py -------------------------------------------------------------------------------- /language/bert/zero/finetuning/glue/data.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import pickle 4 | from colossalai.logging import get_dist_logger 5 | from colossalai.core import global_context as gpc 6 | from torch.utils.data import TensorDataset 7 | from processors import convert_examples_to_features 8 | 9 | 10 | def gen_tensor_dataset(features): 11 | all_input_ids = torch.tensor( 12 | [f.input_ids for f in features], 13 | dtype=torch.long, 14 | ) 15 | all_input_mask = torch.tensor( 16 | [f.input_mask for f in features], 17 | dtype=torch.long, 18 | ) 19 | all_segment_ids = torch.tensor( 20 | [f.segment_ids for f in features], 21 | dtype=torch.long, 22 | ) 23 | all_label_ids = torch.tensor( 24 | [f.label_id for f in features], 25 | dtype=torch.long, 26 | ) 27 | return TensorDataset( 28 | all_input_ids, 29 | all_input_mask, 30 | all_segment_ids, 31 | all_label_ids, 32 | ) 33 | 34 | 35 | def get_train_features(data_dir, vocab_file, max_seq_length, do_lower_case, tokenizer, processor): 36 | 37 | cached_train_features_file = os.path.join( 38 | data_dir, 39 | '{0}_{1}_{2}'.format( 40 | vocab_file, 41 | str(max_seq_length), 42 | str(do_lower_case), 43 | ), 44 | ) 45 | train_features = None 46 | logger =
get_dist_logger() 47 | try: 48 | with open(cached_train_features_file, "rb") as reader: 49 | train_features = pickle.load(reader) 50 | logger.info("Loaded pre-processed features from {}".format(cached_train_features_file)) 51 | except Exception: 52 | logger.info("Did not find pre-processed features from {}".format(cached_train_features_file)) 53 | train_examples = processor.get_train_examples(data_dir) 54 | train_features, _ = convert_examples_to_features( 55 | train_examples, 56 | processor.get_labels(), 57 | max_seq_length, 58 | tokenizer, 59 | ) 60 | if gpc.get_global_rank() == 0: 61 | logger.info(" Saving train features into cached file %s", cached_train_features_file) 62 | with open(cached_train_features_file, "wb") as writer: 63 | pickle.dump(train_features, writer) 64 | return train_features 65 | -------------------------------------------------------------------------------- /language/bert/zero/pretraining/arguments.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | 3 | 4 | __all__ = ['parse_args'] 5 | 6 | 7 | def parse_args(): 8 | parser = colossalai.get_default_parser() 9 | parser.add_argument('--bert-config', type=str, required=True) 10 | parser.add_argument('--lr', type=float, required=True) 11 | parser.add_argument('--data', type=str, required=True) 12 | parser.add_argument('--warmup-ratio', default=0.01, type=float) 13 | parser.add_argument('--vocab-file', type=str, required=True) 14 | parser.add_argument('--epoch', type=int, required=True) 15 | parser.add_argument('--batch-size', type=int, required=True) 16 | parser.add_argument('--save-checkpoint-interval', type=int, required=True) 17 | parser.add_argument('--output-dir', type=str, required=True) 18 | args = parser.parse_args() 19 | return args 20 | -------------------------------------------------------------------------------- /language/bert/zero/pretraining/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | __all__ = ['LossForPretraining'] 4 | 5 | 6 | class LossForPretraining(torch.nn.Module): 7 | 8 | def __init__(self, vocab_size): 9 | super(LossForPretraining, self).__init__() 10 | self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1) 11 | self.vocab_size = vocab_size 12 | 13 | def forward(self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels): 14 | masked_lm_loss = self.loss_fn(prediction_scores.view(-1, self.vocab_size), masked_lm_labels.view(-1)) 15 | next_sentence_loss = self.loss_fn(seq_relationship_score.view(-1, 2), next_sentence_labels.view(-1)) 16 | total_loss = masked_lm_loss + next_sentence_loss 17 | return total_loss 18 | -------------------------------------------------------------------------------- /language/bert/zero/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/bert/zero/scripts/download_finetune_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2019-2020 NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | git clone https://github.com/NVIDIA/DeepLearningExamples.git 17 | mv DeepLearningExamples/PyTorch/LanguageModeling/BERT/data ./nv-dl-examples-data 18 | rm -rf DeepLearningExamples 19 | pip install wget 20 | 21 | export BERT_PREP_WORKING_DIR=$PWD 22 | 23 | python3 ./nv-dl-examples-data/bertPrep.py --action download --dataset squad 24 | python3 ./nv-dl-examples-data/bertPrep.py --action download --dataset mrpc 25 | python3 ./nv-dl-examples-data/bertPrep.py --action download --dataset sst-2 -------------------------------------------------------------------------------- /language/bert/zero/scripts/run_finetune_glue.sh: -------------------------------------------------------------------------------- 1 | 2 | GLUE_DATASET=$PWD/download/glue/MRPC 3 | VOCAB_FILE="bert-base-uncased" 4 | CODE_DIR=$PWD/finetuning/glue 5 | 6 | colossalai run --nproc_per_node 1 \ 7 | --master_port 29510 \ 8 | $CODE_DIR/main.py \ 9 | --data_dir $GLUE_DATASET \ 10 | --task_name mrpc \ 11 | --bert_config ./configs/bert_base.json \ 12 | --vocab_file $VOCAB_FILE \ 13 | --output_dir ./finetuning_outputs \ 14 | --train_batch_size 128 \ 15 | --eval_batch_size 128 \ 16 | --num_train_epochs 3 \ 17 | --train \ 18 | --eval \ 19 | --predict 20 | -------------------------------------------------------------------------------- /language/bert/zero/scripts/run_pretrain.sh: -------------------------------------------------------------------------------- 1 | 2 | BERT_CONFIG_PATH='./configs/bert_base.json' 3 | PY_FILE_PATH='./pretraining/run_pretraining.py' 4 | DATA_PATH='./pretrain_data/phase1/unbinned/parquet' 5 | VOCAB_FILE='bert-base-uncased' 6 | 7 | export PYTHONPATH=$PWD 8 | 9 | colossalai run --nproc_per_node 8 \ 10 | --master_port 29550 \ 11 | $PY_FILE_PATH \ 12 | --bert-config $BERT_CONFIG_PATH \ 13 | --lr 1e-4 \ 14 | --data $DATA_PATH \ 15 | --vocab-file $VOCAB_FILE \ 16 | --batch-size 32 \ 17 | --epoch 100 \ 18 | --output-dir ./pretrain_outputs \ 19 | --save-checkpoint-interval 5 20 | -------------------------------------------------------------------------------- /language/gpt/dataset/webtext.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import torch 5 | from colossalai.registry import DATASETS 6 | from torch.utils.data import Dataset 7 | from transformers import GPT2Tokenizer 8 | 9 | 10 | @DATASETS.register_module 11 | class WebtextDataset(Dataset): 12 | def __init__(self, path, seq_len=1024) -> None: 13 | super().__init__() 14 | root = os.path.dirname(path) 15 | encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt') 16 | if os.path.isfile(encoded_data_cache_path): 17 | seq_len_, data, attention_mask = torch.load(encoded_data_cache_path) 18 | if seq_len_ == seq_len: 19 | self.data = data 20 | self.attention_mask = attention_mask 21 | return 22 | raw_data = [] 23 | with open(path) as f: 24 | for line in f.readlines(): 25 | raw_data.append(json.loads(line)['text']) 26 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 27 | tokenizer.pad_token = tokenizer.unk_token 28 | encoded_data = 
tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt') 29 | self.data = encoded_data['input_ids'] 30 | self.attention_mask = encoded_data['attention_mask'] 31 | torch.save((seq_len, self.data, self.attention_mask), encoded_data_cache_path) 32 | 33 | def __len__(self): 34 | return len(self.data) 35 | 36 | def __getitem__(self, index): 37 | return {'input_ids': self.data[index], 38 | 'attention_mask': self.attention_mask[index]}, self.data[index] -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_1d.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | from titans.loss.lm_loss import GPTLMLoss 3 | from titans.model.gpt import gpt2_small, gpt2_large, gpt2_xl, gpt2_8B 4 | from torch.optim import Adam 5 | 6 | # change bs here 7 | BATCH_SIZE = 32 8 | SEQ_LEN = 1024 9 | NUM_EPOCHS = 60 10 | 11 | TENSOR_PARALLEL = 4 12 | 13 | optimizer = dict( 14 | type=Adam, 15 | lr=0.00015, 16 | weight_decay=1e-2, 17 | ) 18 | 19 | fp16 = dict(mode=AMP_TYPE.NAIVE) 20 | 21 | loss = dict(type=GPTLMLoss,) 22 | 23 | model = dict( 24 | type=gpt2_8B, 25 | checkpoint=True, 26 | ) 27 | 28 | parallel = dict( 29 | pipeline=1, 30 | tensor=dict(size=TENSOR_PARALLEL, mode='1d'), 31 | ) 32 | -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_2d.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | from titans.loss.lm_loss import GPTLMLoss 3 | from titans.model.gpt import gpt2_small 4 | from torch.optim import Adam 5 | 6 | BATCH_SIZE = 4 7 | SEQ_LEN = 1024 8 | NUM_EPOCHS = 60 9 | TENSOR_PARALLEL = 4 10 | 11 | optimizer = dict( 12 | type=Adam, 13 | lr=0.00015, 14 | weight_decay=1e-2, 15 | ) 16 | 17 | fp16 = dict( 18 | mode=AMP_TYPE.NAIVE 19 | ) 20 | 21 | loss = dict( 22 | type=GPTLMLoss, 23 | ) 24 | 25 | model = dict( 26 | type=gpt2_small, 27 | checkpoint=True, 28 | ) 29 | 30 | parallel = dict( 31 | pipeline=1, 32 | tensor=dict(size=TENSOR_PARALLEL, mode='2d'), 33 | ) 34 | -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_2p5d.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | from titans.loss.lm_loss import GPTLMLoss 3 | from titans.model.gpt import gpt2_small 4 | from torch.optim import Adam 5 | 6 | BATCH_SIZE = 4 7 | SEQ_LEN = 1024 8 | NUM_EPOCHS = 60 9 | TENSOR_PARALLEL = 8 10 | DEPTH = 2 11 | 12 | 13 | optimizer = dict( 14 | type=Adam, 15 | lr=0.00015, 16 | weight_decay=1e-2, 17 | ) 18 | 19 | fp16 = dict( 20 | mode=AMP_TYPE.NAIVE 21 | ) 22 | 23 | loss = dict( 24 | type=GPTLMLoss, 25 | ) 26 | 27 | model = dict( 28 | type=gpt2_small, 29 | checkpoint=True, 30 | ) 31 | 32 | 33 | parallel = dict( 34 | pipeline=1, 35 | tensor=dict(size=TENSOR_PARALLEL, depth=DEPTH, mode='2.5d'), 36 | ) 37 | -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_3d.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | from titans.loss.lm_loss import GPTLMLoss 3 | from titans.model.gpt import gpt2_small 4 | from torch.optim import Adam 5 | 6 | BATCH_SIZE = 4 7 | SEQ_LEN = 1024 8 | NUM_EPOCHS = 60 9 | TENSOR_PARALLEL = 8 10 | 11 | optimizer = dict( 12 | type=Adam, 13 | 
lr=0.00015, 14 | weight_decay=1e-2, 15 | ) 16 | 17 | fp16 = dict( 18 | mode=AMP_TYPE.NAIVE 19 | ) 20 | 21 | loss = dict( 22 | type=GPTLMLoss, 23 | ) 24 | 25 | model = dict( 26 | type=gpt2_small, 27 | checkpoint=True, 28 | ) 29 | 30 | parallel = dict( 31 | pipeline=1, 32 | tensor=dict(size=TENSOR_PARALLEL, mode='3d'), 33 | ) 34 | -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_pp.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | from titans.loss.lm_loss import GPTLMLoss 3 | from titans.model.gpt import gpt2_small 4 | #from model_zoo.gpt.gpt import gpt2_small_pipeline 5 | from torch.optim import Adam 6 | 7 | 8 | BATCH_SIZE = 8 9 | SEQ_LEN = 1024 10 | NUM_EPOCHS = 60 11 | HIDDEN_SIZE = 768 12 | NUM_MICRO_BATCHES = 4 13 | PIPELINE = 2 14 | 15 | optimizer = dict( 16 | type=Adam, 17 | lr=0.00015, 18 | weight_decay=1e-2, 19 | ) 20 | 21 | fp16 = dict( 22 | mode=AMP_TYPE.NAIVE 23 | ) 24 | 25 | loss = dict( 26 | type=GPTLMLoss, 27 | ) 28 | 29 | model = dict( 30 | type=gpt2_small, 31 | checkpoint=True, 32 | ) 33 | 34 | parallel = dict( 35 | pipeline=PIPELINE, 36 | tensor=dict(size=1, mode=None), 37 | ) 38 | -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_pp1d.py: -------------------------------------------------------------------------------- 1 | from titans.loss.lm_loss import GPTLMLoss 2 | from titans.loss.vocab_cross_entropy import vocab_parallel_cross_entropy 3 | from titans.model.gpt import gpt2_small 4 | from torch.optim import Adam 5 | from colossalai.amp import AMP_TYPE 6 | import torch 7 | 8 | BATCH_SIZE = 8 9 | NUM_EPOCHS = 60 10 | SEQ_LEN = 1024 11 | 12 | NUM_MICRO_BATCHES = 4 13 | HIDDEN_SIZE = 768 14 | PIPELINE = 2 15 | TENSOR_PARALLEL = 2 16 | MODE = '1d' 17 | 18 | fp16 = dict(mode=AMP_TYPE.NAIVE) 19 | 20 | parallel = dict(pipeline=PIPELINE, tensor=dict(mode=MODE, size=TENSOR_PARALLEL)) 21 | 22 | optimizer = dict( 23 | type=Adam, 24 | lr=0.00015, 25 | weight_decay=1e-2, 26 | ) 27 | 28 | model = dict( 29 | type=gpt2_small, 30 | checkpoint=True, 31 | dtype=torch.half, 32 | ) 33 | 34 | loss_fn = dict(type=vocab_parallel_cross_entropy) 35 | -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_vanilla.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | from titans.model.gpt import gpt2_small 3 | from torch.optim import Adam 4 | 5 | 6 | BATCH_SIZE = 1 7 | NUM_EPOCHS = 60 8 | SEQ_LEN = 1024 9 | 10 | optimizer = dict( 11 | type=Adam, 12 | lr=0.00015, 13 | weight_decay=1e-2, 14 | ) 15 | 16 | fp16 = dict( 17 | mode=AMP_TYPE.NAIVE 18 | ) 19 | 20 | 21 | model = dict( 22 | type=gpt2_small, 23 | checkpoint=True, 24 | ) 25 | 26 | parallel = dict( 27 | pipeline=1, 28 | tensor=dict(size=1, mode=None), 29 | ) -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_zero3.py: -------------------------------------------------------------------------------- 1 | from colossalai.nn.optimizer import HybridAdam 2 | from colossalai.zero.shard_utils import TensorShardStrategy 3 | from titans.model.gpt import gpt2_small 4 | 5 | BATCH_SIZE = 2 6 | NUM_EPOCHS = 60 7 | SEQ_LEN = 1024 8 | 9 | 10 | zero = dict( 11 | model_config=dict( 12 | tensor_placement_policy='cpu', 13 | shard_strategy=TensorShardStrategy(), 14 | 
reuse_fp16_shard=True 15 | ), 16 | optimizer_config=dict() 17 | ) 18 | 19 | 20 | optimizer = dict( 21 | type=HybridAdam, 22 | lr=0.00015, 23 | weight_decay=1e-2, 24 | ) 25 | 26 | model = dict( 27 | type=gpt2_small, 28 | checkpoint=True, 29 | ) 30 | -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_zero3_pp1d.py: -------------------------------------------------------------------------------- 1 | from colossalai.nn.optimizer import HybridAdam 2 | from colossalai.zero.shard_utils import (BucketTensorShardStrategy, 3 | TensorShardStrategy) 4 | from model import GPT2_small_pipeline_hybrid 5 | 6 | BATCH_SIZE = 8 7 | NUM_EPOCHS = 60 8 | SEQ_LEN = 1024 9 | NUM_MICRO_BATCHES = 4 10 | HIDDEN_SIZE = 768 11 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LEN, HIDDEN_SIZE) 12 | zero = dict( 13 | model_config=dict( 14 | tensor_placement_policy='cpu', 15 | shard_strategy=BucketTensorShardStrategy() 16 | ), 17 | optimizer_config=dict() 18 | ) 19 | 20 | 21 | optimizer = dict( 22 | type=HybridAdam, 23 | lr=0.00015, 24 | weight_decay=1e-2, 25 | ) 26 | 27 | model = dict( 28 | type=GPT2_small_pipeline_hybrid, 29 | checkpoint=True, 30 | num_chunks=1 31 | ) 32 | 33 | parallel = dict( 34 | pipeline=2, 35 | tensor=dict(size=2, mode='1d'), 36 | ) 37 | -------------------------------------------------------------------------------- /language/gpt/gpt3_configs/gpt3_pp1d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from titans.model.gpt import gpt3 3 | from titans.loss.vocab_cross_entropy import vocab_parallel_cross_entropy 4 | from torch.optim import Adam 5 | from colossalai.amp import AMP_TYPE 6 | 7 | 8 | 9 | BATCH_SIZE = 192 10 | NUM_EPOCHS = 60 11 | SEQ_LEN = 2048 12 | NUM_MICRO_BATCHES = 192 13 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LEN, 12288) 14 | 15 | fp16 = dict( 16 | mode=AMP_TYPE.NAIVE 17 | ) 18 | 19 | parallel = dict( 20 | pipeline=32, 21 | tensor=dict(mode='1d', size=4) 22 | ) 23 | 24 | optimizer = dict( 25 | type=Adam, 26 | lr=0.00015, 27 | weight_decay=1e-2, 28 | ) 29 | 30 | model = dict( 31 | type=gpt3, 32 | checkpoint=True, 33 | dtype=torch.half, 34 | ) 35 | 36 | loss_fn = dict(type=vocab_parallel_cross_entropy) 37 | -------------------------------------------------------------------------------- /language/gpt/gpt3_configs/gpt3_pp1d_min.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from titans.model.gpt import gpt3 3 | from titans.loss.vocab_cross_entropy import vocab_parallel_cross_entropy 4 | from torch.optim import Adam 5 | from colossalai.amp import AMP_TYPE 6 | 7 | 8 | 9 | BATCH_SIZE = 192 10 | NUM_EPOCHS = 60 11 | SEQ_LEN = 2048 12 | NUM_MICRO_BATCHES = 192 13 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LEN, 12288) 14 | 15 | fp16 = dict( 16 | mode=AMP_TYPE.NAIVE 17 | ) 18 | 19 | parallel = dict( 20 | pipeline=24, 21 | tensor=dict(mode='1d', size=4) 22 | ) 23 | 24 | optimizer = dict( 25 | type=Adam, 26 | lr=0.00015, 27 | weight_decay=1e-2, 28 | ) 29 | 30 | model = dict( 31 | type=gpt3, 32 | checkpoint=True, 33 | dtype=torch.half, 34 | ) 35 | 36 | loss_fn = dict(type=vocab_parallel_cross_entropy) 37 | -------------------------------------------------------------------------------- /language/gpt/gpt3_configs/gpt3_pp2d.py: -------------------------------------------------------------------------------- 1 | from titans.model.gpt import gpt3 2 | from torch.optim import Adam 3 | from 
colossalai.amp import AMP_TYPE 4 | import torch 5 | 6 | 7 | BATCH_SIZE = 2*48 8 | NUM_EPOCHS = 60 9 | SEQ_LEN = 2048 10 | NUM_MICRO_BATCHES = 48 11 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES // 2, SEQ_LEN, 12288 // 2) 12 | 13 | fp16 = dict( 14 | mode=AMP_TYPE.NAIVE 15 | ) 16 | 17 | parallel = dict( 18 | pipeline=24, 19 | tensor=dict(mode='2d', size=4) 20 | ) 21 | 22 | optimizer = dict( 23 | type=Adam, 24 | lr=0.00015, 25 | weight_decay=1e-2, 26 | ) 27 | 28 | model = dict( 29 | type=gpt3, 30 | checkpoint=True, 31 | dtype=torch.half, 32 | ) 33 | -------------------------------------------------------------------------------- /language/gpt/gpt3_configs/gpt3_pp2p5d.py: -------------------------------------------------------------------------------- 1 | from titans.model.gpt import gpt3 2 | from torch.optim import Adam 3 | from colossalai.amp import AMP_TYPE 4 | import torch 5 | 6 | 7 | BATCH_SIZE = 2*48 8 | NUM_EPOCHS = 60 9 | SEQ_LEN = 2048 10 | NUM_MICRO_BATCHES = 48 11 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES // 2, SEQ_LEN, 12288 // 2) 12 | 13 | fp16 = dict( 14 | mode=AMP_TYPE.NAIVE 15 | ) 16 | 17 | parallel = dict( 18 | pipeline=24, 19 | tensor=dict(mode='2.5d', depth = 1, size=4) 20 | ) 21 | 22 | optimizer = dict( 23 | type=Adam, 24 | lr=0.00015, 25 | weight_decay=1e-2, 26 | ) 27 | 28 | model = dict( 29 | type=gpt3, 30 | checkpoint=True, 31 | dtype=torch.half, 32 | ) 33 | -------------------------------------------------------------------------------- /language/gpt/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .embed import vocab_parallel_cross_entropy 2 | from .gpt1d import * 3 | from .pipeline_gpt1d import * 4 | -------------------------------------------------------------------------------- /language/gpt/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/gpt/tools/Megatron/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/language/gpt/tools/Megatron/__init__.py -------------------------------------------------------------------------------- /language/gpt/tools/Megatron/remove_group_duplicates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import json 18 | import time 19 | import sys 20 | 21 | 22 | if __name__ == '__main__': 23 | 24 | url_filename = sys.argv[1] 25 | data_filename = sys.argv[2] 26 | output_filename = sys.argv[3] 27 | 28 | urls = set() 29 | with open(url_filename, 'r') as f: 30 | for line in f: 31 | myjson = json.loads(line) 32 | for key in myjson: 33 | this_urls = myjson[key] 34 | for i in range(1, len(this_urls)): 35 | urls.add(this_urls[i]) 36 | print('will be removing {} urls'.format(len(urls)), flush=True) 37 | 38 | written_docs = 0 39 | removed_docs = 0 40 | removed_chars = 0 41 | start_time = time.time() 42 | with open(output_filename, 'wb') as fout: 43 | with open(data_filename, 'r') as fin: 44 | for line in fin: 45 | try: 46 | myjson = json.loads(line) 47 | url = myjson['url'] 48 | if url in urls: 49 | print('removing', myjson) 50 | removed_docs += 1 51 | removed_chars += len(myjson['text']) 52 | continue 53 | myjson = json.dumps(myjson, ensure_ascii=False) 54 | fout.write(myjson.encode('utf-8')) 55 | fout.write('\n'.encode('utf-8')) 56 | written_docs += 1 57 | if written_docs % 10000 == 0: 58 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 59 | '| removed: {} (char: {})'.format( 60 | time.time() - start_time, 61 | written_docs, removed_docs, removed_chars)) 62 | except Exception as e: 63 | print('[SKIPPING]', line, e) 64 | 65 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 66 | '| removed: {} (char: {})'.format( 67 | time.time() - start_time, 68 | written_docs, removed_docs, removed_chars)) 69 | print('done :-)') 70 | -------------------------------------------------------------------------------- /language/gpt/tools/Megatron/tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import sys 17 | sys.path.append('..') 18 | 19 | from gpt2_tokenization import GPT2Tokenizer 20 | 21 | 22 | class Tokenizer: 23 | 24 | def __init__(self, cache_dir=None): 25 | self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2', 26 | cache_dir=cache_dir) 27 | self.tokenizer.max_len = int(1e12) 28 | self.eod_token = self.tokenizer.encoder['<|endoftext|>'] 29 | assert self.eod_token < 65535, 'vocab size will not fit in uint16' 30 | print('> GPT2 tokenizer with {} vocab size and eod token {} ...'.format( 31 | len(self.tokenizer.encoder), self.eod_token)) 32 | 33 | def tokenize_document(self, document): 34 | tokens = self.tokenizer.encode(document) 35 | tokens.append(self.eod_token) 36 | return tokens 37 | -------------------------------------------------------------------------------- /language/gpt/tools/download/download_old.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import newspaper 3 | import os 4 | import hashlib 5 | import traceback 6 | import tldextract 7 | import tqdm 8 | from filter import should_exclude 9 | 10 | hash = hashlib.sha256 11 | 12 | try: 13 | os.mkdir('data') 14 | except FileExistsError: 15 | pass 16 | 17 | 18 | def dl(url): 19 | url = url.strip() 20 | 21 | if should_exclude(url): 22 | return 23 | 24 | ext = tldextract.extract(url) 25 | domain = '.'.join([x for x in ext if x]) 26 | 27 | fname = 'data/{}-{}.txt'.format(domain, hash(url.encode()).hexdigest()) 28 | if os.path.isfile(fname): 29 | return 30 | # print('Downloading', url) 31 | try: 32 | article = newspaper.Article(url, fetch_images=False) 33 | article.download() 34 | article.parse() 35 | except newspaper.article.ArticleException: 36 | # print('Dead link:', url) 37 | return 38 | # traceback.print_exc() 39 | 40 | text = article.text 41 | 42 | 43 | if text.strip() == '': 44 | # print('Empty') 45 | return 46 | 47 | with open(fname, 'w') as out: 48 | out.write(text) 49 | 50 | 51 | if __name__ == '__main__': 52 | p = mp.Pool(100) # num of download threads 53 | with open('urls.txt') as fh: 54 | urls = list(fh) 55 | 56 | list(tqdm.tqdm(p.imap(dl, urls), total=len(urls))) 57 | print('Done!') 58 | -------------------------------------------------------------------------------- /language/gpt/tools/download/get_urls.py: -------------------------------------------------------------------------------- 1 | import praw 2 | import psaw 3 | import tqdm 4 | import datetime 5 | 6 | 7 | api = psaw.PushshiftAPI() 8 | 9 | 10 | # all posts until the end of 2017 11 | end_time = int(datetime.datetime(2018, 1, 1).timestamp()) 12 | 13 | 14 | query = api.search_submissions(before=end_time, 15 | filter=['url', 'score'], 16 | sort='desc', 17 | score='>2', 18 | is_self=False, 19 | over_18=False) 20 | 21 | with tqdm.tqdm() as pbar: 22 | # download links from submissions 23 | with open('urls.txt', 'w') as fh: 24 | for subm in query: 25 | url = subm.url 26 | 27 | # weird issue with psaw/pushshift that breaks score=">2" 28 | if subm.score < 3: 29 | continue 30 | #print(subm.score) 31 | # pbar.write(str(datetime.datetime.fromtimestamp(subm.created_utc))) 32 | pbar.update(1) 33 | fh.write(url + '\n') 34 | fh.flush() 35 | -------------------------------------------------------------------------------- /language/gpt/tools/download/utils.py: -------------------------------------------------------------------------------- 1 | # Code taken in large part from https://github.com/jcpeterson/openwebtext 2 | 3 | 4 | import os 5 | import os.path as op 6 | import tarfile 7 
| import re 8 | import collections.abc 9 | 10 | 11 | def extract_month(url_file_name): 12 | month_re = r"(RS_.*2\d{3}-\d{2})" 13 | month = op.split(url_file_name)[-1] 14 | month = re.match(month_re, month).group() 15 | return month 16 | 17 | 18 | def chunks(l, n, s=0): 19 | """Yield successive n-sized chunks from l, skipping the first s chunks.""" 20 | if isinstance(l, collections.abc.Iterable):  # collections.Iterable was removed in Python 3.10 21 | chnk = [] 22 | for i, elem in enumerate(l): 23 | if i < s: 24 | continue 25 | 26 | chnk.append(elem) 27 | if len(chnk) == n: 28 | yield chnk 29 | chnk = [] 30 | if len(chnk) != 0: 31 | yield chnk 32 | 33 | else: 34 | for i in range(s, len(l), n): 35 | yield l[i : i + n] 36 | 37 | 38 | def extract_archive(archive_fp, outdir="."): 39 | with tarfile.open(archive_fp, "r") as tar: 40 | tar.extractall(outdir) 41 | return outdir 42 | 43 | 44 | def mkdir(fp): 45 | try: 46 | os.makedirs(fp) 47 | except FileExistsError: 48 | pass 49 | return fp 50 | 51 | 52 | def linecount(filename): 53 | f = open(filename, 'rb') 54 | lines = 0 55 | buf_size = 1024 * 1024 56 | read_f = f.raw.read 57 | 58 | buf = read_f(buf_size) 59 | while buf: 60 | lines += buf.count(b'\n') 61 | buf = read_f(buf_size) 62 | 63 | return lines 64 | -------------------------------------------------------------------------------- /language/knowledge_graph_embedding/config.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | CONFIG = dict( 4 | fp16=dict( 5 | mode=AMP_TYPE.TORCH 6 | ) 7 | ) 8 | -------------------------------------------------------------------------------- /language/knowledge_graph_embedding/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/opt/benchmark.sh: -------------------------------------------------------------------------------- 1 | export BS=16 2 | export MEMCAP=0 3 | export MODEL="6.7b" 4 | export GPUNUM=1 5 | 6 | for MODEL in "6.7b" "13b" "1.3b" 7 | do 8 | for GPUNUM in 8 1 9 | do 10 | for BS in 16 24 32 8 11 | do 12 | for MEMCAP in 0 40 13 | do 14 | pkill -9 torchrun 15 | pkill -9 python 16 | 17 | bash ./run_clm.sh $BS $MEMCAP $MODEL $GPUNUM 18 | done 19 | done 20 | done 21 | done -------------------------------------------------------------------------------- /language/opt/colossalai_zero.py: -------------------------------------------------------------------------------- 1 | from colossalai.zero.shard_utils import TensorShardStrategy 2 | 3 | zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(), 4 | tensor_placement_policy="auto", 5 | reuse_fp16_shard=True), 6 | optimizer_config=dict(gpu_margin_mem_ratio=0.8, initial_scale=16384)) 7 | -------------------------------------------------------------------------------- /language/opt/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | datasets >= 1.8.0 4 | sentencepiece != 0.1.92 5 | protobuf 6 | -------------------------------------------------------------------------------- /language/opt/run_clm.sh: -------------------------------------------------------------------------------- 1 | export BS=${1:-16} 2 | export MEMCAP=${2:-0} 3 | export MODEL=${3:-"1.3b"} 4 | export GPUNUM=${4:-1} 5 | # export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128 6 | 7 | # make directory for logs 8 | mkdir -p ./logs 9 | 10 | # env
PYTORCH_NO_CUDA_MEMORY_CACHING=1 11 | torchrun \ 12 | --nproc_per_node ${GPUNUM} \ 13 | --master_port 19198 \ 14 | run_clm.py \ 15 | --dataset_name wikitext \ 16 | --dataset_config_name wikitext-2-raw-v1 \ 17 | --model_name_or_path facebook/opt-${MODEL} \ 18 | --output_dir $PWD \ 19 | --mem_cap ${MEMCAP} \ 20 | --per_device_train_batch_size ${BS} 2>&1 | tee ./logs/colo_${MODEL}_bs_${BS}_cap_${MEMCAP}_gpu_${GPUNUM}.log 21 | 22 | -------------------------------------------------------------------------------- /language/opt/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | 4 | 5 | def memory_cap(size_in_GB): 6 | print(f"use only {size_in_GB} GB of CUDA memory") 7 | assert dist.is_initialized(), "memory_cap must be used after dist init" 8 | local_rank = dist.get_rank() 9 | cuda_capacity = torch.cuda.get_device_properties(local_rank).total_memory 10 | size_in_B = (size_in_GB * 1024**3) 11 | if size_in_B > cuda_capacity: 12 | print(f'memory_cap is useless since {cuda_capacity / 1024**3} is less than {size_in_GB}') 13 | return 14 | fraction = (size_in_GB * 1024**3) / cuda_capacity 15 | print(f'mem fraction is {fraction}') 16 | torch.cuda.set_per_process_memory_fraction(fraction, local_rank) 17 | 18 | 19 | def colo_memory_cap(size_in_GB): 20 | from colossalai.utils import colo_set_process_memory_fraction, colo_device_memory_capacity 21 | from colossalai.utils import get_current_device 22 | cuda_capacity = colo_device_memory_capacity(get_current_device()) 23 | if size_in_GB * (1024**3) < cuda_capacity: 24 | colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity) 25 | print("Using {} GB of GPU memory".format(size_in_GB)) 26 | 27 | 28 | if __name__ == '__main__': 29 | memory_cap(40) 30 | -------------------------------------------------------------------------------- /language/roberta/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This repo introduces how to pretrain a Chinese RoBERTa-large from scratch. 3 | 4 | ## 1. Corpus Preprocessing 5 | ```bash 6 | cd preprocessing 7 | ``` 8 | Following the `README.md`, preprocess the original corpus into h5py+numpy format. 9 | 10 | ## 2. Pretrain 11 | 12 | ```bash 13 | cd pretraining 14 | ``` 15 | Following the `README.md`, load the output generated by preprocessing to pretrain the model. 16 | 17 | ## 3. Finetune 18 | 19 | The checkpoint produced by this repo can replace `pytorch_model.bin` from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use transformers from HuggingFace to fine-tune downstream applications.
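As a concrete illustration of step 3, a minimal sketch of swapping the checkpoint into a local copy of the HuggingFace model might look like the following. The paths (`./ckpt/1.pt`, `./chinese-roberta-wwm-ext-large`) are placeholders, and treating the saved file as a plain state dict is an assumption based on the `--load_pretrain_model` flag used by the pretraining scripts; adjust both to your setup.

```python
# Hypothetical sketch: paths are placeholders, and the checkpoint layout
# (a plain state dict saved with torch.save) is an assumption.
import torch
from transformers import BertForSequenceClassification, BertTokenizer

local_dir = './chinese-roberta-wwm-ext-large'  # local clone of the HF model repo

# Overwrite the downloaded weights with the pretrained checkpoint from this repo.
state_dict = torch.load('./ckpt/1.pt', map_location='cpu')
torch.save(state_dict, f'{local_dir}/pytorch_model.bin')

# Load for a downstream task, then fine-tune with a standard transformers loop.
tokenizer = BertTokenizer.from_pretrained(local_dir)
model = BertForSequenceClassification.from_pretrained(local_dir, num_labels=2)
```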
-------------------------------------------------------------------------------- /language/roberta/configs/colossalai_ddp.py: -------------------------------------------------------------------------------- 1 | from colossalai.zero.shard_utils import TensorShardStrategy 2 | from colossalai.nn.optimizer import FusedAdam 3 | 4 | clip_grad_norm = 1.0 5 | -------------------------------------------------------------------------------- /language/roberta/configs/colossalai_zero.py: -------------------------------------------------------------------------------- 1 | from colossalai.zero.shard_utils import TensorShardStrategy 2 | from colossalai.nn.optimizer import FusedAdam 3 | 4 | # fp16 = dict( 5 | # mode=AMP_TYPE.TORCH, 6 | # ) 7 | 8 | # seed = 2 9 | zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(), 10 | reduce_scatter_bucket_size_mb=25, 11 | fp32_reduce_scatter=False, 12 | tensor_placement_policy="cuda", 13 | gradient_predivide_factor=1.0, 14 | reuse_fp16_shard=False), 15 | optimizer_config=dict(gpu_margin_mem_ratio=0.8, 16 | initial_scale=2**5, 17 | min_scale=1, 18 | growth_factor=2, 19 | backoff_factor=0.5, 20 | growth_interval=1000, 21 | hysteresis=2, 22 | max_scale=2**32)) 23 | 24 | # gradient_accumulation = 4 25 | clip_grad_norm = 1.0 26 | optimizer = dict( 27 | type=FusedAdam, 28 | lr=0.00015, 29 | weight_decay=1e-2, 30 | ) 31 | 32 | # 64433 -------------------------------------------------------------------------------- /language/roberta/preprocessing/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++14 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = mask 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /language/roberta/pretraining/README.md: -------------------------------------------------------------------------------- 1 | # Pretraining 2 | 1. Pretrain RoBERTa by running the script below. Detailed parameter descriptions can be found in arguments.py. `data_path_prefix` is the absolute path specifying the output of preprocessing. 3 | 4 | ```bash 5 | bash run_pretrain.sh 6 | ``` 7 | * `--hostfile`: servers' host names from /etc/hosts 8 | * `--include`: the servers that will be used 9 | * `--nproc_per_node`: number of processes (GPUs) on each server 10 | * `--data_path_prefix`: absolute location of train data, e.g., /h5/0.h5 11 | * `--eval_data_path_prefix`: absolute location of eval data 12 | * `--tokenizer_path`: tokenizer path containing the huggingface tokenizer.json, e.g. /tokenizer/tokenizer.json 13 | * `--bert_config`: config.json which represents the model 14 | * `--mlm`: model type of backbone, bert or deberta_v2 15 | 16 | 2. If resuming training from an earlier checkpoint, run the script below.
17 | 18 | ```shell 19 | bash run_pretrain_resume.sh 20 | ``` 21 | * `--resume_train`: whether to resume training 22 | * `--load_pretrain_model`: absolute path which contains model checkpoint 23 | * `--load_optimizer_lr`: absolute path which contains optimizer checkpoint 24 | 25 | -------------------------------------------------------------------------------- /language/roberta/pretraining/bert_dataset_provider.py: -------------------------------------------------------------------------------- 1 | class BertDatasetProviderInterface: 2 | def get_shard(self, index, shuffle=True): 3 | raise NotImplementedError 4 | 5 | def release_shard(self, index): 6 | raise NotImplementedError 7 | 8 | def prefetch_shard(self, index): 9 | raise NotImplementedError 10 | 11 | def get_batch(self, batch_iter): 12 | raise NotImplementedError 13 | 14 | def prefetch_batch(self): 15 | raise NotImplementedError 16 | -------------------------------------------------------------------------------- /language/roberta/pretraining/hostfile: -------------------------------------------------------------------------------- 1 | GPU001 2 | GPU002 3 | GPU003 4 | GPU004 5 | GPU005 6 | GPU006 7 | GPU007 8 | GPU008 9 | GPU009 10 | GPU010 11 | -------------------------------------------------------------------------------- /language/roberta/pretraining/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | __all__ = ['LossForPretraining'] 4 | 5 | 6 | class LossForPretraining(torch.nn.Module): 7 | 8 | def __init__(self, vocab_size): 9 | super(LossForPretraining, self).__init__() 10 | self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1) 11 | self.vocab_size = vocab_size 12 | 13 | def forward(self, prediction_scores, masked_lm_labels, next_sentence_labels=None): 14 | masked_lm_loss = self.loss_fn(prediction_scores.view(-1, self.vocab_size), masked_lm_labels.view(-1)) 15 | # next_sentence_loss = self.loss_fn(seq_relationship_score.view(-1, 2), next_sentence_labels.view(-1)) 16 | total_loss = masked_lm_loss #+ next_sentence_loss 17 | return total_loss 18 | -------------------------------------------------------------------------------- /language/roberta/pretraining/run_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | root_path=$PWD 4 | PY_FILE_PATH="$root_path/run_pretraining.py" 5 | 6 | tensorboard_path="$root_path/tensorboard" 7 | log_path="$root_path/exp_log" 8 | ckpt_path="$root_path/ckpt" 9 | 10 | colossal_config="$root_path/../configs/colossalai_ddp.py" 11 | 12 | mkdir -p $tensorboard_path 13 | mkdir -p $log_path 14 | mkdir -p $ckpt_path 15 | 16 | export PYTHONPATH=$PWD 17 | 18 | env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \ 19 | --include GPU002,GPU003,GPU004,GPU007 \ 20 | --nproc_per_node=8 \ 21 | $PY_FILE_PATH \ 22 | --master_addr GPU007 \ 23 | --master_port 20024 \ 24 | --lr 2.0e-4 \ 25 | --train_micro_batch_size_per_gpu 190 \ 26 | --eval_micro_batch_size_per_gpu 20 \ 27 | --epoch 15 \ 28 | --data_path_prefix /h5 \ 29 | --eval_data_path_prefix /eval_h5 \ 30 | --tokenizer_path /roberta \ 31 | --bert_config /roberta/config.json \ 32 | --tensorboard_path $tensorboard_path \ 33 | --log_path $log_path \ 34 | --ckpt_path $ckpt_path \ 35 | --colossal_config $colossal_config \ 36 | --log_interval 50 \ 37 | --mlm bert \ 38 | --wandb \ 39 | --checkpoint_activations \ 40 | -------------------------------------------------------------------------------- 
/language/roberta/pretraining/run_pretrain_resume.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | root_path=$PWD 4 | PY_FILE_PATH="$root_path/run_pretraining.py" 5 | 6 | tensorboard_path="$root_path/tensorboard" 7 | log_path="$root_path/exp_log" 8 | ckpt_path="$root_path/ckpt" 9 | 10 | colossal_config="$root_path/../configs/colossalai_ddp.py" 11 | 12 | mkdir -p $tensorboard_path 13 | mkdir -p $log_path 14 | mkdir -p $ckpt_path 15 | 16 | export PYTHONPATH=$PWD 17 | 18 | env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \ 19 | --include GPU002,GPU003,GPU004,GPU007 \ 20 | --nproc_per_node=8 \ 21 | $PY_FILE_PATH \ 22 | --master_addr GPU007 \ 23 | --master_port 20024 \ 24 | --lr 2.0e-4 \ 25 | --train_micro_batch_size_per_gpu 190 \ 26 | --eval_micro_batch_size_per_gpu 20 \ 27 | --epoch 15 \ 28 | --data_path_prefix /h5 \ 29 | --eval_data_path_prefix /eval_h5 \ 30 | --tokenizer_path /roberta \ 31 | --bert_config /roberta/config.json \ 32 | --tensorboard_path $tensorboard_path \ 33 | --log_path $log_path \ 34 | --ckpt_path $ckpt_path \ 35 | --colossal_config $colossal_config \ 36 | --log_interval 50 \ 37 | --mlm bert \ 38 | --wandb \ 39 | --checkpoint_activations \ 40 | --resume_train \ 41 | --load_pretrain_model /ckpt/1.pt \ 42 | --load_optimizer_lr /ckpt/1.op_lrs \ 43 | -------------------------------------------------------------------------------- /language/roberta/pretraining/utils/WandbLog.py: -------------------------------------------------------------------------------- 1 | import time 2 | import wandb 3 | import os 4 | from torch.utils.tensorboard import SummaryWriter 5 | 6 | class WandbLog: 7 | 8 | @classmethod 9 | def init_wandb(cls, project, notes=None, name=None, config=None):  # name defaults to the current time at call time, not import time 10 | wandb.init(project=project, notes=notes, name=name or time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), config=config) 11 | 12 | @classmethod 13 | def log(cls, result, model=None, gradient=None): 14 | wandb.log(result) 15 | 16 | if model: 17 | wandb.watch(model) 18 | 19 | if gradient: 20 | wandb.watch(gradient) 21 | 22 | 23 | class TensorboardLog: 24 | 25 | def __init__(self, location, name=None, config=None): 26 | if not os.path.exists(location): 27 | os.mkdir(location) 28 | self.writer = SummaryWriter(location, comment=name or time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) 29 | 30 | def log_train(self, result, step): 31 | for k, v in result.items(): 32 | self.writer.add_scalar(f'{k}/train', v, step) 33 | 34 | def log_eval(self, result, step): 35 | for k, v in result.items(): 36 | self.writer.add_scalar(f'{k}/eval', v, step) 37 | 38 | def log_zeroshot(self, result, step): 39 | for k, v in result.items(): 40 | self.writer.add_scalar(f'{k}_acc/eval', v, step) 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /language/roberta/pretraining/utils/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import torch.distributed as dist 4 | 5 | logging.basicConfig( 6 | format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 7 | datefmt='%m/%d/%Y %H:%M:%S', 8 | level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class Logger(): 13 | def __init__(self, log_path, cuda=False): 14 | self.logger = logging.getLogger(__name__) 15 | self.cuda = cuda 16 | self.log_path = log_path 17 | 18 | 19 | def info(self, message, log_=True, print_=True, *args, **kwargs): 20 | if (self.cuda and
dist.get_rank() == 0) or not self.cuda: 21 | if print_: 22 | self.logger.info(message, *args, **kwargs) 23 | 24 | if log_: 25 | with open(self.log_path, 'a+') as f_log: 26 | f_log.write(message + '\n') 27 | 28 | 29 | def error(self, message, *args, **kwargs): 30 | self.logger.error(message, *args, **kwargs) 31 | -------------------------------------------------------------------------------- /language/roberta/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch >= 1.8.0 2 | torchvision 3 | colossalai 4 | titans 5 | datasets >= 1.8.0 6 | sentencepiece != 0.1.92 7 | protobuf 8 | transformers 9 | Pillow 10 | tqdm 11 | ipdb 12 | numpy 13 | einops 14 | pyarrow 15 | sacred 16 | pandas 17 | git+https://github.com/rwightman/pytorch-image-models.git 18 | psutil 19 | tensorboard 20 | packaging 21 | -------------------------------------------------------------------------------- /utils/checkpoint/readme.md: -------------------------------------------------------------------------------- 1 | # Model Checkpoint 2 | 3 | Examples of how to use model checkpointing. 4 | 5 | ## How to run 6 | We use `colossalai.launch_from_torch` as an example here. Before running, you should `export DATA=/path/to/cifar-10`. 7 | 8 | If you are training with single node multiple GPUs: 9 | ```shell 10 | # If your torch >= 1.10.0 11 | torchrun --standalone --nproc_per_node <num_gpus> save_engine.py 12 | 13 | # If your torch >= 1.9.0 14 | python -m torch.distributed.run --standalone --nproc_per_node=<num_gpus> save_engine.py 15 | 16 | # Otherwise 17 | python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr <master_addr> --master_port 29500 save_engine.py 18 | ``` 19 | 20 | If you are using multiple nodes, see [torchrun](https://pytorch.org/docs/stable/elastic/run.html#launcher-api).
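To load a checkpoint saved by these examples, a minimal restoring sketch might look like the following. It mirrors `save_trainer.py` below, which saves to `vit_cifar.pt` via `hooks.SaveCheckpointHook`; the fallback to a `'model'` key is an assumption about the checkpoint layout, so inspect the file if loading fails in your ColossalAI version.

```python
# Minimal sketch of restoring a checkpoint written by SaveCheckpointHook.
# The 'model' key is an assumption; print ckpt.keys() to check your version.
import torch
from model_zoo.vit import vit_tiny_patch4_32

model = vit_tiny_patch4_32()
ckpt = torch.load('vit_cifar.pt', map_location='cpu')
# Handle either a raw state dict or a dict wrapping it under 'model'.
state_dict = ckpt.get('model', ckpt) if isinstance(ckpt, dict) else ckpt
model.load_state_dict(state_dict)
model.eval()
```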
-------------------------------------------------------------------------------- /utils/checkpoint/save_trainer.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import torch 3 | import os 4 | 5 | import colossalai.nn as col_nn 6 | from colossalai.utils import get_dataloader, MultiTimer 7 | from colossalai.logging import get_dist_logger 8 | from colossalai.core import global_context as gpc 9 | from torch.nn.modules import CrossEntropyLoss 10 | from torchvision import transforms 11 | from torchvision.datasets import CIFAR10 12 | from colossalai.trainer import Trainer, hooks 13 | from model_zoo.vit import vit_tiny_patch4_32 14 | 15 | def build_cifar(batch_size): 16 | transform_train = transforms.Compose([ 17 | transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10), 18 | transforms.ToTensor(), 19 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 20 | ]) 21 | transform_test = transforms.Compose([ 22 | transforms.ToTensor(), 23 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 24 | ]) 25 | 26 | train_dataset = CIFAR10(root=os.environ['DATA'], train=True, download=True, transform=transform_train) 27 | test_dataset = CIFAR10(root=os.environ['DATA'], train=False, download=True, transform=transform_test) 28 | train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=batch_size, pin_memory=True) 29 | test_dataloader = get_dataloader(dataset=test_dataset, batch_size=batch_size, pin_memory=True) 30 | return train_dataloader, test_dataloader 31 | 32 | 33 | BATCH_SIZE = 128 34 | NUM_EPOCHS = 10 35 | CONFIG = dict() 36 | 37 | 38 | def train(): 39 | args = colossalai.get_default_parser().parse_args() 40 | colossalai.launch_from_torch(backend=args.backend, config=CONFIG) 41 | 42 | logger = get_dist_logger() 43 | model = vit_tiny_patch4_32() 44 | criterion = CrossEntropyLoss() 45 | optimizer = torch.optim.Adam(model.parameters(), lr=0.001) 46 | train_dataloader, test_dataloader = build_cifar(BATCH_SIZE) 47 | 48 | engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model, optimizer, criterion, 49 | train_dataloader, test_dataloader) 50 | timer = MultiTimer() 51 | 52 | trainer = Trainer(engine=engine, timer=timer, logger=logger) 53 | 54 | hook_list = [ 55 | hooks.LossHook(), 56 | hooks.AccuracyHook(col_nn.metric.Accuracy()), 57 | hooks.LogMetricByEpochHook(logger), 58 | hooks.SaveCheckpointHook(1, 'vit_cifar.pt', model) 59 | ] 60 | 61 | trainer.fit(train_dataloader=train_dataloader, 62 | epochs=NUM_EPOCHS, 63 | test_dataloader=test_dataloader, 64 | test_interval=1, 65 | hooks=hook_list, 66 | display_progress=True) 67 | 68 | 69 | if __name__ == '__main__': 70 | train() 71 | --------------------------------------------------------------------------------