├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.yml │ ├── config.yml │ ├── documentation.yml │ └── feature_request.yml └── workflows │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .style.yapf ├── LICENSE ├── README.md ├── benchmark └── zero │ ├── README.md │ ├── colossalai_utils │ ├── gpt2_config.json │ ├── gpt2_config_v1.json │ ├── utils.py │ └── vit_config.json │ ├── common │ ├── gpt2.py │ ├── train.py │ ├── utils.py │ └── vit.py │ ├── deepspeed_utils │ ├── gpt2_config.json │ ├── utils.py │ └── vit_config.json │ ├── fairscale_utils │ ├── gpt2_config.json │ ├── utils.py │ └── vit_config.json │ ├── patrickstar_utils │ ├── gpt2_config.json │ ├── utils.py │ └── vit_config.json │ ├── requirement.txt │ ├── run.py │ └── torch_utils │ ├── gpt2_config.json │ ├── utils.py │ └── vit_config.json ├── features ├── amp │ ├── README.md │ ├── config │ │ ├── config_AMP_apex.py │ │ ├── config_AMP_naive.py │ │ ├── config_AMP_torch.py │ │ └── config_fp32.py │ ├── requirements.txt │ ├── scripts │ │ └── train_slurm.sh │ └── train.py ├── colotensor │ ├── README.md │ ├── gpt_megatron.py │ └── requirements.txt ├── gradient_accumulation │ ├── README.md │ ├── config.py │ ├── requirements.txt │ └── train.py ├── gradient_clipping │ ├── README.md │ ├── config.py │ ├── requirements.txt │ └── train.py ├── pipeline_parallel │ ├── .init │ ├── README.md │ ├── requirements.txt │ ├── resnet.py │ └── rpc │ │ ├── gpt │ │ ├── 1f1b.py │ │ ├── README.md │ │ ├── baseline.py │ │ └── dataset │ │ │ ├── webtext.py │ │ │ └── yuan.py │ │ ├── opt │ │ ├── 1f1b.py │ │ ├── README.md │ │ └── check │ │ │ └── opt_avail.py │ │ └── resnet │ │ ├── 1f1b.py │ │ ├── README.md │ │ ├── chimera.py │ │ └── fill_drain.py ├── tensor_parallel │ ├── README.md │ ├── configs │ │ ├── tp_1d.py │ │ ├── tp_2d.py │ │ ├── tp_2p5d.py │ │ └── tp_3d.py │ ├── requirements.txt │ └── run.py └── zero │ ├── README.md │ ├── requirements.txt │ ├── train.py │ └── train_v2.py ├── image ├── detr-debug │ ├── README.md │ ├── configs │ │ └── detr_1d.py │ ├── datasets │ │ ├── __init__.py │ │ ├── coco.py │ │ ├── coco_eval.py │ │ └── transforms.py │ ├── engine.py │ ├── models │ │ ├── __init__.py │ │ ├── backbone.py │ │ ├── detr.py │ │ ├── matcher.py │ │ ├── position_encoding.py │ │ └── transformer.py │ ├── requirements.txt │ ├── results │ │ ├── log.txt │ │ └── loss_curve.jpg │ ├── run_train.py │ └── util │ │ ├── __init__.py │ │ ├── box_ops.py │ │ ├── misc.py │ │ └── plot_utils.py ├── detr │ ├── README.md │ ├── config.py │ ├── datasets │ │ ├── __init__.py │ │ ├── coco.py │ │ ├── coco_eval.py │ │ └── transforms.py │ ├── engine.py │ ├── main.py │ ├── models │ │ ├── __init__.py │ │ ├── backbone.py │ │ ├── detr.py │ │ ├── matcher.py │ │ ├── position_encoding.py │ │ ├── segmentation.py │ │ └── transformer.py │ ├── requirements.txt │ └── util │ │ ├── __init__.py │ │ ├── box_ops.py │ │ ├── misc.py │ │ └── plot_utils.py ├── diffusion │ ├── LICENSE │ ├── README.md │ └── requirements.txt ├── mae │ ├── .gitignore │ ├── README.md │ ├── config │ │ ├── pretrain.py │ │ └── pretrain_1d_tp2.py │ ├── main_pretrain.py │ ├── models_mae_tp.py │ ├── requirements.txt │ └── util │ │ ├── crop.py │ │ ├── misc.py │ │ └── pos_embed.py ├── mlpmixer │ ├── README.md │ ├── colossalAI_mlpmixer.py │ ├── configs │ │ └── MlpMixer_vanilla.py │ ├── requirements.txt │ ├── train_data.py │ └── train_pipline.py ├── moe │ ├── README.md │ ├── config.py │ ├── requirements.txt │ └── train.py ├── resnet │ ├── README.md │ ├── auto_parallel │ │ ├── README.md │ │ └── auto_parallel_demo.py │ ├── config.py │ ├── 
requirements.txt │ ├── resnet.py │ └── train.py ├── simclr │ ├── NT_Xentloss.py │ ├── README.md │ ├── augmentation.py │ ├── config.py │ ├── le_config.py │ ├── models │ │ ├── Backbone.py │ │ ├── linear_eval.py │ │ └── simclr.py │ ├── myhooks.py │ ├── requirements.txt │ ├── results │ │ ├── embedding.npz │ │ ├── linear_eval_acc.png │ │ ├── linear_eval_loss.png │ │ ├── ssl_loss.png │ │ ├── test_tsne.png │ │ └── train_tsne.png │ ├── train.sh │ ├── train_linear.py │ ├── train_simclr.py │ └── visualization.py ├── vilt │ ├── .gitignore │ ├── README.md │ ├── configs.py │ ├── models │ │ └── vilt.py │ ├── prepare_dataset.sh │ ├── requirements.txt │ ├── run.py │ ├── run.sh │ ├── schedule.py │ └── utils │ │ ├── base_dataset.py │ │ ├── config.py │ │ ├── dataloader.py │ │ ├── datamodule_base.py │ │ ├── heads.py │ │ ├── makearrow.py │ │ ├── objectives.py │ │ ├── transforms │ │ ├── __init__.py │ │ ├── pixelbert.py │ │ ├── randaug.py │ │ └── utils.py │ │ └── write_coco_karpathy.py └── vision_transformer │ ├── colo_vit │ ├── README.md │ ├── configs │ │ └── vit_1d_tp2.py │ ├── requirements.txt │ ├── run.sh │ ├── test_vit.py │ ├── train.py │ ├── utils │ │ ├── dummy_data_generator.py │ │ └── util.py │ └── vit.py │ ├── data_parallel │ ├── README.md │ ├── config.py │ ├── mixup.py │ ├── myhooks.py │ ├── requirements.txt │ ├── results │ │ ├── acc.jpeg │ │ └── loss.jpeg │ ├── scripts │ │ └── train_slurm.sh │ ├── train.py │ └── train_with_cifar10.py │ └── hybrid_parallel │ ├── README.md │ ├── configs │ ├── vit_1d_tp2_pp2.py │ ├── vit_1d_tp4_pp16.py │ ├── vit_2d_tp4_pp16.py │ ├── vit_2p5d_tp4_pp16.py │ ├── vit_3d_tp8_pp8.py │ └── vit_pipeline.py │ ├── model │ ├── __init__.py │ └── vit.py │ ├── requirements.txt │ ├── train_with_cifar10.py │ ├── train_with_engine.py │ └── train_with_trainer.py ├── language ├── DeepNet │ ├── README.md │ ├── dataset │ │ └── webtext.py │ ├── decoder_configs │ │ └── deepnet_pp1d.py │ ├── requirements.txt │ └── train_deepnet_decoder.py ├── bert │ ├── colotensor │ │ ├── README.md │ │ ├── configs │ │ │ └── bert_base_tp1d.py │ │ ├── dataset │ │ │ ├── __init__.py │ │ │ └── wikitext.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ └── hfmodel.py │ │ ├── requirements.txt │ │ └── train.py │ ├── hybrid_parallel │ │ ├── README.md │ │ ├── colossalai_utils │ │ │ ├── bert_config_pp.json │ │ │ ├── bert_config_tp1d.json │ │ │ ├── bert_config_tp1dpp.json │ │ │ ├── bert_config_tp2d.json │ │ │ ├── bert_config_tp2p5d.json │ │ │ ├── bert_config_tp3d.json │ │ │ ├── bert_config_zero.json │ │ │ ├── bert_config_zerotppp.json │ │ │ ├── model_zoo │ │ │ │ ├── __init__.py │ │ │ │ └── colo_bert.py │ │ │ ├── requirement.txt │ │ │ └── utils.py │ │ ├── common │ │ │ ├── helper.py │ │ │ └── train.py │ │ ├── requirements.txt │ │ └── run.py │ ├── preprocessing │ │ ├── .gitignore │ │ ├── README.md │ │ ├── pretrain_preprocess.sh │ │ └── requirements.txt │ ├── requirements.txt │ ├── sequene_parallel │ │ ├── README.md │ │ ├── config.py │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── bert_helper.py │ │ │ ├── datasets │ │ │ │ ├── Makefile │ │ │ │ ├── __init__.py │ │ │ │ ├── bert_dataset.py │ │ │ │ ├── blendable_dataset.py │ │ │ │ ├── builder.py │ │ │ │ ├── data_samplers.py │ │ │ │ ├── dataset_utils.py │ │ │ │ ├── helpers.cpp │ │ │ │ ├── ict_dataset.py │ │ │ │ ├── indexed_dataset.py │ │ │ │ └── test │ │ │ │ │ ├── test_indexed_dataset.py │ │ │ │ │ └── test_preprocess_data.sh │ │ │ └── tokenizer │ │ │ │ ├── __init__.py │ │ │ │ ├── bert_tokenization.py │ │ │ │ └── tokenizer.py │ │ ├── loss_func │ │ │ ├── __init__.py │ │ │ ├── bert_loss.py │ │ 
│ ├── cross_entropy.py │ │ │ └── utils.py │ │ ├── lr_scheduler │ │ │ ├── __init__.py │ │ │ └── annealing_lr.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── bert.py │ │ │ └── layers │ │ │ │ ├── __init__.py │ │ │ │ ├── bert_layer.py │ │ │ │ ├── dropout.py │ │ │ │ ├── embedding.py │ │ │ │ ├── head.py │ │ │ │ ├── init_method.py │ │ │ │ ├── linear.py │ │ │ │ ├── mlp.py │ │ │ │ ├── pooler.py │ │ │ │ └── preprocess.py │ │ ├── requirements.txt │ │ └── train.py │ └── zero │ │ ├── .gitignore │ │ ├── README.md │ │ ├── configs │ │ ├── bert_base.json │ │ ├── colossalai_amp.py │ │ └── colossalai_zero.py │ │ ├── finetuning │ │ └── glue │ │ │ ├── __init__.py │ │ │ ├── arguments.py │ │ │ ├── data.py │ │ │ ├── main.py │ │ │ ├── metrics.py │ │ │ ├── processors.py │ │ │ └── utils.py │ │ ├── pretraining │ │ ├── arguments.py │ │ ├── loss.py │ │ ├── pretrain_utils.py │ │ └── run_pretraining.py │ │ ├── requirements.txt │ │ └── scripts │ │ ├── download_finetune_dataset.sh │ │ ├── run_finetune_glue.sh │ │ └── run_pretrain.sh ├── gpt │ ├── README.md │ ├── dataset │ │ ├── webtext.py │ │ └── yuan.py │ ├── gpt2_configs │ │ ├── gpt2_1d.py │ │ ├── gpt2_2d.py │ │ ├── gpt2_2p5d.py │ │ ├── gpt2_3d.py │ │ ├── gpt2_pp.py │ │ ├── gpt2_pp1d.py │ │ ├── gpt2_vanilla.py │ │ ├── gpt2_zero3.py │ │ └── gpt2_zero3_pp1d.py │ ├── gpt3_configs │ │ ├── gpt3_pp1d.py │ │ ├── gpt3_pp1d_min.py │ │ ├── gpt3_pp2d.py │ │ └── gpt3_pp2p5d.py │ ├── model │ │ ├── __init__.py │ │ ├── embed.py │ │ ├── gpt1d.py │ │ └── pipeline_gpt1d.py │ ├── requirements.txt │ ├── tools │ │ ├── LSH │ │ │ └── cMinhash.cpp │ │ ├── Megatron │ │ │ ├── __init__.py │ │ │ ├── blacklist_urls.py │ │ │ ├── cleanup_dataset.py │ │ │ ├── cleanup_fix_dataset.py │ │ │ ├── find_duplicates.py │ │ │ ├── gpt2_tokenization.py │ │ │ ├── group_duplicate_url.py │ │ │ ├── remove_group_duplicates.py │ │ │ └── tokenizer.py │ │ └── download │ │ │ ├── download.py │ │ │ ├── download_old.py │ │ │ ├── filter.py │ │ │ ├── get_urls.py │ │ │ ├── scrapers.py │ │ │ └── utils.py │ └── train_gpt.py ├── knowledge_graph_embedding │ ├── README.md │ ├── config.py │ ├── dataloader │ │ └── dataloader.py │ ├── requirements.txt │ └── train.py ├── opt │ ├── README.md │ ├── benchmark.sh │ ├── colossalai_zero.py │ ├── requirements.txt │ ├── run_clm.py │ ├── run_clm.sh │ └── utils.py └── roberta │ ├── README.md │ ├── configs │ ├── colossalai_ddp.py │ └── colossalai_zero.py │ ├── preprocessing │ ├── Makefile │ ├── README.md │ ├── get_mask.py │ ├── mask.cpp │ ├── sentence_split.py │ └── tokenize_mask.py │ ├── pretraining │ ├── README.md │ ├── arguments.py │ ├── bert_dataset_provider.py │ ├── evaluation.py │ ├── hostfile │ ├── loss.py │ ├── model │ │ ├── bert.py │ │ └── deberta_v2.py │ ├── nvidia_bert_dataset_provider.py │ ├── pretrain_utils.py │ ├── run_pretrain.sh │ ├── run_pretrain_resume.sh │ ├── run_pretraining.py │ └── utils │ │ ├── WandbLog.py │ │ ├── exp_util.py │ │ ├── global_vars.py │ │ └── logger.py │ └── requirements.txt ├── requirements.txt └── utils └── checkpoint ├── load.py ├── readme.md ├── save_engine.py └── save_trainer.py /.github/ISSUE_TEMPLATE/bug-report.yml: -------------------------------------------------------------------------------- 1 | name: 🐛 Bug Report 2 | description: Create a report to help us reproduce and fix the bug 3 | 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: > 8 | #### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/ColossalAI/issues/new). 
9 | - type: textarea 10 | attributes: 11 | label: 🐛 Describe the bug 12 | description: | 13 | **Describe the bug** 14 | A clear and concise description of what the bug is. 15 | **To Reproduce** 16 | Steps or code snippet to reproduce the behavior. 17 | **Expected behavior** 18 | A clear and concise description of what you expected to happen. 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | placeholder: | 22 | A clear and concise description of what the bug is. 23 | validations: 24 | required: true 25 | - type: textarea 26 | attributes: 27 | label: Environment 28 | description: | 29 | Please provide the environment information, eg. CUDA/cuDNN/NCCL/Python/PyTorch version. 30 | 31 | - type: markdown 32 | attributes: 33 | value: > 34 | Thanks for contributing 🎉! 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: "😊 Discussions" 4 | url: https://github.com/hpcaitech/ColossalAI/discussions 5 | about: Ask questions and discuss with other Colossal-AI community members in our forum 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://www.colossalai.org/ 3 | 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: > 8 | #### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/ColossalAI/issues/new). 9 | - type: textarea 10 | attributes: 11 | label: 📚 The doc issue 12 | description: | 13 | **Description** What content in [Documentation](https://www.colossalai.org/) is an issue? 14 | **Location** Where is the issue location? 15 | **Expectation** What is your expected content about it? 16 | **Screenshots** If applicable, add screenshots to help explain your problem. 17 | **Suggestions** Tell us how we could improve the documentation. 18 | placeholder: | 19 | A clear and concise description of the issue. 20 | validations: 21 | required: true 22 | 23 | - type: markdown 24 | attributes: 25 | value: > 26 | Thanks for contributing 🎉! 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Suggest an idea for this project 3 | 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: > 8 | #### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/ColossalAI/issues/new). 9 | - type: textarea 10 | attributes: 11 | label: Describe the feature 12 | description: | 13 | **Is your feature request related to a problem? Please describe.** 14 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 15 | **Describe the solution you'd like** 16 | A clear and concise description of what you want to happen. 17 | **Describe alternatives you've considered** 18 | A clear and concise description of any alternative solutions or features you've considered. 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | **Suggest a potential alternative/fix** 22 | Tell us how we could improve this project. 
      placeholder: |
        A clear and concise description of your idea.
    validations:
      required: true

  - type: markdown
    attributes:
      value: >
        Thanks for contributing 🎉!
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
repos:
  - repo: https://github.com/pre-commit/mirrors-yapf
    rev: v0.32.0
    hooks:
      - id: yapf
        args: ['--style=.style.yapf', '--parallel', '--in-place']
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v13.0.1
    hooks:
      - id: clang-format
--------------------------------------------------------------------------------
/.style.yapf:
--------------------------------------------------------------------------------
[style]
based_on_style = google
spaces_before_comment = 4
split_before_logical_operator = true
column_limit = 120
--------------------------------------------------------------------------------
/benchmark/zero/README.md:
--------------------------------------------------------------------------------
# GPT2 ZeRO Benchmark
GPT2 ZeRO benchmark with data parallelism to evaluate Colossal-AI, DeepSpeed, FairScale and PatrickStar.

## Requirements
```
CUDA>=11.3
torch>=1.10.0
deepspeed>=0.5.8
fairscale>=0.4.5
patrickstar>=0.4.6
nvidia-dali>=1.8.0
```

## Setup
1. Install dependencies if you do not have them
```
pip install -r requirement.txt
```
2. Also, clone PatrickStar from GitHub
```
git clone https://github.com/Tencent/PatrickStar.git
```
3. Install PatrickStar
```
cd PatrickStar
pip install .
```
4. Add the root directory to `PYTHONPATH`
```
export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
```

## GPT Usage

1. Prepare datasets and tokenizers from HuggingFace Hub if necessary (e.g. we provide an example of training `wikitext-2`).

2.
Run benchmark with one of the systems to evaluate 46 | ``` 47 | DATA=/PATH/TO/DATASET LOG=/PATH/TO/LOG torchrun --nproc_per_node=NUM_GPUS run.py --config=CONFIG_FILE 48 | ``` 49 | -------------------------------------------------------------------------------- /benchmark/zero/colossalai_utils/gpt2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "gpt2_10b" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 1, 8 | "steps_per_epoch": 3 9 | }, 10 | "fp16": { 11 | "initial_scale": 32768, 12 | "min_scale": 1, 13 | "growth_factor": 2.0, 14 | "backoff_factor": 0.5, 15 | "growth_interval": 1000 16 | }, 17 | "gradient_clipping": 0.0, 18 | "zero": { 19 | "reduce_scatter_bucket_size_mb": 25, 20 | "fp32_reduce_scatter": false, 21 | "offload_config": { 22 | "device": "cpu" 23 | }, 24 | "reuse_fp16_shard": true, 25 | "version": 2 26 | }, 27 | "use_mem_monitor": true 28 | } -------------------------------------------------------------------------------- /benchmark/zero/colossalai_utils/gpt2_config_v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "gpt2_small" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 3, 8 | "steps_per_epoch": 10 9 | }, 10 | "fp16": { 11 | "initial_scale": 32768, 12 | "min_scale": 1, 13 | "growth_factor": 2.0, 14 | "backoff_factor": 0.5, 15 | "growth_interval": 1000 16 | }, 17 | "gradient_clipping": 0.0, 18 | "zero": { 19 | "mixed_precision": true, 20 | "reshard_after_forward": false, 21 | "offload_config": { 22 | "device": "cpu" 23 | }, 24 | "version": 1 25 | }, 26 | "use_mem_monitor": true 27 | } 28 | -------------------------------------------------------------------------------- /benchmark/zero/colossalai_utils/vit_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "vit_h" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 4, 8 | "steps_per_epoch": 10 9 | }, 10 | "fp16": { 11 | "initial_scale": 32768, 12 | "min_scale": 1, 13 | "growth_factor": 2.0, 14 | "backoff_factor": 0.5, 15 | "growth_interval": 1000 16 | }, 17 | "gradient_clipping": 1.0, 18 | "zero": { 19 | "reduce_scatter_bucket_size_mb": 25, 20 | "fp32_reduce_scatter": false, 21 | "offload_config": { 22 | "device": "cpu" 23 | }, 24 | "shard_param": true 25 | }, 26 | "use_mem_monitor": true 27 | } -------------------------------------------------------------------------------- /benchmark/zero/deepspeed_utils/gpt2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "deepspeed", 3 | "model": { 4 | "type": "gpt2_10b" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 20, 8 | "num_epochs": 2, 9 | "steps_per_epoch": 10, 10 | "synthetic": true 11 | }, 12 | "train_batch_size": 40, 13 | "steps_per_print": 2147483647, 14 | "zero_optimization": { 15 | "stage": 3, 16 | "offload_optimizer": { 17 | "device": "cpu", 18 | "pin_memory": true, 19 | "buffer_count": 4, 20 | "fast_init": false 21 | }, 22 | "offload_param": { 23 | "device": "cpu", 24 | "pin_memory": true, 25 | "buffer_count": 5, 26 | "buffer_size": 1e8, 27 | "max_in_cpu": 1e9 28 | }, 29 | "allgather_partitions": true, 30 | "allgather_bucket_size": 5e8, 31 | "overlap_comm": true, 32 | "reduce_scatter": true, 33 | "reduce_bucket_size": 5e8, 34 | "contiguous_gradients": true, 35 | "stage3_max_live_parameters": 1e9, 36 
| "stage3_max_reuse_distance": 1e9, 37 | "stage3_prefetch_bucket_size": 5e8, 38 | "stage3_param_persistence_threshold": 1e6 39 | }, 40 | "gradient_clipping": 1.0, 41 | "fp16": { 42 | "enabled": true, 43 | "loss_scale": 0, 44 | "initial_scale_power": 5, 45 | "loss_scale_window": 1000, 46 | "hysteresis": 2, 47 | "min_loss_scale": 1 48 | }, 49 | "use_mem_monitor": true 50 | } -------------------------------------------------------------------------------- /benchmark/zero/deepspeed_utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from zero.common.utils import CONFIG, get_gpu_memory_mb, print_log 3 | 4 | 5 | def init_w_ds(builder): 6 | import deepspeed 7 | 8 | config = CONFIG.copy() 9 | 10 | deepspeed.init_distributed() 11 | 12 | if CONFIG.get('gpu_mem_fraction', None) is not None: 13 | torch.cuda.set_per_process_memory_fraction(CONFIG['gpu_mem_fraction']) 14 | print_log(f'Set max GPU mem: {get_gpu_memory_mb() * CONFIG["gpu_mem_fraction"]:.2f} MB') 15 | 16 | build_data, build_model, build_loss, build_optimizer, build_scheduler = builder() 17 | 18 | train_data, test_data = build_data() 19 | 20 | with deepspeed.zero.Init(config_dict_or_path=config): 21 | model = build_model() 22 | 23 | criterion = build_loss() 24 | 25 | optimizer = build_optimizer(model.parameters()) 26 | 27 | lr_scheduler = build_scheduler(len(train_data), optimizer) 28 | 29 | model, optimizer, _, lr_scheduler = deepspeed.initialize(model=model, 30 | optimizer=optimizer, 31 | lr_scheduler=lr_scheduler, 32 | config=config) 33 | 34 | return model, train_data, test_data, criterion, optimizer, None, lr_scheduler 35 | -------------------------------------------------------------------------------- /benchmark/zero/deepspeed_utils/vit_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "deepspeed", 3 | "model": { 4 | "type": "vit_h" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 4, 8 | "steps_per_epoch": 10 9 | }, 10 | "train_batch_size": 32, 11 | "steps_per_print": 2147483647, 12 | "zero_optimization": { 13 | "stage": 3, 14 | "offload_optimizer": { 15 | "device": "cpu", 16 | "pin_memory": true, 17 | "buffer_count": 4, 18 | "fast_init": false 19 | }, 20 | "offload_param": { 21 | "device": "cpu", 22 | "pin_memory": true, 23 | "buffer_count": 5, 24 | "buffer_size": 1e8, 25 | "max_in_cpu": 1e9 26 | }, 27 | "allgather_partitions": true, 28 | "allgather_bucket_size": 5e8, 29 | "overlap_comm": true, 30 | "reduce_scatter": true, 31 | "reduce_bucket_size": 5e8, 32 | "contiguous_gradients": true, 33 | "stage3_max_live_parameters": 1e9, 34 | "stage3_max_reuse_distance": 1e9, 35 | "stage3_prefetch_bucket_size": 5e8, 36 | "stage3_param_persistence_threshold": 1e6 37 | }, 38 | "gradient_clipping": 1.0, 39 | "fp16": { 40 | "enabled": true, 41 | "loss_scale": 0, 42 | "initial_scale_power": 15, 43 | "loss_scale_window": 1000, 44 | "hysteresis": 2, 45 | "min_loss_scale": 1 46 | }, 47 | "use_mem_monitor": true 48 | } -------------------------------------------------------------------------------- /benchmark/zero/fairscale_utils/gpt2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "fairscale", 3 | "model": { 4 | "type": "gpt2_10b" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 1, 8 | "num_epochs": 2, 9 | "steps_per_epoch": 10, 10 | "synthetic": true 11 | }, 12 | "fp16": { 13 | "enabled": true, 14 | "init_scale": 32768, 15 | "growth_factor": 2.0, 16 | 
"backoff_factor": 0.5, 17 | "growth_interval": 1000 18 | }, 19 | "gradient_clipping": 1.0, 20 | "fsdp": { 21 | "reshard_after_forward": true, 22 | "mixed_precision": true, 23 | "fp32_reduce_scatter": false, 24 | "flatten_parameters": true, 25 | "move_params_to_cpu": true, 26 | "bucket_cap_mb": 25, 27 | "clear_autocast_cache": false, 28 | "force_input_to_fp32": false, 29 | "state_dict_on_rank_0_only": false 30 | }, 31 | "use_mem_monitor": true 32 | } -------------------------------------------------------------------------------- /benchmark/zero/fairscale_utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from zero.common.utils import CONFIG, get_gpu_memory_mb, print_log 5 | from torch.distributed import init_process_group 6 | 7 | 8 | def init_w_fs(builder): 9 | from fairscale.nn.checkpoint import checkpoint_wrapper 10 | from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP 11 | from fairscale.optim.grad_scaler import ShardedGradScaler 12 | 13 | rank = int(os.environ['RANK']) 14 | world_size = int(os.environ['WORLD_SIZE']) 15 | host = os.environ['MASTER_ADDR'] 16 | port = int(os.environ['MASTER_PORT']) 17 | init_process_group(rank=rank, world_size=world_size, init_method=f'tcp://{host}:{port}', backend='nccl') 18 | 19 | torch.cuda.set_device(rank) 20 | if CONFIG.get('gpu_mem_fraction', None) is not None: 21 | torch.cuda.set_per_process_memory_fraction(CONFIG['gpu_mem_fraction']) 22 | print_log(f'Set max GPU mem: {get_gpu_memory_mb() * CONFIG["gpu_mem_fraction"]:.2f} MB') 23 | 24 | build_data, build_model, build_loss, build_optimizer, build_scheduler = builder() 25 | 26 | train_data, test_data = build_data() 27 | 28 | assert 'fsdp' in CONFIG 29 | use_checkpoint = CONFIG['model'].get('checkpoint') 30 | CONFIG['model']['checkpoint'] = False 31 | model = build_model() 32 | if use_checkpoint: 33 | model = checkpoint_wrapper(model) 34 | model = FSDP(model, **CONFIG['fsdp']) 35 | 36 | criterion = build_loss() 37 | 38 | optimizer = build_optimizer(model.parameters()) 39 | 40 | scaler = ShardedGradScaler(**CONFIG['fp16']) if 'fp16' in CONFIG else None 41 | 42 | lr_scheduler = build_scheduler(len(train_data), optimizer) 43 | 44 | return model, train_data, test_data, criterion, optimizer, scaler, lr_scheduler 45 | -------------------------------------------------------------------------------- /benchmark/zero/fairscale_utils/vit_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "fairscale", 3 | "model": { 4 | "type": "vit_h", 5 | "checkpoint": false 6 | }, 7 | "hyperparameter": { 8 | "batch_size": 4 9 | }, 10 | "fp16": { 11 | "enabled": true, 12 | "init_scale": 32768, 13 | "growth_factor": 2.0, 14 | "backoff_factor": 0.5, 15 | "growth_interval": 1000 16 | }, 17 | "gradient_clipping": 1.0, 18 | "fsdp": { 19 | "reshard_after_forward": true, 20 | "mixed_precision": true, 21 | "fp32_reduce_scatter": false, 22 | "flatten_parameters": true, 23 | "move_params_to_cpu": true, 24 | "bucket_cap_mb": 25, 25 | "clear_autocast_cache": false, 26 | "force_input_to_fp32": false, 27 | "state_dict_on_rank_0_only": false 28 | }, 29 | "use_mem_monitor": true 30 | } -------------------------------------------------------------------------------- /benchmark/zero/patrickstar_utils/gpt2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "patrickstar", 3 | "model": { 4 | "type": "gpt2_10b" 5 | }, 6 | 
"hyperparameter": { 7 | "batch_size": 8, 8 | "num_epochs": 2, 9 | "steps_per_epoch": 10, 10 | "synthetic": true 11 | }, 12 | "optimizer": { 13 | "type": "AdamW", 14 | "params": { 15 | "lr": 0.0015, 16 | "weight_decay": 0.01, 17 | "use_hybrid_adam": true 18 | } 19 | }, 20 | "fp16": { 21 | "enabled": true, 22 | "loss_scale": 0, 23 | "initial_scale_power": 15, 24 | "loss_scale_window": 1000, 25 | "hysteresis": 2, 26 | "min_loss_scale": 1 27 | }, 28 | "default_chunk_size": 1073741824, 29 | "release_after_init": true, 30 | "gradient_clipping": 1.0, 31 | "use_cpu_embedding": false, 32 | "use_mem_monitor": true 33 | } -------------------------------------------------------------------------------- /benchmark/zero/patrickstar_utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from zero.common.utils import CONFIG, get_gpu_memory_mb, print_log 5 | from torch.distributed import init_process_group 6 | 7 | 8 | def init_w_ps(builder): 9 | from patrickstar.runtime import initialize_engine 10 | 11 | config = CONFIG.copy() 12 | 13 | rank = int(os.environ['RANK']) 14 | world_size = int(os.environ['WORLD_SIZE']) 15 | host = os.environ['MASTER_ADDR'] 16 | port = int(os.environ['MASTER_PORT']) 17 | init_process_group(rank=rank, world_size=world_size, init_method=f'tcp://{host}:{port}', backend='nccl') 18 | 19 | torch.cuda.set_device(rank) 20 | if CONFIG.get('gpu_mem_fraction', None) is not None: 21 | torch.cuda.set_per_process_memory_fraction(CONFIG['gpu_mem_fraction']) 22 | print_log(f'Set max GPU mem: {get_gpu_memory_mb() * CONFIG["gpu_mem_fraction"]:.2f} MB') 23 | 24 | build_data, build_model, build_loss, _, build_scheduler = builder() 25 | 26 | train_data, test_data = build_data() 27 | 28 | criterion = build_loss() 29 | 30 | model, optimizer = initialize_engine(model_func=build_model, local_rank=rank, config=config) 31 | 32 | lr_scheduler = build_scheduler(len(train_data), optimizer) 33 | 34 | return model, train_data, test_data, criterion, optimizer, None, lr_scheduler 35 | -------------------------------------------------------------------------------- /benchmark/zero/patrickstar_utils/vit_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "patrickstar", 3 | "model": { 4 | "type": "vit_h" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 4 8 | }, 9 | "optimizer": { 10 | "type": "AdamW", 11 | "params": { 12 | "lr": 0.0015, 13 | "weight_decay": 0.01, 14 | "use_hybrid_adam": true 15 | } 16 | }, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0, 20 | "initial_scale_power": 15, 21 | "loss_scale_window": 1000, 22 | "hysteresis": 2, 23 | "min_loss_scale": 1 24 | }, 25 | "default_chunk_size": 67108864, 26 | "release_after_init": true, 27 | "gradient_clipping": 1.0, 28 | "use_cpu_embedding": false, 29 | "use_mem_monitor": true 30 | } -------------------------------------------------------------------------------- /benchmark/zero/requirement.txt: -------------------------------------------------------------------------------- 1 | 2 | torch>=1.10 -f https://download.pytorch.org/whl/cu113/torch_stable.html 3 | torchvision -f https://download.pytorch.org/whl/cu113/torch_stable.html 4 | transformers 5 | datasets 6 | colossalai 7 | deepspeed 8 | fairscale 9 | rich 10 | nvidia-dali-cuda110 --extra-index-url https://developer.download.nvidia.com/compute/redist -------------------------------------------------------------------------------- /benchmark/zero/run.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | from zero.colossalai_utils.utils import init_w_col 4 | from zero.common.gpt2 import gpt2_builder 5 | from zero.common.train import train 6 | from zero.common.utils import CONFIG, load_config, print_log 7 | from zero.common.vit import vit_builder 8 | from zero.deepspeed_utils.utils import init_w_ds 9 | from zero.fairscale_utils.utils import init_w_fs 10 | from zero.patrickstar_utils.utils import init_w_ps 11 | from zero.torch_utils.utils import init_w_torch 12 | 13 | _zero_method = { 14 | 'fairscale': init_w_fs, 15 | 'colossalai': init_w_col, 16 | 'torch': init_w_torch, 17 | 'patrickstar': init_w_ps, 18 | 'deepspeed': init_w_ds 19 | } 20 | 21 | _builder = { 22 | 'gpt2': gpt2_builder, 23 | 'vit': vit_builder, 24 | } 25 | 26 | 27 | def run_zero(): 28 | method = CONFIG['method'] 29 | assert method in ['colossalai', 'deepspeed', 'fairscale', 'patrickstar', 'torch'], f'No support for {method}.' 30 | 31 | model = CONFIG['model']['type'] 32 | model_type = model.split('_')[0] 33 | assert model_type in ['gpt2', 'vit'], f'No support for {model}.' 34 | 35 | train(*_zero_method[method](_builder[model_type])) 36 | 37 | 38 | if __name__ == '__main__': 39 | load_config() 40 | 41 | CONFIG['log_path'] = os.environ.get('LOG', '.') 42 | os.makedirs(CONFIG['log_path'], exist_ok=True) 43 | 44 | print_log(f'Initializing {CONFIG["method"]} ...') 45 | 46 | run_zero() 47 | -------------------------------------------------------------------------------- /benchmark/zero/torch_utils/gpt2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "torch", 3 | "model": { 4 | "type": "gpt2_10b" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 1, 8 | "num_epochs": 2, 9 | "steps_per_epoch": 10, 10 | "synthetic": true 11 | }, 12 | "fp16": { 13 | "enabled": true, 14 | "init_scale": 32768, 15 | "growth_factor": 2.0, 16 | "backoff_factor": 0.5, 17 | "growth_interval": 1000 18 | }, 19 | "gradient_clipping": 1.0, 20 | "use_mem_monitor": true 21 | } 22 | -------------------------------------------------------------------------------- /benchmark/zero/torch_utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from zero.common.utils import CONFIG, get_gpu_memory_mb, get_model_size, print_log 5 | from torch.distributed import init_process_group 6 | from torch.nn.parallel import DistributedDataParallel as DDP 7 | 8 | 9 | def init_w_torch(builder): 10 | rank = int(os.environ['RANK']) 11 | world_size = int(os.environ['WORLD_SIZE']) 12 | host = os.environ['MASTER_ADDR'] 13 | port = int(os.environ['MASTER_PORT']) 14 | init_process_group(rank=rank, world_size=world_size, init_method=f'tcp://{host}:{port}', backend='nccl') 15 | 16 | torch.cuda.set_device(rank) 17 | if CONFIG.get('gpu_mem_fraction', None) is not None: 18 | torch.cuda.set_per_process_memory_fraction(CONFIG['gpu_mem_fraction']) 19 | print_log(f'Set max GPU mem: {get_gpu_memory_mb() * CONFIG["gpu_mem_fraction"]:.2f} MB') 20 | 21 | build_data, build_model, build_loss, build_optimizer, build_scheduler = builder() 22 | 23 | train_data, test_data = build_data() 24 | 25 | model = build_model().to(rank) 26 | if 'numel' not in CONFIG['model']: 27 | CONFIG['model']['numel'] = get_model_size(model) 28 | model = DDP(model) 29 | 30 | criterion = build_loss() 31 | 32 | optimizer = build_optimizer(model.parameters()) 33 | 34 | scaler = 
torch.cuda.amp.GradScaler(**CONFIG['fp16']) if 'fp16' in CONFIG else None

    lr_scheduler = build_scheduler(len(train_data), optimizer)

    return model, train_data, test_data, criterion, optimizer, scaler, lr_scheduler
--------------------------------------------------------------------------------
/benchmark/zero/torch_utils/vit_config.json:
--------------------------------------------------------------------------------
{
    "method": "torch",
    "model": {
        "type": "vit_h"
    },
    "hyperparameter": {
        "batch_size": 4
    },
    "fp16": {
        "enabled": true,
        "init_scale": 32768,
        "growth_factor": 2.0,
        "backoff_factor": 0.5,
        "growth_interval": 1000
    },
    "gradient_clipping": 1.0,
    "use_mem_monitor": true
}
--------------------------------------------------------------------------------
/features/amp/config/config_AMP_apex.py:
--------------------------------------------------------------------------------
from colossalai.amp import AMP_TYPE

# ViT Base
BATCH_SIZE = 128
DROP_RATE = 0.1
NUM_EPOCHS = 2

fp16 = dict(
    mode=AMP_TYPE.APEX,
)

clip_grad_norm = 1.0
--------------------------------------------------------------------------------
/features/amp/config/config_AMP_naive.py:
--------------------------------------------------------------------------------
from colossalai.amp import AMP_TYPE

# ViT Base
BATCH_SIZE = 128
DROP_RATE = 0.1
NUM_EPOCHS = 2

fp16 = dict(
    mode=AMP_TYPE.NAIVE,
)

clip_grad_norm = 1.0
--------------------------------------------------------------------------------
/features/amp/config/config_AMP_torch.py:
--------------------------------------------------------------------------------
from colossalai.amp import AMP_TYPE

# ViT Base
BATCH_SIZE = 128
DROP_RATE = 0.1
NUM_EPOCHS = 2

fp16 = dict(
    mode=AMP_TYPE.TORCH,
)

clip_grad_norm = 1.0
--------------------------------------------------------------------------------
/features/amp/config/config_fp32.py:
--------------------------------------------------------------------------------
from colossalai.amp import AMP_TYPE

# ViT Base
BATCH_SIZE = 128
DROP_RATE = 0.1
NUM_EPOCHS = 2

clip_grad_norm = 1.0
--------------------------------------------------------------------------------
/features/amp/requirements.txt:
--------------------------------------------------------------------------------
colossalai
torch >= 1.8.1
--------------------------------------------------------------------------------
/features/amp/scripts/train_slurm.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

python train.py --host $HOST --config ./config/config_AMP_naive.py --port 29500
--------------------------------------------------------------------------------
/features/colotensor/README.md:
--------------------------------------------------------------------------------
# Use tensor model parallelism via ColoTensor

## Introduction

This is an example for the tutorial **Parallelize Your Training like Megatron-LM via ColoTensor**.
It shows how to adapt your model to tensor model parallelism.
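As a warm-up, the following toy, single-process sketch (plain PyTorch, not the ColoTensor API that `gpt_megatron.py` uses) shows the idea behind Megatron-style column parallelism: each worker holds a slice of a linear layer's weight and computes a slice of the output.

```python
import torch

torch.manual_seed(0)

# A full linear layer and a batch of activations.
linear = torch.nn.Linear(8, 16, bias=False)
x = torch.randn(4, 8)

# Column parallelism: two "workers" each hold half of the output
# columns of the weight matrix (the weight is stored as (out, in)).
w0, w1 = linear.weight.chunk(2, dim=0)

# Each worker computes its shard independently; concatenating the
# shards recovers the full output (an all-gather in real TP).
y_sharded = torch.cat([x @ w0.t(), x @ w1.t()], dim=-1)
print(torch.allclose(y_sharded, linear(x)))  # True
```

ColoTensor automates exactly this kind of sharding, plus the required communication, for the layers of a real model.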
Use the command below to run the example.

```bash
colossalai run --nproc_per_node <num_gpus> gpt_megatron.py
```
--------------------------------------------------------------------------------
/features/colotensor/requirements.txt:
--------------------------------------------------------------------------------
colossalai
torch >= 1.8.1
--------------------------------------------------------------------------------
/features/gradient_accumulation/README.md:
--------------------------------------------------------------------------------
# Gradient Accumulation

## Prepare Dataset

We use the CIFAR10 dataset in this example. The dataset will be downloaded to `./data` by default.
If you wish to use a customized directory for the dataset, you can set the environment variable `DATA` via the following command.

```bash
export DATA=/path/to/data
```

## Verify Gradient Accumulation

To verify gradient accumulation, we can just check the change of parameter values. When gradient accumulation is set, parameters are only updated on the last step of each accumulation cycle (every `gradient_accumulation = 4` iterations in this example).

```bash
colossalai run --nproc_per_node 1 train.py
```
--------------------------------------------------------------------------------
/features/gradient_accumulation/config.py:
--------------------------------------------------------------------------------
from colossalai.amp import AMP_TYPE

BATCH_SIZE = 128
NUM_EPOCHS = 200

gradient_accumulation = 4
--------------------------------------------------------------------------------
/features/gradient_accumulation/requirements.txt:
--------------------------------------------------------------------------------
colossalai
torch >= 1.8.1
--------------------------------------------------------------------------------
/features/gradient_clipping/README.md:
--------------------------------------------------------------------------------
# Gradient Clipping

## Usage

To use gradient clipping, you can just add the following line to your configuration file, where the value is the maximum norm of the gradients (this example's `config.py` uses `2.0`).

```python
gradient_clipping = 2.0
```

## Prepare Dataset

We use the CIFAR10 dataset in this example. The dataset will be downloaded to `./data` by default.
If you wish to use a customized directory for the dataset, you can set the environment variable `DATA` via the following command.

```bash
export DATA=/path/to/data
```

## Verify Gradient Clipping

To verify gradient clipping, we can just check the change of parameter values.
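The effect of clipping can also be seen in isolation with plain PyTorch, independent of Colossal-AI. A minimal standalone sketch using `torch.nn.utils.clip_grad_norm_`, which is what norm-based clipping boils down to:

```python
import torch

model = torch.nn.Linear(10, 10)

# Produce deliberately large gradients.
loss = 1000 * model(torch.randn(4, 10)).sum()
loss.backward()

# Global L2 norm over all parameter gradients.
total_norm = lambda ps: torch.norm(torch.stack([p.grad.norm() for p in ps]))
print(f'before: {total_norm(model.parameters()).item():.2f}')

# Rescale gradients so that their global L2 norm is at most 2.0.
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)
print(f'after:  {total_norm(model.parameters()).item():.2f}')  # ~2.0
```

To run the actual example: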
```bash
colossalai run --nproc_per_node 1 train.py
```
--------------------------------------------------------------------------------
/features/gradient_clipping/config.py:
--------------------------------------------------------------------------------
from colossalai.amp import AMP_TYPE

BATCH_SIZE = 128
NUM_EPOCHS = 200

gradient_clipping = 2.0
--------------------------------------------------------------------------------
/features/gradient_clipping/requirements.txt:
--------------------------------------------------------------------------------
colossalai
torch >= 1.8.1
--------------------------------------------------------------------------------
/features/gradient_clipping/train.py:
--------------------------------------------------------------------------------
from pathlib import Path
from colossalai.logging import get_dist_logger
import colossalai
import torch
import os
from colossalai.core import global_context as gpc
from colossalai.utils import get_dataloader
from torchvision import transforms
from colossalai.nn.lr_scheduler import CosineAnnealingLR
from torchvision.datasets import CIFAR10
from torchvision.models import resnet34


def main():
    colossalai.launch_from_torch(config='./config.py')

    logger = get_dist_logger()

    # build resnet
    model = resnet34(num_classes=10)

    # build dataloaders
    train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')),
                            download=True,
                            transform=transforms.Compose([
                                transforms.RandomCrop(size=32, padding=4),
                                transforms.RandomHorizontalFlip(),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
                            ]))

    train_dataloader = get_dataloader(
        dataset=train_dataset,
        shuffle=True,
        batch_size=gpc.config.BATCH_SIZE,
        num_workers=1,
        pin_memory=True,
    )

    # build criterion
    criterion = torch.nn.CrossEntropyLoss()

    # optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)

    # lr_scheduler
    lr_scheduler = CosineAnnealingLR(optimizer, total_steps=gpc.config.NUM_EPOCHS)

    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(
        model,
        optimizer,
        criterion,
        train_dataloader,
    )

    # verify gradient clipping
    engine.train()
    for idx, (img, label) in enumerate(train_dataloader):
        img = img.cuda()
        label = label.cuda()

        engine.zero_grad()
        output = engine(img)
        train_loss = engine.criterion(output, label)
        engine.backward(train_loss)
        engine.step()
        lr_scheduler.step()

        ele_1st = next(model.parameters()).flatten()[0]
        logger.info(f'iteration {idx}, loss: {train_loss}, 1st element of parameters: {ele_1st.item()}')

        # only run for 4 iterations
        if idx == 3:
            break


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/features/pipeline_parallel/.init:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/features/pipeline_parallel/.init
--------------------------------------------------------------------------------
/features/pipeline_parallel/README.md:
--------------------------------------------------------------------------------
# Train ResNet50 on CIFAR10 with pipeline

## Requirements

To use pipeline parallel training, you should install colossalai from the **latest** main branch.

## How to run

We use `colossalai.launch_from_torch` as an example here. Before running, you should `export DATA=/path/to/cifar`.

If you are training on a single node with multiple GPUs:
```shell
colossalai run --nproc_per_node <num_gpus> resnet.py
```
--------------------------------------------------------------------------------
/features/pipeline_parallel/requirements.txt:
--------------------------------------------------------------------------------
colossalai
torch >= 1.8.1
--------------------------------------------------------------------------------
/features/pipeline_parallel/rpc/gpt/dataset/webtext.py:
--------------------------------------------------------------------------------
import json
import os

import torch
from colossalai.registry import DATASETS
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer


@DATASETS.register_module
class WebtextDataset(Dataset):

    def __init__(self, path, seq_len=1024) -> None:
        super().__init__()
        root = os.path.dirname(path)
        encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt')
        if os.path.isfile(encoded_data_cache_path):
            seq_len_, data, attention_mask = torch.load(encoded_data_cache_path)
            if seq_len_ == seq_len:
                self.data = data
                self.attention_mask = attention_mask
                return
        raw_data = []
        with open(path) as f:
            for line in f.readlines():
                raw_data.append(json.loads(line)['text'])
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        tokenizer.pad_token = tokenizer.unk_token
        encoded_data = tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt')
        self.data = encoded_data['input_ids']
        self.attention_mask = encoded_data['attention_mask']
        torch.save((seq_len, self.data, self.attention_mask), encoded_data_cache_path)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return {'input_ids': self.data[index],
                'attention_mask': self.attention_mask[index]}, self.data[index]
--------------------------------------------------------------------------------
/features/pipeline_parallel/rpc/opt/README.md:
--------------------------------------------------------------------------------
# Example

Example of training OPT-125m through different PP strategies.

## run non-interleaved 1F1B

```bash
python3 1f1b.py --world_size=4 --num_microbatches=8 --device="cuda" --batch_size=16 --epoch=20 --master_port=29011
```

> for a customized world_size, please adjust the partition strategy
--------------------------------------------------------------------------------
/features/pipeline_parallel/rpc/opt/check/opt_avail.py:
--------------------------------------------------------------------------------
from transformers import GPT2Tokenizer, OPTForCausalLM

model = OPTForCausalLM.from_pretrained("facebook/opt-125m")
tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-125m")

prompt = "Hey, are you conscious? Can you talk to me?"
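# Tokenize the prompt, then greedily generate a short continuation
# to confirm that the OPT-125m checkpoint loads and runs.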
inputs = tokenizer(prompt, return_tensors="pt")

generate_ids = model.generate(inputs.input_ids, max_length=30)
print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
--------------------------------------------------------------------------------
/features/pipeline_parallel/rpc/resnet/README.md:
--------------------------------------------------------------------------------
# Example

Example of training ResNet on CIFAR10 through different PP strategies.

## import data

```bash
export DATA=/path/cifar-10
```

## run Fill Drain
```bash
python3 fill_drain.py --epoch=1 --world_size=2 --batch_size=512 --chunk=1 --optimizer="SGD" --device="cuda" --num_microbatches=4
```

> for a customized world_size, please adjust the partition strategy


## run 1F1B

```bash
python3 1f1b.py --epoch=1 --world_size=2 --batch_size=512 --chunk=1 --optimizer="SGD" --device="cuda" --num_microbatches=4
```

> for a customized world_size, please adjust the partition strategy

## run Chimera
Chimera is not stable; the program may hang at some iteration.
```bash
python3 chimera.py --world_size=2 --epoch=1 --batch_size=128 --chunk=1 --optimizer="SGD" --device="cuda" --num_microbatches=4
```

> for a customized world_size, please adjust the partition strategy

## help
run `python3 1f1b.py --help` for the available configuration of the pipeline:

```
-h, --help            show this help message and exit
--epoch EPOCH
--world_size WORLD_SIZE
--batch_size BATCH_SIZE
--dp_degree DP_DEGREE
--tp_degree TP_DEGREE
--num_microbatches NUM_MICROBATCHES
--chunk CHUNK
--use_checkpoint
--optimizer {SGD,Adam,RMSprop}
--device {cpu,cuda}
--master_addr MASTER_ADDR
--master_port MASTER_PORT
--num_worker_threads NUM_WORKER_THREADS
```

`chunk` means the number of virtual pipeline stages on each card. If `chunk==1`, there is only one virtual stage on each card, equivalent to the **non-interleaved** mode.

If `chunk>1` (`chunk=2`, for example), there are two virtual stages on each card, equivalent to the **interleaved** mode.

As a result, the actual number of pipeline stages (denoted by `actual_stage_num`) is $\text{chunk} \times \text{world\_size}$.

It is recommended not to set `chunk>2`: too much communication payload on one card may cause `torch.distributed.rpc` to fail, depending on your hardware.

In the demo of ResNet, please set `world_size=2, chunk=1`, because the current partition strategy only supports this configuration.
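To see how the non-interleaved 1F1B schedule referred to above orders work, here is a small standalone sketch (an illustration of the schedule only, not the RPC implementation in `1f1b.py`) that prints the forward/backward sequence each stage executes:

```python
def one_f_one_b(num_stages: int, num_microbatches: int, stage: int):
    """Operation order of one stage under non-interleaved 1F1B:
    warm-up forwards, alternating forward/backward in steady state,
    then the remaining cool-down backwards."""
    warmup = min(num_stages - stage - 1, num_microbatches)
    steady = num_microbatches - warmup
    order = [f'F{i}' for i in range(warmup)]
    for i in range(steady):
        order += [f'F{warmup + i}', f'B{i}']
    order += [f'B{steady + i}' for i in range(warmup)]
    return order


for s in range(4):
    print(f'stage {s}:', ' '.join(one_f_one_b(4, 8, s)))
```

Every microbatch is forwarded and backwarded exactly once per stage; only the warm-up/cool-down lengths differ across stages, which is what keeps at most `num_stages` microbatches in flight and bounds activation memory.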
--------------------------------------------------------------------------------
/features/tensor_parallel/README.md:
--------------------------------------------------------------------------------
# Tensor Parallelism

## Usage

To use tensor parallelism, there are several steps to follow:

1. define `parallel` in your configuration file. Set `mode` for `tensor` to `1d`, `2d`, `2.5d` or `3d`.
2. construct your model and replace `torch.nn.Linear` with `colossalai.nn.Linear`.
3. split the input data accordingly.

## Reference

If you wish to understand how tensor parallelism works exactly, you may refer to our [documentation](https://colossalai.org/docs/features/1D_tensor_parallel).

## How to run

In this example, we constructed a simple MLP model for demonstration purposes. You can execute the following commands to run the demo.

```shell
# run 1D tensor parallelism on 4 GPUs
colossalai run --nproc_per_node=4 run.py --config ./configs/tp_1d.py

# run 2D tensor parallelism on 4 GPUs
colossalai run --nproc_per_node=4 run.py --config ./configs/tp_2d.py

# run 2.5D tensor parallelism on 8 GPUs
colossalai run --nproc_per_node=8 run.py --config ./configs/tp_2p5d.py

# run 3D tensor parallelism on 8 GPUs
colossalai run --nproc_per_node=8 run.py --config ./configs/tp_3d.py
```
--------------------------------------------------------------------------------
/features/tensor_parallel/configs/tp_1d.py:
--------------------------------------------------------------------------------
parallel = dict(
    data=1,
    pipeline=1,
    tensor=dict(size=2, mode='1d'),
)
--------------------------------------------------------------------------------
/features/tensor_parallel/configs/tp_2d.py:
--------------------------------------------------------------------------------
parallel = dict(
    data=1,
    pipeline=1,
    tensor=dict(size=4, mode='2d'),
)
--------------------------------------------------------------------------------
/features/tensor_parallel/configs/tp_2p5d.py:
--------------------------------------------------------------------------------
parallel = dict(
    data=1,
    pipeline=1,
    tensor=dict(size=8, mode='2.5d', depth=2),
)
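# Note: 2.5D ("Tesseract") tensor parallelism arranges the tensor parallel
# group as size = depth * q ** 2 for an integer q; here 8 = 2 * 2 ** 2, so q = 2.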
--------------------------------------------------------------------------------
/features/tensor_parallel/configs/tp_3d.py:
--------------------------------------------------------------------------------
parallel = dict(
    data=1,
    pipeline=1,
    tensor=dict(size=8, mode='3d'),
)
--------------------------------------------------------------------------------
/features/tensor_parallel/requirements.txt:
--------------------------------------------------------------------------------
colossalai
torch >= 1.8.1
--------------------------------------------------------------------------------
/features/tensor_parallel/run.py:
--------------------------------------------------------------------------------
import colossalai
import colossalai.nn as col_nn
import torch
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.utils import get_current_device, print_rank_0
from colossalai.global_variables import tensor_parallel_env as tp_env


class MLP(torch.nn.Module):

    def __init__(self, dim: int = 256):
        super().__init__()
        intermediate_dim = dim * 4
        self.dense_1 = col_nn.Linear(dim, intermediate_dim)
        print_rank_0(f'Weight of the first linear layer: {self.dense_1.weight.shape}')
        self.activation = torch.nn.GELU()
        self.dense_2 = col_nn.Linear(intermediate_dim, dim)
        print_rank_0(f'Weight of the second linear layer: {self.dense_2.weight.shape}')
        self.dropout = col_nn.Dropout(0.1)

    def forward(self, x):
        x = self.dense_1(x)
        print_rank_0(f'Output of the first linear layer: {x.shape}')
        x = self.activation(x)
        x = self.dense_2(x)
        print_rank_0(f'Output of the second linear layer: {x.shape}')
        x = self.dropout(x)
        return x


def main():
    colossalai.logging.disable_existing_loggers()
    parser = colossalai.get_default_parser()
    args = parser.parse_args()
    colossalai.launch_from_torch(config=args.config)

    m = MLP()

    x = torch.randn((16, 256), device=get_current_device())
    torch.distributed.broadcast(x, src=0)

    # partition input
    if tp_env.mode == '1d':
        pass
    elif tp_env.mode == '2d':
        x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL)]
        x = torch.chunk(x, 2, dim=-1)[gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW)]
    elif tp_env.mode == '2.5d':
        x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_DEP)]
        x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)]
        x = torch.chunk(x, 2, dim=-1)[gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW)]
    elif tp_env.mode == '3d':
        x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT)]
        x = torch.chunk(x, 2, dim=0)[gpc.get_local_rank(ParallelMode.PARALLEL_3D_INPUT)]
        x = torch.chunk(x, 2, dim=-1)[gpc.get_local_rank(ParallelMode.PARALLEL_3D_OUTPUT)]
    print_rank_0(f'Input: {x.shape}')

    x = m(x)

    gpc.destroy()


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/features/zero/README.md:
--------------------------------------------------------------------------------
# ZeRO

This tutorial works for ColossalAI v0.1.10.

## Prepare Model

In this example, we use Hugging Face `transformers`. You have to install `transformers` before running this example. We will take `GPT2 Medium` as an example here.

```shell
# install huggingface transformers
pip install transformers
```

## Prepare Data

This example is intended to show you how to use `ZeRO`. For simplicity, we just use randomly generated data here.

## Run with ZeRO

We just use a naive training loop in this example. `Engine` and `Trainer` are not used.

```shell
colossalai run --nproc_per_node=1 train.py
```
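For intuition about what ZeRO saves: with mixed-precision Adam, every parameter costs roughly 2 bytes (fp16 weights) + 2 bytes (fp16 gradients) + 12 bytes (fp32 master weights plus the two Adam moments), and the ZeRO stages partition these states across the data-parallel ranks. A back-of-the-envelope sketch following the ZeRO paper's accounting (the GPT2 Medium parameter count is approximate):

```python
# Per-GPU memory for model states under ZeRO (estimates, not measurements).
GB = 1024 ** 3


def model_state_gb(num_params: int, num_gpus: int, stage: int) -> float:
    params, grads, optim = 2 * num_params, 2 * num_params, 12 * num_params
    if stage >= 1:    # ZeRO-1 shards the optimizer states
        optim /= num_gpus
    if stage >= 2:    # ZeRO-2 also shards the gradients
        grads /= num_gpus
    if stage >= 3:    # ZeRO-3 also shards the parameters
        params /= num_gpus
    return (params + grads + optim) / GB


n = 355_000_000    # GPT2 Medium, approximately
for stage in (0, 1, 2, 3):
    print(f'ZeRO-{stage}, 8 GPUs: {model_state_gb(n, 8, stage):.2f} GB/GPU')
```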
16 | ``` 17 | 18 | ## How to run 19 | ``` 20 | $ DATA=/path/to/data/ python -m torch.distributed.launch --nproc_per_node=nproc_per_node 21 | --master_addr MASTER_ADDR 22 | --master_port MASTER_PORT 23 | run_train.py 24 | --config=CONFIG_FILE 25 | --world_size=WORLD_SIZE 26 | --rank=RANK 27 | --local_rank=LOCAL_RANK 28 | ``` 29 | 30 | ## Cite us 31 | ``` 32 | @article{bian2021colossal, 33 | title={Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training}, 34 | author={Bian, Zhengda and Liu, Hongxin and Wang, Boxiang and Huang, Haichen and Li, Yongbin and Wang, Chuanrui and Cui, Fan and You, Yang}, 35 | journal={arXiv preprint arXiv:2110.14883}, 36 | year={2021} 37 | } 38 | ``` -------------------------------------------------------------------------------- /image/detr-debug/configs/detr_1d.py: -------------------------------------------------------------------------------- 1 | BATCH_SIZE = 4 2 | LEARNING_RATE = 2e-3 3 | WEIGHT_DECAY = 3e-2 4 | 5 | # pipeline config 6 | parallel = dict(pipeline=2,) 7 | NUM_MICRO_BATCHES = parallel['pipeline'] 8 | 9 | # tensor config 10 | #TENSOR_PARALLEL_SIZE = 2 11 | #TENSOR_PARALLEL_MODE = '1d' 12 | 13 | NUM_EPOCHS = 800 14 | WARMUP_EPOCHS = 40 15 | clip_max_norm = 2. 16 | 17 | seed = 77 18 | 19 | LOG_PATH = f"./detr_1d_ai2d_tp2_bs{BATCH_SIZE}_lr{LEARNING_RATE}/" 20 | 21 | 22 | find_unused_parameters = True 23 | 24 | coco_path = '/data/huxin/xjtuhx/projects/ai2d-detection-baselines/111/data_dir/ai2d/' 25 | pre_norm = False 26 | save_ckpt_freq = 50 27 | lr_backbone = 1e-5 28 | device = 'cuda' 29 | lr_drop = 200 30 | backbone = 'resnet34' 31 | dilation = None 32 | position_embedding = 'sine' 33 | enc_layers = 2 34 | dec_layers = 2 35 | dim_feedforward = 512 36 | hidden_dim = 256 37 | dropout = 0.1 38 | nheads = 1 39 | num_queries = 100 40 | masks = False 41 | set_cost_class = 1 42 | set_cost_bbox = 5 43 | set_cost_giou = 2 44 | mask_loss_coef = 1 45 | dice_loss_coef = 1 46 | bbox_loss_coef = 5 47 | giou_loss_coef = 2 48 | eos_coef = 0.1 49 | dataset_file = 'ai2d' 50 | remove_difficult = True 51 | output_dir = '/data/huxin/xjtuhx/projects/ai2d-detection-baselines/111/output_test/' 52 | resume = '' 53 | start_epoch = 0 54 | eval = False 55 | num_workers = 2 56 | world_size = 1 57 | dist_url = 'env://' 58 | distributed = True 59 | aux_loss = False 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /image/detr-debug/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | import torch.utils.data 3 | import torchvision 4 | # from util.params import opt 5 | 6 | from .coco import build as build_coco 7 | 8 | def get_coco_api_from_dataset(dataset): 9 | for _ in range(10): 10 | if isinstance(dataset, torch.utils.data.Subset): 11 | dataset = dataset.dataset 12 | if isinstance(dataset, torchvision.datasets.CocoDetection): 13 | return dataset.coco 14 | 15 | 16 | def build_dataset(image_set, args): 17 | if args.dataset_file == 'coco': 18 | return build_coco(image_set, args) 19 | if args.dataset_file == 'ai2d': 20 | return build_coco(image_set, args) 21 | 22 | raise ValueError(f'dataset {args.dataset_file} not supported') -------------------------------------------------------------------------------- /image/detr-debug/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .detr import DETR 2 | 3 | 4 | def build_model(backbone, transformer, num_classes): 5 | model = DETR( 6 | backbone, 7 | transformer, 8 | num_classes=num_classes, 9 | num_queries=50, 10 | aux_loss=False, 11 | ) 12 | 13 | return model -------------------------------------------------------------------------------- /image/detr-debug/models/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Various positional encodings for the transformer. 4 | """ 5 | import math 6 | import torch 7 | from torch import nn 8 | from colossalai.registry import LAYERS, MODELS 9 | 10 | @LAYERS.register_module 11 | class PositionEmbeddingSine(nn.Module): 12 | """ 13 | This is a more standard version of the position embedding, very similar to the one 14 | used by the Attention is all you need paper, generalized to work on images. 
15 | """ 16 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 17 | super().__init__() 18 | self.num_pos_feats = num_pos_feats 19 | self.temperature = temperature 20 | self.normalize = normalize 21 | if scale is not None and normalize is False: 22 | raise ValueError("normalize should be True if scale is passed") 23 | if scale is None: 24 | scale = 2 * math.pi 25 | self.scale = scale 26 | 27 | def forward(self, tensor_list): 28 | x = tensor_list.tensors 29 | mask = tensor_list.mask 30 | assert mask is not None 31 | not_mask = ~mask 32 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 33 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 34 | if self.normalize: 35 | eps = 1e-6 36 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 37 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 38 | 39 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 40 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 41 | 42 | pos_x = x_embed[:, :, :, None] / dim_t 43 | pos_y = y_embed[:, :, :, None] / dim_t 44 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 45 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 46 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 47 | return pos 48 | 49 | 50 | def build_position_encoding(args): 51 | N_steps = args.hidden_dim // 2 52 | # if args.position_embedding in ('v2', 'sine'): 53 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 54 | # elif args.position_embedding in ('v3', 'learned'): 55 | # position_embedding = PositionEmbeddingLearned(N_steps) 56 | # else: 57 | # raise ValueError(f"not supported {args.position_embedding}") 58 | 59 | return position_embedding -------------------------------------------------------------------------------- /image/detr-debug/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | -e git+ssh://git@github.com/hpcaitech/ColossalAI.git@7d15ec7fe20b07180f5dc3f4b580e2cba37c5b9e#egg=colossalai 3 | absl-py==1.0.0 4 | cachetools==4.2.4 5 | certifi==2021.10.8 6 | charset-normalizer==2.0.10 7 | colossalai 8 | einops==0.4.0 9 | google-auth==2.3.3 10 | google-auth-oauthlib==0.4.6 11 | grpcio==1.43.0 12 | idna==3.3 13 | importlib-metadata==4.10.1 14 | Markdown==3.3.6 15 | numpy==1.21.5 16 | nvidia-dali-cuda102==1.6.0 17 | oauthlib==3.1.1 18 | packaging==21.3 19 | Pillow==9.0.0 20 | pip==21.2.2 21 | protobuf==3.19.3 22 | psutil==5.9.0 23 | pyasn1==0.4.8 24 | pyasn1-modules==0.2.8 25 | pyparsing==3.0.7 26 | requests==2.27.1 27 | requests-oauthlib==1.3.0 28 | rsa==4.8 29 | setuptools==58.0.4 30 | six==1.16.0 31 | tensorboard==2.8.0 32 | tensorboard-data-server==0.6.1 33 | tensorboard-plugin-wit==1.8.1 34 | tensorboardX==2.4.1 35 | timm==0.5.4 36 | torch==1.10.1 37 | torchvision==0.11.2 38 | tqdm==4.62.3 39 | typing_extensions==4.0.1 40 | urllib3==1.26.8 41 | Werkzeug==2.0.2 42 | wheel==0.37.1 43 | zipp==3.7.0 -------------------------------------------------------------------------------- /image/detr-debug/results/loss_curve.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/detr-debug/results/loss_curve.jpg -------------------------------------------------------------------------------- 
/image/detr-debug/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/detr-debug/util/__init__.py -------------------------------------------------------------------------------- /image/detr/README.md: --------------------------------------------------------------------------------
1 | # DEtection TRansformer (DETR) on Colossal-AI
2 | 
3 | ## Requirement
4 | 
5 | You should install colossalai from the **latest** main branch.
6 | 
7 | ---
8 | 
9 | ## How to run
10 | 
11 | On a single server, you can directly use torch.distributed to start pre-training on multiple GPUs in parallel. In Colossal-AI, we provide several launch methods to initialize the distributed backend. You can use `colossalai.launch` and `colossalai.get_default_parser` to pass the parameters via the command line. If you happen to use launchers such as SLURM, OpenMPI, or the PyTorch launch utility, you can use the matching `colossalai.launch_from_*` method, which reads the rank and world size directly from the environment variables for convenience.
12 | 
13 | Before running, you should `export DATA=/path/to/coco`.
14 | 
15 | In your terminal (the default `config.py` uses a tensor parallel size of 4, hence 4 processes):
16 | ```shell
17 | colossalai run --nproc_per_node 4 main.py --config config.py
18 | ```
19 | 
20 | ---
21 | 
22 | 
23 | ## Details
24 | `config.py`
25 | 
26 | Contains the configuration for DETR.
27 | 
28 | `main.py`
29 | 
30 | The engine is called through this file to start the training process using Colossal-AI.
31 | 
32 | `engine.py`
33 | 
34 | Implements the training and evaluation procedures for DETR.
35 | 
36 | `./datasets`
37 | 
38 | Dataset preprocessing.
39 | 
40 | `./models`
41 | 
42 | Model specifications of the DETR model, containing the Transformer and backbone implementations.
43 | 
44 | `./util`
45 | 
46 | Utilities used in DETR.
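
For reference, the snippet below is a minimal sketch of how an entry script like `main.py` can initialize the distributed backend. It mirrors the `get_default_parser`/`launch_from_torch` pattern used in the other examples in this repository; it is not a verbatim excerpt of `main.py`.

```python
# minimal launch sketch, assuming the config path is passed via --config
import colossalai


def main():
    parser = colossalai.get_default_parser()
    args = parser.parse_args()
    # reads rank and world size from the environment set up by the launcher
    colossalai.launch_from_torch(config=args.config)
    # ... build the model, dataloaders, and engine described above, then train ...


if __name__ == '__main__':
    main()
```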
-------------------------------------------------------------------------------- /image/detr/config.py: -------------------------------------------------------------------------------- 1 | BATCH_SIZE = 2 2 | LEARNING_RATE = 1e-4 3 | WEIGHT_DECAY = 1e-4 4 | 5 | TENSOR_PARALLEL_SIZE = 4 6 | TENSOR_PARALLEL_MODE = '1d' 7 | 8 | NUM_EPOCHS = 300 9 | lr_drop = 200 10 | clip_max_norm = 0.1 11 | 12 | # gradient_clipping = 0.1 13 | 14 | parallel = dict( 15 | pipeline=1, 16 | tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), 17 | ) 18 | 19 | cudnn_benchmark = False 20 | 21 | seed = 42 22 | 23 | LOG_PATH = f"./detr_{TENSOR_PARALLEL_MODE}_coco_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/" 24 | 25 | 26 | # find_unused_parameters = True 27 | 28 | coco_path = '/data/scratch/coco' 29 | save_ckpt_freq = 50 30 | lr_backbone = 1e-5 31 | device = 'cuda' 32 | lr_drop = 200 33 | backbone = 'resnet50' 34 | dilation = False 35 | position_embedding = 'sine' 36 | enc_layers = 6 37 | dec_layers = 6 38 | dim_feedforward = 2048 39 | hidden_dim = 256 40 | dropout = 0.1 41 | nheads = 8 42 | num_queries = 100 43 | masks = False 44 | set_cost_class = 1 45 | set_cost_bbox = 5 46 | set_cost_giou = 2 47 | mask_loss_coef = 1 48 | dice_loss_coef = 1 49 | bbox_loss_coef = 5 50 | giou_loss_coef = 2 51 | eos_coef = 0.1 52 | dataset_file = 'coco' 53 | remove_difficult = False 54 | output_dir = '' 55 | resume = '' 56 | start_epoch = 0 57 | eval = False 58 | num_workers = 2 59 | dist_url = 'env://' 60 | distributed = True 61 | aux_loss = True 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /image/detr/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved
2 | import torch.utils.data
3 | import torchvision
4 | 
5 | from .coco import build as build_coco
6 | 
7 | 
8 | def get_coco_api_from_dataset(dataset):
9 | for _ in range(10):
10 | # if isinstance(dataset, torchvision.datasets.CocoDetection):
11 | # break
12 | if isinstance(dataset, torch.utils.data.Subset):
13 | dataset = dataset.dataset
14 | if isinstance(dataset, torchvision.datasets.CocoDetection):
15 | return dataset.coco
16 | 
17 | 
18 | def build_dataset(image_set, args):
19 | if args.dataset_file == 'coco':
20 | return build_coco(image_set, args)
21 | if args.dataset_file == 'coco_panoptic':
22 | # to avoid making panopticapi required for coco
23 | from .coco_panoptic import build as build_coco_panoptic
24 | return build_coco_panoptic(image_set, args)
25 | raise ValueError(f'dataset {args.dataset_file} not supported')
26 | 
-------------------------------------------------------------------------------- /image/detr/models/__init__.py: --------------------------------------------------------------------------------
1 | from .detr import build
2 | 
3 | 
4 | def build_model(args):
5 | return build(args)
6 | 
-------------------------------------------------------------------------------- /image/detr/requirements.txt: --------------------------------------------------------------------------------
1 | colossalai
2 | torch >= 1.8.1
3 | 
-------------------------------------------------------------------------------- /image/detr/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/detr/util/__init__.py -------------------------------------------------------------------------------- /image/diffusion/requirements.txt: --------------------------------------------------------------------------------
1 | colossalai
2 | torch >= 1.8.1
3 | 
-------------------------------------------------------------------------------- /image/mae/.gitignore: --------------------------------------------------------------------------------
1 | ./data
2 | ./output -------------------------------------------------------------------------------- /image/mae/README.md: --------------------------------------------------------------------------------
1 | # Pretrain MAE on ImageNet 1000 (mini)
2 | 
3 | Colossal-AI implementation of MAE, [arXiv](https://arxiv.org/abs/2111.06377).
4 | 
5 | As an example, we just cover the pretrain phase with the ImageNet 1000
6 | (mini) dataset. Helpers under the subdir [util/](./util/) are from
7 | [facebookresearch/deit](https://github.com/facebookresearch/deit),
8 | under Apache License 2.0.
9 | 
10 | ## Prepare Dataset
11 | 
12 | In the script, we use the ImageNet 1000 (mini) dataset hosted on
13 | [Kaggle](https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000/discussion).
14 | 
15 | Download and extract the dataset, then set the environment
16 | variable `DATA`, or soft-link the data to the default location `{config_dir}/data`.
17 | 
18 | ```bash
19 | # example
20 | export DATA=/path/to/imagenet-mini/
21 | 
22 | # or link to default place
23 | ln -s /path/to/imagenet-mini/ ./data
24 | ```
25 | 
26 | ## Run single-GPU training
27 | 
28 | This example is developed and tested under PyTorch 1.10; use `torchrun`
29 | to run it:
30 | 
31 | ```bash
32 | torchrun --standalone --nnodes=1 --nproc_per_node 1 main_pretrain.py
33 | ```
34 | 
35 | It reads [./config/pretrain.py](./config/pretrain.py) as the startup
36 | configuration; feel free to check it if you want to fine-tune the model
37 | or get some insight.
38 | 
39 | By default, pretraining generates a series of checkpoints, named
40 | `./output/checkpoint-{epoch}.pth`.
41 | 
42 | 
43 | ## Run multi-GPU training
44 | 
45 | To run multi-GPU training on a single node, just change the `--nproc_per_node`
46 | parameter. For example, if `--nproc_per_node=4`, 4 GPUs on this machine will be
47 | used for training. However, to make sure the model converges well, you should
48 | adjust your batch size and learning rate accordingly.
49 | 
50 | 
51 | ## Tensor Parallel
52 | 
53 | The model in [models_mae_tp.py](./models_mae_tp.py) is modified to support 1D tensor parallelism.
54 | You can read about 1D tensor parallelism in [this documentation](https://www.colossalai.org/docs/features/1D_tensor_parallel).
55 | [./config/pretrain_1d_tp2.py](./config/pretrain_1d_tp2.py) is the 1D parallel configuration.
56 | 
57 | Pass the file path with the `--config` flag:
58 | 
59 | ```bash
60 | torchrun --standalone --nnodes 1 --nproc_per_node 2 main_pretrain.py --config ./config/pretrain_1d_tp2.py
61 | ```
62 | 
63 | We can also increase data parallelism by increasing `--nproc_per_node`:
64 | 
65 | ```bash
66 | torchrun --standalone --nnodes 1 --nproc_per_node 4 main_pretrain.py --config ./config/pretrain_1d_tp2.py
67 | ```
68 | 
69 | This will result in `data parallel size: 2, pipeline parallel size: 1, tensor parallel size: 2`.
70 | 
71 | 
72 | 
-------------------------------------------------------------------------------- /image/mae/config/pretrain.py: --------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | 
4 | from colossalai.amp import AMP_TYPE
5 | from torchvision import transforms
6 | 
7 | import util.misc as misc
8 | from util.crop import RandomResizedCrop
9 | 
10 | # ==== Colossal-AI Configuration ====
11 | 
12 | gradient_accumulation = 1
13 | fp16 = dict(mode=AMP_TYPE.TORCH)
14 | 
15 | # ==== Model Configuration ====
16 | #
17 | # Variable Naming Convention:
18 | #
19 | # 1. `THIS_WILL_BE_DIRECTLY_ACCESSED_BY_MAIN`: All capital.
20 | # eg: VERBOSE, LEARNING_RATE
21 | #
22 | # 2. `_THIS_WILL_BE_USED_TO_GENERATE_(1)`: Begin with underscore.
23 | # eg: _BASE_LEARNING_RATE
24 | #
25 | # 3. `this_is_a_simple_helper`: Snake case.
26 | # eg: eff_batch_size
27 | 
28 | # toggle more logging
29 | VERBOSE = False
30 | DEBUG = False
31 | 
32 | NUM_EPOCHS = 800
33 | # epochs to warmup LR
34 | WARMUP_EPOCHS = 40 if NUM_EPOCHS > 40 else 0
35 | 
36 | # Interval to save a checkpoint
37 | CHECKPOINT_INTERVAL = 20
38 | 
39 | # Batch size per GPU (effective batch size is BATCH_SIZE * gradient_accumulation * number of GPUs)
40 | BATCH_SIZE = 4
41 | 
42 | # Place to save pretrained model
43 | OUTPUT_DIR = Path(__file__).parent.parent / "output"
44 | OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
45 | 
46 | # Masking ratio (percentage of removed patches).
47 | MASK_RATIO = 0.75
48 | 
49 | # learning rate (absolute lr); comment this out to derive it from _BASE_LEARNING_RATE below
50 | LEARNING_RATE = 0.01
51 | # lower lr bound for cyclic schedulers that hit 0
52 | MINIMUM_LEARNING_RATE = 0
53 | # base learning rate: absolute_lr = base_lr * total_batch_size / 256
54 | _BASE_LEARNING_RATE = 1e-3
55 | try:
56 | LEARNING_RATE
57 | except NameError:
58 | eff_batch_size = BATCH_SIZE * gradient_accumulation * misc.get_world_size()
59 | LEARNING_RATE = _BASE_LEARNING_RATE * eff_batch_size / 256
60 | 
61 | WEIGHT_DECAY = 0.5
62 | 
63 | # Use (per-patch) normalized pixels as targets for computing loss
64 | NORM_PIX_LOSS = True
65 | 
66 | # resume from checkpoint
67 | RESUME = False
68 | if RESUME:
69 | RESUME_ADDRESS = ""
70 | 
71 | TRANSFORM_TRAIN = transforms.Compose(
72 | [
73 | RandomResizedCrop(224, interpolation=3),
74 | transforms.RandomHorizontalFlip(),
75 | transforms.ToTensor(),
76 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
77 | ]
78 | )
79 | 
80 | TRANSFORM_VAL = transforms.Compose(
81 | [
82 | transforms.Resize(256, interpolation=3),
83 | transforms.CenterCrop(224),
84 | transforms.ToTensor(),
85 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
86 | ]
87 | )
88 | 
89 | # ==== Dynamic Configuration ====
90 | 
91 | try:
92 | DATAPATH = Path(os.environ["DATA"])
93 | except KeyError:
94 | DATAPATH = Path(__file__).parent.parent / "data"
95 | 
-------------------------------------------------------------------------------- /image/mae/requirements.txt: --------------------------------------------------------------------------------
1 | colossalai
2 | torch >= 1.8.1
3 | 
-------------------------------------------------------------------------------- /image/mae/util/crop.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 | 
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | 
7 | import math
8 | 
9 | import torch
10 | 
11 | from torchvision import transforms
12 | from torchvision.transforms import functional as F
13 | 
14 | 
15 | class RandomResizedCrop(transforms.RandomResizedCrop):
16 | """
17 | RandomResizedCrop for matching TF/TPU implementation: no for-loop is used.
18 | This may lead to results different from torchvision's version.
19 | Following BYOL's TF code:
20 | https://github.com/deepmind/deepmind-research/blob/master/byol/utils/dataset.py#L206
21 | """
22 | 
23 | @staticmethod
24 | def get_params(img, scale, ratio):
25 | width, height = F.get_image_size(img)
26 | area = height * width
27 | 
28 | target_area = area * torch.empty(1).uniform_(scale[0], scale[1]).item()
29 | log_ratio = torch.log(torch.tensor(ratio))
30 | aspect_ratio = torch.exp(
31 | torch.empty(1).uniform_(log_ratio[0], log_ratio[1])
32 | ).item()
33 | 
34 | w = int(round(math.sqrt(target_area * aspect_ratio)))
35 | h = int(round(math.sqrt(target_area / aspect_ratio)))
36 | 
37 | w = min(w, width)
38 | h = min(h, height)
39 | 
40 | i = torch.randint(0, height - h + 1, size=(1,)).item()
41 | j = torch.randint(0, width - w + 1, size=(1,)).item()
42 | 
43 | return i, j, h, w
44 | 
-------------------------------------------------------------------------------- /image/mlpmixer/README.md: --------------------------------------------------------------------------------
1 | # ColossalAI_MlpMixer
2 | This project is the reproduction of the MlpMixer model with the ColossalAI tool.
3 | 
4 | # Result
5 | 
6 | | Task | Model | Training Time | Top-1 Accuracy |
7 | | ------------- |:-------------:| -----:| -----:|
8 | | CIFAR10 |ColossalAI_MlpMixer | ~ 30 min | ~ 89.42% |
9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 
20 | ## Environment setup
21 | ```
22 | git clone https://github.com/hpcaitech/ColossalAI.git
23 | cd ColossalAI
24 | # install dependency
25 | pip install -r requirements/requirements.txt
26 | 
27 | # install colossalai
28 | pip install .
29 | ```
30 | 
31 | ## Usage
32 | 
33 | To start training, use the following command to run each worker:
34 | ```
35 | $ DATA=/path/to/dataset python train_data.py --world_size=WORLD_SIZE \
36 | --rank=RANK \
37 | --local_rank=LOCAL_RANK \
38 | --host=MASTER_IP_ADDRESS \
39 | --port=MASTER_PORT \
40 | --config=CONFIG_FILE
41 | ```
42 | It is also recommended to start training with `torchrun` as:
43 | 
44 | ```
45 | $ DATA=/path/to/dataset torchrun --nproc_per_node=NUM_GPUS_PER_NODE \
46 | --nnodes=NUM_NODES \
47 | --node_rank=NODE_RANK \
48 | --master_addr=MASTER_IP_ADDRESS \
49 | --master_port=MASTER_PORT \
50 | train_data.py --config=CONFIG_FILE
51 | ```
52 | For the pipeline parallelism, use the following command to run each worker:
53 | 
54 | ```
55 | $ DATA=/path/to/dataset torchrun --nproc_per_node=NUM_GPUS_PER_NODE \
56 | --nnodes=NUM_NODES \
57 | --node_rank=NODE_RANK \
58 | --master_addr=MASTER_IP_ADDRESS \
59 | --master_port=MASTER_PORT \
60 | train_pipline.py
61 | ```
62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 
70 | ## Cite us
71 | ```
72 | @article{bian2021colossal,
73 | title={Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training},
74 | author={Bian, Zhengda and Liu, Hongxin and Wang, Boxiang and Huang, Haichen and Li, Yongbin and Wang, Chuanrui and Cui, Fan and You, Yang},
75 | journal={arXiv preprint arXiv:2110.14883},
76 | year={2021}
77 | }
78 | ```
79 | 
-------------------------------------------------------------------------------- /image/mlpmixer/configs/MlpMixer_vanilla.py: --------------------------------------------------------------------------------
1 | BATCH_SIZE = 512
2 | LEARNING_RATE = 2e-3
3 | WEIGHT_DECAY = 3e-2
4 | 
5 | TENSOR_PARALLEL_SIZE = 1
6 | TENSOR_PARALLEL_MODE = None
7 | 
8 | NUM_EPOCHS = 200
9 | WARMUP_EPOCHS = 40
10 | 
11 | parallel = dict(
12 | pipeline=1,
13 | tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
14 | )
15 | 
16 | seed = 42
17 | 
18 | LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"
19 | 
-------------------------------------------------------------------------------- /image/mlpmixer/requirements.txt: --------------------------------------------------------------------------------
1 | torch>=1.8
2 | torchvision>=0.9
3 | numpy
4 | tqdm
5 | psutil
6 | tensorboard
7 | packaging
8 | colossalai -------------------------------------------------------------------------------- /image/moe/README.md: --------------------------------------------------------------------------------
1 | # Overview
2 | 
3 | MoE is a technique to enlarge neural networks while keeping the training throughput roughly constant.
4 | It is designed to improve the performance of our models without an additional time penalty. Our old
5 | version of MoE parallelism caused moderate computation overhead and additional memory usage, but
6 | we are happy to announce that the recently enabled CUDA kernels have solved these problems. There
7 | are only two things you need to be concerned about. One is the additional communication time, which
8 | depends heavily on the topology and bandwidth of the network in the running environment. The other is extra memory usage,
9 | since MoE gives us a larger model. We will continuously maintain and optimize our MoE system,
10 | and we welcome any issue that can help us improve it.
11 | 
12 | At present, we have provided WideNet and ViT-MoE in our model zoo (more information about WideNet can be
13 | found [here](https://arxiv.org/abs/2107.11817)). We now support a recent technique proposed by Microsoft, PR-MoE.
14 | See [this paper](https://arxiv.org/abs/2201.05596) to learn more about PR-MoE.
15 | Directly use ViT-MoE from our model zoo, or use MoeModule in your own model, to exploit PR-MoE.
16 | 
17 | Here is a simple example of how to run ViT-MoE Lite6 with PR-MoE on CIFAR10.
18 | 
19 | # How to run
20 | 
21 | Before running this training script, you must set an environment variable called `DATA` pointing to where you place
22 | (or want to place) the CIFAR10 data.
23 | 
24 | ```shell
25 | export DATA=
26 | ```
27 | 
28 | On a single server, you can directly use torchrun to start pre-training on multiple GPUs in parallel.
29 | If you use the script here to train, just follow the instruction below in your terminal. `nproc_per_node` is the
30 | number of processes, which commonly equals the number of GPUs.
31 | 
32 | ```shell
33 | torchrun --nnodes=1 --nproc_per_node=8 train.py \
34 | --config ./config.py
35 | ```
36 | 
37 | If you want to use multiple servers, please check our document about environment initialization.
38 | 
39 | Make sure to initialize the MoE running environment with `moe_set_seed` before building the model.
40 | 
41 | # Result
42 | 
43 | The best evaluation accuracy while training ViT-MoE Lite6 on CIFAR10 from scratch is 90.66%, which is better than the average
44 | performance of training ViT Lite7. The result can be improved by data augmentations such as Mixup and RandAug.
45 | We will offer those training scripts soon.
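
To make the routing idea concrete, here is a toy, single-device top-1 gating layer written in plain PyTorch. It only illustrates the expert-routing concept; it is not Colossal-AI's `MoeModule` or its PR-MoE implementation, and the class name, shapes, and expert architecture are our own assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class ToyTop1MoE(nn.Module):
    """A toy mixture-of-experts layer with top-1 gating (illustration only)."""

    def __init__(self, dim: int, num_experts: int = 4):
        super().__init__()
        # the gate scores each token against every expert
        self.gate = nn.Linear(dim, num_experts)
        # each expert is a small feed-forward block
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))
            for _ in range(num_experts)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (num_tokens, dim)
        scores = F.softmax(self.gate(x), dim=-1)    # (num_tokens, num_experts)
        weight, expert_idx = scores.max(dim=-1)     # top-1 routing decision per token
        out = torch.zeros_like(x)
        for i, expert in enumerate(self.experts):
            mask = expert_idx == i                  # tokens routed to expert i
            if mask.any():
                out[mask] = weight[mask, None] * expert(x[mask])
        return out


if __name__ == '__main__':
    moe = ToyTop1MoE(dim=64)
    tokens = torch.randn(16, 64)
    print(moe(tokens).shape)  # torch.Size([16, 64])
```

In a real MoE system, the per-expert dispatch is fused into CUDA kernels and experts can live on different devices, which is where the extra communication time discussed above comes from.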
-------------------------------------------------------------------------------- /image/moe/config.py: --------------------------------------------------------------------------------
1 | BATCH_SIZE = 512
2 | LEARNING_RATE = 2e-3
3 | WEIGHT_DECAY = 3e-2
4 | 
5 | NUM_EPOCHS = 200
6 | WARMUP_EPOCHS = 40
7 | 
8 | parallel = dict()
9 | max_ep_size = 1 # all experts are replicated in the case that the user only has 1 GPU
10 | clip_grad_norm = 1.0 # enable gradient clipping and set it to 1.0
11 | 
12 | LOG_PATH = "./cifar10_moe"
13 | 
-------------------------------------------------------------------------------- /image/moe/requirements.txt: --------------------------------------------------------------------------------
1 | colossalai
2 | torch >= 1.8.1
3 | 
-------------------------------------------------------------------------------- /image/resnet/README.md: --------------------------------------------------------------------------------
1 | # Train ResNet on CIFAR10
2 | 
3 | ## Prepare Dataset
4 | 
5 | We use the CIFAR10 dataset in this example. The dataset will be downloaded to `./data` by default.
6 | If you wish to use a customized directory for the dataset, you can set the environment variable `DATA` via the following command.
7 | 
8 | ```bash
9 | export DATA=/path/to/data
10 | ```
11 | 
12 | 
13 | ## Run single-GPU training
14 | 
15 | We provide two examples of training ResNet-18 on the CIFAR10 dataset. You can choose other ResNet models in `resnet.py` as well.
16 | You can change the value of `nproc_per_node` to adjust the number of GPUs used for training.
17 | When `nproc_per_node` is changed, you may need to adjust the learning rate and batch size in `config.py` accordingly.
18 | Normally we follow the linear scaling rule, i.e. `new_global_batch_size / new_learning_rate = old_global_batch_size / old_learning_rate`; for example, scaling the global batch size from 128 to 512 means scaling the learning rate up by the same 4x factor.
19 | 
20 | ```bash
21 | # with engine
22 | colossalai run --nproc_per_node 1 train.py
23 | 
24 | # with trainer
25 | colossalai run --nproc_per_node 1 train.py --use_trainer
26 | ```
27 | 
28 | ## Experiment Results
29 | 
30 | | model | dataset | Testing Accuracy |
31 | | - | - | - |
32 | | ResNet18 | CIFAR10 | 95.2% |
33 | 
-------------------------------------------------------------------------------- /image/resnet/auto_parallel/README.md: --------------------------------------------------------------------------------
1 | # Train ResNet on CIFAR10 with auto_parallel
2 | 
3 | ## Prepare Dataset
4 | 
5 | We use the CIFAR10 dataset in this example. The dataset will be downloaded to `./data` by default.
6 | If you wish to use a customized directory for the dataset, you can set the environment variable `DATA` via the following command.
7 | 8 | ```bash 9 | export DATA=/path/to/data 10 | ``` 11 | 12 | 13 | ## Run on 2*2 device mesh 14 | 15 | ```bash 16 | colossalai run --nproc_per_node 4 auto_parallel_demo.py 17 | ``` -------------------------------------------------------------------------------- /image/resnet/config.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | BATCH_SIZE = 128 4 | NUM_EPOCHS = 200 5 | 6 | CONFIG = dict(fp16=dict(mode=AMP_TYPE.TORCH)) 7 | -------------------------------------------------------------------------------- /image/resnet/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /image/simclr/NT_Xentloss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from colossalai.registry import LOSSES 5 | from torch.nn.modules.linear import Linear 6 | 7 | @LOSSES.register_module 8 | class NT_Xentloss(nn.Module): 9 | def __init__(self, temperature=0.5): 10 | super().__init__() 11 | self.temperature = temperature 12 | 13 | def forward(self, z1, z2, label): 14 | z1 = F.normalize(z1, dim=1) 15 | z2 = F.normalize(z2, dim=1) 16 | N, Z = z1.shape 17 | device = z1.device 18 | representations = torch.cat([z1, z2], dim=0) 19 | similarity_matrix = F.cosine_similarity(representations.unsqueeze(1), representations.unsqueeze(0), dim=-1) 20 | l_pos = torch.diag(similarity_matrix, N) 21 | r_pos = torch.diag(similarity_matrix, -N) 22 | positives = torch.cat([l_pos, r_pos]).view(2 * N, 1) 23 | diag = torch.eye(2*N, dtype=torch.bool, device=device) 24 | diag[N:,:N] = diag[:N,N:] = diag[:N,:N] 25 | 26 | negatives = similarity_matrix[~diag].view(2*N, -1) 27 | 28 | logits = torch.cat([positives, negatives], dim=1) 29 | logits /= self.temperature 30 | 31 | labels = torch.zeros(2*N, device=device, dtype=torch.int64) 32 | 33 | loss = F.cross_entropy(logits, labels, reduction='sum') 34 | return loss / (2 * N) 35 | 36 | 37 | if __name__=='__main__': 38 | criterion = NT_Xentloss() 39 | net = Linear(256,512) 40 | output = [net(torch.randn(512,256)), net(torch.randn(512,256))] 41 | label = [torch.randn(512)] 42 | loss = criterion(*output, *label) 43 | print(loss) 44 | 45 | -------------------------------------------------------------------------------- /image/simclr/augmentation.py: -------------------------------------------------------------------------------- 1 | from torchvision.transforms import transforms 2 | 3 | class SimCLRTransform(): 4 | def __init__(self): 5 | self.transform = transforms.Compose([ 6 | transforms.RandomResizedCrop(size=32, scale=(0.2, 1.0)), 7 | transforms.RandomHorizontalFlip(), 8 | transforms.RandomApply([transforms.ColorJitter(0.8, 0.8, 0.8, 0.2)], p=0.8), 9 | transforms.RandomGrayscale(p=0.2), 10 | transforms.RandomApply([transforms.GaussianBlur(kernel_size=32//20*2+1, sigma=(0.1, 2.0))], p=0.5), 11 | transforms.ToTensor(), 12 | transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]) 13 | ]) 14 | 15 | def __call__(self, x): 16 | x1 = self.transform(x) 17 | x2 = self.transform(x) 18 | return x1, x2 19 | 20 | 21 | class LeTransform(): 22 | def __init__(self): 23 | self.transform = transforms.Compose([ 24 | transforms.RandomResizedCrop(size=32, scale=(0.2, 1.0)), 25 | transforms.RandomHorizontalFlip(), 26 | 
transforms.ToTensor(), 27 | transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]) 28 | ]) 29 | 30 | def __call__(self, x): 31 | x = self.transform(x) 32 | return x -------------------------------------------------------------------------------- /image/simclr/config.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | 4 | LOG_NAME = 'cifar-simclr' 5 | 6 | BATCH_SIZE = 512 7 | NUM_EPOCHS = 801 8 | LEARNING_RATE = 0.03 * BATCH_SIZE / 256 9 | WEIGHT_DECAY = 0.0005 10 | MOMENTUM = 0.9 11 | 12 | 13 | fp16 = dict( 14 | mode=AMP_TYPE.TORCH, 15 | ) 16 | 17 | dataset = dict( 18 | root='./dataset', 19 | ) 20 | 21 | gradient_accumulation = 2 22 | clip_grad_norm = 1.0 23 | -------------------------------------------------------------------------------- /image/simclr/le_config.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | 4 | LOG_NAME = 'cifar-simclr' 5 | EPOCH = 800 6 | 7 | BATCH_SIZE = 512 8 | NUM_EPOCHS = 51 9 | LEARNING_RATE = 0.03*BATCH_SIZE/256 10 | WEIGHT_DECAY = 0.0005 11 | MOMENTUM = 0.9 12 | 13 | 14 | fp16 = dict( 15 | mode=AMP_TYPE.TORCH, 16 | ) 17 | 18 | dataset = dict( 19 | root='./dataset', 20 | ) 21 | 22 | gradient_accumulation = 1 23 | clip_grad_norm = 1.0 24 | -------------------------------------------------------------------------------- /image/simclr/models/linear_eval.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .Backbone import backbone 5 | 6 | class Linear_eval(nn.Module): 7 | 8 | def __init__(self, model='resnet18', class_num=10, **kwargs): 9 | super().__init__() 10 | 11 | self.backbone = backbone(model, **kwargs) 12 | self.backbone.requires_grad_(False) 13 | self.fc = nn.Linear(self.backbone.output_dim, class_num) 14 | 15 | def forward(self, x): 16 | 17 | out = self.backbone(x) 18 | out = self.fc(out) 19 | return out 20 | -------------------------------------------------------------------------------- /image/simclr/models/simclr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .Backbone import backbone 5 | 6 | class projection_MLP(nn.Module): 7 | def __init__(self, in_dim, out_dim=256): 8 | super().__init__() 9 | hidden_dim = in_dim 10 | self.layer1 = nn.Sequential( 11 | nn.Linear(in_dim, hidden_dim), 12 | nn.ReLU(inplace=True) 13 | ) 14 | self.layer2 = nn.Linear(hidden_dim, out_dim) 15 | def forward(self, x): 16 | x = self.layer1(x) 17 | x = self.layer2(x) 18 | return x 19 | 20 | class SimCLR(nn.Module): 21 | 22 | def __init__(self, model='resnet18', **kwargs): 23 | super().__init__() 24 | 25 | self.backbone = backbone(model, **kwargs) 26 | self.projector = projection_MLP(self.backbone.output_dim) 27 | self.encoder = nn.Sequential( 28 | self.backbone, 29 | self.projector 30 | ) 31 | 32 | def forward(self, x1, x2): 33 | 34 | z1 = self.encoder(x1) 35 | z2 = self.encoder(x2) 36 | return z1, z2 -------------------------------------------------------------------------------- /image/simclr/myhooks.py: -------------------------------------------------------------------------------- 1 | from colossalai.trainer.hooks import BaseHook 2 | from colossalai.core import global_context as gpc 3 | from colossalai.context import ParallelMode 4 | from 
colossalai.logging import get_dist_logger 5 | 6 | 7 | class TotalBatchsizeHook(BaseHook): 8 | def __init__(self, priority: int = 2) -> None: 9 | super().__init__(priority) 10 | self.logger = get_dist_logger() 11 | 12 | def before_train(self, trainer): 13 | total_batch_size = gpc.config.BATCH_SIZE * \ 14 | gpc.config.gradient_accumulation * gpc.get_world_size(ParallelMode.DATA) 15 | self.logger.info(f'Total batch size = {total_batch_size}', ranks=[0]) -------------------------------------------------------------------------------- /image/simclr/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /image/simclr/results/embedding.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/simclr/results/embedding.npz -------------------------------------------------------------------------------- /image/simclr/results/linear_eval_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/simclr/results/linear_eval_acc.png -------------------------------------------------------------------------------- /image/simclr/results/linear_eval_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/simclr/results/linear_eval_loss.png -------------------------------------------------------------------------------- /image/simclr/results/ssl_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/simclr/results/ssl_loss.png -------------------------------------------------------------------------------- /image/simclr/results/test_tsne.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/simclr/results/test_tsne.png -------------------------------------------------------------------------------- /image/simclr/results/train_tsne.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/simclr/results/train_tsne.png -------------------------------------------------------------------------------- /image/simclr/train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | ## phase 1: self-supervised training 4 | python -m torch.distributed.launch --nproc_per_node 1 --master_addr localhost --master_port 29500 train_simclr.py 5 | 6 | ## phase 2: linear evaluation 7 | python -m torch.distributed.launch --nproc_per_node 1 --master_addr localhost --master_port 29500 train_linear.py -------------------------------------------------------------------------------- /image/vilt/.gitignore: -------------------------------------------------------------------------------- 1 | ckpt/ 2 | logs/ 
-------------------------------------------------------------------------------- /image/vilt/README.md: --------------------------------------------------------------------------------
1 | # Train ViLT on COCO dataset with Colossal-AI
2 | 
3 | Colossal-AI implementation for the ICML 2021 (long talk) paper: "[ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334)"
4 | 
5 | ---
6 | 9 | 10 | 11 | 
12 | ## Prepare Environment
13 | ```bash
14 | pip install -r requirements.txt
15 | ```
16 | 
17 | ## Prepare Dataset
18 | In this example, we use the COCO Captions (COCO) dataset.
19 | 
20 | ```bash
21 | bash prepare_dataset.sh
22 | ```
23 | 
24 | ## Train Masked Language Modeling (MLM) Models
25 | 
26 | ```bash
27 | bash run.sh
28 | 
29 | # e.g.
30 | 
31 | bash run.sh /vilt_data 4
32 | ```
33 | 
34 | 
35 | ## Citation
36 | If you use any part of this code and pretrained weights for your own purpose, please cite the original [paper](https://arxiv.org/abs/2102.03334).
37 | ```
38 | @InProceedings{pmlr-v139-kim21k,
39 | title = {ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision},
40 | author = {Kim, Wonjae and Son, Bokyung and Kim, Ildoo},
41 | booktitle = {Proceedings of the 38th International Conference on Machine Learning},
42 | pages = {5583--5594},
43 | year = {2021},
44 | editor = {Meila, Marina and Zhang, Tong},
45 | volume = {139},
46 | series = {Proceedings of Machine Learning Research},
47 | month = {18--24 Jul},
48 | publisher = {PMLR},
49 | pdf = {http://proceedings.mlr.press/v139/kim21k/kim21k.pdf},
50 | url = {http://proceedings.mlr.press/v139/kim21k.html},
51 | abstract = {Vision-and-Language Pre-training (VLP) has improved performance on various joint vision-and-language downstream tasks. Current approaches to VLP heavily rely on image feature extraction processes, most of which involve region supervision (e.g., object detection) and the convolutional architecture (e.g., ResNet). Although disregarded in the literature, we find it problematic in terms of both (1) efficiency/speed, that simply extracting input features requires much more computation than the multimodal interaction steps; and (2) expressive power, as it is upper bounded to the expressive power of the visual embedder and its predefined visual vocabulary. In this paper, we present a minimal VLP model, Vision-and-Language Transformer (ViLT), monolithic in the sense that the processing of visual inputs is drastically simplified to just the same convolution-free manner that we process textual inputs. We show that ViLT is up to tens of times faster than previous VLP models, yet with competitive or better downstream task performance.
Our code and pre-trained weights are available at https://github.com/dandelin/vilt.} 52 | } 53 | ``` 54 | 55 | 56 | -------------------------------------------------------------------------------- /image/vilt/configs.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | BATCH_SIZE = 256 4 | DROP_RATE = 0.1 5 | NUM_EPOCHS = 10 6 | 7 | fp16 = dict( 8 | mode=AMP_TYPE.TORCH, 9 | ) 10 | 11 | gradient_accumulation = 16 12 | gradient_clipping = 1.0 13 | 14 | parallel = dict( 15 | tensor=dict(size=2, mode='1d'), 16 | ) 17 | num_epochs = 10 18 | 19 | # config logging path 20 | logging = dict( 21 | root_path='./logs' 22 | ) -------------------------------------------------------------------------------- /image/vilt/prepare_dataset.sh: -------------------------------------------------------------------------------- 1 | WORKSPACE=$(pwd) 2 | 3 | RAW_ROOT=$1 4 | 5 | ARROW_ROOT=$RAW_ROOT/arrow 6 | 7 | if [ -z $RAW_ROOT ] 8 | then 9 | echo "Usage: $0 " 10 | exit 1 11 | fi 12 | 13 | if [ ! -e $ARROW_ROOT ] 14 | then 15 | mkdir $ARROW_ROOT 16 | fi 17 | 18 | if [ -e $RAW_ROOT ] 19 | then 20 | cd $RAW_ROOT 21 | else 22 | mkdir $RAW_ROOT 23 | fi 24 | 25 | 26 | 27 | 28 | # download all files 29 | wget http://images.cocodataset.org/zips/train2014.zip 30 | wget http://images.cocodataset.org/zips/val2014.zip 31 | wget https://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip 32 | 33 | # unzip all files 34 | unzip train2014.zip -d $RAW_ROOT/train2014 35 | unzip val2014.zip -d $RAW_ROOT/val2014 36 | unzip caption_datasets.zip -d $RAW_ROOT/karpathy 37 | 38 | # remove all files 39 | rm train2014.zip 40 | rm val2014.zip 41 | rm caption_datasets.zip 42 | 43 | # converting the dataset 44 | cd $WORKSPACE 45 | python utils/makearrow.py $RAW_ROOT $ARROW_ROOT 46 | -------------------------------------------------------------------------------- /image/vilt/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.2.1 2 | Pillow==8.2.0 3 | tqdm==4.56.0 4 | ipdb==0.13.4 5 | numpy==1.19.5 6 | einops==0.3.0 7 | pyarrow==2.0.0 8 | sacred==0.8.2 9 | pandas==1.1.5 10 | colossalai 11 | git+https://github.com/rwightman/pytorch-image-models.git -------------------------------------------------------------------------------- /image/vilt/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORK_DIR=$(pwd) 4 | DATA_ROOT=$1/arrow 5 | NUM_GPUS=$2 6 | 7 | if [ -z $DATA_ROOT ] || [ -z $NUM_GPUS ] 8 | then 9 | echo "Usage: $0 " 10 | exit 1 11 | fi 12 | 13 | cd $WORK_DIR 14 | 15 | if ! 
[ -x "$(command -v mpirun)" ] 16 | then 17 | torchrun --nproc_per_node $NUM_GPUS --master_addr localhost --master_port 11455 run.py 18 | else 19 | mpirun -np $NUM_GPUS python run.py with data_root=$DATA_ROOT num_gpus=$NUM_GPUS num_nodes=1 task_mlm_itm_s step200k per_gpu_batchsize=96 20 | fi 21 | -------------------------------------------------------------------------------- /image/vilt/schedule.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import torch 3 | 4 | class viltSchedule(colossalai.engine.schedule.NonPipelineSchedule): 5 | @staticmethod 6 | def _call_engine_criterion(engine, outputs, labels): 7 | # assert isinstance(outputs, (torch.Tensor, list, tuple) 8 | # ), f'Expect output of model is (torch.Tensor, list, tuple), got {type(outputs)}' 9 | if isinstance(outputs, torch.Tensor): 10 | outputs = (outputs, ) 11 | if isinstance(labels, torch.Tensor): 12 | return engine.criterion(*outputs, labels) 13 | else: 14 | return engine.criterion(outputs) 15 | 16 | def __init__(self,batch_data_process_func) -> None: 17 | super().__init__(batch_data_process_func) 18 | @staticmethod 19 | def _call_engine(engine, inputs): 20 | return engine(inputs) 21 | -------------------------------------------------------------------------------- /image/vilt/utils/heads.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from transformers.models.bert.modeling_bert import BertPredictionHeadTransform 6 | 7 | 8 | class Pooler(nn.Module): 9 | def __init__(self, hidden_size): 10 | super().__init__() 11 | self.dense = nn.Linear(hidden_size, hidden_size) 12 | self.activation = nn.Tanh() 13 | 14 | def forward(self, hidden_states): 15 | first_token_tensor = hidden_states[:, 0] 16 | pooled_output = self.dense(first_token_tensor) 17 | pooled_output = self.activation(pooled_output) 18 | return pooled_output 19 | 20 | 21 | class ITMHead(nn.Module): 22 | def __init__(self, hidden_size): 23 | super().__init__() 24 | self.fc = nn.Linear(hidden_size, 2) 25 | 26 | def forward(self, x): 27 | x = self.fc(x) 28 | return x 29 | 30 | 31 | class MLMHead(nn.Module): 32 | def __init__(self, config, weight=None): 33 | super().__init__() 34 | self.transform = BertPredictionHeadTransform(config) 35 | self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 36 | self.bias = nn.Parameter(torch.zeros(config.vocab_size)) 37 | if weight is not None: 38 | self.decoder.weight = weight 39 | 40 | def forward(self, x): 41 | x = self.transform(x) 42 | x = self.decoder(x) + self.bias 43 | return x 44 | 45 | 46 | class MPPHead(nn.Module): 47 | def __init__(self, config): 48 | super().__init__() 49 | self.transform = BertPredictionHeadTransform(config) 50 | self.decoder = nn.Linear(config.hidden_size, 256 * 3) 51 | 52 | def forward(self, x): 53 | x = self.transform(x) 54 | x = self.decoder(x) 55 | return x 56 | -------------------------------------------------------------------------------- /image/vilt/utils/makearrow.py: -------------------------------------------------------------------------------- 1 | from write_coco_karpathy import make_arrow 2 | import sys 3 | 4 | make_arrow(sys.argv[1], sys.argv[2]) -------------------------------------------------------------------------------- /image/vilt/utils/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .pixelbert import ( 2 | 
pixelbert_transform, 3 | pixelbert_transform_randaug, 4 | ) 5 | 6 | _transforms = { 7 | "pixelbert": pixelbert_transform, 8 | "pixelbert_randaug": pixelbert_transform_randaug, 9 | } 10 | 11 | 12 | def keys_to_transforms(keys: list, size=224): 13 | return [_transforms[key](size=size) for key in keys] 14 | -------------------------------------------------------------------------------- /image/vilt/utils/transforms/pixelbert.py: -------------------------------------------------------------------------------- 1 | from .utils import ( 2 | inception_normalize, 3 | MinMaxResize, 4 | ) 5 | from torchvision import transforms 6 | from .randaug import RandAugment 7 | 8 | 9 | def pixelbert_transform(size=800): 10 | longer = int((1333 / 800) * size) 11 | return transforms.Compose( 12 | [ 13 | MinMaxResize(shorter=size, longer=longer), 14 | transforms.ToTensor(), 15 | inception_normalize, 16 | ] 17 | ) 18 | 19 | 20 | def pixelbert_transform_randaug(size=800): 21 | longer = int((1333 / 800) * size) 22 | trs = transforms.Compose( 23 | [ 24 | MinMaxResize(shorter=size, longer=longer), 25 | transforms.ToTensor(), 26 | inception_normalize, 27 | ] 28 | ) 29 | trs.transforms.insert(0, RandAugment(2, 9)) 30 | return trs 31 | -------------------------------------------------------------------------------- /image/vilt/utils/transforms/utils.py: -------------------------------------------------------------------------------- 1 | from torchvision import transforms 2 | from PIL import Image 3 | 4 | 5 | class MinMaxResize: 6 | def __init__(self, shorter=800, longer=1333): 7 | self.min = shorter 8 | self.max = longer 9 | 10 | def __call__(self, x): 11 | w, h = x.size 12 | scale = self.min / min(w, h) 13 | if h < w: 14 | newh, neww = self.min, scale * w 15 | else: 16 | newh, neww = scale * h, self.min 17 | 18 | if max(newh, neww) > self.max: 19 | scale = self.max / max(newh, neww) 20 | newh = newh * scale 21 | neww = neww * scale 22 | 23 | newh, neww = int(newh + 0.5), int(neww + 0.5) 24 | newh, neww = newh // 32 * 32, neww // 32 * 32 25 | 26 | return x.resize((neww, newh), resample=Image.BICUBIC) 27 | 28 | 29 | class UnNormalize(object): 30 | def __init__(self, mean, std): 31 | self.mean = mean 32 | self.std = std 33 | 34 | def __call__(self, tensor): 35 | """ 36 | Args: 37 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 38 | Returns: 39 | Tensor: Normalized image. 
40 | """
41 | for t, m, s in zip(tensor, self.mean, self.std):
42 | t.mul_(s).add_(m)
43 | # The normalize code -> t.sub_(m).div_(s)
44 | return tensor
45 | 
46 | 
47 | # This is a simple max-entropy normalization, as performed in the Inception paper
48 | inception_normalize = transforms.Compose(
49 | [transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])]
50 | )
51 | 
52 | # ViT uses simple non-biased inception normalization
53 | # https://github.com/google-research/vision_transformer/blob/master/vit_jax/input_pipeline.py#L132
54 | inception_unnormalize = transforms.Compose(
55 | [UnNormalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])]
56 | )
57 | 
-------------------------------------------------------------------------------- /image/vilt/utils/write_coco_karpathy.py: --------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import pandas as pd
4 | import pyarrow as pa
5 | import random
6 | 
7 | from tqdm import tqdm
8 | from glob import glob
9 | from collections import defaultdict
10 | 
11 | 
12 | def path2rest(path, iid2captions, iid2split):
13 | name = path.split("/")[-1]
14 | with open(path, "rb") as fp:
15 | binary = fp.read()
16 | captions = iid2captions[name]
17 | split = iid2split[name]
18 | return [binary, captions, name, split]
19 | 
20 | 
21 | def make_arrow(root, dataset_root):
22 | with open(f"{root}/karpathy/dataset_coco.json", "r") as fp:
23 | captions = json.load(fp)
24 | 
25 | captions = captions["images"]
26 | 
27 | iid2captions = defaultdict(list)
28 | iid2split = dict()
29 | 
30 | for cap in tqdm(captions):
31 | filename = cap["filename"]
32 | iid2split[filename] = cap["split"]
33 | for c in cap["sentences"]:
34 | iid2captions[filename].append(c["raw"])
35 | 
36 | paths = list(glob(f"{root}/train2014/*.jpg")) + list(glob(f"{root}/val2014/*.jpg"))
37 | random.shuffle(paths)
38 | caption_paths = [path for path in paths if path.split("/")[-1] in iid2captions]
39 | 
40 | if len(paths) == len(caption_paths):
41 | print("all images have caption annotations")
42 | else:
43 | print("not all images have caption annotations")
44 | print(
45 | len(paths), len(caption_paths), len(iid2captions),
46 | )
47 | 
48 | bs = [path2rest(path, iid2captions, iid2split) for path in tqdm(caption_paths)]
49 | 
50 | for split in ["train", "val", "restval", "test"]:
51 | batches = [b for b in bs if b[-1] == split]
52 | 
53 | dataframe = pd.DataFrame(
54 | batches, columns=["image", "caption", "image_id", "split"],
55 | )
56 | 
57 | table = pa.Table.from_pandas(dataframe)
58 | os.makedirs(dataset_root, exist_ok=True)
59 | with pa.OSFile(
60 | f"{dataset_root}/coco_caption_karpathy_{split}.arrow", "wb"
61 | ) as sink:
62 | with pa.RecordBatchFileWriter(sink, table.schema) as writer:
63 | writer.write_table(table)
64 | 
-------------------------------------------------------------------------------- /image/vision_transformer/colo_vit/README.md: --------------------------------------------------------------------------------
1 | # Vision Transformer with ColoTensor
2 | 
3 | # Overview
4 | 
5 | In this example, we will run Vision Transformer with ColoTensor.
6 | 
7 | We use the **ViTForImageClassification** model from Hugging Face [Link](https://huggingface.co/docs/transformers/model_doc/vit) for the unit test.
8 | You can change the world size or decide whether to use DDP in our code.
9 | 
10 | We use the **vision_transformer** model from timm [Link](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) for the training example.
11 | 
12 | (2022/6/28) The default configuration now supports 2DP+2TP with gradient accumulation and checkpoint support. Zero is not supported at present.
13 | 
14 | # Requirement
15 | 
16 | You should install colossalai from the main branch at commit 561e904.
17 | 
18 | ## Unit test
19 | To run the unit test, you should install pytest and transformers with:
20 | ```shell
21 | pip install pytest transformers
22 | ```
23 | 
24 | ## Training example
25 | To run the training example with ViT-S, you should install **NVIDIA DALI** from [Link](https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html) for dataloader support.
26 | You also need to install timm and titans for model/dataloader support with:
27 | ```shell
28 | pip install timm titans
29 | ```
30 | 
31 | ### Data preparation
32 | You can download the ImageNet dataset from the [ImageNet official website](https://www.image-net.org/download.php). You should get the raw images after downloading the dataset. As we use **NVIDIA DALI** to read data, we use the TFRecords dataset instead of the raw ImageNet dataset, which offers better IO speed. If you don't have a TFRecords dataset, follow [imagenet-tools](https://github.com/ver217/imagenet-tools) to build one.
33 | 
34 | Before you start training, you need to set the environment variable `DATA` so that the script knows where to fetch the data for the DALI dataloader.
35 | ```shell
36 | export DATA=/path/to/ILSVRC2012
37 | ```
38 | 
39 | 
40 | 
41 | # How to run
42 | 
43 | ## Unit test
44 | In your terminal:
45 | ```shell
46 | pytest test_vit.py
47 | ```
48 | 
49 | This will evaluate models with different **world_size** and **use_ddp** settings.
50 | 
51 | ## Training example
52 | Modify the settings in run.sh according to your environment.
53 | For example, if you set `--nproc_per_node=8` in `run.sh` and `TP_WORLD_SIZE=2` in your config file,
54 | the data parallel size will be automatically calculated as 4.
55 | Thus, the parallel strategy is set to 4DP+2TP.
56 | 
57 | Then, in your terminal:
58 | ```shell
59 | sh run.sh
60 | ```
61 | 
62 | This will start ViT-S training with ImageNet.
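
As a quick sanity check of that arithmetic, the data-parallel size is simply the total process count divided by the tensor-parallel size. A toy calculation (the variable names below are ours, not from the repo):

```python
# toy arithmetic, not repo code: how the parallel strategy decomposes
nproc_per_node = 8    # set in run.sh via --nproc_per_node
tp_world_size = 2     # TP_WORLD_SIZE in the config file
dp_size = nproc_per_node // tp_world_size
print(f"{dp_size}DP + {tp_world_size}TP")  # -> 4DP + 2TP
```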
-------------------------------------------------------------------------------- /image/vision_transformer/colo_vit/configs/vit_1d_tp2.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | # hyperparameters 4 | # BATCH_SIZE is as per GPU 5 | # global batch size = BATCH_SIZE x data parallel size 6 | BATCH_SIZE = 256 7 | LEARNING_RATE = 3e-3 8 | WEIGHT_DECAY = 0.3 9 | NUM_EPOCHS = 300 10 | WARMUP_EPOCHS = 32 11 | 12 | # model config 13 | IMG_SIZE = 224 14 | PATCH_SIZE = 16 15 | HIDDEN_SIZE = 384 16 | DEPTH = 12 17 | NUM_HEADS = 6 18 | MLP_RATIO = 4 19 | NUM_CLASSES = 1000 20 | CHECKPOINT = False 21 | SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1 # add 1 for cls token 22 | 23 | USE_DDP = True 24 | TP_WORLD_SIZE = 2 25 | TP_TYPE = 'row' 26 | parallel = dict(tensor=dict(mode="1d", size=TP_WORLD_SIZE),) 27 | 28 | fp16 = dict(mode=AMP_TYPE.NAIVE) 29 | clip_grad_norm = 1.0 30 | gradient_accumulation = 8 31 | 32 | LOG_PATH = "./log" 33 | -------------------------------------------------------------------------------- /image/vision_transformer/colo_vit/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /image/vision_transformer/colo_vit/run.sh: -------------------------------------------------------------------------------- 1 | export DATA=/data/scratch/imagenet/tf_records 2 | export OMP_NUM_THREADS=4 3 | 4 | # resume 5 | # CUDA_VISIBLE_DEVICES=4,5,6,7 colossalai run \ 6 | # --nproc_per_node 4 train.py \ 7 | # --config configs/vit_1d_tp2.py \ 8 | # --resume_from checkpoint/epoch_10 \ 9 | # --master_port 29598 | tee ./out 2>&1 10 | 11 | # train 12 | CUDA_VISIBLE_DEVICES=4,5,6,7 colossalai run \ 13 | --nproc_per_node 4 train.py \ 14 | --config configs/vit_1d_tp2.py \ 15 | --master_port 29598 | tee ./out 2>&1 -------------------------------------------------------------------------------- /image/vision_transformer/colo_vit/utils/dummy_data_generator.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class DummyDataGenerator(ABC): 5 | 6 | def __init__(self, length=10): 7 | self.length = length 8 | 9 | @abstractmethod 10 | def generate(self): 11 | pass 12 | 13 | def __iter__(self): 14 | self.step = 0 15 | return self 16 | 17 | def __next__(self): 18 | if self.step < self.length: 19 | self.step += 1 20 | return self.generate() 21 | else: 22 | raise StopIteration 23 | 24 | def __len__(self): 25 | return self.length 26 | -------------------------------------------------------------------------------- /image/vision_transformer/colo_vit/utils/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import torch 5 | import torch.distributed as dist 6 | from colossalai.core import global_context as gpc 7 | from colossalai.context import ParallelMode 8 | 9 | 10 | def set_seed(seed): 11 | random.seed(seed) 12 | os.environ['PYTHONHASHSEED'] = str(seed) 13 | np.random.seed(seed) 14 | torch.manual_seed(seed) 15 | torch.cuda.manual_seed(seed) 16 | torch.backends.cudnn.deterministic = True 17 | 18 | 19 | def check_equal(A, B): 20 | assert torch.allclose(A, B, rtol=1e-3, atol=1e-1) == True 21 | 22 | 23 | def replace_parameter_add_grad(layer, weight=None, bias=None): 24 | if weight is not None: 25 | 
delattr(layer, 'weight') 26 | setattr(layer, 'weight', weight) 27 | layer.weight.requires_grad = True 28 | if bias is not None: 29 | delattr(layer, 'bias') 30 | setattr(layer, 'bias', bias) 31 | layer.bias.requires_grad = True 32 | 33 | 34 | def broadcast_tensor_chunk(tensor, chunk_size=1, local_rank=0): 35 | dist.broadcast(tensor, src=0) 36 | tensor_chunk = torch.chunk(tensor, chunk_size, dim=-1)[local_rank] 37 | return tensor_chunk.clone() 38 | 39 | 40 | def tensor_equal(A, B): 41 | return torch.allclose(A, B, rtol=1e-3, atol=1e-1) 42 | 43 | 44 | def tensor_shard_equal(tensor: torch.Tensor, shard: torch.Tensor): 45 | assert tensor.ndim == shard.ndim 46 | if tensor.shape == shard.shape: 47 | return tensor_equal(tensor, shard) 48 | else: 49 | dims_not_eq = torch.nonzero(torch.tensor(tensor.shape) != torch.tensor(shard.shape)) 50 | if dims_not_eq.numel() == 1: 51 | # 1D shard 52 | dim = dims_not_eq.item() 53 | world_size = gpc.get_world_size(ParallelMode.PARALLEL_1D) 54 | rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D) 55 | return tensor_equal(tensor.chunk(world_size, dim)[rank], shard) 56 | else: 57 | raise NotImplementedError 58 | -------------------------------------------------------------------------------- /image/vision_transformer/colo_vit/vit.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import ViTForImageClassification, ViTConfig 4 | from utils.dummy_data_generator import DummyDataGenerator 5 | from colossalai.utils.cuda import get_current_device 6 | 7 | 8 | class DummyDataLoader(DummyDataGenerator): 9 | batch_size = 4 10 | channel = 3 11 | category = 8 12 | image_size = 224 13 | 14 | def generate(self): 15 | image_dict = {} 16 | image_dict['pixel_values'] = torch.rand(DummyDataLoader.batch_size, 17 | DummyDataLoader.channel, 18 | DummyDataLoader.image_size, 19 | DummyDataLoader.image_size, 20 | device=get_current_device()) * 2 - 1 21 | image_dict['label'] = torch.randint(DummyDataLoader.category, (DummyDataLoader.batch_size,), 22 | dtype=torch.int64, 23 | device=get_current_device()) 24 | return image_dict 25 | 26 | 27 | class ViTCVModel(nn.Module): 28 | 29 | def __init__(self, 30 | hidden_size=768, 31 | num_hidden_layers=12, 32 | num_attention_heads=12, 33 | image_size=224, 34 | patch_size=16, 35 | num_channels=3, 36 | num_labels=8, 37 | checkpoint=False): 38 | super().__init__() 39 | self.checkpoint = checkpoint 40 | self.model = ViTForImageClassification( 41 | ViTConfig(hidden_size=hidden_size, 42 | num_hidden_layers=num_hidden_layers, 43 | num_attention_heads=num_attention_heads, 44 | image_size=image_size, 45 | patch_size=patch_size, 46 | num_channels=num_channels, 47 | num_labels=num_labels)) 48 | if checkpoint: 49 | self.model.gradient_checkpointing_enable() 50 | 51 | def forward(self, pixel_values): 52 | return self.model(pixel_values=pixel_values) 53 | 54 | 55 | def vit_base_s(checkpoint=True): 56 | return ViTCVModel(checkpoint=checkpoint) 57 | 58 | 59 | def vit_base_micro(checkpoint=True): 60 | return ViTCVModel(hidden_size=32, num_hidden_layers=2, num_attention_heads=4, checkpoint=checkpoint) 61 | 62 | 63 | def get_training_components(): 64 | trainloader = DummyDataLoader() 65 | testloader = DummyDataLoader() 66 | return vit_base_micro, trainloader, testloader, torch.optim.Adam, torch.nn.functional.cross_entropy 67 | -------------------------------------------------------------------------------- /image/vision_transformer/data_parallel/config.py: 
-------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | # ViT Base 4 | BATCH_SIZE = 256 5 | DROP_RATE = 0.1 6 | NUM_EPOCHS = 2 7 | 8 | fp16 = dict( 9 | mode=AMP_TYPE.TORCH, 10 | ) 11 | 12 | gradient_accumulation = 16 13 | clip_grad_norm = 1.0 14 | 15 | dali = dict( 16 | gpu_aug=True, 17 | mixup_alpha=0.2 18 | ) 19 | -------------------------------------------------------------------------------- /image/vision_transformer/data_parallel/mixup.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from colossalai.registry import LOSSES 3 | import torch 4 | 5 | 6 | @LOSSES.register_module 7 | class MixupLoss(nn.Module): 8 | def __init__(self, loss_fn_cls): 9 | super().__init__() 10 | self.loss_fn = loss_fn_cls() 11 | 12 | def forward(self, inputs, targets_a, targets_b, lam): 13 | return lam * self.loss_fn(inputs, targets_a) + (1 - lam) * self.loss_fn(inputs, targets_b) 14 | 15 | 16 | class MixupAccuracy(nn.Module): 17 | def forward(self, logits, targets): 18 | targets = targets['targets_a'] 19 | preds = torch.argmax(logits, dim=-1) 20 | correct = torch.sum(targets == preds) 21 | return correct 22 | -------------------------------------------------------------------------------- /image/vision_transformer/data_parallel/myhooks.py: -------------------------------------------------------------------------------- 1 | from colossalai.trainer.hooks import BaseHook 2 | from colossalai.core import global_context as gpc 3 | from colossalai.context import ParallelMode 4 | from colossalai.logging import get_dist_logger 5 | 6 | 7 | class TotalBatchsizeHook(BaseHook): 8 | def __init__(self, priority: int = 2) -> None: 9 | super().__init__(priority) 10 | self.logger = get_dist_logger() 11 | 12 | def before_train(self, trainer): 13 | total_batch_size = gpc.config.BATCH_SIZE * \ 14 | gpc.config.gradient_accumulation * gpc.get_world_size(ParallelMode.DATA) 15 | self.logger.info(f'Total batch size = {total_batch_size}', ranks=[0]) 16 | -------------------------------------------------------------------------------- /image/vision_transformer/data_parallel/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /image/vision_transformer/data_parallel/results/acc.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/vision_transformer/data_parallel/results/acc.jpeg -------------------------------------------------------------------------------- /image/vision_transformer/data_parallel/results/loss.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/image/vision_transformer/data_parallel/results/loss.jpeg -------------------------------------------------------------------------------- /image/vision_transformer/data_parallel/scripts/train_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python train.py --host $HOST --config ./config.py --port 29500 -------------------------------------------------------------------------------- 
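For clarity, here is a self-contained usage sketch for the `MixupLoss` wrapper defined in `mixup.py` above. The tensors are random stand-ins; in a real run, `targets_a`, `targets_b` and `lam` come from a mixup dataloader:

```python
# Usage sketch for MixupLoss (see mixup.py above); inputs are stand-ins.
import torch
import torch.nn as nn
from mixup import MixupLoss  # assumes this script sits next to mixup.py

criterion = MixupLoss(nn.CrossEntropyLoss)
logits = torch.randn(8, 10, requires_grad=True)  # model outputs: 8 samples, 10 classes
targets_a = torch.randint(0, 10, (8,))           # labels of the original images
targets_b = torch.randint(0, 10, (8,))           # labels of the mixed-in images
lam = 0.7                                        # mixing coefficient, typically ~ Beta(alpha, alpha)

loss = criterion(logits, targets_a, targets_b, lam)
loss.backward()
```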
/image/vision_transformer/data_parallel/train_with_cifar10.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import colossalai 4 | import torch 5 | from colossalai.context import ParallelMode 6 | from colossalai.core import global_context as gpc 7 | from colossalai.logging import disable_existing_loggers, get_dist_logger 8 | from colossalai.utils import get_dataloader 9 | from colossalai.nn.lr_scheduler import LinearWarmupLR 10 | from colossalai.nn.metric import Accuracy 11 | from colossalai.trainer import Trainer, hooks 12 | from timm.models import vit_base_patch16_224 13 | 14 | from titans.dataloader.cifar10 import build_cifar 15 | 16 | 17 | def main(): 18 | # initialize distributed setting 19 | parser = colossalai.get_default_parser() 20 | args = parser.parse_args() 21 | disable_existing_loggers() 22 | 23 | # launch from torch 24 | colossalai.launch_from_torch(config=args.config) 25 | 26 | # get logger 27 | logger = get_dist_logger() 28 | logger.info("initialized distributed environment", ranks=[0]) 29 | 30 | # build model 31 | model = vit_base_patch16_224(drop_rate=0.1, num_classes=10) 32 | 33 | # build dataloader 34 | root = os.environ.get('DATA', './data') 35 | train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE, root, pad_if_needed=True) 36 | 37 | # build optimizer 38 | optimizer = colossalai.nn.Lamb(model.parameters(), lr=1.8e-2, weight_decay=0.1) 39 | 40 | # build loss 41 | criterion = torch.nn.CrossEntropyLoss() 42 | 43 | # lr_scheduler 44 | lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=gpc.config.NUM_EPOCHS) 45 | 46 | engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model, optimizer, criterion, train_dataloader, 47 | test_dataloader) 48 | logger.info("initialized colossalai components", ranks=[0]) 49 | 50 | # build trainer 51 | trainer = Trainer(engine=engine, logger=logger) 52 | 53 | # build hooks 54 | hook_list = [ 55 | hooks.LossHook(), 56 | hooks.AccuracyHook(accuracy_func=Accuracy()), 57 | hooks.LogMetricByEpochHook(logger), 58 | hooks.LRSchedulerHook(lr_scheduler, by_epoch=True), 59 | ] 60 | 61 | # start training 62 | trainer.fit(train_dataloader=train_dataloader, 63 | test_dataloader=test_dataloader, 64 | epochs=gpc.config.NUM_EPOCHS, 65 | hooks=hook_list, 66 | display_progress=True, 67 | test_interval=1) 68 | 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /image/vision_transformer/hybrid_parallel/configs/vit_1d_tp2_pp2.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | # hyperparameters 4 | # BATCH_SIZE is as per GPU 5 | # global batch size = BATCH_SIZE x data parallel size 6 | BATCH_SIZE = 256 7 | LEARNING_RATE = 3e-3 8 | WEIGHT_DECAY = 0.3 9 | NUM_EPOCHS = 10 10 | WARMUP_EPOCHS = 3 11 | 12 | # model config 13 | IMG_SIZE = 224 14 | PATCH_SIZE = 16 15 | HIDDEN_SIZE = 512 16 | DEPTH = 4 17 | NUM_HEADS = 4 18 | MLP_RATIO = 2 19 | NUM_CLASSES = 1000 20 | CHECKPOINT = False 21 | SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1 # add 1 for cls token 22 | 23 | # parallel setting 24 | TENSOR_PARALLEL_SIZE = 2 25 | TENSOR_PARALLEL_MODE = '1d' 26 | 27 | parallel = dict( 28 | pipeline=2, 29 | tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), 30 | ) 31 | 32 | fp16 = dict(mode=AMP_TYPE.NAIVE) 33 | clip_grad_norm = 1.0 34 | 35 | # pipeline config 36 | NUM_MICRO_BATCHES = 
parallel['pipeline'] 37 | -------------------------------------------------------------------------------- /image/vision_transformer/hybrid_parallel/configs/vit_1d_tp4_pp16.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | 4 | # hyperparameters 5 | # BATCH_SIZE is as per GPU 6 | # global batch size = BATCH_SIZE x data parallel size 7 | BATCH_SIZE = 4096 8 | LEARNING_RATE = 3e-3 9 | WEIGHT_DECAY = 0.3 10 | NUM_EPOCHS = 300 11 | WARMUP_EPOCHS = 32 12 | 13 | # model config 14 | IMG_SIZE = 224 15 | PATCH_SIZE = 16 16 | HIDDEN_SIZE = 4096 17 | DEPTH = 32 18 | NUM_HEADS = 64 19 | MLP_RATIO = 4 20 | NUM_CLASSES = 1000 21 | CHECKPOINT = True 22 | SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token 23 | 24 | # parallel setting 25 | TENSOR_PARALLEL_SIZE = 4 26 | TENSOR_PARALLEL_MODE = '1d' 27 | 28 | parallel = dict( 29 | pipeline=16, 30 | tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), 31 | ) 32 | 33 | fp16 = dict(mode=AMP_TYPE.NAIVE) 34 | clip_grad_norm = 1.0 35 | 36 | 37 | # pipeline config 38 | NUM_MICRO_BATCHES = parallel['pipeline'] 39 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LENGTH, HIDDEN_SIZE) 40 | -------------------------------------------------------------------------------- /image/vision_transformer/hybrid_parallel/configs/vit_2d_tp4_pp16.py: -------------------------------------------------------------------------------- 1 | import math 2 | from colossalai.amp import AMP_TYPE 3 | 4 | 5 | 6 | # hyperparameters 7 | # BATCH_SIZE is as per GPU 8 | # global batch size = BATCH_SIZE x data parallel size 9 | BATCH_SIZE = 8192 10 | LEARNING_RATE = 3e-3 11 | WEIGHT_DECAY = 0.3 12 | NUM_EPOCHS = 300 13 | WARMUP_EPOCHS = 32 14 | 15 | # model config 16 | IMG_SIZE = 224 17 | PATCH_SIZE = 16 18 | HIDDEN_SIZE = 4096 19 | DEPTH = 32 20 | NUM_HEADS = 64 21 | MLP_RATIO = 4 22 | NUM_CLASSES = 1000 23 | CHECKPOINT = True 24 | SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token 25 | 26 | # parallel setting 27 | TENSOR_PARALLEL_SIZE = 4 28 | TENSOR_PARALLEL_MODE = '2d' 29 | 30 | parallel = dict( 31 | pipeline=16, 32 | tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), 33 | ) 34 | 35 | fp16 = dict(mode=AMP_TYPE.NAIVE) 36 | clip_grad_norm = 1.0 37 | 38 | # pipeline config 39 | NUM_MICRO_BATCHES = parallel['pipeline'] 40 | SUMMA_DIM = int(math.sqrt(TENSOR_PARALLEL_SIZE)) 41 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES // SUMMA_DIM, 42 | SEQ_LENGTH, 43 | HIDDEN_SIZE // SUMMA_DIM) 44 | -------------------------------------------------------------------------------- /image/vision_transformer/hybrid_parallel/configs/vit_2p5d_tp4_pp16.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | from colossalai.amp import AMP_TYPE 3 | import math 4 | 5 | # hyperparameters 6 | # BATCH_SIZE is as per GPU 7 | # global batch size = BATCH_SIZE x data parallel size 8 | BATCH_SIZE = 7168 9 | LEARNING_RATE = 3e-3 10 | WEIGHT_DECAY = 0.3 11 | NUM_EPOCHS = 300 12 | WARMUP_EPOCHS = 32 13 | 14 | # model config 15 | IMG_SIZE = 224 16 | PATCH_SIZE = 16 17 | HIDDEN_SIZE = 4096 18 | DEPTH = 32 19 | NUM_HEADS = 64 20 | MLP_RATIO = 4 21 | NUM_CLASSES = 1000 22 | CHECKPOINT = True 23 | SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token 24 | 25 | # parallel setting 26 | TENSOR_PARALLEL_SIZE = 4 27 | TESSERACT_DEPTH = 1 28 | TENSOR_PARALLEL_MODE = '2.5d' 29 | 30 | parallel = 
dict( 31 | pipeline=16, 32 | tensor=dict( 33 | mode=TENSOR_PARALLEL_MODE, 34 | size=TENSOR_PARALLEL_SIZE, 35 | depth=TESSERACT_DEPTH 36 | ), 37 | ) 38 | 39 | fp16 = dict(mode=AMP_TYPE.NAIVE) 40 | clip_grad_norm = 1.0 41 | 42 | # pipeline config 43 | NUM_MICRO_BATCHES = parallel['pipeline'] 44 | SUMMA_DIM = int(math.sqrt(TENSOR_PARALLEL_SIZE // TESSERACT_DEPTH)) 45 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES // SUMMA_DIM, 46 | SEQ_LENGTH, 47 | HIDDEN_SIZE // SUMMA_DIM) 48 | -------------------------------------------------------------------------------- /image/vision_transformer/hybrid_parallel/configs/vit_3d_tp8_pp8.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | # hyperparameters 4 | # BATCH_SIZE is as per GPU 5 | # global batch size = BATCH_SIZE x data parallel size 6 | BATCH_SIZE = 1536 7 | LEARNING_RATE = 3e-3 8 | WEIGHT_DECAY = 0.3 9 | NUM_EPOCHS = 300 10 | WARMUP_EPOCHS = 32 11 | 12 | # model config 13 | IMG_SIZE = 224 14 | PATCH_SIZE = 16 15 | HIDDEN_SIZE = 4096 16 | DEPTH = 32 17 | NUM_HEADS = 64 18 | MLP_RATIO = 4 19 | NUM_CLASSES = 1000 20 | CHECKPOINT = True 21 | SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token 22 | 23 | # parallel setting 24 | TENSOR_PARALLEL_SIZE = 8 25 | TENSOR_PARALLEL_MODE = '3d' 26 | 27 | 28 | 29 | 30 | parallel = dict( 31 | pipeline=8, 32 | tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), 33 | ) 34 | 35 | fp16 = dict(mode=AMP_TYPE.NAIVE) 36 | clip_grad_norm = 1.0 37 | 38 | # pipeline config 39 | NUM_MICRO_BATCHES = parallel['pipeline'] 40 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES // 4, 41 | SEQ_LENGTH, 42 | HIDDEN_SIZE // 2) 43 | -------------------------------------------------------------------------------- /image/vision_transformer/hybrid_parallel/configs/vit_pipeline.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | # hyperparameters 4 | # BATCH_SIZE is as per GPU 5 | # global batch size = BATCH_SIZE x data parallel size 6 | BATCH_SIZE = 2048 7 | LEARNING_RATE = 3e-3 8 | WEIGHT_DECAY = 0.3 9 | NUM_EPOCHS = 10 10 | WARMUP_EPOCHS = 3 11 | 12 | # model config 13 | IMG_SIZE = 224 14 | PATCH_SIZE = 16 15 | HIDDEN_SIZE = 512 16 | DEPTH = 4 17 | NUM_HEADS = 4 18 | MLP_RATIO = 2 19 | NUM_CLASSES = 1000 20 | CHECKPOINT = False 21 | SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1 # add 1 for cls token 22 | 23 | # parallel setting 24 | parallel = dict(pipeline=2,) 25 | 26 | fp16 = dict(mode=AMP_TYPE.NAIVE) 27 | clip_grad_norm = 1.0 28 | 29 | # pipeline config 30 | NUM_MICRO_BATCHES = parallel['pipeline'] 31 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LENGTH, HIDDEN_SIZE) 32 | -------------------------------------------------------------------------------- /image/vision_transformer/hybrid_parallel/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .vit import * -------------------------------------------------------------------------------- /image/vision_transformer/hybrid_parallel/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/DeepNet/README.md: -------------------------------------------------------------------------------- 1 | # 
[DeepNet](https://arxiv.org/pdf/2203.00555.pdf): An Implementation based on [Colossal-AI](https://www.colossalai.org/) 2 | 3 | ## Overview 4 | 5 |

6 | 7 |

8 | 9 | This is the re-implementation of the DeepNet model from the paper [DeepNet: Scaling Transformers to 1,000 Layers](https://arxiv.org/pdf/2203.00555.pdf). 10 | 11 | DeepNet can scale transformer models to 1,000 layers by applying DeepNorm. This Colossal-AI based implementation supports data parallelism, pipeline parallelism and 1D tensor parallelism for training. 12 | 13 | ## How to prepare datasets 14 | 15 | ### Decoder-only DeepNet 16 | The decoder-only DeepNet model is modified from the GPT model. In this example, we use the WebText dataset for training. The dataset is prepared in the same way as in the [Colossal-AI based GPT example](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/language/gpt). 17 | 18 | ## Requirements 19 | 20 | To use pipeline parallel training, you should install colossalai from the **latest** main branch. 21 | 22 | ## How to run 23 | 24 | ### Decoder-only DeepNet 25 | 26 | ```Bash 27 | #!/usr/bin/env sh 28 | export DATA=/path/to/train_data.json 29 | 30 | colossalai run --nproc_per_node=<num_gpus> train_deepnet_decoder.py --config=decoder_configs/deepnet_pp1d.py 31 | ``` 32 | 33 | 34 | Please replace `DATA` and `<num_gpus>` with the path to your dataset and the number of GPUs respectively. 35 | You can also modify the config file `decoder_configs/deepnet_pp1d.py` to further change parallel settings, training hyperparameters and model details. 36 | 37 | ## Features 38 | 39 | - [x] Decoder-only DeepNet 40 | - [ ] Encoder-Decoder DeepNet 41 | -------------------------------------------------------------------------------- /language/DeepNet/dataset/webtext.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import torch 5 | from colossalai.registry import DATASETS 6 | from torch.utils.data import Dataset 7 | from transformers import GPT2Tokenizer 8 | 9 | 10 | @DATASETS.register_module 11 | class WebtextDataset(Dataset): 12 | def __init__(self, path, seq_len=1024) -> None: 13 | super().__init__() 14 | root = os.path.dirname(path) 15 | encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt') 16 | if os.path.isfile(encoded_data_cache_path): 17 | seq_len_, data, attention_mask = torch.load(encoded_data_cache_path) 18 | if seq_len_ == seq_len: 19 | self.data = data 20 | self.attention_mask = attention_mask 21 | return 22 | raw_data = [] 23 | with open(path) as f: 24 | for line in f.readlines(): 25 | raw_data.append(json.loads(line)['text']) 26 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 27 | tokenizer.pad_token = tokenizer.unk_token 28 | encoded_data = tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt') 29 | self.data = encoded_data['input_ids'] 30 | self.attention_mask = encoded_data['attention_mask'] 31 | torch.save((seq_len, self.data, self.attention_mask), encoded_data_cache_path) 32 | 33 | def __len__(self): 34 | return len(self.data) 35 | 36 | def __getitem__(self, index): 37 | return {'input_ids': self.data[index], 38 | 'attention_mask': self.attention_mask[index]}, self.data[index] -------------------------------------------------------------------------------- /language/DeepNet/decoder_configs/deepnet_pp1d.py: -------------------------------------------------------------------------------- 1 | from torch.optim import Adam 2 | from colossalai.amp import AMP_TYPE 3 | import torch 4 | from titans.model.deepnet import deepnet_small 5 | from titans.loss.lm_loss import GPTLMLoss 6 | 7 | BATCH_SIZE = 8 8 | NUM_EPOCHS = 2 9 | SEQ_LEN = 1024 10 | 11 | 
NUM_MICRO_BATCHES = 1 12 | HIDDEN_SIZE = 768 13 | PIPELINE = 2 14 | TENSOR_PARALLEL = 2 15 | MODE = '1d' 16 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LEN, HIDDEN_SIZE) 17 | 18 | fp16 = dict(mode=AMP_TYPE.NAIVE) 19 | 20 | parallel = dict(pipeline=PIPELINE, tensor=dict(mode=MODE, size=TENSOR_PARALLEL)) 21 | 22 | optimizer = dict( 23 | type=Adam, 24 | lr=0.00015, 25 | weight_decay=1e-2, 26 | ) 27 | 28 | model = dict( 29 | type=deepnet_small, 30 | checkpoint=True, 31 | dtype=torch.half, 32 | ) 33 | 34 | loss = dict(type=GPTLMLoss,) 35 | -------------------------------------------------------------------------------- /language/DeepNet/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch 3 | transformers -------------------------------------------------------------------------------- /language/bert/colotensor/README.md: -------------------------------------------------------------------------------- 1 | [WIP] 2 | 3 | -------------------------------------------------------------------------------- /language/bert/colotensor/configs/bert_base_tp1d.py: -------------------------------------------------------------------------------- 1 | SEQ_LENGTH = 512 2 | BATCH_SIZE = 8 3 | NUM_EPOCHS = 10 4 | WARMUP_EPOCHS = 1 5 | 6 | parallel = dict( 7 | tensor=dict(mode="1d", size=4), 8 | ) 9 | 10 | model = dict( 11 | type="bert_base", 12 | ) -------------------------------------------------------------------------------- /language/bert/colotensor/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from language.bert.colotensor.dataset.wikitext import build_data_from_wikitext 3 | from colossalai.core import global_context as gpc 4 | 5 | _datasets = { 6 | "wikitext": build_data_from_wikitext, 7 | } 8 | 9 | def build_data(**args): 10 | if hasattr(gpc.config, "dataset"): 11 | assert ( 12 | gpc.config.dataset in _datasets.keys() 13 | ), f"Invalid dataset name. 
dataset should be in {_datasets.keys()} or use default wikitext" 14 | builder = _datasets[gpc.config.dataset] 15 | else: 16 | builder = _datasets["wikitext"] 17 | return builder(**args) 18 | 19 | 20 | __all__ = ["build_data"] -------------------------------------------------------------------------------- /language/bert/colotensor/dataset/wikitext.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import numpy as np 4 | import copy 5 | 6 | from itertools import chain 7 | from datasets import load_from_disk, set_progress_bar_enabled 8 | 9 | from torch.utils.data import DataLoader, DistributedSampler 10 | from torch.distributed import get_world_size 11 | 12 | from transformers import BertTokenizer, default_data_collator 13 | from colossalai.logging import get_dist_logger 14 | 15 | 16 | def build_data_from_wikitext(dataset_path: str, tokenizer_path: str, seq_len: int = 512, batch_size: int = 8): 17 | logger = get_dist_logger("build_data_from_wikitext") 18 | logger.info("Building Wikitext-2 ...", ranks=[0]) 19 | world_size = get_world_size() 20 | 21 | set_progress_bar_enabled(False) 22 | dataset = load_from_disk(dataset_path) 23 | 24 | tokenizer = BertTokenizer(vocab_file=tokenizer_path + "/vocab.txt") 25 | 26 | def tokenize(examples): 27 | seq_length = seq_len 28 | examples = tokenizer(examples["text"]) 29 | concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} 30 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 31 | if total_length >= seq_length: 32 | total_length = (total_length // seq_length) * seq_length 33 | 34 | result = { 35 | k: [t[i : i + seq_len] for i in range(0, total_length, seq_length)] 36 | for k, t in concatenated_examples.items() 37 | } 38 | 39 | return result 40 | 41 | tokenized_dataset = dataset.map( 42 | tokenize, batched=True, num_proc=16, load_from_cache_file=False, keep_in_memory=True, remove_columns="text" 43 | ) 44 | 45 | def seed_worker(worker_id): # worker_init_fn is called with the worker id 46 | worker_seed = 1024 47 | np.random.seed(worker_seed) 48 | torch.manual_seed(worker_seed) 49 | random.seed(worker_seed) 50 | 51 | train_sampler = DistributedSampler(tokenized_dataset["train"], shuffle=True) if world_size > 1 else None 52 | train_data = DataLoader( 53 | tokenized_dataset["train"], 54 | shuffle=(train_sampler is None), 55 | sampler=train_sampler, 56 | drop_last=True, 57 | collate_fn=default_data_collator, 58 | worker_init_fn=seed_worker, 59 | batch_size=batch_size, 60 | pin_memory=True, 61 | ) 62 | test_sampler = DistributedSampler(tokenized_dataset["validation"], shuffle=False) if world_size > 1 else None 63 | test_data = DataLoader( 64 | tokenized_dataset["validation"], 65 | sampler=test_sampler, 66 | drop_last=True, 67 | collate_fn=default_data_collator, 68 | worker_init_fn=seed_worker, 69 | batch_size=batch_size, 70 | pin_memory=True, 71 | ) 72 | 73 | return train_data, test_data -------------------------------------------------------------------------------- /language/bert/colotensor/model/__init__.py: -------------------------------------------------------------------------------- 1 | from language.bert.colotensor.model.hfmodel import ModelFromHF 2 | from colossalai.core import global_context as gpc 3 | from transformers import BertConfig, BertForMaskedLM 4 | 5 | _bert_base = dict( 6 | seq_length=512, 7 | vocab_size=50304, 8 | hidden_size=768, 9 | num_heads=12, 10 | depth=12, 11 | ff_size=3072, 12 | checkpoint=False, 13 | evaluation='ppl', 14 | ) 15 | 16 | _bert_large = dict( 17 | 
seq_length=512, 18 | vocab_size=50304, 19 | hidden_size=1024, 20 | num_heads=16, 21 | depth=24, 22 | ff_size=3072, 23 | checkpoint=False, 24 | evaluation='ppl', 25 | ) 26 | 27 | _bert_configurations = dict( 28 | bert=_bert_base, 29 | bert_base=_bert_base, 30 | bert_large=_bert_large 31 | ) 32 | 33 | def build_model(): 34 | model_cfg = _bert_configurations[gpc.config.model.type] 35 | bert_cfg = BertConfig(vocab_size=model_cfg['vocab_size'], 36 | hidden_size=model_cfg['hidden_size'], 37 | num_hidden_layers=model_cfg['depth'], 38 | num_attention_heads=model_cfg['num_heads'], 39 | intermediate_size=model_cfg['ff_size'], 40 | max_position_embeddings=model_cfg['seq_length'], 41 | use_cache=not gpc.config.model.get('checkpoint', False)) 42 | 43 | model = ModelFromHF(bert_cfg, BertForMaskedLM) 44 | 45 | return model 46 | 47 | __all__ = ["build_model"] -------------------------------------------------------------------------------- /language/bert/colotensor/model/hfmodel.py: -------------------------------------------------------------------------------- 1 | from colossalai.core import global_context as gpc 2 | import torch 3 | 4 | class ModelFromHF(torch.nn.Module): 5 | def __init__(self, config, model_cls): 6 | super().__init__() 7 | self.module = model_cls(config) 8 | if gpc.config.model.get('checkpoint'): 9 | self.module.apply(self.set_checkpointing) 10 | 11 | def set_checkpointing(self, module): 12 | if hasattr(module, 'gradient_checkpointing'): 13 | module.gradient_checkpointing = True 14 | 15 | def forward(self, *args, **kwargs): 16 | output = self.module(*args, **kwargs) 17 | return output.logits -------------------------------------------------------------------------------- /language/bert/colotensor/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/README.md: -------------------------------------------------------------------------------- 1 | # Bert 2 | 3 | ![Still In Progress](https://img.shields.io/badge/-Still%20In%20Progress-orange) 4 | 5 | Bert Benchmark with data parallel, tensor parallel(tp), pipeline parallel(pp) and ZeRO. 6 | 7 | ## Setup 8 | 1. Install dependencies if you do not have them 9 | ``` 10 | pip install -r requirement.txt 11 | ``` 12 | 13 | 2. Add root dir into PYTHONPATH 14 | ``` 15 | export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH 16 | ``` 17 | 18 | ## Bert Usage 19 | 20 | 1. Prepare datasets and tokenizers from HuggingFace Hub if necessary (e.g. we provide an example of training `wikitext-2`). 21 | 22 | 2. 
Run benchmark with one of the systems to evaluate 23 | ``` 24 | DATA=/PATH/TO/DATASET TOKENIZER=/PATH/TO/TOKENIZER LOG=/PATH/TO/LOG torchrun --nproc_per_node=NUM_GPUS run.py --config=CONFIG_FILE 25 | ``` -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/bert_config_pp.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "bert_base" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 8, 8 | "num_epochs": 20, 9 | "steps_per_epoch": 10 10 | }, 11 | "gradient_clipping": 1.0, 12 | "parallel": { 13 | "pipeline": 4, 14 | "tensor": { 15 | "mode": "1d", 16 | "size": 1 17 | } 18 | }, 19 | "use_mem_monitor": true 20 | } 21 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/bert_config_tp1d.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "bert_base" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 8, 8 | "num_epochs": 10, 9 | "steps_per_epoch": 10 10 | }, 11 | "gradient_clipping": 1.0, 12 | "parallel": { 13 | "pipeline": 1, 14 | "tensor": { 15 | "mode": "1d", 16 | "size": 2 17 | } 18 | }, 19 | "use_mem_monitor": true 20 | } 21 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/bert_config_tp1dpp.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "bert_base" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 8, 8 | "num_epochs": 20, 9 | "steps_per_epoch": 10 10 | }, 11 | "gradient_clipping": 1.0, 12 | "parallel": { 13 | "pipeline": 2, 14 | "tensor": { 15 | "mode": "1d", 16 | "size": 2 17 | } 18 | }, 19 | "use_mem_monitor": true 20 | } 21 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/bert_config_tp2d.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "bert_base" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 8, 8 | "num_epochs": 20, 9 | "steps_per_epoch": 10 10 | }, 11 | "gradient_clipping": 1.0, 12 | "parallel": { 13 | "pipeline": 1, 14 | "tensor": { 15 | "mode": "2d", 16 | "size": 4 17 | } 18 | }, 19 | "use_mem_monitor": true 20 | } 21 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/bert_config_tp2p5d.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "bert_base" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 8, 8 | "num_epochs": 20, 9 | "steps_per_epoch": 10 10 | }, 11 | "gradient_clipping": 1.0, 12 | "parallel": { 13 | "pipeline": 1, 14 | "tensor": { 15 | "mode": "2.5d", 16 | "size": 8, 17 | "depth": 2 18 | } 19 | }, 20 | "use_mem_monitor": true 21 | } 22 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/bert_config_tp3d.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "bert_base" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 8, 8 | 
"num_epochs": 20, 9 | "steps_per_epoch": 10 10 | }, 11 | "gradient_clipping": 1.0, 12 | "parallel": { 13 | "pipeline": 1, 14 | "tensor": { 15 | "mode": "3d", 16 | "size": 8 17 | } 18 | }, 19 | "use_mem_monitor": true 20 | } 21 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/bert_config_zero.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "bert_base" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 8, 8 | "num_epochs": 20, 9 | "steps_per_epoch": 10 10 | }, 11 | "gradient_clipping": 1.0, 12 | "zero": { 13 | "model_config": { 14 | "offload_config": { 15 | "device": "cpu" 16 | } 17 | }, 18 | "optimizer_config": { 19 | "cpu_offload": true, 20 | "initial_scale": 256, 21 | "min_scale": 1, 22 | "growth_factor": 2.0, 23 | "backoff_factor": 0.5, 24 | "growth_interval": 1000 25 | } 26 | }, 27 | "use_mem_monitor": true 28 | } 29 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/bert_config_zerotppp.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "colossalai", 3 | "model": { 4 | "type": "bert_base" 5 | }, 6 | "hyperparameter": { 7 | "batch_size": 8, 8 | "num_epochs": 100, 9 | "steps_per_epoch": 10 10 | }, 11 | "gradient_clipping": 1.0, 12 | "zero": { 13 | "model_config": { 14 | "offload_config": { 15 | "device": "cpu" 16 | } 17 | }, 18 | "optimizer_config": { 19 | "cpu_offload": true, 20 | "initial_scale": 256, 21 | "min_scale": 1, 22 | "growth_factor": 2.0, 23 | "backoff_factor": 0.5, 24 | "growth_interval": 1000 25 | } 26 | }, 27 | "parallel": { 28 | "pipeline": 1, 29 | "tensor": { 30 | "mode": "1d", 31 | "size": 2 32 | } 33 | }, 34 | "use_mem_monitor": true 35 | } 36 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/model_zoo/__init__.py: -------------------------------------------------------------------------------- 1 | from .colo_bert import create_colo_bert_pipeline_model, ColoBertForMaskedLM, ColoBertMaskedLMLoss 2 | 3 | __all__ = ['create_colo_bert_pipeline_model', 'ColoBertForMaskedLM', 'ColoBertMaskedLMLoss'] 4 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/requirement.txt: -------------------------------------------------------------------------------- 1 | 2 | torch>=1.10 -f https://download.pytorch.org/whl/cu113/torch_stable.html 3 | torchvision -f https://download.pytorch.org/whl/cu113/torch_stable.html 4 | transformers 5 | datasets 6 | colossalai 7 | rich -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/colossalai_utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from zero.common.utils import CONFIG, print_log 3 | from torch.cuda import max_memory_allocated, reset_peak_memory_stats 4 | from torch.distributed import get_rank 5 | 6 | 7 | def init_w_col(builder): 8 | import colossalai 9 | from colossalai.core import global_context as gpc 10 | from colossalai.nn.optimizer import CPUAdam 11 | from colossalai.zero.init_ctx import ZeroInitContext 12 | from colossalai.zero.shard_utils import (BucketTensorShardStrategy) 13 | 14 | from colossalai.utils.memory_utils.utils import 
colo_set_process_memory_fraction 15 | colo_set_process_memory_fraction(0.2) 16 | 17 | colossalai.launch_from_torch(config=CONFIG) 18 | 19 | build_data, build_model, build_loss, optimizer_class, build_scheduler = builder() 20 | 21 | print_log('Building data') 22 | train_data, test_data = build_data() 23 | 24 | use_zero = "zero" in gpc.config 25 | if use_zero: 26 | cpu_offload = gpc.config.zero.model_config.offload_config.device == 'cpu' 27 | else: 28 | cpu_offload = None 29 | 30 | rank = get_rank() 31 | reset_peak_memory_stats(rank) 32 | 33 | print_log('Building model') 34 | if use_zero: 35 | shard_strategy = BucketTensorShardStrategy() 36 | with ZeroInitContext(target_device=torch.cuda.current_device(), shard_strategy=shard_strategy, 37 | shard_param=True): 38 | model = build_model() 39 | gpc.config.zero.model_config['shard_strategy'] = shard_strategy 40 | 41 | else: 42 | model = build_model() 43 | 44 | criterion = build_loss() 45 | 46 | print_log(f'Peak Memory = {max_memory_allocated(rank) / (1024 * 1024)} M') 47 | reset_peak_memory_stats(rank) 48 | 49 | optimizer_kwargs = {} 50 | if use_zero and cpu_offload: 51 | optimizer_class = CPUAdam 52 | optimizer_kwargs = { 53 | 'lr': CONFIG['hyperparameter']['learning_rate'], 54 | 'weight_decay': CONFIG['hyperparameter']['weight_decay'] 55 | } 56 | 57 | optimizer = optimizer_class(model.parameters()) 58 | 59 | lr_scheduler = build_scheduler(len(train_data), optimizer) 60 | print_log(f'Peak Memory = {max_memory_allocated(rank) / (1024 * 1024)} M') 61 | 62 | engine, train_data, test_data, lr_scheduler = colossalai.initialize(model, optimizer, criterion, train_data, 63 | test_data, lr_scheduler) 64 | model = engine 65 | criterion = engine.criterion 66 | optimizer = engine 67 | 68 | return model, train_data, test_data, criterion, optimizer, None, lr_scheduler 69 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/bert/hybrid_parallel/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from bert.common.helper import bert_builder 4 | from bert.colossalai_utils.utils import init_w_col 5 | from bert.common.train import train 6 | from zero.common.utils import CONFIG, load_config, print_log 7 | from zero.torch_utils.utils import init_w_torch 8 | 9 | _method = { 10 | 'torch': init_w_torch, 11 | 'colossalai': init_w_col, 12 | } 13 | 14 | _builder = { 15 | 'bert': bert_builder, 16 | } 17 | 18 | 19 | def run_bert(): 20 | method = CONFIG['method'] 21 | 22 | model = CONFIG['model']['type'] 23 | model_type = model.split('_')[0] 24 | 25 | train(*_method[method](_builder[model_type])) 26 | 27 | 28 | if __name__ == '__main__': 29 | load_config() 30 | 31 | CONFIG['log_path'] = os.environ.get('LOG', '.') 32 | os.makedirs(CONFIG['log_path'], exist_ok=True) 33 | 34 | print_log(f'Initializing {CONFIG["method"]} ...') 35 | 36 | run_bert() 37 | -------------------------------------------------------------------------------- /language/bert/preprocessing/.gitignore: -------------------------------------------------------------------------------- 1 | pretrain/ 2 | wikipedia/ -------------------------------------------------------------------------------- /language/bert/preprocessing/requirements.txt: 
-------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/bert/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/config.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | DATA_PATH = '' 4 | VOCAB_FILE_PATH = '' 5 | 6 | # hyper-parameters 7 | TRAIN_ITERS = 1000000 8 | DECAY_ITERS = 990000 9 | WARMUP_FRACTION = 0.01 10 | GLOBAL_BATCH_SIZE = 32 # dp world size * sentences per GPU 11 | EVAL_ITERS = 10 12 | EVAL_INTERVAL = 10 13 | LR = 0.0001 14 | MIN_LR = 1e-05 15 | WEIGHT_DECAY = 0.01 16 | SEQ_LENGTH = 512 17 | 18 | # BERT config 19 | DEPTH = 12 20 | NUM_ATTENTION_HEADS = 12 21 | HIDDEN_SIZE = 768 22 | 23 | # model config 24 | ADD_BINARY_HEAD = False 25 | 26 | # random seed 27 | SEED = 1234 28 | 29 | # pipeline config 30 | # only enabled when pipeline > 1 31 | NUM_MICRO_BATCHES = 4 32 | 33 | # colossalai config 34 | parallel = dict(pipeline=1, tensor=dict(size=4, mode='sequence')) 35 | 36 | fp16 = dict(mode=AMP_TYPE.NAIVE, verbose=True) 37 | 38 | clip_grad_norm = 1.0 39 | 40 | gradient_handler = [dict(type='SequenceParallelGradientHandler')] 41 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/data/datasets/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/data/datasets/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Blendable dataset.""" 17 | 18 | import time 19 | 20 | import numpy as np 21 | import torch 22 | 23 | 24 | class BlendableDataset(torch.utils.data.Dataset): 25 | 26 | def __init__(self, datasets, weights): 27 | 28 | self.datasets = datasets 29 | num_datasets = len(datasets) 30 | assert num_datasets == len(weights) 31 | 32 | self.size = 0 33 | for dataset in self.datasets: 34 | self.size += len(dataset) 35 | 36 | # Normalize weights. 37 | weights = np.array(weights, dtype=np.float64) 38 | sum_weights = np.sum(weights) 39 | assert sum_weights > 0.0 40 | weights /= sum_weights 41 | 42 | # Build indices. 43 | start_time = time.time() 44 | assert num_datasets < 255 45 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 46 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 47 | 48 | from . import helpers 49 | helpers.build_blending_indices(self.dataset_index, 50 | self.dataset_sample_index, 51 | weights, num_datasets, self.size, 52 | torch.distributed.get_rank() == 0) 53 | print('> elapsed time for building blendable dataset indices: ' 54 | '{:.2f} (sec)'.format(time.time() - start_time)) 55 | 56 | def __len__(self): 57 | return self.size 58 | 59 | def __getitem__(self, idx): 60 | dataset_idx = self.dataset_index[idx] 61 | sample_idx = self.dataset_sample_index[idx] 62 | return self.datasets[dataset_idx][sample_idx] 63 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/data/datasets/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/data/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | 19 | 20 | _TOKENIZER = None 21 | _PADDED_VOCAB_SIZE = -1 22 | 23 | 24 | def initialize_tokenizer(vocab_file, tokenizer_type, vocab_extra_ids=0): 25 | tokenizer, padded_vocab_size = build_tokenizer(vocab_file, tokenizer_type, vocab_extra_ids) 26 | global _TOKENIZER, _PADDED_VOCAB_SIZE 27 | _TOKENIZER = tokenizer 28 | _PADDED_VOCAB_SIZE = padded_vocab_size 29 | 30 | 31 | def get_tokenizer(): 32 | global _TOKENIZER 33 | return _TOKENIZER 34 | 35 | 36 | def get_padded_vocab_size(): 37 | global _PADDED_VOCAB_SIZE 38 | return _PADDED_VOCAB_SIZE 39 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/loss_func/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/language/bert/sequene_parallel/loss_func/__init__.py -------------------------------------------------------------------------------- /language/bert/sequene_parallel/loss_func/bert_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from colossalai.core import global_context as gpc 4 | from colossalai.context import ParallelMode 5 | from colossalai.logging import get_dist_logger 6 | import torch.nn.functional as F 7 | import torch.distributed as dist 8 | from .cross_entropy import vocab_cross_entropy 9 | 10 | 11 | class BertLoss(nn.Module): 12 | 13 | def forward(self, 14 | lm_loss, 15 | sop_logits, 16 | loss_mask, 17 | sentence_order): 18 | lm_loss_ = lm_loss.float() 19 | loss_mask = loss_mask.float() 20 | loss_mask_sum = loss_mask.sum() 21 | lm_loss = torch.sum( 22 | lm_loss_.view(-1) * loss_mask.reshape(-1)) 23 | 24 | lm_loss /= loss_mask_sum 25 | 26 | torch.distributed.all_reduce( 27 | lm_loss, 28 | group=gpc.get_group(ParallelMode.SEQUENCE) 29 | ) 30 | 31 | if sop_logits is not None: 32 | sop_loss = F.cross_entropy(sop_logits.view(-1, 2).float(), 33 | sentence_order.view(-1), 34 | ignore_index=-1) 35 | sop_loss = sop_loss.float() 36 | loss = lm_loss + sop_loss * gpc.get_world_size(ParallelMode.SEQUENCE) 37 | else: 38 | sop_loss = None 39 | loss = lm_loss 40 | 41 | return loss 42 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/loss_func/utils.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | 5 | def ensure_divisibility(numerator, denominator): 6 | """Ensure that numerator is divisible by the denominator.""" 7 | assert numerator % denominator == 0, '{} is not divisible by {}'.format( 8 | numerator, denominator) 9 | 10 | 11 | def divide(numerator, denominator): 12 | """Ensure that numerator is divisible by the denominator and return 13 | the division value.""" 14 | ensure_divisibility(numerator, denominator) 15 | return numerator // denominator 16 | 17 | 18 | def split_tensor_along_last_dim(tensor, num_partitions, 19 | contiguous_split_chunks=False): 20 | """Split a tensor along its last dimension. 21 | Arguments: 22 | tensor: input tensor. 23 | num_partitions: number of partitions to split the tensor 24 | contiguous_split_chunks: If True, make each chunk contiguous 25 | in memory. 26 | """ 27 | # Get the size and dimension. 28 | last_dim = tensor.dim() - 1 29 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 30 | # Split. 
31 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 32 | # Note: torch.split does not create contiguous tensors by default. 33 | if contiguous_split_chunks: 34 | return tuple(chunk.contiguous() for chunk in tensor_list) 35 | 36 | return tensor_list 37 | 38 | 39 | class VocabUtility: 40 | """Split the vocabulary into `world_size` chunks and return the 41 | first and last index of the vocabulary belonging to the `rank` 42 | partition. Note that indices are in [first, last).""" 43 | 44 | @staticmethod 45 | def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, 46 | rank, world_size): 47 | index_f = rank * per_partition_vocab_size 48 | index_l = index_f + per_partition_vocab_size 49 | return index_f, index_l 50 | 51 | @staticmethod 52 | def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size): 53 | per_partition_vocab_size = divide(global_vocab_size, world_size) 54 | return VocabUtility.vocab_range_from_per_partition_vocab_size( 55 | per_partition_vocab_size, rank, world_size) 56 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/lr_scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | from .annealing_lr import AnnealingLR 2 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .embedding import VocabEmbedding, Embedding 2 | from .bert_layer import BertLayer 3 | from .head import BertDualHead 4 | from .preprocess import PreProcessor 5 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/layers/dropout.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def bias_dropout_add(x, bias, residual, prob, training): 4 | # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor 5 | out = torch.nn.functional.dropout(x + bias, p=prob, training=training) 6 | out = residual + out 7 | return out 8 | 9 | 10 | def get_bias_dropout_add(training): 11 | def _bias_dropout_add(x, bias, residual, prob): 12 | return bias_dropout_add(x, bias, residual, prob, training) 13 | return _bias_dropout_add -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/layers/head.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from .pooler import Pooler 6 | from .linear import Linear 7 | from .embedding import VocabEmbedding 8 | from colossalai.core import global_context as gpc 9 | from colossalai.context import ParallelMode 10 | from colossalai.kernel import LayerNorm 11 | from loss_func.cross_entropy import vocab_cross_entropy 12 | 13 | 14 | class BertLMHead(nn.Module): 15 | """Masked LM head for Bert. 16 | Arguments: 17 | vocab_size: vocabulary size 18 | hidden_size: hidden size 19 | 20 | """ 21 | 22 | def __init__(self, 23 | vocab_size, 24 | hidden_size, 25 | ): 26 | 27 | 
super(BertLMHead, self).__init__() 28 | self.bias = torch.nn.Parameter(torch.zeros(vocab_size)) 29 | 30 | self.dense = Linear(hidden_size, hidden_size) 31 | self.layernorm = LayerNorm(hidden_size) 32 | self.gelu = torch.nn.functional.gelu 33 | 34 | def forward(self, hidden_states, word_embeddings_weight, lm_labels): 35 | hidden_states = self.dense(hidden_states) 36 | hidden_states = self.gelu(hidden_states) 37 | hidden_states = self.layernorm(hidden_states) 38 | 39 | output = F.linear(hidden_states, word_embeddings_weight, self.bias) 40 | lm_loss = vocab_cross_entropy(output, lm_labels) 41 | 42 | return lm_loss 43 | 44 | 45 | class BertBinaryHead(nn.Module): 46 | 47 | def __init__(self, hidden_size): 48 | super().__init__() 49 | self.pooler = Pooler(hidden_size) 50 | self.dense = Linear(hidden_size, 2) 51 | 52 | def forward(self, hidden_states): 53 | if gpc.get_local_rank(ParallelMode.SEQUENCE) == 0: 54 | output = self.pooler(hidden_states) 55 | output = self.dense(output) 56 | else: 57 | output = None 58 | return output 59 | 60 | 61 | class BertDualHead(nn.Module): 62 | 63 | def __init__(self, hidden_size, vocab_size, add_binary_head): 64 | super().__init__() 65 | self.lm_head = BertLMHead(vocab_size, hidden_size) 66 | self.add_binary_head = add_binary_head 67 | if add_binary_head: 68 | self.binary_head = BertBinaryHead(hidden_size) 69 | else: 70 | self.binary_head = None 71 | 72 | def forward(self, hidden_states, word_embeddings_weight, lm_labels): 73 | if self.add_binary_head: 74 | binary_output = self.binary_head(hidden_states) 75 | else: 76 | binary_output = None 77 | lm_loss = self.lm_head(hidden_states, word_embeddings_weight, lm_labels) 78 | return lm_loss, binary_output 79 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/layers/init_method.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | 4 | def init_normal(tensor, sigma): 5 | """Init method based on N(0, sigma).""" 6 | torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 7 | 8 | 9 | def output_init_normal(tensor, sigma, num_layers): 10 | """Init method based on N(0, sigma/sqrt(2*num_layers)).""" 11 | std = sigma / math.sqrt(2.0 * num_layers) 12 | torch.nn.init.normal_(tensor, mean=0.0, std=std) 13 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/layers/linear.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import Parameter 4 | import torch.nn.functional as F 5 | import torch.nn.init as init 6 | 7 | 8 | class Linear(nn.Module): 9 | """Linear layer, defined as Y = XA + b. 10 | This is a simplified, non-parallel version of a column-parallel 11 | linear layer (the parallel version splits A along its second 12 | dimension as A = [A_1, ..., A_p]). 13 | Arguments: 14 | input_size: first dimension of matrix A. 15 | output_size: second dimension of matrix A. 16 | bias: If true, add bias. The bias is always initialized to zero. 17 | skip_bias_add: This was added to enable performance optimizations 18 | where bias can be fused with other elementwise operations. 19 | We skip adding bias here but instead return it. 20 | 21 | 22 | 23 | 24 | 
25 | """ 26 | 27 | def __init__(self, 28 | input_size, 29 | output_size, 30 | bias=True, 31 | skip_bias_add=False): 32 | super(Linear, self).__init__() 33 | 34 | # Keep input parameters 35 | self.input_size = input_size 36 | self.output_size = output_size 37 | self.skip_bias_add = skip_bias_add 38 | 39 | self.weight = Parameter(torch.empty(self.output_size, 40 | self.input_size, 41 | )) 42 | init.normal_(self.weight) 43 | if bias: 44 | self.bias = Parameter(torch.empty(self.output_size)) 45 | # Always initialize bias to zero. 46 | with torch.no_grad(): 47 | self.bias.zero_() 48 | else: 49 | self.register_parameter('bias', None) 50 | 51 | def forward(self, input_): 52 | # Matrix multiply. 53 | bias = self.bias if not self.skip_bias_add else None 54 | output = F.linear(input_, self.weight, bias) 55 | 56 | if self.skip_bias_add: 57 | return output, self.bias 58 | else: 59 | return output 60 | 61 | def __repr__(self): 62 | return f'Linear(in_features={self.input_size}, out_features={self.output_size}, ' + \ 63 | f'bias={self.bias is not None}, skip_bias_add={self.skip_bias_add})' 64 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/layers/mlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .linear import Linear 6 | from colossalai.kernel.jit import bias_gelu_impl 7 | 8 | 9 | class TransformerMLP(nn.Module): 10 | """MLP. 11 | MLP will take the input with h hidden state, project it to 4*h 12 | hidden dimension, perform nonlinear transformation, and project the 13 | state back into h hidden dimension. At the end, dropout is also 14 | applied. 15 | """ 16 | 17 | def __init__(self, hidden_size, mlp_ratio, fuse_gelu=True): 18 | super(TransformerMLP, self).__init__() 19 | 20 | # Project to 4h. 21 | self.dense_h_to_4h = Linear( 22 | hidden_size, 23 | int(hidden_size*mlp_ratio), 24 | skip_bias_add=True) 25 | 26 | self.bias_gelu_fusion = fuse_gelu 27 | self.activation_func = F.gelu 28 | 29 | # Project back to h. 30 | self.dense_4h_to_h = Linear( 31 | int(hidden_size*mlp_ratio), 32 | hidden_size, 33 | skip_bias_add=True) 34 | 35 | def forward(self, hidden_states): 36 | # hidden states should be in the shape of [s, b, h] 37 | # it will be projects into [s, b, 4h] 38 | # and projected back to [s, b, h] 39 | intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) 40 | 41 | if self.bias_gelu_fusion: 42 | intermediate_parallel = \ 43 | bias_gelu_impl(intermediate_parallel, bias_parallel) 44 | else: 45 | intermediate_parallel = \ 46 | self.activation_func(intermediate_parallel + bias_parallel) 47 | 48 | # [s, b, h] 49 | output, output_bias = self.dense_4h_to_h(intermediate_parallel) 50 | return output, output_bias 51 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/layers/pooler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .linear import Linear 4 | 5 | 6 | class Pooler(nn.Module): 7 | """Pooler layer. 8 | 9 | Pool hidden states of a specific token (for example start of the 10 | sequence) and add a linear transformation followed by a tanh. 11 | 12 | Arguments: 13 | hidden_size: hidden size 14 | init_method: weight initialization method for the linear layer. 15 | bias is set to zero. 
16 | """ 17 | 18 | def __init__(self, hidden_size): 19 | super(Pooler, self).__init__() 20 | self.dense = Linear(hidden_size, hidden_size) 21 | 22 | def forward(self, hidden_states, sequence_index=0): 23 | # hidden_states: [b, s, h] 24 | # sequence_index: index of the token to pool. 25 | pooled = hidden_states[:, sequence_index, :] 26 | pooled = self.dense(pooled) 27 | pooled = torch.tanh(pooled) 28 | return pooled 29 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/model/layers/preprocess.py: -------------------------------------------------------------------------------- 1 | from colossalai.context.parallel_mode import ParallelMode 2 | import torch 3 | import torch.nn as nn 4 | from colossalai.core import global_context as gpc 5 | 6 | 7 | class PreProcessor(nn.Module): 8 | 9 | def __init__(self, sub_seq_length): 10 | super().__init__() 11 | self.sub_seq_length = sub_seq_length 12 | 13 | def bert_position_ids(self, token_ids): 14 | # Create position ids 15 | seq_length = token_ids.size(1) 16 | local_rank = gpc.get_local_rank(ParallelMode.SEQUENCE) 17 | position_ids = torch.arange(seq_length*local_rank, 18 | seq_length * (local_rank+1), 19 | dtype=torch.long, 20 | device=token_ids.device) 21 | position_ids = position_ids.unsqueeze(0).expand_as(token_ids) 22 | 23 | return position_ids 24 | 25 | def bert_extended_attention_mask(self, attention_mask): 26 | local_rank = gpc.get_local_rank(ParallelMode.SEQUENCE) 27 | start_index = local_rank * self.sub_seq_length 28 | end_index = (local_rank + 1) * self.sub_seq_length 29 | 30 | # We create a 3D attention mask from a 2D tensor mask. 31 | # [b, 1, s] 32 | attention_mask_b1s = attention_mask.unsqueeze(1) 33 | # [b, s, 1] 34 | attention_mask_bs1 = attention_mask.unsqueeze(2) 35 | # [b, s/D, s] 36 | attention_mask_bss = attention_mask_b1s * attention_mask_bs1 37 | 38 | attention_mask_bss = attention_mask_bss[:, start_index:end_index, :] 39 | 40 | # [b, 1, s/D, s] 41 | extended_attention_mask = attention_mask_bss.unsqueeze(1) 42 | 43 | # Convert attention mask to binary: 44 | extended_attention_mask = (extended_attention_mask < 0.5) 45 | 46 | return extended_attention_mask 47 | 48 | def forward(self, input_ids=None, attention_mask=None): 49 | if attention_mask is not None: 50 | extended_attention_mask = self.bert_extended_attention_mask(attention_mask) 51 | else: 52 | extended_attention_mask = None 53 | 54 | if input_ids is not None: 55 | position_ids = self.bert_position_ids(input_ids) 56 | else: 57 | position_ids = None 58 | return position_ids, extended_attention_mask 59 | -------------------------------------------------------------------------------- /language/bert/sequene_parallel/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/bert/zero/.gitignore: -------------------------------------------------------------------------------- 1 | download/ 2 | pretrain/ -------------------------------------------------------------------------------- /language/bert/zero/README.md: -------------------------------------------------------------------------------- 1 | ## Train BERT with ZeRO 2 | 3 | ![Still In Progress](https://img.shields.io/badge/-Still%20In%20Progress-orange) 4 | 5 | ### About ZeRO 6 | 7 | Zero redundancy optimizer is a memory-optimization method for large-scale model training. 
8 | It shards tensors in optimizer states, gradients, and parameters so that large models can be accommodated by limited GPU memory. 9 | Offloading techniques are integrated to further utilize the CPU memory space. 10 | Colossal-AI has an optimized ZeRO module equipped with our unique chunk mechanism to maximize memory utilization and achieve higher training throughput. 11 | More details can be found in our [documentation](https://www.colossalai.org/docs/features/zero_redundancy_and_zero_offload). 12 | 13 | ## Pretraining 14 | 15 | ### Data Preparation 16 | 17 | You need to follow the [documentation](../preprocessing/README.md) in the `preprocessing` folder to preprocess the Wikipedia dataset. 18 | You should obtain a `wikipedia` folder. Use a symbolic link to link it to the current directory (e.g. `ln -s ../preprocessing/pretrain ./pretrain_data`). 19 | 20 | ### Execute Pretraining 21 | 22 | Use the command below to start pretraining. If you want to do multi-node training, you can refer to the [documentation on how to launch multi-node training](https://www.colossalai.org/docs/basics/launch_colossalai). 23 | 24 | ```bash 25 | bash ./scripts/run_pretrain.sh 26 | ``` 27 | 28 | ## Fine-tuning 29 | 30 | In this repository, we provide fine-tuning examples for different downstream tasks. Each section comes with step-by-step instructions to fine-tune the pretrained BERT model. 31 | 32 | ### GLUE 33 | 34 | 1. Prepare the dataset 35 | 36 | Execute the command below. This will create a `download` folder in the current directory. This folder contains the downstream task datasets. 37 | 38 | ```bash 39 | bash ./scripts/download_finetune_dataset.sh 40 | ``` 41 | 42 | 2. Fine-tuning 43 | 44 | Run the fine-tuning script. This script uses 1 GPU only by default. If you wish to use more GPUs, adjust the batch size per GPU accordingly. 45 | The SOTA results are reproduced with a global batch size of 128.
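For instance, with 8 GPUs a per-GPU batch size of 16 keeps the global batch size at 128 (8 × 16 = 128, assuming no gradient accumulation).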
46 | 47 | ```bash 48 | bash ./scripts/run_finetune_glue.sh 49 | ``` 50 | 51 | Reproduced results: 52 | 53 | | Metric | Value | 54 | | - | - | 55 | | F1 | 89.1 | 56 | | Accuracy | 84.31 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /language/bert/zero/configs/bert_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 768, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 3072, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 12, 10 | "num_hidden_layers": 12, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } -------------------------------------------------------------------------------- /language/bert/zero/configs/colossalai_amp.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | fp16 = dict(mode=AMP_TYPE.TORCH) 4 | 5 | seed = 2 6 | -------------------------------------------------------------------------------- /language/bert/zero/configs/colossalai_zero.py: -------------------------------------------------------------------------------- 1 | from colossalai.zero.shard_utils import TensorShardStrategy 2 | 3 | zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(), 4 | reduce_scatter_bucket_size_mb=25, 5 | fp32_reduce_scatter=False, 6 | tensor_placement_policy="cuda", 7 | gradient_predivide_factor=1.0, 8 | reuse_fp16_shard=True), 9 | optimizer_config=dict(gpu_margin_mem_ratio=0.8, 10 | initial_scale=2**5, 11 | min_scale=1, 12 | growth_factor=2, 13 | backoff_factor=0.5, 14 | growth_interval=1000, 15 | hysteresis=2, 16 | max_scale=2**32)) 17 | -------------------------------------------------------------------------------- /language/bert/zero/finetuning/glue/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/language/bert/zero/finetuning/glue/__init__.py -------------------------------------------------------------------------------- /language/bert/zero/finetuning/glue/data.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import pickle 4 | from colossalai.logging import get_dist_logger 5 | from colossalai.core import global_context as gpc 6 | from torch.utils.data import TensorDataset 7 | from processors import convert_examples_to_features 8 | 9 | 10 | def gen_tensor_dataset(features): 11 | all_input_ids = torch.tensor( 12 | [f.input_ids for f in features], 13 | dtype=torch.long, 14 | ) 15 | all_input_mask = torch.tensor( 16 | [f.input_mask for f in features], 17 | dtype=torch.long, 18 | ) 19 | all_segment_ids = torch.tensor( 20 | [f.segment_ids for f in features], 21 | dtype=torch.long, 22 | ) 23 | all_label_ids = torch.tensor( 24 | [f.label_id for f in features], 25 | dtype=torch.long, 26 | ) 27 | return TensorDataset( 28 | all_input_ids, 29 | all_input_mask, 30 | all_segment_ids, 31 | all_label_ids, 32 | ) 33 | 34 | 35 | def get_train_features(data_dir, vocab_file, max_seq_length, do_lower_case, tokenizer, processor): 36 | 37 | cached_train_features_file = os.path.join( 38 | data_dir, 39 | '{0}_{1}_{2}'.format( 40 | vocab_file, 41 | str(max_seq_length), 42 | str(do_lower_case), 43 | ), 44 | ) 45 | train_features = None 46 | logger =
get_dist_logger() 47 | try: 48 | with open(cached_train_features_file, "rb") as reader: 49 | train_features = pickle.load(reader) 50 | logger.info("Loaded pre-processed features from {}".format(cached_train_features_file)) 51 | except Exception: 52 | logger.info("Did not find pre-processed features from {}".format(cached_train_features_file)) 53 | train_examples = processor.get_train_examples(data_dir) 54 | train_features, _ = convert_examples_to_features( 55 | train_examples, 56 | processor.get_labels(), 57 | max_seq_length, 58 | tokenizer, 59 | ) 60 | if gpc.get_global_rank() == 0: 61 | logger.info(" Saving train features into cached file %s", cached_train_features_file) 62 | with open(cached_train_features_file, "wb") as writer: 63 | pickle.dump(train_features, writer) 64 | return train_features 65 | -------------------------------------------------------------------------------- /language/bert/zero/pretraining/arguments.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | 3 | 4 | __all__ = ['parse_args'] 5 | 6 | 7 | def parse_args(): 8 | parser = colossalai.get_default_parser() 9 | parser.add_argument('--bert-config', type=str, required=True) 10 | parser.add_argument('--lr', type=float, required=True) 11 | parser.add_argument('--data', type=str, required=True) 12 | parser.add_argument('--warmup-ratio', default=0.01, type=float) 13 | parser.add_argument('--vocab-file', type=str, required=True) 14 | parser.add_argument('--epoch', type=int, required=True) 15 | parser.add_argument('--batch-size', type=int, required=True) 16 | parser.add_argument('--save-checkpoint-interval', type=int, required=True) 17 | parser.add_argument('--output-dir', type=str, required=True) 18 | args = parser.parse_args() 19 | return args 20 | -------------------------------------------------------------------------------- /language/bert/zero/pretraining/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | __all__ = ['LossForPretraining'] 4 | 5 | 6 | class LossForPretraining(torch.nn.Module): 7 | 8 | def __init__(self, vocab_size): 9 | super(LossForPretraining, self).__init__() 10 | self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1) 11 | self.vocab_size = vocab_size 12 | 13 | def forward(self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels): 14 | masked_lm_loss = self.loss_fn(prediction_scores.view(-1, self.vocab_size), masked_lm_labels.view(-1)) 15 | next_sentence_loss = self.loss_fn(seq_relationship_score.view(-1, 2), next_sentence_labels.view(-1)) 16 | total_loss = masked_lm_loss + next_sentence_loss 17 | return total_loss 18 | -------------------------------------------------------------------------------- /language/bert/zero/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/bert/zero/scripts/download_finetune_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2019-2020 NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | git clone https://github.com/NVIDIA/DeepLearningExamples.git 17 | mv DeepLearningExamples/PyTorch/LanguageModeling/BERT/data ./nv-dl-examples-data 18 | rm -rf DeepLearningExamples 19 | pip install wget 20 | 21 | export BERT_PREP_WORKING_DIR=$PWD 22 | 23 | python3 ./nv-dl-examples-data/bertPrep.py --action download --dataset squad 24 | python3 ./nv-dl-examples-data/bertPrep.py --action download --dataset mrpc 25 | python3 ./nv-dl-examples-data/bertPrep.py --action download --dataset sst-2 -------------------------------------------------------------------------------- /language/bert/zero/scripts/run_finetune_glue.sh: -------------------------------------------------------------------------------- 1 | 2 | GLUE_DATASET=$PWD/download/glue/MRPC 3 | VOCAB_FILE="bert-base-uncased" 4 | CODE_DIR=$PWD/finetuning/glue 5 | 6 | colossalai run --nproc_per_node 1 \ 7 | --master_port 29510 \ 8 | $CODE_DIR/main.py \ 9 | --data_dir $GLUE_DATASET \ 10 | --task_name mrpc \ 11 | --bert_config ./configs/bert_base.json \ 12 | --vocab_file $VOCAB_FILE \ 13 | --output_dir ./finetuning_outputs \ 14 | --train_batch_size 128 \ 15 | --eval_batch_size 128 \ 16 | --num_train_epochs 3 \ 17 | --train \ 18 | --eval \ 19 | --predict 20 | -------------------------------------------------------------------------------- /language/bert/zero/scripts/run_pretrain.sh: -------------------------------------------------------------------------------- 1 | 2 | BERT_CONFIG_PATH='./configs/bert_base.json' 3 | PY_FILE_PATH='./pretraining/run_pretraining.py' 4 | DATA_PATH='./pretrain_data/phase1/unbinned/parquet' 5 | VOCAB_FILE='bert-base-uncased' 6 | 7 | export PYTHONPATH=$PWD 8 | 9 | colossalai run --nproc_per_node 8 \ 10 | --master_port 29550 \ 11 | $PY_FILE_PATH \ 12 | --bert-config $BERT_CONFIG_PATH \ 13 | --lr 1e-4 \ 14 | --data $DATA_PATH \ 15 | --vocab-file $VOCAB_FILE \ 16 | --batch-size 32 \ 17 | --epoch 100 \ 18 | --output-dir ./pretrain_outputs \ 19 | --save-checkpoint-interval 5 20 | -------------------------------------------------------------------------------- /language/gpt/dataset/webtext.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import torch 5 | from colossalai.registry import DATASETS 6 | from torch.utils.data import Dataset 7 | from transformers import GPT2Tokenizer 8 | 9 | 10 | @DATASETS.register_module 11 | class WebtextDataset(Dataset): 12 | def __init__(self, path, seq_len=1024) -> None: 13 | super().__init__() 14 | root = os.path.dirname(path) 15 | encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt') 16 | if os.path.isfile(encoded_data_cache_path): 17 | seq_len_, data, attention_mask = torch.load(encoded_data_cache_path) 18 | if seq_len_ == seq_len: 19 | self.data = data 20 | self.attention_mask = attention_mask 21 | return 22 | raw_data = [] 23 | with open(path) as f: 24 | for line in f.readlines(): 25 | raw_data.append(json.loads(line)['text']) 26 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 27 | tokenizer.pad_token = tokenizer.unk_token 28 | encoded_data = 
tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt') 29 | self.data = encoded_data['input_ids'] 30 | self.attention_mask = encoded_data['attention_mask'] 31 | torch.save((seq_len, self.data, self.attention_mask), encoded_data_cache_path) 32 | 33 | def __len__(self): 34 | return len(self.data) 35 | 36 | def __getitem__(self, index): 37 | return {'input_ids': self.data[index], 38 | 'attention_mask': self.attention_mask[index]}, self.data[index] -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_1d.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | from titans.loss.lm_loss import GPTLMLoss 3 | from titans.model.gpt import gpt2_small, gpt2_large, gpt2_xl, gpt2_8B 4 | from torch.optim import Adam 5 | 6 | # change bs here 7 | BATCH_SIZE = 32 8 | SEQ_LEN = 1024 9 | NUM_EPOCHS = 60 10 | 11 | TENSOR_PARALLEL = 4 12 | 13 | optimizer = dict( 14 | type=Adam, 15 | lr=0.00015, 16 | weight_decay=1e-2, 17 | ) 18 | 19 | fp16 = dict(mode=AMP_TYPE.NAIVE) 20 | 21 | loss = dict(type=GPTLMLoss,) 22 | 23 | model = dict( 24 | type=gpt2_8B, 25 | checkpoint=True, 26 | ) 27 | 28 | parallel = dict( 29 | pipeline=1, 30 | tensor=dict(size=TENSOR_PARALLEL, mode='1d'), 31 | ) 32 | -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_2d.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | from titans.loss.lm_loss import GPTLMLoss 3 | from titans.model.gpt import gpt2_small 4 | from torch.optim import Adam 5 | 6 | BATCH_SIZE = 4 7 | SEQ_LEN = 1024 8 | NUM_EPOCHS = 60 9 | TENSOR_PARALLEL = 4 10 | 11 | optimizer = dict( 12 | type=Adam, 13 | lr=0.00015, 14 | weight_decay=1e-2, 15 | ) 16 | 17 | fp16 = dict( 18 | mode=AMP_TYPE.NAIVE 19 | ) 20 | 21 | loss = dict( 22 | type=GPTLMLoss, 23 | ) 24 | 25 | model = dict( 26 | type=gpt2_small, 27 | checkpoint=True, 28 | ) 29 | 30 | parallel = dict( 31 | pipeline=1, 32 | tensor=dict(size=TENSOR_PARALLEL, mode='2d'), 33 | ) 34 | -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_2p5d.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | from titans.loss.lm_loss import GPTLMLoss 3 | from titans.model.gpt import gpt2_small 4 | from torch.optim import Adam 5 | 6 | BATCH_SIZE = 4 7 | SEQ_LEN = 1024 8 | NUM_EPOCHS = 60 9 | TENSOR_PARALLEL = 8 10 | DEPTH = 2 11 | 12 | 13 | optimizer = dict( 14 | type=Adam, 15 | lr=0.00015, 16 | weight_decay=1e-2, 17 | ) 18 | 19 | fp16 = dict( 20 | mode=AMP_TYPE.NAIVE 21 | ) 22 | 23 | loss = dict( 24 | type=GPTLMLoss, 25 | ) 26 | 27 | model = dict( 28 | type=gpt2_small, 29 | checkpoint=True, 30 | ) 31 | 32 | 33 | parallel = dict( 34 | pipeline=1, 35 | tensor=dict(size=TENSOR_PARALLEL, depth=DEPTH, mode='2.5d'), 36 | ) 37 | -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_3d.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | from titans.loss.lm_loss import GPTLMLoss 3 | from titans.model.gpt import gpt2_small 4 | from torch.optim import Adam 5 | 6 | BATCH_SIZE = 4 7 | SEQ_LEN = 1024 8 | NUM_EPOCHS = 60 9 | TENSOR_PARALLEL = 8 10 | 11 | optimizer = dict( 12 | type=Adam, 13 | 
lr=0.00015, 14 | weight_decay=1e-2, 15 | ) 16 | 17 | fp16 = dict( 18 | mode=AMP_TYPE.NAIVE 19 | ) 20 | 21 | loss = dict( 22 | type=GPTLMLoss, 23 | ) 24 | 25 | model = dict( 26 | type=gpt2_small, 27 | checkpoint=True, 28 | ) 29 | 30 | parallel = dict( 31 | pipeline=1, 32 | tensor=dict(size=TENSOR_PARALLEL, mode='3d'), 33 | ) 34 | -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_pp.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | from titans.loss.lm_loss import GPTLMLoss 3 | from titans.model.gpt import gpt2_small 4 | #from model_zoo.gpt.gpt import gpt2_small_pipeline 5 | from torch.optim import Adam 6 | 7 | 8 | BATCH_SIZE = 8 9 | SEQ_LEN = 1024 10 | NUM_EPOCHS = 60 11 | HIDDEN_SIZE = 768 12 | NUM_MICRO_BATCHES = 4 13 | PIPELINE = 2 14 | 15 | optimizer = dict( 16 | type=Adam, 17 | lr=0.00015, 18 | weight_decay=1e-2, 19 | ) 20 | 21 | fp16 = dict( 22 | mode=AMP_TYPE.NAIVE 23 | ) 24 | 25 | loss = dict( 26 | type=GPTLMLoss, 27 | ) 28 | 29 | model = dict( 30 | type=gpt2_small, 31 | checkpoint=True, 32 | ) 33 | 34 | parallel = dict( 35 | pipeline=PIPELINE, 36 | tensor=dict(size=1, mode=None), 37 | ) 38 | -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_pp1d.py: -------------------------------------------------------------------------------- 1 | from titans.loss.lm_loss import GPTLMLoss 2 | from titans.loss.vocab_cross_entropy import vocab_parallel_cross_entropy 3 | from titans.model.gpt import gpt2_small 4 | from torch.optim import Adam 5 | from colossalai.amp import AMP_TYPE 6 | import torch 7 | 8 | BATCH_SIZE = 8 9 | NUM_EPOCHS = 60 10 | SEQ_LEN = 1024 11 | 12 | NUM_MICRO_BATCHES = 4 13 | HIDDEN_SIZE = 768 14 | PIPELINE = 2 15 | TENSOR_PARALLEL = 2 16 | MODE = '1d' 17 | 18 | fp16 = dict(mode=AMP_TYPE.NAIVE) 19 | 20 | parallel = dict(pipeline=PIPELINE, tensor=dict(mode=MODE, size=TENSOR_PARALLEL)) 21 | 22 | optimizer = dict( 23 | type=Adam, 24 | lr=0.00015, 25 | weight_decay=1e-2, 26 | ) 27 | 28 | model = dict( 29 | type=gpt2_small, 30 | checkpoint=True, 31 | dtype=torch.half, 32 | ) 33 | 34 | loss_fn = dict(type=vocab_parallel_cross_entropy) 35 | -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_vanilla.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | from titans.model.gpt import gpt2_small 3 | from torch.optim import Adam 4 | 5 | 6 | BATCH_SIZE = 1 7 | NUM_EPOCHS = 60 8 | SEQ_LEN = 1024 9 | 10 | optimizer = dict( 11 | type=Adam, 12 | lr=0.00015, 13 | weight_decay=1e-2, 14 | ) 15 | 16 | fp16 = dict( 17 | mode=AMP_TYPE.NAIVE 18 | ) 19 | 20 | 21 | model = dict( 22 | type=gpt2_small, 23 | checkpoint=True, 24 | ) 25 | 26 | parallel = dict( 27 | pipeline=1, 28 | tensor=dict(size=1, mode=None), 29 | ) -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_zero3.py: -------------------------------------------------------------------------------- 1 | from colossalai.nn.optimizer import HybridAdam 2 | from colossalai.zero.shard_utils import TensorShardStrategy 3 | from titans.model.gpt import gpt2_small 4 | 5 | BATCH_SIZE = 2 6 | NUM_EPOCHS = 60 7 | SEQ_LEN = 1024 8 | 9 | 10 | zero = dict( 11 | model_config=dict( 12 | tensor_placement_policy='cpu', 13 | shard_strategy=TensorShardStrategy(), 14 | 
reuse_fp16_shard=True 15 | ), 16 | optimizer_config=dict() 17 | ) 18 | 19 | 20 | optimizer = dict( 21 | type=HybridAdam, 22 | lr=0.00015, 23 | weight_decay=1e-2, 24 | ) 25 | 26 | model = dict( 27 | type=gpt2_small, 28 | checkpoint=True, 29 | ) 30 | -------------------------------------------------------------------------------- /language/gpt/gpt2_configs/gpt2_zero3_pp1d.py: -------------------------------------------------------------------------------- 1 | from colossalai.nn.optimizer import HybridAdam 2 | from colossalai.zero.shard_utils import (BucketTensorShardStrategy, 3 | TensorShardStrategy) 4 | from model import GPT2_small_pipeline_hybrid 5 | 6 | BATCH_SIZE = 8 7 | NUM_EPOCHS = 60 8 | SEQ_LEN = 1024 9 | NUM_MICRO_BATCHES = 4 10 | HIDDEN_SIZE = 768 11 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LEN, HIDDEN_SIZE) 12 | zero = dict( 13 | model_config=dict( 14 | tensor_placement_policy='cpu', 15 | shard_strategy=BucketTensorShardStrategy() 16 | ), 17 | optimizer_config=dict() 18 | ) 19 | 20 | 21 | optimizer = dict( 22 | type=HybridAdam, 23 | lr=0.00015, 24 | weight_decay=1e-2, 25 | ) 26 | 27 | model = dict( 28 | type=GPT2_small_pipeline_hybrid, 29 | checkpoint=True, 30 | num_chunks=1 31 | ) 32 | 33 | parallel = dict( 34 | pipeline=2, 35 | tensor=dict(size=2, mode='1d'), 36 | ) 37 | -------------------------------------------------------------------------------- /language/gpt/gpt3_configs/gpt3_pp1d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from titans.model.gpt import gpt3 3 | from titans.loss.vocab_cross_entropy import vocab_parallel_cross_entropy 4 | from torch.optim import Adam 5 | from colossalai.amp import AMP_TYPE 6 | 7 | 8 | 9 | BATCH_SIZE = 192 10 | NUM_EPOCHS = 60 11 | SEQ_LEN = 2048 12 | NUM_MICRO_BATCHES = 192 13 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LEN, 12288) 14 | 15 | fp16 = dict( 16 | mode=AMP_TYPE.NAIVE 17 | ) 18 | 19 | parallel = dict( 20 | pipeline=32, 21 | tensor=dict(mode='1d', size=4) 22 | ) 23 | 24 | optimizer = dict( 25 | type=Adam, 26 | lr=0.00015, 27 | weight_decay=1e-2, 28 | ) 29 | 30 | model = dict( 31 | type=gpt3, 32 | checkpoint=True, 33 | dtype=torch.half, 34 | ) 35 | 36 | loss_fn = dict(type=vocab_parallel_cross_entropy) 37 | -------------------------------------------------------------------------------- /language/gpt/gpt3_configs/gpt3_pp1d_min.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from titans.model.gpt import gpt3 3 | from titans.loss.vocab_cross_entropy import vocab_parallel_cross_entropy 4 | from torch.optim import Adam 5 | from colossalai.amp import AMP_TYPE 6 | 7 | 8 | 9 | BATCH_SIZE = 192 10 | NUM_EPOCHS = 60 11 | SEQ_LEN = 2048 12 | NUM_MICRO_BATCHES = 192 13 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LEN, 12288) 14 | 15 | fp16 = dict( 16 | mode=AMP_TYPE.NAIVE 17 | ) 18 | 19 | parallel = dict( 20 | pipeline=24, 21 | tensor=dict(mode='1d', size=4) 22 | ) 23 | 24 | optimizer = dict( 25 | type=Adam, 26 | lr=0.00015, 27 | weight_decay=1e-2, 28 | ) 29 | 30 | model = dict( 31 | type=gpt3, 32 | checkpoint=True, 33 | dtype=torch.half, 34 | ) 35 | 36 | loss_fn = dict(type=vocab_parallel_cross_entropy) 37 | -------------------------------------------------------------------------------- /language/gpt/gpt3_configs/gpt3_pp2d.py: -------------------------------------------------------------------------------- 1 | from titans.model.gpt import gpt3 2 | from torch.optim import Adam 3 | from 
colossalai.amp import AMP_TYPE 4 | import torch 5 | 6 | 7 | BATCH_SIZE = 2*48 8 | NUM_EPOCHS = 60 9 | SEQ_LEN = 2048 10 | NUM_MICRO_BATCHES = 48 11 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES // 2, SEQ_LEN, 12288 // 2) 12 | 13 | fp16 = dict( 14 | mode=AMP_TYPE.NAIVE 15 | ) 16 | 17 | parallel = dict( 18 | pipeline=24, 19 | tensor=dict(mode='2d', size=4) 20 | ) 21 | 22 | optimizer = dict( 23 | type=Adam, 24 | lr=0.00015, 25 | weight_decay=1e-2, 26 | ) 27 | 28 | model = dict( 29 | type=gpt3, 30 | checkpoint=True, 31 | dtype=torch.half, 32 | ) 33 | -------------------------------------------------------------------------------- /language/gpt/gpt3_configs/gpt3_pp2p5d.py: -------------------------------------------------------------------------------- 1 | from titans.model.gpt import gpt3 2 | from torch.optim import Adam 3 | from colossalai.amp import AMP_TYPE 4 | import torch 5 | 6 | 7 | BATCH_SIZE = 2*48 8 | NUM_EPOCHS = 60 9 | SEQ_LEN = 2048 10 | NUM_MICRO_BATCHES = 48 11 | TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES // 2, SEQ_LEN, 12288 // 2) 12 | 13 | fp16 = dict( 14 | mode=AMP_TYPE.NAIVE 15 | ) 16 | 17 | parallel = dict( 18 | pipeline=24, 19 | tensor=dict(mode='2.5d', depth = 1, size=4) 20 | ) 21 | 22 | optimizer = dict( 23 | type=Adam, 24 | lr=0.00015, 25 | weight_decay=1e-2, 26 | ) 27 | 28 | model = dict( 29 | type=gpt3, 30 | checkpoint=True, 31 | dtype=torch.half, 32 | ) 33 | -------------------------------------------------------------------------------- /language/gpt/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .embed import vocab_parallel_cross_entropy 2 | from .gpt1d import * 3 | from .pipeline_gpt1d import * 4 | -------------------------------------------------------------------------------- /language/gpt/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/gpt/tools/Megatron/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpcaitech/ColossalAI-Examples/ea860bc3e747aa6b4871ab841de607e5b35ce679/language/gpt/tools/Megatron/__init__.py -------------------------------------------------------------------------------- /language/gpt/tools/Megatron/remove_group_duplicates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import json 18 | import time 19 | import sys 20 | 21 | 22 | if __name__ == '__main__': 23 | 24 | url_filename = sys.argv[1] 25 | data_filename = sys.argv[2] 26 | output_filename = sys.argv[3] 27 | 28 | urls = set() 29 | with open(url_filename, 'r') as f: 30 | for line in f: 31 | myjson = json.loads(line) 32 | for key in myjson: 33 | this_urls = myjson[key] 34 | for i in range(1, len(this_urls)): 35 | urls.add(this_urls[i]) 36 | print('will be removing {} urls'.format(len(urls)), flush=True) 37 | 38 | written_docs = 0 39 | removed_docs = 0 40 | removed_chars = 0 41 | start_time = time.time() 42 | with open(output_filename, 'wb') as fout: 43 | with open(data_filename, 'r') as fin: 44 | for line in fin: 45 | try: 46 | myjson = json.loads(line) 47 | url = myjson['url'] 48 | if url in urls: 49 | print('removing', myjson) 50 | removed_docs += 1 51 | removed_chars += len(myjson['text']) 52 | continue 53 | myjson = json.dumps(myjson, ensure_ascii=False) 54 | fout.write(myjson.encode('utf-8')) 55 | fout.write('\n'.encode('utf-8')) 56 | written_docs += 1 57 | if written_docs % 10000 == 0: 58 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 59 | '| removed: {} (char: {})'.format( 60 | time.time() - start_time, 61 | written_docs, removed_docs, removed_chars)) 62 | except Exception as e: 63 | print('[SKIPPING]', line, e) 64 | 65 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 66 | '| removed: {} (char: {})'.format( 67 | time.time() - start_time, 68 | written_docs, removed_docs, removed_chars)) 69 | print('done :-)') 70 | -------------------------------------------------------------------------------- /language/gpt/tools/Megatron/tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import sys 17 | sys.path.append('..') 18 | 19 | from gpt2_tokenization import GPT2Tokenizer 20 | 21 | 22 | class Tokenizer: 23 | 24 | def __init__(self, cache_dir=None): 25 | self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2', 26 | cache_dir=cache_dir) 27 | self.tokenizer.max_len = int(1e12) 28 | self.eod_token = self.tokenizer.encoder['<|endoftext|>'] 29 | assert self.eod_token < 65535, 'vocab size will not fit in uint16' 30 | print('> GPT2 tokenizer with {} vocab size and eod token {} ...'.format( 31 | len(self.tokenizer.encoder), self.eod_token)) 32 | 33 | def tokenize_document(self, document): 34 | tokens = self.tokenizer.encode(document) 35 | tokens.append(self.eod_token) 36 | return tokens 37 | -------------------------------------------------------------------------------- /language/gpt/tools/download/download_old.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import newspaper 3 | import os 4 | import hashlib 5 | import traceback 6 | import tldextract 7 | import tqdm 8 | from filter import should_exclude 9 | 10 | hash = hashlib.sha256 11 | 12 | try: 13 | os.mkdir('data') 14 | except FileExistsError: 15 | pass 16 | 17 | 18 | def dl(url): 19 | url = url.strip() 20 | 21 | if should_exclude(url): 22 | return 23 | 24 | ext = tldextract.extract(url) 25 | domain = '.'.join([x for x in ext if x]) 26 | 27 | fname = 'data/{}-{}.txt'.format(domain, hash(url.encode()).hexdigest()) 28 | if os.path.isfile(fname): 29 | return 30 | # print('Downloading', url) 31 | try: 32 | article = newspaper.Article(url, fetch_images=False) 33 | article.download() 34 | article.parse() 35 | except newspaper.article.ArticleException: 36 | # print('Dead link:', url) 37 | return 38 | # traceback.print_exc() 39 | 40 | text = article.text 41 | 42 | 43 | if text.strip() == '': 44 | # print('Empty') 45 | return 46 | 47 | with open(fname, 'w') as out: 48 | out.write(text) 49 | 50 | 51 | if __name__ == '__main__': 52 | p = mp.Pool(100) # num of download threads 53 | with open('urls.txt') as fh: 54 | urls = list(fh) 55 | 56 | list(tqdm.tqdm(p.imap(dl, urls), total=len(urls))) 57 | print('Done!') 58 | -------------------------------------------------------------------------------- /language/gpt/tools/download/get_urls.py: -------------------------------------------------------------------------------- 1 | import praw 2 | import psaw 3 | import tqdm 4 | import datetime 5 | 6 | 7 | api = psaw.PushshiftAPI() 8 | 9 | 10 | # all posts until the end of 2017 11 | end_time = int(datetime.datetime(2018, 1, 1).timestamp()) 12 | 13 | 14 | query = api.search_submissions(before=end_time, 15 | filter=['url', 'score'], 16 | sort='desc', 17 | score='>2', 18 | is_self=False, 19 | over_18=False) 20 | 21 | with tqdm.tqdm() as pbar: 22 | # download links from submissions 23 | with open('urls.txt', 'w') as fh: 24 | for subm in query: 25 | url = subm.url 26 | 27 | # weird issue with psaw/pushshift that breaks score=">2" 28 | if subm.score < 3: 29 | continue 30 | #print(subm.score) 31 | # pbar.write(str(datetime.datetime.fromtimestamp(subm.created_utc))) 32 | pbar.update(1) 33 | fh.write(url + '\n') 34 | fh.flush() 35 | -------------------------------------------------------------------------------- /language/gpt/tools/download/utils.py: -------------------------------------------------------------------------------- 1 | # Code taken in large part from https://github.com/jcpeterson/openwebtext 2 | 3 | 4 | import os 5 | import os.path as op 6 | import tarfile 7 
| import re 8 | import collections.abc 9 | 10 | 11 | def extract_month(url_file_name): 12 | month_re = r"(RS_.*2\d{3}-\d{2})" 13 | month = op.split(url_file_name)[-1] 14 | month = re.match(month_re, month).group() 15 | return month 16 | 17 | 18 | def chunks(l, n, s=0): 19 | """Yield successive n-sized chunks from l, skipping the first s chunks.""" 20 | if isinstance(l, collections.abc.Iterable):  # collections.Iterable was removed in Python 3.10 21 | chnk = [] 22 | for i, elem in enumerate(l): 23 | if i < s: 24 | continue 25 | 26 | chnk.append(elem) 27 | if len(chnk) == n: 28 | yield chnk 29 | chnk = [] 30 | if len(chnk) != 0: 31 | yield chnk 32 | 33 | else: 34 | for i in range(s, len(l), n): 35 | yield l[i : i + n] 36 | 37 | 38 | def extract_archive(archive_fp, outdir="."): 39 | with tarfile.open(archive_fp, "r") as tar: 40 | tar.extractall(outdir) 41 | return outdir 42 | 43 | 44 | def mkdir(fp): 45 | try: 46 | os.makedirs(fp) 47 | except FileExistsError: 48 | pass 49 | return fp 50 | 51 | 52 | def linecount(filename): 53 | f = open(filename, 'rb') 54 | lines = 0 55 | buf_size = 1024 * 1024 56 | read_f = f.raw.read 57 | 58 | buf = read_f(buf_size) 59 | while buf: 60 | lines += buf.count(b'\n') 61 | buf = read_f(buf_size) 62 | 63 | return lines 64 | -------------------------------------------------------------------------------- /language/knowledge_graph_embedding/config.py: -------------------------------------------------------------------------------- 1 | from colossalai.amp import AMP_TYPE 2 | 3 | CONFIG = dict( 4 | fp16=dict( 5 | mode=AMP_TYPE.TORCH 6 | ) 7 | ) 8 | -------------------------------------------------------------------------------- /language/knowledge_graph_embedding/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /language/opt/benchmark.sh: -------------------------------------------------------------------------------- 1 | export BS=16 2 | export MEMCAP=0 3 | export MODEL="6.7b" 4 | export GPUNUM=1 5 | 6 | for MODEL in "6.7b" "13b" "1.3b" 7 | do 8 | for GPUNUM in 8 1 9 | do 10 | for BS in 16 24 32 8 11 | do 12 | for MEMCAP in 0 40 13 | do 14 | pkill -9 torchrun 15 | pkill -9 python 16 | 17 | bash ./run_clm.sh $BS $MEMCAP $MODEL $GPUNUM 18 | done 19 | done 20 | done 21 | done -------------------------------------------------------------------------------- /language/opt/colossalai_zero.py: -------------------------------------------------------------------------------- 1 | from colossalai.zero.shard_utils import TensorShardStrategy 2 | 3 | zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(), 4 | tensor_placement_policy="auto", 5 | reuse_fp16_shard=True), 6 | optimizer_config=dict(gpu_margin_mem_ratio=0.8, initial_scale=16384)) 7 | -------------------------------------------------------------------------------- /language/opt/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | datasets >= 1.8.0 4 | sentencepiece != 0.1.92 5 | protobuf 6 | -------------------------------------------------------------------------------- /language/opt/run_clm.sh: -------------------------------------------------------------------------------- 1 | export BS=${1:-16} 2 | export MEMCAP=${2:-0} 3 | export MODEL=${3:-"1.3b"} 4 | export GPUNUM=${4:-1} 5 | # export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128 6 | 7 | # make directory for logs 8 | mkdir -p ./logs 9 | 10 | # env
PYTORCH_NO_CUDA_MEMORY_CACHING=1 11 | torchrun \ 12 | --nproc_per_node ${GPUNUM} \ 13 | --master_port 19198 \ 14 | run_clm.py \ 15 | --dataset_name wikitext \ 16 | --dataset_config_name wikitext-2-raw-v1 \ 17 | --model_name_or_path facebook/opt-${MODEL} \ 18 | --output_dir $PWD \ 19 | --mem_cap ${MEMCAP} \ 20 | --per_device_train_batch_size ${BS} 2>&1 | tee ./logs/colo_${MODEL}_bs_${BS}_cap_${MEMCAP}_gpu_${GPUNUM}.log 21 | 22 | -------------------------------------------------------------------------------- /language/opt/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | 4 | 5 | def memory_cap(size_in_GB): 6 | print(f"use only {size_in_GB} GB of CUDA memory") 7 | assert dist.is_initialized(), "memory_cap must be used after dist init" 8 | local_rank = dist.get_rank() 9 | cuda_capacity = torch.cuda.get_device_properties(local_rank).total_memory 10 | size_in_B = (size_in_GB * 1024**3) 11 | if size_in_B > cuda_capacity: 12 | print(f'memory_cap is useless since {cuda_capacity / 1024**3} is less than {size_in_GB}') 13 | return 14 | fraction = (size_in_GB * 1024**3) / cuda_capacity 15 | print(f'mem fraction is {fraction}') 16 | torch.cuda.set_per_process_memory_fraction(fraction, local_rank) 17 | 18 | 19 | def colo_memory_cap(size_in_GB): 20 | from colossalai.utils import colo_set_process_memory_fraction, colo_device_memory_capacity 21 | from colossalai.utils import get_current_device 22 | cuda_capacity = colo_device_memory_capacity(get_current_device()) 23 | if size_in_GB * (1024**3) < cuda_capacity: 24 | colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity) 25 | print("Using {} GB of GPU memory".format(size_in_GB)) 26 | 27 | 28 | if __name__ == '__main__': 29 | memory_cap(40) 30 | -------------------------------------------------------------------------------- /language/roberta/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This repo introduces how to pretrain a Chinese RoBERTa-large from scratch. 3 | 4 | ## 1. Corpus Preprocessing 5 | ```bash 6 | cd preprocessing 7 | ``` 8 | Following the `README.md`, preprocess the original corpus into h5py+numpy format. 9 | 10 | ## 2. Pretrain 11 | 12 | ```bash 13 | cd pretraining 14 | ``` 15 | Following the `README.md`, load the output generated by preprocessing to pretrain the model. 16 | 17 | ## 3. Finetune 18 | 19 | The checkpoint produced by this repo can replace `pytorch_model.bin` from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use transformers from HuggingFace to fine-tune downstream applications.
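As a concrete illustration of step 3, a minimal sketch of swapping the checkpoint into a local copy of the HuggingFace model might look like the following. The paths (`./ckpt/1.pt`, `./chinese-roberta-wwm-ext-large`) are placeholders, and treating the saved file as a plain state dict is an assumption based on the `--load_pretrain_model` flag used by the pretraining scripts; adjust both to your setup.

```python
# Hypothetical sketch: paths are placeholders, and the checkpoint layout
# (a plain state dict saved with torch.save) is an assumption.
import torch
from transformers import BertForSequenceClassification, BertTokenizer

local_dir = './chinese-roberta-wwm-ext-large'  # local clone of the HF model repo

# Overwrite the downloaded weights with the pretrained checkpoint from this repo.
state_dict = torch.load('./ckpt/1.pt', map_location='cpu')
torch.save(state_dict, f'{local_dir}/pytorch_model.bin')

# Load for a downstream task, then fine-tune with a standard transformers loop.
tokenizer = BertTokenizer.from_pretrained(local_dir)
model = BertForSequenceClassification.from_pretrained(local_dir, num_labels=2)
```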
-------------------------------------------------------------------------------- /language/roberta/configs/colossalai_ddp.py: -------------------------------------------------------------------------------- 1 | from colossalai.zero.shard_utils import TensorShardStrategy 2 | from colossalai.nn.optimizer import FusedAdam 3 | 4 | clip_grad_norm = 1.0 5 | -------------------------------------------------------------------------------- /language/roberta/configs/colossalai_zero.py: -------------------------------------------------------------------------------- 1 | from colossalai.zero.shard_utils import TensorShardStrategy 2 | from colossalai.nn.optimizer import FusedAdam 3 | 4 | # fp16 = dict( 5 | # mode=AMP_TYPE.TORCH, 6 | # ) 7 | 8 | # seed = 2 9 | zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(), 10 | reduce_scatter_bucket_size_mb=25, 11 | fp32_reduce_scatter=False, 12 | tensor_placement_policy="cuda", 13 | gradient_predivide_factor=1.0, 14 | reuse_fp16_shard=False), 15 | optimizer_config=dict(gpu_margin_mem_ratio=0.8, 16 | initial_scale=2**5, 17 | min_scale=1, 18 | growth_factor=2, 19 | backoff_factor=0.5, 20 | growth_interval=1000, 21 | hysteresis=2, 22 | max_scale=2**32)) 23 | 24 | # gradient_accumulation = 4 25 | clip_grad_norm = 1.0 26 | optimizer = dict( 27 | type=FusedAdam, 28 | lr=0.00015, 29 | weight_decay=1e-2, 30 | ) 31 | 32 | # 64433 -------------------------------------------------------------------------------- /language/roberta/preprocessing/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++14 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = mask 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /language/roberta/pretraining/README.md: -------------------------------------------------------------------------------- 1 | # Pretraining 2 | 1. Pretrain RoBERTa by running the script below. Detailed parameter descriptions can be found in arguments.py. `data_path_prefix` is the absolute path specifying the output of preprocessing. 3 | 4 | ```bash 5 | bash run_pretrain.sh 6 | ``` 7 | * `--hostfile`: servers' host names from /etc/hosts 8 | * `--include`: the servers that will be used 9 | * `--nproc_per_node`: number of processes (GPUs) on each server 10 | * `--data_path_prefix`: absolute location of train data, e.g., /h5/0.h5 11 | * `--eval_data_path_prefix`: absolute location of eval data 12 | * `--tokenizer_path`: tokenizer path containing the huggingface tokenizer.json, e.g. /tokenizer/tokenizer.json 13 | * `--bert_config`: config.json which represents the model 14 | * `--mlm`: model type of backbone, bert or deberta_v2 15 | 16 | 2. If resuming training from an earlier checkpoint, run the script below.
17 | 18 | ```shell 19 | bash run_pretrain_resume.sh 20 | ``` 21 | * `--resume_train`: whether to resume training 22 | * `--load_pretrain_model`: absolute path which contains model checkpoint 23 | * `--load_optimizer_lr`: absolute path which contains optimizer checkpoint 24 | 25 | -------------------------------------------------------------------------------- /language/roberta/pretraining/bert_dataset_provider.py: -------------------------------------------------------------------------------- 1 | class BertDatasetProviderInterface: 2 | def get_shard(self, index, shuffle=True): 3 | raise NotImplementedError 4 | 5 | def release_shard(self, index): 6 | raise NotImplementedError 7 | 8 | def prefetch_shard(self, index): 9 | raise NotImplementedError 10 | 11 | def get_batch(self, batch_iter): 12 | raise NotImplementedError 13 | 14 | def prefetch_batch(self): 15 | raise NotImplementedError 16 | -------------------------------------------------------------------------------- /language/roberta/pretraining/hostfile: -------------------------------------------------------------------------------- 1 | GPU001 2 | GPU002 3 | GPU003 4 | GPU004 5 | GPU005 6 | GPU006 7 | GPU007 8 | GPU008 9 | GPU009 10 | GPU010 11 | -------------------------------------------------------------------------------- /language/roberta/pretraining/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | __all__ = ['LossForPretraining'] 4 | 5 | 6 | class LossForPretraining(torch.nn.Module): 7 | 8 | def __init__(self, vocab_size): 9 | super(LossForPretraining, self).__init__() 10 | self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1) 11 | self.vocab_size = vocab_size 12 | 13 | def forward(self, prediction_scores, masked_lm_labels, next_sentence_labels=None): 14 | masked_lm_loss = self.loss_fn(prediction_scores.view(-1, self.vocab_size), masked_lm_labels.view(-1)) 15 | # next_sentence_loss = self.loss_fn(seq_relationship_score.view(-1, 2), next_sentence_labels.view(-1)) 16 | total_loss = masked_lm_loss #+ next_sentence_loss 17 | return total_loss 18 | -------------------------------------------------------------------------------- /language/roberta/pretraining/run_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | root_path=$PWD 4 | PY_FILE_PATH="$root_path/run_pretraining.py" 5 | 6 | tensorboard_path="$root_path/tensorboard" 7 | log_path="$root_path/exp_log" 8 | ckpt_path="$root_path/ckpt" 9 | 10 | colossal_config="$root_path/../configs/colossalai_ddp.py" 11 | 12 | mkdir -p $tensorboard_path 13 | mkdir -p $log_path 14 | mkdir -p $ckpt_path 15 | 16 | export PYTHONPATH=$PWD 17 | 18 | env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \ 19 | --include GPU002,GPU003,GPU004,GPU007 \ 20 | --nproc_per_node=8 \ 21 | $PY_FILE_PATH \ 22 | --master_addr GPU007 \ 23 | --master_port 20024 \ 24 | --lr 2.0e-4 \ 25 | --train_micro_batch_size_per_gpu 190 \ 26 | --eval_micro_batch_size_per_gpu 20 \ 27 | --epoch 15 \ 28 | --data_path_prefix /h5 \ 29 | --eval_data_path_prefix /eval_h5 \ 30 | --tokenizer_path /roberta \ 31 | --bert_config /roberta/config.json \ 32 | --tensorboard_path $tensorboard_path \ 33 | --log_path $log_path \ 34 | --ckpt_path $ckpt_path \ 35 | --colossal_config $colossal_config \ 36 | --log_interval 50 \ 37 | --mlm bert \ 38 | --wandb \ 39 | --checkpoint_activations \ 40 | -------------------------------------------------------------------------------- 
/language/roberta/pretraining/run_pretrain_resume.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | root_path=$PWD 4 | PY_FILE_PATH="$root_path/run_pretraining.py" 5 | 6 | tensorboard_path="$root_path/tensorboard" 7 | log_path="$root_path/exp_log" 8 | ckpt_path="$root_path/ckpt" 9 | 10 | colossal_config="$root_path/../configs/colossalai_ddp.py" 11 | 12 | mkdir -p $tensorboard_path 13 | mkdir -p $log_path 14 | mkdir -p $ckpt_path 15 | 16 | export PYTHONPATH=$PWD 17 | 18 | env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \ 19 | --include GPU002,GPU003,GPU004,GPU007 \ 20 | --nproc_per_node=8 \ 21 | $PY_FILE_PATH \ 22 | --master_addr GPU007 \ 23 | --master_port 20024 \ 24 | --lr 2.0e-4 \ 25 | --train_micro_batch_size_per_gpu 190 \ 26 | --eval_micro_batch_size_per_gpu 20 \ 27 | --epoch 15 \ 28 | --data_path_prefix /h5 \ 29 | --eval_data_path_prefix /eval_h5 \ 30 | --tokenizer_path /roberta \ 31 | --bert_config /roberta/config.json \ 32 | --tensorboard_path $tensorboard_path \ 33 | --log_path $log_path \ 34 | --ckpt_path $ckpt_path \ 35 | --colossal_config $colossal_config \ 36 | --log_interval 50 \ 37 | --mlm bert \ 38 | --wandb \ 39 | --checkpoint_activations \ 40 | --resume_train \ 41 | --load_pretrain_model /ckpt/1.pt \ 42 | --load_optimizer_lr /ckpt/1.op_lrs \ 43 | -------------------------------------------------------------------------------- /language/roberta/pretraining/utils/WandbLog.py: -------------------------------------------------------------------------------- 1 | import time 2 | import wandb 3 | import os 4 | from torch.utils.tensorboard import SummaryWriter 5 | 6 | class WandbLog: 7 | 8 | @classmethod 9 | def init_wandb(cls, project, notes=None, name=None, config=None):  # name defaults to the current time at call time, not import time 10 | wandb.init(project=project, notes=notes, name=name or time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), config=config) 11 | 12 | @classmethod 13 | def log(cls, result, model=None, gradient=None): 14 | wandb.log(result) 15 | 16 | if model: 17 | wandb.watch(model) 18 | 19 | if gradient: 20 | wandb.watch(gradient) 21 | 22 | 23 | class TensorboardLog: 24 | 25 | def __init__(self, location, name=None, config=None): 26 | if not os.path.exists(location): 27 | os.mkdir(location) 28 | self.writer = SummaryWriter(location, comment=name or time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) 29 | 30 | def log_train(self, result, step): 31 | for k, v in result.items(): 32 | self.writer.add_scalar(f'{k}/train', v, step) 33 | 34 | def log_eval(self, result, step): 35 | for k, v in result.items(): 36 | self.writer.add_scalar(f'{k}/eval', v, step) 37 | 38 | def log_zeroshot(self, result, step): 39 | for k, v in result.items(): 40 | self.writer.add_scalar(f'{k}_acc/eval', v, step) 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /language/roberta/pretraining/utils/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import torch.distributed as dist 4 | 5 | logging.basicConfig( 6 | format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 7 | datefmt='%m/%d/%Y %H:%M:%S', 8 | level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class Logger(): 13 | def __init__(self, log_path, cuda=False): 14 | self.logger = logging.getLogger(__name__) 15 | self.cuda = cuda 16 | self.log_path = log_path 17 | 18 | 19 | def info(self, message, log_=True, print_=True, *args, **kwargs): 20 | if (self.cuda and
dist.get_rank() == 0) or not self.cuda: 21 | if print_: 22 | self.logger.info(message, *args, **kwargs) 23 | 24 | if log_: 25 | with open(self.log_path, 'a+') as f_log: 26 | f_log.write(message + '\n') 27 | 28 | 29 | def error(self, message, *args, **kwargs): 30 | self.logger.error(message, *args, **kwargs) 31 | -------------------------------------------------------------------------------- /language/roberta/requirements.txt: -------------------------------------------------------------------------------- 1 | colossalai 2 | torch >= 1.8.1 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch >= 1.8.0 2 | torchvision 3 | colossalai 4 | titans 5 | datasets >= 1.8.0 6 | sentencepiece != 0.1.92 7 | protobuf 8 | transformers 9 | Pillow 10 | tqdm 11 | ipdb 12 | numpy 13 | einops 14 | pyarrow 15 | sacred 16 | pandas 17 | git+https://github.com/rwightman/pytorch-image-models.git 18 | psutil 19 | tensorboard 20 | packaging 21 | -------------------------------------------------------------------------------- /utils/checkpoint/readme.md: -------------------------------------------------------------------------------- 1 | # Model Checkpoint 2 | 3 | Examples of how to use model checkpointing. 4 | 5 | ## How to run 6 | We use `colossalai.launch_from_torch` as an example here. Before running, you should `export DATA=/path/to/cifar-10`. 7 | 8 | If you are training with single node multiple GPUs: 9 | ```shell 10 | # If your torch >= 1.10.0 11 | torchrun --standalone --nproc_per_node <num_gpus> save_engine.py 12 | 13 | # If your torch >= 1.9.0 14 | python -m torch.distributed.run --standalone --nproc_per_node=<num_gpus> save_engine.py 15 | 16 | # Otherwise 17 | python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr <master_addr> --master_port 29500 save_engine.py 18 | ``` 19 | 20 | If you are using multiple nodes, see [torchrun](https://pytorch.org/docs/stable/elastic/run.html#launcher-api).
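To load a checkpoint saved by these examples, a minimal restoring sketch might look like the following. It mirrors `save_trainer.py` below, which saves to `vit_cifar.pt` via `hooks.SaveCheckpointHook`; the fallback to a `'model'` key is an assumption about the checkpoint layout, so inspect the file if loading fails in your ColossalAI version.

```python
# Minimal sketch of restoring a checkpoint written by SaveCheckpointHook.
# The 'model' key is an assumption; print ckpt.keys() to check your version.
import torch
from model_zoo.vit import vit_tiny_patch4_32

model = vit_tiny_patch4_32()
ckpt = torch.load('vit_cifar.pt', map_location='cpu')
# Handle either a raw state dict or a dict wrapping it under 'model'.
state_dict = ckpt.get('model', ckpt) if isinstance(ckpt, dict) else ckpt
model.load_state_dict(state_dict)
model.eval()
```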
-------------------------------------------------------------------------------- /utils/checkpoint/save_trainer.py: -------------------------------------------------------------------------------- 1 | import colossalai 2 | import torch 3 | import os 4 | 5 | import colossalai.nn as col_nn 6 | from colossalai.utils import get_dataloader, MultiTimer 7 | from colossalai.logging import get_dist_logger 8 | from colossalai.core import global_context as gpc 9 | from torch.nn.modules import CrossEntropyLoss 10 | from torchvision import transforms 11 | from torchvision.datasets import CIFAR10 12 | from colossalai.trainer import Trainer, hooks 13 | from model_zoo.vit import vit_tiny_patch4_32 14 | 15 | def build_cifar(batch_size): 16 | transform_train = transforms.Compose([ 17 | transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10), 18 | transforms.ToTensor(), 19 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 20 | ]) 21 | transform_test = transforms.Compose([ 22 | transforms.ToTensor(), 23 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 24 | ]) 25 | 26 | train_dataset = CIFAR10(root=os.environ['DATA'], train=True, download=True, transform=transform_train) 27 | test_dataset = CIFAR10(root=os.environ['DATA'], train=False, download=True, transform=transform_test) 28 | train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=batch_size, pin_memory=True) 29 | test_dataloader = get_dataloader(dataset=test_dataset, batch_size=batch_size, pin_memory=True) 30 | return train_dataloader, test_dataloader 31 | 32 | 33 | BATCH_SIZE = 128 34 | NUM_EPOCHS = 10 35 | CONFIG = dict() 36 | 37 | 38 | def train(): 39 | args = colossalai.get_default_parser().parse_args() 40 | colossalai.launch_from_torch(backend=args.backend, config=CONFIG) 41 | 42 | logger = get_dist_logger() 43 | model = vit_tiny_patch4_32() 44 | criterion = CrossEntropyLoss() 45 | optimizer = torch.optim.Adam(model.parameters(), lr=0.001) 46 | train_dataloader, test_dataloader = build_cifar(BATCH_SIZE) 47 | 48 | engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model, optimizer, criterion, 49 | train_dataloader, test_dataloader) 50 | timer = MultiTimer() 51 | 52 | trainer = Trainer(engine=engine, timer=timer, logger=logger) 53 | 54 | hook_list = [ 55 | hooks.LossHook(), 56 | hooks.AccuracyHook(col_nn.metric.Accuracy()), 57 | hooks.LogMetricByEpochHook(logger), 58 | hooks.SaveCheckpointHook(1, 'vit_cifar.pt', model) 59 | ] 60 | 61 | trainer.fit(train_dataloader=train_dataloader, 62 | epochs=NUM_EPOCHS, 63 | test_dataloader=test_dataloader, 64 | test_interval=1, 65 | hooks=hook_list, 66 | display_progress=True) 67 | 68 | 69 | if __name__ == '__main__': 70 | train() 71 | --------------------------------------------------------------------------------