├── .gitignore ├── .pytorch_pretrained_bert ├── bert-base-uncased-vocab.txt ├── bert-large-uncased-vocab.txt ├── gpt2-merges.txt ├── gpt2-vocab.json ├── roberta-merges.txt └── roberta-vocab.json ├── LICENSE ├── README.md ├── arguments.py ├── blocklm_utils.py ├── change_mp.py ├── chinese_sentencepiece ├── cog-pretrain.model └── cog-pretrain.vocab ├── config ├── config_block_10B.json ├── config_block_10B_longer.json ├── config_block_base.json ├── config_block_large.json ├── config_block_large_chinese.json ├── config_blockta_large.json ├── ds_block_10B.sh ├── ds_block_10B_chinese.sh ├── ds_block_10B_chinese_longer.sh ├── ds_block_10B_longer.sh ├── ds_block_base.sh ├── ds_block_large.sh ├── ds_block_large_chinese.sh └── ds_blockta_large.sh ├── config_tasks ├── config_blocklm_10B.json ├── config_blocklm_10B_cnndm.json ├── config_blocklm_10B_record.json ├── model_blocklm_1.25_generation.sh ├── model_blocklm_1.5_generation.sh ├── model_blocklm_10B.sh ├── model_blocklm_10B_chinese.sh ├── model_blocklm_2B.sh ├── model_blocklm_base.sh ├── model_blocklm_large.sh ├── model_blocklm_large_chinese.sh ├── model_blocklm_large_generation.sh ├── model_blocklm_roberta_1.25.sh ├── model_blocklm_roberta_large.sh ├── seq_blank.sh ├── seq_cmrc.sh ├── seq_cnndm.sh ├── seq_cnndm_org.sh ├── seq_customization.sh ├── seq_gigaword.sh ├── seq_squad.sh ├── seq_squad_generation.sh ├── seq_squad_v1.sh ├── seq_xsum.sh ├── task_afqmc.sh ├── task_boolq.sh ├── task_cb.sh ├── task_cluewsc.sh ├── task_cmrc.sh ├── task_copa.sh ├── task_multirc.sh ├── task_record.sh ├── task_rte.sh ├── task_tnews.sh ├── task_wic.sh ├── task_wsc.sh ├── task_wsc_generative.sh ├── zero_lambada.sh ├── zero_lambada_uni.sh ├── zero_lm.sh ├── zero_lm_uni.sh └── zero_wikitext.sh ├── configure_data.py ├── data_utils ├── __init__.py ├── corpora.py ├── datasets.py ├── extraction.py ├── file_utils.py ├── lazy_loader.py ├── samplers.py ├── sp_tokenizer.py ├── tokenization.py ├── tokenization_gpt2.py └── wordpiece.py ├── docker ├── cuda102.dockerfile ├── cuda112.dockerfile ├── prepare.sh └── ssh-env-config.sh ├── examples └── README.md ├── finetune_glm.py ├── fp16 ├── __init__.py ├── fp16.py ├── fp16util.py └── loss_scaler.py ├── generate_samples.py ├── generation_utils.py ├── learning_rates.py ├── model ├── __init__.py ├── distributed.py ├── downstream.py ├── modeling_bert.py ├── modeling_glm.py └── prompt.py ├── mpu ├── __init__.py ├── cross_entropy.py ├── data.py ├── grads.py ├── initialize.py ├── layers.py ├── mappings.py ├── random.py ├── tests │ ├── __init__.py │ ├── commons.py │ ├── test_cross_entropy.py │ ├── test_data.py │ ├── test_initialize.py │ ├── test_layers.py │ └── test_random.py ├── transformer.py └── utils.py ├── pretrain_glm.py ├── process_grid.py ├── requirements.txt ├── run_test.py ├── scripts ├── convert_glm_checkpoint_to_transformers.py ├── dispatcher.py ├── ds_finetune_record.sh ├── ds_finetune_seq2seq.sh ├── ds_finetune_superglue.sh ├── ds_finetune_superglue_prompt.sh ├── ds_pretrain_nvidia.sh ├── evaluate_lm.sh ├── evaluate_multichoice.sh ├── evaluate_seq2seq.sh ├── finetune_blank.sh ├── finetune_seq2seq.sh ├── finetune_seq2seq_grid.sh ├── finetune_superglue.sh ├── finetune_superglue_fast.sh ├── finetune_superglue_grid.sh ├── generate_block.sh └── multi-bleu.perl ├── tasks ├── data_utils.py ├── eval_utils.py ├── language_model │ ├── dataset.py │ ├── detokenizer.py │ └── finetune.py ├── seq2seq │ ├── dataset.py │ ├── evaluate.py │ └── finetune.py └── superglue │ ├── README.md │ ├── __init__.py │ ├── dataset.py │ ├── evaluate.py │ ├── 
finetune.py │ └── pvp.py ├── test ├── __init__.py ├── test_block.py └── test_rel_shift.py ├── train_utils.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | /.ipynb_checkpoints/ 3 | .DS_Store 4 | *.pyc 5 | logs 6 | runs 7 | settings.json 8 | .gitignore 9 | .vscode/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 THUDM 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /change_mp.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import torch 4 | import copy 5 | 6 | checkpoint = sys.argv[1] 7 | target_mp = int(sys.argv[2]) 8 | 9 | assert os.path.isdir(checkpoint) 10 | iteration_file = os.path.join(checkpoint, 'latest_checkpointed_iteration.txt') 11 | if not os.path.exists(iteration_file): 12 | iteration_file = os.path.join(checkpoint, 'latest') 13 | if os.path.exists(iteration_file): 14 | with open(iteration_file) as fin: 15 | iteration = int(fin.read().strip()) 16 | checkpoint = os.path.join(checkpoint, str(iteration)) 17 | else: 18 | iteration = None 19 | 20 | filenames = os.listdir(checkpoint) 21 | filenames = [filename for filename in filenames if filename.startswith("mp_rank_")] 22 | filenames = sorted(filenames, 23 | key=lambda x: int(x.split('_')[2])) 24 | filenames = [os.path.join(checkpoint, x) for x in filenames] 25 | 26 | if target_mp == len(filenames): 27 | print("MP size keeps the same.") 28 | exit(0) 29 | 30 | if sys.argv[1][-1] == '/': 31 | new_checkpoint = sys.argv[1][:-1] + '_MP' + sys.argv[2] 32 | else: 33 | new_checkpoint = sys.argv[1] + '_MP' + sys.argv[2] 34 | if not os.path.exists(new_checkpoint): 35 | os.mkdir(new_checkpoint) 36 | if iteration is not None: 37 | with open(os.path.join(new_checkpoint, 'latest_checkpointed_iteration.txt'), 'w') as fout: 38 | fout.write("{}\n".format(iteration)) 39 | with open(os.path.join(new_checkpoint, 'latest'), 'w') as fout: 40 | fout.write("{}\n".format(iteration)) 41 | new_checkpoint = os.path.join(new_checkpoint, str(iteration)) 42 | if not os.path.exists(new_checkpoint): 43 | os.mkdir(new_checkpoint) 44 | 45 | preserve_keys = [ 46 | "lr_scheduler", 47 | "skipped_steps", 48 | "global_steps", 49 | 
"global_samples", 50 | "dp_world_size", 51 | "iteration", 52 | "client_lr_scheduler", 53 | "np_rng_state", 54 | "random_rng_state", 55 | "torch_rng_state", 56 | "cuda_rng_state", 57 | "rng_tracker_states", 58 | 59 | ] 60 | 61 | if target_mp < len(filenames): 62 | print("Decrease MP size.") 63 | assert len(filenames) % target_mp == 0 64 | ratio = len(filenames) // target_mp 65 | for i in range(target_mp): 66 | start = ratio * i 67 | end = ratio * (i + 1) 68 | d = torch.load(filenames[start], 69 | map_location='cpu') 70 | for k in d.keys(): 71 | if k != 'module': 72 | if k in preserve_keys: 73 | pass 74 | elif k == "mp_world_size": 75 | d[k] = target_mp 76 | else: 77 | d[k] = None 78 | for j in range(start + 1, end): 79 | d_new = torch.load(filenames[j], 80 | map_location='cpu') 81 | for k, v in d_new['module'].items(): 82 | assert len(v.shape) < 3 83 | if len(v.shape) == 2 and 'position' not in k: 84 | if 'query' in k: 85 | size_1 = d['module'][k].shape[0] // 3 86 | size_2 = v.shape[0] // 3 87 | target = d['module'][k] 88 | d['module'][k] = torch.cat([ 89 | target[:size_1, :], v[:size_2, :], 90 | target[size_1:size_1 * 2, :], v[size_2:size_2 * 2, :], 91 | target[size_1 * 2:, :], v[size_2 * 2:, :]], 0) 92 | elif 'word' in k or 'h_to_4h' in k or 'relative' in k or "r_w_bias" in k or "r_r_bias" in k: 93 | d['module'][k] = torch.cat([d['module'][k], v], 0) 94 | else: 95 | d['module'][k] = torch.cat([d['module'][k], v], 1) 96 | elif len(v.shape) == 1 and 'query_key_value' in k: 97 | size_1 = d['module'][k].shape[0] // 3 98 | size_2 = v.shape[0] // 3 99 | target = d['module'][k] 100 | d['module'][k] = torch.cat([ 101 | target[:size_1], v[:size_2], 102 | target[size_1:size_1 * 2], v[size_2:size_2 * 2], 103 | target[size_1 * 2:], v[size_2 * 2:]], 0) 104 | elif len(v.shape) == 1 and ('dense_h_to_4h' in k or "attention.relative" in k): 105 | d['module'][k] = torch.cat([d['module'][k], v], 0) 106 | filename = os.path.join(new_checkpoint, "mp_rank_{:02d}_model_states.pt".format(i)) 107 | torch.save(d, filename) 108 | 109 | if target_mp > len(filenames): 110 | print("Increase MP size.") 111 | assert target_mp % len(filenames) == 0 112 | ratio = target_mp // len(filenames) 113 | for i in range(len(filenames)): 114 | start = ratio * i 115 | end = ratio * (i + 1) 116 | d = torch.load(filenames[i], 117 | map_location='cpu') 118 | for j in range(start, end): 119 | d_new = {} 120 | shift = j - start 121 | for k, v in d.items(): 122 | if k != 'module': 123 | if k in preserve_keys: 124 | d_new[k] = copy.deepcopy(d[k]) 125 | elif k == "mp_world_size": 126 | d_new[k] = target_mp 127 | else: 128 | d_new[k] = None 129 | d_new['module'] = {} 130 | with torch.no_grad(): 131 | for k, v in d['module'].items(): 132 | assert len(v.shape) < 3 133 | if len(v.shape) == 2 and 'position' not in k: 134 | if 'query' in k: 135 | part = v.shape[0] // ratio // 3 136 | d_new['module'][k] = torch.cat([v[shift * part:(shift + 1) * part, :].clone(), 137 | v[(shift + ratio) * part:(shift + 1 + ratio) * part, 138 | :].clone(), 139 | v[(shift + 2 * ratio) * part:(shift + 1 + 2 * ratio) * part, 140 | :].clone()], 0) 141 | elif 'word' in k or 'h_to_4h' in k or 'relative' in k or "r_w_bias" in k or "r_r_bias" in k: 142 | part = v.shape[0] // ratio 143 | d_new['module'][k] = v[shift * part:(shift + 1) * part, :].clone() 144 | else: 145 | part = v.shape[1] // ratio 146 | d_new['module'][k] = v[:, shift * part:(shift + 1) * part].clone() 147 | elif len(v.shape) == 1 and ('dense_h_to_4h' in k or "attention.relative" in k): 148 | part = 
v.shape[0] // ratio 149 | d_new['module'][k] = v[shift * part:(shift + 1) * part].clone() 150 | elif len(v.shape) == 1 and 'query_key_value' in k: 151 | part = v.shape[0] // ratio // 3 152 | d_new['module'][k] = torch.cat( 153 | [v[shift * part:(shift + 1) * part].clone(), 154 | v[(shift + ratio) * part:(shift + 1 + ratio) * part].clone(), 155 | v[(shift + 2 * ratio) * part:(shift + 1 + 2 * ratio) * part].clone()], 0) 156 | else: 157 | d_new['module'][k] = v.clone() 158 | filename = os.path.join(new_checkpoint, "mp_rank_{:02d}_model_states.pt".format(j)) 159 | torch.save(d_new, filename) 160 | -------------------------------------------------------------------------------- /chinese_sentencepiece/cog-pretrain.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM/4f61ed7237a3b0187f4d62062429348276a78c84/chinese_sentencepiece/cog-pretrain.model -------------------------------------------------------------------------------- /config/config_block_10B.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 21, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 50, 5 | "gradient_clipping": 1.0, 6 | "zero_optimization": { 7 | "stage": 2, 8 | "contiguous_gradients": false, 9 | "overlap_comm": true, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": 50000000, 12 | "allgather_bucket_size": 500000000 13 | }, 14 | "zero_allow_untested_optimizer": true, 15 | "fp16": { 16 | "enabled": true, 17 | "loss_scale": 0, 18 | "loss_scale_window": 1000, 19 | "hysteresis": 2, 20 | "min_loss_scale": 1 21 | }, 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0001, 26 | "betas": [ 27 | 0.9, 28 | 0.95 29 | ], 30 | "eps": 1e-8, 31 | "weight_decay": 1e-1 32 | } 33 | }, 34 | "activation_checkpointing": { 35 | "partition_activations": false, 36 | "contiguous_memory_optimization": false 37 | }, 38 | "wall_clock_breakdown": false 39 | } -------------------------------------------------------------------------------- /config/config_block_10B_longer.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 8, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 50, 5 | "gradient_clipping": 1.0, 6 | "zero_optimization": { 7 | "stage": 2, 8 | "contiguous_gradients": true, 9 | "overlap_comm": true, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": 50000000, 12 | "allgather_bucket_size": 500000000 13 | }, 14 | "zero_allow_untested_optimizer": true, 15 | "fp16": { 16 | "enabled": true, 17 | "loss_scale": 0, 18 | "loss_scale_window": 1000, 19 | "hysteresis": 2, 20 | "min_loss_scale": 1 21 | }, 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.00003, 26 | "betas": [ 27 | 0.9, 28 | 0.95 29 | ], 30 | "eps": 1e-8, 31 | "weight_decay": 1e-1 32 | } 33 | }, 34 | "activation_checkpointing": { 35 | "partition_activations": false, 36 | "contiguous_memory_optimization": false 37 | }, 38 | "wall_clock_breakdown": false 39 | } -------------------------------------------------------------------------------- /config/config_block_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 16, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 100, 5 | "gradient_clipping": 1.0, 6 | "fp16": { 7 | "enabled": true, 8 | "loss_scale": 0, 9 | "loss_scale_window": 1000, 10 | 
"hysteresis": 2, 11 | "min_loss_scale": 1 12 | }, 13 | "optimizer": { 14 | "type": "Adam", 15 | "params": { 16 | "lr": 0.0004, 17 | "weight_decay": 0.1, 18 | "betas": [ 19 | 0.9, 20 | 0.98 21 | ], 22 | "eps": 1e-6 23 | } 24 | }, 25 | "activation_checkpointing": { 26 | "partition_activations": false, 27 | "contiguous_memory_optimization": false 28 | }, 29 | "wall_clock_breakdown": false 30 | } -------------------------------------------------------------------------------- /config/config_block_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 16, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 100, 5 | "gradient_clipping": 1.0, 6 | "fp16": { 7 | "enabled": true, 8 | "loss_scale": 0, 9 | "loss_scale_window": 1000, 10 | "hysteresis": 2, 11 | "min_loss_scale": 1 12 | }, 13 | "optimizer": { 14 | "type": "Adam", 15 | "params": { 16 | "lr": 0.0002, 17 | "weight_decay": 0.1, 18 | "betas": [ 19 | 0.9, 20 | 0.98 21 | ], 22 | "eps": 1e-6 23 | } 24 | }, 25 | "activation_checkpointing": { 26 | "partition_activations": false, 27 | "contiguous_memory_optimization": false 28 | }, 29 | "wall_clock_breakdown": false 30 | } -------------------------------------------------------------------------------- /config/config_block_large_chinese.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 32, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 100, 5 | "gradient_clipping": 1.0, 6 | "fp16": { 7 | "enabled": true, 8 | "loss_scale": 0, 9 | "loss_scale_window": 1000, 10 | "hysteresis": 2, 11 | "min_loss_scale": 1 12 | }, 13 | "optimizer": { 14 | "type": "Adam", 15 | "params": { 16 | "lr": 0.0004, 17 | "weight_decay": 0.01, 18 | "betas": [ 19 | 0.9, 20 | 0.98 21 | ], 22 | "eps": 1e-6 23 | } 24 | }, 25 | "activation_checkpointing": { 26 | "partition_activations": false, 27 | "contiguous_memory_optimization": false 28 | }, 29 | "wall_clock_breakdown": false 30 | } -------------------------------------------------------------------------------- /config/config_blockta_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 16, 3 | "gradient_accumulation_steps": 8, 4 | "steps_per_print": 100, 5 | "gradient_clipping": 1.0, 6 | "fp16": { 7 | "enabled": true, 8 | "loss_scale": 0, 9 | "loss_scale_window": 1000, 10 | "hysteresis": 2, 11 | "min_loss_scale": 1 12 | }, 13 | "optimizer": { 14 | "type": "Adam", 15 | "params": { 16 | "lr": 0.0004, 17 | "weight_decay": 0.01, 18 | "betas": [ 19 | 0.9, 20 | 0.98 21 | ], 22 | "eps": 1e-6 23 | } 24 | }, 25 | "activation_checkpointing": { 26 | "partition_activations": false, 27 | "contiguous_memory_optimization": false 28 | }, 29 | "wall_clock_breakdown": false 30 | } -------------------------------------------------------------------------------- /config/ds_block_10B.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | config_json="$script_dir/config_block_10B.json" 7 | gpt_options=" \ 8 | --block-lm \ 9 | --task-mask \ 10 | --bert-prob 0.5 \ 11 | --gap-sentence-prob 0.3 \ 12 | --avg-block-length 3 \ 13 | --gpt-min-ratio 0.25 \ 14 | --block-mask-prob 0.1 \ 15 | --short-seq-prob 0.02 \ 16 | --experiment-name blocklm-10b \ 17 | --model-parallel-size ${MP_SIZE} \ 18 | --num-layers 48 \ 19 | --hidden-size 4096 \ 20 | --num-attention-heads 64 \ 21 | --seq-length 512 \ 22 | --max-position-embeddings 1024 \ 23 | --save /dataset/fd5061f6/english_data/checkpoints \ 24 | --log-interval 50 \ 25 | --eval-interval 1000 \ 26 | --save-interval 2000 \ 27 | --train-iters 250000 \ 28 | --train-data pile cc-news \ 29 | --resume-dataloader \ 30 | --filter-english \ 31 | --loader-scatter 32 \ 32 | --tokenizer-type GPT2BPETokenizer \ 33 | --split 949,50,1 \ 34 | --distributed-backend nccl \ 35 | --lr-decay-style cosine \ 36 | --lr-decay-ratio 0.1 \ 37 | --lr-decay-iters 175000 \ 38 | --warmup 0.04 \ 39 | --checkpoint-activations \ 40 | --deepspeed-activation-checkpointing \ 41 | --fp16 \ 42 | " 43 | gpt_options="${gpt_options} 44 | --deepspeed \ 45 | --deepspeed_config ${config_json} \ 46 | " -------------------------------------------------------------------------------- /config/ds_block_10B_chinese.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | config_json="$script_dir/config_block_10B.json" 7 | gpt_options=" \ 8 | --block-lm \ 9 | --task-mask \ 10 | --bert-prob 0.5 \ 11 | --gap-sentence-prob 0.3 \ 12 | --avg-block-length 3 \ 13 | --gpt-min-ratio 0.25 \ 14 | --block-mask-prob 0.1 \ 15 | --short-seq-prob 0.02 \ 16 | --experiment-name blocklm-10b-chinese \ 17 | --model-parallel-size ${MP_SIZE} \ 18 | --num-layers 48 \ 19 | --hidden-size 4096 \ 20 | --num-attention-heads 64 \ 21 | --seq-length 512 \ 22 | --max-position-embeddings 1024 \ 23 | --save /dataset/fd5061f6/english_data/checkpoints \ 24 | --load /dataset/fd5061f6/english_data/checkpoints/blocklm-10b-chinese07-08-15-28 \ 25 | --log-interval 50 \ 26 | --eval-interval 1000 \ 27 | --save-interval 2000 \ 28 | --train-iters 150000 \ 29 | --train-data wudao baike zhihu \ 30 | --resume-dataloader \ 31 | --loader-scatter 32 \ 32 | --no-lazy-loader \ 33 | --tokenizer-type ChineseSPTokenizer \ 34 | --split 949,50,1 \ 35 | --distributed-backend nccl \ 36 | --lr-decay-style cosine \ 37 | --lr-decay-ratio 0.1 \ 38 | --lr-decay-iters 120000 \ 39 | --warmup 0.04 \ 40 | --checkpoint-activations \ 41 | --deepspeed-activation-checkpointing \ 42 | --fp16 \ 43 | " 44 | gpt_options="${gpt_options} 45 | --deepspeed \ 46 | --deepspeed_config ${config_json} \ 47 | " -------------------------------------------------------------------------------- /config/ds_block_10B_chinese_longer.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | config_json="$script_dir/config_block_10B_longer.json" 7 | gpt_options=" \ 8 | --block-lm \ 9 | --task-mask \ 10 | --bert-prob 0.5 \ 11 | --gap-sentence-prob 0.3 \ 12 | --avg-block-length 3 \ 13 | --gpt-min-ratio 0.25 \ 14 | --block-mask-prob 0.1 \ 15 | --short-seq-prob 0.5 \ 16 | --experiment-name blocklm-10b-chinese \ 17 | --model-parallel-size ${MP_SIZE} \ 18 | --num-layers 48 \ 19 | --hidden-size 4096 \ 20 | --num-attention-heads 64 \ 21 | --seq-length 1024 \ 22 | --max-position-embeddings 1024 \ 23 | --save /dataset/fd5061f6/english_data/checkpoints \ 24 | --load /dataset/fd5061f6/english_data/checkpoints/blocklm-10b-chinese07-08-15-28 \ 25 | --no-load-lr-scheduler \ 26 | --log-interval 50 \ 27 | --eval-interval 1000 \ 28 | --save-interval 2000 \ 29 | --train-iters 150000 \ 30 | --train-data wudao baike zhihu \ 31 | --resume-dataloader \ 32 | --loader-scatter 32 \ 33 | --no-lazy-loader \ 34 | --tokenizer-type ChineseSPTokenizer \ 35 | --split 949,50,1 \ 36 | --distributed-backend nccl \ 37 | --lr-decay-style cosine \ 38 | --lr-decay-ratio 0.1 \ 39 | --lr-decay-iters 20000 \ 40 | --warmup 0.025 \ 41 | --checkpoint-activations \ 42 | --deepspeed-activation-checkpointing \ 43 | --fp16 \ 44 | " 45 | gpt_options="${gpt_options} 46 | --deepspeed \ 47 | --deepspeed_config ${config_json} \ 48 | " -------------------------------------------------------------------------------- /config/ds_block_10B_longer.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | config_json="$script_dir/config_block_10B_longer.json" 7 | gpt_options=" \ 8 | --block-lm \ 9 | --task-mask \ 10 | --bert-prob 0.4 \ 11 | --gap-sentence-prob 0.3 \ 12 | --single-span-prob 0.05 \ 13 | --avg-block-length 3 \ 14 | --gpt-min-ratio 0.25 \ 15 | --block-mask-prob 0.1 \ 16 | --short-seq-prob 0.5 \ 17 | --experiment-name blocklm-10b \ 18 | --model-parallel-size ${MP_SIZE} \ 19 | --num-layers 48 \ 20 | --hidden-size 4096 \ 21 | --num-attention-heads 64 \ 22 | --seq-length 1024 \ 23 | --max-position-embeddings 1024 \ 24 | --save /mnt/model_checkpoints \ 25 | --load /cache/blocklm-10b-512 \ 26 | --no-load-lr-scheduler \ 27 | --log-interval 25 \ 28 | --train-iters 250000 \ 29 | --train-data pile cc-news \ 30 | --resume-dataloader \ 31 | --filter-english \ 32 | --loader-scatter 32 \ 33 | --no-lazy-loader \ 34 | --tokenizer-type GPT2BPETokenizer \ 35 | --split 949,50,1 \ 36 | --distributed-backend nccl \ 37 | --lr-decay-style linear \ 38 | --lr-decay-ratio 0.1 \ 39 | --lr-decay-iters 50000 \ 40 | --warmup 0.005 \ 41 | --checkpoint-activations \ 42 | --deepspeed-activation-checkpointing \ 43 | --fp16 \ 44 | " 45 | gpt_options="${gpt_options} 46 | --deepspeed \ 47 | --deepspeed_config ${config_json} \ 48 | " -------------------------------------------------------------------------------- /config/ds_block_base.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | config_json="$script_dir/config_block_base.json" 7 | gpt_options=" \ 8 | --block-lm \ 9 | --bert-prob 1.0 \ 10 | --experiment-name blocklm-blank \ 11 | --model-parallel-size ${MP_SIZE} \ 12 | --num-layers 12 \ 13 | --hidden-size 768 \ 14 | --num-attention-heads 12 \ 15 | --seq-length 512 \ 16 | --max-position-embeddings 512 \ 17 | --save /root/data/checkpoints \ 18 | --train-iters 150000 \ 19 | --resume-dataloader \ 20 | --train-data bert-base \ 21 | --lazy-loader \ 22 | --tokenizer-type BertWordPieceTokenizer \ 23 | --tokenizer-model-type bert-base-uncased \ 24 | --split 949,50,1 \ 25 | --distributed-backend nccl \ 26 | --lr-decay-style cosine \ 27 | --lr-decay-iters 120000 \ 28 | --lr-decay-ratio 0.05 \ 29 | --warmup .05 \ 30 | --checkpoint-activations \ 31 | --deepspeed-activation-checkpointing \ 32 | --fp16 \ 33 | " 34 | gpt_options="${gpt_options} 35 | --deepspeed \ 36 | --deepspeed_config ${config_json} \ 37 | " 38 | -------------------------------------------------------------------------------- /config/ds_block_large.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | config_json="$script_dir/config_block_large.json" 7 | gpt_options=" \ 8 | --block-lm \ 9 | --bert-prob 1.0 \ 10 | --avg-block-length 3 \ 11 | --experiment-name blocklm-large-blank \ 12 | --model-parallel-size ${MP_SIZE} \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 512 \ 17 | --max-position-embeddings 512 \ 18 | --save /root/data/checkpoints \ 19 | --train-iters 200000 \ 20 | --resume-dataloader \ 21 | --train-data bert-large \ 22 | --tokenizer-type BertWordPieceTokenizer \ 23 | --tokenizer-model-type bert-large-uncased \ 24 | --split 949,50,1 \ 25 | --distributed-backend nccl \ 26 | --lr-decay-style cosine \ 27 | --lr-decay-iters 160000 \ 28 | --lr-decay-ratio 0.05 \ 29 | --warmup .05 \ 30 | --checkpoint-activations \ 31 | --deepspeed-activation-checkpointing \ 32 | --fp16 \ 33 | " 34 | gpt_options="${gpt_options} 35 | --deepspeed \ 36 | --deepspeed_config ${config_json} \ 37 | " -------------------------------------------------------------------------------- /config/ds_block_large_chinese.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | config_json="$script_dir/config_block_large_chinese.json" 7 | gpt_options=" \ 8 | --block-lm \ 9 | --task-mask \ 10 | --bert-prob 0.4 \ 11 | --gap-sentence-prob 0.3 \ 12 | --avg-block-length 3 \ 13 | --gpt-min-ratio 0.25 \ 14 | --block-mask-prob 0.1 \ 15 | --short-seq-prob 0.02 \ 16 | --experiment-name blocklm-large-chinese \ 17 | --model-parallel-size ${MP_SIZE} \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --num-attention-heads 16 \ 21 | --seq-length 512 \ 22 | --max-position-embeddings 1024 \ 23 | --save /dataset/fd5061f6/english_data/checkpoints \ 24 | --load /dataset/fd5061f6/english_data/checkpoints/blocklm-large-chinese08-12-12-48 \ 25 | --log-interval 50 \ 26 | --eval-interval 1000 \ 27 | --save-interval 2000 \ 28 | --train-iters 250000 \ 29 | --train-data wudao baike zhihu \ 30 | --resume-dataloader \ 31 | --loader-scatter 32 \ 32 | --no-lazy-loader \ 33 | --tokenizer-type ChineseSPTokenizer \ 34 | --fix-command-token \ 35 | --split 949,50,1 \ 36 | --distributed-backend nccl \ 37 | --lr-decay-style cosine \ 38 | --lr-decay-ratio 0.1 \ 39 | --lr-decay-iters 200000 \ 40 | --warmup 0.04 \ 41 | --checkpoint-activations \ 42 | --deepspeed-activation-checkpointing \ 43 | --fp16 \ 44 | " 45 | gpt_options="${gpt_options} 46 | --deepspeed \ 47 | --deepspeed_config ${config_json} \ 48 | " -------------------------------------------------------------------------------- /config/ds_blockta_large.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | config_json="$script_dir/config_blockta_large.json" 7 | gpt_options=" \ 8 | --block-lm \ 9 | --task-mask \ 10 | --bert-prob 1.0 \ 11 | --avg-block-length 3 \ 12 | --experiment-name blocklm-roberta-large-blank \ 13 | --model-parallel-size ${MP_SIZE} \ 14 | --num-layers 24 \ 15 | --hidden-size 1024 \ 16 | --num-attention-heads 16 \ 17 | --seq-length 512 \ 18 | --max-position-embeddings 512 \ 19 | --save /dataset/fd5061f6/english_data/checkpoints \ 20 | --save-interval 2500 \ 21 | --train-iters 500000 \ 22 | --resume-dataloader \ 23 | --train-data wikibook cc-news openwebtext \ 24 | --shuffle \ 25 | --tokenizer-type GPT2BPETokenizer \ 26 | --tokenizer-model-type roberta \ 27 | --split 949,50,1 \ 28 | --distributed-backend nccl \ 29 | --lr-decay-style linear \ 30 | --lr-decay-iters 500000 \ 31 | --lr-decay-ratio 0.025 \ 32 | --warmup .06 \ 33 | --checkpoint-activations \ 34 | --deepspeed-activation-checkpointing \ 35 | --fp16 \ 36 | " 37 | gpt_options="${gpt_options} 38 | --deepspeed \ 39 | --deepspeed_config ${config_json} \ 40 | " -------------------------------------------------------------------------------- /config_tasks/config_blocklm_10B.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 4, 3 | "gradient_accumulation_steps": 2, 4 | "steps_per_print": 50, 5 | "gradient_clipping": 1.0, 6 | "zero_optimization": { 7 | "stage": 2, 8 | "contiguous_gradients": false, 9 | "overlap_comm": true, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": 5e7, 12 | "allgather_bucket_size": 5e7, 13 | "cpu_offload": true 14 | }, 15 | "zero_allow_untested_optimizer": true, 16 | "fp16": { 17 | "enabled": true, 18 | "loss_scale": 0, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "optimizer": { 24 
| "type": "Adam", 25 | "params": { 26 | "lr": 5e-6, 27 | "betas": [ 28 | 0.9, 29 | 0.95 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": 1e-2 33 | } 34 | }, 35 | "activation_checkpointing": { 36 | "partition_activations": false, 37 | "contiguous_memory_optimization": false 38 | }, 39 | "wall_clock_breakdown": false 40 | } -------------------------------------------------------------------------------- /config_tasks/config_blocklm_10B_cnndm.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 4, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 50, 5 | "gradient_clipping": 1.0, 6 | "zero_optimization": { 7 | "stage": 2, 8 | "contiguous_gradients": false, 9 | "overlap_comm": true, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": 5e7, 12 | "allgather_bucket_size": 5e7, 13 | "cpu_offload": true 14 | }, 15 | "zero_allow_untested_optimizer": true, 16 | "fp16": { 17 | "enabled": true, 18 | "loss_scale": 0, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 5e-6, 27 | "betas": [ 28 | 0.9, 29 | 0.95 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": 1e-2 33 | } 34 | }, 35 | "activation_checkpointing": { 36 | "partition_activations": false, 37 | "contiguous_memory_optimization": false 38 | }, 39 | "wall_clock_breakdown": false 40 | } -------------------------------------------------------------------------------- /config_tasks/config_blocklm_10B_record.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 1, 3 | "gradient_accumulation_steps": 8, 4 | "steps_per_print": 50, 5 | "gradient_clipping": 1.0, 6 | "zero_optimization": { 7 | "stage": 2, 8 | "contiguous_gradients": false, 9 | "overlap_comm": true, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": 5e7, 12 | "allgather_bucket_size": 5e7, 13 | "cpu_offload": true 14 | }, 15 | "zero_allow_untested_optimizer": true, 16 | "fp16": { 17 | "enabled": true, 18 | "loss_scale": 0, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 5e-6, 27 | "betas": [ 28 | 0.9, 29 | 0.95 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": 1e-2 33 | } 34 | }, 35 | "activation_checkpointing": { 36 | "partition_activations": false, 37 | "contiguous_memory_optimization": false 38 | }, 39 | "wall_clock_breakdown": false 40 | } -------------------------------------------------------------------------------- /config_tasks/model_blocklm_1.25_generation.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="blocklm-1.25-generation" 2 | MODEL_ARGS="--block-lm \ 3 | --cloze-eval \ 4 | --num-layers 30 \ 5 | --hidden-size 1024 \ 6 | --num-attention-heads 16 \ 7 | --max-position-embeddings 512 \ 8 | --tokenizer-model-type bert-large-uncased \ 9 | --tokenizer-type BertWordPieceTokenizer \ 10 | --load-pretrained ${CHECKPOINT_PATH}/blocklm-1.25-generation" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_1.5_generation.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="blocklm-1.5-generation" 2 | MODEL_ARGS="--block-lm \ 3 | --cloze-eval \ 4 | --num-layers 30 \ 5 | --hidden-size 1152 \ 6 | --num-attention-heads 18 \ 7 | --max-position-embeddings 512 \ 8 | 
--tokenizer-model-type bert-large-uncased \ 9 | --tokenizer-type BertWordPieceTokenizer \ 10 | --load-pretrained ${CHECKPOINT_PATH}/blocklm-1.5-generation" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_10B.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="GLM-10B" 2 | MODEL_ARGS="--block-lm \ 3 | --cloze-eval \ 4 | --task-mask \ 5 | --num-layers 48 \ 6 | --hidden-size 4096 \ 7 | --num-attention-heads 64 \ 8 | --max-position-embeddings 1024 \ 9 | --tokenizer-type GPT2BPETokenizer \ 10 | --load-pretrained ${CHECKPOINT_PATH}/glm-10b-1024" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_10B_chinese.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="GLM-10B-chinese" 2 | MODEL_ARGS="--block-lm \ 3 | --cloze-eval \ 4 | --task-mask \ 5 | --num-layers 48 \ 6 | --hidden-size 4096 \ 7 | --num-attention-heads 64 \ 8 | --max-position-embeddings 1024 \ 9 | --tokenizer-type ChineseSPTokenizer \ 10 | --load-pretrained ${CHECKPOINT_PATH}/glm-10b-chinese" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_2B.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="blocklm-2B" 2 | MODEL_ARGS="--block-lm \ 3 | --cloze-eval \ 4 | --task-mask \ 5 | --num-layers 36 \ 6 | --hidden-size 2048 \ 7 | --num-attention-heads 32 \ 8 | --max-position-embeddings 1024 \ 9 | --tokenizer-type GPT2BPETokenizer \ 10 | --load-pretrained ${CHECKPOINT_PATH}/blocklm-2b-512" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_base.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="blank-base" 2 | MODEL_ARGS="--block-lm \ 3 | --num-layers 12 \ 4 | --hidden-size 768 \ 5 | --num-attention-heads 12 \ 6 | --max-position-embeddings 512 \ 7 | --tokenizer-model-type bert-base-uncased \ 8 | --tokenizer-type BertWordPieceTokenizer \ 9 | --load-pretrained ${CHECKPOINT_PATH}/blocklm-base-blank" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_large.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="blank-large" 2 | MODEL_ARGS="--block-lm \ 3 | --num-layers 24 \ 4 | --hidden-size 1024 \ 5 | --num-attention-heads 16 \ 6 | --max-position-embeddings 512 \ 7 | --tokenizer-model-type bert-large-uncased \ 8 | --tokenizer-type BertWordPieceTokenizer \ 9 | --load-pretrained ${CHECKPOINT_PATH}/blocklm-large-blank" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_large_chinese.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="blocklm-large-chinese" 2 | MODEL_ARGS="--block-lm \ 3 | --cloze-eval \ 4 | --task-mask \ 5 | --num-layers 24 \ 6 | --hidden-size 1024 \ 7 | --num-attention-heads 16 \ 8 | --max-position-embeddings 1024 \ 9 | --tokenizer-type ChineseSPTokenizer \ 10 | --fix-command-token \ 11 | --load-pretrained ${CHECKPOINT_PATH}/blocklm-large-chinese" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_large_generation.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="generation-large" 2 | 
MODEL_ARGS="--block-lm \ 3 | --num-layers 24 \ 4 | --hidden-size 1024 \ 5 | --num-attention-heads 16 \ 6 | --max-position-embeddings 512 \ 7 | --tokenizer-model-type bert-large-uncased \ 8 | --tokenizer-type BertWordPieceTokenizer \ 9 | --load-pretrained ${CHECKPOINT_PATH}/blocklm-large-generation" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_roberta_1.25.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="blocklm-roberta-1.25" 2 | MODEL_ARGS="--block-lm \ 3 | --cloze-eval \ 4 | --task-mask \ 5 | --num-layers 24 \ 6 | --hidden-size 1152 \ 7 | --num-attention-heads 18 \ 8 | --max-position-embeddings 1024 \ 9 | --tokenizer-model-type roberta \ 10 | --tokenizer-type GPT2BPETokenizer \ 11 | --load-pretrained /dataset/c07bd62b/checkpoints/blocklm-roberta-1.25-blank04-22-14-01" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_roberta_large.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="blocklm-roberta-large" 2 | MODEL_ARGS="--block-lm \ 3 | --cloze-eval \ 4 | --num-layers 24 \ 5 | --hidden-size 1024 \ 6 | --num-attention-heads 16 \ 7 | --max-position-embeddings 512 \ 8 | --tokenizer-model-type roberta \ 9 | --tokenizer-type GPT2BPETokenizer \ 10 | --load-pretrained ${CHECKPOINT_PATH}/blocklm-roberta-large-blank" -------------------------------------------------------------------------------- /config_tasks/seq_blank.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-blank-${MASK_RATIO} 2 | TASK_NAME=blank 3 | DATA_PATH="${DATA_ROOT}/blank_yahoo" 4 | 5 | TRAIN_ARGS="--epochs 5 \ 6 | --batch-size 16 \ 7 | --lr 1e-5 \ 8 | --lr-decay-style linear \ 9 | --warmup 0.06 \ 10 | --weight-decay 1.0e-1 11 | --label-smoothing 0.1 \ 12 | --blank-maskratio ${MASK_RATIO} \ 13 | --save-epoch 5" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 1000 \ 18 | --eval-iters 100" 19 | 20 | TASK_ARGS="--src-seq-length 256 \ 21 | --tgt-seq-length 200 \ 22 | --min-tgt-length 0 \ 23 | --length-penalty 1 \ 24 | --no-repeat-ngram-size 3 \ 25 | --eval-batch-size 8" -------------------------------------------------------------------------------- /config_tasks/seq_cmrc.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-cmrc 2 | TASK_NAME=cmrc 3 | DATA_PATH="${DATA_ROOT}/CMRC2018" 4 | 5 | TRAIN_ARGS="--epochs 10 \ 6 | --batch-size 8 \ 7 | --lr 2e-5 \ 8 | --lr-decay-style linear \ 9 | --warmup 0.06 \ 10 | --weight-decay 1.0e-1 \ 11 | --label-smoothing 0.1" 12 | 13 | COMMON_ARGS="--save-interval 10000 \ 14 | --log-interval 50 \ 15 | --eval-interval 1000 \ 16 | --eval-iters 100 \ 17 | --eval-epoch 10" 18 | 19 | TASK_ARGS="--src-seq-length 464 \ 20 | --tgt-seq-length 48 \ 21 | --min-tgt-length 0 \ 22 | --length-penalty 0.7 \ 23 | --num-beams 5 \ 24 | --select-topk \ 25 | --eval-batch-size 4" -------------------------------------------------------------------------------- /config_tasks/seq_cnndm.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-cnndm 2 | TASK_NAME=cnn_dm 3 | DATA_PATH="/root/data/cnn_dm" 4 | 5 | TRAIN_ARGS="--epochs 15 \ 6 | --lr 3e-5 \ 7 | --lr-decay-style linear \ 8 | --warmup 0.06 \ 9 | --weight-decay 1.0e-1 \ 10 | --label-smoothing 0.1" 11 | 12 | 
COMMON_ARGS="--save-interval 10000 \ 13 | --log-interval 50 \ 14 | --eval-interval 1000 \ 15 | --eval-iters 100" 16 | 17 | TASK_ARGS="--src-seq-length 608 \ 18 | --tgt-seq-length 160 \ 19 | --min-tgt-length 55 \ 20 | --length-penalty 0.7 \ 21 | --no-repeat-ngram-size 3 \ 22 | --num-beams 5 \ 23 | --select-topk \ 24 | --eval-batch-size 4" -------------------------------------------------------------------------------- /config_tasks/seq_cnndm_org.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-cnndm_org 2 | TASK_NAME=cnn_dm_original 3 | DATA_PATH="${DATA_ROOT}/cnn_dm_original" 4 | 5 | TRAIN_ARGS="--epochs 10 \ 6 | --lr 1e-5 \ 7 | --lr-decay-style linear \ 8 | --warmup 0.06 \ 9 | --weight-decay 1.0e-1 \ 10 | --label-smoothing 0.1" 11 | 12 | COMMON_ARGS="--save-interval 10000 \ 13 | --log-interval 50 \ 14 | --eval-interval 1000 \ 15 | --eval-iters 100 \ 16 | --eval-epoch 2" 17 | 18 | TASK_ARGS="--src-seq-length 608 \ 19 | --tgt-seq-length 160 \ 20 | --min-tgt-length 55 \ 21 | --length-penalty 0.7 \ 22 | --no-repeat-ngram-size 3 \ 23 | --num-beams 5 \ 24 | --select-topk \ 25 | --eval-batch-size 1" -------------------------------------------------------------------------------- /config_tasks/seq_customization.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-customization 2 | TASK_NAME=customization 3 | DATA_PATH="${DATA_ROOT}/customization" 4 | 5 | TRAIN_ARGS="--epochs 10 \ 6 | --lr 1e-5 \ 7 | --lr-decay-style linear \ 8 | --warmup 0.06 \ 9 | --label-smoothing 0.1" 10 | 11 | COMMON_ARGS="--save-interval 10000 \ 12 | --log-interval 50 \ 13 | --eval-interval 1000 \ 14 | --eval-iters 100 \ 15 | --eval-epoch 2" 16 | 17 | TASK_ARGS="--src-seq-length 512 \ 18 | --tgt-seq-length 128 \ 19 | --min-tgt-length 55 \ 20 | --length-penalty 0.7 \ 21 | --no-repeat-ngram-size 3 \ 22 | --num-beams 5 \ 23 | --select-topk \ 24 | --eval-batch-size 1" -------------------------------------------------------------------------------- /config_tasks/seq_gigaword.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-ggw 2 | TASK_NAME=gigaword 3 | DATA_PATH="${DATA_ROOT}/gigaword/org_data" 4 | 5 | TRAIN_ARGS="--epochs 10 \ 6 | --lr 3e-5 \ 7 | --lr-decay-style linear \ 8 | --warmup 0.06 \ 9 | --weight-decay 1.0e-1 \ 10 | --label-smoothing 0.1" 11 | 12 | COMMON_ARGS="--save-interval 10000 \ 13 | --log-interval 50 \ 14 | --eval-interval 1000 \ 15 | --eval-iters 100" 16 | 17 | TASK_ARGS="--src-seq-length 192 \ 18 | --tgt-seq-length 32 \ 19 | --min-tgt-length 0 \ 20 | --length-penalty 0.6 \ 21 | --no-repeat-ngram-size 3 \ 22 | --num-beams 5 \ 23 | --select-topk \ 24 | --eval-batch-size 4" -------------------------------------------------------------------------------- /config_tasks/seq_squad.sh: -------------------------------------------------------------------------------- 1 | TASK_NAME=squad 2 | EXPERIMENT_NAME=${MODEL_TYPE}-${TASK_NAME} 3 | DATA_PATH="/dataset/fd5061f6/english_data/SQuAD" 4 | 5 | LR_SINGLE=1e-5 6 | EPOCH_SINGLE=10 7 | BATCH_SINGLE=12 8 | 9 | TRAIN_ARGS="--lr-decay-style linear \ 10 | --warmup 0.06 \ 11 | --weight-decay 1.0e-1 \ 12 | --label-smoothing 0.1" 13 | 14 | COMMON_ARGS="--save-interval 10000 \ 15 | --log-interval 200 \ 16 | --eval-interval 1000 \ 17 | --eval-iters 100 \ 18 | --eval-epoch 1 \ 19 | --overwrite" 20 | 21 | TASK_ARGS="--src-seq-length 512 \ 22 | --tgt-seq-length 64 \ 23 | 
--min-tgt-length 0 \ 24 | --length-penalty 0 \ 25 | --num-beams 5 \ 26 | --select-topk \ 27 | --eval-batch-size 8 \ 28 | --validation-metric F1" 29 | -------------------------------------------------------------------------------- /config_tasks/seq_squad_generation.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-squad 2 | TASK_NAME=squad_generation 3 | DATA_PATH="/dataset/c07bd62b/nqg/raw" 4 | 5 | TRAIN_ARGS="--epochs 10 \ 6 | --lr 2e-5 \ 7 | --lr-decay-style linear \ 8 | --warmup 0.06 \ 9 | --weight-decay 1.0e-1 \ 10 | --label-smoothing 0.1" 11 | 12 | COMMON_ARGS="--save-interval 10000 \ 13 | --log-interval 50 \ 14 | --eval-interval 1000 \ 15 | --eval-iters 100 \ 16 | --eval-epoch 10" 17 | 18 | TASK_ARGS="--src-seq-length 464 \ 19 | --tgt-seq-length 48 \ 20 | --min-tgt-length 0 \ 21 | --length-penalty 0.7 \ 22 | --num-beams 5 \ 23 | --select-topk \ 24 | --eval-batch-size 4" -------------------------------------------------------------------------------- /config_tasks/seq_squad_v1.sh: -------------------------------------------------------------------------------- 1 | TASK_NAME=squad_v1 2 | EXPERIMENT_NAME=${MODEL_TYPE}-${TASK_NAME} 3 | DATA_PATH="/dataset/fd5061f6/english_data/SQuAD" 4 | 5 | LR_SINGLE=1e-5 6 | EPOCH_SINGLE=5 7 | BATCH_SINGLE=12 8 | 9 | TRAIN_ARGS="--lr-decay-style linear \ 10 | --warmup 0.06 \ 11 | --weight-decay 1.0e-1 \ 12 | --label-smoothing 0.1" 13 | 14 | COMMON_ARGS="--save-interval 10000 \ 15 | --log-interval 200 \ 16 | --eval-interval 1000 \ 17 | --eval-iters 100 \ 18 | --eval-epoch 1 \ 19 | --overwrite" 20 | 21 | TASK_ARGS="--src-seq-length 512 \ 22 | --tgt-seq-length 64 \ 23 | --min-tgt-length 0 \ 24 | --length-penalty 0 \ 25 | --num-beams 5 \ 26 | --select-topk \ 27 | --eval-batch-size 8 \ 28 | --validation-metric F1" 29 | 30 | # --load /dataset/fd5061f6/finetune_checkpoints/blank-base-squad_v1 31 | # --load /dataset/fd5061f6/finetune_checkpoints/blocklm-roberta-large-squad_v1 -------------------------------------------------------------------------------- /config_tasks/seq_xsum.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-xsum 2 | TASK_NAME=xsum 3 | DATA_PATH="${DATA_ROOT}/bbc-summary-data" 4 | 5 | TRAIN_ARGS="--epochs 6 \ 6 | --lr 1e-5 \ 7 | --lr-decay-style linear \ 8 | --warmup 0.06 \ 9 | --weight-decay 1.0e-1 \ 10 | --label-smoothing 0.1" 11 | 12 | COMMON_ARGS="--save-interval 10000 \ 13 | --log-interval 50 \ 14 | --eval-interval 1000 \ 15 | --eval-iters 100 \ 16 | --eval-epoch 2" 17 | 18 | TASK_ARGS="--src-seq-length 608 \ 19 | --tgt-seq-length 60 \ 20 | --min-tgt-length 10 \ 21 | --length-penalty 1.0 \ 22 | --no-repeat-ngram-size 3 \ 23 | --num-beams 6 \ 24 | --select-topk \ 25 | --eval-batch-size 1" -------------------------------------------------------------------------------- /config_tasks/task_afqmc.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-AFQMC 2 | TASK_NAME=afqmc 3 | DATA_PATH="${DATA_ROOT}/AFQMC" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=10 8 | XXLARGE_EPOCH=20 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 0" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 1000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1) 21 | PROMPT_IDS=(1 2 3) 22 | 23 | BATCH_SIZE=16 
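The task_*.sh files in config_tasks only declare hyperparameters (experiment name, data path, learning rate, epochs, pattern and prompt ids, batch size); a launcher is expected to source one of them together with a model config from config_tasks/model_*.sh and pass the resulting variables to finetune_glm.py. The sketch below illustrates that flow for task_afqmc.sh. It is a minimal, hypothetical launcher: the paths and the exact finetune_glm.py flags are assumptions, and the real entry points under scripts/ (such as ds_finetune_superglue.sh) typically add DeepSpeed and distributed-launch options on top.

# Hypothetical launcher sketch -- paths and exact flags are assumptions, not the repo's own scripts.
DATA_ROOT=/path/to/task_data              # task configs build DATA_PATH from this
CHECKPOINT_PATH=/path/to/checkpoints      # model configs build --load-pretrained from this

source config_tasks/model_blocklm_large_chinese.sh   # sets MODEL_TYPE and MODEL_ARGS
source config_tasks/task_afqmc.sh                    # sets TASK_NAME, DATA_PATH, TRAIN_ARGS, COMMON_ARGS, ...

python finetune_glm.py \
       --finetune \
       --task ${TASK_NAME} \
       --data-dir ${DATA_PATH} \
       --experiment-name ${EXPERIMENT_NAME} \
       --seq-length ${MAX_SEQ_LEN} \
       --batch-size ${BATCH_SIZE} \
       --epochs ${EPOCH_SINGLE} \
       --lr ${LR_SINGLE} \
       ${MODEL_ARGS} \
       ${TRAIN_ARGS} \
       ${COMMON_ARGS}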
-------------------------------------------------------------------------------- /config_tasks/task_boolq.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-boolq 2 | TASK_NAME=BoolQ 3 | DATA_PATH="${DATA_ROOT}/BoolQ" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=20 8 | XXLARGE_EPOCH=24 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 4" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 10000000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1 2 3 4 5) 21 | PROMPT_IDS=(1 2 3) 22 | 23 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_cb.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-CB 2 | TASK_NAME=cb 3 | DATA_PATH="${DATA_ROOT}/CB" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=50 8 | XXLARGE_EPOCH=100 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 3" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 1000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1 2 3) 21 | PROMPT_IDS=(1 2 3) 22 | 23 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_cluewsc.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-CLUEWSC 2 | TASK_NAME=cluewsc 3 | DATA_PATH="${DATA_ROOT}/CLUEWSC" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=10 8 | XXLARGE_EPOCH=12 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 0" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 1000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1) 21 | PROMPT_IDS=(1 2 3) 22 | 23 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_cmrc.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-CMRC 2 | TASK_NAME=cmrc 3 | DATA_PATH="${DATA_ROOT}/CMRC" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=10 8 | XXLARGE_EPOCH=12 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 0" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 1000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1) 21 | PROMPT_IDS=(1 2 3) 22 | 23 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_copa.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-copa 2 | TASK_NAME=COPA 3 | DATA_PATH="${DATA_ROOT}/COPA" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=50 8 | XXLARGE_EPOCH=100 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 0" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 20 \ 17 | --eval-interval 1000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1) 21 | PROMPT_IDS=(1 2) 22 | 23 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_multirc.sh: 
-------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-MultiRC 2 | TASK_NAME=multirc 3 | DATA_PATH="${DATA_ROOT}/MultiRC" 4 | MAX_SEQ_LEN=512 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=15 8 | XXLARGE_EPOCH=12 9 | 10 | TRAIN_ARGS="--batch-size 16 \ 11 | --lr-decay-style linear \ 12 | --warmup 0.1 \ 13 | --weight-decay 1.0e-1 \ 14 | --pattern-id 0" 15 | 16 | COMMON_ARGS="--save-interval 10000 \ 17 | --log-interval 50 \ 18 | --eval-interval 10000000 \ 19 | --eval-iters 100" 20 | 21 | PATTERN_IDS=(0 1 2) 22 | PROMPT_IDS=(1 2 3) 23 | 24 | BATCH_SIZE=64 -------------------------------------------------------------------------------- /config_tasks/task_record.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-record 2 | TASK_NAME=ReCoRD 3 | DATA_PATH="${DATA_ROOT}/ReCoRD" 4 | MAX_SEQ_LEN=512 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=5 8 | XXLARGE_EPOCH=3 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 0" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 1000 \ 18 | --eval-iters 100 \ 19 | --deepspeed_config config_tasks/config_blocklm_10B_record.json" 20 | 21 | PATTERN_IDS=(0) 22 | 23 | BATCH_SIZE=64 -------------------------------------------------------------------------------- /config_tasks/task_rte.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-rte 2 | TASK_NAME=RTE 3 | DATA_PATH="${DATA_ROOT}/RTE" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=50 8 | XXLARGE_EPOCH=50 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 0" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 10000000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1 2 3) 21 | PROMPT_IDS=(1 2 3) 22 | 23 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_tnews.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-TNews 2 | TASK_NAME=tnews 3 | DATA_PATH="${DATA_ROOT}/TNews" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=10 8 | XXLARGE_EPOCH=12 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 0" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 1000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1) 21 | PROMPT_IDS=(1 2 3) 22 | 23 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_wic.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-wic 2 | TASK_NAME=wic 3 | DATA_PATH="${DATA_ROOT}/WiC" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=30 8 | XXLARGE_EPOCH=40 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 1" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 10000000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1 2) 21 | PROMPT_IDS=(1 2 3) 22 | 23 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_wsc.sh: 
-------------------------------------------------------------------------------- 1 | TASK_NAME=wsc 2 | EXPERIMENT_NAME=${MODEL_TYPE}-${TASK_NAME} 3 | DATA_PATH="${DATA_ROOT}/WSC-negative" 4 | MAX_SEQ_LEN=128 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=50 8 | XXLARGE_EPOCH=100 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 0.1 \ 13 | --loss-func mix \ 14 | --wsc-negative \ 15 | --length-penalty 1 \ 16 | --pattern-id 2" 17 | 18 | COMMON_ARGS="--save-interval 10000 \ 19 | --log-interval 50 \ 20 | --eval-interval 1000 \ 21 | --eval-iters 100" 22 | 23 | PATTERN_IDS=(0 1 2) 24 | PROMPT_IDS=(1 2 3) 25 | 26 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_wsc_generative.sh: -------------------------------------------------------------------------------- 1 | TASK_NAME=wsc 2 | EXPERIMENT_NAME=${MODEL_TYPE}-${TASK_NAME}_generative 3 | DATA_PATH="${DATA_ROOT}/WSC" 4 | MAX_SEQ_LEN=128 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=50 8 | XXLARGE_EPOCH=100 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 0.1" 13 | 14 | COMMON_ARGS="--save-interval 10000 \ 15 | --log-interval 50 \ 16 | --eval-interval 1000 \ 17 | --eval-iters 100" 18 | 19 | BATCH_SIZE=16 20 | -------------------------------------------------------------------------------- /config_tasks/zero_lambada.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-lambda 2 | TASK_NAME=lambda 3 | DATA_PATH="${DATA_ROOT}/lambada_test.jsonl" 4 | EVALUATE_ARGS="--eval-batch-size 16 \ 5 | --seq-length 512" -------------------------------------------------------------------------------- /config_tasks/zero_lambada_uni.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-lambda_uni 2 | TASK_NAME=lambda 3 | DATA_PATH="${DATA_ROOT}/lambada_test.jsonl" 4 | EVALUATE_ARGS="--eval-batch-size 16 \ 5 | --seq-length 512 \ 6 | --unidirectional" -------------------------------------------------------------------------------- /config_tasks/zero_lm.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-lm 2 | TASK_NAME=language_model 3 | DATA_PATH=${DATA_ROOT}/bert-large-test.txt 4 | EVALUATE_ARGS="--eval-batch-size 16 \ 5 | --seq-length 512 \ 6 | --overlapping-eval 256" -------------------------------------------------------------------------------- /config_tasks/zero_lm_uni.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-lm_uni 2 | TASK_NAME=language_model 3 | DATA_PATH=${DATA_ROOT}/bert-large-test.txt 4 | EVALUATE_ARGS="--eval-batch-size 16 \ 5 | --seq-length 512 \ 6 | --overlapping-eval 256 \ 7 | --unidirectional" -------------------------------------------------------------------------------- /config_tasks/zero_wikitext.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-wikitext 2 | TASK_NAME=wikitext 3 | DATA_PATH=/dataset/c07bd62b/wikitext-103/wiki.test.tokens 4 | EVALUATE_ARGS="--eval-batch-size 16 \ 5 | --seq-length 1024 \ 6 | --overlapping-eval 256" -------------------------------------------------------------------------------- /data_utils/extraction.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import glob 3 | import json 4 | import os 5 | 6 
| nltk.download('punkt') 7 | 8 | 9 | class NLTKSegmenter: 10 | def __init__(self): 11 | pass 12 | 13 | @staticmethod 14 | def segment_string(article): 15 | return nltk.tokenize.sent_tokenize(article) 16 | 17 | 18 | wiki_path = "data/extracted" 19 | output_path = "formatted/wiki-key.txt" 20 | segmenter = NLTKSegmenter() 21 | with open(output_path, "w") as output: 22 | for dirname in glob.glob(os.path.join(wiki_path, '*'), recursive=False): 23 | for filename in glob.glob(os.path.join(dirname, 'wiki_*'), recursive=True): 24 | print(filename) 25 | article_lines = [] 26 | article_open = False 27 | with open(filename, mode='r', newline='\n') as file: 28 | for line in file: 29 | line = line.rstrip() 30 | if '<doc id=' in line: 31 | article_open = True 32 | elif '</doc>' in line: 33 | key_sentences, contents = [], [] 34 | key, content = None, [] 35 | for sentences in article_lines[1:]: 36 | if len(sentences) > 1: 37 | if key: 38 | if len(content) > 0 or len(contents) == 0: 39 | key_sentences.append(key) 40 | contents.append(content) 41 | else: 42 | contents[-1].append(key) 43 | key, content = None, [] 44 | key_sentences.append(sentences[0]) 45 | contents.append(sentences[1:]) 46 | elif len(sentences) > 0: 47 | if key: 48 | content.append(sentences[0]) 49 | else: 50 | key = sentences[0] 51 | if key: 52 | if len(content) > 0 or len(contents) == 0: 53 | key_sentences.append(key) 54 | contents.append(content) 55 | else: 56 | contents[-1].append(key) 57 | contents = [" ".join(content) for content in contents] 58 | article = {"key": key_sentences, "content": contents} 59 | output.write(json.dumps(article)) 60 | output.write("\n") 61 | article_open = False 62 | article_lines = [] 63 | else: 64 | if article_open and line: 65 | sentences = segmenter.segment_string(line) 66 | article_lines.append(sentences) 67 | -------------------------------------------------------------------------------- /data_utils/samplers.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """batch samplers that work with either random or sequential data samplers""" 16 | import math 17 | import os 18 | import sys 19 | 20 | import torch 21 | from torch.utils import data 22 | import numpy as np 23 | 24 | 25 | class RandomSampler(data.sampler.Sampler): 26 | r""" 27 | Based off of pytorch RandomSampler and DistributedSampler. Essentially a RandomSampler, 28 | but this class lets the user set an epoch like DistributedSampler 29 | Samples elements randomly. If without replacement, then sample from a shuffled dataset. 30 | If with replacement, then user can specify ``num_samples`` to draw.
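The extraction script above walks WikiExtractor output and writes one JSON object per article, pairing each key (heading or lead) sentence with the text grouped under it. A minimal sketch of reading that output back, assuming the `formatted/wiki-key.txt` path hard-coded in the script:

```python
import json

# Each line written by data_utils/extraction.py is one article:
# {"key": [...], "content": [...]}, with key[i] paired to content[i].
with open("formatted/wiki-key.txt") as fin:
    for line in fin:
        article = json.loads(line)
        assert len(article["key"]) == len(article["content"])
        for key, content in zip(article["key"], article["content"]):
            print(key, "->", content[:80])
```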
31 | Arguments: 32 | data_source (Dataset): dataset to sample from 33 | num_samples (int): number of samples to draw, default=len(dataset) 34 | replacement (bool): samples are drawn with replacement if ``True``, default=False 35 | """ 36 | 37 | def __init__(self, data_source, replacement=False, num_samples=None): 38 | super(RandomSampler, self).__init__(data_source) 39 | self.data_source = data_source 40 | self.replacement = replacement 41 | self._num_samples = num_samples 42 | self.epoch = -1 43 | 44 | if self._num_samples is not None and replacement is False: 45 | raise ValueError("With replacement=False, num_samples should not be specified, " 46 | "since a random permute will be performed.") 47 | 48 | if not isinstance(self.num_samples, int) or self.num_samples <= 0: 49 | raise ValueError("num_samples should be a positive integer " 50 | "value, but got num_samples={}".format(self.num_samples)) 51 | if not isinstance(self.replacement, bool): 52 | raise ValueError("replacement should be a boolean value, but got " 53 | "replacement={}".format(self.replacement)) 54 | 55 | @property 56 | def num_samples(self): 57 | # dataset size might change at runtime 58 | if self._num_samples is None: 59 | return len(self.data_source) 60 | return self._num_samples 61 | 62 | def __iter__(self): 63 | n = len(self.data_source) 64 | g = torch.Generator() 65 | if self.epoch >= 0: 66 | g.manual_seed(self.epoch) 67 | if self.replacement: 68 | for _ in range(self.num_samples // 32): 69 | yield from torch.randint(high=n, size=(32,), dtype=torch.int64, generator=g).tolist() 70 | yield from torch.randint(high=n, size=(self.num_samples % 32,), dtype=torch.int64, 71 | generator=g).tolist() 72 | else: 73 | yield from torch.randperm(n, generator=g).tolist() 74 | 75 | def __len__(self): 76 | return self.num_samples 77 | 78 | def set_epoch(self, epoch): 79 | self.epoch = epoch 80 | 81 | 82 | class DistributedSequentialSampler(data.sampler.Sampler): 83 | def __init__(self, num_samples, train_iters, batch_size, rank=-1, world_size=2): 84 | super().__init__(num_samples) 85 | if rank == -1: 86 | rank = 0 87 | world_size = 1 88 | self.num_samples = num_samples 89 | self.rank = rank 90 | self.world_size = world_size 91 | self.start_iter = 0 92 | self.train_iters = train_iters 93 | self.batch_size = batch_size 94 | self.batch_bias = [i * (num_samples // batch_size) for i in range(batch_size)] 95 | 96 | def __iter__(self): 97 | for idx in range(self.start_iter, self.train_iters * 10): 98 | batch = [(idx + bias) % self.num_samples for bias in self.batch_bias] 99 | tbatch = self._batch(batch) 100 | yield tbatch 101 | 102 | def __len__(self): 103 | return self.train_iters 104 | 105 | def _batch(self, batch): 106 | """extracts samples only pertaining to this worker's batch""" 107 | start = self.rank*self.batch_size//self.world_size 108 | end = (self.rank+1)*self.batch_size//self.world_size 109 | return batch[start:end] 110 | 111 | 112 | class DistributedBatchSampler(data.sampler.BatchSampler): 113 | """ 114 | similar to normal implementation of distributed sampler, except implementation is at the 115 | batch sampler level, instead of just the sampler level. This allows wrapping of arbitrary 116 | data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler.
117 | """ 118 | def __init__(self, sampler, batch_size, drop_last, rank=-1, world_size=2, wrap_last=False, gradient_accumulation_steps=None): 119 | super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last) 120 | if rank == -1: 121 | assert False, 'should not be here' 122 | self.rank = rank 123 | self.world_size = world_size 124 | self.sampler.wrap_around = 0 125 | self.wrap_around = 0 126 | self.wrap_last = wrap_last 127 | self.start_iter = 0 128 | self.effective_batch_size = batch_size if gradient_accumulation_steps is None else batch_size * gradient_accumulation_steps 129 | 130 | def __iter__(self): 131 | batch = [] 132 | i = 0 133 | for idx in self.data_iterator(self.sampler, wrap_around=False): 134 | batch.append(idx) 135 | if len(batch) == self.batch_size: 136 | tbatch = self._batch(batch) 137 | if i >= self.start_iter * self.effective_batch_size: 138 | yield tbatch 139 | self.start_iter = 0 140 | i += len(batch) 141 | batch = [] 142 | batch_len = len(batch) 143 | if batch_len > 0 and not self.drop_last: 144 | if self.wrap_last: 145 | self.sampler.wrap_around -= (self.batch_size) 146 | self.wrap_around += (len(batch)) 147 | self.wrap_around %= self.batch_size 148 | yield self._batch(batch) 149 | if self.wrap_last: 150 | self.sampler.wrap_around += self.batch_size 151 | 152 | def data_iterator(self, _iter, wrap_around=False): 153 | """iterates through data and handles wrap around""" 154 | for i, idx in enumerate(_iter): 155 | if i < self.wrap_around%self.batch_size: 156 | continue 157 | if wrap_around: 158 | self.wrap_around += 1 159 | self.wrap_around %= self.batch_size 160 | yield idx 161 | 162 | def _batch(self, batch): 163 | """extracts samples only pertaining to this worker's batch""" 164 | start = self.rank*self.batch_size//self.world_size 165 | end = (self.rank+1)*self.batch_size//self.world_size 166 | return batch[start:end] 167 | -------------------------------------------------------------------------------- /data_utils/sp_tokenizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | from https://github.com/openai/gpt-2/, changed for chinese 3 | """ 4 | import json 5 | import os 6 | import sentencepiece as spm 7 | 8 | """ 9 | SentencePiece is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation 10 | systems where the vocabulary size is predetermined prior to the neural model training. SentencePiece implements 11 | subword units (e.g., byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model [Kudo.]) with the 12 | extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end 13 | system that does not depend on language-specific pre/postprocessing. 
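A minimal sketch of combining the samplers above: `RandomSampler` supplies an epoch-seeded shuffle, and `DistributedBatchSampler` slices each global batch so every data-parallel rank sees a disjoint sub-batch. The toy dataset and the rank/world-size values here are made up for illustration:

```python
import torch
from torch.utils import data
from data_utils.samplers import RandomSampler, DistributedBatchSampler

dataset = data.TensorDataset(torch.arange(100))       # toy dataset
sampler = RandomSampler(dataset)
sampler.set_epoch(0)                                  # deterministic reshuffle per epoch
batch_sampler = DistributedBatchSampler(sampler, batch_size=8, drop_last=True,
                                        rank=0, world_size=2)
loader = data.DataLoader(dataset, batch_sampler=batch_sampler)
for (batch,) in loader:
    print(batch.tolist())                             # 8 // 2 = 4 samples for this rank
    break
```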
14 | https://github.com/google/sentencepiece 15 | 16 | pip install sentencepiece 17 | 18 | or git clone https://github.com/google/sentencepiece.git 19 | python setup.py install 20 | 21 | """ 22 | PRETRAINED_MODEL_FILE = "chinese_sentencepiece/cog-pretrain.model" 23 | 24 | 25 | def get_pairs(word): 26 | pairs = set() 27 | prev_char = word[0] 28 | for char in word[1:]: 29 | pairs.add((prev_char, char)) 30 | prev_char = char 31 | return pairs 32 | 33 | 34 | class Encoder: 35 | def __init__(self, encoder, bpe_merges): 36 | self.encoder = encoder 37 | self.decoder = {v: k for k, v in self.encoder.items()} 38 | self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) 39 | self.cache = {} 40 | self.max_len = 0 41 | 42 | def bpe(self, token): 43 | if token in self.cache: 44 | return self.cache[token] 45 | word = tuple(token) 46 | pairs = get_pairs(word) 47 | if not pairs: 48 | return token 49 | 50 | while True: 51 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) 52 | if bigram not in self.bpe_ranks: 53 | break 54 | first, second = bigram 55 | new_word = [] 56 | i = 0 57 | while i < len(word): 58 | try: 59 | j = word.index(first, i) 60 | new_word.extend(word[i:j]) 61 | i = j 62 | except: 63 | new_word.extend(word[i:]) 64 | break 65 | 66 | if word[i] == first and i < len(word) - 1 and word[i + 1] == second: 67 | new_word.append(first + second) 68 | i += 2 69 | else: 70 | new_word.append(word[i]) 71 | i += 1 72 | new_word = tuple(new_word) 73 | word = new_word 74 | if len(word) == 1: 75 | break 76 | else: 77 | pairs = get_pairs(word) 78 | word = ' '.join(word) 79 | self.cache[token] = word 80 | return word 81 | 82 | def encode(self, text): 83 | return [self.encoder.get(token, 1) for token in self.tokenize(text)] 84 | 85 | def decode(self, tokens): 86 | text = ''.join([self.decoder[token] for token in tokens]) 87 | return text 88 | 89 | def tokenize(self, text): 90 | bpe_tokens = [] 91 | bpe_tokens.extend(bpe_token for bpe_token in self.bpe(text).split(' ')) 92 | return bpe_tokens 93 | 94 | def convert_tokens_to_ids(self, tokens): 95 | return [self.encoder.get(token, 1) for token in tokens] 96 | 97 | 98 | class Encoder_SP: 99 | def __init__(self, model_path): 100 | self.sp = spm.SentencePieceProcessor() 101 | self.sp.Load(model_path) 102 | 103 | def encode(self, text): 104 | """ 105 | text="...." 106 | """ 107 | return self.sp.EncodeAsIds(text) 108 | 109 | def decode(self, tokens): 110 | """ 111 | tokens=[x1,x2,...] 
112 | """ 113 | text = [int(token) for token in tokens] 114 | # print(text) 115 | return self.sp.DecodeIds(text) 116 | 117 | def tokenize(self, text): 118 | return self.sp.EncodeAsPieces(text) 119 | 120 | def convert_tokens_to_ids(self, tokens): 121 | return [self.sp.PieceToId(token) for token in tokens] 122 | 123 | def convert_token_to_id(self, token): 124 | return self.sp.PieceToId(token) 125 | 126 | def convert_id_to_token(self, idx): 127 | return self.sp.IdToPiece(idx) 128 | 129 | 130 | def get_encoder(encoder_file, bpe_file): 131 | # 以下是为了同一个函数入兼容sentencepiece 132 | filepath, filename = os.path.split(encoder_file) 133 | shotname, extension = os.path.splitext(filename) 134 | 135 | if (".model" == extension) and (bpe_file == ""): 136 | return Encoder_SP(encoder_file) 137 | else: 138 | with open(encoder_file, 'r', encoding="utf-8") as f: 139 | encoder = json.load(f) 140 | with open(bpe_file, 'r', encoding="utf-8") as f: 141 | bpe_data = f.read() 142 | bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]] 143 | return Encoder( 144 | encoder=encoder, 145 | bpe_merges=bpe_merges, 146 | ) 147 | 148 | 149 | def from_pretrained(): 150 | return get_encoder(PRETRAINED_MODEL_FILE, "") -------------------------------------------------------------------------------- /docker/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | config=/root/.jupyter/jupyter_notebook_config.py 4 | 5 | if [ ! -f $config ]; then 6 | 7 | cat > $config <> $config 23 | else 24 | default_pwd=`python -c "from notebook.auth import passwd; pwd=passwd('${default_pwd}'); print(pwd);"` 25 | echo "sha1 password: $default_pwd" 26 | echo "default password: $default_pwd" 27 | 28 | echo "c.NotebookApp.password ='${default_pwd}'" >> $config 29 | fi 30 | 31 | fi 32 | -------------------------------------------------------------------------------- /docker/ssh-env-config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This command wrapper sets up SSH config files based on the following 4 | # environment variables: 5 | # 6 | # SSH_CONFIG - contents of an SSH config file 7 | # SSH_KNOWN_HOSTS - contents of a SSH known_hosts file 8 | # SSH_PRIVATE_RSA_KEY - contents of a SSH private RSA key 9 | # SSH_PRIVATE_DSA_KEY - contents of a SSH private DSA key 10 | # SSH_DEBUG - switch to a high debug level 3 for all hosts, to help solve SSH issues 11 | # 12 | # The environment variables are unset after the files are created to help 13 | # prevent accidental output in logs 14 | 15 | set -e 16 | 17 | if [ -z "$SSH_CONFIG" ] && \ 18 | [ -z "$SSH_CONFIG_B64" ] && \ 19 | [ -z "$SSH_CONFIG_PATH" ] && \ 20 | [ -z "$SSH_KNOWN_HOSTS" ] && \ 21 | [ -z "$SSH_KNOWN_HOSTS_B64" ] && \ 22 | [ -z "$SSH_KNOWN_HOSTS_PATH" ] && \ 23 | [ -z "$SSH_PRIVATE_RSA_KEY" ] && \ 24 | [ -z "$SSH_PRIVATE_RSA_KEY_B64" ] && \ 25 | [ -z "$SSH_PRIVATE_RSA_KEY_PATH" ] && \ 26 | [ -z "$SSH_PRIVATE_DSA_KEY" ] && \ 27 | [ -z "$SSH_PRIVATE_DSA_KEY_B64" ] && \ 28 | [ -z "$SSH_PRIVATE_DSA_KEY_PATH" ] && \ 29 | [ -z "$SSH_DEBUG" ]; then 30 | # none of the ENV vars we care about found, so skip the logic in this script 31 | [[ $1 ]] && exec "$@" 32 | fi 33 | 34 | mkdir -p ~/.ssh 35 | chmod 700 ~/.ssh 36 | 37 | decode_base64() { 38 | # Determine the platform dependent base64 decode argument 39 | if [ "$(echo 'eA==' | base64 -d 2> /dev/null)" = 'x' ]; then 40 | local BASE64_DECODE_ARG='-d' 41 | else 42 | local BASE64_DECODE_ARG='--decode' 43 
| fi 44 | 45 | echo "$1" | tr -d '\n' | base64 "$BASE64_DECODE_ARG" 46 | } 47 | 48 | ## ~/.ssh/config 49 | 50 | [[ ! -z "$SSH_CONFIG" ]] && \ 51 | echo "$SSH_CONFIG" > ~/.ssh/config && \ 52 | chmod 600 ~/.ssh/config && \ 53 | unset SSH_CONFIG 54 | 55 | [[ ! -z "$SSH_CONFIG_B64" ]] && \ 56 | decode_base64 "$SSH_CONFIG_B64" > ~/.ssh/config && \ 57 | chmod 600 ~/.ssh/config && \ 58 | unset SSH_CONFIG_B64 59 | 60 | [[ ! -z "$SSH_CONFIG_PATH" && ! -a ~/.ssh/config ]] && \ 61 | cp "$SSH_CONFIG_PATH" ~/.ssh/config && \ 62 | chmod 600 ~/.ssh/config && \ 63 | unset SSH_CONFIG_PATH 64 | 65 | ## ~/.ssh/known_hosts 66 | 67 | [[ ! -z "$SSH_KNOWN_HOSTS" ]] && \ 68 | echo "$SSH_KNOWN_HOSTS" > ~/.ssh/known_hosts && \ 69 | chmod 600 ~/.ssh/known_hosts && \ 70 | unset SSH_KNOWN_HOSTS 71 | 72 | [[ ! -z "$SSH_KNOWN_HOSTS_B64" ]] && \ 73 | decode_base64 "$SSH_KNOWN_HOSTS_B64" > ~/.ssh/known_hosts && \ 74 | chmod 600 ~/.ssh/known_hosts && \ 75 | unset SSH_KNOWN_HOSTS_B64 76 | 77 | [[ ! -z "$SSH_KNOWN_HOSTS_PATH" && ! -a ~/.ssh/known_hosts ]] && \ 78 | cp "$SSH_KNOWN_HOSTS_PATH" ~/.ssh/known_hosts && \ 79 | chmod 600 ~/.ssh/known_hosts && \ 80 | unset SSH_KNOWN_HOSTS_PATH 81 | 82 | ## ~/.ssh/id_rsa 83 | 84 | [[ ! -z "$SSH_PRIVATE_RSA_KEY" ]] && \ 85 | echo "$SSH_PRIVATE_RSA_KEY" > ~/.ssh/id_rsa && \ 86 | chmod 600 ~/.ssh/id_rsa && \ 87 | unset SSH_PRIVATE_RSA_KEY 88 | 89 | [[ ! -z "$SSH_PRIVATE_RSA_KEY_B64" ]] && \ 90 | decode_base64 "$SSH_PRIVATE_RSA_KEY_B64" > ~/.ssh/id_rsa && \ 91 | chmod 600 ~/.ssh/id_rsa && \ 92 | unset SSH_PRIVATE_RSA_KEY_B64 93 | 94 | [[ ! -z "$SSH_PRIVATE_RSA_KEY_PATH" && ! -a ~/.ssh/id_rsa ]] && \ 95 | cp "$SSH_PRIVATE_RSA_KEY_PATH" ~/.ssh/id_rsa && \ 96 | chmod 600 ~/.ssh/id_rsa && \ 97 | unset SSH_PRIVATE_RSA_KEY_PATH 98 | 99 | ## ~/.ssh/id_dsa 100 | 101 | [[ ! -z "$SSH_PRIVATE_DSA_KEY" ]] && \ 102 | echo "$SSH_PRIVATE_DSA_KEY" > ~/.ssh/id_dsa && \ 103 | chmod 600 ~/.ssh/id_dsa && \ 104 | unset SSH_PRIVATE_DSA_KEY 105 | 106 | [[ ! -z "$SSH_PRIVATE_DSA_KEY_B64" ]] && \ 107 | decode_base64 "$SSH_PRIVATE_DSA_KEY_B64" > ~/.ssh/id_dsa && \ 108 | chmod 600 ~/.ssh/id_dsa && \ 109 | unset SSH_PRIVATE_DSA_KEY_B64 110 | 111 | [[ ! -z "$SSH_PRIVATE_DSA_KEY_PATH" && ! -a ~/.ssh/id_dsa ]] && \ 112 | cp "$SSH_PRIVATE_DSA_KEY_PATH" ~/.ssh/id_dsa && \ 113 | chmod 600 ~/.ssh/id_dsa && \ 114 | unset SSH_PRIVATE_DSA_KEY_PATH 115 | 116 | ## ssh debug mode 117 | 118 | [[ ! -z "$SSH_DEBUG" ]] && \ 119 | touch ~/.ssh/config && \ 120 | chmod 600 ~/.ssh/config && \ 121 | echo -e "Host *\n LogLevel DEBUG3" >> ~/.ssh/config && \ 122 | unset SSH_DEBUG 123 | 124 | [[ $1 ]] && exec "$@" 125 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # GLM Examples 2 | This is a directory that collects GLM's implementation over various NLP datasets. 3 | We feel lucky to collaborate with all contributors that share their implementations here. 4 | 5 | ## Make You Pull Requests (PRs) 6 | If you also want to become a contributor of GLM, we encourage you to make PR to this repo according to the following PR rules. 7 | The maintainer will check the validity before accept the PR. 8 | 9 | ### Directory Structure 10 | Each PR should include the code and markdown description in a subdirectory of the current `examples` directory. 
11 | An example subdirectory tree is as follows: 12 | 13 | ``` 14 | └── examples 15 | └── : Huggingface Datasets identifier (recommended) or customized name 16 | ├── README.md 17 | ├── requirements.txt 18 | └── 19 | ``` 20 | 21 | Please exclude data files in the PR as they take up too much space, and describe the method to acquire the data in your `README.md`. 22 | 23 | ### Task Description (README.md) 24 | Please at least include the following sections in your README to help its better use: 25 | 26 | + **Dataset Name**: serves as the markdown title. 27 | + **Authors**: Your name(s), contacts (email), and the url to your homepage(s) (if available). 28 | + **Task Description**: A short paragraph to briefly introduce what the dataset and corresponding task is about. 29 | + **Running Commands**: Provide the bash/shell commands for preprocessing, training, and inference. 30 | + **Results**: Please provide your implementation's final performance, along with other available comparison methods'. Ensure that they are reproducible once using your provided `Running Commands`. 31 | + **Reference**: Proper citation information for the dataset and related comparison methods. 32 | 33 | ### Environment Requirements (requirements.txt) 34 | Please include the necessary python packages in the file for other users to reproduce your results. 35 | 36 | ## Example List (To Be Updated) 37 | TODO 38 | -------------------------------------------------------------------------------- /fp16/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from .fp16util import ( 16 | BN_convert_float, 17 | network_to_half, 18 | prep_param_lists, 19 | model_grads_to_master_grads, 20 | master_params_to_model_params, 21 | tofp16, 22 | to_python_float, 23 | clip_grad_norm, 24 | convert_module, 25 | convert_network, 26 | FP16Model, 27 | ) 28 | 29 | from .fp16 import * 30 | from .loss_scaler import * 31 | -------------------------------------------------------------------------------- /fp16/fp16util.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
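The `fp16` package exports listed above come from `fp16util.py`, which follows. A minimal sketch of the batchnorm-safe conversion helpers on a toy model (the sizes are arbitrary):

```python
import torch.nn as nn
from fp16 import FP16Model, network_to_half

model = nn.Sequential(nn.Linear(16, 16), nn.BatchNorm1d(16), nn.ReLU())

# FP16Model casts inputs to half in forward() and converts the network with
# convert_network(), which skips affine BatchNorm modules so they stay in FP32.
half_model = FP16Model(model)
print(half_model.network[0].weight.dtype)   # torch.float16 (Linear converted)
print(half_model.network[1].weight.dtype)   # torch.float32 (BatchNorm kept)

# network_to_half is the legacy equivalent: a tofp16() cast module in front of the
# BN-safe half-converted network.
legacy = network_to_half(nn.Sequential(nn.Linear(16, 16)))
```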
15 | 16 | import torch 17 | import torch.nn as nn 18 | from torch.autograd import Variable 19 | from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors 20 | 21 | import mpu 22 | 23 | 24 | class tofp16(nn.Module): 25 | """ 26 | Utility module that implements:: 27 | 28 | def forward(self, input): 29 | return input.half() 30 | """ 31 | 32 | def __init__(self): 33 | super(tofp16, self).__init__() 34 | 35 | def forward(self, input): 36 | return input.half() 37 | 38 | 39 | def BN_convert_float(module): 40 | """ 41 | Utility function for network_to_half(). 42 | 43 | Retained for legacy purposes. 44 | """ 45 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: 46 | module.float() 47 | for child in module.children(): 48 | BN_convert_float(child) 49 | return module 50 | 51 | 52 | def network_to_half(network): 53 | """ 54 | Convert model to half precision in a batchnorm-safe way. 55 | 56 | Retained for legacy purposes. It is recommended to use FP16Model. 57 | """ 58 | return nn.Sequential(tofp16(), BN_convert_float(network.half())) 59 | 60 | 61 | def convert_module(module, dtype): 62 | """ 63 | Converts a module's immediate parameters and buffers to dtype. 64 | """ 65 | for param in module.parameters(recurse=False): 66 | if param is not None: 67 | if param.data.dtype.is_floating_point: 68 | param.data = param.data.to(dtype=dtype) 69 | if param._grad is not None and param._grad.data.dtype.is_floating_point: 70 | param._grad.data = param._grad.data.to(dtype=dtype) 71 | 72 | for buf in module.buffers(recurse=False): 73 | if buf is not None and buf.data.dtype.is_floating_point: 74 | buf.data = buf.data.to(dtype=dtype) 75 | 76 | 77 | def convert_network(network, dtype): 78 | """ 79 | Converts a network's parameters and buffers to dtype. 80 | """ 81 | for module in network.modules(): 82 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: 83 | continue 84 | convert_module(module, dtype) 85 | return network 86 | 87 | 88 | class FP16Model(nn.Module): 89 | """ 90 | Convert model to half precision in a batchnorm-safe way. 91 | """ 92 | 93 | def __init__(self, network): 94 | super(FP16Model, self).__init__() 95 | self.network = convert_network(network, dtype=torch.half) 96 | 97 | def forward(self, *inputs): 98 | inputs = tuple(t.half() for t in inputs) 99 | return self.network(*inputs) 100 | 101 | 102 | def backwards_debug_hook(grad): 103 | raise RuntimeError("master_params recieved a gradient in the backward pass!") 104 | 105 | def prep_param_lists(model, flat_master=False): 106 | """ 107 | Creates a list of FP32 master parameters for a given model, as in 108 | `Training Neural Networks with Mixed Precision: Real Examples`_. 109 | 110 | Args: 111 | model (torch.nn.Module): Existing Pytorch model 112 | flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. 113 | Returns: 114 | A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. 115 | 116 | Example:: 117 | 118 | model_params, master_params = prep_param_lists(model) 119 | 120 | .. warning:: 121 | Currently, if ``flat_master=True``, all the model's parameters must be the same type. 
If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. 122 | 123 | .. _`Training Neural Networks with Mixed Precision: Real Examples`: 124 | http://on-demand.gputechconf.com/gtc/2018/video/S81012/ 125 | """ 126 | model_params = [param for param in model.parameters() if param.requires_grad] 127 | 128 | if flat_master: 129 | # Give the user some more useful error messages 130 | try: 131 | # flatten_dense_tensors returns a contiguous flat array. 132 | # http://pytorch.org/docs/master/_modules/torch/_utils.html 133 | master_params = _flatten_dense_tensors([param.data for param in model_params]).float() 134 | except: 135 | print("Error in prep_param_lists: model may contain a mixture of parameters " 136 | "of different types. Use flat_master=False, or use F16_Optimizer.") 137 | raise 138 | master_params = torch.nn.Parameter(master_params) 139 | master_params.requires_grad = True 140 | # master_params.register_hook(backwards_debug_hook) 141 | if master_params.grad is None: 142 | master_params.grad = master_params.new(*master_params.size()) 143 | return model_params, [master_params] 144 | else: 145 | master_params = [param.clone().float().detach() for param in model_params] 146 | for param in master_params: 147 | param.requires_grad = True 148 | return model_params, master_params 149 | 150 | 151 | def model_grads_to_master_grads(model_params, master_params, flat_master=False): 152 | """ 153 | Copy model gradients to master gradients. 154 | 155 | Args: 156 | model_params: List of model parameters created by :func:`prep_param_lists`. 157 | master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. 158 | """ 159 | if flat_master: 160 | # The flattening may incur one more deep copy than is necessary. 161 | master_params[0].grad.data.copy_( 162 | _flatten_dense_tensors([p.grad.data for p in model_params])) 163 | else: 164 | for model, master in zip(model_params, master_params): 165 | if model.grad is not None: 166 | if master.grad is None: 167 | master.grad = Variable(master.data.new(*master.data.size())) 168 | master.grad.data.copy_(model.grad.data) 169 | else: 170 | master.grad = None 171 | 172 | 173 | def master_params_to_model_params(model_params, master_params, flat_master=False): 174 | """ 175 | Copy master parameters to model parameters. 176 | 177 | Args: 178 | model_params: List of model parameters created by :func:`prep_param_lists`. 179 | master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. 
180 | """ 181 | if flat_master: 182 | for model, master in zip(model_params, 183 | _unflatten_dense_tensors(master_params[0].data, model_params)): 184 | model.data.copy_(master) 185 | else: 186 | for model, master in zip(model_params, master_params): 187 | model.data.copy_(master.data) 188 | 189 | # Backward compatibility fixes 190 | 191 | def to_python_float(t): 192 | if hasattr(t, 'item'): 193 | return t.item() 194 | else: 195 | return t[0] 196 | 197 | TORCH_MAJOR = int(torch.__version__.split('.')[0]) 198 | TORCH_MINOR = int(torch.__version__.split('.')[1]) 199 | 200 | clip_grad_norm = mpu.clip_grad_norm 201 | #elif TORCH_MAJOR == 0 and TORCH_MINOR <= 4: 202 | # clip_grad_norm = torch.nn.utils.clip_grad_norm 203 | #else: 204 | # clip_grad_norm = torch.nn.utils.clip_grad_norm_ 205 | -------------------------------------------------------------------------------- /learning_rates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch DataLoader for TFRecords""" 16 | 17 | import torch 18 | from torch.optim.lr_scheduler import _LRScheduler 19 | import math 20 | 21 | 22 | class AnnealingLR(_LRScheduler): 23 | """Anneals the learning rate from start to zero along a cosine curve.""" 24 | 25 | DECAY_STYLES = ['linear', 'cosine', 'exponential', 'constant', 'None'] 26 | 27 | def __init__(self, optimizer, start_lr, warmup_iter, num_iters, decay_style=None, last_iter=-1, decay_ratio=0.5): 28 | assert warmup_iter <= num_iters 29 | self.optimizer = optimizer 30 | self.start_lr = start_lr 31 | self.warmup_iter = warmup_iter 32 | self.num_iters = last_iter + 1 33 | self.end_iter = num_iters 34 | self.decay_style = decay_style.lower() if isinstance(decay_style, str) else None 35 | self.decay_ratio = 1 / decay_ratio 36 | self.step(self.num_iters) 37 | if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: 38 | print(f'learning rate decaying style {self.decay_style}, ratio {self.decay_ratio}') 39 | 40 | def get_lr(self): 41 | # https://openreview.net/pdf?id=BJYwwY9ll pg. 
4 42 | if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter: 43 | return float(self.start_lr) * self.num_iters / self.warmup_iter 44 | else: 45 | if self.decay_style == self.DECAY_STYLES[0]: 46 | decay_step_ratio = (self.num_iters - self.warmup_iter) / self.end_iter 47 | return self.start_lr - self.start_lr * (1 - 1 / self.decay_ratio) * decay_step_ratio 48 | elif self.decay_style == self.DECAY_STYLES[1]: 49 | decay_step_ratio = min(1.0, (self.num_iters - self.warmup_iter) / self.end_iter) 50 | return self.start_lr / self.decay_ratio * ( 51 | (math.cos(math.pi * decay_step_ratio) + 1) * (self.decay_ratio - 1) / 2 + 1) 52 | elif self.decay_style == self.DECAY_STYLES[2]: 53 | # TODO: implement exponential decay 54 | return self.start_lr 55 | else: 56 | return self.start_lr 57 | 58 | def step(self, step_num=None): 59 | if step_num is None: 60 | step_num = self.num_iters + 1 61 | self.num_iters = step_num 62 | new_lr = self.get_lr() 63 | for group in self.optimizer.param_groups: 64 | group['lr'] = new_lr 65 | 66 | def state_dict(self): 67 | sd = { 68 | # 'start_lr': self.start_lr, 69 | 'warmup_iter': self.warmup_iter, 70 | 'num_iters': self.num_iters, 71 | 'decay_style': self.decay_style, 72 | 'end_iter': self.end_iter, 73 | 'decay_ratio': self.decay_ratio 74 | } 75 | return sd 76 | 77 | def load_state_dict(self, sd): 78 | # self.start_lr = sd['start_lr'] 79 | self.warmup_iter = sd['warmup_iter'] 80 | self.num_iters = sd['num_iters'] 81 | # self.end_iter = sd['end_iter'] 82 | # self.decay_style = sd['decay_style'] 83 | # if 'decay_ratio' in sd: 84 | # self.decay_ratio = sd['decay_ratio'] 85 | self.step(self.num_iters) 86 | 87 | def switch_linear(self, args): 88 | current_lr = self.get_lr() 89 | self.start_lr = current_lr 90 | self.end_iter = args.train_iters - self.num_iters 91 | self.num_iters = 0 92 | self.decay_style = "linear" 93 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .distributed import PyTorchDistributedDataParallel, DistributedDataParallel 17 | from .modeling_glm import GLMModel, glm_get_params_for_weight_decay_optimization 18 | from .downstream import GLMForMultiTokenCloze, GLMForMultiTokenClozeFast, GLMForSingleTokenCloze, \ 19 | GLMForSequenceClassification 20 | -------------------------------------------------------------------------------- /model/distributed.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
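A minimal sketch tracing the warmup-plus-cosine schedule implemented by `AnnealingLR` above, with a toy optimizer and arbitrary step values. Note that because the constructor stores `decay_ratio = 1 / decay_ratio`, the cosine branch decays toward `start_lr * decay_ratio` (half the peak for the default 0.5) rather than all the way to zero:

```python
import torch
from learning_rates import AnnealingLR

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.SGD(params, lr=1e-4)
scheduler = AnnealingLR(optimizer, start_lr=1e-4, warmup_iter=100, num_iters=1000,
                        decay_style='cosine', decay_ratio=0.5)

for step in (50, 100, 550, 1100):
    scheduler.step(step)
    print(step, scheduler.get_lr())
# 50   -> 5.0e-05   (linear warmup: start_lr * step / warmup_iter)
# 100  -> 1.0e-04   (peak)
# 550  -> ~7.9e-05  (cosine decay)
# 1100 -> 5.0e-05   (floor: start_lr * 0.5, since decay_step_ratio is capped at 1)
```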
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors 18 | import torch.distributed as dist 19 | from torch.nn.modules import Module 20 | from torch.autograd import Variable 21 | from torch.nn.parallel.distributed import DistributedDataParallel as DDP 22 | 23 | import mpu 24 | 25 | 26 | class PyTorchDistributedDataParallel(DDP): 27 | def named_parameters(self, prefix: str = '', recurse: bool = True): 28 | return self.module.named_parameters(prefix=prefix, recurse=recurse) 29 | 30 | def state_dict(self, destination=None, prefix='', keep_vars=False): 31 | sd = self.module.state_dict(destination, prefix, keep_vars) 32 | return sd 33 | 34 | def load_state_dict(self, state_dict, strict=True): 35 | return self.module.load_state_dict(state_dict, strict=strict) 36 | 37 | 38 | class DistributedDataParallel(Module): 39 | 40 | def __init__(self, module): 41 | super(DistributedDataParallel, self).__init__() 42 | self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False 43 | 44 | self.module = module 45 | self.data_parallel_group = mpu.get_data_parallel_group() 46 | src_rank = mpu.get_model_parallel_rank() 47 | for p in self.module.parameters(): 48 | if torch.is_tensor(p): 49 | dist.broadcast(p, src_rank, group=self.data_parallel_group) 50 | 51 | def allreduce_params(reduce_after=True, no_scale=False, fp32_allreduce=False): 52 | if (self.needs_reduction): 53 | self.needs_reduction = False 54 | buckets = {} 55 | for name, param in self.module.named_parameters(): 56 | if param.requires_grad and param.grad is not None: 57 | tp = (param.data.type()) 58 | if tp not in buckets: 59 | buckets[tp] = [] 60 | buckets[tp].append(param) 61 | if self.warn_on_half: 62 | if torch.cuda.HalfTensor in buckets: 63 | print("WARNING: gloo dist backend for half parameters may be extremely slow." 
+ 64 | " It is recommended to use the NCCL backend in this case.") 65 | self.warn_on_half = False 66 | for tp in buckets: 67 | bucket = buckets[tp] 68 | grads = [param.grad.data for param in bucket] 69 | coalesced = _flatten_dense_tensors(grads) 70 | if fp32_allreduce: 71 | coalesced = coalesced.float() 72 | if not no_scale and not reduce_after: 73 | coalesced /= dist.get_world_size(group=self.data_parallel_group) 74 | dist.all_reduce(coalesced, group=self.data_parallel_group) 75 | torch.cuda.synchronize() 76 | if not no_scale and reduce_after: 77 | coalesced /= dist.get_world_size(group=self.data_parallel_group) 78 | for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): 79 | buf.copy_(synced) 80 | 81 | self.hook_handles = [] 82 | self.hooks = [] 83 | for param in list(self.module.parameters()): 84 | def allreduce_hook(*unused): 85 | Variable._execution_engine.queue_callback(allreduce_params) 86 | # handle = param.register_hook(allreduce_hook) 87 | # self.hooks.append(allreduce_hook) 88 | # self.hook_handles.append(handle) 89 | self.allreduce_params = allreduce_params 90 | 91 | def forward(self, *inputs, **kwargs): 92 | self.needs_reduction = True 93 | return self.module(*inputs, **kwargs) 94 | 95 | def state_dict(self, destination=None, prefix='', keep_vars=False): 96 | # [h.remove() for h in self.hook_handles] 97 | sd = self.module.state_dict(destination, prefix, keep_vars) 98 | return sd 99 | 100 | def load_state_dict(self, state_dict, strict=True): 101 | return self.module.load_state_dict(state_dict, strict=strict) 102 | 103 | def named_parameters(self, prefix: str = '', recurse: bool = True): 104 | return self.module.named_parameters(prefix=prefix, recurse=recurse) 105 | 106 | ''' 107 | def _sync_buffers(self): 108 | buffers = list(self.module._all_buffers()) 109 | if len(buffers) > 0: 110 | # cross-node buffer sync 111 | flat_buffers = _flatten_dense_tensors(buffers) 112 | dist.broadcast(flat_buffers, 0) 113 | for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): 114 | buf.copy_(synced) 115 | def train(self, mode=True): 116 | # Clear NCCL communicator and CUDA event cache of the default group ID, 117 | # These cache will be recreated at the later call. This is currently a 118 | # work-around for a potential NCCL deadlock. 
119 | if dist._backend == dist.dist_backend.NCCL: 120 | dist._clear_group_cache() 121 | super(DistributedDataParallel, self).train(mode) 122 | self.module.train(mode) 123 | ''' 124 | -------------------------------------------------------------------------------- /model/prompt.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | 4 | 5 | class PromptSpell(torch.nn.Module): 6 | def __init__(self, spell_length, hidden_size, spell_func): 7 | super(PromptSpell, self).__init__() 8 | self.spell_length = spell_length 9 | self.hidden_size = hidden_size 10 | self.spell_embeddings = torch.nn.Embedding(self.spell_length, self.hidden_size) 11 | self.spell_func = spell_func 12 | if self.spell_func == "lstm": 13 | self.lstm_head = torch.nn.LSTM(input_size=self.hidden_size, 14 | hidden_size=self.hidden_size, 15 | num_layers=2, 16 | # dropout=self.lstm_dropout, 17 | bidirectional=True, 18 | batch_first=True) # .to(torch.device("cuda")) 19 | self.mlp_head = torch.nn.Sequential(torch.nn.Linear(2 * self.hidden_size, self.hidden_size), 20 | torch.nn.ReLU(), 21 | torch.nn.Linear(self.hidden_size, self.hidden_size)) 22 | elif self.spell_func == "mlp": 23 | self.mlp_head = torch.nn.Sequential(torch.nn.Linear(self.hidden_size, self.hidden_size), 24 | torch.nn.ReLU(), 25 | torch.nn.Linear(self.hidden_size, self.hidden_size)) 26 | elif self.spell_func != "none": 27 | raise NotImplementedError("Prompt function " + self.spell_func) 28 | 29 | def init_embedding(self, word_embeddings=None, task_tokens=None): 30 | num_words = 5000 31 | with torch.no_grad(): 32 | for i in range(self.spell_length): 33 | rand_token = random.randrange(num_words) 34 | if task_tokens is None: 35 | target_embedding = word_embeddings[rand_token] 36 | else: 37 | word_embedding = word_embeddings[rand_token] 38 | task_token = random.choice(task_tokens) 39 | task_embedding = word_embeddings[task_token] 40 | ratio = random.random() 41 | target_embedding = word_embedding * ratio + task_embedding * (1 - ratio) 42 | self.spell_embeddings.weight.data[i] = target_embedding 43 | 44 | def forward(self): 45 | prompt_embeds = self.spell_embeddings.weight.unsqueeze(0) 46 | if self.spell_func == "lstm": 47 | prompt_embeds = self.lstm_head(prompt_embeds)[0] 48 | if self.spell_func == "lstm" or self.spell_func == "mlp": 49 | prompt_embeds = self.mlp_head(prompt_embeds) 50 | return prompt_embeds 51 | -------------------------------------------------------------------------------- /mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
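`PromptSpell` above implements a P-tuning-style continuous prompt: a small table of trainable "spell" embeddings, optionally reparameterized through an LSTM or MLP head. A minimal usage sketch with made-up sizes:

```python
import torch
from model.prompt import PromptSpell

spell = PromptSpell(spell_length=8, hidden_size=64, spell_func="lstm")

# Seed the prompt vectors from (here random) word embeddings, as init_embedding() expects.
word_embeddings = torch.nn.Embedding(30000, 64).weight
spell.init_embedding(word_embeddings)

prompt_embeds = spell()          # LSTM + MLP head applied to the spell table
print(prompt_embeds.shape)       # torch.Size([1, 8, 64])
```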
15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .grads import clip_grad_norm 23 | 24 | from .initialize import destroy_model_parallel 25 | from .initialize import get_data_parallel_group 26 | from .initialize import get_data_parallel_rank 27 | from .initialize import get_data_parallel_world_size 28 | from .initialize import get_model_parallel_group 29 | from .initialize import get_model_parallel_rank 30 | from .initialize import get_model_parallel_src_rank 31 | from .initialize import get_model_parallel_world_size 32 | from .initialize import initialize_model_parallel 33 | from .initialize import model_parallel_is_initialized 34 | 35 | from .layers import ColumnParallelLinear 36 | from .layers import ParallelEmbedding 37 | from .layers import RowParallelLinear 38 | from .layers import VocabParallelEmbedding 39 | 40 | from .mappings import copy_to_model_parallel_region 41 | from .mappings import gather_from_model_parallel_region 42 | from .mappings import reduce_from_model_parallel_region 43 | from .mappings import scatter_to_model_parallel_region 44 | 45 | from .random import checkpoint 46 | from .random import partition_activations_in_checkpoint 47 | from .random import get_cuda_rng_tracker 48 | from .random import model_parallel_cuda_manual_seed 49 | 50 | from .transformer import GPT2ParallelTransformer 51 | from .transformer import LayerNorm 52 | -------------------------------------------------------------------------------- /mpu/cross_entropy.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | from .initialize import get_model_parallel_group 20 | from .initialize import get_model_parallel_rank 21 | from .initialize import get_model_parallel_world_size 22 | from .utils import VocabUtility 23 | 24 | 25 | class _VocabParallelCrossEntropy(torch.autograd.Function): 26 | 27 | @staticmethod 28 | def forward(ctx, vocab_parallel_logits, target): 29 | 30 | # Copy so the input remains unchanged. 31 | logits = vocab_parallel_logits.clone() 32 | # Maximum value along vocab dimension across all GPUs. 33 | logits_max = torch.max(logits, dim=-1)[0] 34 | torch.distributed.all_reduce(logits_max, 35 | op=torch.distributed.ReduceOp.MAX, 36 | group=get_model_parallel_group()) 37 | # Subtract the maximum value. 38 | logits.sub_(logits_max.unsqueeze(dim=-1)) 39 | # Sum of exponential of logits along vocab dimension across all GPUs. 
40 | exp_logits = logits.exp() 41 | sum_exp_logits = exp_logits.sum(dim=-1) 42 | torch.distributed.all_reduce(sum_exp_logits, 43 | op=torch.distributed.ReduceOp.SUM, 44 | group=get_model_parallel_group()) 45 | 46 | # Get the partition's vocab indecies 47 | get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size 48 | partition_vocab_size = vocab_parallel_logits.size()[-1] 49 | rank = get_model_parallel_rank() 50 | world_size = get_model_parallel_world_size() 51 | vocab_start_index, vocab_end_index = get_vocab_range( 52 | partition_vocab_size, rank, world_size) 53 | 54 | # Create a mask of valid vocab ids (1 means it needs to be masked). 55 | target_mask = (target < vocab_start_index) | (target >= vocab_end_index) 56 | masked_target = target.clone() - vocab_start_index 57 | masked_target[target_mask] = 0 58 | 59 | # Get predicted-logits = logits[target]. 60 | # For Simplicity, we convert logits to a 2-D tensor with size 61 | # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. 62 | logits_2d = logits.view(-1, partition_vocab_size) 63 | masked_target_1d = masked_target.view(-1) 64 | arange_1d = torch.arange(start=0, end=logits_2d.size()[0], 65 | device=logits_2d.device) 66 | predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] 67 | predicted_logits = predicted_logits_1d.view_as(target) 68 | predicted_logits[target_mask] = 0.0 69 | # All reduce is needed to get the chunks from other GPUs. 70 | torch.distributed.all_reduce(predicted_logits, 71 | op=torch.distributed.ReduceOp.SUM, 72 | group=get_model_parallel_group()) 73 | 74 | # Loss = log(sum(exp(logits))) - predicted-logit. 75 | loss = torch.log(sum_exp_logits) - predicted_logits 76 | 77 | # Store softmax, target-mask and masked-target for backward pass. 78 | exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) 79 | ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) 80 | 81 | return loss 82 | 83 | @staticmethod 84 | def backward(ctx, grad_output): 85 | 86 | # Retreive tensors from the forward path. 87 | softmax, target_mask, masked_target_1d = ctx.saved_tensors 88 | 89 | # All the inputs have softmax as thier gradient. 90 | grad_input = softmax 91 | # For simplicity, work with the 2D gradient. 92 | partition_vocab_size = softmax.size()[-1] 93 | grad_2d = grad_input.view(-1, partition_vocab_size) 94 | 95 | # Add the gradient from matching classes. 96 | arange_1d = torch.arange(start=0, end=grad_2d.size()[0], 97 | device=grad_2d.device) 98 | grad_2d[arange_1d, masked_target_1d] -= ( 99 | 1.0 - target_mask.view(-1).float()) 100 | 101 | # Finally elementwise multiplication with the output gradients. 102 | grad_input.mul_(grad_output.unsqueeze(dim=-1)) 103 | 104 | return grad_input, None 105 | 106 | 107 | def vocab_parallel_cross_entropy(vocab_parallel_logits, target): 108 | """Helper function for the cross entropy.""" 109 | return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) 110 | -------------------------------------------------------------------------------- /mpu/data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
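`_VocabParallelCrossEntropy` above computes, per token, `log(sum(exp(logits))) - predicted_logit` with the row maximum subtracted for numerical stability; the max, the exp-sum, and the picked target logits are all-reduced so each rank only needs its own vocabulary shard. A non-parallel sketch of the same identity, checked against PyTorch's cross entropy:

```python
import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)                      # [tokens, vocab]
target = torch.randint(0, 10, (4,))

shifted = logits - logits.max(dim=-1, keepdim=True).values
loss = torch.log(shifted.exp().sum(dim=-1)) \
       - shifted.gather(-1, target.unsqueeze(-1)).squeeze(-1)

print(torch.allclose(loss, F.cross_entropy(logits, target, reduction='none')))  # True
```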
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | from .initialize import get_model_parallel_group 19 | from .initialize import get_model_parallel_rank 20 | from .initialize import get_model_parallel_src_rank 21 | 22 | 23 | _MAX_DATA_DIM = 5 24 | 25 | 26 | def _check_data_types(keys, data, target_dtype): 27 | """Check that all the keys have the same target data type.""" 28 | for key in keys: 29 | assert data[key].dtype == target_dtype, '{} has data type {} which '\ 30 | 'is different than {}'.format(key, data[key].dtype, target_dtype) 31 | 32 | 33 | def _build_key_size_numel_dictionaries(keys, data): 34 | """Build the size on rank 0 and broadcast.""" 35 | max_dim = _MAX_DATA_DIM 36 | sizes = [0 for _ in range(max_dim) for _ in keys] 37 | 38 | # Pack the sizes on rank zero. 39 | if get_model_parallel_rank() == 0: 40 | offset = 0 41 | for key in keys: 42 | assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM' 43 | size = data[key].size() 44 | for i, s in enumerate(size): 45 | sizes[i + offset] = s 46 | offset += max_dim 47 | 48 | # Move to GPU and broadcast. 49 | sizes_cuda = torch.cuda.LongTensor(sizes) 50 | torch.distributed.broadcast(sizes_cuda, get_model_parallel_src_rank(), 51 | group=get_model_parallel_group()) 52 | 53 | # Move back to cpu and unpack. 54 | sizes_cpu = sizes_cuda.cpu() 55 | key_size = {} 56 | key_numel = {} 57 | total_numel = 0 58 | offset = 0 59 | for key in keys: 60 | i = 0 61 | size = [] 62 | numel = 1 63 | while sizes_cpu[offset + i] > 0: 64 | this_size = sizes_cpu[offset + i] 65 | size.append(this_size) 66 | numel *= this_size 67 | i += 1 68 | key_size[key] = size 69 | key_numel[key] = numel 70 | total_numel += numel 71 | offset += max_dim 72 | 73 | return key_size, key_numel, total_numel 74 | 75 | 76 | def broadcast_data(keys, data, datatype): 77 | """Broadcast data from rank zero of each model parallel group to the 78 | members of the same model parallel group. 79 | 80 | Arguments: 81 | keys: list of keys in the data disctionary to be broadcasted 82 | data: data dictionary of string keys and cpu tensor values. 83 | datatype: torch data type of all tensors in data associated 84 | with keys. 85 | """ 86 | # Build (key, size) and (key, number of elements) dictionaries along 87 | # with the total number of elements on all ranks. 88 | key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, 89 | data) 90 | 91 | # Pack on rank zero. 92 | if get_model_parallel_rank() == 0: 93 | # Check that all keys have the same data type. 
94 | _check_data_types(keys, data, datatype) 95 | # Flatten the data associated with the keys 96 | flatten_data = torch.cat( 97 | [data[key].contiguous().view(-1) for key in keys], dim=0).cuda() 98 | else: 99 | flatten_data = torch.empty(total_numel, 100 | device=torch.cuda.current_device(), 101 | dtype=datatype) 102 | 103 | # Boradcast 104 | torch.distributed.broadcast(flatten_data, get_model_parallel_src_rank(), 105 | group=get_model_parallel_group()) 106 | 107 | # Unpack 108 | output = {} 109 | offset = 0 110 | for key in keys: 111 | size = key_size[key] 112 | numel = key_numel[key] 113 | output[key] = flatten_data.narrow(0, offset, numel).view(size) 114 | offset += numel 115 | 116 | return output 117 | -------------------------------------------------------------------------------- /mpu/grads.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | # Parts of the code here are adapted from PyTorch 18 | # repo: https://github.com/pytorch/pytorch 19 | 20 | 21 | import torch 22 | from torch._six import inf 23 | 24 | from .initialize import get_model_parallel_group 25 | from .initialize import get_model_parallel_rank 26 | 27 | 28 | def clip_grad_norm(parameters, max_norm, norm_type=2): 29 | """Clips gradient norm of an iterable of parameters. 30 | 31 | This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and 32 | added functionality to handle model parallel parameters. Note that 33 | the gradients are modified in place. 34 | 35 | Arguments: 36 | parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a 37 | single Tensor that will have gradients normalized 38 | max_norm (float or int): max norm of the gradients 39 | norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for 40 | infinity norm. 41 | 42 | Returns: 43 | Total norm of the parameters (viewed as a single vector). 44 | """ 45 | if isinstance(parameters, torch.Tensor): 46 | parameters = [parameters] 47 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 48 | max_norm = float(max_norm) 49 | norm_type = float(norm_type) 50 | if norm_type == inf: 51 | total_norm = max(p.grad.data.abs().max() for p in parameters) 52 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) 53 | # Take max across all GPUs. 54 | torch.distributed.all_reduce(total_norm_cuda, 55 | op=torch.distributed.ReduceOp.MAX, 56 | group=get_model_parallel_group()) 57 | total_norm = total_norm_cuda[0].item() 58 | else: 59 | total_norm = 0 60 | for p in parameters: 61 | if p.model_parallel or (get_model_parallel_rank() == 0): 62 | param_norm = p.grad.data.norm(norm_type) 63 | total_norm += param_norm.item() ** norm_type 64 | # Sum across all model parallel GPUs. 
65 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) 66 | torch.distributed.all_reduce(total_norm_cuda, 67 | op=torch.distributed.ReduceOp.SUM, 68 | group=get_model_parallel_group()) 69 | total_norm = total_norm_cuda[0].item() ** (1. / norm_type) 70 | clip_coef = max_norm / (total_norm + 1e-6) 71 | if clip_coef < 1: 72 | for p in parameters: 73 | p.grad.data.mul_(clip_coef) 74 | return total_norm 75 | -------------------------------------------------------------------------------- /mpu/initialize.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | """Model and data parallel groups.""" 18 | 19 | import torch 20 | 21 | from .utils import ensure_divisibility 22 | 23 | 24 | # Model parallel group that the current rank belongs to. 25 | _MODEL_PARALLEL_GROUP = None 26 | # Data parallel group that the current rank belongs to. 27 | _DATA_PARALLEL_GROUP = None 28 | 29 | 30 | def initialize_model_parallel(model_parallel_size_): 31 | """ 32 | Initialize model data parallel groups. 33 | 34 | Arguments: 35 | model_parallel_size: number of GPUs used to parallelize model. 36 | 37 | Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we 38 | use 2 GPUs to parallelize the model. The present function will 39 | create 4 model parallel groups and 2 data parallel grous as: 40 | 4 model parallel groups: 41 | [g0, g1], [g2, g3], [g4, g5], [g6, g7] 42 | 2 data parallel groups: 43 | [g0, g2, g4, g6], [g1, g3, g5, g7] 44 | Note that for efficiency, the caller should make sure adjacent ranks 45 | are on the same DGX box. For example if we are using 2 DGX-1 boxes 46 | with a total of 16 GPUs, rank 0 to 7 belong to the first box and 47 | ranks 8 to 15 belong to the second box. 48 | """ 49 | if torch.distributed.get_rank() == 0: 50 | print('> initializing model parallel with size {}'.format( 51 | model_parallel_size_)) 52 | # Get world size and rank. Ensure some consistencies. 53 | assert torch.distributed.is_initialized() 54 | world_size = torch.distributed.get_world_size() 55 | model_parallel_size = min(model_parallel_size_, world_size) 56 | ensure_divisibility(world_size, model_parallel_size) 57 | rank = torch.distributed.get_rank() 58 | 59 | # Build the data parallel groups. 60 | global _DATA_PARALLEL_GROUP 61 | assert _DATA_PARALLEL_GROUP is None, \ 62 | 'data parallel group is already initialized' 63 | for i in range(model_parallel_size): 64 | ranks = range(i, world_size, model_parallel_size) 65 | group = torch.distributed.new_group(ranks) 66 | if i == (rank % model_parallel_size): 67 | _DATA_PARALLEL_GROUP = group 68 | 69 | # Build the model parallel groups. 
70 | global _MODEL_PARALLEL_GROUP 71 | assert _MODEL_PARALLEL_GROUP is None, \ 72 | 'model parallel group is already initialized' 73 | for i in range(world_size // model_parallel_size): 74 | ranks = range(i * model_parallel_size, 75 | (i + 1) * model_parallel_size) 76 | group = torch.distributed.new_group(ranks) 77 | if i == (rank // model_parallel_size): 78 | _MODEL_PARALLEL_GROUP = group 79 | 80 | 81 | def model_parallel_is_initialized(): 82 | """Check if model and data parallel groups are initialized.""" 83 | if _MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None: 84 | return False 85 | return True 86 | 87 | 88 | def get_model_parallel_group(): 89 | """Get the model parallel group the caller rank belongs to.""" 90 | assert _MODEL_PARALLEL_GROUP is not None, \ 91 | 'model parallel group is not initialized' 92 | return _MODEL_PARALLEL_GROUP 93 | 94 | 95 | def get_data_parallel_group(): 96 | """Get the data parallel group the caller rank belongs to.""" 97 | assert _DATA_PARALLEL_GROUP is not None, \ 98 | 'data parallel group is not initialized' 99 | return _DATA_PARALLEL_GROUP 100 | 101 | 102 | def get_model_parallel_world_size(): 103 | """Return world size for the model parallel group.""" 104 | return torch.distributed.get_world_size(group=get_model_parallel_group()) 105 | 106 | 107 | def get_model_parallel_rank(): 108 | """Return my rank for the model parallel group.""" 109 | return torch.distributed.get_rank(group=get_model_parallel_group()) 110 | 111 | 112 | def get_model_parallel_src_rank(): 113 | """Calculate the global rank corresponding to a local rank zeor 114 | in the model parallel group.""" 115 | global_rank = torch.distributed.get_rank() 116 | local_world_size = get_model_parallel_world_size() 117 | return (global_rank // local_world_size) * local_world_size 118 | 119 | 120 | def get_data_parallel_world_size(): 121 | """Return world size for the data parallel group.""" 122 | return torch.distributed.get_world_size(group=get_data_parallel_group()) 123 | 124 | 125 | def get_data_parallel_rank(): 126 | """Return my rank for the data parallel group.""" 127 | return torch.distributed.get_rank(group=get_data_parallel_group()) 128 | 129 | 130 | def destroy_model_parallel(): 131 | """Set the groups to none.""" 132 | global _MODEL_PARALLEL_GROUP 133 | _MODEL_PARALLEL_GROUP = None 134 | global _DATA_PARALLEL_GROUP 135 | _DATA_PARALLEL_GROUP = None 136 | -------------------------------------------------------------------------------- /mpu/mappings.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 
16 | import torch
17 | 
18 | from .initialize import get_model_parallel_group
19 | from .utils import split_tensor_along_last_dim
20 | 
21 | 
22 | def _reduce(input_):
23 |     """All-reduce the input tensor across the model parallel group."""
24 |     group = get_model_parallel_group()
25 | 
26 |     # Bypass the function if we are using only 1 GPU.
27 |     if torch.distributed.get_world_size(group=group) == 1:
28 |         return input_
29 | 
30 |     # All-reduce.
31 |     torch.distributed.all_reduce(input_, group=group)
32 | 
33 |     return input_
34 | 
35 | 
36 | def _split(input_):
37 |     """Split the tensor along its last dimension and keep the
38 |     corresponding slice."""
39 |     group = get_model_parallel_group()
40 | 
41 |     # Bypass the function if we are using only 1 GPU.
42 |     if torch.distributed.get_world_size(group=group) == 1:
43 |         return input_
44 | 
45 |     # Split along last dimension.
46 |     world_size = torch.distributed.get_world_size(group=group)
47 |     input_list = split_tensor_along_last_dim(input_, world_size)
48 | 
49 |     # Note: torch.split does not create contiguous tensors by default.
50 |     rank = torch.distributed.get_rank(group=group)
51 |     output = input_list[rank].contiguous()
52 | 
53 |     return output
54 | 
55 | 
56 | def _gather(input_):
57 |     """Gather tensors and concatenate along the last dimension."""
58 |     group = get_model_parallel_group()
59 | 
60 |     # Bypass the function if we are using only 1 GPU.
61 |     if torch.distributed.get_world_size(group=group) == 1:
62 |         return input_
63 | 
64 |     # Size and dimension.
65 |     last_dim = input_.dim() - 1
66 |     rank = torch.distributed.get_rank(group=group)
67 |     world_size = torch.distributed.get_world_size(group=group)
68 | 
69 |     tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
70 |     tensor_list[rank] = input_
71 |     torch.distributed.all_gather(tensor_list, input_, group=group)
72 | 
73 |     # Note: torch.cat already creates a contiguous tensor.
74 |     output = torch.cat(tensor_list, dim=last_dim).contiguous()
75 | 
76 |     return output
77 | 
78 | 
79 | class _CopyToModelParallelRegion(torch.autograd.Function):
80 |     """Pass the input to the model parallel region."""
81 | 
82 |     @staticmethod
83 |     def forward(ctx, input_):
84 |         return input_
85 | 
86 |     @staticmethod
87 |     def backward(ctx, grad_output):
88 |         return _reduce(grad_output)
89 | 
90 | 
91 | class _ReduceFromModelParallelRegion(torch.autograd.Function):
92 |     """All-reduce the input from the model parallel region."""
93 | 
94 |     @staticmethod
95 |     def forward(ctx, input_):
96 |         return _reduce(input_)
97 | 
98 |     @staticmethod
99 |     def backward(ctx, grad_output):
100 |         return grad_output
101 | 
102 | 
103 | class _ScatterToModelParallelRegion(torch.autograd.Function):
104 |     """Split the input and keep only the chunk corresponding to the rank."""
105 | 
106 |     @staticmethod
107 |     def forward(ctx, input_):
108 |         return _split(input_)
109 | 
110 |     @staticmethod
111 |     def backward(ctx, grad_output):
112 |         return _gather(grad_output)
113 | 
114 | 
115 | class _GatherFromModelParallelRegion(torch.autograd.Function):
116 |     """Gather the input from the model parallel region and concatenate."""
117 | 
118 |     @staticmethod
119 |     def forward(ctx, input_):
120 |         return _gather(input_)
121 | 
122 |     @staticmethod
123 |     def backward(ctx, grad_output):
124 |         return _split(grad_output)
125 | 
126 | 
127 | # -----------------
128 | # Helper functions.
129 | # ----------------- 130 | 131 | def copy_to_model_parallel_region(input_): 132 | return _CopyToModelParallelRegion.apply(input_) 133 | 134 | def reduce_from_model_parallel_region(input_): 135 | return _ReduceFromModelParallelRegion.apply(input_) 136 | 137 | def scatter_to_model_parallel_region(input_): 138 | return _ScatterToModelParallelRegion.apply(input_) 139 | 140 | def gather_from_model_parallel_region(input_): 141 | return _GatherFromModelParallelRegion.apply(input_) 142 | -------------------------------------------------------------------------------- /mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM/4f61ed7237a3b0187f4d62062429348276a78c84/mpu/tests/__init__.py -------------------------------------------------------------------------------- /mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import argparse 17 | import os 18 | import random 19 | import numpy 20 | import torch 21 | 22 | import mpu 23 | 24 | 25 | class IdentityLayer(torch.nn.Module): 26 | def __init__(self, size, scale=1.0): 27 | super(IdentityLayer, self).__init__() 28 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 29 | def forward(self): 30 | return self.weight 31 | 32 | 33 | def set_random_seed(seed): 34 | """Set random seed for reproducability.""" 35 | random.seed(seed) 36 | numpy.random.seed(seed) 37 | torch.manual_seed(seed) 38 | mpu.model_parallel_cuda_manual_seed(seed) 39 | 40 | 41 | def initialize_distributed(backend='nccl'): 42 | """Initialize torch.distributed.""" 43 | # Get local rank in case it is provided. 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('--local_rank', type=int, default=None, 46 | help='local rank passed from distributed launcher') 47 | args = parser.parse_args() 48 | local_rank = args.local_rank 49 | 50 | # Get rank and world size. 51 | rank = int(os.getenv('RANK', '0')) 52 | world_size = int(os.getenv("WORLD_SIZE", '1')) 53 | 54 | print('> initializing torch.distributed with local rank: {}, ' 55 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 56 | 57 | # Set the device id. 58 | device = rank % torch.cuda.device_count() 59 | if local_rank is not None: 60 | device = local_rank 61 | torch.cuda.set_device(device) 62 | 63 | # Call the init process. 
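    # The init method below is assembled from the MASTER_ADDR / MASTER_PORT
    # environment variables, e.g. 'tcp://localhost:6000' with the defaults
    # used here.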
64 | init_method = 'tcp://' 65 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 66 | master_port = os.getenv('MASTER_PORT', '6000') 67 | init_method += master_ip + ':' + master_port 68 | torch.distributed.init_process_group( 69 | backend=backend, 70 | world_size=world_size, 71 | rank=rank, 72 | init_method=init_method) 73 | 74 | 75 | def print_separator(message): 76 | torch.distributed.barrier() 77 | filler_len = (78 - len(message)) // 2 78 | filler = '-' * filler_len 79 | string = '\n' + filler + ' {} '.format(message) + filler 80 | if torch.distributed.get_rank() == 0: 81 | print(string, flush=True) 82 | torch.distributed.barrier() 83 | -------------------------------------------------------------------------------- /mpu/tests/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import random 17 | import sys 18 | sys.path.append("../..") 19 | 20 | import torch 21 | import torch.nn.functional as F 22 | import mpu 23 | from mpu.cross_entropy import vocab_parallel_cross_entropy 24 | 25 | from commons import initialize_distributed 26 | from commons import print_separator 27 | from commons import IdentityLayer 28 | from commons import set_random_seed 29 | 30 | 31 | def torch_cross_entropy(batch_size, seq_length, vocab_size, 32 | logits_scale, seed): 33 | set_random_seed(seed) 34 | identity = IdentityLayer((batch_size, seq_length, vocab_size), 35 | scale=logits_scale).cuda() 36 | logits = identity() 37 | target = torch.cuda.LongTensor( 38 | size=(batch_size, seq_length)).random_(0, vocab_size) 39 | loss = F.cross_entropy(logits.view(-1, logits.size()[-1]), 40 | target.view(-1), 41 | reduction='none').view_as(target).mean() 42 | loss.backward() 43 | return loss, identity.weight.grad 44 | 45 | 46 | def mpu_cross_entropy(batch_size, seq_length, vocab_size, 47 | logits_scale, seed): 48 | set_random_seed(seed) 49 | identity = IdentityLayer((batch_size, seq_length, vocab_size), 50 | scale=logits_scale).cuda() 51 | logits = identity() 52 | logits_parallel = mpu.scatter_to_model_parallel_region(logits) 53 | target = torch.cuda.LongTensor( 54 | size=(batch_size, seq_length)).random_(0, vocab_size) 55 | loss = vocab_parallel_cross_entropy(logits_parallel, target).mean() 56 | loss.backward() 57 | return loss, identity.weight.grad 58 | 59 | 60 | def test_cross_entropy(model_parallel_size): 61 | 62 | if torch.distributed.get_rank() == 0: 63 | print('> testing cross entropy with model parallel size {} ...'. 
64 | format(model_parallel_size)) 65 | 66 | mpu.initialize_model_parallel(model_parallel_size) 67 | model_parallel_size = mpu.get_model_parallel_world_size() 68 | 69 | batch_size = 13 70 | seq_length = 17 71 | vocab_size_per_partition = 11 72 | logits_scale = 1000.0 73 | vocab_size = vocab_size_per_partition * model_parallel_size 74 | seed = 1234 75 | 76 | loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, 77 | vocab_size, logits_scale, 78 | seed) 79 | loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, 80 | vocab_size, logits_scale, 81 | seed) 82 | 83 | error = loss_torch.sub_(loss_mpu).abs().max() 84 | print(' max error in loss on global rank {}: {}'.format( 85 | torch.distributed.get_rank(), error)) 86 | assert error < 1.0e-6 87 | 88 | error = grad_torch.sub_(grad_mpu).abs().max() 89 | print(' max error in grad on global rank {}: {}'.format( 90 | torch.distributed.get_rank(), error)) 91 | assert error < 1.0e-6 92 | 93 | # Reset groups 94 | mpu.destroy_model_parallel() 95 | 96 | torch.distributed.barrier() 97 | if torch.distributed.get_rank() == 0: 98 | print('>> passed the test :-)') 99 | 100 | 101 | if __name__ == '__main__': 102 | 103 | initialize_distributed() 104 | world_size = torch.distributed.get_world_size() 105 | 106 | model_parallel_size = 1 107 | while model_parallel_size <= world_size: 108 | print_separator('test cross entropy') 109 | test_cross_entropy(model_parallel_size) 110 | model_parallel_size *= 2 111 | -------------------------------------------------------------------------------- /mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import functools 17 | import operator 18 | import sys 19 | sys.path.append("../..") 20 | 21 | import torch 22 | import mpu 23 | from mpu import data as data_utils 24 | 25 | from commons import initialize_distributed 26 | from commons import print_separator 27 | 28 | 29 | def test_boradcast_data(model_parallel_size): 30 | 31 | if torch.distributed.get_rank() == 0: 32 | print('> testing boradcast_data with model parallel size {} ...'. 
33 | format(model_parallel_size)) 34 | 35 | mpu.initialize_model_parallel(model_parallel_size) 36 | torch.manual_seed(1234 + mpu.get_data_parallel_rank()) 37 | model_parallel_size = mpu.get_model_parallel_world_size() 38 | 39 | key_size_t = {'key1': [7, 11], 40 | 'key2': [8, 2, 1], 41 | 'key3': [13], 42 | 'key4': [5, 1, 2], 43 | 'key5': [5, 12]} 44 | keys = list(key_size_t.keys()) 45 | 46 | data = {} 47 | data_t = {} 48 | for key in key_size_t: 49 | data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) 50 | data_t[key] = data[key].clone() 51 | data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) 52 | data_t['keyX'] = data['keyX'].clone() 53 | if mpu.get_model_parallel_rank() != 0: 54 | data = None 55 | 56 | data_utils._check_data_types(keys, data_t, torch.int64) 57 | key_size, key_numel, \ 58 | total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) 59 | for key in keys: 60 | assert key_size[key] == key_size_t[key] 61 | total_numel_t = 0 62 | for key in keys: 63 | target_size = functools.reduce(operator.mul, key_size_t[key], 1) 64 | assert key_numel[key] == target_size 65 | total_numel_t += target_size 66 | assert total_numel == total_numel_t 67 | 68 | data_b = data_utils.broadcast_data(keys, data, torch.int64) 69 | for key in keys: 70 | tensor = data_t[key].cuda() 71 | assert data_b[key].sub(tensor).abs().max() == 0 72 | 73 | # Reset groups 74 | mpu.destroy_model_parallel() 75 | 76 | torch.distributed.barrier() 77 | if torch.distributed.get_rank() == 0: 78 | print('>> passed the test :-)') 79 | 80 | 81 | if __name__ == '__main__': 82 | 83 | initialize_distributed() 84 | world_size = torch.distributed.get_world_size() 85 | 86 | model_parallel_size = 1 87 | while model_parallel_size <= world_size: 88 | print_separator('test test boradcast data') 89 | test_boradcast_data(model_parallel_size) 90 | model_parallel_size *= 2 91 | 92 | 93 | -------------------------------------------------------------------------------- /mpu/tests/test_initialize.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import sys 17 | sys.path.append("../..") 18 | 19 | import torch 20 | import mpu 21 | 22 | from commons import initialize_distributed 23 | from commons import print_separator 24 | 25 | 26 | def test_initialize_model_parallel(model_parallel_size): 27 | 28 | if torch.distributed.get_rank() == 0: 29 | print('> testing initialize_model_parallel with size {} ...'.format( 30 | model_parallel_size)) 31 | model_parallel_size_ = min(model_parallel_size, 32 | torch.distributed.get_world_size()) 33 | assert not mpu.model_parallel_is_initialized() 34 | mpu.initialize_model_parallel(model_parallel_size_) 35 | assert mpu.model_parallel_is_initialized() 36 | 37 | # Checks. 
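    # Small helper: assert that the group's world size and this rank's index
    # within the group match the expected values.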
38 | def check(group, world_size, rank): 39 | assert world_size == torch.distributed.get_world_size(group=group) 40 | assert rank == torch.distributed.get_rank(group=group) 41 | 42 | # Model parallel. 43 | world_size = model_parallel_size_ 44 | rank = torch.distributed.get_rank() % model_parallel_size_ 45 | assert world_size == mpu.get_model_parallel_world_size() 46 | assert rank == mpu.get_model_parallel_rank() 47 | check(mpu.get_model_parallel_group(), world_size, rank) 48 | 49 | 50 | # Data parallel. 51 | world_size = torch.distributed.get_world_size() // model_parallel_size_ 52 | rank = torch.distributed.get_rank() // model_parallel_size 53 | assert world_size == mpu.get_data_parallel_world_size() 54 | assert rank == mpu.get_data_parallel_rank() 55 | check(mpu.get_data_parallel_group(), world_size, rank) 56 | 57 | # Reset groups 58 | mpu.destroy_model_parallel() 59 | 60 | torch.distributed.barrier() 61 | if torch.distributed.get_rank() == 0: 62 | print('>> passed the test :-)') 63 | 64 | 65 | def test_get_model_parallel_src_rank(model_parallel_size_): 66 | 67 | if torch.distributed.get_rank() == 0: 68 | print('> testing get_model_parallel_src_rank with size {} ...'.format( 69 | model_parallel_size_)) 70 | model_parallel_size = min(model_parallel_size_, 71 | torch.distributed.get_world_size()) 72 | assert not mpu.model_parallel_is_initialized() 73 | mpu.initialize_model_parallel(model_parallel_size) 74 | assert mpu.model_parallel_is_initialized() 75 | 76 | # Checks 77 | src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank() 78 | assert mpu.get_model_parallel_src_rank() == src_rank 79 | 80 | # Reset groups 81 | mpu.destroy_model_parallel() 82 | 83 | torch.distributed.barrier() 84 | if torch.distributed.get_rank() == 0: 85 | print('>> passed the test :-)') 86 | 87 | 88 | if __name__ == '__main__': 89 | 90 | initialize_distributed() 91 | world_size = torch.distributed.get_world_size() 92 | model_parallel_size = 1 93 | while model_parallel_size <= world_size: 94 | print_separator('test initialize model parallel') 95 | test_initialize_model_parallel(model_parallel_size) 96 | print_separator('test model parallel source rank') 97 | test_get_model_parallel_src_rank(model_parallel_size) 98 | model_parallel_size *= 2 99 | -------------------------------------------------------------------------------- /mpu/tests/test_random.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import sys 17 | sys.path.append("../..") 18 | 19 | import torch 20 | import mpu 21 | 22 | from commons import initialize_distributed 23 | from commons import print_separator 24 | 25 | 26 | def test_set_cuda_rng_state(model_parallel_size): 27 | 28 | if torch.distributed.get_rank() == 0: 29 | print('> testing set_rng_state with size {} ...'. 
30 | format(model_parallel_size)) 31 | 32 | mpu.initialize_model_parallel(model_parallel_size) 33 | model_parallel_size = mpu.get_model_parallel_world_size() 34 | 35 | size = 123 36 | seed = 1234 37 | torch.cuda.manual_seed(1234) 38 | tensor = torch.cuda.FloatTensor(size) 39 | 40 | # Get the state 41 | rng_state = torch.cuda.get_rng_state() 42 | rng_state_copy = rng_state.clone() 43 | 44 | # Do some stuff. 45 | for _ in range(5): 46 | torch.randn(size, out=tensor) 47 | result_1 = tensor.clone() 48 | 49 | assert rng_state.sub(rng_state_copy).max() == 0 50 | assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0 51 | 52 | # State should be different. 53 | new_rng_state = torch.cuda.get_rng_state() 54 | max_diff = new_rng_state.sub(rng_state).max() 55 | print(' max diff in rng state (should be non-zero) on global rank {}: {}'. 56 | format(torch.distributed.get_rank(), max_diff)) 57 | assert max_diff > 0 58 | 59 | # Reset the rng state and do the same stuff. 60 | mpu.random._set_cuda_rng_state(rng_state) 61 | for _ in range(5): 62 | torch.randn(size, out=tensor) 63 | mpu.random._set_cuda_rng_state(rng_state) 64 | for _ in range(5): 65 | torch.randn(size, out=tensor) 66 | result_2 = tensor.clone() 67 | 68 | # Results should be the same 69 | error = result_2.sub(result_1).abs().max() 70 | print(' max error in generated tensors (should be zero) on ' 71 | 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) 72 | assert error < 1.0e-6 73 | 74 | # Input state should have remained intact. 75 | error = rng_state.sub(rng_state_copy).max() 76 | print(' max error in rng state (should be zero) on global rank {}: {}'. 77 | format(torch.distributed.get_rank(), error)) 78 | assert error == 0 79 | 80 | # Reset groups 81 | mpu.destroy_model_parallel() 82 | 83 | torch.distributed.barrier() 84 | if torch.distributed.get_rank() == 0: 85 | print('>> passed the test :-)') 86 | 87 | 88 | def test_cuda_rng_tracker(model_parallel_size): 89 | 90 | if torch.distributed.get_rank() == 0: 91 | print('> testing cuda rng tracker with size {} ...'. 92 | format(model_parallel_size)) 93 | 94 | mpu.initialize_model_parallel(model_parallel_size) 95 | model_parallel_size = mpu.get_model_parallel_world_size() 96 | 97 | seed_1 = 1234 98 | seed_2 = 4321 99 | size = [12, 21] 100 | tensor = torch.cuda.FloatTensor(size) 101 | 102 | # Set to seed_1 and generate two tensors. 103 | torch.cuda.manual_seed(seed_1) 104 | torch.randn(size, out=tensor) 105 | target_11 = tensor.clone() 106 | torch.randn(size, out=tensor) 107 | target_12 = tensor.clone() 108 | 109 | # Set to seed_2 and generate two tensors. 
110 | torch.cuda.manual_seed(seed_2) 111 | torch.randn(size, out=tensor) 112 | target_21 = tensor.clone() 113 | torch.randn(size, out=tensor) 114 | target_22 = tensor.clone() 115 | 116 | # Now if we interleave seed_1 and seed_2, 117 | # we should still get the same tensors 118 | torch.cuda.manual_seed(seed_1) 119 | mpu.get_cuda_rng_tracker().add('test', seed_2) 120 | 121 | torch.randn(size, out=tensor) 122 | result_11 = tensor.clone() 123 | 124 | with mpu.get_cuda_rng_tracker().fork('test'): 125 | torch.randn(size, out=tensor) 126 | result_21 = tensor.clone() 127 | 128 | torch.randn(size, out=tensor) 129 | result_12 = tensor.clone() 130 | 131 | with mpu.get_cuda_rng_tracker().fork('test'): 132 | torch.randn(size, out=tensor) 133 | result_22 = tensor.clone() 134 | 135 | diff = result_11.sub(result_21).abs().max() 136 | diff = min(diff, result_12.sub(result_22).abs().max()) 137 | print(' max diff in generated tensors (should be non-zero) on ' 138 | 'global rank {}: {}'.format(torch.distributed.get_rank(), diff)) 139 | assert diff > 1.0e-6 140 | error = max(result_11.sub(target_11).abs().max(), 141 | result_12.sub(target_12).abs().max()) 142 | error = max(error, result_21.sub(target_21).abs().max()) 143 | error = max(error, result_22.sub(target_22).abs().max()) 144 | print(' max error in generated tensors (should be zero) on ' 145 | 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) 146 | assert error < 1.0e-6 147 | 148 | # Reset the tracker 149 | mpu.get_cuda_rng_tracker().reset() 150 | 151 | # Reset groups 152 | mpu.destroy_model_parallel() 153 | 154 | torch.distributed.barrier() 155 | if torch.distributed.get_rank() == 0: 156 | print('>> passed the test :-)') 157 | 158 | 159 | def test_model_parallel_cuda_manual_seed(model_parallel_size): 160 | 161 | if torch.distributed.get_rank() == 0: 162 | print('> testing model parallel cuda manual seed with size {} ...'. 
163 | format(model_parallel_size)) 164 | 165 | mpu.initialize_model_parallel(model_parallel_size) 166 | model_parallel_size = mpu.get_model_parallel_world_size() 167 | 168 | mpu.model_parallel_cuda_manual_seed(12345) 169 | assert torch.cuda.initial_seed() == 12345 170 | with mpu.get_cuda_rng_tracker().fork(): 171 | assert torch.cuda.initial_seed() == (12345 + 2718 + 172 | mpu.get_model_parallel_rank()) 173 | 174 | # Reset the tracker 175 | mpu.get_cuda_rng_tracker().reset() 176 | 177 | # Reset groups 178 | mpu.destroy_model_parallel() 179 | 180 | torch.distributed.barrier() 181 | if torch.distributed.get_rank() == 0: 182 | print('>> passed the test :-)') 183 | 184 | 185 | if __name__ == '__main__': 186 | 187 | initialize_distributed() 188 | world_size = torch.distributed.get_world_size() 189 | 190 | model_parallel_size = 1 191 | while model_parallel_size <= world_size: 192 | print_separator('test set rng state') 193 | test_set_cuda_rng_state(model_parallel_size) 194 | model_parallel_size *= 2 195 | 196 | model_parallel_size = 1 197 | while model_parallel_size <= world_size: 198 | print_separator('test cuda rng tracker') 199 | test_cuda_rng_tracker(model_parallel_size) 200 | model_parallel_size *= 2 201 | 202 | model_parallel_size = 1 203 | while model_parallel_size <= world_size: 204 | print_separator('test model parallel cuda manual seed') 205 | test_model_parallel_cuda_manual_seed(model_parallel_size) 206 | model_parallel_size *= 2 207 | 208 | -------------------------------------------------------------------------------- /mpu/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | 20 | def ensure_divisibility(numerator, denominator): 21 | """Ensure that numerator is divisible by the denominator.""" 22 | assert numerator % denominator == 0, '{} is not divisible by {}'.format( 23 | numerator, denominator) 24 | 25 | 26 | def divide(numerator, denominator): 27 | """Ensure that numerator is divisible by the denominator and return 28 | the division value.""" 29 | ensure_divisibility(numerator, denominator) 30 | return numerator // denominator 31 | 32 | 33 | def split_tensor_along_last_dim(tensor, num_partitions, 34 | contiguous_split_chunks=False): 35 | """Split a tensor along its last dimension. 36 | Arguments: 37 | tensor: input tensor. 38 | num_partitions: number of partitions to split the tensor 39 | contiguous_split_chunks: If True, make each chunk contiguous 40 | in memory. 41 | """ 42 | # Get the size and dimension. 43 | last_dim = tensor.dim() - 1 44 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 45 | # Split. 46 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 47 | # Note: torch.split does not create contiguous tensors by default. 
48 |     if contiguous_split_chunks:
49 |         return tuple(chunk.contiguous() for chunk in tensor_list)
50 | 
51 |     return tensor_list
52 | 
53 | 
54 | class VocabUtility:
55 |     """Split the vocabulary into `world_size` chunks and return the
56 |         first and last index of the vocabulary belonging to the `rank`
57 |         partition. Note that indices are in [first, last)."""
58 | 
59 |     @staticmethod
60 |     def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
61 |                                                   rank, world_size):
62 |         index_f = rank * per_partition_vocab_size
63 |         index_l = index_f + per_partition_vocab_size
64 |         return index_f, index_l
65 | 
66 |     @staticmethod
67 |     def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
68 |         per_partition_vocab_size = divide(global_vocab_size, world_size)
69 |         return VocabUtility.vocab_range_from_per_partition_vocab_size(
70 |             per_partition_vocab_size, rank, world_size)
71 | 
--------------------------------------------------------------------------------
/process_grid.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import json
4 | import glob
5 | import statistics
6 | 
7 | path_pattern = sys.argv[1]
8 | target_type = sys.argv[2]
9 | best_value, best_result, best_name = None, None, None
10 | mean_result = {}
11 | print(path_pattern)
12 | for dir_path in glob.glob(path_pattern, recursive=True):
13 |     entry = os.path.basename(dir_path)
14 |     valid_result = None
15 |     test_found = os.path.exists(os.path.join(dir_path, "test_results.json"))
16 |     valid_path = os.path.join(dir_path, "results.json")
17 |     if os.path.exists(valid_path):
18 |         print(entry)
19 |         with open(valid_path) as file:
20 |             valid_result = json.load(file)
21 |     else:
22 |         print(f"{entry} no validation results")
23 |         continue
24 |     if not test_found:
25 |         print(f"{entry} not tested yet")
26 |     if target_type == "max":
27 |         metric = sys.argv[3]
28 |         metric_value = valid_result[metric]
29 |         if best_value is None or metric_value > best_value:
30 |             best_value = metric_value
31 |             best_result = valid_result
32 |             best_name = entry
33 |     elif target_type == "mean" or target_type == "median":
34 |         if mean_result:
35 |             for metric, value in valid_result.items():
36 |                 if metric not in ["type", "epoch"]:
37 |                     mean_result[metric].append(value)
38 |         else:
39 |             mean_result = {metric: [value] for metric, value in valid_result.items() if
40 |                            metric not in ["type", "epoch"]}
41 | 
42 | if target_type == "max":
43 |     print(f"Best result found at {best_name}: {best_result}")
44 | elif target_type == "mean":
45 |     mean_result = {metric: sum(value) / len(value) for metric, value in mean_result.items()}
46 |     print(f"Mean result {mean_result}")
47 | elif target_type == "median":
48 |     mean_result = {metric: statistics.median(value) for metric, value in mean_result.items()}
49 |     print(f"Median result {mean_result}")
50 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | botocore
2 | boto3
3 | deepspeed
4 | filelock
5 | scipy
6 | nltk
7 | regex
8 | tqdm
9 | matplotlib
10 | pandas
11 | requests
12 | sentencepiece
13 | ftfy
14 | langdetect
15 | lsh
16 | scikit_learn
17 | tensorboardX
18 | termcolor
19 | tldextract
20 | transformers
21 | rouge_score
22 | fasttext
23 | unidecode
--------------------------------------------------------------------------------
/run_test.py:
--------------------------------------------------------------------------------
1 | 
import sys 2 | 3 | if sys.argv[1] == 'block': 4 | from test.test_block import main 5 | main() 6 | elif sys.argv[1] == 'rel_shift': 7 | from test.test_rel_shift import main 8 | main() -------------------------------------------------------------------------------- /scripts/convert_glm_checkpoint_to_transformers.py: -------------------------------------------------------------------------------- 1 | """ 2 | You can use `scripts/convert_glm_checkpoint_to_transformers.py` to convert the checkpoint 3 | ```shell 4 | python scripts/convert_glm_checkpoint_to_transformers.py CHECKPOINT_PATH MODEL_NAME 5 | ``` 6 | where `CHECKPOINT_PATH` is the path to the `mp_rank_00_model_states.pt` file, 7 | MODEL_NAME is the repo name on huggingface hub 8 | (should be in `["glm-large", "glm-roberta-large", "glm-large-chinese", "glm-515m", "glm-2b", "glm-10b", 9 | "glm-10b-chinese"]`). 10 | The `pytorch_model.bin` will be saved under the same directory as `mp_rank_00_model_states.pt`. 11 | """ 12 | import os 13 | import sys 14 | import torch 15 | 16 | 17 | def convert_glm_checkpoint_to_transformers(checkpoint_path, copy_dict=None): 18 | checkpoint = torch.load(checkpoint_path, map_location='cpu') 19 | state_dict = checkpoint['module'] 20 | if copy_dict is not None: 21 | word_embeddings = state_dict['word_embeddings.weight'] 22 | for src_id, dest_id in copy_dict: 23 | word_embeddings[dest_id] = word_embeddings[src_id] 24 | directory = os.path.dirname(checkpoint_path) 25 | output_path = os.path.join(directory, "pytorch_model.bin") 26 | torch.save(state_dict, output_path) 27 | 28 | 29 | if __name__ == "__main__": 30 | checkpoint_path = sys.argv[1] 31 | model_name = sys.argv[2] 32 | copy_dict = None 33 | assert model_name in ["glm-large", "glm-roberta-large", "glm-large-chinese", "glm-515m", "glm-2b", "glm-10b", 34 | "glm-10b-chinese"] 35 | if model_name == "glm-10b-chinese": 36 | copy_dict = [(50007, 50009)] 37 | convert_glm_checkpoint_to_transformers(checkpoint_path, copy_dict) 38 | -------------------------------------------------------------------------------- /scripts/dispatcher.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import argparse 3 | import subprocess 4 | import multiprocessing 5 | import datetime 6 | import pickle 7 | import csv 8 | import random 9 | import copy 10 | 11 | import itertools as it 12 | import json 13 | 14 | from termcolor import colored 15 | 16 | 17 | CONFIG = [ 18 | { 19 | "lr": [1e-5,2e-5], 20 | "batch-size": [16,32], 21 | "epochs": [20,40], 22 | "warmup": [0.1], 23 | "weight-decay": [0.1], 24 | # "adam-beta2": [0.98], 25 | # "adam-eps": [1e-8], 26 | "seed": [1,2,3] 27 | } 28 | ] 29 | 30 | TASK_CONFIG = { 31 | "rte": ( 32 | "--task rte " 33 | "--data-dir /root/data/superglue/RTE " 34 | "--seq-length 256 " 35 | ), 36 | "cb": ( 37 | "--task cb " 38 | "--data-dir /root/data/superglue/CB " 39 | "--seq-length 256 " 40 | ), 41 | "multirc": ( 42 | "--task multirc " 43 | "--data-dir /root/data/superglue/MultiRC " 44 | "--seq-length 430 " 45 | ), 46 | } 47 | 48 | MODEL_CONFIG = { 49 | "blocklm-roberta-large": ( 50 | "--block-lm " 51 | "--cloze-eval " 52 | "--num-layers 24 " 53 | "--hidden-size 1024 " 54 | "--num-attention-heads 16 " 55 | "--max-position-embeddings 512 " 56 | "--tokenizer-model-type roberta " 57 | "--tokenizer-type GPT2BPETokenizer " 58 | "--load-pretrained /root/data/checkpoints/blocklm-roberta-large/250000 " 59 | ), 60 | "blocklm-base-na": ( 61 | "--block-lm " 62 | "--cloze-eval " 63 | "--num-layers 12 " 64 | 
"--hidden-size 768 " 65 | "--num-attention-heads 12 " 66 | "--max-position-embeddings 512 " 67 | "--tokenizer-model-type bert-base-uncased " 68 | "--tokenizer-type BertWordPieceTokenizer " 69 | "--load-pretrained /root/data/checkpoints/blocklm-base-len6-na03-12-21-21" 70 | ), 71 | } 72 | 73 | CHECKPOINT_PATH = "/root/data/finetune_checkpoints" 74 | RESULT_PATH = "runs/{EXPERIMENT_NAME}/results.json" 75 | LOG_PATH = "logs/" 76 | 77 | DISTRIBUTED_ARGS = "--nproc_per_node {N_GPU} --nnodes 1 --node_rank 0 --master_addr localhost --master_port {MASTER_PORT}" 78 | 79 | COMMON_ARGS = ( 80 | "--save-interval 10000 " 81 | "--log-interval 50 " 82 | "--eval-interval 1000 " 83 | "--eval-iters 100 ") 84 | 85 | 86 | def get_command(model, task, n_gpu, config, overwrite=True): 87 | 88 | distributed_args = DISTRIBUTED_ARGS.format(N_GPU=n_gpu, MASTER_PORT=random.randint(10000, 65535)) 89 | 90 | config = copy.deepcopy(config) 91 | hyper = "-".join([f"{k}-{v}" for k,v in config.items()]) 92 | experiment_name = f"{model}-{task}/{hyper}" 93 | 94 | command = (f"python -m torch.distributed.launch {distributed_args} finetune_gpt2.py " 95 | f"--finetune {MODEL_CONFIG[model]} {TASK_CONFIG[task]} {COMMON_ARGS} " 96 | f"--experiment-name {experiment_name} " 97 | f"--save {CHECKPOINT_PATH} " 98 | f"--checkpoint-activations " 99 | f"--eval-batch-size 16 ") 100 | 101 | config["batch-size"] = config["batch-size"] // n_gpu 102 | command = update_cmd(command, config) 103 | if overwrite: 104 | command += "--overwrite " 105 | 106 | result_path = RESULT_PATH.format(EXPERIMENT_NAME=experiment_name) 107 | log_path = LOG_PATH + f"{model}-{task}-{hyper}.txt" 108 | 109 | return command, result_path, log_path 110 | 111 | 112 | def chain_configs(configs): 113 | ''' 114 | @param configs list of configurations 115 | ''' 116 | all_configs = [] 117 | for config in configs: 118 | # preserve order of configs 119 | keys = sorted(config) 120 | all_args = it.product(*(config[k] for k in keys)) 121 | all_args_dict = [dict(zip(keys, c)) for c in all_args] 122 | 123 | all_configs.append(all_args_dict) 124 | 125 | return it.chain(*all_configs) # flatten result 126 | 127 | 128 | def update_cmd(cmd, config): 129 | ''' 130 | @param cmd str 131 | @param configs list of dicts 132 | ''' 133 | for k, v in config.items(): 134 | if v is None: 135 | continue 136 | if type(v) == bool: 137 | if v: 138 | cmd += "--{} ".format(k) 139 | else: 140 | cmd += "--{} {} ".format(k, v) 141 | 142 | return cmd 143 | 144 | 145 | def parse_args(): 146 | parser = argparse.ArgumentParser(description='Dispatcher to run all experiments') 147 | 148 | parser.add_argument("--gpu", type=str, default='0,1,2,3', 149 | help='list of available gpus') 150 | parser.add_argument("--n_gpu", type=int, default=1, 151 | help="number of gpus per job") 152 | parser.add_argument("--model", type=str, default='blocklm-roberta-large') 153 | parser.add_argument("--task", type=str, required=True) 154 | parser.add_argument("--overwrite", action='store_true', default=False, 155 | help='whether to rerun experiments with the same result ' 156 | 'file location') 157 | parser.add_argument("--debug", action='store_true', default=False) 158 | 159 | return parser.parse_args() 160 | 161 | 162 | def main(): 163 | args = parse_args() 164 | assert args.model in MODEL_CONFIG 165 | assert args.task in TASK_CONFIG 166 | 167 | # compute cartesian product for each set of configurations 168 | configs = chain_configs(CONFIG) 169 | all_configs = configs 170 | 171 | # queues 172 | gpu_list = args.gpu.split(',') 173 | 
total_gpu = len(gpu_list) 174 | 175 | gpu_queues = [] 176 | for i in range(0, total_gpu, args.n_gpu): 177 | gpu = ','.join(gpu_list[i:i+args.n_gpu]) 178 | gpu_queues.append((multiprocessing.Queue(), gpu)) 179 | done_queue = multiprocessing.Queue() 180 | 181 | results = [] 182 | indx = 0 183 | num_jobs = 0 184 | 185 | for config in all_configs: 186 | gpu_queues[indx][0].put(config) 187 | indx = (indx + 1) % len(gpu_queues) 188 | num_jobs += 1 189 | 190 | for job_queue, gpu in gpu_queues: 191 | print("Start GPU worker {} with {} jobs".format(gpu, job_queue.qsize())) 192 | multiprocessing.Process(target=_worker, args=(gpu, job_queue, done_queue, args)).start() 193 | 194 | timestamp = datetime.datetime.now().strftime("%m-%d-%H-%M") 195 | summary_path = LOG_PATH + f"grid_{args.model}-{args.task}_{timestamp}.txt" 196 | 197 | print("Summary path:", summary_path) 198 | 199 | for _ in range(num_jobs): 200 | result_path, config = done_queue.get() 201 | 202 | try: 203 | res = json.load(open(result_path)) 204 | except Exception as e: 205 | print("Experiment at {} failed".format(colored(result_path, 'red'))) 206 | print(e) 207 | continue 208 | 209 | with open(summary_path, "a") as f: 210 | f.write("Config: " + json.dumps(config) + "\n") 211 | f.write(json.dumps(res) + "\n") 212 | 213 | print('Done') 214 | 215 | 216 | def _worker(gpu, queue, done_queue, args): 217 | while not queue.empty(): 218 | config = queue.get() 219 | if config is None: 220 | return 221 | done_queue.put(_launch_experiment(gpu, config, args)) 222 | 223 | 224 | def _launch_experiment(gpu, config, args): 225 | 226 | command, result_path, log_path = get_command(args.model, args.task, args.n_gpu, config, args.overwrite) 227 | 228 | shell_cmd = f"CUDA_VISIBLE_DEVICES={gpu} " + command 229 | if not args.debug: 230 | shell_cmd += f" > {log_path} 2>&1; " 231 | 232 | print("Time {}, launched exp: {}".format(str(datetime.datetime.now()), log_path)) 233 | 234 | # if experiment has already been run, skip 235 | if not os.path.exists(result_path) or args.overwrite: 236 | return_code = subprocess.call(shell_cmd, shell=True) 237 | 238 | if not os.path.exists(result_path): 239 | # running this process failed, alert me 240 | print("Dispatcher, Alert! Job has crashed! 
Check logfile at:[{}]".format(log_path)) 241 | 242 | return result_path, config 243 | 244 | 245 | 246 | if __name__ == "__main__": 247 | main() 248 | -------------------------------------------------------------------------------- /scripts/ds_finetune_record.sh: -------------------------------------------------------------------------------- 1 | MP_SIZE=1 2 | DATA_ROOT=/dataset/c07bd62b/superglue 3 | GLUE_DATA_ROOT=/dataset/c07bd62b/glue_data 4 | source config_tasks/model_blocklm_10B.sh 5 | source config_tasks/task_record.sh 6 | 7 | CHECKPOINT_PATH="/dataset/c07bd62b/finetune_checkpoints" 8 | 9 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 10 | 11 | OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2" 12 | DISTRIBUTED_ARGS="${OPTIONS_NCCL} deepspeed --num_gpus 4 --num_nodes 1 --master_port $MASTER_PORT" 13 | DATESTR=$(date +"%m-%d-%H-%M") 14 | 15 | EXPERIMENT_NAME=${EXPERIMENT_NAME}_${DATESTR} 16 | 17 | mkdir logs 18 | run_cmd="${DISTRIBUTED_ARGS} finetune_gpt2.py \ 19 | --deepspeed \ 20 | --deepspeed_config config_tasks/config_blocklm_10B_record.json \ 21 | --finetune \ 22 | --experiment-name ${EXPERIMENT_NAME} \ 23 | --task ${TASK_NAME} \ 24 | --data-dir ${DATA_PATH} \ 25 | --save ${CHECKPOINT_PATH} \ 26 | --seq-length ${MAX_SEQ_LEN} \ 27 | --checkpoint-activations \ 28 | --eval-batch-size 2 \ 29 | --save-epoch 100 \ 30 | --num-workers 1 \ 31 | --no-load-optim \ 32 | --no-load-lr-scheduler \ 33 | --fp16 \ 34 | $MODEL_ARGS \ 35 | $TRAIN_ARGS \ 36 | $COMMON_ARGS \ 37 | --model-parallel-size ${MP_SIZE} \ 38 | --epochs ${EPOCH_SINGLE} \ 39 | --overwrite \ 40 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}.txt" 41 | 42 | echo ${run_cmd} 43 | eval ${run_cmd} 44 | -------------------------------------------------------------------------------- /scripts/ds_finetune_seq2seq.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/root/data 2 | CHECKPOINT_PATH="/dataset/c07bd62b/finetune_checkpoints" 3 | SAVE_PATH=/root/data/finetune_checkpoints 4 | DATESTR=$(date +"%m-%d-%H-%M") 5 | 6 | source $1 # Model 7 | source $2 # Task 8 | 9 | NUM_WORKERS=2 10 | NUM_GPUS_PER_WORKER=8 11 | HOST_FILE_PATH="./hostfile" 12 | MP_SIZE=1 13 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 14 | 15 | OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2" 16 | DISTRIBUTED_ARGS="${OPTIONS_NCCL} deepspeed --hostfile ${HOST_FILE_PATH} --master_port ${MASTER_PORT} --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER}" 17 | 18 | EXPERIMENT_NAME=${EXPERIMENT_NAME}_${DATESTR} 19 | mkdir logs 20 | run_cmd="${DISTRIBUTED_ARGS} finetune_glm.py \ 21 | --deepspeed \ 22 | --deepspeed_config config_tasks/config_blocklm_10B_cnndm.json \ 23 | --finetune \ 24 | --experiment-name ${EXPERIMENT_NAME} \ 25 | --task ${TASK_NAME} \ 26 | --data-dir ${DATA_PATH} \ 27 | --save ${SAVE_PATH} \ 28 | --checkpoint-activations \ 29 | --num-workers 1 \ 30 | --no-load-lr-scheduler \ 31 | $MODEL_ARGS \ 32 | $TRAIN_ARGS \ 33 | $COMMON_ARGS \ 34 | $TASK_ARGS \ 35 | --fp16 \ 36 | --model-parallel-size ${MP_SIZE} \ 37 | --overwrite \ 38 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}.txt" 39 | 40 | echo ${run_cmd} 41 | eval ${run_cmd} 42 | -------------------------------------------------------------------------------- /scripts/ds_finetune_superglue.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/dataset/fd5061f6/tuteng/BlockLM/data 2 | CHECKPOINT_PATH=/dataset/fd5061f6/english_data/checkpoints 3 | 
SAVE_PATH=/dataset/fd5061f6/tuteng/BlockLM/finetune_checkpoints 4 | DATESTR=$(date +"%m-%d-%H-%M") 5 | 6 | source $1 # Model 7 | source $2 # Task 8 | 9 | NUM_WORKERS=1 10 | NUM_GPUS_PER_WORKER=8 11 | MP_SIZE=1 12 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 13 | 14 | OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2" 15 | DISTRIBUTED_ARGS="${OPTIONS_NCCL} deepspeed --master_port $MASTER_PORT --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER}" 16 | 17 | EXPERIMENT_NAME=${EXPERIMENT_NAME}_${DATESTR} 18 | mkdir logs 19 | run_cmd="${DISTRIBUTED_ARGS} finetune_glm.py \ 20 | --deepspeed \ 21 | --deepspeed_config config_tasks/config_blocklm_10B.json \ 22 | --finetune \ 23 | --cloze-eval \ 24 | --experiment-name ${EXPERIMENT_NAME} \ 25 | --task ${TASK_NAME} \ 26 | --data-dir ${DATA_PATH} \ 27 | --save ${CHECKPOINT_PATH} \ 28 | --seq-length ${MAX_SEQ_LEN} \ 29 | --checkpoint-activations \ 30 | --eval-batch-size 16 \ 31 | --save-epoch 100000 \ 32 | --num-workers 1 \ 33 | --no-load-optim \ 34 | --no-load-lr-scheduler \ 35 | $MODEL_ARGS \ 36 | $TRAIN_ARGS \ 37 | $COMMON_ARGS \ 38 | --pattern-id 0 \ 39 | --fp16 \ 40 | --model-parallel-size ${MP_SIZE} \ 41 | --epochs ${XXLARGE_EPOCH} \ 42 | --overwrite \ 43 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}.txt" 44 | 45 | echo ${run_cmd} 46 | eval ${run_cmd} 47 | -------------------------------------------------------------------------------- /scripts/ds_finetune_superglue_prompt.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/root/data/superglue 2 | CHECKPOINT_PATH=/root/data/checkpoints 3 | SAVE_PATH=/root/data/finetune_checkpoints 4 | DATESTR=$(date +"%m-%d-%H-%M") 5 | 6 | source $1 # Model 7 | source $2 # Task 8 | 9 | NUM_WORKERS=1 10 | NUM_GPUS_PER_WORKER=8 11 | MP_SIZE=1 12 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 13 | 14 | OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2" 15 | DISTRIBUTED_ARGS="${OPTIONS_NCCL} deepspeed --master_port $MASTER_PORT --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER}" 16 | 17 | EXPERIMENT_NAME=${EXPERIMENT_NAME}_${DATESTR} 18 | mkdir logs 19 | run_cmd="${DISTRIBUTED_ARGS} finetune_glm.py \ 20 | --deepspeed \ 21 | --deepspeed_config config_tasks/config_blocklm_10B.json \ 22 | --finetune \ 23 | --cloze-eval \ 24 | --experiment-name ${EXPERIMENT_NAME} \ 25 | --task ${TASK_NAME} \ 26 | --data-dir ${DATA_PATH} \ 27 | --save ${CHECKPOINT_PATH} \ 28 | --seq-length ${MAX_SEQ_LEN} \ 29 | --checkpoint-activations \ 30 | --eval-batch-size 16 \ 31 | --save-epoch 100000 \ 32 | --num-workers 1 \ 33 | --no-load-optim \ 34 | --no-load-lr-scheduler \ 35 | $MODEL_ARGS \ 36 | $TRAIN_ARGS \ 37 | $COMMON_ARGS \ 38 | --fp16 \ 39 | --model-parallel-size ${MP_SIZE} \ 40 | --continuous-prompt \ 41 | --num-prompt-tokens 3 \ 42 | --epochs ${XXLARGE_EPOCH} \ 43 | --overwrite \ 44 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}.txt" 45 | 46 | echo ${run_cmd} 47 | eval ${run_cmd} 48 | -------------------------------------------------------------------------------- /scripts/ds_pretrain_nvidia.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Change for multinode config 4 | 5 | NUM_WORKERS=32 6 | NUM_GPUS_PER_WORKER=8 7 | MP_SIZE=1 8 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 9 | 10 | source $1 11 | DATESTR=$(date +"%m-%d-%H-%M") 12 | 13 | OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2" 14 | HOST_FILE_PATH="/workspace/hostfile" 15 | 16 | mkdir logs 17 | run_cmd="${OPTIONS_NCCL} deepspeed --master_port ${MASTER_PORT} --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} --hostfile ${HOST_FILE_PATH} pretrain_glm.py ${gpt_options} 2>&1 | tee logs/log-${DATESTR}.txt" 18 | echo ${run_cmd} 19 | eval ${run_cmd} 20 | 21 | set +x -------------------------------------------------------------------------------- /scripts/evaluate_lm.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/dataset/c07bd62b 2 | CHECKPOINT_PATH="/dataset/c07bd62b/checkpoints" 3 | 4 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 5 | DISTRIBUTED_ARGS="--nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr localhost --master_port $MASTER_PORT" 6 | DATESTR=$(date +"%m-%d-%H-%M") 7 | 8 | source $1 # Model 9 | source $2 # Task 10 | 11 | python -m torch.distributed.launch $DISTRIBUTED_ARGS finetune_glm.py \ 12 | --deepspeed \ 13 | --finetune \ 14 | --experiment-name ${EXPERIMENT_NAME} \ 15 | --task ${TASK_NAME} \ 16 | --valid-data ${DATA_PATH} \ 17 | --save ${CHECKPOINT_PATH} \ 18 | --checkpoint-activations \ 19 | --fp16 \ 20 | --overwrite \ 21 | $MODEL_ARGS \ 22 | $EVALUATE_ARGS \ 23 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}_${DATESTR}.txt -------------------------------------------------------------------------------- /scripts/evaluate_multichoice.sh: -------------------------------------------------------------------------------- 1 | CHECKPOINT_PATH= 2 | DATA_PATH= 3 | 4 | source $1 # Model 5 | 6 | NUM_WORKERS=1 7 | NUM_GPUS_PER_WORKER=1 8 | MP_SIZE=1 9 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 10 | MAX_SEQ_LEN=512 11 | 12 | OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2" 13 | DISTRIBUTED_ARGS="${OPTIONS_NCCL} deepspeed --master_port $MASTER_PORT --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER}" 14 | 15 | mkdir logs 16 | run_cmd="${DISTRIBUTED_ARGS} finetune_glm.py \ 17 | --deepspeed \ 18 | --deepspeed_config config_tasks/config_blocklm_10B.json \ 19 | --finetune \ 20 | --cloze-eval \ 21 | --task multichoice \ 22 | --test-data ${DATA_PATH} \ 23 | --seq-length ${MAX_SEQ_LEN} \ 24 | --checkpoint-activations \ 25 | --eval-batch-size 16 \ 26 | --num-workers 1 \ 27 | --no-load-optim \ 28 | --no-load-lr-scheduler \ 29 | $MODEL_ARGS \ 30 | --fp16 \ 31 | --model-parallel-size ${MP_SIZE} \ 32 | --epochs 0 \ 33 | --overwrite \ 34 | 2>&1" 35 | 36 | echo ${run_cmd} 37 | eval ${run_cmd} 38 | -------------------------------------------------------------------------------- /scripts/evaluate_seq2seq.sh: -------------------------------------------------------------------------------- 1 | export CLASSPATH=/path/to/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar 2 | mkdir tmp 3 | cat $1 | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > tmp/test.hypo.tokenized 4 | cat $2 | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > tmp/test.hypo.target 5 | files2rouge tmp/test.hypo.tokenized tmp/test.hypo.target -------------------------------------------------------------------------------- /scripts/finetune_blank.sh: -------------------------------------------------------------------------------- 1 | 
DATA_ROOT=/root/data 2 | CHECKPOINT_PATH=/root/data/checkpoints 3 | SAVE_PATH=/root/data/finetune_checkpoints 4 | DATESTR=$(date +"%m-%d-%H-%M") 5 | 6 | MASK_RATIO=0.1 7 | 8 | source $1 # Model 9 | source $2 # Task 10 | 11 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 12 | DISTRIBUTED_ARGS="--nproc_per_node 4 --nnodes 1 --node_rank 0 --master_addr localhost --master_port $MASTER_PORT" 13 | 14 | python -m torch.distributed.launch $DISTRIBUTED_ARGS finetune_glm.py \ 15 | --finetune \ 16 | --experiment-name ${EXPERIMENT_NAME} \ 17 | --task ${TASK_NAME} \ 18 | --data-dir ${DATA_PATH} \ 19 | --save ${SAVE_PATH} \ 20 | --checkpoint-activations \ 21 | --overwrite \ 22 | $MODEL_ARGS \ 23 | $TRAIN_ARGS \ 24 | $COMMON_ARGS \ 25 | $TASK_ARGS \ 26 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}.txt -------------------------------------------------------------------------------- /scripts/finetune_seq2seq.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/root/data 2 | CHECKPOINT_PATH=/dataset/fd5061f6/pretrained_models 3 | SAVE_PATH=/dataset/fd5061f6/finetune_checkpoints 4 | DATESTR=$(date +"%m-%d-%H-%M") 5 | 6 | source $1 # Model 7 | source $2 # Task 8 | 9 | if [ -z $N_GPU ];then 10 | N_GPU=4 11 | fi 12 | 13 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 14 | DISTRIBUTED_ARGS="--nproc_per_node ${N_GPU} --nnodes 1 --node_rank 0 --master_addr localhost --master_port $MASTER_PORT" 15 | 16 | DATESTR=$(date +"%m-%d-%H-%M") 17 | EXPERIMENT_NAME=${EXPERIMENT_NAME} #-${DATESTR} 18 | 19 | TOKENIZERS_PARALLELISM=false 20 | 21 | mkdir logs 22 | python -m torch.distributed.launch $DISTRIBUTED_ARGS finetune_glm.py \ 23 | --finetune \ 24 | --experiment-name ${EXPERIMENT_NAME} \ 25 | --task ${TASK_NAME} \ 26 | --data-dir ${DATA_PATH} \ 27 | --save ${SAVE_PATH} \ 28 | --checkpoint-activations \ 29 | --epochs ${EPOCH_SINGLE} \ 30 | --batch-size ${BATCH_SINGLE} \ 31 | --lr ${LR_SINGLE} \ 32 | $MODEL_ARGS \ 33 | $TRAIN_ARGS \ 34 | $COMMON_ARGS \ 35 | $TASK_ARGS \ 36 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}.txt 37 | 38 | -------------------------------------------------------------------------------- /scripts/finetune_seq2seq_grid.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/root/data 2 | CHECKPOINT_PATH=/dataset/fd5061f6/pretrained_models 3 | SAVE_PATH=/dataset/fd5061f6/finetune_checkpoints 4 | DATESTR=$(date +"%m-%d-%H-%M") 5 | 6 | source $1 # Model 7 | source $2 # Task 8 | 9 | if [ -z $N_GPU ];then 10 | N_GPU=4 11 | fi 12 | 13 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 14 | DISTRIBUTED_ARGS="--nproc_per_node ${N_GPU} --nnodes 1 --node_rank 0 --master_addr localhost --master_port $MASTER_PORT" 15 | 16 | DATESTR=$(date +"%m-%d-%H-%M") 17 | EXPERIMENT_NAME=${EXPERIMENT_NAME} #-${DATESTR} 18 | 19 | TOKENIZERS_PARALLELISM=false 20 | 21 | mkdir logs 22 | GRID_LOG=logs/grid_${EXPERIMENT_NAME}_${DATESTR}.txt 23 | 24 | for lr in 5e-6 1e-5 2e-5 25 | do 26 | for batch in 4 8 12 27 | do 28 | for epoch in 5 10 29 | do 30 | HYPER=${lr}-${batch}-${epoch} 31 | python -m torch.distributed.launch $DISTRIBUTED_ARGS finetune_glm.py \ 32 | --finetune \ 33 | --experiment-name ${EXPERIMENT_NAME}/${HYPER} \ 34 | --task ${TASK_NAME} \ 35 | --data-dir ${DATA_PATH} \ 36 | --save ${SAVE_PATH} \ 37 | --checkpoint-activations \ 38 | --epochs ${epoch} \ 39 | --batch-size ${batch} \ 40 | --lr ${lr} \ 41 | $MODEL_ARGS \ 42 | $TRAIN_ARGS \ 43 | $COMMON_ARGS \ 44 | $TASK_ARGS \ 45 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}-${HYPER}.txt 46 | echo $lr $batch $epoch >> 
$GRID_LOG 47 | cat runs/${EXPERIMENT_NAME}/${HYPER}/results.json >> $GRID_LOG 48 | done 49 | done 50 | done 51 | 52 | -------------------------------------------------------------------------------- /scripts/finetune_superglue.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/dataset/fd5061f6/english_data/superglue 2 | CHECKPOINT_PATH=/dataset/fd5061f6/pretrained_models 3 | SAVE_PATH=/dataset/fd5061f6/finetune_checkpoints 4 | 5 | source $1 # Model 6 | source $2 # Task 7 | 8 | if [ -z $N_GPU ];then 9 | N_GPU=1 10 | fi 11 | 12 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 13 | DISTRIBUTED_ARGS="--nproc_per_node ${N_GPU} --nnodes 1 --node_rank 0 --master_addr localhost --master_port $MASTER_PORT" 14 | 15 | PER_GPU_BS=$(($BATCH_SIZE/$N_GPU)) 16 | DATESTR=$(date +"%m-%d-%H-%M") 17 | EXPERIMENT_NAME=${EXPERIMENT_NAME}-${DATESTR} 18 | 19 | mkdir logs 20 | python -m torch.distributed.launch $DISTRIBUTED_ARGS finetune_glm.py \ 21 | --finetune \ 22 | --cloze-eval \ 23 | --experiment-name ${EXPERIMENT_NAME} \ 24 | --task ${TASK_NAME} \ 25 | --data-dir ${DATA_PATH} \ 26 | --save ${SAVE_PATH} \ 27 | --seq-length ${MAX_SEQ_LEN} \ 28 | --checkpoint-activations \ 29 | --eval-batch-size 16 \ 30 | --save-epoch 100000 \ 31 | $MODEL_ARGS \ 32 | $TRAIN_ARGS \ 33 | $COMMON_ARGS \ 34 | --fp16 \ 35 | --batch-size ${PER_GPU_BS} \ 36 | --epochs ${EPOCH_SINGLE} \ 37 | --lr ${LR_SINGLE} \ 38 | --overwrite \ 39 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}.txt 40 | -------------------------------------------------------------------------------- /scripts/finetune_superglue_fast.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/root/data/superglue 2 | source config_tasks/model_blocklm.sh 3 | source $1 4 | 5 | CHECKPOINT_PATH="/root/data/finetune_checkpoints" 6 | 7 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 8 | DISTRIBUTED_ARGS="--nproc_per_node 4 --nnodes 1 --node_rank 0 --master_addr localhost --master_port $MASTER_PORT" 9 | DATESTR=$(date +"%m-%d-%H-%M") 10 | 11 | mkdir logs 12 | python -m torch.distributed.launch $DISTRIBUTED_ARGS finetune_glm.py \ 13 | --finetune \ 14 | --cloze-eval \ 15 | --experiment-name ${EXPERIMENT_NAME} \ 16 | --task ${TASK_NAME} \ 17 | --data-dir ${DATA_PATH} \ 18 | --save ${CHECKPOINT_PATH} \ 19 | --seq-length ${MAX_SEQ_LEN} \ 20 | --fast-decode \ 21 | --batch-size 8 \ 22 | --eval-batch-size 16 \ 23 | --save-epoch 5 \ 24 | $MODEL_ARGS \ 25 | $TRAIN_ARGS \ 26 | $COMMON_ARGS \ 27 | --epochs ${EPOCH_SINGLE} \ 28 | --lr ${LR_SINGLE} \ 29 | --overwrite \ 30 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}.txt 31 | -------------------------------------------------------------------------------- /scripts/finetune_superglue_grid.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/dataset/c07bd62b/superglue 2 | source config_tasks/model_blocklm_roberta_1.25.sh 3 | source $1 4 | 5 | CHECKPOINT_PATH="/dataset/c07bd62b/finetune_checkpoints" 6 | 7 | if [ -z $N_GPU ];then 8 | N_GPU=2 9 | fi 10 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 11 | DISTRIBUTED_ARGS="--nproc_per_node ${N_GPU} --nnodes 1 --node_rank 0 --master_addr localhost --master_port $MASTER_PORT" 12 | 13 | DATESTR=$(date +"%m-%d-%H-%M") 14 | GRID_LOG=logs/grid_${EXPERIMENT_NAME}_${DATESTR}.txt 15 | mkdir logs 16 | for lr in 6e-6 1e-5 2e-5 17 | do 18 | for seed in 1234 5678 3456 19 | do 20 | HYPER=${lr}-${seed} 21 | PER_GPU_BS=$(($BATCH_SIZE/$N_GPU)) 22 | if [ ! 
-f runs/${EXPERIMENT_NAME}/${HYPER}/test_results.json ]; then 23 | echo runs/${EXPERIMENT_NAME}/${HYPER} 24 | python -m torch.distributed.launch $DISTRIBUTED_ARGS finetune_glm.py \ 25 | --finetune \ 26 | --experiment-name ${EXPERIMENT_NAME}/${HYPER} \ 27 | --task ${TASK_NAME} \ 28 | --data-dir ${DATA_PATH} \ 29 | --save ${CHECKPOINT_PATH} \ 30 | --seq-length ${MAX_SEQ_LEN} \ 31 | --checkpoint-activations \ 32 | --eval-batch-size 16 \ 33 | --save-epoch 1000 \ 34 | $MODEL_ARGS \ 35 | $TRAIN_ARGS \ 36 | $COMMON_ARGS \ 37 | --fp16 \ 38 | --attention-scale 8.0 \ 39 | --batch-size ${PER_GPU_BS} \ 40 | --epochs ${EPOCH_SINGLE} \ 41 | --lr-decay-style linear \ 42 | --lr ${lr} \ 43 | --seed ${seed} \ 44 | --overwrite \ 45 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}-${HYPER}.txt 46 | fi 47 | echo $lr $seed >> $GRID_LOG 48 | cat runs/${EXPERIMENT_NAME}/${HYPER}/results.json >> $GRID_LOG 49 | done 50 | done 51 | 52 | echo $EXPERIMENT_NAME >> $GRID_LOG -------------------------------------------------------------------------------- /scripts/generate_block.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CHECKPOINT_PATH=/zhangpai21/checkpoints 3 | 4 | source $1 5 | 6 | MPSIZE=1 7 | MAXSEQLEN=512 8 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 9 | 10 | #SAMPLING ARGS 11 | TEMP=0.9 12 | # If TOPK/TOPP are 0, sampling defaults to greedy decoding; a non-zero top-k also overrides top-p 13 | TOPK=40 14 | TOPP=0 15 | 16 | script_path=$(realpath $0) 17 | script_dir=$(dirname $script_path) 18 | 19 | config_json="$script_dir/ds_config.json" 20 | 21 | python -m torch.distributed.launch --nproc_per_node=$MPSIZE --master_port $MASTER_PORT generate_samples.py \ 22 | --DDP-impl none \ 23 | --model-parallel-size $MPSIZE \ 24 | $MODEL_ARGS \ 25 | --fp16 \ 26 | --cache-dir cache \ 27 | --out-seq-length $MAXSEQLEN \ 28 | --seq-length 512 \ 29 | --temperature $TEMP \ 30 | --top-k $TOPK \ 31 | --top-p $TOPP 32 | -------------------------------------------------------------------------------- /scripts/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version.
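# Reader's note (an added gloss, not part of the upstream Moses script): the script reads the hypothesis from STDIN
# and one or more reference files (named reference, or reference0, reference1, ... for multiple references),
# counts clipped n-gram matches for n = 1..4, applies the brevity penalty, and prints corpus-level BLEU, e.g.:
#   perl scripts/multi-bleu.perl ref.txt < hyp.txt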
5 | 6 | # $Id$ 7 | use warnings; 8 | use strict; 9 | 10 | my $lowercase = 0; 11 | if ($ARGV[0] eq "-lc") { 12 | $lowercase = 1; 13 | shift; 14 | } 15 | 16 | my $stem = $ARGV[0]; 17 | if (!defined $stem) { 18 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 19 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 20 | exit(1); 21 | } 22 | 23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 24 | 25 | my @REF; 26 | my $ref=0; 27 | while(-e "$stem$ref") { 28 | &add_to_ref("$stem$ref",\@REF); 29 | $ref++; 30 | } 31 | &add_to_ref($stem,\@REF) if -e $stem; 32 | die("ERROR: could not find reference file $stem") unless scalar @REF; 33 | 34 | # add additional references explicitly specified on the command line 35 | shift; 36 | foreach my $stem (@ARGV) { 37 | &add_to_ref($stem,\@REF) if -e $stem; 38 | } 39 | 40 | 41 | 42 | sub add_to_ref { 43 | my ($file,$REF) = @_; 44 | my $s=0; 45 | if ($file =~ /.gz$/) { 46 | open(REF,"gzip -dc $file|") or die "Can't read $file"; 47 | } else { 48 | open(REF,$file) or die "Can't read $file"; 49 | } 50 | while() { 51 | chop; 52 | push @{$$REF[$s++]}, $_; 53 | } 54 | close(REF); 55 | } 56 | 57 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 58 | my $s=0; 59 | while() { 60 | chop; 61 | $_ = lc if $lowercase; 62 | my @WORD = split; 63 | my %REF_NGRAM = (); 64 | my $length_translation_this_sentence = scalar(@WORD); 65 | my ($closest_diff,$closest_length) = (9999,9999); 66 | foreach my $reference (@{$REF[$s]}) { 67 | # print "$s $_ <=> $reference\n"; 68 | $reference = lc($reference) if $lowercase; 69 | my @WORD = split(' ',$reference); 70 | my $length = scalar(@WORD); 71 | my $diff = abs($length_translation_this_sentence-$length); 72 | if ($diff < $closest_diff) { 73 | $closest_diff = $diff; 74 | $closest_length = $length; 75 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 76 | } elsif ($diff == $closest_diff) { 77 | $closest_length = $length if $length < $closest_length; 78 | # from two references with the same closeness to me 79 | # take the *shorter* into account, not the "first" one. 80 | } 81 | for(my $n=1;$n<=4;$n++) { 82 | my %REF_NGRAM_N = (); 83 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 84 | my $ngram = "$n"; 85 | for(my $w=0;$w<$n;$w++) { 86 | $ngram .= " ".$WORD[$start+$w]; 87 | } 88 | $REF_NGRAM_N{$ngram}++; 89 | } 90 | foreach my $ngram (keys %REF_NGRAM_N) { 91 | if (!defined($REF_NGRAM{$ngram}) || 92 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 93 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 94 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 95 | } 96 | } 97 | } 98 | } 99 | $length_translation += $length_translation_this_sentence; 100 | $length_reference += $closest_length; 101 | for(my $n=1;$n<=4;$n++) { 102 | my %T_NGRAM = (); 103 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 104 | my $ngram = "$n"; 105 | for(my $w=0;$w<$n;$w++) { 106 | $ngram .= " ".$WORD[$start+$w]; 107 | } 108 | $T_NGRAM{$ngram}++; 109 | } 110 | foreach my $ngram (keys %T_NGRAM) { 111 | $ngram =~ /^(\d+) /; 112 | my $n = $1; 113 | # my $corr = 0; 114 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 115 | $TOTAL[$n] += $T_NGRAM{$ngram}; 116 | if (defined($REF_NGRAM{$ngram})) { 117 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 118 | $CORRECT[$n] += $T_NGRAM{$ngram}; 119 | # $corr = $T_NGRAM{$ngram}; 120 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 121 | } 122 | else { 123 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 124 | # $corr = $REF_NGRAM{$ngram}; 125 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 126 | } 127 | } 128 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 129 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 130 | } 131 | } 132 | $s++; 133 | } 134 | my $brevity_penalty = 1; 135 | my $bleu = 0; 136 | 137 | my @bleu=(); 138 | 139 | for(my $n=1;$n<=4;$n++) { 140 | if (defined ($TOTAL[$n])){ 141 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 142 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 143 | }else{ 144 | $bleu[$n]=0; 145 | } 146 | } 147 | 148 | if ($length_reference==0){ 149 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 150 | exit(1); 151 | } 152 | 153 | if ($length_translation<$length_reference) { 154 | $brevity_penalty = exp(1-$length_reference/$length_translation); 155 | } 156 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 157 | my_log( $bleu[2] ) + 158 | my_log( $bleu[3] ) + 159 | my_log( $bleu[4] ) ) / 4) ; 160 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 161 | 100*$bleu, 162 | 100*$bleu[1], 163 | 100*$bleu[2], 164 | 100*$bleu[3], 165 | 100*$bleu[4], 166 | $brevity_penalty, 167 | $length_translation / $length_reference, 168 | $length_translation, 169 | $length_reference; 170 | 171 | sub my_log { 172 | return -9999999999 unless $_[0]; 173 | return log($_[0]); 174 | } -------------------------------------------------------------------------------- /tasks/language_model/detokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def ptb_detokenizer(string): 5 | string = string.replace(" '", "'") 6 | string = string.replace(" \n", "\n") 7 | string = string.replace("\n ", "\n") 8 | string = string.replace(" n't", "n't") 9 | string = string.replace(" N ", "1 ") 10 | string = string.replace("$ 1", "$1") 11 | string = string.replace("# 1", "#1") 12 | return string 13 | 14 | 15 | def wikitext_detokenizer(string): 16 | # contractions 17 | string = string.replace("s '", "s'") 18 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 19 | # number separators 20 | string = string.replace(" @-@ ", "-") 21 | string = string.replace(" @,@ ", ",") 22 | string = string.replace(" @.@ ", ".") 23 | # punctuation 24 | string = string.replace(" : ", ": ") 25 | string = string.replace(" ; ", "; ") 26 | string = string.replace(" . ", ". ") 27 | string = string.replace(" ! ", "! ") 28 | string = string.replace(" ? ", "? 
") 29 | string = string.replace(" , ", ", ") 30 | # double brackets 31 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 32 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 33 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 34 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 35 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 36 | # miscellaneous 37 | string = string.replace("= = = =", "====") 38 | string = string.replace("= = =", "===") 39 | string = string.replace("= =", "==") 40 | string = string.replace(" " + chr(176) + " ", chr(176)) 41 | string = string.replace(" \n", "\n") 42 | string = string.replace("\n ", "\n") 43 | string = string.replace(" N ", " 1 ") 44 | string = string.replace(" 's", "'s") 45 | 46 | return string 47 | 48 | 49 | def lambada_detokenizer(string): 50 | return string 51 | 52 | 53 | def get_detokenizer(dataset): 54 | return DETOKENIZERS[dataset] 55 | 56 | 57 | DETOKENIZERS = { 58 | 'ptb': ptb_detokenizer, 59 | 'wikitext': wikitext_detokenizer, 60 | 'lambada': lambada_detokenizer, 61 | } 62 | -------------------------------------------------------------------------------- /tasks/seq2seq/finetune.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Race.""" 17 | import torch 18 | import mpu 19 | import json 20 | import functools 21 | from tasks.eval_utils import accuracy_func_provider 22 | from finetune_glm import finetune 23 | from pretrain_glm import get_batch 24 | from collections import OrderedDict 25 | from tasks.seq2seq.dataset import Seq2SeqDataset, BlankLMDataset, ExtractionDataset, CustomizationDataset 26 | from tasks.seq2seq.evaluate import rouge_metric, DecoderEvaluater, BlankLMEvaluater 27 | from tasks.superglue.evaluate import squad_exact_match, squad_f1 28 | 29 | global_tokenizer = None 30 | 31 | 32 | def seq2seq_forward_step(data, model, args, timers, mems): 33 | """Forward step.""" 34 | 35 | # Get the batch. 36 | if timers is not None: 37 | timers('batch generator').start() 38 | tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data, args) 39 | if timers is not None: 40 | timers('batch generator').stop() 41 | # Forward model. 
42 | logits, *mems = model(tokens, position_ids, attention_mask, *mems) 43 | # logits, loss_mask = logits[:, args.src_seq_length:], loss_mask[:, args.src_seq_length:] 44 | # target_ids = target_ids[:, args.src_seq_length:] 45 | losses = mpu.vocab_parallel_cross_entropy(logits.contiguous().float(), labels) 46 | if args.label_smoothing > 0.0: 47 | epsilon = args.label_smoothing 48 | smooth_loss = -torch.nn.functional.log_softmax(logits, dim=-1).mean(dim=-1) 49 | losses = (1 - epsilon) * losses + epsilon * smooth_loss 50 | loss_mask = loss_mask.reshape(-1) 51 | # The loss is not normalized for fair comparison 52 | loss = torch.sum(losses.reshape(-1) * loss_mask) / loss_mask.sum() 53 | return loss, mems, 'bert' 54 | 55 | 56 | def train_valid_datasets_provider(args, tokenizer): 57 | """Provide train and validation datasets.""" 58 | if args.task.lower() == 'blank': 59 | train_dataset = BlankLMDataset(args, split='train', tokenizer=tokenizer) 60 | valid_dataset = None 61 | elif args.task.lower() == 'extraction': 62 | train_dataset = ExtractionDataset(args, split='train', tokenizer=tokenizer) 63 | valid_dataset = None 64 | elif args.task.lower() == 'customization': 65 | train_dataset = CustomizationDataset(args, split='train', tokenizer=tokenizer) 66 | valid_dataset = None 67 | else: 68 | train_dataset = Seq2SeqDataset(args, split='train', tokenizer=tokenizer) 69 | valid_dataset = None 70 | global global_tokenizer 71 | global_tokenizer = tokenizer 72 | return train_dataset, valid_dataset 73 | 74 | 75 | def metrics_func_provider(args, tokenizer, is_test): 76 | """Provide metrics callback function.""" 77 | 78 | def single_dataset_provider(split): 79 | if args.task.lower() == 'blank': 80 | return BlankLMDataset(args, split=split, tokenizer=tokenizer) 81 | elif args.task.lower() == 'extraction': 82 | return ExtractionDataset(args, split=split, tokenizer=tokenizer) 83 | elif args.task.lower() == 'customization': 84 | return CustomizationDataset(args, split=split, tokenizer=tokenizer) 85 | else: 86 | return Seq2SeqDataset(args, split=split, tokenizer=tokenizer) 87 | 88 | if args.task.lower() in ['blank', 'extraction']: 89 | evaluater = BlankLMEvaluater(args, tokenizer) 90 | eval_func = evaluater.evaluate 91 | metric_dict = {} 92 | else: 93 | evaluater = DecoderEvaluater(args, tokenizer) 94 | eval_func = evaluater.evaluate 95 | if args.tokenizer_type == "BertWordPieceTokenizer": 96 | dataset = 'cnn_dm' 97 | elif args.task.lower() == 'gigaword': 98 | dataset = 'gigaword' 99 | else: 100 | dataset = 'cnn_dm_org' 101 | if args.task.lower() in ['squad', 'squad_v1']: 102 | metric_dict = {"EM": squad_exact_match, "F1": squad_f1} 103 | else: 104 | metric_dict = OrderedDict({"rouge-1": functools.partial(rouge_metric, metric="rouge-1", dataset=dataset), 105 | "rouge-2": functools.partial(rouge_metric, metric="rouge-2", dataset=dataset), 106 | "rouge-l": functools.partial(rouge_metric, metric="rouge-l", dataset=dataset)}) 107 | 108 | def output_func(predictions, examples, output_file): 109 | if args.task.lower() in ['squad', 'squad_v1']: 110 | with open(output_file, "w", encoding='utf-8') as output: 111 | res = {} 112 | for prediction, example in zip(predictions, examples): 113 | idx = example.idx 114 | if prediction.lower().replace(' ', '') == 'n/a': 115 | prediction = '' 116 | if idx not in res or res[idx] == '': 117 | res[idx] = prediction 118 | json.dump(res, output) 119 | with open(output_file + ".refs", "w", encoding='utf-8') as output: 120 | for prediction, example in zip(predictions, examples): 121 | res = 
{'id': example.idx, 'pred': prediction, 'gold': example.meta['answers']} 122 | output.write(json.dumps(res) + '\n') 123 | return 124 | with open(output_file + ".hyps", "w", encoding='utf-8') as output: 125 | for prediction in predictions: 126 | output.write(prediction) 127 | output.write("\n") 128 | with open(output_file + ".refs", "w", encoding='utf-8') as output: 129 | for example in examples: 130 | output.write(example.meta["ref"]) 131 | output.write("\n") 132 | if args.task.lower() == 'squad_generation': 133 | with open(output_file + ".source", "w", encoding='utf-8') as output: 134 | for example in examples: 135 | output.write(example.text_a.replace("\n", " ") + " Answer: " + example.meta["answer"]) 136 | output.write("\n") 137 | 138 | return accuracy_func_provider(single_dataset_provider, metric_dict, args, is_test=is_test, eval_func=eval_func, 139 | output_func=output_func, only_rank0=False) 140 | 141 | 142 | def main(args): 143 | if args.src_seq_length > args.max_position_embeddings: 144 | args.max_position_embeddings = args.src_seq_length 145 | if args.task.lower() in ['cnn_dm', 'cnn_dm_original', 'gigaword', 'blank', 'squad_generation', 'xsum', 146 | 'squad', 'squad_v1', 'extraction', 'cmrc', 'customization']: 147 | finetune(args, train_valid_datasets_provider, {}, end_of_epoch_callback_provider=metrics_func_provider, 148 | forward_step=seq2seq_forward_step) 149 | else: 150 | raise NotImplementedError(args.task) 151 | -------------------------------------------------------------------------------- /tasks/superglue/README.md: -------------------------------------------------------------------------------- 1 | # Use GLM for your NLU tasks 2 | To use GLM for your own NLU tasks, you should implement a subclass of `DataProcessor` in [tasks/superglue/dataset.py](dataset.py) and a subclass of `PVP` in [tasks/superglue/pvp.py](pvp.py). You should also specify the evaluation metrics for your task in [tasks/superglue/finetune.py](finetune.py) (see step 4 below). We will take the RTE and ReCoRD tasks in SuperGLUE as examples. 3 | 4 | ## 1. Design your patterns 5 | RTE is an NLI task in which the model is required to predict text entailment between a premise and a hypothesis. The label can be `entailment` or `not_entailment`. One sample from the training set is: 6 | ``` 7 | premise: No Weapons of Mass Destruction Found in Iraq Yet. 8 | hypothesis: Weapons of Mass Destruction Found in Iraq. 9 | label: not_entailment 10 | ``` 11 | We design the pattern as 12 | ``` 13 | "`hypothesis`"?, [MASK], "`premise`" 14 | ``` 15 | GLM predicts "Yes" for `entailment` and "No" for `not_entailment`. "Yes" and "No" are called the verbalizers for `entailment` and `not_entailment`. 16 | 17 | ReCoRD is a multi-choice QA task. Each example consists of a news article and a cloze-style question about the article in which one entity is masked out. The system must predict the masked-out entity from a list of possible entities in the provided passage. We directly adopt the cloze-style question as our pattern and use GLM to predict the masked entity. 18 | 19 | ## 2. Implement a subclass of `DataProcessor` 20 | A subclass of `DataProcessor` should implement `get_train_examples`, `get_dev_examples` and `get_test_examples`, which return the examples of the train, dev, and test sets. The returned value is a list of `InputExample`. It should also implement `get_labels` to return the list of possible labels.
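For the RTE sample shown in step 1, the processor would produce an object along these lines (a hypothetical instance written for illustration; the mapping of premise to `text_a` and hypothesis to `text_b` mirrors the `_create_examples` method shown just below):
```python
from tasks.data_utils import InputExample

InputExample(guid="train-0",
             text_a="No Weapons of Mass Destruction Found in Iraq Yet.",  # premise
             text_b="Weapons of Mass Destruction Found in Iraq.",         # hypothesis
             label="not_entailment", idx=0)
```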
Here we take the `RteProcessor` as an example: 21 | ```python 22 | class RteProcessor(DataProcessor): 23 | """Processor for the RTE data set.""" 24 | 25 | def get_train_examples(self, data_dir): 26 | return self._create_examples(os.path.join(data_dir, "train.jsonl"), "train") 27 | 28 | def get_dev_examples(self, data_dir, for_train=False): 29 | return self._create_examples(os.path.join(data_dir, "val.jsonl"), "dev") 30 | 31 | def get_test_examples(self, data_dir): 32 | return self._create_examples(os.path.join(data_dir, "test.jsonl"), "test") 33 | 34 | def get_unlabeled_examples(self, data_dir): 35 | return self._create_examples(os.path.join(data_dir, "unlabeled.jsonl"), "unlabeled") 36 | 37 | def get_labels(self): 38 | return ["entailment", "not_entailment"] 39 | 40 | def _create_examples(self, path: str, set_type: str, hypothesis_name: str = "hypothesis", 41 | premise_name: str = "premise") -> List[InputExample]: 42 | examples = [] 43 | 44 | with open(path, encoding='utf8') as f: 45 | for line_idx, line in enumerate(f): 46 | example_json = json.loads(line) 47 | idx = example_json['idx'] 48 | if isinstance(idx, str): 49 | try: 50 | idx = int(idx) 51 | except ValueError: 52 | idx = line_idx 53 | label = example_json.get('label') 54 | guid = "%s-%s" % (set_type, idx) 55 | text_a = example_json[premise_name] 56 | text_b = example_json[hypothesis_name] 57 | 58 | example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, idx=idx) 59 | examples.append(example) 60 | 61 | return examples 62 | ``` 63 | After that, you should add the implemented class to ``PROCESSORS`` at the end of [tasks/superglue/dataset.py](dataset.py): 64 | ```python 65 | PROCESSORS = { 66 | ... 67 | "rte": RteProcessor 68 | } 69 | ``` 70 | 71 | ## 3. Implement a subclass of `PVP` 72 | To implement a subclass of `PVP`, you should first decide whether your verbalizers are single-token or multi-token. The verbalizers in RTE, "Yes" and "No", are single-token. In contrast, the verbalizers in ReCoRD are multi-token, as an entity can be tokenized into multiple tokens by a WordPiece or BPE tokenizer. 73 | 74 | For a single-token task, you should set `is_multi_token=False` in the class definition. You should implement `get_parts` to return the inputs to GLM given an example and `verbalize` to return the verbalizer given a label. Take `RtePVP` as an example: 75 | ```python 76 | class RtePVP(PVP): 77 | is_multi_token = False 78 | VERBALIZER = { 79 | "not_entailment": [" No"], 80 | "entailment": [" Yes"] 81 | } 82 | 83 | @property 84 | def spell_length(self): 85 | return self.pattern_id 86 | 87 | def get_parts(self, example: InputExample) -> FilledPattern: 88 | # switch text_a and text_b to get the correct order 89 | text_a = example.text_a 90 | text_b = example.text_b.rstrip(string.punctuation) 91 | return ['"', self.shortenable(text_b), '" ?'], [[self.mask], ', "', self.shortenable(text_a), '"'] 92 | 93 | def verbalize(self, label) -> List[str]: 94 | return RtePVP.VERBALIZER[label] 95 | ``` 96 | We use `PVP.shortenable` to mark the segments that can be truncated when the input exceeds the maximum sequence length. 97 | 98 | For a multi-token task, you should set `is_multi_token=True` in the class definition. You should implement `get_parts` to return the inputs to GLM given an example and `get_answers` to return the candidates.
Take `RecordPVP` as an example: 99 | ```python 100 | class RecordPVP(PVP): 101 | is_multi_token = True 102 | 103 | def get_answers(self, example: InputExample): 104 | choices = example.meta['candidates'] 105 | choices = [" " + choice for choice in choices] 106 | return choices 107 | 108 | def get_parts(self, example: InputExample) -> FilledPattern: 109 | premise = self.shortenable(example.text_a) 110 | 111 | assert '@placeholder' in example.text_b, f'question "{example.text_b}" does not contain a @placeholder token' 112 | question_a, question_b = example.text_b.split('@placeholder') 113 | return [premise, " " + question_a.rstrip(), [self.mask], question_b], [] 114 | ``` 115 | After that, you should add the implemented class to `PVPS` at the end of [tasks/superglue/pvp.py](pvp.py): 116 | ```python 117 | PVPS = { 118 | ... 119 | 'rte': RtePVP, 120 | 'record': RecordPVP 121 | } 122 | ``` 123 | ## 4. Run the experiment 124 | To run the experiment for your new task, you should create a config file like [config_tasks/task_rte.sh](/config_tasks/task_rte.sh). You should also specify the evaluation metrics for the task in `DEFAULT_METRICS` of [tasks/superglue/finetune.py](finetune.py): 125 | ```python 126 | DEFAULT_METRICS = { 127 | ... 128 | "record": [("EM", qa_exact_match), ("F1", qa_f1)], 129 | "rte": [("accuracy", accuracy_metric)] 130 | } 131 | ``` 132 | Then you can run the experiment with [finetune_superglue.sh](/scripts/finetune_superglue.sh): 133 | ```shell 134 | bash scripts/finetune_superglue.sh \ 135 | config_tasks/model_blocklm_large.sh \ 136 | config_tasks/task_rte.sh 137 | ``` -------------------------------------------------------------------------------- /tasks/superglue/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM/4f61ed7237a3b0187f4d62062429348276a78c84/tasks/superglue/__init__.py -------------------------------------------------------------------------------- /tasks/superglue/evaluate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Official evaluation script for ReCoRD v1.0. 3 | (Some functions are adopted from the SQuAD evaluation script.)
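This module provides answer normalization, token-level F1 and exact-match scoring, and the qa_* / squad_* / multirc_em aggregation helpers that tasks/superglue/finetune.py registers as SuperGLUE metrics.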
4 | """ 5 | 6 | from __future__ import print_function 7 | from collections import Counter 8 | import string 9 | import re 10 | from tasks.data_utils import InputExample 11 | from typing import List 12 | import functools 13 | from collections import defaultdict 14 | import unidecode 15 | 16 | 17 | def normalize_answer(s): 18 | """Lower text and remove punctuation, articles and extra whitespace.""" 19 | 20 | def remove_articles(text): 21 | return re.sub(r'\b(a|an|the)\b', ' ', text) 22 | 23 | def white_space_fix(text): 24 | return ' '.join(text.split()) 25 | 26 | def remove_punc(text): 27 | exclude = set(string.punctuation) 28 | return ''.join(ch for ch in text if ch not in exclude) 29 | 30 | def lower(text): 31 | return unidecode.unidecode(text.lower()) 32 | 33 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 34 | 35 | 36 | def f1_score(prediction, ground_truth): 37 | prediction_tokens = normalize_answer(prediction).split() 38 | ground_truth_tokens = normalize_answer(ground_truth).split() 39 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 40 | num_same = sum(common.values()) 41 | if num_same == 0: 42 | return 0 43 | precision = 1.0 * num_same / len(prediction_tokens) 44 | recall = 1.0 * num_same / len(ground_truth_tokens) 45 | f1 = (2 * precision * recall) / (precision + recall) 46 | return f1 47 | 48 | 49 | def exact_match_score(prediction, ground_truth): 50 | return normalize_answer(prediction) == normalize_answer(ground_truth) 51 | 52 | 53 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 54 | if not ground_truths: 55 | return 0.0 56 | scores_for_ground_truths = [] 57 | for ground_truth in ground_truths: 58 | score = metric_fn(prediction, ground_truth) 59 | scores_for_ground_truths.append(score) 60 | return max(scores_for_ground_truths) 61 | 62 | 63 | def qa_evaluate(predictions, labels, examples: List[InputExample], metric): 64 | assert len(examples) == len(predictions) 65 | score = 0.0 66 | for example, prediction in zip(examples, predictions): 67 | ground_truths = example.meta["answers"] 68 | prediction = example.meta["candidates"][prediction] 69 | if ground_truths: 70 | score += metric_max_over_ground_truths(metric, prediction, ground_truths) 71 | score = 100.0 * score / len(predictions) 72 | return score 73 | 74 | 75 | def squad_evaluate(predictions, labels, examples, metric): 76 | assert len(examples) == len(predictions) 77 | score = 0.0 78 | idx2predictions = {} 79 | idx2ground_truths = {} 80 | for example, prediction in zip(examples, predictions): 81 | idx = example.idx 82 | if idx not in idx2predictions: 83 | idx2predictions[idx] = [] 84 | idx2ground_truths[idx] = example.meta["answers"] 85 | idx2predictions[idx].append(prediction) 86 | # assert len(predictions) == len(idx2predictions) 87 | for idx, predictions in idx2predictions.items(): 88 | prediction = 'N/A' 89 | for i in range(len(predictions)): 90 | prediction = predictions[i] 91 | if prediction.lower().replace(' ', '') == 'n/a': 92 | prediction = 'N/A' 93 | else: 94 | break 95 | ground_truths = idx2ground_truths[idx] 96 | if len(ground_truths) == 1 and ground_truths[0] == 'N/A': 97 | score += (prediction == 'N/A') 98 | else: 99 | score += metric_max_over_ground_truths(metric, prediction, ground_truths) 100 | score = 100.0 * score / len(idx2predictions) 101 | return score 102 | 103 | 104 | def multirc_em(predictions, labels, examples: List[InputExample]): 105 | """Compute the exact match (EM) for a sequence of predictions and actual labels""" 106 | question_ids 
= [example.meta["question_idx"] for example in examples] 107 | unique_questions = set(question_ids) 108 | 109 | q_actuals = list(zip(question_ids, labels)) 110 | q_predictions = list(zip(question_ids, predictions)) 111 | 112 | actuals_per_question = defaultdict(list) 113 | predictions_per_question = defaultdict(list) 114 | 115 | for qid, val in q_actuals: 116 | actuals_per_question[qid].append(val) 117 | for qid, val in q_predictions: 118 | predictions_per_question[qid].append(val) 119 | 120 | em = 0 121 | for qid in unique_questions: 122 | if actuals_per_question[qid] == predictions_per_question[qid]: 123 | em += 1 124 | em /= len(unique_questions) 125 | return em 126 | 127 | 128 | qa_exact_match = functools.partial(qa_evaluate, metric=exact_match_score) 129 | qa_f1 = functools.partial(qa_evaluate, metric=f1_score) 130 | 131 | squad_exact_match = functools.partial(squad_evaluate, metric=exact_match_score) 132 | squad_f1 = functools.partial(squad_evaluate, metric=f1_score) 133 | -------------------------------------------------------------------------------- /tasks/superglue/finetune.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
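# Module note (descriptive comment, not in the original file): this is the SuperGLUE finetuning entry point.
# It wires SuperGlueDataset / MultiChoiceDataset, the PVP patterns in PVPS, and the metric functions listed in
# DEFAULT_METRICS below into the generic finetune() loop imported from finetune_glm.py.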
15 | 16 | """Race.""" 17 | 18 | from collections import OrderedDict 19 | from finetune_glm import finetune 20 | from tasks.superglue.dataset import SuperGlueDataset, MultiChoiceDataset, PROCESSORS, get_output_func 21 | from tasks.superglue.dataset import CLASSIFICATION_DATASETS, MULTI_CHOICE_DATASETS 22 | from tasks.superglue.evaluate import qa_exact_match, qa_f1, multirc_em, squad_exact_match, squad_f1 23 | from tasks.superglue.pvp import PVPS 24 | from tasks.eval_utils import accuracy_func_provider 25 | from tasks.eval_utils import accuracy_metric, f1_macro_metric, f1_metric 26 | from glob import glob 27 | 28 | DEFAULT_METRICS = { 29 | "record": [("EM", qa_exact_match), ("F1", qa_f1)], 30 | "copa": [("accuracy", accuracy_metric)], 31 | "rte": [("accuracy", accuracy_metric)], 32 | "boolq": [("accuracy", accuracy_metric)], 33 | "wic": [("accuracy", accuracy_metric)], 34 | "wsc": [("accuracy", accuracy_metric)], 35 | "cb": [("accuracy", accuracy_metric), ("f1-macro", f1_macro_metric)], 36 | "multirc": [("f1a", f1_metric), ("em", multirc_em), ("acc", accuracy_metric)], 37 | "mnli": [("accuracy", accuracy_metric)], 38 | "sst2": [("accuracy", accuracy_metric)], 39 | "qnli": [("accuracy", accuracy_metric)], 40 | "qqp": [("accuracy", accuracy_metric)], 41 | "mrpc": [("accuracy", accuracy_metric)], 42 | "cola": [("accuracy", accuracy_metric)], 43 | "squad": [("accuracy", accuracy_metric)], 44 | "afqmc": [("accuracy", accuracy_metric)], 45 | "tnews": [("accuracy", accuracy_metric)], 46 | "cluewsc": [("accuracy", accuracy_metric)], 47 | "cmrc": [("accuracy", accuracy_metric)], 48 | "multichoice": [("accuracy", accuracy_metric)] 49 | } 50 | 51 | 52 | def train_valid_datasets_provider(args, tokenizer, pattern_text=False): 53 | """Provide train and validation datasets.""" 54 | task_name = args.task.lower() 55 | data_dir = args.data_dir 56 | train_dataset = SuperGlueDataset(args, task_name, data_dir, args.seq_length, "train", tokenizer, 57 | pattern_text=pattern_text) 58 | valid_dataset = SuperGlueDataset(args, task_name, data_dir, args.seq_length, "dev", tokenizer, for_train=True, 59 | pattern_text=pattern_text) 60 | 61 | return train_dataset, valid_dataset 62 | 63 | 64 | def metrics_func_provider(args, tokenizer, is_test): 65 | """Privde metrics callback function.""" 66 | 67 | def single_dataset_provider(split): 68 | if args.task == "multichoice": 69 | return MultiChoiceDataset(args, split, tokenizer, args.seq_length) 70 | else: 71 | return SuperGlueDataset(args, args.task.lower(), args.data_dir, args.seq_length, split, tokenizer) 72 | 73 | output_func = get_output_func(args.task.lower(), args) 74 | eval_func = None 75 | if args.task.lower() == 'wsc' and args.cloze_eval and not args.wsc_negative: 76 | from tasks.language_model.finetune import classify_evaluate 77 | eval_func = classify_evaluate 78 | metric_dict = OrderedDict(DEFAULT_METRICS[args.task.lower()]) 79 | return accuracy_func_provider(single_dataset_provider, metric_dict, args, is_test=is_test, eval_func=eval_func, 80 | output_func=output_func, only_rank0=False, tokenizer=tokenizer) 81 | 82 | 83 | def main(args): 84 | model_kwargs = {} 85 | if args.task.lower() != "multichoice": 86 | processor = PROCESSORS[args.task.lower()](args) 87 | pvp = PVPS[args.task.lower()](args, None, processor.get_labels(), args.seq_length, 88 | pattern_id=args.pattern_id, is_multi_token=args.multi_token, 89 | num_prompt_tokens=args.num_prompt_tokens) 90 | else: 91 | patterns = args.test_data 92 | datapaths = [] 93 | for pattern in patterns: 94 | for path in 
glob(pattern, recursive=True): 95 | datapaths.append(path) 96 | args.test_data = datapaths 97 | if args.continuous_prompt: 98 | model_kwargs["spell_length"] = pvp.spell_length 99 | if args.task.lower() == 'wsc' and args.cloze_eval and not args.wsc_negative: 100 | from tasks.language_model.finetune import lm_forward_step 101 | finetune(args, train_valid_datasets_provider, model_kwargs, 102 | end_of_epoch_callback_provider=metrics_func_provider, forward_step=lm_forward_step) 103 | else: 104 | if args.task.lower() == "multichoice": 105 | multi_token = True 106 | elif args.cloze_eval: 107 | multi_token = pvp.is_multi_token 108 | else: 109 | multi_token = args.task.lower() in MULTI_CHOICE_DATASETS 110 | args.multi_token = multi_token 111 | if not multi_token: 112 | model_kwargs["model_type"] = "multiple_choice" if args.cloze_eval else "classification" 113 | model_kwargs["multi_token"] = False 114 | model_kwargs["num_labels"] = len(processor.get_labels()) 115 | else: 116 | model_kwargs["model_type"] = "multiple_choice" 117 | model_kwargs["multi_token"] = True 118 | model_kwargs["num_labels"] = 1 119 | finetune(args, train_valid_datasets_provider, model_kwargs, 120 | end_of_epoch_callback_provider=metrics_func_provider) 121 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM/4f61ed7237a3b0187f4d62062429348276a78c84/test/__init__.py -------------------------------------------------------------------------------- /test/test_block.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from blocklm_utils import ConstructBlockStrategy 4 | from argparse import Namespace 5 | 6 | 7 | # rng = random.Random() 8 | # span_lengths = [2, 3, 4, 2, 3, 4] 9 | # length = 100 10 | # 11 | # counts = np.array([0] * length) 12 | # for _ in range(10000): 13 | # rng.shuffle(span_lengths) 14 | # spans = ConstructBlockStrategy.sample_spans(span_lengths, length, rng) 15 | # for start, end in spans: 16 | # counts[start: end] += 1 17 | # print(counts) 18 | def main(): 19 | args = Namespace() 20 | args.seq_length = 10 21 | args.eod_token = 0 22 | 23 | strategy = ConstructBlockStrategy(args, None, bert_ratio=0.4, max_seq_length=128) 24 | counts = np.array([0] * 10) 25 | for _ in range(10000): 26 | spans = strategy.sample_span_in_document(np.array([1, 2, 3, 0, 4, 5, 6, 7, 9, 0], dtype=np.long), [1, 1], 27 | random.Random()) 28 | for start, end in spans: 29 | counts[start: end] += 1 30 | 31 | print(counts) 32 | -------------------------------------------------------------------------------- /test/test_rel_shift.py: -------------------------------------------------------------------------------- 1 | # import torch 2 | # from mpu.transformer import GPT2ParallelSelfAttention 3 | # 4 | # b = torch.arange(2) * 1000 5 | # h = torch.arange(3) * 100 6 | # pos_seq = torch.arange(9, -1, -1) 7 | # query = torch.arange(7) * 10 8 | # s = pos_seq.unsqueeze(0) + query.unsqueeze(1) 9 | # s = b.view(-1, 1, 1, 1) + h.view(1, -1, 1, 1) + s 10 | # s = GPT2ParallelSelfAttention._rel_shift(s) 11 | # print(s) 12 | 13 | from torch.nn.modules import Linear 14 | from torch.optim import Adam 15 | from learning_rates import AnnealingLR 16 | import matplotlib.pyplot as plt 17 | import numpy as np 18 | 19 | 20 | def main(): 21 | model = Linear(10, 10) 22 | optimizer = Adam(model.parameters()) 23 | lr_scheduler = 
AnnealingLR(optimizer, 24 | start_lr=0.00015, 25 | warmup_iter=3000, 26 | num_iters=300000, 27 | decay_style='cosine', 28 | decay_ratio=0.1) 29 | steps = np.arange(0, 400000, 10, dtype=np.long) 30 | rates = [] 31 | for step in steps: 32 | lr_scheduler.num_iters = step 33 | rates.append(lr_scheduler.get_lr()) 34 | print(rates) 35 | plt.plot(steps, rates) 36 | plt.savefig("lr.pdf", format='pdf') --------------------------------------------------------------------------------