├── .gitignore ├── .pytorch_pretrained_bert ├── bert-base-uncased-vocab.txt ├── bert-large-uncased-vocab.txt ├── gpt2-merges.txt ├── gpt2-vocab.json ├── roberta-merges.txt └── roberta-vocab.json ├── LICENSE ├── README.md ├── arguments.py ├── blocklm_utils.py ├── change_mp.py ├── chinese_sentencepiece ├── cog-pretrain.model └── cog-pretrain.vocab ├── config ├── config_block_10B.json ├── config_block_10B_longer.json ├── config_block_base.json ├── config_block_large.json ├── config_block_large_chinese.json ├── config_blockta_large.json ├── ds_block_10B.sh ├── ds_block_10B_chinese.sh ├── ds_block_10B_chinese_longer.sh ├── ds_block_10B_longer.sh ├── ds_block_base.sh ├── ds_block_large.sh ├── ds_block_large_chinese.sh └── ds_blockta_large.sh ├── config_tasks ├── config_blocklm_10B.json ├── config_blocklm_10B_cnndm.json ├── config_blocklm_10B_record.json ├── model_blocklm_1.25_generation.sh ├── model_blocklm_1.5_generation.sh ├── model_blocklm_10B.sh ├── model_blocklm_10B_chinese.sh ├── model_blocklm_2B.sh ├── model_blocklm_base.sh ├── model_blocklm_large.sh ├── model_blocklm_large_chinese.sh ├── model_blocklm_large_generation.sh ├── model_blocklm_roberta_1.25.sh ├── model_blocklm_roberta_large.sh ├── seq_blank.sh ├── seq_cmrc.sh ├── seq_cnndm.sh ├── seq_cnndm_org.sh ├── seq_customization.sh ├── seq_gigaword.sh ├── seq_squad.sh ├── seq_squad_generation.sh ├── seq_squad_v1.sh ├── seq_xsum.sh ├── task_afqmc.sh ├── task_boolq.sh ├── task_cb.sh ├── task_cluewsc.sh ├── task_cmrc.sh ├── task_copa.sh ├── task_multirc.sh ├── task_record.sh ├── task_rte.sh ├── task_tnews.sh ├── task_wic.sh ├── task_wsc.sh ├── task_wsc_generative.sh ├── zero_lambada.sh ├── zero_lambada_uni.sh ├── zero_lm.sh ├── zero_lm_uni.sh └── zero_wikitext.sh ├── configure_data.py ├── data_utils ├── __init__.py ├── corpora.py ├── datasets.py ├── extraction.py ├── file_utils.py ├── lazy_loader.py ├── samplers.py ├── sp_tokenizer.py ├── tokenization.py ├── tokenization_gpt2.py └── wordpiece.py ├── docker ├── cuda102.dockerfile ├── cuda112.dockerfile ├── prepare.sh └── ssh-env-config.sh ├── examples └── README.md ├── finetune_glm.py ├── fp16 ├── __init__.py ├── fp16.py ├── fp16util.py └── loss_scaler.py ├── generate_samples.py ├── generation_utils.py ├── learning_rates.py ├── model ├── __init__.py ├── distributed.py ├── downstream.py ├── modeling_bert.py ├── modeling_glm.py └── prompt.py ├── mpu ├── __init__.py ├── cross_entropy.py ├── data.py ├── grads.py ├── initialize.py ├── layers.py ├── mappings.py ├── random.py ├── tests │ ├── __init__.py │ ├── commons.py │ ├── test_cross_entropy.py │ ├── test_data.py │ ├── test_initialize.py │ ├── test_layers.py │ └── test_random.py ├── transformer.py └── utils.py ├── pretrain_glm.py ├── process_grid.py ├── requirements.txt ├── run_test.py ├── scripts ├── convert_glm_checkpoint_to_transformers.py ├── dispatcher.py ├── ds_finetune_record.sh ├── ds_finetune_seq2seq.sh ├── ds_finetune_superglue.sh ├── ds_finetune_superglue_prompt.sh ├── ds_pretrain_nvidia.sh ├── evaluate_lm.sh ├── evaluate_multichoice.sh ├── evaluate_seq2seq.sh ├── finetune_blank.sh ├── finetune_seq2seq.sh ├── finetune_seq2seq_grid.sh ├── finetune_superglue.sh ├── finetune_superglue_fast.sh ├── finetune_superglue_grid.sh ├── generate_block.sh └── multi-bleu.perl ├── tasks ├── data_utils.py ├── eval_utils.py ├── language_model │ ├── dataset.py │ ├── detokenizer.py │ └── finetune.py ├── seq2seq │ ├── dataset.py │ ├── evaluate.py │ └── finetune.py └── superglue │ ├── README.md │ ├── __init__.py │ ├── dataset.py │ ├── evaluate.py │ ├── 
finetune.py │ └── pvp.py ├── test ├── __init__.py ├── test_block.py └── test_rel_shift.py ├── train_utils.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | /.ipynb_checkpoints/ 3 | .DS_Store 4 | *.pyc 5 | logs 6 | runs 7 | settings.json 8 | .gitignore 9 | .vscode/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 THUDM 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /change_mp.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import torch 4 | import copy 5 | 6 | checkpoint = sys.argv[1] 7 | target_mp = int(sys.argv[2]) 8 | 9 | assert os.path.isdir(checkpoint) 10 | iteration_file = os.path.join(checkpoint, 'latest_checkpointed_iteration.txt') 11 | if not os.path.exists(iteration_file): 12 | iteration_file = os.path.join(checkpoint, 'latest') 13 | if os.path.exists(iteration_file): 14 | with open(iteration_file) as fin: 15 | iteration = int(fin.read().strip()) 16 | checkpoint = os.path.join(checkpoint, str(iteration)) 17 | else: 18 | iteration = None 19 | 20 | filenames = os.listdir(checkpoint) 21 | filenames = [filename for filename in filenames if filename.startswith("mp_rank_")] 22 | filenames = sorted(filenames, 23 | key=lambda x: int(x.split('_')[2])) 24 | filenames = [os.path.join(checkpoint, x) for x in filenames] 25 | 26 | if target_mp == len(filenames): 27 | print("MP size keeps the same.") 28 | exit(0) 29 | 30 | if sys.argv[1][-1] == '/': 31 | new_checkpoint = sys.argv[1][:-1] + '_MP' + sys.argv[2] 32 | else: 33 | new_checkpoint = sys.argv[1] + '_MP' + sys.argv[2] 34 | if not os.path.exists(new_checkpoint): 35 | os.mkdir(new_checkpoint) 36 | if iteration is not None: 37 | with open(os.path.join(new_checkpoint, 'latest_checkpointed_iteration.txt'), 'w') as fout: 38 | fout.write("{}\n".format(iteration)) 39 | with open(os.path.join(new_checkpoint, 'latest'), 'w') as fout: 40 | fout.write("{}\n".format(iteration)) 41 | new_checkpoint = os.path.join(new_checkpoint, str(iteration)) 42 | if not os.path.exists(new_checkpoint): 43 | os.mkdir(new_checkpoint) 44 | 45 | preserve_keys = [ 46 | "lr_scheduler", 47 | "skipped_steps", 48 | "global_steps", 49 | 
"global_samples", 50 | "dp_world_size", 51 | "iteration", 52 | "client_lr_scheduler", 53 | "np_rng_state", 54 | "random_rng_state", 55 | "torch_rng_state", 56 | "cuda_rng_state", 57 | "rng_tracker_states", 58 | 59 | ] 60 | 61 | if target_mp < len(filenames): 62 | print("Decrease MP size.") 63 | assert len(filenames) % target_mp == 0 64 | ratio = len(filenames) // target_mp 65 | for i in range(target_mp): 66 | start = ratio * i 67 | end = ratio * (i + 1) 68 | d = torch.load(filenames[start], 69 | map_location='cpu') 70 | for k in d.keys(): 71 | if k != 'module': 72 | if k in preserve_keys: 73 | pass 74 | elif k == "mp_world_size": 75 | d[k] = target_mp 76 | else: 77 | d[k] = None 78 | for j in range(start + 1, end): 79 | d_new = torch.load(filenames[j], 80 | map_location='cpu') 81 | for k, v in d_new['module'].items(): 82 | assert len(v.shape) < 3 83 | if len(v.shape) == 2 and 'position' not in k: 84 | if 'query' in k: 85 | size_1 = d['module'][k].shape[0] // 3 86 | size_2 = v.shape[0] // 3 87 | target = d['module'][k] 88 | d['module'][k] = torch.cat([ 89 | target[:size_1, :], v[:size_2, :], 90 | target[size_1:size_1 * 2, :], v[size_2:size_2 * 2, :], 91 | target[size_1 * 2:, :], v[size_2 * 2:, :]], 0) 92 | elif 'word' in k or 'h_to_4h' in k or 'relative' in k or "r_w_bias" in k or "r_r_bias" in k: 93 | d['module'][k] = torch.cat([d['module'][k], v], 0) 94 | else: 95 | d['module'][k] = torch.cat([d['module'][k], v], 1) 96 | elif len(v.shape) == 1 and 'query_key_value' in k: 97 | size_1 = d['module'][k].shape[0] // 3 98 | size_2 = v.shape[0] // 3 99 | target = d['module'][k] 100 | d['module'][k] = torch.cat([ 101 | target[:size_1], v[:size_2], 102 | target[size_1:size_1 * 2], v[size_2:size_2 * 2], 103 | target[size_1 * 2:], v[size_2 * 2:]], 0) 104 | elif len(v.shape) == 1 and ('dense_h_to_4h' in k or "attention.relative" in k): 105 | d['module'][k] = torch.cat([d['module'][k], v], 0) 106 | filename = os.path.join(new_checkpoint, "mp_rank_{:02d}_model_states.pt".format(i)) 107 | torch.save(d, filename) 108 | 109 | if target_mp > len(filenames): 110 | print("Increase MP size.") 111 | assert target_mp % len(filenames) == 0 112 | ratio = target_mp // len(filenames) 113 | for i in range(len(filenames)): 114 | start = ratio * i 115 | end = ratio * (i + 1) 116 | d = torch.load(filenames[i], 117 | map_location='cpu') 118 | for j in range(start, end): 119 | d_new = {} 120 | shift = j - start 121 | for k, v in d.items(): 122 | if k != 'module': 123 | if k in preserve_keys: 124 | d_new[k] = copy.deepcopy(d[k]) 125 | elif k == "mp_world_size": 126 | d_new[k] = target_mp 127 | else: 128 | d_new[k] = None 129 | d_new['module'] = {} 130 | with torch.no_grad(): 131 | for k, v in d['module'].items(): 132 | assert len(v.shape) < 3 133 | if len(v.shape) == 2 and 'position' not in k: 134 | if 'query' in k: 135 | part = v.shape[0] // ratio // 3 136 | d_new['module'][k] = torch.cat([v[shift * part:(shift + 1) * part, :].clone(), 137 | v[(shift + ratio) * part:(shift + 1 + ratio) * part, 138 | :].clone(), 139 | v[(shift + 2 * ratio) * part:(shift + 1 + 2 * ratio) * part, 140 | :].clone()], 0) 141 | elif 'word' in k or 'h_to_4h' in k or 'relative' in k or "r_w_bias" in k or "r_r_bias" in k: 142 | part = v.shape[0] // ratio 143 | d_new['module'][k] = v[shift * part:(shift + 1) * part, :].clone() 144 | else: 145 | part = v.shape[1] // ratio 146 | d_new['module'][k] = v[:, shift * part:(shift + 1) * part].clone() 147 | elif len(v.shape) == 1 and ('dense_h_to_4h' in k or "attention.relative" in k): 148 | part = 
v.shape[0] // ratio 149 | d_new['module'][k] = v[shift * part:(shift + 1) * part].clone() 150 | elif len(v.shape) == 1 and 'query_key_value' in k: 151 | part = v.shape[0] // ratio // 3 152 | d_new['module'][k] = torch.cat( 153 | [v[shift * part:(shift + 1) * part].clone(), 154 | v[(shift + ratio) * part:(shift + 1 + ratio) * part].clone(), 155 | v[(shift + 2 * ratio) * part:(shift + 1 + 2 * ratio) * part].clone()], 0) 156 | else: 157 | d_new['module'][k] = v.clone() 158 | filename = os.path.join(new_checkpoint, "mp_rank_{:02d}_model_states.pt".format(j)) 159 | torch.save(d_new, filename) 160 | -------------------------------------------------------------------------------- /chinese_sentencepiece/cog-pretrain.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM/4f61ed7237a3b0187f4d62062429348276a78c84/chinese_sentencepiece/cog-pretrain.model -------------------------------------------------------------------------------- /config/config_block_10B.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 21, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 50, 5 | "gradient_clipping": 1.0, 6 | "zero_optimization": { 7 | "stage": 2, 8 | "contiguous_gradients": false, 9 | "overlap_comm": true, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": 50000000, 12 | "allgather_bucket_size": 500000000 13 | }, 14 | "zero_allow_untested_optimizer": true, 15 | "fp16": { 16 | "enabled": true, 17 | "loss_scale": 0, 18 | "loss_scale_window": 1000, 19 | "hysteresis": 2, 20 | "min_loss_scale": 1 21 | }, 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.0001, 26 | "betas": [ 27 | 0.9, 28 | 0.95 29 | ], 30 | "eps": 1e-8, 31 | "weight_decay": 1e-1 32 | } 33 | }, 34 | "activation_checkpointing": { 35 | "partition_activations": false, 36 | "contiguous_memory_optimization": false 37 | }, 38 | "wall_clock_breakdown": false 39 | } -------------------------------------------------------------------------------- /config/config_block_10B_longer.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 8, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 50, 5 | "gradient_clipping": 1.0, 6 | "zero_optimization": { 7 | "stage": 2, 8 | "contiguous_gradients": true, 9 | "overlap_comm": true, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": 50000000, 12 | "allgather_bucket_size": 500000000 13 | }, 14 | "zero_allow_untested_optimizer": true, 15 | "fp16": { 16 | "enabled": true, 17 | "loss_scale": 0, 18 | "loss_scale_window": 1000, 19 | "hysteresis": 2, 20 | "min_loss_scale": 1 21 | }, 22 | "optimizer": { 23 | "type": "Adam", 24 | "params": { 25 | "lr": 0.00003, 26 | "betas": [ 27 | 0.9, 28 | 0.95 29 | ], 30 | "eps": 1e-8, 31 | "weight_decay": 1e-1 32 | } 33 | }, 34 | "activation_checkpointing": { 35 | "partition_activations": false, 36 | "contiguous_memory_optimization": false 37 | }, 38 | "wall_clock_breakdown": false 39 | } -------------------------------------------------------------------------------- /config/config_block_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 16, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 100, 5 | "gradient_clipping": 1.0, 6 | "fp16": { 7 | "enabled": true, 8 | "loss_scale": 0, 9 | "loss_scale_window": 1000, 10 | 
"hysteresis": 2, 11 | "min_loss_scale": 1 12 | }, 13 | "optimizer": { 14 | "type": "Adam", 15 | "params": { 16 | "lr": 0.0004, 17 | "weight_decay": 0.1, 18 | "betas": [ 19 | 0.9, 20 | 0.98 21 | ], 22 | "eps": 1e-6 23 | } 24 | }, 25 | "activation_checkpointing": { 26 | "partition_activations": false, 27 | "contiguous_memory_optimization": false 28 | }, 29 | "wall_clock_breakdown": false 30 | } -------------------------------------------------------------------------------- /config/config_block_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 16, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 100, 5 | "gradient_clipping": 1.0, 6 | "fp16": { 7 | "enabled": true, 8 | "loss_scale": 0, 9 | "loss_scale_window": 1000, 10 | "hysteresis": 2, 11 | "min_loss_scale": 1 12 | }, 13 | "optimizer": { 14 | "type": "Adam", 15 | "params": { 16 | "lr": 0.0002, 17 | "weight_decay": 0.1, 18 | "betas": [ 19 | 0.9, 20 | 0.98 21 | ], 22 | "eps": 1e-6 23 | } 24 | }, 25 | "activation_checkpointing": { 26 | "partition_activations": false, 27 | "contiguous_memory_optimization": false 28 | }, 29 | "wall_clock_breakdown": false 30 | } -------------------------------------------------------------------------------- /config/config_block_large_chinese.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 32, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 100, 5 | "gradient_clipping": 1.0, 6 | "fp16": { 7 | "enabled": true, 8 | "loss_scale": 0, 9 | "loss_scale_window": 1000, 10 | "hysteresis": 2, 11 | "min_loss_scale": 1 12 | }, 13 | "optimizer": { 14 | "type": "Adam", 15 | "params": { 16 | "lr": 0.0004, 17 | "weight_decay": 0.01, 18 | "betas": [ 19 | 0.9, 20 | 0.98 21 | ], 22 | "eps": 1e-6 23 | } 24 | }, 25 | "activation_checkpointing": { 26 | "partition_activations": false, 27 | "contiguous_memory_optimization": false 28 | }, 29 | "wall_clock_breakdown": false 30 | } -------------------------------------------------------------------------------- /config/config_blockta_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 16, 3 | "gradient_accumulation_steps": 8, 4 | "steps_per_print": 100, 5 | "gradient_clipping": 1.0, 6 | "fp16": { 7 | "enabled": true, 8 | "loss_scale": 0, 9 | "loss_scale_window": 1000, 10 | "hysteresis": 2, 11 | "min_loss_scale": 1 12 | }, 13 | "optimizer": { 14 | "type": "Adam", 15 | "params": { 16 | "lr": 0.0004, 17 | "weight_decay": 0.01, 18 | "betas": [ 19 | 0.9, 20 | 0.98 21 | ], 22 | "eps": 1e-6 23 | } 24 | }, 25 | "activation_checkpointing": { 26 | "partition_activations": false, 27 | "contiguous_memory_optimization": false 28 | }, 29 | "wall_clock_breakdown": false 30 | } -------------------------------------------------------------------------------- /config/ds_block_10B.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | config_json="$script_dir/config_block_10B.json" 7 | gpt_options=" \ 8 | --block-lm \ 9 | --task-mask \ 10 | --bert-prob 0.5 \ 11 | --gap-sentence-prob 0.3 \ 12 | --avg-block-length 3 \ 13 | --gpt-min-ratio 0.25 \ 14 | --block-mask-prob 0.1 \ 15 | --short-seq-prob 0.02 \ 16 | --experiment-name blocklm-10b \ 17 | --model-parallel-size ${MP_SIZE} \ 18 | --num-layers 48 \ 19 | --hidden-size 4096 \ 20 | --num-attention-heads 64 \ 21 | --seq-length 512 \ 22 | --max-position-embeddings 1024 \ 23 | --save /dataset/fd5061f6/english_data/checkpoints \ 24 | --log-interval 50 \ 25 | --eval-interval 1000 \ 26 | --save-interval 2000 \ 27 | --train-iters 250000 \ 28 | --train-data pile cc-news \ 29 | --resume-dataloader \ 30 | --filter-english \ 31 | --loader-scatter 32 \ 32 | --tokenizer-type GPT2BPETokenizer \ 33 | --split 949,50,1 \ 34 | --distributed-backend nccl \ 35 | --lr-decay-style cosine \ 36 | --lr-decay-ratio 0.1 \ 37 | --lr-decay-iters 175000 \ 38 | --warmup 0.04 \ 39 | --checkpoint-activations \ 40 | --deepspeed-activation-checkpointing \ 41 | --fp16 \ 42 | " 43 | gpt_options="${gpt_options} 44 | --deepspeed \ 45 | --deepspeed_config ${config_json} \ 46 | " -------------------------------------------------------------------------------- /config/ds_block_10B_chinese.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | config_json="$script_dir/config_block_10B.json" 7 | gpt_options=" \ 8 | --block-lm \ 9 | --task-mask \ 10 | --bert-prob 0.5 \ 11 | --gap-sentence-prob 0.3 \ 12 | --avg-block-length 3 \ 13 | --gpt-min-ratio 0.25 \ 14 | --block-mask-prob 0.1 \ 15 | --short-seq-prob 0.02 \ 16 | --experiment-name blocklm-10b-chinese \ 17 | --model-parallel-size ${MP_SIZE} \ 18 | --num-layers 48 \ 19 | --hidden-size 4096 \ 20 | --num-attention-heads 64 \ 21 | --seq-length 512 \ 22 | --max-position-embeddings 1024 \ 23 | --save /dataset/fd5061f6/english_data/checkpoints \ 24 | --load /dataset/fd5061f6/english_data/checkpoints/blocklm-10b-chinese07-08-15-28 \ 25 | --log-interval 50 \ 26 | --eval-interval 1000 \ 27 | --save-interval 2000 \ 28 | --train-iters 150000 \ 29 | --train-data wudao baike zhihu \ 30 | --resume-dataloader \ 31 | --loader-scatter 32 \ 32 | --no-lazy-loader \ 33 | --tokenizer-type ChineseSPTokenizer \ 34 | --split 949,50,1 \ 35 | --distributed-backend nccl \ 36 | --lr-decay-style cosine \ 37 | --lr-decay-ratio 0.1 \ 38 | --lr-decay-iters 120000 \ 39 | --warmup 0.04 \ 40 | --checkpoint-activations \ 41 | --deepspeed-activation-checkpointing \ 42 | --fp16 \ 43 | " 44 | gpt_options="${gpt_options} 45 | --deepspeed \ 46 | --deepspeed_config ${config_json} \ 47 | " -------------------------------------------------------------------------------- /config/ds_block_10B_chinese_longer.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | config_json="$script_dir/config_block_10B_longer.json" 7 | gpt_options=" \ 8 | --block-lm \ 9 | --task-mask \ 10 | --bert-prob 0.5 \ 11 | --gap-sentence-prob 0.3 \ 12 | --avg-block-length 3 \ 13 | --gpt-min-ratio 0.25 \ 14 | --block-mask-prob 0.1 \ 15 | --short-seq-prob 0.5 \ 16 | --experiment-name blocklm-10b-chinese \ 17 | --model-parallel-size ${MP_SIZE} \ 18 | --num-layers 48 \ 19 | --hidden-size 4096 \ 20 | --num-attention-heads 64 \ 21 | --seq-length 1024 \ 22 | --max-position-embeddings 1024 \ 23 | --save /dataset/fd5061f6/english_data/checkpoints \ 24 | --load /dataset/fd5061f6/english_data/checkpoints/blocklm-10b-chinese07-08-15-28 \ 25 | --no-load-lr-scheduler \ 26 | --log-interval 50 \ 27 | --eval-interval 1000 \ 28 | --save-interval 2000 \ 29 | --train-iters 150000 \ 30 | --train-data wudao baike zhihu \ 31 | --resume-dataloader \ 32 | --loader-scatter 32 \ 33 | --no-lazy-loader \ 34 | --tokenizer-type ChineseSPTokenizer \ 35 | --split 949,50,1 \ 36 | --distributed-backend nccl \ 37 | --lr-decay-style cosine \ 38 | --lr-decay-ratio 0.1 \ 39 | --lr-decay-iters 20000 \ 40 | --warmup 0.025 \ 41 | --checkpoint-activations \ 42 | --deepspeed-activation-checkpointing \ 43 | --fp16 \ 44 | " 45 | gpt_options="${gpt_options} 46 | --deepspeed \ 47 | --deepspeed_config ${config_json} \ 48 | " -------------------------------------------------------------------------------- /config/ds_block_10B_longer.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | config_json="$script_dir/config_block_10B_longer.json" 7 | gpt_options=" \ 8 | --block-lm \ 9 | --task-mask \ 10 | --bert-prob 0.4 \ 11 | --gap-sentence-prob 0.3 \ 12 | --single-span-prob 0.05 \ 13 | --avg-block-length 3 \ 14 | --gpt-min-ratio 0.25 \ 15 | --block-mask-prob 0.1 \ 16 | --short-seq-prob 0.5 \ 17 | --experiment-name blocklm-10b \ 18 | --model-parallel-size ${MP_SIZE} \ 19 | --num-layers 48 \ 20 | --hidden-size 4096 \ 21 | --num-attention-heads 64 \ 22 | --seq-length 1024 \ 23 | --max-position-embeddings 1024 \ 24 | --save /mnt/model_checkpoints \ 25 | --load /cache/blocklm-10b-512 \ 26 | --no-load-lr-scheduler \ 27 | --log-interval 25 \ 28 | --train-iters 250000 \ 29 | --train-data pile cc-news \ 30 | --resume-dataloader \ 31 | --filter-english \ 32 | --loader-scatter 32 \ 33 | --no-lazy-loader \ 34 | --tokenizer-type GPT2BPETokenizer \ 35 | --split 949,50,1 \ 36 | --distributed-backend nccl \ 37 | --lr-decay-style linear \ 38 | --lr-decay-ratio 0.1 \ 39 | --lr-decay-iters 50000 \ 40 | --warmup 0.005 \ 41 | --checkpoint-activations \ 42 | --deepspeed-activation-checkpointing \ 43 | --fp16 \ 44 | " 45 | gpt_options="${gpt_options} 46 | --deepspeed \ 47 | --deepspeed_config ${config_json} \ 48 | " -------------------------------------------------------------------------------- /config/ds_block_base.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | config_json="$script_dir/config_block_base.json" 7 | gpt_options=" \ 8 | --block-lm \ 9 | --bert-prob 1.0 \ 10 | --experiment-name blocklm-blank \ 11 | --model-parallel-size ${MP_SIZE} \ 12 | --num-layers 12 \ 13 | --hidden-size 768 \ 14 | --num-attention-heads 12 \ 15 | --seq-length 512 \ 16 | --max-position-embeddings 512 \ 17 | --save /root/data/checkpoints \ 18 | --train-iters 150000 \ 19 | --resume-dataloader \ 20 | --train-data bert-base \ 21 | --lazy-loader \ 22 | --tokenizer-type BertWordPieceTokenizer \ 23 | --tokenizer-model-type bert-base-uncased \ 24 | --split 949,50,1 \ 25 | --distributed-backend nccl \ 26 | --lr-decay-style cosine \ 27 | --lr-decay-iters 120000 \ 28 | --lr-decay-ratio 0.05 \ 29 | --warmup .05 \ 30 | --checkpoint-activations \ 31 | --deepspeed-activation-checkpointing \ 32 | --fp16 \ 33 | " 34 | gpt_options="${gpt_options} 35 | --deepspeed \ 36 | --deepspeed_config ${config_json} \ 37 | " 38 | -------------------------------------------------------------------------------- /config/ds_block_large.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | config_json="$script_dir/config_block_large.json" 7 | gpt_options=" \ 8 | --block-lm \ 9 | --bert-prob 1.0 \ 10 | --avg-block-length 3 \ 11 | --experiment-name blocklm-large-blank \ 12 | --model-parallel-size ${MP_SIZE} \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 512 \ 17 | --max-position-embeddings 512 \ 18 | --save /root/data/checkpoints \ 19 | --train-iters 200000 \ 20 | --resume-dataloader \ 21 | --train-data bert-large \ 22 | --tokenizer-type BertWordPieceTokenizer \ 23 | --tokenizer-model-type bert-large-uncased \ 24 | --split 949,50,1 \ 25 | --distributed-backend nccl \ 26 | --lr-decay-style cosine \ 27 | --lr-decay-iters 160000 \ 28 | --lr-decay-ratio 0.05 \ 29 | --warmup .05 \ 30 | --checkpoint-activations \ 31 | --deepspeed-activation-checkpointing \ 32 | --fp16 \ 33 | " 34 | gpt_options="${gpt_options} 35 | --deepspeed \ 36 | --deepspeed_config ${config_json} \ 37 | " -------------------------------------------------------------------------------- /config/ds_block_large_chinese.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | config_json="$script_dir/config_block_large_chinese.json" 7 | gpt_options=" \ 8 | --block-lm \ 9 | --task-mask \ 10 | --bert-prob 0.4 \ 11 | --gap-sentence-prob 0.3 \ 12 | --avg-block-length 3 \ 13 | --gpt-min-ratio 0.25 \ 14 | --block-mask-prob 0.1 \ 15 | --short-seq-prob 0.02 \ 16 | --experiment-name blocklm-large-chinese \ 17 | --model-parallel-size ${MP_SIZE} \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --num-attention-heads 16 \ 21 | --seq-length 512 \ 22 | --max-position-embeddings 1024 \ 23 | --save /dataset/fd5061f6/english_data/checkpoints \ 24 | --load /dataset/fd5061f6/english_data/checkpoints/blocklm-large-chinese08-12-12-48 \ 25 | --log-interval 50 \ 26 | --eval-interval 1000 \ 27 | --save-interval 2000 \ 28 | --train-iters 250000 \ 29 | --train-data wudao baike zhihu \ 30 | --resume-dataloader \ 31 | --loader-scatter 32 \ 32 | --no-lazy-loader \ 33 | --tokenizer-type ChineseSPTokenizer \ 34 | --fix-command-token \ 35 | --split 949,50,1 \ 36 | --distributed-backend nccl \ 37 | --lr-decay-style cosine \ 38 | --lr-decay-ratio 0.1 \ 39 | --lr-decay-iters 200000 \ 40 | --warmup 0.04 \ 41 | --checkpoint-activations \ 42 | --deepspeed-activation-checkpointing \ 43 | --fp16 \ 44 | " 45 | gpt_options="${gpt_options} 46 | --deepspeed \ 47 | --deepspeed_config ${config_json} \ 48 | " -------------------------------------------------------------------------------- /config/ds_blockta_large.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | config_json="$script_dir/config_blockta_large.json" 7 | gpt_options=" \ 8 | --block-lm \ 9 | --task-mask \ 10 | --bert-prob 1.0 \ 11 | --avg-block-length 3 \ 12 | --experiment-name blocklm-roberta-large-blank \ 13 | --model-parallel-size ${MP_SIZE} \ 14 | --num-layers 24 \ 15 | --hidden-size 1024 \ 16 | --num-attention-heads 16 \ 17 | --seq-length 512 \ 18 | --max-position-embeddings 512 \ 19 | --save /dataset/fd5061f6/english_data/checkpoints \ 20 | --save-interval 2500 \ 21 | --train-iters 500000 \ 22 | --resume-dataloader \ 23 | --train-data wikibook cc-news openwebtext \ 24 | --shuffle \ 25 | --tokenizer-type GPT2BPETokenizer \ 26 | --tokenizer-model-type roberta \ 27 | --split 949,50,1 \ 28 | --distributed-backend nccl \ 29 | --lr-decay-style linear \ 30 | --lr-decay-iters 500000 \ 31 | --lr-decay-ratio 0.025 \ 32 | --warmup .06 \ 33 | --checkpoint-activations \ 34 | --deepspeed-activation-checkpointing \ 35 | --fp16 \ 36 | " 37 | gpt_options="${gpt_options} 38 | --deepspeed \ 39 | --deepspeed_config ${config_json} \ 40 | " -------------------------------------------------------------------------------- /config_tasks/config_blocklm_10B.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 4, 3 | "gradient_accumulation_steps": 2, 4 | "steps_per_print": 50, 5 | "gradient_clipping": 1.0, 6 | "zero_optimization": { 7 | "stage": 2, 8 | "contiguous_gradients": false, 9 | "overlap_comm": true, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": 5e7, 12 | "allgather_bucket_size": 5e7, 13 | "cpu_offload": true 14 | }, 15 | "zero_allow_untested_optimizer": true, 16 | "fp16": { 17 | "enabled": true, 18 | "loss_scale": 0, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "optimizer": { 24 
| "type": "Adam", 25 | "params": { 26 | "lr": 5e-6, 27 | "betas": [ 28 | 0.9, 29 | 0.95 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": 1e-2 33 | } 34 | }, 35 | "activation_checkpointing": { 36 | "partition_activations": false, 37 | "contiguous_memory_optimization": false 38 | }, 39 | "wall_clock_breakdown": false 40 | } -------------------------------------------------------------------------------- /config_tasks/config_blocklm_10B_cnndm.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 4, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 50, 5 | "gradient_clipping": 1.0, 6 | "zero_optimization": { 7 | "stage": 2, 8 | "contiguous_gradients": false, 9 | "overlap_comm": true, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": 5e7, 12 | "allgather_bucket_size": 5e7, 13 | "cpu_offload": true 14 | }, 15 | "zero_allow_untested_optimizer": true, 16 | "fp16": { 17 | "enabled": true, 18 | "loss_scale": 0, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 5e-6, 27 | "betas": [ 28 | 0.9, 29 | 0.95 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": 1e-2 33 | } 34 | }, 35 | "activation_checkpointing": { 36 | "partition_activations": false, 37 | "contiguous_memory_optimization": false 38 | }, 39 | "wall_clock_breakdown": false 40 | } -------------------------------------------------------------------------------- /config_tasks/config_blocklm_10B_record.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 1, 3 | "gradient_accumulation_steps": 8, 4 | "steps_per_print": 50, 5 | "gradient_clipping": 1.0, 6 | "zero_optimization": { 7 | "stage": 2, 8 | "contiguous_gradients": false, 9 | "overlap_comm": true, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": 5e7, 12 | "allgather_bucket_size": 5e7, 13 | "cpu_offload": true 14 | }, 15 | "zero_allow_untested_optimizer": true, 16 | "fp16": { 17 | "enabled": true, 18 | "loss_scale": 0, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "optimizer": { 24 | "type": "Adam", 25 | "params": { 26 | "lr": 5e-6, 27 | "betas": [ 28 | 0.9, 29 | 0.95 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": 1e-2 33 | } 34 | }, 35 | "activation_checkpointing": { 36 | "partition_activations": false, 37 | "contiguous_memory_optimization": false 38 | }, 39 | "wall_clock_breakdown": false 40 | } -------------------------------------------------------------------------------- /config_tasks/model_blocklm_1.25_generation.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="blocklm-1.25-generation" 2 | MODEL_ARGS="--block-lm \ 3 | --cloze-eval \ 4 | --num-layers 30 \ 5 | --hidden-size 1024 \ 6 | --num-attention-heads 16 \ 7 | --max-position-embeddings 512 \ 8 | --tokenizer-model-type bert-large-uncased \ 9 | --tokenizer-type BertWordPieceTokenizer \ 10 | --load-pretrained ${CHECKPOINT_PATH}/blocklm-1.25-generation" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_1.5_generation.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="blocklm-1.5-generation" 2 | MODEL_ARGS="--block-lm \ 3 | --cloze-eval \ 4 | --num-layers 30 \ 5 | --hidden-size 1152 \ 6 | --num-attention-heads 18 \ 7 | --max-position-embeddings 512 \ 8 | 
--tokenizer-model-type bert-large-uncased \ 9 | --tokenizer-type BertWordPieceTokenizer \ 10 | --load-pretrained ${CHECKPOINT_PATH}/blocklm-1.5-generation" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_10B.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="GLM-10B" 2 | MODEL_ARGS="--block-lm \ 3 | --cloze-eval \ 4 | --task-mask \ 5 | --num-layers 48 \ 6 | --hidden-size 4096 \ 7 | --num-attention-heads 64 \ 8 | --max-position-embeddings 1024 \ 9 | --tokenizer-type GPT2BPETokenizer \ 10 | --load-pretrained ${CHECKPOINT_PATH}/glm-10b-1024" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_10B_chinese.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="GLM-10B-chinese" 2 | MODEL_ARGS="--block-lm \ 3 | --cloze-eval \ 4 | --task-mask \ 5 | --num-layers 48 \ 6 | --hidden-size 4096 \ 7 | --num-attention-heads 64 \ 8 | --max-position-embeddings 1024 \ 9 | --tokenizer-type ChineseSPTokenizer \ 10 | --load-pretrained ${CHECKPOINT_PATH}/glm-10b-chinese" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_2B.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="blocklm-2B" 2 | MODEL_ARGS="--block-lm \ 3 | --cloze-eval \ 4 | --task-mask \ 5 | --num-layers 36 \ 6 | --hidden-size 2048 \ 7 | --num-attention-heads 32 \ 8 | --max-position-embeddings 1024 \ 9 | --tokenizer-type GPT2BPETokenizer \ 10 | --load-pretrained ${CHECKPOINT_PATH}/blocklm-2b-512" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_base.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="blank-base" 2 | MODEL_ARGS="--block-lm \ 3 | --num-layers 12 \ 4 | --hidden-size 768 \ 5 | --num-attention-heads 12 \ 6 | --max-position-embeddings 512 \ 7 | --tokenizer-model-type bert-base-uncased \ 8 | --tokenizer-type BertWordPieceTokenizer \ 9 | --load-pretrained ${CHECKPOINT_PATH}/blocklm-base-blank" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_large.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="blank-large" 2 | MODEL_ARGS="--block-lm \ 3 | --num-layers 24 \ 4 | --hidden-size 1024 \ 5 | --num-attention-heads 16 \ 6 | --max-position-embeddings 512 \ 7 | --tokenizer-model-type bert-large-uncased \ 8 | --tokenizer-type BertWordPieceTokenizer \ 9 | --load-pretrained ${CHECKPOINT_PATH}/blocklm-large-blank" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_large_chinese.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="blocklm-large-chinese" 2 | MODEL_ARGS="--block-lm \ 3 | --cloze-eval \ 4 | --task-mask \ 5 | --num-layers 24 \ 6 | --hidden-size 1024 \ 7 | --num-attention-heads 16 \ 8 | --max-position-embeddings 1024 \ 9 | --tokenizer-type ChineseSPTokenizer \ 10 | --fix-command-token \ 11 | --load-pretrained ${CHECKPOINT_PATH}/blocklm-large-chinese" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_large_generation.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="generation-large" 2 | 
MODEL_ARGS="--block-lm \ 3 | --num-layers 24 \ 4 | --hidden-size 1024 \ 5 | --num-attention-heads 16 \ 6 | --max-position-embeddings 512 \ 7 | --tokenizer-model-type bert-large-uncased \ 8 | --tokenizer-type BertWordPieceTokenizer \ 9 | --load-pretrained ${CHECKPOINT_PATH}/blocklm-large-generation" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_roberta_1.25.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="blocklm-roberta-1.25" 2 | MODEL_ARGS="--block-lm \ 3 | --cloze-eval \ 4 | --task-mask \ 5 | --num-layers 24 \ 6 | --hidden-size 1152 \ 7 | --num-attention-heads 18 \ 8 | --max-position-embeddings 1024 \ 9 | --tokenizer-model-type roberta \ 10 | --tokenizer-type GPT2BPETokenizer \ 11 | --load-pretrained /dataset/c07bd62b/checkpoints/blocklm-roberta-1.25-blank04-22-14-01" -------------------------------------------------------------------------------- /config_tasks/model_blocklm_roberta_large.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="blocklm-roberta-large" 2 | MODEL_ARGS="--block-lm \ 3 | --cloze-eval \ 4 | --num-layers 24 \ 5 | --hidden-size 1024 \ 6 | --num-attention-heads 16 \ 7 | --max-position-embeddings 512 \ 8 | --tokenizer-model-type roberta \ 9 | --tokenizer-type GPT2BPETokenizer \ 10 | --load-pretrained ${CHECKPOINT_PATH}/blocklm-roberta-large-blank" -------------------------------------------------------------------------------- /config_tasks/seq_blank.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-blank-${MASK_RATIO} 2 | TASK_NAME=blank 3 | DATA_PATH="${DATA_ROOT}/blank_yahoo" 4 | 5 | TRAIN_ARGS="--epochs 5 \ 6 | --batch-size 16 \ 7 | --lr 1e-5 \ 8 | --lr-decay-style linear \ 9 | --warmup 0.06 \ 10 | --weight-decay 1.0e-1 11 | --label-smoothing 0.1 \ 12 | --blank-maskratio ${MASK_RATIO} \ 13 | --save-epoch 5" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 1000 \ 18 | --eval-iters 100" 19 | 20 | TASK_ARGS="--src-seq-length 256 \ 21 | --tgt-seq-length 200 \ 22 | --min-tgt-length 0 \ 23 | --length-penalty 1 \ 24 | --no-repeat-ngram-size 3 \ 25 | --eval-batch-size 8" -------------------------------------------------------------------------------- /config_tasks/seq_cmrc.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-cmrc 2 | TASK_NAME=cmrc 3 | DATA_PATH="${DATA_ROOT}/CMRC2018" 4 | 5 | TRAIN_ARGS="--epochs 10 \ 6 | --batch-size 8 \ 7 | --lr 2e-5 \ 8 | --lr-decay-style linear \ 9 | --warmup 0.06 \ 10 | --weight-decay 1.0e-1 \ 11 | --label-smoothing 0.1" 12 | 13 | COMMON_ARGS="--save-interval 10000 \ 14 | --log-interval 50 \ 15 | --eval-interval 1000 \ 16 | --eval-iters 100 \ 17 | --eval-epoch 10" 18 | 19 | TASK_ARGS="--src-seq-length 464 \ 20 | --tgt-seq-length 48 \ 21 | --min-tgt-length 0 \ 22 | --length-penalty 0.7 \ 23 | --num-beams 5 \ 24 | --select-topk \ 25 | --eval-batch-size 4" -------------------------------------------------------------------------------- /config_tasks/seq_cnndm.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-cnndm 2 | TASK_NAME=cnn_dm 3 | DATA_PATH="/root/data/cnn_dm" 4 | 5 | TRAIN_ARGS="--epochs 15 \ 6 | --lr 3e-5 \ 7 | --lr-decay-style linear \ 8 | --warmup 0.06 \ 9 | --weight-decay 1.0e-1 \ 10 | --label-smoothing 0.1" 11 | 12 | 
COMMON_ARGS="--save-interval 10000 \ 13 | --log-interval 50 \ 14 | --eval-interval 1000 \ 15 | --eval-iters 100" 16 | 17 | TASK_ARGS="--src-seq-length 608 \ 18 | --tgt-seq-length 160 \ 19 | --min-tgt-length 55 \ 20 | --length-penalty 0.7 \ 21 | --no-repeat-ngram-size 3 \ 22 | --num-beams 5 \ 23 | --select-topk \ 24 | --eval-batch-size 4" -------------------------------------------------------------------------------- /config_tasks/seq_cnndm_org.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-cnndm_org 2 | TASK_NAME=cnn_dm_original 3 | DATA_PATH="${DATA_ROOT}/cnn_dm_original" 4 | 5 | TRAIN_ARGS="--epochs 10 \ 6 | --lr 1e-5 \ 7 | --lr-decay-style linear \ 8 | --warmup 0.06 \ 9 | --weight-decay 1.0e-1 \ 10 | --label-smoothing 0.1" 11 | 12 | COMMON_ARGS="--save-interval 10000 \ 13 | --log-interval 50 \ 14 | --eval-interval 1000 \ 15 | --eval-iters 100 \ 16 | --eval-epoch 2" 17 | 18 | TASK_ARGS="--src-seq-length 608 \ 19 | --tgt-seq-length 160 \ 20 | --min-tgt-length 55 \ 21 | --length-penalty 0.7 \ 22 | --no-repeat-ngram-size 3 \ 23 | --num-beams 5 \ 24 | --select-topk \ 25 | --eval-batch-size 1" -------------------------------------------------------------------------------- /config_tasks/seq_customization.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-customization 2 | TASK_NAME=customization 3 | DATA_PATH="${DATA_ROOT}/customization" 4 | 5 | TRAIN_ARGS="--epochs 10 \ 6 | --lr 1e-5 \ 7 | --lr-decay-style linear \ 8 | --warmup 0.06 \ 9 | --label-smoothing 0.1" 10 | 11 | COMMON_ARGS="--save-interval 10000 \ 12 | --log-interval 50 \ 13 | --eval-interval 1000 \ 14 | --eval-iters 100 \ 15 | --eval-epoch 2" 16 | 17 | TASK_ARGS="--src-seq-length 512 \ 18 | --tgt-seq-length 128 \ 19 | --min-tgt-length 55 \ 20 | --length-penalty 0.7 \ 21 | --no-repeat-ngram-size 3 \ 22 | --num-beams 5 \ 23 | --select-topk \ 24 | --eval-batch-size 1" -------------------------------------------------------------------------------- /config_tasks/seq_gigaword.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-ggw 2 | TASK_NAME=gigaword 3 | DATA_PATH="${DATA_ROOT}/gigaword/org_data" 4 | 5 | TRAIN_ARGS="--epochs 10 \ 6 | --lr 3e-5 \ 7 | --lr-decay-style linear \ 8 | --warmup 0.06 \ 9 | --weight-decay 1.0e-1 \ 10 | --label-smoothing 0.1" 11 | 12 | COMMON_ARGS="--save-interval 10000 \ 13 | --log-interval 50 \ 14 | --eval-interval 1000 \ 15 | --eval-iters 100" 16 | 17 | TASK_ARGS="--src-seq-length 192 \ 18 | --tgt-seq-length 32 \ 19 | --min-tgt-length 0 \ 20 | --length-penalty 0.6 \ 21 | --no-repeat-ngram-size 3 \ 22 | --num-beams 5 \ 23 | --select-topk \ 24 | --eval-batch-size 4" -------------------------------------------------------------------------------- /config_tasks/seq_squad.sh: -------------------------------------------------------------------------------- 1 | TASK_NAME=squad 2 | EXPERIMENT_NAME=${MODEL_TYPE}-${TASK_NAME} 3 | DATA_PATH="/dataset/fd5061f6/english_data/SQuAD" 4 | 5 | LR_SINGLE=1e-5 6 | EPOCH_SINGLE=10 7 | BATCH_SINGLE=12 8 | 9 | TRAIN_ARGS="--lr-decay-style linear \ 10 | --warmup 0.06 \ 11 | --weight-decay 1.0e-1 \ 12 | --label-smoothing 0.1" 13 | 14 | COMMON_ARGS="--save-interval 10000 \ 15 | --log-interval 200 \ 16 | --eval-interval 1000 \ 17 | --eval-iters 100 \ 18 | --eval-epoch 1 \ 19 | --overwrite" 20 | 21 | TASK_ARGS="--src-seq-length 512 \ 22 | --tgt-seq-length 64 \ 23 | 
--min-tgt-length 0 \ 24 | --length-penalty 0 \ 25 | --num-beams 5 \ 26 | --select-topk \ 27 | --eval-batch-size 8 \ 28 | --validation-metric F1" 29 | -------------------------------------------------------------------------------- /config_tasks/seq_squad_generation.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-squad 2 | TASK_NAME=squad_generation 3 | DATA_PATH="/dataset/c07bd62b/nqg/raw" 4 | 5 | TRAIN_ARGS="--epochs 10 \ 6 | --lr 2e-5 \ 7 | --lr-decay-style linear \ 8 | --warmup 0.06 \ 9 | --weight-decay 1.0e-1 \ 10 | --label-smoothing 0.1" 11 | 12 | COMMON_ARGS="--save-interval 10000 \ 13 | --log-interval 50 \ 14 | --eval-interval 1000 \ 15 | --eval-iters 100 \ 16 | --eval-epoch 10" 17 | 18 | TASK_ARGS="--src-seq-length 464 \ 19 | --tgt-seq-length 48 \ 20 | --min-tgt-length 0 \ 21 | --length-penalty 0.7 \ 22 | --num-beams 5 \ 23 | --select-topk \ 24 | --eval-batch-size 4" -------------------------------------------------------------------------------- /config_tasks/seq_squad_v1.sh: -------------------------------------------------------------------------------- 1 | TASK_NAME=squad_v1 2 | EXPERIMENT_NAME=${MODEL_TYPE}-${TASK_NAME} 3 | DATA_PATH="/dataset/fd5061f6/english_data/SQuAD" 4 | 5 | LR_SINGLE=1e-5 6 | EPOCH_SINGLE=5 7 | BATCH_SINGLE=12 8 | 9 | TRAIN_ARGS="--lr-decay-style linear \ 10 | --warmup 0.06 \ 11 | --weight-decay 1.0e-1 \ 12 | --label-smoothing 0.1" 13 | 14 | COMMON_ARGS="--save-interval 10000 \ 15 | --log-interval 200 \ 16 | --eval-interval 1000 \ 17 | --eval-iters 100 \ 18 | --eval-epoch 1 \ 19 | --overwrite" 20 | 21 | TASK_ARGS="--src-seq-length 512 \ 22 | --tgt-seq-length 64 \ 23 | --min-tgt-length 0 \ 24 | --length-penalty 0 \ 25 | --num-beams 5 \ 26 | --select-topk \ 27 | --eval-batch-size 8 \ 28 | --validation-metric F1" 29 | 30 | # --load /dataset/fd5061f6/finetune_checkpoints/blank-base-squad_v1 31 | # --load /dataset/fd5061f6/finetune_checkpoints/blocklm-roberta-large-squad_v1 -------------------------------------------------------------------------------- /config_tasks/seq_xsum.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-xsum 2 | TASK_NAME=xsum 3 | DATA_PATH="${DATA_ROOT}/bbc-summary-data" 4 | 5 | TRAIN_ARGS="--epochs 6 \ 6 | --lr 1e-5 \ 7 | --lr-decay-style linear \ 8 | --warmup 0.06 \ 9 | --weight-decay 1.0e-1 \ 10 | --label-smoothing 0.1" 11 | 12 | COMMON_ARGS="--save-interval 10000 \ 13 | --log-interval 50 \ 14 | --eval-interval 1000 \ 15 | --eval-iters 100 \ 16 | --eval-epoch 2" 17 | 18 | TASK_ARGS="--src-seq-length 608 \ 19 | --tgt-seq-length 60 \ 20 | --min-tgt-length 10 \ 21 | --length-penalty 1.0 \ 22 | --no-repeat-ngram-size 3 \ 23 | --num-beams 6 \ 24 | --select-topk \ 25 | --eval-batch-size 1" -------------------------------------------------------------------------------- /config_tasks/task_afqmc.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-AFQMC 2 | TASK_NAME=afqmc 3 | DATA_PATH="${DATA_ROOT}/AFQMC" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=10 8 | XXLARGE_EPOCH=20 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 0" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 1000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1) 21 | PROMPT_IDS=(1 2 3) 22 | 23 | BATCH_SIZE=16 
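The task_*.sh files in config_tasks only declare hyperparameters (experiment name, data path, learning rate, epochs, pattern and prompt ids, batch size); a launcher is expected to source one of them together with a model config from config_tasks/model_*.sh and pass the resulting variables to finetune_glm.py. The sketch below illustrates that flow for task_afqmc.sh. It is a minimal, hypothetical launcher: the paths and the exact finetune_glm.py flags are assumptions, and the real entry points under scripts/ (such as ds_finetune_superglue.sh) typically add DeepSpeed and distributed-launch options on top.

# Hypothetical launcher sketch -- paths and exact flags are assumptions, not the repo's own scripts.
DATA_ROOT=/path/to/task_data              # task configs build DATA_PATH from this
CHECKPOINT_PATH=/path/to/checkpoints      # model configs build --load-pretrained from this

source config_tasks/model_blocklm_large_chinese.sh   # sets MODEL_TYPE and MODEL_ARGS
source config_tasks/task_afqmc.sh                    # sets TASK_NAME, DATA_PATH, TRAIN_ARGS, COMMON_ARGS, ...

python finetune_glm.py \
       --finetune \
       --task ${TASK_NAME} \
       --data-dir ${DATA_PATH} \
       --experiment-name ${EXPERIMENT_NAME} \
       --seq-length ${MAX_SEQ_LEN} \
       --batch-size ${BATCH_SIZE} \
       --epochs ${EPOCH_SINGLE} \
       --lr ${LR_SINGLE} \
       ${MODEL_ARGS} \
       ${TRAIN_ARGS} \
       ${COMMON_ARGS}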
-------------------------------------------------------------------------------- /config_tasks/task_boolq.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-boolq 2 | TASK_NAME=BoolQ 3 | DATA_PATH="${DATA_ROOT}/BoolQ" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=20 8 | XXLARGE_EPOCH=24 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 4" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 10000000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1 2 3 4 5) 21 | PROMPT_IDS=(1 2 3) 22 | 23 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_cb.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-CB 2 | TASK_NAME=cb 3 | DATA_PATH="${DATA_ROOT}/CB" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=50 8 | XXLARGE_EPOCH=100 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 3" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 1000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1 2 3) 21 | PROMPT_IDS=(1 2 3) 22 | 23 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_cluewsc.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-CLUEWSC 2 | TASK_NAME=cluewsc 3 | DATA_PATH="${DATA_ROOT}/CLUEWSC" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=10 8 | XXLARGE_EPOCH=12 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 0" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 1000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1) 21 | PROMPT_IDS=(1 2 3) 22 | 23 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_cmrc.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-CMRC 2 | TASK_NAME=cmrc 3 | DATA_PATH="${DATA_ROOT}/CMRC" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=10 8 | XXLARGE_EPOCH=12 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 0" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 1000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1) 21 | PROMPT_IDS=(1 2 3) 22 | 23 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_copa.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-copa 2 | TASK_NAME=COPA 3 | DATA_PATH="${DATA_ROOT}/COPA" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=50 8 | XXLARGE_EPOCH=100 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 0" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 20 \ 17 | --eval-interval 1000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1) 21 | PROMPT_IDS=(1 2) 22 | 23 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_multirc.sh: 
-------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-MultiRC 2 | TASK_NAME=multirc 3 | DATA_PATH="${DATA_ROOT}/MultiRC" 4 | MAX_SEQ_LEN=512 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=15 8 | XXLARGE_EPOCH=12 9 | 10 | TRAIN_ARGS="--batch-size 16 \ 11 | --lr-decay-style linear \ 12 | --warmup 0.1 \ 13 | --weight-decay 1.0e-1 \ 14 | --pattern-id 0" 15 | 16 | COMMON_ARGS="--save-interval 10000 \ 17 | --log-interval 50 \ 18 | --eval-interval 10000000 \ 19 | --eval-iters 100" 20 | 21 | PATTERN_IDS=(0 1 2) 22 | PROMPT_IDS=(1 2 3) 23 | 24 | BATCH_SIZE=64 -------------------------------------------------------------------------------- /config_tasks/task_record.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-record 2 | TASK_NAME=ReCoRD 3 | DATA_PATH="${DATA_ROOT}/ReCoRD" 4 | MAX_SEQ_LEN=512 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=5 8 | XXLARGE_EPOCH=3 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 0" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 1000 \ 18 | --eval-iters 100 \ 19 | --deepspeed_config config_tasks/config_blocklm_10B_record.json" 20 | 21 | PATTERN_IDS=(0) 22 | 23 | BATCH_SIZE=64 -------------------------------------------------------------------------------- /config_tasks/task_rte.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-rte 2 | TASK_NAME=RTE 3 | DATA_PATH="${DATA_ROOT}/RTE" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=50 8 | XXLARGE_EPOCH=50 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 0" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 10000000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1 2 3) 21 | PROMPT_IDS=(1 2 3) 22 | 23 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_tnews.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-TNews 2 | TASK_NAME=tnews 3 | DATA_PATH="${DATA_ROOT}/TNews" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=10 8 | XXLARGE_EPOCH=12 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 0" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 1000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1) 21 | PROMPT_IDS=(1 2 3) 22 | 23 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_wic.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-wic 2 | TASK_NAME=wic 3 | DATA_PATH="${DATA_ROOT}/WiC" 4 | MAX_SEQ_LEN=256 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=30 8 | XXLARGE_EPOCH=40 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 1.0e-1 \ 13 | --pattern-id 1" 14 | 15 | COMMON_ARGS="--save-interval 10000 \ 16 | --log-interval 50 \ 17 | --eval-interval 10000000 \ 18 | --eval-iters 100" 19 | 20 | PATTERN_IDS=(0 1 2) 21 | PROMPT_IDS=(1 2 3) 22 | 23 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_wsc.sh: 
-------------------------------------------------------------------------------- 1 | TASK_NAME=wsc 2 | EXPERIMENT_NAME=${MODEL_TYPE}-${TASK_NAME} 3 | DATA_PATH="${DATA_ROOT}/WSC-negative" 4 | MAX_SEQ_LEN=128 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=50 8 | XXLARGE_EPOCH=100 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 0.1 \ 13 | --loss-func mix \ 14 | --wsc-negative \ 15 | --length-penalty 1 \ 16 | --pattern-id 2" 17 | 18 | COMMON_ARGS="--save-interval 10000 \ 19 | --log-interval 50 \ 20 | --eval-interval 1000 \ 21 | --eval-iters 100" 22 | 23 | PATTERN_IDS=(0 1 2) 24 | PROMPT_IDS=(1 2 3) 25 | 26 | BATCH_SIZE=16 -------------------------------------------------------------------------------- /config_tasks/task_wsc_generative.sh: -------------------------------------------------------------------------------- 1 | TASK_NAME=wsc 2 | EXPERIMENT_NAME=${MODEL_TYPE}-${TASK_NAME}_generative 3 | DATA_PATH="${DATA_ROOT}/WSC" 4 | MAX_SEQ_LEN=128 5 | 6 | LR_SINGLE=1e-5 7 | EPOCH_SINGLE=50 8 | XXLARGE_EPOCH=100 9 | 10 | TRAIN_ARGS="--lr-decay-style linear \ 11 | --warmup 0.1 \ 12 | --weight-decay 0.1" 13 | 14 | COMMON_ARGS="--save-interval 10000 \ 15 | --log-interval 50 \ 16 | --eval-interval 1000 \ 17 | --eval-iters 100" 18 | 19 | BATCH_SIZE=16 20 | -------------------------------------------------------------------------------- /config_tasks/zero_lambada.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-lambda 2 | TASK_NAME=lambda 3 | DATA_PATH="${DATA_ROOT}/lambada_test.jsonl" 4 | EVALUATE_ARGS="--eval-batch-size 16 \ 5 | --seq-length 512" -------------------------------------------------------------------------------- /config_tasks/zero_lambada_uni.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-lambda_uni 2 | TASK_NAME=lambda 3 | DATA_PATH="${DATA_ROOT}/lambada_test.jsonl" 4 | EVALUATE_ARGS="--eval-batch-size 16 \ 5 | --seq-length 512 \ 6 | --unidirectional" -------------------------------------------------------------------------------- /config_tasks/zero_lm.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-lm 2 | TASK_NAME=language_model 3 | DATA_PATH=${DATA_ROOT}/bert-large-test.txt 4 | EVALUATE_ARGS="--eval-batch-size 16 \ 5 | --seq-length 512 \ 6 | --overlapping-eval 256" -------------------------------------------------------------------------------- /config_tasks/zero_lm_uni.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-lm_uni 2 | TASK_NAME=language_model 3 | DATA_PATH=${DATA_ROOT}/bert-large-test.txt 4 | EVALUATE_ARGS="--eval-batch-size 16 \ 5 | --seq-length 512 \ 6 | --overlapping-eval 256 \ 7 | --unidirectional" -------------------------------------------------------------------------------- /config_tasks/zero_wikitext.sh: -------------------------------------------------------------------------------- 1 | EXPERIMENT_NAME=${MODEL_TYPE}-wikitext 2 | TASK_NAME=wikitext 3 | DATA_PATH=/dataset/c07bd62b/wikitext-103/wiki.test.tokens 4 | EVALUATE_ARGS="--eval-batch-size 16 \ 5 | --seq-length 1024 \ 6 | --overlapping-eval 256" -------------------------------------------------------------------------------- /data_utils/extraction.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import glob 3 | import json 4 | import os 5 | 6 
| nltk.download('punkt') 7 | 8 | 9 | class NLTKSegmenter: 10 | def __init__(self): 11 | pass 12 | 13 | @staticmethod 14 | def segment_string(article): 15 | return nltk.tokenize.sent_tokenize(article) 16 | 17 | 18 | wiki_path = "data/extracted" 19 | output_path = "formatted/wiki-key.txt" 20 | segmenter = NLTKSegmenter() 21 | with open(output_path, "w") as output: 22 | for dirname in glob.glob(os.path.join(wiki_path, '*'), recursive=False): 23 | for filename in glob.glob(os.path.join(dirname, 'wiki_*'), recursive=True): 24 | print(filename) 25 | article_lines = [] 26 | article_open = False 27 | with open(filename, mode='r', newline='\n') as file: 28 | for line in file: 29 | line = line.rstrip() 30 | if '<doc id=' in line: 31 | article_open = True 32 | elif '</doc>' in line: 33 | key_sentences, contents = [], [] 34 | key, content = None, [] 35 | for sentences in article_lines[1:]: 36 | if len(sentences) > 1: 37 | if key: 38 | if len(content) > 0 or len(contents) == 0: 39 | key_sentences.append(key) 40 | contents.append(content) 41 | else: 42 | contents[-1].append(key) 43 | key, content = None, [] 44 | key_sentences.append(sentences[0]) 45 | contents.append(sentences[1:]) 46 | elif len(sentences) > 0: 47 | if key: 48 | content.append(sentences[0]) 49 | else: 50 | key = sentences[0] 51 | if key: 52 | if len(content) > 0 or len(contents) == 0: 53 | key_sentences.append(key) 54 | contents.append(content) 55 | else: 56 | contents[-1].append(key) 57 | contents = [" ".join(content) for content in contents] 58 | article = {"key": key_sentences, "content": contents} 59 | output.write(json.dumps(article)) 60 | output.write("\n") 61 | article_open = False 62 | article_lines = [] 63 | else: 64 | if article_open and line: 65 | sentences = segmenter.segment_string(line) 66 | article_lines.append(sentences) 67 | -------------------------------------------------------------------------------- /data_utils/samplers.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """batch samplers that work with either random or sequential data samplers""" 16 | import math 17 | import os 18 | import sys 19 | 20 | import torch 21 | from torch.utils import data 22 | import numpy as np 23 | 24 | 25 | class RandomSampler(data.sampler.Sampler): 26 | r""" 27 | Based off of pytorch RandomSampler and DistributedSampler. Essentially a RandomSampler, 28 | but this class lets the user set an epoch like DistributedSampler 29 | Samples elements randomly. If without replacement, then sample from a shuffled dataset. 30 | If with replacement, then user can specify ``num_samples`` to draw.
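The extraction script above walks WikiExtractor output and writes one JSON object per article, pairing each key (heading or lead) sentence with the text grouped under it. A minimal sketch of reading that output back, assuming the `formatted/wiki-key.txt` path hard-coded in the script:

```python
import json

# Each line written by data_utils/extraction.py is one article:
# {"key": [...], "content": [...]}, with key[i] paired to content[i].
with open("formatted/wiki-key.txt") as fin:
    for line in fin:
        article = json.loads(line)
        assert len(article["key"]) == len(article["content"])
        for key, content in zip(article["key"], article["content"]):
            print(key, "->", content[:80])
```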
31 | Arguments: 32 | data_source (Dataset): dataset to sample from 33 | num_samples (int): number of samples to draw, default=len(dataset) 34 | replacement (bool): samples are drawn with replacement if ``True``, default=False 35 | """ 36 | 37 | def __init__(self, data_source, replacement=False, num_samples=None): 38 | super(RandomSampler, self).__init__(data_source) 39 | self.data_source = data_source 40 | self.replacement = replacement 41 | self._num_samples = num_samples 42 | self.epoch = -1 43 | 44 | if self._num_samples is not None and replacement is False: 45 | raise ValueError("With replacement=False, num_samples should not be specified, " 46 | "since a random permute will be performed.") 47 | 48 | if not isinstance(self.num_samples, int) or self.num_samples <= 0: 49 | raise ValueError("num_samples should be a positive integer " 50 | "value, but got num_samples={}".format(self.num_samples)) 51 | if not isinstance(self.replacement, bool): 52 | raise ValueError("replacement should be a boolean value, but got " 53 | "replacement={}".format(self.replacement)) 54 | 55 | @property 56 | def num_samples(self): 57 | # dataset size might change at runtime 58 | if self._num_samples is None: 59 | return len(self.data_source) 60 | return self._num_samples 61 | 62 | def __iter__(self): 63 | n = len(self.data_source) 64 | g = torch.Generator() 65 | if self.epoch >= 0: 66 | g.manual_seed(self.epoch) 67 | if self.replacement: 68 | for _ in range(self.num_samples // 32): 69 | yield from torch.randint(high=n, size=(32,), dtype=torch.int64, generator=g).tolist() 70 | yield from torch.randint(high=n, size=(self.num_samples % 32,), dtype=torch.int64, 71 | generator=g).tolist() 72 | else: 73 | yield from torch.randperm(n, generator=g).tolist() 74 | 75 | def __len__(self): 76 | return self.num_samples 77 | 78 | def set_epoch(self, epoch): 79 | self.epoch = epoch 80 | 81 | 82 | class DistributedSequentialSampler(data.sampler.Sampler): 83 | def __init__(self, num_samples, train_iters, batch_size, rank=-1, world_size=2): 84 | super().__init__(num_samples) 85 | if rank == -1: 86 | rank = 0 87 | world_size = 1 88 | self.num_samples = num_samples 89 | self.rank = rank 90 | self.world_size = world_size 91 | self.start_iter = 0 92 | self.train_iters = train_iters 93 | self.batch_size = batch_size 94 | self.batch_bias = [i * (num_samples // batch_size) for i in range(batch_size)] 95 | 96 | def __iter__(self): 97 | for idx in range(self.start_iter, self.train_iters * 10): 98 | batch = [(idx + bias) % self.num_samples for bias in self.batch_bias] 99 | tbatch = self._batch(batch) 100 | yield tbatch 101 | 102 | def __len__(self): 103 | return self.train_iters 104 | 105 | def _batch(self, batch): 106 | """extracts samples only pertaining to this worker's batch""" 107 | start = self.rank*self.batch_size//self.world_size 108 | end = (self.rank+1)*self.batch_size//self.world_size 109 | return batch[start:end] 110 | 111 | 112 | class DistributedBatchSampler(data.sampler.BatchSampler): 113 | """ 114 | similar to normal implementation of distributed sampler, except implementation is at the 115 | batch sampler level, instead of just the sampler level. This allows wrapping of arbitrary 116 | data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler.
117 | """ 118 | def __init__(self, sampler, batch_size, drop_last, rank=-1, world_size=2, wrap_last=False, gradient_accumulation_steps=None): 119 | super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last) 120 | if rank == -1: 121 | assert False, 'should not be here' 122 | self.rank = rank 123 | self.world_size = world_size 124 | self.sampler.wrap_around = 0 125 | self.wrap_around = 0 126 | self.wrap_last = wrap_last 127 | self.start_iter = 0 128 | self.effective_batch_size = batch_size if gradient_accumulation_steps is None else batch_size * gradient_accumulation_steps 129 | 130 | def __iter__(self): 131 | batch = [] 132 | i = 0 133 | for idx in self.data_iterator(self.sampler, wrap_around=False): 134 | batch.append(idx) 135 | if len(batch) == self.batch_size: 136 | tbatch = self._batch(batch) 137 | if i >= self.start_iter * self.effective_batch_size: 138 | yield tbatch 139 | self.start_iter = 0 140 | i += len(batch) 141 | batch = [] 142 | batch_len = len(batch) 143 | if batch_len > 0 and not self.drop_last: 144 | if self.wrap_last: 145 | self.sampler.wrap_around -= (self.batch_size) 146 | self.wrap_around += (len(batch)) 147 | self.wrap_around %= self.batch_size 148 | yield self._batch(batch) 149 | if self.wrap_last: 150 | self.sampler.wrap_around += self.batch_size 151 | 152 | def data_iterator(self, _iter, wrap_around=False): 153 | """iterates through data and handles wrap around""" 154 | for i, idx in enumerate(_iter): 155 | if i < self.wrap_around%self.batch_size: 156 | continue 157 | if wrap_around: 158 | self.wrap_around += 1 159 | self.wrap_around %= self.batch_size 160 | yield idx 161 | 162 | def _batch(self, batch): 163 | """extracts samples only pertaining to this worker's batch""" 164 | start = self.rank*self.batch_size//self.world_size 165 | end = (self.rank+1)*self.batch_size//self.world_size 166 | return batch[start:end] 167 | -------------------------------------------------------------------------------- /data_utils/sp_tokenizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | from https://github.com/openai/gpt-2/, changed for chinese 3 | """ 4 | import json 5 | import os 6 | import sentencepiece as spm 7 | 8 | """ 9 | SentencePiece is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation 10 | systems where the vocabulary size is predetermined prior to the neural model training. SentencePiece implements 11 | subword units (e.g., byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model [Kudo.]) with the 12 | extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end 13 | system that does not depend on language-specific pre/postprocessing. 
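A minimal sketch of combining the samplers above: `RandomSampler` supplies an epoch-seeded shuffle, and `DistributedBatchSampler` slices each global batch so every data-parallel rank sees a disjoint sub-batch. The toy dataset and the rank/world-size values here are made up for illustration:

```python
import torch
from torch.utils import data
from data_utils.samplers import RandomSampler, DistributedBatchSampler

dataset = data.TensorDataset(torch.arange(100))       # toy dataset
sampler = RandomSampler(dataset)
sampler.set_epoch(0)                                  # deterministic reshuffle per epoch
batch_sampler = DistributedBatchSampler(sampler, batch_size=8, drop_last=True,
                                        rank=0, world_size=2)
loader = data.DataLoader(dataset, batch_sampler=batch_sampler)
for (batch,) in loader:
    print(batch.tolist())                             # 8 // 2 = 4 samples for this rank
    break
```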
14 | https://github.com/google/sentencepiece 15 | 16 | pip install sentencepiece 17 | 18 | or git clone https://github.com/google/sentencepiece.git 19 | python setup.py install 20 | 21 | """ 22 | PRETRAINED_MODEL_FILE = "chinese_sentencepiece/cog-pretrain.model" 23 | 24 | 25 | def get_pairs(word): 26 | pairs = set() 27 | prev_char = word[0] 28 | for char in word[1:]: 29 | pairs.add((prev_char, char)) 30 | prev_char = char 31 | return pairs 32 | 33 | 34 | class Encoder: 35 | def __init__(self, encoder, bpe_merges): 36 | self.encoder = encoder 37 | self.decoder = {v: k for k, v in self.encoder.items()} 38 | self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) 39 | self.cache = {} 40 | self.max_len = 0 41 | 42 | def bpe(self, token): 43 | if token in self.cache: 44 | return self.cache[token] 45 | word = tuple(token) 46 | pairs = get_pairs(word) 47 | if not pairs: 48 | return token 49 | 50 | while True: 51 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) 52 | if bigram not in self.bpe_ranks: 53 | break 54 | first, second = bigram 55 | new_word = [] 56 | i = 0 57 | while i < len(word): 58 | try: 59 | j = word.index(first, i) 60 | new_word.extend(word[i:j]) 61 | i = j 62 | except: 63 | new_word.extend(word[i:]) 64 | break 65 | 66 | if word[i] == first and i < len(word) - 1 and word[i + 1] == second: 67 | new_word.append(first + second) 68 | i += 2 69 | else: 70 | new_word.append(word[i]) 71 | i += 1 72 | new_word = tuple(new_word) 73 | word = new_word 74 | if len(word) == 1: 75 | break 76 | else: 77 | pairs = get_pairs(word) 78 | word = ' '.join(word) 79 | self.cache[token] = word 80 | return word 81 | 82 | def encode(self, text): 83 | return [self.encoder.get(token, 1) for token in self.tokenize(text)] 84 | 85 | def decode(self, tokens): 86 | text = ''.join([self.decoder[token] for token in tokens]) 87 | return text 88 | 89 | def tokenize(self, text): 90 | bpe_tokens = [] 91 | bpe_tokens.extend(bpe_token for bpe_token in self.bpe(text).split(' ')) 92 | return bpe_tokens 93 | 94 | def convert_tokens_to_ids(self, tokens): 95 | return [self.encoder.get(token, 1) for token in tokens] 96 | 97 | 98 | class Encoder_SP: 99 | def __init__(self, model_path): 100 | self.sp = spm.SentencePieceProcessor() 101 | self.sp.Load(model_path) 102 | 103 | def encode(self, text): 104 | """ 105 | text="...." 106 | """ 107 | return self.sp.EncodeAsIds(text) 108 | 109 | def decode(self, tokens): 110 | """ 111 | tokens=[x1,x2,...] 
112 | """ 113 | text = [int(token) for token in tokens] 114 | # print(text) 115 | return self.sp.DecodeIds(text) 116 | 117 | def tokenize(self, text): 118 | return self.sp.EncodeAsPieces(text) 119 | 120 | def convert_tokens_to_ids(self, tokens): 121 | return [self.sp.PieceToId(token) for token in tokens] 122 | 123 | def convert_token_to_id(self, token): 124 | return self.sp.PieceToId(token) 125 | 126 | def convert_id_to_token(self, idx): 127 | return self.sp.IdToPiece(idx) 128 | 129 | 130 | def get_encoder(encoder_file, bpe_file): 131 | # 以下是为了同一个函数入兼容sentencepiece 132 | filepath, filename = os.path.split(encoder_file) 133 | shotname, extension = os.path.splitext(filename) 134 | 135 | if (".model" == extension) and (bpe_file == ""): 136 | return Encoder_SP(encoder_file) 137 | else: 138 | with open(encoder_file, 'r', encoding="utf-8") as f: 139 | encoder = json.load(f) 140 | with open(bpe_file, 'r', encoding="utf-8") as f: 141 | bpe_data = f.read() 142 | bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]] 143 | return Encoder( 144 | encoder=encoder, 145 | bpe_merges=bpe_merges, 146 | ) 147 | 148 | 149 | def from_pretrained(): 150 | return get_encoder(PRETRAINED_MODEL_FILE, "") -------------------------------------------------------------------------------- /docker/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | config=/root/.jupyter/jupyter_notebook_config.py 4 | 5 | if [ ! -f $config ]; then 6 | 7 | cat > $config <> $config 23 | else 24 | default_pwd=`python -c "from notebook.auth import passwd; pwd=passwd('${default_pwd}'); print(pwd);"` 25 | echo "sha1 password: $default_pwd" 26 | echo "default password: $default_pwd" 27 | 28 | echo "c.NotebookApp.password ='${default_pwd}'" >> $config 29 | fi 30 | 31 | fi 32 | -------------------------------------------------------------------------------- /docker/ssh-env-config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This command wrapper sets up SSH config files based on the following 4 | # environment variables: 5 | # 6 | # SSH_CONFIG - contents of an SSH config file 7 | # SSH_KNOWN_HOSTS - contents of a SSH known_hosts file 8 | # SSH_PRIVATE_RSA_KEY - contents of a SSH private RSA key 9 | # SSH_PRIVATE_DSA_KEY - contents of a SSH private DSA key 10 | # SSH_DEBUG - switch to a high debug level 3 for all hosts, to help solve SSH issues 11 | # 12 | # The environment variables are unset after the files are created to help 13 | # prevent accidental output in logs 14 | 15 | set -e 16 | 17 | if [ -z "$SSH_CONFIG" ] && \ 18 | [ -z "$SSH_CONFIG_B64" ] && \ 19 | [ -z "$SSH_CONFIG_PATH" ] && \ 20 | [ -z "$SSH_KNOWN_HOSTS" ] && \ 21 | [ -z "$SSH_KNOWN_HOSTS_B64" ] && \ 22 | [ -z "$SSH_KNOWN_HOSTS_PATH" ] && \ 23 | [ -z "$SSH_PRIVATE_RSA_KEY" ] && \ 24 | [ -z "$SSH_PRIVATE_RSA_KEY_B64" ] && \ 25 | [ -z "$SSH_PRIVATE_RSA_KEY_PATH" ] && \ 26 | [ -z "$SSH_PRIVATE_DSA_KEY" ] && \ 27 | [ -z "$SSH_PRIVATE_DSA_KEY_B64" ] && \ 28 | [ -z "$SSH_PRIVATE_DSA_KEY_PATH" ] && \ 29 | [ -z "$SSH_DEBUG" ]; then 30 | # none of the ENV vars we care about found, so skip the logic in this script 31 | [[ $1 ]] && exec "$@" 32 | fi 33 | 34 | mkdir -p ~/.ssh 35 | chmod 700 ~/.ssh 36 | 37 | decode_base64() { 38 | # Determine the platform dependent base64 decode argument 39 | if [ "$(echo 'eA==' | base64 -d 2> /dev/null)" = 'x' ]; then 40 | local BASE64_DECODE_ARG='-d' 41 | else 42 | local BASE64_DECODE_ARG='--decode' 43 
| fi 44 | 45 | echo "$1" | tr -d '\n' | base64 "$BASE64_DECODE_ARG" 46 | } 47 | 48 | ## ~/.ssh/config 49 | 50 | [[ ! -z "$SSH_CONFIG" ]] && \ 51 | echo "$SSH_CONFIG" > ~/.ssh/config && \ 52 | chmod 600 ~/.ssh/config && \ 53 | unset SSH_CONFIG 54 | 55 | [[ ! -z "$SSH_CONFIG_B64" ]] && \ 56 | decode_base64 "$SSH_CONFIG_B64" > ~/.ssh/config && \ 57 | chmod 600 ~/.ssh/config && \ 58 | unset SSH_CONFIG_B64 59 | 60 | [[ ! -z "$SSH_CONFIG_PATH" && ! -a ~/.ssh/config ]] && \ 61 | cp "$SSH_CONFIG_PATH" ~/.ssh/config && \ 62 | chmod 600 ~/.ssh/config && \ 63 | unset SSH_CONFIG_PATH 64 | 65 | ## ~/.ssh/known_hosts 66 | 67 | [[ ! -z "$SSH_KNOWN_HOSTS" ]] && \ 68 | echo "$SSH_KNOWN_HOSTS" > ~/.ssh/known_hosts && \ 69 | chmod 600 ~/.ssh/known_hosts && \ 70 | unset SSH_KNOWN_HOSTS 71 | 72 | [[ ! -z "$SSH_KNOWN_HOSTS_B64" ]] && \ 73 | decode_base64 "$SSH_KNOWN_HOSTS_B64" > ~/.ssh/known_hosts && \ 74 | chmod 600 ~/.ssh/known_hosts && \ 75 | unset SSH_KNOWN_HOSTS_B64 76 | 77 | [[ ! -z "$SSH_KNOWN_HOSTS_PATH" && ! -a ~/.ssh/known_hosts ]] && \ 78 | cp "$SSH_KNOWN_HOSTS_PATH" ~/.ssh/known_hosts && \ 79 | chmod 600 ~/.ssh/known_hosts && \ 80 | unset SSH_KNOWN_HOSTS_PATH 81 | 82 | ## ~/.ssh/id_rsa 83 | 84 | [[ ! -z "$SSH_PRIVATE_RSA_KEY" ]] && \ 85 | echo "$SSH_PRIVATE_RSA_KEY" > ~/.ssh/id_rsa && \ 86 | chmod 600 ~/.ssh/id_rsa && \ 87 | unset SSH_PRIVATE_RSA_KEY 88 | 89 | [[ ! -z "$SSH_PRIVATE_RSA_KEY_B64" ]] && \ 90 | decode_base64 "$SSH_PRIVATE_RSA_KEY_B64" > ~/.ssh/id_rsa && \ 91 | chmod 600 ~/.ssh/id_rsa && \ 92 | unset SSH_PRIVATE_RSA_KEY_B64 93 | 94 | [[ ! -z "$SSH_PRIVATE_RSA_KEY_PATH" && ! -a ~/.ssh/id_rsa ]] && \ 95 | cp "$SSH_PRIVATE_RSA_KEY_PATH" ~/.ssh/id_rsa && \ 96 | chmod 600 ~/.ssh/id_rsa && \ 97 | unset SSH_PRIVATE_RSA_KEY_PATH 98 | 99 | ## ~/.ssh/id_dsa 100 | 101 | [[ ! -z "$SSH_PRIVATE_DSA_KEY" ]] && \ 102 | echo "$SSH_PRIVATE_DSA_KEY" > ~/.ssh/id_dsa && \ 103 | chmod 600 ~/.ssh/id_dsa && \ 104 | unset SSH_PRIVATE_DSA_KEY 105 | 106 | [[ ! -z "$SSH_PRIVATE_DSA_KEY_B64" ]] && \ 107 | decode_base64 "$SSH_PRIVATE_DSA_KEY_B64" > ~/.ssh/id_dsa && \ 108 | chmod 600 ~/.ssh/id_dsa && \ 109 | unset SSH_PRIVATE_DSA_KEY_B64 110 | 111 | [[ ! -z "$SSH_PRIVATE_DSA_KEY_PATH" && ! -a ~/.ssh/id_dsa ]] && \ 112 | cp "$SSH_PRIVATE_DSA_KEY_PATH" ~/.ssh/id_dsa && \ 113 | chmod 600 ~/.ssh/id_dsa && \ 114 | unset SSH_PRIVATE_DSA_KEY_PATH 115 | 116 | ## ssh debug mode 117 | 118 | [[ ! -z "$SSH_DEBUG" ]] && \ 119 | touch ~/.ssh/config && \ 120 | chmod 600 ~/.ssh/config && \ 121 | echo -e "Host *\n LogLevel DEBUG3" >> ~/.ssh/config && \ 122 | unset SSH_DEBUG 123 | 124 | [[ $1 ]] && exec "$@" 125 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # GLM Examples 2 | This is a directory that collects GLM's implementation over various NLP datasets. 3 | We feel lucky to collaborate with all contributors that share their implementations here. 4 | 5 | ## Make You Pull Requests (PRs) 6 | If you also want to become a contributor of GLM, we encourage you to make PR to this repo according to the following PR rules. 7 | The maintainer will check the validity before accept the PR. 8 | 9 | ### Directory Structure 10 | Each PR should include the code and markdown description in a subdirectory of the current `examples` directory. 
11 | An example subdirectory tree is as follows: 12 | 13 | ``` 14 | └── examples 15 | └── : Huggingface Datasets identifier (recommended) or customized name 16 | ├── README.md 17 | ├── requirements.txt 18 | └── 19 | ``` 20 | 21 | Please exclude data files in the PR as they take up too much space, and describe the method to acquire the data in your `README.md`. 22 | 23 | ### Task Description (README.md) 24 | Please at least include the following sections in your README to help its better use: 25 | 26 | + **Dataset Name**: serves as the markdown title. 27 | + **Authors**: Your name(s), contacts (email), and the url to your homepage(s) (if available). 28 | + **Task Description**: A short paragraph to briefly introduce what the dataset and corresponding task is about. 29 | + **Running Commands**: Provide the bash/shell commands for preprocessing, training, and inference. 30 | + **Results**: Please provide your implementation's final performance, along with other available comparison methods'. Ensure that they are reproducible once using your provided `Running Commands`. 31 | + **Reference**: Proper citation information for the dataset and related comparison methods. 32 | 33 | ### Environment Requirements (requirements.txt) 34 | Please include the necessary python packages in the file for other users to reproduce your results. 35 | 36 | ## Example List (To Be Updated) 37 | TODO 38 | -------------------------------------------------------------------------------- /fp16/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from .fp16util import ( 16 | BN_convert_float, 17 | network_to_half, 18 | prep_param_lists, 19 | model_grads_to_master_grads, 20 | master_params_to_model_params, 21 | tofp16, 22 | to_python_float, 23 | clip_grad_norm, 24 | convert_module, 25 | convert_network, 26 | FP16Model, 27 | ) 28 | 29 | from .fp16 import * 30 | from .loss_scaler import * 31 | -------------------------------------------------------------------------------- /fp16/fp16util.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
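The `fp16` package exports listed above come from `fp16util.py`, which follows. A minimal sketch of the batchnorm-safe conversion helpers on a toy model (the sizes are arbitrary):

```python
import torch.nn as nn
from fp16 import FP16Model, network_to_half

model = nn.Sequential(nn.Linear(16, 16), nn.BatchNorm1d(16), nn.ReLU())

# FP16Model casts inputs to half in forward() and converts the network with
# convert_network(), which skips affine BatchNorm modules so they stay in FP32.
half_model = FP16Model(model)
print(half_model.network[0].weight.dtype)   # torch.float16 (Linear converted)
print(half_model.network[1].weight.dtype)   # torch.float32 (BatchNorm kept)

# network_to_half is the legacy equivalent: a tofp16() cast module in front of the
# BN-safe half-converted network.
legacy = network_to_half(nn.Sequential(nn.Linear(16, 16)))
```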
15 | 16 | import torch 17 | import torch.nn as nn 18 | from torch.autograd import Variable 19 | from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors 20 | 21 | import mpu 22 | 23 | 24 | class tofp16(nn.Module): 25 | """ 26 | Utility module that implements:: 27 | 28 | def forward(self, input): 29 | return input.half() 30 | """ 31 | 32 | def __init__(self): 33 | super(tofp16, self).__init__() 34 | 35 | def forward(self, input): 36 | return input.half() 37 | 38 | 39 | def BN_convert_float(module): 40 | """ 41 | Utility function for network_to_half(). 42 | 43 | Retained for legacy purposes. 44 | """ 45 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: 46 | module.float() 47 | for child in module.children(): 48 | BN_convert_float(child) 49 | return module 50 | 51 | 52 | def network_to_half(network): 53 | """ 54 | Convert model to half precision in a batchnorm-safe way. 55 | 56 | Retained for legacy purposes. It is recommended to use FP16Model. 57 | """ 58 | return nn.Sequential(tofp16(), BN_convert_float(network.half())) 59 | 60 | 61 | def convert_module(module, dtype): 62 | """ 63 | Converts a module's immediate parameters and buffers to dtype. 64 | """ 65 | for param in module.parameters(recurse=False): 66 | if param is not None: 67 | if param.data.dtype.is_floating_point: 68 | param.data = param.data.to(dtype=dtype) 69 | if param._grad is not None and param._grad.data.dtype.is_floating_point: 70 | param._grad.data = param._grad.data.to(dtype=dtype) 71 | 72 | for buf in module.buffers(recurse=False): 73 | if buf is not None and buf.data.dtype.is_floating_point: 74 | buf.data = buf.data.to(dtype=dtype) 75 | 76 | 77 | def convert_network(network, dtype): 78 | """ 79 | Converts a network's parameters and buffers to dtype. 80 | """ 81 | for module in network.modules(): 82 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: 83 | continue 84 | convert_module(module, dtype) 85 | return network 86 | 87 | 88 | class FP16Model(nn.Module): 89 | """ 90 | Convert model to half precision in a batchnorm-safe way. 91 | """ 92 | 93 | def __init__(self, network): 94 | super(FP16Model, self).__init__() 95 | self.network = convert_network(network, dtype=torch.half) 96 | 97 | def forward(self, *inputs): 98 | inputs = tuple(t.half() for t in inputs) 99 | return self.network(*inputs) 100 | 101 | 102 | def backwards_debug_hook(grad): 103 | raise RuntimeError("master_params recieved a gradient in the backward pass!") 104 | 105 | def prep_param_lists(model, flat_master=False): 106 | """ 107 | Creates a list of FP32 master parameters for a given model, as in 108 | `Training Neural Networks with Mixed Precision: Real Examples`_. 109 | 110 | Args: 111 | model (torch.nn.Module): Existing Pytorch model 112 | flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. 113 | Returns: 114 | A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. 115 | 116 | Example:: 117 | 118 | model_params, master_params = prep_param_lists(model) 119 | 120 | .. warning:: 121 | Currently, if ``flat_master=True``, all the model's parameters must be the same type. 
If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. 122 | 123 | .. _`Training Neural Networks with Mixed Precision: Real Examples`: 124 | http://on-demand.gputechconf.com/gtc/2018/video/S81012/ 125 | """ 126 | model_params = [param for param in model.parameters() if param.requires_grad] 127 | 128 | if flat_master: 129 | # Give the user some more useful error messages 130 | try: 131 | # flatten_dense_tensors returns a contiguous flat array. 132 | # http://pytorch.org/docs/master/_modules/torch/_utils.html 133 | master_params = _flatten_dense_tensors([param.data for param in model_params]).float() 134 | except: 135 | print("Error in prep_param_lists: model may contain a mixture of parameters " 136 | "of different types. Use flat_master=False, or use F16_Optimizer.") 137 | raise 138 | master_params = torch.nn.Parameter(master_params) 139 | master_params.requires_grad = True 140 | # master_params.register_hook(backwards_debug_hook) 141 | if master_params.grad is None: 142 | master_params.grad = master_params.new(*master_params.size()) 143 | return model_params, [master_params] 144 | else: 145 | master_params = [param.clone().float().detach() for param in model_params] 146 | for param in master_params: 147 | param.requires_grad = True 148 | return model_params, master_params 149 | 150 | 151 | def model_grads_to_master_grads(model_params, master_params, flat_master=False): 152 | """ 153 | Copy model gradients to master gradients. 154 | 155 | Args: 156 | model_params: List of model parameters created by :func:`prep_param_lists`. 157 | master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. 158 | """ 159 | if flat_master: 160 | # The flattening may incur one more deep copy than is necessary. 161 | master_params[0].grad.data.copy_( 162 | _flatten_dense_tensors([p.grad.data for p in model_params])) 163 | else: 164 | for model, master in zip(model_params, master_params): 165 | if model.grad is not None: 166 | if master.grad is None: 167 | master.grad = Variable(master.data.new(*master.data.size())) 168 | master.grad.data.copy_(model.grad.data) 169 | else: 170 | master.grad = None 171 | 172 | 173 | def master_params_to_model_params(model_params, master_params, flat_master=False): 174 | """ 175 | Copy master parameters to model parameters. 176 | 177 | Args: 178 | model_params: List of model parameters created by :func:`prep_param_lists`. 179 | master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. 
180 | """ 181 | if flat_master: 182 | for model, master in zip(model_params, 183 | _unflatten_dense_tensors(master_params[0].data, model_params)): 184 | model.data.copy_(master) 185 | else: 186 | for model, master in zip(model_params, master_params): 187 | model.data.copy_(master.data) 188 | 189 | # Backward compatibility fixes 190 | 191 | def to_python_float(t): 192 | if hasattr(t, 'item'): 193 | return t.item() 194 | else: 195 | return t[0] 196 | 197 | TORCH_MAJOR = int(torch.__version__.split('.')[0]) 198 | TORCH_MINOR = int(torch.__version__.split('.')[1]) 199 | 200 | clip_grad_norm = mpu.clip_grad_norm 201 | #elif TORCH_MAJOR == 0 and TORCH_MINOR <= 4: 202 | # clip_grad_norm = torch.nn.utils.clip_grad_norm 203 | #else: 204 | # clip_grad_norm = torch.nn.utils.clip_grad_norm_ 205 | -------------------------------------------------------------------------------- /learning_rates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch DataLoader for TFRecords""" 16 | 17 | import torch 18 | from torch.optim.lr_scheduler import _LRScheduler 19 | import math 20 | 21 | 22 | class AnnealingLR(_LRScheduler): 23 | """Anneals the learning rate from start to zero along a cosine curve.""" 24 | 25 | DECAY_STYLES = ['linear', 'cosine', 'exponential', 'constant', 'None'] 26 | 27 | def __init__(self, optimizer, start_lr, warmup_iter, num_iters, decay_style=None, last_iter=-1, decay_ratio=0.5): 28 | assert warmup_iter <= num_iters 29 | self.optimizer = optimizer 30 | self.start_lr = start_lr 31 | self.warmup_iter = warmup_iter 32 | self.num_iters = last_iter + 1 33 | self.end_iter = num_iters 34 | self.decay_style = decay_style.lower() if isinstance(decay_style, str) else None 35 | self.decay_ratio = 1 / decay_ratio 36 | self.step(self.num_iters) 37 | if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: 38 | print(f'learning rate decaying style {self.decay_style}, ratio {self.decay_ratio}') 39 | 40 | def get_lr(self): 41 | # https://openreview.net/pdf?id=BJYwwY9ll pg. 
4 42 | if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter: 43 | return float(self.start_lr) * self.num_iters / self.warmup_iter 44 | else: 45 | if self.decay_style == self.DECAY_STYLES[0]: 46 | decay_step_ratio = (self.num_iters - self.warmup_iter) / self.end_iter 47 | return self.start_lr - self.start_lr * (1 - 1 / self.decay_ratio) * decay_step_ratio 48 | elif self.decay_style == self.DECAY_STYLES[1]: 49 | decay_step_ratio = min(1.0, (self.num_iters - self.warmup_iter) / self.end_iter) 50 | return self.start_lr / self.decay_ratio * ( 51 | (math.cos(math.pi * decay_step_ratio) + 1) * (self.decay_ratio - 1) / 2 + 1) 52 | elif self.decay_style == self.DECAY_STYLES[2]: 53 | # TODO: implement exponential decay 54 | return self.start_lr 55 | else: 56 | return self.start_lr 57 | 58 | def step(self, step_num=None): 59 | if step_num is None: 60 | step_num = self.num_iters + 1 61 | self.num_iters = step_num 62 | new_lr = self.get_lr() 63 | for group in self.optimizer.param_groups: 64 | group['lr'] = new_lr 65 | 66 | def state_dict(self): 67 | sd = { 68 | # 'start_lr': self.start_lr, 69 | 'warmup_iter': self.warmup_iter, 70 | 'num_iters': self.num_iters, 71 | 'decay_style': self.decay_style, 72 | 'end_iter': self.end_iter, 73 | 'decay_ratio': self.decay_ratio 74 | } 75 | return sd 76 | 77 | def load_state_dict(self, sd): 78 | # self.start_lr = sd['start_lr'] 79 | self.warmup_iter = sd['warmup_iter'] 80 | self.num_iters = sd['num_iters'] 81 | # self.end_iter = sd['end_iter'] 82 | # self.decay_style = sd['decay_style'] 83 | # if 'decay_ratio' in sd: 84 | # self.decay_ratio = sd['decay_ratio'] 85 | self.step(self.num_iters) 86 | 87 | def switch_linear(self, args): 88 | current_lr = self.get_lr() 89 | self.start_lr = current_lr 90 | self.end_iter = args.train_iters - self.num_iters 91 | self.num_iters = 0 92 | self.decay_style = "linear" 93 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .distributed import PyTorchDistributedDataParallel, DistributedDataParallel 17 | from .modeling_glm import GLMModel, glm_get_params_for_weight_decay_optimization 18 | from .downstream import GLMForMultiTokenCloze, GLMForMultiTokenClozeFast, GLMForSingleTokenCloze, \ 19 | GLMForSequenceClassification 20 | -------------------------------------------------------------------------------- /model/distributed.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
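A minimal sketch tracing the warmup-plus-cosine schedule implemented by `AnnealingLR` above, with a toy optimizer and arbitrary step values. Note that because the constructor stores `decay_ratio = 1 / decay_ratio`, the cosine branch decays toward `start_lr * decay_ratio` (half the peak for the default 0.5) rather than all the way to zero:

```python
import torch
from learning_rates import AnnealingLR

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.SGD(params, lr=1e-4)
scheduler = AnnealingLR(optimizer, start_lr=1e-4, warmup_iter=100, num_iters=1000,
                        decay_style='cosine', decay_ratio=0.5)

for step in (50, 100, 550, 1100):
    scheduler.step(step)
    print(step, scheduler.get_lr())
# 50   -> 5.0e-05   (linear warmup: start_lr * step / warmup_iter)
# 100  -> 1.0e-04   (peak)
# 550  -> ~7.9e-05  (cosine decay)
# 1100 -> 5.0e-05   (floor: start_lr * 0.5, since decay_step_ratio is capped at 1)
```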
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors 18 | import torch.distributed as dist 19 | from torch.nn.modules import Module 20 | from torch.autograd import Variable 21 | from torch.nn.parallel.distributed import DistributedDataParallel as DDP 22 | 23 | import mpu 24 | 25 | 26 | class PyTorchDistributedDataParallel(DDP): 27 | def named_parameters(self, prefix: str = '', recurse: bool = True): 28 | return self.module.named_parameters(prefix=prefix, recurse=recurse) 29 | 30 | def state_dict(self, destination=None, prefix='', keep_vars=False): 31 | sd = self.module.state_dict(destination, prefix, keep_vars) 32 | return sd 33 | 34 | def load_state_dict(self, state_dict, strict=True): 35 | return self.module.load_state_dict(state_dict, strict=strict) 36 | 37 | 38 | class DistributedDataParallel(Module): 39 | 40 | def __init__(self, module): 41 | super(DistributedDataParallel, self).__init__() 42 | self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False 43 | 44 | self.module = module 45 | self.data_parallel_group = mpu.get_data_parallel_group() 46 | src_rank = mpu.get_model_parallel_rank() 47 | for p in self.module.parameters(): 48 | if torch.is_tensor(p): 49 | dist.broadcast(p, src_rank, group=self.data_parallel_group) 50 | 51 | def allreduce_params(reduce_after=True, no_scale=False, fp32_allreduce=False): 52 | if (self.needs_reduction): 53 | self.needs_reduction = False 54 | buckets = {} 55 | for name, param in self.module.named_parameters(): 56 | if param.requires_grad and param.grad is not None: 57 | tp = (param.data.type()) 58 | if tp not in buckets: 59 | buckets[tp] = [] 60 | buckets[tp].append(param) 61 | if self.warn_on_half: 62 | if torch.cuda.HalfTensor in buckets: 63 | print("WARNING: gloo dist backend for half parameters may be extremely slow." 
+ 64 | " It is recommended to use the NCCL backend in this case.") 65 | self.warn_on_half = False 66 | for tp in buckets: 67 | bucket = buckets[tp] 68 | grads = [param.grad.data for param in bucket] 69 | coalesced = _flatten_dense_tensors(grads) 70 | if fp32_allreduce: 71 | coalesced = coalesced.float() 72 | if not no_scale and not reduce_after: 73 | coalesced /= dist.get_world_size(group=self.data_parallel_group) 74 | dist.all_reduce(coalesced, group=self.data_parallel_group) 75 | torch.cuda.synchronize() 76 | if not no_scale and reduce_after: 77 | coalesced /= dist.get_world_size(group=self.data_parallel_group) 78 | for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): 79 | buf.copy_(synced) 80 | 81 | self.hook_handles = [] 82 | self.hooks = [] 83 | for param in list(self.module.parameters()): 84 | def allreduce_hook(*unused): 85 | Variable._execution_engine.queue_callback(allreduce_params) 86 | # handle = param.register_hook(allreduce_hook) 87 | # self.hooks.append(allreduce_hook) 88 | # self.hook_handles.append(handle) 89 | self.allreduce_params = allreduce_params 90 | 91 | def forward(self, *inputs, **kwargs): 92 | self.needs_reduction = True 93 | return self.module(*inputs, **kwargs) 94 | 95 | def state_dict(self, destination=None, prefix='', keep_vars=False): 96 | # [h.remove() for h in self.hook_handles] 97 | sd = self.module.state_dict(destination, prefix, keep_vars) 98 | return sd 99 | 100 | def load_state_dict(self, state_dict, strict=True): 101 | return self.module.load_state_dict(state_dict, strict=strict) 102 | 103 | def named_parameters(self, prefix: str = '', recurse: bool = True): 104 | return self.module.named_parameters(prefix=prefix, recurse=recurse) 105 | 106 | ''' 107 | def _sync_buffers(self): 108 | buffers = list(self.module._all_buffers()) 109 | if len(buffers) > 0: 110 | # cross-node buffer sync 111 | flat_buffers = _flatten_dense_tensors(buffers) 112 | dist.broadcast(flat_buffers, 0) 113 | for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): 114 | buf.copy_(synced) 115 | def train(self, mode=True): 116 | # Clear NCCL communicator and CUDA event cache of the default group ID, 117 | # These cache will be recreated at the later call. This is currently a 118 | # work-around for a potential NCCL deadlock. 
119 | if dist._backend == dist.dist_backend.NCCL: 120 | dist._clear_group_cache() 121 | super(DistributedDataParallel, self).train(mode) 122 | self.module.train(mode) 123 | ''' 124 | -------------------------------------------------------------------------------- /model/prompt.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | 4 | 5 | class PromptSpell(torch.nn.Module): 6 | def __init__(self, spell_length, hidden_size, spell_func): 7 | super(PromptSpell, self).__init__() 8 | self.spell_length = spell_length 9 | self.hidden_size = hidden_size 10 | self.spell_embeddings = torch.nn.Embedding(self.spell_length, self.hidden_size) 11 | self.spell_func = spell_func 12 | if self.spell_func == "lstm": 13 | self.lstm_head = torch.nn.LSTM(input_size=self.hidden_size, 14 | hidden_size=self.hidden_size, 15 | num_layers=2, 16 | # dropout=self.lstm_dropout, 17 | bidirectional=True, 18 | batch_first=True) # .to(torch.device("cuda")) 19 | self.mlp_head = torch.nn.Sequential(torch.nn.Linear(2 * self.hidden_size, self.hidden_size), 20 | torch.nn.ReLU(), 21 | torch.nn.Linear(self.hidden_size, self.hidden_size)) 22 | elif self.spell_func == "mlp": 23 | self.mlp_head = torch.nn.Sequential(torch.nn.Linear(self.hidden_size, self.hidden_size), 24 | torch.nn.ReLU(), 25 | torch.nn.Linear(self.hidden_size, self.hidden_size)) 26 | elif self.spell_func != "none": 27 | raise NotImplementedError("Prompt function " + self.spell_func) 28 | 29 | def init_embedding(self, word_embeddings=None, task_tokens=None): 30 | num_words = 5000 31 | with torch.no_grad(): 32 | for i in range(self.spell_length): 33 | rand_token = random.randrange(num_words) 34 | if task_tokens is None: 35 | target_embedding = word_embeddings[rand_token] 36 | else: 37 | word_embedding = word_embeddings[rand_token] 38 | task_token = random.choice(task_tokens) 39 | task_embedding = word_embeddings[task_token] 40 | ratio = random.random() 41 | target_embedding = word_embedding * ratio + task_embedding * (1 - ratio) 42 | self.spell_embeddings.weight.data[i] = target_embedding 43 | 44 | def forward(self): 45 | prompt_embeds = self.spell_embeddings.weight.unsqueeze(0) 46 | if self.spell_func == "lstm": 47 | prompt_embeds = self.lstm_head(prompt_embeds)[0] 48 | if self.spell_func == "lstm" or self.spell_func == "mlp": 49 | prompt_embeds = self.mlp_head(prompt_embeds) 50 | return prompt_embeds 51 | -------------------------------------------------------------------------------- /mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
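`PromptSpell` above implements a P-tuning-style continuous prompt: a small table of trainable "spell" embeddings, optionally reparameterized through an LSTM or MLP head. A minimal usage sketch with made-up sizes:

```python
import torch
from model.prompt import PromptSpell

spell = PromptSpell(spell_length=8, hidden_size=64, spell_func="lstm")

# Seed the prompt vectors from (here random) word embeddings, as init_embedding() expects.
word_embeddings = torch.nn.Embedding(30000, 64).weight
spell.init_embedding(word_embeddings)

prompt_embeds = spell()          # LSTM + MLP head applied to the spell table
print(prompt_embeds.shape)       # torch.Size([1, 8, 64])
```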
15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .grads import clip_grad_norm 23 | 24 | from .initialize import destroy_model_parallel 25 | from .initialize import get_data_parallel_group 26 | from .initialize import get_data_parallel_rank 27 | from .initialize import get_data_parallel_world_size 28 | from .initialize import get_model_parallel_group 29 | from .initialize import get_model_parallel_rank 30 | from .initialize import get_model_parallel_src_rank 31 | from .initialize import get_model_parallel_world_size 32 | from .initialize import initialize_model_parallel 33 | from .initialize import model_parallel_is_initialized 34 | 35 | from .layers import ColumnParallelLinear 36 | from .layers import ParallelEmbedding 37 | from .layers import RowParallelLinear 38 | from .layers import VocabParallelEmbedding 39 | 40 | from .mappings import copy_to_model_parallel_region 41 | from .mappings import gather_from_model_parallel_region 42 | from .mappings import reduce_from_model_parallel_region 43 | from .mappings import scatter_to_model_parallel_region 44 | 45 | from .random import checkpoint 46 | from .random import partition_activations_in_checkpoint 47 | from .random import get_cuda_rng_tracker 48 | from .random import model_parallel_cuda_manual_seed 49 | 50 | from .transformer import GPT2ParallelTransformer 51 | from .transformer import LayerNorm 52 | -------------------------------------------------------------------------------- /mpu/cross_entropy.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | from .initialize import get_model_parallel_group 20 | from .initialize import get_model_parallel_rank 21 | from .initialize import get_model_parallel_world_size 22 | from .utils import VocabUtility 23 | 24 | 25 | class _VocabParallelCrossEntropy(torch.autograd.Function): 26 | 27 | @staticmethod 28 | def forward(ctx, vocab_parallel_logits, target): 29 | 30 | # Copy so the input remains unchanged. 31 | logits = vocab_parallel_logits.clone() 32 | # Maximum value along vocab dimension across all GPUs. 33 | logits_max = torch.max(logits, dim=-1)[0] 34 | torch.distributed.all_reduce(logits_max, 35 | op=torch.distributed.ReduceOp.MAX, 36 | group=get_model_parallel_group()) 37 | # Subtract the maximum value. 38 | logits.sub_(logits_max.unsqueeze(dim=-1)) 39 | # Sum of exponential of logits along vocab dimension across all GPUs. 
40 | exp_logits = logits.exp() 41 | sum_exp_logits = exp_logits.sum(dim=-1) 42 | torch.distributed.all_reduce(sum_exp_logits, 43 | op=torch.distributed.ReduceOp.SUM, 44 | group=get_model_parallel_group()) 45 | 46 | # Get the partition's vocab indecies 47 | get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size 48 | partition_vocab_size = vocab_parallel_logits.size()[-1] 49 | rank = get_model_parallel_rank() 50 | world_size = get_model_parallel_world_size() 51 | vocab_start_index, vocab_end_index = get_vocab_range( 52 | partition_vocab_size, rank, world_size) 53 | 54 | # Create a mask of valid vocab ids (1 means it needs to be masked). 55 | target_mask = (target < vocab_start_index) | (target >= vocab_end_index) 56 | masked_target = target.clone() - vocab_start_index 57 | masked_target[target_mask] = 0 58 | 59 | # Get predicted-logits = logits[target]. 60 | # For Simplicity, we convert logits to a 2-D tensor with size 61 | # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. 62 | logits_2d = logits.view(-1, partition_vocab_size) 63 | masked_target_1d = masked_target.view(-1) 64 | arange_1d = torch.arange(start=0, end=logits_2d.size()[0], 65 | device=logits_2d.device) 66 | predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] 67 | predicted_logits = predicted_logits_1d.view_as(target) 68 | predicted_logits[target_mask] = 0.0 69 | # All reduce is needed to get the chunks from other GPUs. 70 | torch.distributed.all_reduce(predicted_logits, 71 | op=torch.distributed.ReduceOp.SUM, 72 | group=get_model_parallel_group()) 73 | 74 | # Loss = log(sum(exp(logits))) - predicted-logit. 75 | loss = torch.log(sum_exp_logits) - predicted_logits 76 | 77 | # Store softmax, target-mask and masked-target for backward pass. 78 | exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) 79 | ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) 80 | 81 | return loss 82 | 83 | @staticmethod 84 | def backward(ctx, grad_output): 85 | 86 | # Retreive tensors from the forward path. 87 | softmax, target_mask, masked_target_1d = ctx.saved_tensors 88 | 89 | # All the inputs have softmax as thier gradient. 90 | grad_input = softmax 91 | # For simplicity, work with the 2D gradient. 92 | partition_vocab_size = softmax.size()[-1] 93 | grad_2d = grad_input.view(-1, partition_vocab_size) 94 | 95 | # Add the gradient from matching classes. 96 | arange_1d = torch.arange(start=0, end=grad_2d.size()[0], 97 | device=grad_2d.device) 98 | grad_2d[arange_1d, masked_target_1d] -= ( 99 | 1.0 - target_mask.view(-1).float()) 100 | 101 | # Finally elementwise multiplication with the output gradients. 102 | grad_input.mul_(grad_output.unsqueeze(dim=-1)) 103 | 104 | return grad_input, None 105 | 106 | 107 | def vocab_parallel_cross_entropy(vocab_parallel_logits, target): 108 | """Helper function for the cross entropy.""" 109 | return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) 110 | -------------------------------------------------------------------------------- /mpu/data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
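`_VocabParallelCrossEntropy` above computes, per token, `log(sum(exp(logits))) - predicted_logit` with the row maximum subtracted for numerical stability; the max, the exp-sum, and the picked target logits are all-reduced so each rank only needs its own vocabulary shard. A non-parallel sketch of the same identity, checked against PyTorch's cross entropy:

```python
import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)                      # [tokens, vocab]
target = torch.randint(0, 10, (4,))

shifted = logits - logits.max(dim=-1, keepdim=True).values
loss = torch.log(shifted.exp().sum(dim=-1)) \
       - shifted.gather(-1, target.unsqueeze(-1)).squeeze(-1)

print(torch.allclose(loss, F.cross_entropy(logits, target, reduction='none')))  # True
```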
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | from .initialize import get_model_parallel_group 19 | from .initialize import get_model_parallel_rank 20 | from .initialize import get_model_parallel_src_rank 21 | 22 | 23 | _MAX_DATA_DIM = 5 24 | 25 | 26 | def _check_data_types(keys, data, target_dtype): 27 | """Check that all the keys have the same target data type.""" 28 | for key in keys: 29 | assert data[key].dtype == target_dtype, '{} has data type {} which '\ 30 | 'is different than {}'.format(key, data[key].dtype, target_dtype) 31 | 32 | 33 | def _build_key_size_numel_dictionaries(keys, data): 34 | """Build the size on rank 0 and broadcast.""" 35 | max_dim = _MAX_DATA_DIM 36 | sizes = [0 for _ in range(max_dim) for _ in keys] 37 | 38 | # Pack the sizes on rank zero. 39 | if get_model_parallel_rank() == 0: 40 | offset = 0 41 | for key in keys: 42 | assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM' 43 | size = data[key].size() 44 | for i, s in enumerate(size): 45 | sizes[i + offset] = s 46 | offset += max_dim 47 | 48 | # Move to GPU and broadcast. 49 | sizes_cuda = torch.cuda.LongTensor(sizes) 50 | torch.distributed.broadcast(sizes_cuda, get_model_parallel_src_rank(), 51 | group=get_model_parallel_group()) 52 | 53 | # Move back to cpu and unpack. 54 | sizes_cpu = sizes_cuda.cpu() 55 | key_size = {} 56 | key_numel = {} 57 | total_numel = 0 58 | offset = 0 59 | for key in keys: 60 | i = 0 61 | size = [] 62 | numel = 1 63 | while sizes_cpu[offset + i] > 0: 64 | this_size = sizes_cpu[offset + i] 65 | size.append(this_size) 66 | numel *= this_size 67 | i += 1 68 | key_size[key] = size 69 | key_numel[key] = numel 70 | total_numel += numel 71 | offset += max_dim 72 | 73 | return key_size, key_numel, total_numel 74 | 75 | 76 | def broadcast_data(keys, data, datatype): 77 | """Broadcast data from rank zero of each model parallel group to the 78 | members of the same model parallel group. 79 | 80 | Arguments: 81 | keys: list of keys in the data disctionary to be broadcasted 82 | data: data dictionary of string keys and cpu tensor values. 83 | datatype: torch data type of all tensors in data associated 84 | with keys. 85 | """ 86 | # Build (key, size) and (key, number of elements) dictionaries along 87 | # with the total number of elements on all ranks. 88 | key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, 89 | data) 90 | 91 | # Pack on rank zero. 92 | if get_model_parallel_rank() == 0: 93 | # Check that all keys have the same data type. 
94 | _check_data_types(keys, data, datatype) 95 | # Flatten the data associated with the keys 96 | flatten_data = torch.cat( 97 | [data[key].contiguous().view(-1) for key in keys], dim=0).cuda() 98 | else: 99 | flatten_data = torch.empty(total_numel, 100 | device=torch.cuda.current_device(), 101 | dtype=datatype) 102 | 103 | # Boradcast 104 | torch.distributed.broadcast(flatten_data, get_model_parallel_src_rank(), 105 | group=get_model_parallel_group()) 106 | 107 | # Unpack 108 | output = {} 109 | offset = 0 110 | for key in keys: 111 | size = key_size[key] 112 | numel = key_numel[key] 113 | output[key] = flatten_data.narrow(0, offset, numel).view(size) 114 | offset += numel 115 | 116 | return output 117 | -------------------------------------------------------------------------------- /mpu/grads.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | # Parts of the code here are adapted from PyTorch 18 | # repo: https://github.com/pytorch/pytorch 19 | 20 | 21 | import torch 22 | from torch._six import inf 23 | 24 | from .initialize import get_model_parallel_group 25 | from .initialize import get_model_parallel_rank 26 | 27 | 28 | def clip_grad_norm(parameters, max_norm, norm_type=2): 29 | """Clips gradient norm of an iterable of parameters. 30 | 31 | This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and 32 | added functionality to handle model parallel parameters. Note that 33 | the gradients are modified in place. 34 | 35 | Arguments: 36 | parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a 37 | single Tensor that will have gradients normalized 38 | max_norm (float or int): max norm of the gradients 39 | norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for 40 | infinity norm. 41 | 42 | Returns: 43 | Total norm of the parameters (viewed as a single vector). 44 | """ 45 | if isinstance(parameters, torch.Tensor): 46 | parameters = [parameters] 47 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 48 | max_norm = float(max_norm) 49 | norm_type = float(norm_type) 50 | if norm_type == inf: 51 | total_norm = max(p.grad.data.abs().max() for p in parameters) 52 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) 53 | # Take max across all GPUs. 54 | torch.distributed.all_reduce(total_norm_cuda, 55 | op=torch.distributed.ReduceOp.MAX, 56 | group=get_model_parallel_group()) 57 | total_norm = total_norm_cuda[0].item() 58 | else: 59 | total_norm = 0 60 | for p in parameters: 61 | if p.model_parallel or (get_model_parallel_rank() == 0): 62 | param_norm = p.grad.data.norm(norm_type) 63 | total_norm += param_norm.item() ** norm_type 64 | # Sum across all model parallel GPUs. 
65 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) 66 | torch.distributed.all_reduce(total_norm_cuda, 67 | op=torch.distributed.ReduceOp.SUM, 68 | group=get_model_parallel_group()) 69 | total_norm = total_norm_cuda[0].item() ** (1. / norm_type) 70 | clip_coef = max_norm / (total_norm + 1e-6) 71 | if clip_coef < 1: 72 | for p in parameters: 73 | p.grad.data.mul_(clip_coef) 74 | return total_norm 75 | -------------------------------------------------------------------------------- /mpu/initialize.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | """Model and data parallel groups.""" 18 | 19 | import torch 20 | 21 | from .utils import ensure_divisibility 22 | 23 | 24 | # Model parallel group that the current rank belongs to. 25 | _MODEL_PARALLEL_GROUP = None 26 | # Data parallel group that the current rank belongs to. 27 | _DATA_PARALLEL_GROUP = None 28 | 29 | 30 | def initialize_model_parallel(model_parallel_size_): 31 | """ 32 | Initialize model data parallel groups. 33 | 34 | Arguments: 35 | model_parallel_size: number of GPUs used to parallelize model. 36 | 37 | Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we 38 | use 2 GPUs to parallelize the model. The present function will 39 | create 4 model parallel groups and 2 data parallel grous as: 40 | 4 model parallel groups: 41 | [g0, g1], [g2, g3], [g4, g5], [g6, g7] 42 | 2 data parallel groups: 43 | [g0, g2, g4, g6], [g1, g3, g5, g7] 44 | Note that for efficiency, the caller should make sure adjacent ranks 45 | are on the same DGX box. For example if we are using 2 DGX-1 boxes 46 | with a total of 16 GPUs, rank 0 to 7 belong to the first box and 47 | ranks 8 to 15 belong to the second box. 48 | """ 49 | if torch.distributed.get_rank() == 0: 50 | print('> initializing model parallel with size {}'.format( 51 | model_parallel_size_)) 52 | # Get world size and rank. Ensure some consistencies. 53 | assert torch.distributed.is_initialized() 54 | world_size = torch.distributed.get_world_size() 55 | model_parallel_size = min(model_parallel_size_, world_size) 56 | ensure_divisibility(world_size, model_parallel_size) 57 | rank = torch.distributed.get_rank() 58 | 59 | # Build the data parallel groups. 60 | global _DATA_PARALLEL_GROUP 61 | assert _DATA_PARALLEL_GROUP is None, \ 62 | 'data parallel group is already initialized' 63 | for i in range(model_parallel_size): 64 | ranks = range(i, world_size, model_parallel_size) 65 | group = torch.distributed.new_group(ranks) 66 | if i == (rank % model_parallel_size): 67 | _DATA_PARALLEL_GROUP = group 68 | 69 | # Build the model parallel groups. 
70 | global _MODEL_PARALLEL_GROUP 71 | assert _MODEL_PARALLEL_GROUP is None, \ 72 | 'model parallel group is already initialized' 73 | for i in range(world_size // model_parallel_size): 74 | ranks = range(i * model_parallel_size, 75 | (i + 1) * model_parallel_size) 76 | group = torch.distributed.new_group(ranks) 77 | if i == (rank // model_parallel_size): 78 | _MODEL_PARALLEL_GROUP = group 79 | 80 | 81 | def model_parallel_is_initialized(): 82 | """Check if model and data parallel groups are initialized.""" 83 | if _MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None: 84 | return False 85 | return True 86 | 87 | 88 | def get_model_parallel_group(): 89 | """Get the model parallel group the caller rank belongs to.""" 90 | assert _MODEL_PARALLEL_GROUP is not None, \ 91 | 'model parallel group is not initialized' 92 | return _MODEL_PARALLEL_GROUP 93 | 94 | 95 | def get_data_parallel_group(): 96 | """Get the data parallel group the caller rank belongs to.""" 97 | assert _DATA_PARALLEL_GROUP is not None, \ 98 | 'data parallel group is not initialized' 99 | return _DATA_PARALLEL_GROUP 100 | 101 | 102 | def get_model_parallel_world_size(): 103 | """Return world size for the model parallel group.""" 104 | return torch.distributed.get_world_size(group=get_model_parallel_group()) 105 | 106 | 107 | def get_model_parallel_rank(): 108 | """Return my rank for the model parallel group.""" 109 | return torch.distributed.get_rank(group=get_model_parallel_group()) 110 | 111 | 112 | def get_model_parallel_src_rank(): 113 | """Calculate the global rank corresponding to a local rank zeor 114 | in the model parallel group.""" 115 | global_rank = torch.distributed.get_rank() 116 | local_world_size = get_model_parallel_world_size() 117 | return (global_rank // local_world_size) * local_world_size 118 | 119 | 120 | def get_data_parallel_world_size(): 121 | """Return world size for the data parallel group.""" 122 | return torch.distributed.get_world_size(group=get_data_parallel_group()) 123 | 124 | 125 | def get_data_parallel_rank(): 126 | """Return my rank for the data parallel group.""" 127 | return torch.distributed.get_rank(group=get_data_parallel_group()) 128 | 129 | 130 | def destroy_model_parallel(): 131 | """Set the groups to none.""" 132 | global _MODEL_PARALLEL_GROUP 133 | _MODEL_PARALLEL_GROUP = None 134 | global _DATA_PARALLEL_GROUP 135 | _DATA_PARALLEL_GROUP = None 136 | -------------------------------------------------------------------------------- /mpu/mappings.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 
16 | import torch
17 | 
18 | from .initialize import get_model_parallel_group
19 | from .utils import split_tensor_along_last_dim
20 | 
21 | 
22 | def _reduce(input_):
23 |     """All-reduce the input tensor across the model parallel group."""
24 |     group = get_model_parallel_group()
25 | 
26 |     # Bypass the function if we are using only 1 GPU.
27 |     if torch.distributed.get_world_size(group=group) == 1:
28 |         return input_
29 | 
30 |     # All-reduce.
31 |     torch.distributed.all_reduce(input_, group=group)
32 | 
33 |     return input_
34 | 
35 | 
36 | def _split(input_):
37 |     """Split the tensor along its last dimension and keep the
38 |     corresponding slice."""
39 |     group = get_model_parallel_group()
40 | 
41 |     # Bypass the function if we are using only 1 GPU.
42 |     if torch.distributed.get_world_size(group=group) == 1:
43 |         return input_
44 | 
45 |     # Split along last dimension.
46 |     world_size = torch.distributed.get_world_size(group=group)
47 |     input_list = split_tensor_along_last_dim(input_, world_size)
48 | 
49 |     # Note: torch.split does not create contiguous tensors by default.
50 |     rank = torch.distributed.get_rank(group=group)
51 |     output = input_list[rank].contiguous()
52 | 
53 |     return output
54 | 
55 | 
56 | def _gather(input_):
57 |     """Gather tensors and concatenate along the last dimension."""
58 |     group = get_model_parallel_group()
59 | 
60 |     # Bypass the function if we are using only 1 GPU.
61 |     if torch.distributed.get_world_size(group=group) == 1:
62 |         return input_
63 | 
64 |     # Size and dimension.
65 |     last_dim = input_.dim() - 1
66 |     rank = torch.distributed.get_rank(group=group)
67 |     world_size = torch.distributed.get_world_size(group=group)
68 | 
69 |     tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
70 |     tensor_list[rank] = input_
71 |     torch.distributed.all_gather(tensor_list, input_, group=group)
72 | 
73 |     # Note: torch.cat already creates a contiguous tensor.
74 |     output = torch.cat(tensor_list, dim=last_dim).contiguous()
75 | 
76 |     return output
77 | 
78 | 
79 | class _CopyToModelParallelRegion(torch.autograd.Function):
80 |     """Pass the input to the model parallel region."""
81 | 
82 |     @staticmethod
83 |     def forward(ctx, input_):
84 |         return input_
85 | 
86 |     @staticmethod
87 |     def backward(ctx, grad_output):
88 |         return _reduce(grad_output)
89 | 
90 | 
91 | class _ReduceFromModelParallelRegion(torch.autograd.Function):
92 |     """All-reduce the input from the model parallel region."""
93 | 
94 |     @staticmethod
95 |     def forward(ctx, input_):
96 |         return _reduce(input_)
97 | 
98 |     @staticmethod
99 |     def backward(ctx, grad_output):
100 |         return grad_output
101 | 
102 | 
103 | class _ScatterToModelParallelRegion(torch.autograd.Function):
104 |     """Split the input and keep only the chunk corresponding to the rank."""
105 | 
106 |     @staticmethod
107 |     def forward(ctx, input_):
108 |         return _split(input_)
109 | 
110 |     @staticmethod
111 |     def backward(ctx, grad_output):
112 |         return _gather(grad_output)
113 | 
114 | 
115 | class _GatherFromModelParallelRegion(torch.autograd.Function):
116 |     """Gather the input from the model parallel region and concatenate."""
117 | 
118 |     @staticmethod
119 |     def forward(ctx, input_):
120 |         return _gather(input_)
121 | 
122 |     @staticmethod
123 |     def backward(ctx, grad_output):
124 |         return _split(grad_output)
125 | 
126 | 
127 | # -----------------
128 | # Helper functions.
129 | # ----------------- 130 | 131 | def copy_to_model_parallel_region(input_): 132 | return _CopyToModelParallelRegion.apply(input_) 133 | 134 | def reduce_from_model_parallel_region(input_): 135 | return _ReduceFromModelParallelRegion.apply(input_) 136 | 137 | def scatter_to_model_parallel_region(input_): 138 | return _ScatterToModelParallelRegion.apply(input_) 139 | 140 | def gather_from_model_parallel_region(input_): 141 | return _GatherFromModelParallelRegion.apply(input_) 142 | -------------------------------------------------------------------------------- /mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM/4f61ed7237a3b0187f4d62062429348276a78c84/mpu/tests/__init__.py -------------------------------------------------------------------------------- /mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import argparse 17 | import os 18 | import random 19 | import numpy 20 | import torch 21 | 22 | import mpu 23 | 24 | 25 | class IdentityLayer(torch.nn.Module): 26 | def __init__(self, size, scale=1.0): 27 | super(IdentityLayer, self).__init__() 28 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 29 | def forward(self): 30 | return self.weight 31 | 32 | 33 | def set_random_seed(seed): 34 | """Set random seed for reproducability.""" 35 | random.seed(seed) 36 | numpy.random.seed(seed) 37 | torch.manual_seed(seed) 38 | mpu.model_parallel_cuda_manual_seed(seed) 39 | 40 | 41 | def initialize_distributed(backend='nccl'): 42 | """Initialize torch.distributed.""" 43 | # Get local rank in case it is provided. 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('--local_rank', type=int, default=None, 46 | help='local rank passed from distributed launcher') 47 | args = parser.parse_args() 48 | local_rank = args.local_rank 49 | 50 | # Get rank and world size. 51 | rank = int(os.getenv('RANK', '0')) 52 | world_size = int(os.getenv("WORLD_SIZE", '1')) 53 | 54 | print('> initializing torch.distributed with local rank: {}, ' 55 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 56 | 57 | # Set the device id. 58 | device = rank % torch.cuda.device_count() 59 | if local_rank is not None: 60 | device = local_rank 61 | torch.cuda.set_device(device) 62 | 63 | # Call the init process. 
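    # The init method below is assembled from the MASTER_ADDR / MASTER_PORT
    # environment variables, e.g. 'tcp://localhost:6000' with the defaults
    # used here.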
64 | init_method = 'tcp://' 65 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 66 | master_port = os.getenv('MASTER_PORT', '6000') 67 | init_method += master_ip + ':' + master_port 68 | torch.distributed.init_process_group( 69 | backend=backend, 70 | world_size=world_size, 71 | rank=rank, 72 | init_method=init_method) 73 | 74 | 75 | def print_separator(message): 76 | torch.distributed.barrier() 77 | filler_len = (78 - len(message)) // 2 78 | filler = '-' * filler_len 79 | string = '\n' + filler + ' {} '.format(message) + filler 80 | if torch.distributed.get_rank() == 0: 81 | print(string, flush=True) 82 | torch.distributed.barrier() 83 | -------------------------------------------------------------------------------- /mpu/tests/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import random 17 | import sys 18 | sys.path.append("../..") 19 | 20 | import torch 21 | import torch.nn.functional as F 22 | import mpu 23 | from mpu.cross_entropy import vocab_parallel_cross_entropy 24 | 25 | from commons import initialize_distributed 26 | from commons import print_separator 27 | from commons import IdentityLayer 28 | from commons import set_random_seed 29 | 30 | 31 | def torch_cross_entropy(batch_size, seq_length, vocab_size, 32 | logits_scale, seed): 33 | set_random_seed(seed) 34 | identity = IdentityLayer((batch_size, seq_length, vocab_size), 35 | scale=logits_scale).cuda() 36 | logits = identity() 37 | target = torch.cuda.LongTensor( 38 | size=(batch_size, seq_length)).random_(0, vocab_size) 39 | loss = F.cross_entropy(logits.view(-1, logits.size()[-1]), 40 | target.view(-1), 41 | reduction='none').view_as(target).mean() 42 | loss.backward() 43 | return loss, identity.weight.grad 44 | 45 | 46 | def mpu_cross_entropy(batch_size, seq_length, vocab_size, 47 | logits_scale, seed): 48 | set_random_seed(seed) 49 | identity = IdentityLayer((batch_size, seq_length, vocab_size), 50 | scale=logits_scale).cuda() 51 | logits = identity() 52 | logits_parallel = mpu.scatter_to_model_parallel_region(logits) 53 | target = torch.cuda.LongTensor( 54 | size=(batch_size, seq_length)).random_(0, vocab_size) 55 | loss = vocab_parallel_cross_entropy(logits_parallel, target).mean() 56 | loss.backward() 57 | return loss, identity.weight.grad 58 | 59 | 60 | def test_cross_entropy(model_parallel_size): 61 | 62 | if torch.distributed.get_rank() == 0: 63 | print('> testing cross entropy with model parallel size {} ...'. 
64 | format(model_parallel_size)) 65 | 66 | mpu.initialize_model_parallel(model_parallel_size) 67 | model_parallel_size = mpu.get_model_parallel_world_size() 68 | 69 | batch_size = 13 70 | seq_length = 17 71 | vocab_size_per_partition = 11 72 | logits_scale = 1000.0 73 | vocab_size = vocab_size_per_partition * model_parallel_size 74 | seed = 1234 75 | 76 | loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, 77 | vocab_size, logits_scale, 78 | seed) 79 | loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, 80 | vocab_size, logits_scale, 81 | seed) 82 | 83 | error = loss_torch.sub_(loss_mpu).abs().max() 84 | print(' max error in loss on global rank {}: {}'.format( 85 | torch.distributed.get_rank(), error)) 86 | assert error < 1.0e-6 87 | 88 | error = grad_torch.sub_(grad_mpu).abs().max() 89 | print(' max error in grad on global rank {}: {}'.format( 90 | torch.distributed.get_rank(), error)) 91 | assert error < 1.0e-6 92 | 93 | # Reset groups 94 | mpu.destroy_model_parallel() 95 | 96 | torch.distributed.barrier() 97 | if torch.distributed.get_rank() == 0: 98 | print('>> passed the test :-)') 99 | 100 | 101 | if __name__ == '__main__': 102 | 103 | initialize_distributed() 104 | world_size = torch.distributed.get_world_size() 105 | 106 | model_parallel_size = 1 107 | while model_parallel_size <= world_size: 108 | print_separator('test cross entropy') 109 | test_cross_entropy(model_parallel_size) 110 | model_parallel_size *= 2 111 | -------------------------------------------------------------------------------- /mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import functools 17 | import operator 18 | import sys 19 | sys.path.append("../..") 20 | 21 | import torch 22 | import mpu 23 | from mpu import data as data_utils 24 | 25 | from commons import initialize_distributed 26 | from commons import print_separator 27 | 28 | 29 | def test_boradcast_data(model_parallel_size): 30 | 31 | if torch.distributed.get_rank() == 0: 32 | print('> testing boradcast_data with model parallel size {} ...'. 
33 | format(model_parallel_size)) 34 | 35 | mpu.initialize_model_parallel(model_parallel_size) 36 | torch.manual_seed(1234 + mpu.get_data_parallel_rank()) 37 | model_parallel_size = mpu.get_model_parallel_world_size() 38 | 39 | key_size_t = {'key1': [7, 11], 40 | 'key2': [8, 2, 1], 41 | 'key3': [13], 42 | 'key4': [5, 1, 2], 43 | 'key5': [5, 12]} 44 | keys = list(key_size_t.keys()) 45 | 46 | data = {} 47 | data_t = {} 48 | for key in key_size_t: 49 | data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) 50 | data_t[key] = data[key].clone() 51 | data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) 52 | data_t['keyX'] = data['keyX'].clone() 53 | if mpu.get_model_parallel_rank() != 0: 54 | data = None 55 | 56 | data_utils._check_data_types(keys, data_t, torch.int64) 57 | key_size, key_numel, \ 58 | total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) 59 | for key in keys: 60 | assert key_size[key] == key_size_t[key] 61 | total_numel_t = 0 62 | for key in keys: 63 | target_size = functools.reduce(operator.mul, key_size_t[key], 1) 64 | assert key_numel[key] == target_size 65 | total_numel_t += target_size 66 | assert total_numel == total_numel_t 67 | 68 | data_b = data_utils.broadcast_data(keys, data, torch.int64) 69 | for key in keys: 70 | tensor = data_t[key].cuda() 71 | assert data_b[key].sub(tensor).abs().max() == 0 72 | 73 | # Reset groups 74 | mpu.destroy_model_parallel() 75 | 76 | torch.distributed.barrier() 77 | if torch.distributed.get_rank() == 0: 78 | print('>> passed the test :-)') 79 | 80 | 81 | if __name__ == '__main__': 82 | 83 | initialize_distributed() 84 | world_size = torch.distributed.get_world_size() 85 | 86 | model_parallel_size = 1 87 | while model_parallel_size <= world_size: 88 | print_separator('test test boradcast data') 89 | test_boradcast_data(model_parallel_size) 90 | model_parallel_size *= 2 91 | 92 | 93 | -------------------------------------------------------------------------------- /mpu/tests/test_initialize.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import sys 17 | sys.path.append("../..") 18 | 19 | import torch 20 | import mpu 21 | 22 | from commons import initialize_distributed 23 | from commons import print_separator 24 | 25 | 26 | def test_initialize_model_parallel(model_parallel_size): 27 | 28 | if torch.distributed.get_rank() == 0: 29 | print('> testing initialize_model_parallel with size {} ...'.format( 30 | model_parallel_size)) 31 | model_parallel_size_ = min(model_parallel_size, 32 | torch.distributed.get_world_size()) 33 | assert not mpu.model_parallel_is_initialized() 34 | mpu.initialize_model_parallel(model_parallel_size_) 35 | assert mpu.model_parallel_is_initialized() 36 | 37 | # Checks. 
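    # Small helper: assert that the group's world size and this rank's index
    # within the group match the expected values.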
38 | def check(group, world_size, rank): 39 | assert world_size == torch.distributed.get_world_size(group=group) 40 | assert rank == torch.distributed.get_rank(group=group) 41 | 42 | # Model parallel. 43 | world_size = model_parallel_size_ 44 | rank = torch.distributed.get_rank() % model_parallel_size_ 45 | assert world_size == mpu.get_model_parallel_world_size() 46 | assert rank == mpu.get_model_parallel_rank() 47 | check(mpu.get_model_parallel_group(), world_size, rank) 48 | 49 | 50 | # Data parallel. 51 | world_size = torch.distributed.get_world_size() // model_parallel_size_ 52 | rank = torch.distributed.get_rank() // model_parallel_size 53 | assert world_size == mpu.get_data_parallel_world_size() 54 | assert rank == mpu.get_data_parallel_rank() 55 | check(mpu.get_data_parallel_group(), world_size, rank) 56 | 57 | # Reset groups 58 | mpu.destroy_model_parallel() 59 | 60 | torch.distributed.barrier() 61 | if torch.distributed.get_rank() == 0: 62 | print('>> passed the test :-)') 63 | 64 | 65 | def test_get_model_parallel_src_rank(model_parallel_size_): 66 | 67 | if torch.distributed.get_rank() == 0: 68 | print('> testing get_model_parallel_src_rank with size {} ...'.format( 69 | model_parallel_size_)) 70 | model_parallel_size = min(model_parallel_size_, 71 | torch.distributed.get_world_size()) 72 | assert not mpu.model_parallel_is_initialized() 73 | mpu.initialize_model_parallel(model_parallel_size) 74 | assert mpu.model_parallel_is_initialized() 75 | 76 | # Checks 77 | src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank() 78 | assert mpu.get_model_parallel_src_rank() == src_rank 79 | 80 | # Reset groups 81 | mpu.destroy_model_parallel() 82 | 83 | torch.distributed.barrier() 84 | if torch.distributed.get_rank() == 0: 85 | print('>> passed the test :-)') 86 | 87 | 88 | if __name__ == '__main__': 89 | 90 | initialize_distributed() 91 | world_size = torch.distributed.get_world_size() 92 | model_parallel_size = 1 93 | while model_parallel_size <= world_size: 94 | print_separator('test initialize model parallel') 95 | test_initialize_model_parallel(model_parallel_size) 96 | print_separator('test model parallel source rank') 97 | test_get_model_parallel_src_rank(model_parallel_size) 98 | model_parallel_size *= 2 99 | -------------------------------------------------------------------------------- /mpu/tests/test_random.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import sys 17 | sys.path.append("../..") 18 | 19 | import torch 20 | import mpu 21 | 22 | from commons import initialize_distributed 23 | from commons import print_separator 24 | 25 | 26 | def test_set_cuda_rng_state(model_parallel_size): 27 | 28 | if torch.distributed.get_rank() == 0: 29 | print('> testing set_rng_state with size {} ...'. 
30 | format(model_parallel_size)) 31 | 32 | mpu.initialize_model_parallel(model_parallel_size) 33 | model_parallel_size = mpu.get_model_parallel_world_size() 34 | 35 | size = 123 36 | seed = 1234 37 | torch.cuda.manual_seed(1234) 38 | tensor = torch.cuda.FloatTensor(size) 39 | 40 | # Get the state 41 | rng_state = torch.cuda.get_rng_state() 42 | rng_state_copy = rng_state.clone() 43 | 44 | # Do some stuff. 45 | for _ in range(5): 46 | torch.randn(size, out=tensor) 47 | result_1 = tensor.clone() 48 | 49 | assert rng_state.sub(rng_state_copy).max() == 0 50 | assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0 51 | 52 | # State should be different. 53 | new_rng_state = torch.cuda.get_rng_state() 54 | max_diff = new_rng_state.sub(rng_state).max() 55 | print(' max diff in rng state (should be non-zero) on global rank {}: {}'. 56 | format(torch.distributed.get_rank(), max_diff)) 57 | assert max_diff > 0 58 | 59 | # Reset the rng state and do the same stuff. 60 | mpu.random._set_cuda_rng_state(rng_state) 61 | for _ in range(5): 62 | torch.randn(size, out=tensor) 63 | mpu.random._set_cuda_rng_state(rng_state) 64 | for _ in range(5): 65 | torch.randn(size, out=tensor) 66 | result_2 = tensor.clone() 67 | 68 | # Results should be the same 69 | error = result_2.sub(result_1).abs().max() 70 | print(' max error in generated tensors (should be zero) on ' 71 | 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) 72 | assert error < 1.0e-6 73 | 74 | # Input state should have remained intact. 75 | error = rng_state.sub(rng_state_copy).max() 76 | print(' max error in rng state (should be zero) on global rank {}: {}'. 77 | format(torch.distributed.get_rank(), error)) 78 | assert error == 0 79 | 80 | # Reset groups 81 | mpu.destroy_model_parallel() 82 | 83 | torch.distributed.barrier() 84 | if torch.distributed.get_rank() == 0: 85 | print('>> passed the test :-)') 86 | 87 | 88 | def test_cuda_rng_tracker(model_parallel_size): 89 | 90 | if torch.distributed.get_rank() == 0: 91 | print('> testing cuda rng tracker with size {} ...'. 92 | format(model_parallel_size)) 93 | 94 | mpu.initialize_model_parallel(model_parallel_size) 95 | model_parallel_size = mpu.get_model_parallel_world_size() 96 | 97 | seed_1 = 1234 98 | seed_2 = 4321 99 | size = [12, 21] 100 | tensor = torch.cuda.FloatTensor(size) 101 | 102 | # Set to seed_1 and generate two tensors. 103 | torch.cuda.manual_seed(seed_1) 104 | torch.randn(size, out=tensor) 105 | target_11 = tensor.clone() 106 | torch.randn(size, out=tensor) 107 | target_12 = tensor.clone() 108 | 109 | # Set to seed_2 and generate two tensors. 
110 | torch.cuda.manual_seed(seed_2) 111 | torch.randn(size, out=tensor) 112 | target_21 = tensor.clone() 113 | torch.randn(size, out=tensor) 114 | target_22 = tensor.clone() 115 | 116 | # Now if we interleave seed_1 and seed_2, 117 | # we should still get the same tensors 118 | torch.cuda.manual_seed(seed_1) 119 | mpu.get_cuda_rng_tracker().add('test', seed_2) 120 | 121 | torch.randn(size, out=tensor) 122 | result_11 = tensor.clone() 123 | 124 | with mpu.get_cuda_rng_tracker().fork('test'): 125 | torch.randn(size, out=tensor) 126 | result_21 = tensor.clone() 127 | 128 | torch.randn(size, out=tensor) 129 | result_12 = tensor.clone() 130 | 131 | with mpu.get_cuda_rng_tracker().fork('test'): 132 | torch.randn(size, out=tensor) 133 | result_22 = tensor.clone() 134 | 135 | diff = result_11.sub(result_21).abs().max() 136 | diff = min(diff, result_12.sub(result_22).abs().max()) 137 | print(' max diff in generated tensors (should be non-zero) on ' 138 | 'global rank {}: {}'.format(torch.distributed.get_rank(), diff)) 139 | assert diff > 1.0e-6 140 | error = max(result_11.sub(target_11).abs().max(), 141 | result_12.sub(target_12).abs().max()) 142 | error = max(error, result_21.sub(target_21).abs().max()) 143 | error = max(error, result_22.sub(target_22).abs().max()) 144 | print(' max error in generated tensors (should be zero) on ' 145 | 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) 146 | assert error < 1.0e-6 147 | 148 | # Reset the tracker 149 | mpu.get_cuda_rng_tracker().reset() 150 | 151 | # Reset groups 152 | mpu.destroy_model_parallel() 153 | 154 | torch.distributed.barrier() 155 | if torch.distributed.get_rank() == 0: 156 | print('>> passed the test :-)') 157 | 158 | 159 | def test_model_parallel_cuda_manual_seed(model_parallel_size): 160 | 161 | if torch.distributed.get_rank() == 0: 162 | print('> testing model parallel cuda manual seed with size {} ...'. 
163 | format(model_parallel_size)) 164 | 165 | mpu.initialize_model_parallel(model_parallel_size) 166 | model_parallel_size = mpu.get_model_parallel_world_size() 167 | 168 | mpu.model_parallel_cuda_manual_seed(12345) 169 | assert torch.cuda.initial_seed() == 12345 170 | with mpu.get_cuda_rng_tracker().fork(): 171 | assert torch.cuda.initial_seed() == (12345 + 2718 + 172 | mpu.get_model_parallel_rank()) 173 | 174 | # Reset the tracker 175 | mpu.get_cuda_rng_tracker().reset() 176 | 177 | # Reset groups 178 | mpu.destroy_model_parallel() 179 | 180 | torch.distributed.barrier() 181 | if torch.distributed.get_rank() == 0: 182 | print('>> passed the test :-)') 183 | 184 | 185 | if __name__ == '__main__': 186 | 187 | initialize_distributed() 188 | world_size = torch.distributed.get_world_size() 189 | 190 | model_parallel_size = 1 191 | while model_parallel_size <= world_size: 192 | print_separator('test set rng state') 193 | test_set_cuda_rng_state(model_parallel_size) 194 | model_parallel_size *= 2 195 | 196 | model_parallel_size = 1 197 | while model_parallel_size <= world_size: 198 | print_separator('test cuda rng tracker') 199 | test_cuda_rng_tracker(model_parallel_size) 200 | model_parallel_size *= 2 201 | 202 | model_parallel_size = 1 203 | while model_parallel_size <= world_size: 204 | print_separator('test model parallel cuda manual seed') 205 | test_model_parallel_cuda_manual_seed(model_parallel_size) 206 | model_parallel_size *= 2 207 | 208 | -------------------------------------------------------------------------------- /mpu/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | 20 | def ensure_divisibility(numerator, denominator): 21 | """Ensure that numerator is divisible by the denominator.""" 22 | assert numerator % denominator == 0, '{} is not divisible by {}'.format( 23 | numerator, denominator) 24 | 25 | 26 | def divide(numerator, denominator): 27 | """Ensure that numerator is divisible by the denominator and return 28 | the division value.""" 29 | ensure_divisibility(numerator, denominator) 30 | return numerator // denominator 31 | 32 | 33 | def split_tensor_along_last_dim(tensor, num_partitions, 34 | contiguous_split_chunks=False): 35 | """Split a tensor along its last dimension. 36 | Arguments: 37 | tensor: input tensor. 38 | num_partitions: number of partitions to split the tensor 39 | contiguous_split_chunks: If True, make each chunk contiguous 40 | in memory. 41 | """ 42 | # Get the size and dimension. 43 | last_dim = tensor.dim() - 1 44 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 45 | # Split. 46 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 47 | # Note: torch.split does not create contiguous tensors by default. 
48 |     if contiguous_split_chunks:
49 |         return tuple(chunk.contiguous() for chunk in tensor_list)
50 | 
51 |     return tensor_list
52 | 
53 | 
54 | class VocabUtility:
55 |     """Split the vocabulary into `world_size` chunks and return the
56 |         first and last index of the vocabulary belonging to the `rank`
57 |         partition. Note that indices are in [first, last)."""
58 | 
59 |     @staticmethod
60 |     def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
61 |                                                   rank, world_size):
62 |         index_f = rank * per_partition_vocab_size
63 |         index_l = index_f + per_partition_vocab_size
64 |         return index_f, index_l
65 | 
66 |     @staticmethod
67 |     def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
68 |         per_partition_vocab_size = divide(global_vocab_size, world_size)
69 |         return VocabUtility.vocab_range_from_per_partition_vocab_size(
70 |             per_partition_vocab_size, rank, world_size)
71 | 
--------------------------------------------------------------------------------
/process_grid.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import json
4 | import glob
5 | import statistics
6 | 
7 | path_pattern = sys.argv[1]
8 | target_type = sys.argv[2]
9 | best_value, best_result, best_name = None, None, None
10 | mean_result = {}
11 | print(path_pattern)
12 | for dir_path in glob.glob(path_pattern, recursive=True):
13 |     entry = os.path.basename(dir_path)
14 |     valid_result = None
15 |     test_found = os.path.exists(os.path.join(dir_path, "test_results.json"))
16 |     valid_path = os.path.join(dir_path, "results.json")
17 |     if os.path.exists(valid_path):
18 |         print(entry)
19 |         with open(valid_path) as file:
20 |             valid_result = json.load(file)
21 |     else:
22 |         print(f"{entry} no validation results")
23 |         continue
24 |     if not test_found:
25 |         print(f"{entry} not tested yet")
26 |     if target_type == "max":
27 |         metric = sys.argv[3]
28 |         metric_value = valid_result[metric]
29 |         if best_value is None or metric_value > best_value:
30 |             best_value = metric_value
31 |             best_result = valid_result
32 |             best_name = entry
33 |     elif target_type == "mean" or target_type == "median":
34 |         if mean_result:
35 |             for metric, value in valid_result.items():
36 |                 if metric not in ["type", "epoch"]:
37 |                     mean_result[metric].append(value)
38 |         else:
39 |             mean_result = {metric: [value] for metric, value in valid_result.items() if
40 |                            metric not in ["type", "epoch"]}
41 | 
42 | if target_type == "max":
43 |     print(f"Best result found at {best_name}: {best_result}")
44 | elif target_type == "mean":
45 |     mean_result = {metric: sum(value) / len(value) for metric, value in mean_result.items()}
46 |     print(f"Mean result {mean_result}")
47 | elif target_type == "median":
48 |     mean_result = {metric: statistics.median(value) for metric, value in mean_result.items()}
49 |     print(f"Median result {mean_result}")
50 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | botocore
2 | boto3
3 | deepspeed
4 | filelock
5 | scipy
6 | nltk
7 | regex
8 | tqdm
9 | matplotlib
10 | pandas
11 | requests
12 | sentencepiece
13 | ftfy
14 | langdetect
15 | lsh
16 | scikit_learn
17 | tensorboardX
18 | termcolor
19 | tldextract
20 | transformers
21 | rouge_score
22 | fasttext
23 | unidecode
--------------------------------------------------------------------------------
/run_test.py:
--------------------------------------------------------------------------------
1 | 
import sys 2 | 3 | if sys.argv[1] == 'block': 4 | from test.test_block import main 5 | main() 6 | elif sys.argv[1] == 'rel_shift': 7 | from test.test_rel_shift import main 8 | main() -------------------------------------------------------------------------------- /scripts/convert_glm_checkpoint_to_transformers.py: -------------------------------------------------------------------------------- 1 | """ 2 | You can use `scripts/convert_glm_checkpoint_to_transformers.py` to convert the checkpoint 3 | ```shell 4 | python scripts/convert_glm_checkpoint_to_transformers.py CHECKPOINT_PATH MODEL_NAME 5 | ``` 6 | where `CHECKPOINT_PATH` is the path to the `mp_rank_00_model_states.pt` file, 7 | MODEL_NAME is the repo name on huggingface hub 8 | (should be in `["glm-large", "glm-roberta-large", "glm-large-chinese", "glm-515m", "glm-2b", "glm-10b", 9 | "glm-10b-chinese"]`). 10 | The `pytorch_model.bin` will be saved under the same directory as `mp_rank_00_model_states.pt`. 11 | """ 12 | import os 13 | import sys 14 | import torch 15 | 16 | 17 | def convert_glm_checkpoint_to_transformers(checkpoint_path, copy_dict=None): 18 | checkpoint = torch.load(checkpoint_path, map_location='cpu') 19 | state_dict = checkpoint['module'] 20 | if copy_dict is not None: 21 | word_embeddings = state_dict['word_embeddings.weight'] 22 | for src_id, dest_id in copy_dict: 23 | word_embeddings[dest_id] = word_embeddings[src_id] 24 | directory = os.path.dirname(checkpoint_path) 25 | output_path = os.path.join(directory, "pytorch_model.bin") 26 | torch.save(state_dict, output_path) 27 | 28 | 29 | if __name__ == "__main__": 30 | checkpoint_path = sys.argv[1] 31 | model_name = sys.argv[2] 32 | copy_dict = None 33 | assert model_name in ["glm-large", "glm-roberta-large", "glm-large-chinese", "glm-515m", "glm-2b", "glm-10b", 34 | "glm-10b-chinese"] 35 | if model_name == "glm-10b-chinese": 36 | copy_dict = [(50007, 50009)] 37 | convert_glm_checkpoint_to_transformers(checkpoint_path, copy_dict) 38 | -------------------------------------------------------------------------------- /scripts/dispatcher.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import argparse 3 | import subprocess 4 | import multiprocessing 5 | import datetime 6 | import pickle 7 | import csv 8 | import random 9 | import copy 10 | 11 | import itertools as it 12 | import json 13 | 14 | from termcolor import colored 15 | 16 | 17 | CONFIG = [ 18 | { 19 | "lr": [1e-5,2e-5], 20 | "batch-size": [16,32], 21 | "epochs": [20,40], 22 | "warmup": [0.1], 23 | "weight-decay": [0.1], 24 | # "adam-beta2": [0.98], 25 | # "adam-eps": [1e-8], 26 | "seed": [1,2,3] 27 | } 28 | ] 29 | 30 | TASK_CONFIG = { 31 | "rte": ( 32 | "--task rte " 33 | "--data-dir /root/data/superglue/RTE " 34 | "--seq-length 256 " 35 | ), 36 | "cb": ( 37 | "--task cb " 38 | "--data-dir /root/data/superglue/CB " 39 | "--seq-length 256 " 40 | ), 41 | "multirc": ( 42 | "--task multirc " 43 | "--data-dir /root/data/superglue/MultiRC " 44 | "--seq-length 430 " 45 | ), 46 | } 47 | 48 | MODEL_CONFIG = { 49 | "blocklm-roberta-large": ( 50 | "--block-lm " 51 | "--cloze-eval " 52 | "--num-layers 24 " 53 | "--hidden-size 1024 " 54 | "--num-attention-heads 16 " 55 | "--max-position-embeddings 512 " 56 | "--tokenizer-model-type roberta " 57 | "--tokenizer-type GPT2BPETokenizer " 58 | "--load-pretrained /root/data/checkpoints/blocklm-roberta-large/250000 " 59 | ), 60 | "blocklm-base-na": ( 61 | "--block-lm " 62 | "--cloze-eval " 63 | "--num-layers 12 " 64 | 
"--hidden-size 768 " 65 | "--num-attention-heads 12 " 66 | "--max-position-embeddings 512 " 67 | "--tokenizer-model-type bert-base-uncased " 68 | "--tokenizer-type BertWordPieceTokenizer " 69 | "--load-pretrained /root/data/checkpoints/blocklm-base-len6-na03-12-21-21" 70 | ), 71 | } 72 | 73 | CHECKPOINT_PATH = "/root/data/finetune_checkpoints" 74 | RESULT_PATH = "runs/{EXPERIMENT_NAME}/results.json" 75 | LOG_PATH = "logs/" 76 | 77 | DISTRIBUTED_ARGS = "--nproc_per_node {N_GPU} --nnodes 1 --node_rank 0 --master_addr localhost --master_port {MASTER_PORT}" 78 | 79 | COMMON_ARGS = ( 80 | "--save-interval 10000 " 81 | "--log-interval 50 " 82 | "--eval-interval 1000 " 83 | "--eval-iters 100 ") 84 | 85 | 86 | def get_command(model, task, n_gpu, config, overwrite=True): 87 | 88 | distributed_args = DISTRIBUTED_ARGS.format(N_GPU=n_gpu, MASTER_PORT=random.randint(10000, 65535)) 89 | 90 | config = copy.deepcopy(config) 91 | hyper = "-".join([f"{k}-{v}" for k,v in config.items()]) 92 | experiment_name = f"{model}-{task}/{hyper}" 93 | 94 | command = (f"python -m torch.distributed.launch {distributed_args} finetune_gpt2.py " 95 | f"--finetune {MODEL_CONFIG[model]} {TASK_CONFIG[task]} {COMMON_ARGS} " 96 | f"--experiment-name {experiment_name} " 97 | f"--save {CHECKPOINT_PATH} " 98 | f"--checkpoint-activations " 99 | f"--eval-batch-size 16 ") 100 | 101 | config["batch-size"] = config["batch-size"] // n_gpu 102 | command = update_cmd(command, config) 103 | if overwrite: 104 | command += "--overwrite " 105 | 106 | result_path = RESULT_PATH.format(EXPERIMENT_NAME=experiment_name) 107 | log_path = LOG_PATH + f"{model}-{task}-{hyper}.txt" 108 | 109 | return command, result_path, log_path 110 | 111 | 112 | def chain_configs(configs): 113 | ''' 114 | @param configs list of configurations 115 | ''' 116 | all_configs = [] 117 | for config in configs: 118 | # preserve order of configs 119 | keys = sorted(config) 120 | all_args = it.product(*(config[k] for k in keys)) 121 | all_args_dict = [dict(zip(keys, c)) for c in all_args] 122 | 123 | all_configs.append(all_args_dict) 124 | 125 | return it.chain(*all_configs) # flatten result 126 | 127 | 128 | def update_cmd(cmd, config): 129 | ''' 130 | @param cmd str 131 | @param configs list of dicts 132 | ''' 133 | for k, v in config.items(): 134 | if v is None: 135 | continue 136 | if type(v) == bool: 137 | if v: 138 | cmd += "--{} ".format(k) 139 | else: 140 | cmd += "--{} {} ".format(k, v) 141 | 142 | return cmd 143 | 144 | 145 | def parse_args(): 146 | parser = argparse.ArgumentParser(description='Dispatcher to run all experiments') 147 | 148 | parser.add_argument("--gpu", type=str, default='0,1,2,3', 149 | help='list of available gpus') 150 | parser.add_argument("--n_gpu", type=int, default=1, 151 | help="number of gpus per job") 152 | parser.add_argument("--model", type=str, default='blocklm-roberta-large') 153 | parser.add_argument("--task", type=str, required=True) 154 | parser.add_argument("--overwrite", action='store_true', default=False, 155 | help='whether to rerun experiments with the same result ' 156 | 'file location') 157 | parser.add_argument("--debug", action='store_true', default=False) 158 | 159 | return parser.parse_args() 160 | 161 | 162 | def main(): 163 | args = parse_args() 164 | assert args.model in MODEL_CONFIG 165 | assert args.task in TASK_CONFIG 166 | 167 | # compute cartesian product for each set of configurations 168 | configs = chain_configs(CONFIG) 169 | all_configs = configs 170 | 171 | # queues 172 | gpu_list = args.gpu.split(',') 173 | 
total_gpu = len(gpu_list) 174 | 175 | gpu_queues = [] 176 | for i in range(0, total_gpu, args.n_gpu): 177 | gpu = ','.join(gpu_list[i:i+args.n_gpu]) 178 | gpu_queues.append((multiprocessing.Queue(), gpu)) 179 | done_queue = multiprocessing.Queue() 180 | 181 | results = [] 182 | indx = 0 183 | num_jobs = 0 184 | 185 | for config in all_configs: 186 | gpu_queues[indx][0].put(config) 187 | indx = (indx + 1) % len(gpu_queues) 188 | num_jobs += 1 189 | 190 | for job_queue, gpu in gpu_queues: 191 | print("Start GPU worker {} with {} jobs".format(gpu, job_queue.qsize())) 192 | multiprocessing.Process(target=_worker, args=(gpu, job_queue, done_queue, args)).start() 193 | 194 | timestamp = datetime.datetime.now().strftime("%m-%d-%H-%M") 195 | summary_path = LOG_PATH + f"grid_{args.model}-{args.task}_{timestamp}.txt" 196 | 197 | print("Summary path:", summary_path) 198 | 199 | for _ in range(num_jobs): 200 | result_path, config = done_queue.get() 201 | 202 | try: 203 | res = json.load(open(result_path)) 204 | except Exception as e: 205 | print("Experiment at {} failed".format(colored(result_path, 'red'))) 206 | print(e) 207 | continue 208 | 209 | with open(summary_path, "a") as f: 210 | f.write("Config: " + json.dumps(config) + "\n") 211 | f.write(json.dumps(res) + "\n") 212 | 213 | print('Done') 214 | 215 | 216 | def _worker(gpu, queue, done_queue, args): 217 | while not queue.empty(): 218 | config = queue.get() 219 | if config is None: 220 | return 221 | done_queue.put(_launch_experiment(gpu, config, args)) 222 | 223 | 224 | def _launch_experiment(gpu, config, args): 225 | 226 | command, result_path, log_path = get_command(args.model, args.task, args.n_gpu, config, args.overwrite) 227 | 228 | shell_cmd = f"CUDA_VISIBLE_DEVICES={gpu} " + command 229 | if not args.debug: 230 | shell_cmd += f" > {log_path} 2>&1; " 231 | 232 | print("Time {}, launched exp: {}".format(str(datetime.datetime.now()), log_path)) 233 | 234 | # if experiment has already been run, skip 235 | if not os.path.exists(result_path) or args.overwrite: 236 | return_code = subprocess.call(shell_cmd, shell=True) 237 | 238 | if not os.path.exists(result_path): 239 | # running this process failed, alert me 240 | print("Dispatcher, Alert! Job has crashed! 
Check logfile at:[{}]".format(log_path)) 241 | 242 | return result_path, config 243 | 244 | 245 | 246 | if __name__ == "__main__": 247 | main() 248 | -------------------------------------------------------------------------------- /scripts/ds_finetune_record.sh: -------------------------------------------------------------------------------- 1 | MP_SIZE=1 2 | DATA_ROOT=/dataset/c07bd62b/superglue 3 | GLUE_DATA_ROOT=/dataset/c07bd62b/glue_data 4 | source config_tasks/model_blocklm_10B.sh 5 | source config_tasks/task_record.sh 6 | 7 | CHECKPOINT_PATH="/dataset/c07bd62b/finetune_checkpoints" 8 | 9 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 10 | 11 | OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2" 12 | DISTRIBUTED_ARGS="${OPTIONS_NCCL} deepspeed --num_gpus 4 --num_nodes 1 --master_port $MASTER_PORT" 13 | DATESTR=$(date +"%m-%d-%H-%M") 14 | 15 | EXPERIMENT_NAME=${EXPERIMENT_NAME}_${DATESTR} 16 | 17 | mkdir logs 18 | run_cmd="${DISTRIBUTED_ARGS} finetune_gpt2.py \ 19 | --deepspeed \ 20 | --deepspeed_config config_tasks/config_blocklm_10B_record.json \ 21 | --finetune \ 22 | --experiment-name ${EXPERIMENT_NAME} \ 23 | --task ${TASK_NAME} \ 24 | --data-dir ${DATA_PATH} \ 25 | --save ${CHECKPOINT_PATH} \ 26 | --seq-length ${MAX_SEQ_LEN} \ 27 | --checkpoint-activations \ 28 | --eval-batch-size 2 \ 29 | --save-epoch 100 \ 30 | --num-workers 1 \ 31 | --no-load-optim \ 32 | --no-load-lr-scheduler \ 33 | --fp16 \ 34 | $MODEL_ARGS \ 35 | $TRAIN_ARGS \ 36 | $COMMON_ARGS \ 37 | --model-parallel-size ${MP_SIZE} \ 38 | --epochs ${EPOCH_SINGLE} \ 39 | --overwrite \ 40 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}.txt" 41 | 42 | echo ${run_cmd} 43 | eval ${run_cmd} 44 | -------------------------------------------------------------------------------- /scripts/ds_finetune_seq2seq.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/root/data 2 | CHECKPOINT_PATH="/dataset/c07bd62b/finetune_checkpoints" 3 | SAVE_PATH=/root/data/finetune_checkpoints 4 | DATESTR=$(date +"%m-%d-%H-%M") 5 | 6 | source $1 # Model 7 | source $2 # Task 8 | 9 | NUM_WORKERS=2 10 | NUM_GPUS_PER_WORKER=8 11 | HOST_FILE_PATH="./hostfile" 12 | MP_SIZE=1 13 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 14 | 15 | OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2" 16 | DISTRIBUTED_ARGS="${OPTIONS_NCCL} deepspeed --hostfile ${HOST_FILE_PATH} --master_port ${MASTER_PORT} --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER}" 17 | 18 | EXPERIMENT_NAME=${EXPERIMENT_NAME}_${DATESTR} 19 | mkdir logs 20 | run_cmd="${DISTRIBUTED_ARGS} finetune_glm.py \ 21 | --deepspeed \ 22 | --deepspeed_config config_tasks/config_blocklm_10B_cnndm.json \ 23 | --finetune \ 24 | --experiment-name ${EXPERIMENT_NAME} \ 25 | --task ${TASK_NAME} \ 26 | --data-dir ${DATA_PATH} \ 27 | --save ${SAVE_PATH} \ 28 | --checkpoint-activations \ 29 | --num-workers 1 \ 30 | --no-load-lr-scheduler \ 31 | $MODEL_ARGS \ 32 | $TRAIN_ARGS \ 33 | $COMMON_ARGS \ 34 | $TASK_ARGS \ 35 | --fp16 \ 36 | --model-parallel-size ${MP_SIZE} \ 37 | --overwrite \ 38 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}.txt" 39 | 40 | echo ${run_cmd} 41 | eval ${run_cmd} 42 | -------------------------------------------------------------------------------- /scripts/ds_finetune_superglue.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/dataset/fd5061f6/tuteng/BlockLM/data 2 | CHECKPOINT_PATH=/dataset/fd5061f6/english_data/checkpoints 3 | 
SAVE_PATH=/dataset/fd5061f6/tuteng/BlockLM/finetune_checkpoints 4 | DATESTR=$(date +"%m-%d-%H-%M") 5 | 6 | source $1 # Model 7 | source $2 # Task 8 | 9 | NUM_WORKERS=1 10 | NUM_GPUS_PER_WORKER=8 11 | MP_SIZE=1 12 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 13 | 14 | OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2" 15 | DISTRIBUTED_ARGS="${OPTIONS_NCCL} deepspeed --master_port $MASTER_PORT --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER}" 16 | 17 | EXPERIMENT_NAME=${EXPERIMENT_NAME}_${DATESTR} 18 | mkdir logs 19 | run_cmd="${DISTRIBUTED_ARGS} finetune_glm.py \ 20 | --deepspeed \ 21 | --deepspeed_config config_tasks/config_blocklm_10B.json \ 22 | --finetune \ 23 | --cloze-eval \ 24 | --experiment-name ${EXPERIMENT_NAME} \ 25 | --task ${TASK_NAME} \ 26 | --data-dir ${DATA_PATH} \ 27 | --save ${CHECKPOINT_PATH} \ 28 | --seq-length ${MAX_SEQ_LEN} \ 29 | --checkpoint-activations \ 30 | --eval-batch-size 16 \ 31 | --save-epoch 100000 \ 32 | --num-workers 1 \ 33 | --no-load-optim \ 34 | --no-load-lr-scheduler \ 35 | $MODEL_ARGS \ 36 | $TRAIN_ARGS \ 37 | $COMMON_ARGS \ 38 | --pattern-id 0 \ 39 | --fp16 \ 40 | --model-parallel-size ${MP_SIZE} \ 41 | --epochs ${XXLARGE_EPOCH} \ 42 | --overwrite \ 43 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}.txt" 44 | 45 | echo ${run_cmd} 46 | eval ${run_cmd} 47 | -------------------------------------------------------------------------------- /scripts/ds_finetune_superglue_prompt.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/root/data/superglue 2 | CHECKPOINT_PATH=/root/data/checkpoints 3 | SAVE_PATH=/root/data/finetune_checkpoints 4 | DATESTR=$(date +"%m-%d-%H-%M") 5 | 6 | source $1 # Model 7 | source $2 # Task 8 | 9 | NUM_WORKERS=1 10 | NUM_GPUS_PER_WORKER=8 11 | MP_SIZE=1 12 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 13 | 14 | OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2" 15 | DISTRIBUTED_ARGS="${OPTIONS_NCCL} deepspeed --master_port $MASTER_PORT --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER}" 16 | 17 | EXPERIMENT_NAME=${EXPERIMENT_NAME}_${DATESTR} 18 | mkdir logs 19 | run_cmd="${DISTRIBUTED_ARGS} finetune_glm.py \ 20 | --deepspeed \ 21 | --deepspeed_config config_tasks/config_blocklm_10B.json \ 22 | --finetune \ 23 | --cloze-eval \ 24 | --experiment-name ${EXPERIMENT_NAME} \ 25 | --task ${TASK_NAME} \ 26 | --data-dir ${DATA_PATH} \ 27 | --save ${CHECKPOINT_PATH} \ 28 | --seq-length ${MAX_SEQ_LEN} \ 29 | --checkpoint-activations \ 30 | --eval-batch-size 16 \ 31 | --save-epoch 100000 \ 32 | --num-workers 1 \ 33 | --no-load-optim \ 34 | --no-load-lr-scheduler \ 35 | $MODEL_ARGS \ 36 | $TRAIN_ARGS \ 37 | $COMMON_ARGS \ 38 | --fp16 \ 39 | --model-parallel-size ${MP_SIZE} \ 40 | --continuous-prompt \ 41 | --num-prompt-tokens 3 \ 42 | --epochs ${XXLARGE_EPOCH} \ 43 | --overwrite \ 44 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}.txt" 45 | 46 | echo ${run_cmd} 47 | eval ${run_cmd} 48 | -------------------------------------------------------------------------------- /scripts/ds_pretrain_nvidia.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Change for multinode config 4 | 5 | NUM_WORKERS=32 6 | NUM_GPUS_PER_WORKER=8 7 | MP_SIZE=1 8 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 9 | 10 | source $1 11 | DATESTR=$(date +"%m-%d-%H-%M") 12 | 13 | OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2" 14 | HOST_FILE_PATH="/workspace/hostfile" 15 | 16 | mkdir logs 17 | run_cmd="${OPTIONS_NCCL} deepspeed --master_port ${MASTER_PORT} --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} --hostfile ${HOST_FILE_PATH} pretrain_glm.py ${gpt_options} 2>&1 | tee logs/log-${DATESTR}.txt" 18 | echo ${run_cmd} 19 | eval ${run_cmd} 20 | 21 | set +x -------------------------------------------------------------------------------- /scripts/evaluate_lm.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/dataset/c07bd62b 2 | CHECKPOINT_PATH="/dataset/c07bd62b/checkpoints" 3 | 4 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 5 | DISTRIBUTED_ARGS="--nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr localhost --master_port $MASTER_PORT" 6 | DATESTR=$(date +"%m-%d-%H-%M") 7 | 8 | source $1 # Model 9 | source $2 # Task 10 | 11 | python -m torch.distributed.launch $DISTRIBUTED_ARGS finetune_glm.py \ 12 | --deepspeed \ 13 | --finetune \ 14 | --experiment-name ${EXPERIMENT_NAME} \ 15 | --task ${TASK_NAME} \ 16 | --valid-data ${DATA_PATH} \ 17 | --save ${CHECKPOINT_PATH} \ 18 | --checkpoint-activations \ 19 | --fp16 \ 20 | --overwrite \ 21 | $MODEL_ARGS \ 22 | $EVALUATE_ARGS \ 23 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}_${DATESTR}.txt -------------------------------------------------------------------------------- /scripts/evaluate_multichoice.sh: -------------------------------------------------------------------------------- 1 | CHECKPOINT_PATH= 2 | DATA_PATH= 3 | 4 | source $1 # Model 5 | 6 | NUM_WORKERS=1 7 | NUM_GPUS_PER_WORKER=1 8 | MP_SIZE=1 9 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 10 | MAX_SEQ_LEN=512 11 | 12 | OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2" 13 | DISTRIBUTED_ARGS="${OPTIONS_NCCL} deepspeed --master_port $MASTER_PORT --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER}" 14 | 15 | mkdir logs 16 | run_cmd="${DISTRIBUTED_ARGS} finetune_glm.py \ 17 | --deepspeed \ 18 | --deepspeed_config config_tasks/config_blocklm_10B.json \ 19 | --finetune \ 20 | --cloze-eval \ 21 | --task multichoice \ 22 | --test-data ${DATA_PATH} \ 23 | --seq-length ${MAX_SEQ_LEN} \ 24 | --checkpoint-activations \ 25 | --eval-batch-size 16 \ 26 | --num-workers 1 \ 27 | --no-load-optim \ 28 | --no-load-lr-scheduler \ 29 | $MODEL_ARGS \ 30 | --fp16 \ 31 | --model-parallel-size ${MP_SIZE} \ 32 | --epochs 0 \ 33 | --overwrite \ 34 | 2>&1" 35 | 36 | echo ${run_cmd} 37 | eval ${run_cmd} 38 | -------------------------------------------------------------------------------- /scripts/evaluate_seq2seq.sh: -------------------------------------------------------------------------------- 1 | export CLASSPATH=/path/to/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar 2 | mkdir tmp 3 | cat $1 | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > tmp/test.hypo.tokenized 4 | cat $2 | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > tmp/test.hypo.target 5 | files2rouge tmp/test.hypo.tokenized tmp/test.hypo.target -------------------------------------------------------------------------------- /scripts/finetune_blank.sh: -------------------------------------------------------------------------------- 1 | 
DATA_ROOT=/root/data 2 | CHECKPOINT_PATH=/root/data/checkpoints 3 | SAVE_PATH=/root/data/finetune_checkpoints 4 | DATESTR=$(date +"%m-%d-%H-%M") 5 | 6 | MASK_RATIO=0.1 7 | 8 | source $1 # Model 9 | source $2 # Task 10 | 11 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 12 | DISTRIBUTED_ARGS="--nproc_per_node 4 --nnodes 1 --node_rank 0 --master_addr localhost --master_port $MASTER_PORT" 13 | 14 | python -m torch.distributed.launch $DISTRIBUTED_ARGS finetune_glm.py \ 15 | --finetune \ 16 | --experiment-name ${EXPERIMENT_NAME} \ 17 | --task ${TASK_NAME} \ 18 | --data-dir ${DATA_PATH} \ 19 | --save ${SAVE_PATH} \ 20 | --checkpoint-activations \ 21 | --overwrite \ 22 | $MODEL_ARGS \ 23 | $TRAIN_ARGS \ 24 | $COMMON_ARGS \ 25 | $TASK_ARGS \ 26 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}.txt -------------------------------------------------------------------------------- /scripts/finetune_seq2seq.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/root/data 2 | CHECKPOINT_PATH=/dataset/fd5061f6/pretrained_models 3 | SAVE_PATH=/dataset/fd5061f6/finetune_checkpoints 4 | DATESTR=$(date +"%m-%d-%H-%M") 5 | 6 | source $1 # Model 7 | source $2 # Task 8 | 9 | if [ -z $N_GPU ];then 10 | N_GPU=4 11 | fi 12 | 13 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 14 | DISTRIBUTED_ARGS="--nproc_per_node ${N_GPU} --nnodes 1 --node_rank 0 --master_addr localhost --master_port $MASTER_PORT" 15 | 16 | DATESTR=$(date +"%m-%d-%H-%M") 17 | EXPERIMENT_NAME=${EXPERIMENT_NAME} #-${DATESTR} 18 | 19 | TOKENIZERS_PARALLELISM=false 20 | 21 | mkdir logs 22 | python -m torch.distributed.launch $DISTRIBUTED_ARGS finetune_glm.py \ 23 | --finetune \ 24 | --experiment-name ${EXPERIMENT_NAME} \ 25 | --task ${TASK_NAME} \ 26 | --data-dir ${DATA_PATH} \ 27 | --save ${SAVE_PATH} \ 28 | --checkpoint-activations \ 29 | --epochs ${EPOCH_SINGLE} \ 30 | --batch-size ${BATCH_SINGLE} \ 31 | --lr ${LR_SINGLE} \ 32 | $MODEL_ARGS \ 33 | $TRAIN_ARGS \ 34 | $COMMON_ARGS \ 35 | $TASK_ARGS \ 36 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}.txt 37 | 38 | -------------------------------------------------------------------------------- /scripts/finetune_seq2seq_grid.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/root/data 2 | CHECKPOINT_PATH=/dataset/fd5061f6/pretrained_models 3 | SAVE_PATH=/dataset/fd5061f6/finetune_checkpoints 4 | DATESTR=$(date +"%m-%d-%H-%M") 5 | 6 | source $1 # Model 7 | source $2 # Task 8 | 9 | if [ -z $N_GPU ];then 10 | N_GPU=4 11 | fi 12 | 13 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 14 | DISTRIBUTED_ARGS="--nproc_per_node ${N_GPU} --nnodes 1 --node_rank 0 --master_addr localhost --master_port $MASTER_PORT" 15 | 16 | DATESTR=$(date +"%m-%d-%H-%M") 17 | EXPERIMENT_NAME=${EXPERIMENT_NAME} #-${DATESTR} 18 | 19 | TOKENIZERS_PARALLELISM=false 20 | 21 | mkdir logs 22 | GRID_LOG=logs/grid_${EXPERIMENT_NAME}_${DATESTR}.txt 23 | 24 | for lr in 5e-6 1e-5 2e-5 25 | do 26 | for batch in 4 8 12 27 | do 28 | for epoch in 5 10 29 | do 30 | HYPER=${lr}-${batch}-${epoch} 31 | python -m torch.distributed.launch $DISTRIBUTED_ARGS finetune_glm.py \ 32 | --finetune \ 33 | --experiment-name ${EXPERIMENT_NAME}/${HYPER} \ 34 | --task ${TASK_NAME} \ 35 | --data-dir ${DATA_PATH} \ 36 | --save ${SAVE_PATH} \ 37 | --checkpoint-activations \ 38 | --epochs ${epoch} \ 39 | --batch-size ${batch} \ 40 | --lr ${lr} \ 41 | $MODEL_ARGS \ 42 | $TRAIN_ARGS \ 43 | $COMMON_ARGS \ 44 | $TASK_ARGS \ 45 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}-${HYPER}.txt 46 | echo $lr $batch $epoch >> 
$GRID_LOG 47 | cat runs/${EXPERIMENT_NAME}/${HYPER}/results.json >> $GRID_LOG 48 | done 49 | done 50 | done 51 | 52 | -------------------------------------------------------------------------------- /scripts/finetune_superglue.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/dataset/fd5061f6/english_data/superglue 2 | CHECKPOINT_PATH=/dataset/fd5061f6/pretrained_models 3 | SAVE_PATH=/dataset/fd5061f6/finetune_checkpoints 4 | 5 | source $1 # Model 6 | source $2 # Task 7 | 8 | if [ -z $N_GPU ];then 9 | N_GPU=1 10 | fi 11 | 12 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 13 | DISTRIBUTED_ARGS="--nproc_per_node ${N_GPU} --nnodes 1 --node_rank 0 --master_addr localhost --master_port $MASTER_PORT" 14 | 15 | PER_GPU_BS=$(($BATCH_SIZE/$N_GPU)) 16 | DATESTR=$(date +"%m-%d-%H-%M") 17 | EXPERIMENT_NAME=${EXPERIMENT_NAME}-${DATESTR} 18 | 19 | mkdir logs 20 | python -m torch.distributed.launch $DISTRIBUTED_ARGS finetune_glm.py \ 21 | --finetune \ 22 | --cloze-eval \ 23 | --experiment-name ${EXPERIMENT_NAME} \ 24 | --task ${TASK_NAME} \ 25 | --data-dir ${DATA_PATH} \ 26 | --save ${SAVE_PATH} \ 27 | --seq-length ${MAX_SEQ_LEN} \ 28 | --checkpoint-activations \ 29 | --eval-batch-size 16 \ 30 | --save-epoch 100000 \ 31 | $MODEL_ARGS \ 32 | $TRAIN_ARGS \ 33 | $COMMON_ARGS \ 34 | --fp16 \ 35 | --batch-size ${PER_GPU_BS} \ 36 | --epochs ${EPOCH_SINGLE} \ 37 | --lr ${LR_SINGLE} \ 38 | --overwrite \ 39 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}.txt 40 | -------------------------------------------------------------------------------- /scripts/finetune_superglue_fast.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/root/data/superglue 2 | source config_tasks/model_blocklm.sh 3 | source $1 4 | 5 | CHECKPOINT_PATH="/root/data/finetune_checkpoints" 6 | 7 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 8 | DISTRIBUTED_ARGS="--nproc_per_node 4 --nnodes 1 --node_rank 0 --master_addr localhost --master_port $MASTER_PORT" 9 | DATESTR=$(date +"%m-%d-%H-%M") 10 | 11 | mkdir logs 12 | python -m torch.distributed.launch $DISTRIBUTED_ARGS finetune_glm.py \ 13 | --finetune \ 14 | --cloze-eval \ 15 | --experiment-name ${EXPERIMENT_NAME} \ 16 | --task ${TASK_NAME} \ 17 | --data-dir ${DATA_PATH} \ 18 | --save ${CHECKPOINT_PATH} \ 19 | --seq-length ${MAX_SEQ_LEN} \ 20 | --fast-decode \ 21 | --batch-size 8 \ 22 | --eval-batch-size 16 \ 23 | --save-epoch 5 \ 24 | $MODEL_ARGS \ 25 | $TRAIN_ARGS \ 26 | $COMMON_ARGS \ 27 | --epochs ${EPOCH_SINGLE} \ 28 | --lr ${LR_SINGLE} \ 29 | --overwrite \ 30 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}.txt 31 | -------------------------------------------------------------------------------- /scripts/finetune_superglue_grid.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/dataset/c07bd62b/superglue 2 | source config_tasks/model_blocklm_roberta_1.25.sh 3 | source $1 4 | 5 | CHECKPOINT_PATH="/dataset/c07bd62b/finetune_checkpoints" 6 | 7 | if [ -z $N_GPU ];then 8 | N_GPU=2 9 | fi 10 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 11 | DISTRIBUTED_ARGS="--nproc_per_node ${N_GPU} --nnodes 1 --node_rank 0 --master_addr localhost --master_port $MASTER_PORT" 12 | 13 | DATESTR=$(date +"%m-%d-%H-%M") 14 | GRID_LOG=logs/grid_${EXPERIMENT_NAME}_${DATESTR}.txt 15 | mkdir logs 16 | for lr in 6e-6 1e-5 2e-5 17 | do 18 | for seed in 1234 5678 3456 19 | do 20 | HYPER=${lr}-${seed} 21 | PER_GPU_BS=$(($BATCH_SIZE/$N_GPU)) 22 | if [ ! 
-f runs/${EXPERIMENT_NAME}/${HYPER}/test_results.json ]; then 23 | echo runs/${EXPERIMENT_NAME}/${HYPER} 24 | python -m torch.distributed.launch $DISTRIBUTED_ARGS finetune_glm.py \ 25 | --finetune \ 26 | --experiment-name ${EXPERIMENT_NAME}/${HYPER} \ 27 | --task ${TASK_NAME} \ 28 | --data-dir ${DATA_PATH} \ 29 | --save ${CHECKPOINT_PATH} \ 30 | --seq-length ${MAX_SEQ_LEN} \ 31 | --checkpoint-activations \ 32 | --eval-batch-size 16 \ 33 | --save-epoch 1000 \ 34 | $MODEL_ARGS \ 35 | $TRAIN_ARGS \ 36 | $COMMON_ARGS \ 37 | --fp16 \ 38 | --attention-scale 8.0 \ 39 | --batch-size ${PER_GPU_BS} \ 40 | --epochs ${EPOCH_SINGLE} \ 41 | --lr-decay-style linear \ 42 | --lr ${lr} \ 43 | --seed ${seed} \ 44 | --overwrite \ 45 | 2>&1 | tee logs/log-${EXPERIMENT_NAME}-${HYPER}.txt 46 | fi 47 | echo $lr $seed >> $GRID_LOG 48 | cat runs/${EXPERIMENT_NAME}/${HYPER}/results.json >> $GRID_LOG 49 | done 50 | done 51 | 52 | echo $EXPERIMENT_NAME >> $GRID_LOG -------------------------------------------------------------------------------- /scripts/generate_block.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CHECKPOINT_PATH=/zhangpai21/checkpoints 3 | 4 | source $1 5 | 6 | MPSIZE=1 7 | MAXSEQLEN=512 8 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 9 | 10 | #SAMPLING ARGS 11 | TEMP=0.9 12 | # If TOPK/TOPP are 0, sampling defaults to greedy decoding; a non-zero top-k also overrides top-p 13 | TOPK=40 14 | TOPP=0 15 | 16 | script_path=$(realpath $0) 17 | script_dir=$(dirname $script_path) 18 | 19 | config_json="$script_dir/ds_config.json" 20 | 21 | python -m torch.distributed.launch --nproc_per_node=$MPSIZE --master_port $MASTER_PORT generate_samples.py \ 22 | --DDP-impl none \ 23 | --model-parallel-size $MPSIZE \ 24 | $MODEL_ARGS \ 25 | --fp16 \ 26 | --cache-dir cache \ 27 | --out-seq-length $MAXSEQLEN \ 28 | --seq-length 512 \ 29 | --temperature $TEMP \ 30 | --top-k $TOPK \ 31 | --top-p $TOPP 32 | -------------------------------------------------------------------------------- /scripts/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version.
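# Reader's note (an added gloss, not part of the upstream Moses script): the script reads the hypothesis from STDIN
# and one or more reference files (named reference, or reference0, reference1, ... for multiple references),
# counts clipped n-gram matches for n = 1..4, applies the brevity penalty, and prints corpus-level BLEU, e.g.:
#   perl scripts/multi-bleu.perl ref.txt < hyp.txt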
5 | 6 | # $Id$ 7 | use warnings; 8 | use strict; 9 | 10 | my $lowercase = 0; 11 | if ($ARGV[0] eq "-lc") { 12 | $lowercase = 1; 13 | shift; 14 | } 15 | 16 | my $stem = $ARGV[0]; 17 | if (!defined $stem) { 18 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 19 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 20 | exit(1); 21 | } 22 | 23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 24 | 25 | my @REF; 26 | my $ref=0; 27 | while(-e "$stem$ref") { 28 | &add_to_ref("$stem$ref",\@REF); 29 | $ref++; 30 | } 31 | &add_to_ref($stem,\@REF) if -e $stem; 32 | die("ERROR: could not find reference file $stem") unless scalar @REF; 33 | 34 | # add additional references explicitly specified on the command line 35 | shift; 36 | foreach my $stem (@ARGV) { 37 | &add_to_ref($stem,\@REF) if -e $stem; 38 | } 39 | 40 | 41 | 42 | sub add_to_ref { 43 | my ($file,$REF) = @_; 44 | my $s=0; 45 | if ($file =~ /.gz$/) { 46 | open(REF,"gzip -dc $file|") or die "Can't read $file"; 47 | } else { 48 | open(REF,$file) or die "Can't read $file"; 49 | } 50 | while() { 51 | chop; 52 | push @{$$REF[$s++]}, $_; 53 | } 54 | close(REF); 55 | } 56 | 57 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 58 | my $s=0; 59 | while() { 60 | chop; 61 | $_ = lc if $lowercase; 62 | my @WORD = split; 63 | my %REF_NGRAM = (); 64 | my $length_translation_this_sentence = scalar(@WORD); 65 | my ($closest_diff,$closest_length) = (9999,9999); 66 | foreach my $reference (@{$REF[$s]}) { 67 | # print "$s $_ <=> $reference\n"; 68 | $reference = lc($reference) if $lowercase; 69 | my @WORD = split(' ',$reference); 70 | my $length = scalar(@WORD); 71 | my $diff = abs($length_translation_this_sentence-$length); 72 | if ($diff < $closest_diff) { 73 | $closest_diff = $diff; 74 | $closest_length = $length; 75 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 76 | } elsif ($diff == $closest_diff) { 77 | $closest_length = $length if $length < $closest_length; 78 | # from two references with the same closeness to me 79 | # take the *shorter* into account, not the "first" one. 80 | } 81 | for(my $n=1;$n<=4;$n++) { 82 | my %REF_NGRAM_N = (); 83 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 84 | my $ngram = "$n"; 85 | for(my $w=0;$w<$n;$w++) { 86 | $ngram .= " ".$WORD[$start+$w]; 87 | } 88 | $REF_NGRAM_N{$ngram}++; 89 | } 90 | foreach my $ngram (keys %REF_NGRAM_N) { 91 | if (!defined($REF_NGRAM{$ngram}) || 92 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 93 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 94 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 95 | } 96 | } 97 | } 98 | } 99 | $length_translation += $length_translation_this_sentence; 100 | $length_reference += $closest_length; 101 | for(my $n=1;$n<=4;$n++) { 102 | my %T_NGRAM = (); 103 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 104 | my $ngram = "$n"; 105 | for(my $w=0;$w<$n;$w++) { 106 | $ngram .= " ".$WORD[$start+$w]; 107 | } 108 | $T_NGRAM{$ngram}++; 109 | } 110 | foreach my $ngram (keys %T_NGRAM) { 111 | $ngram =~ /^(\d+) /; 112 | my $n = $1; 113 | # my $corr = 0; 114 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 115 | $TOTAL[$n] += $T_NGRAM{$ngram}; 116 | if (defined($REF_NGRAM{$ngram})) { 117 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 118 | $CORRECT[$n] += $T_NGRAM{$ngram}; 119 | # $corr = $T_NGRAM{$ngram}; 120 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 121 | } 122 | else { 123 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 124 | # $corr = $REF_NGRAM{$ngram}; 125 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 126 | } 127 | } 128 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 129 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 130 | } 131 | } 132 | $s++; 133 | } 134 | my $brevity_penalty = 1; 135 | my $bleu = 0; 136 | 137 | my @bleu=(); 138 | 139 | for(my $n=1;$n<=4;$n++) { 140 | if (defined ($TOTAL[$n])){ 141 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 142 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 143 | }else{ 144 | $bleu[$n]=0; 145 | } 146 | } 147 | 148 | if ($length_reference==0){ 149 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 150 | exit(1); 151 | } 152 | 153 | if ($length_translation<$length_reference) { 154 | $brevity_penalty = exp(1-$length_reference/$length_translation); 155 | } 156 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 157 | my_log( $bleu[2] ) + 158 | my_log( $bleu[3] ) + 159 | my_log( $bleu[4] ) ) / 4) ; 160 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 161 | 100*$bleu, 162 | 100*$bleu[1], 163 | 100*$bleu[2], 164 | 100*$bleu[3], 165 | 100*$bleu[4], 166 | $brevity_penalty, 167 | $length_translation / $length_reference, 168 | $length_translation, 169 | $length_reference; 170 | 171 | sub my_log { 172 | return -9999999999 unless $_[0]; 173 | return log($_[0]); 174 | } -------------------------------------------------------------------------------- /tasks/language_model/detokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def ptb_detokenizer(string): 5 | string = string.replace(" '", "'") 6 | string = string.replace(" \n", "\n") 7 | string = string.replace("\n ", "\n") 8 | string = string.replace(" n't", "n't") 9 | string = string.replace(" N ", "1 ") 10 | string = string.replace("$ 1", "$1") 11 | string = string.replace("# 1", "#1") 12 | return string 13 | 14 | 15 | def wikitext_detokenizer(string): 16 | # contractions 17 | string = string.replace("s '", "s'") 18 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 19 | # number separators 20 | string = string.replace(" @-@ ", "-") 21 | string = string.replace(" @,@ ", ",") 22 | string = string.replace(" @.@ ", ".") 23 | # punctuation 24 | string = string.replace(" : ", ": ") 25 | string = string.replace(" ; ", "; ") 26 | string = string.replace(" . ", ". ") 27 | string = string.replace(" ! ", "! ") 28 | string = string.replace(" ? ", "? 
") 29 | string = string.replace(" , ", ", ") 30 | # double brackets 31 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 32 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 33 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 34 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 35 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 36 | # miscellaneous 37 | string = string.replace("= = = =", "====") 38 | string = string.replace("= = =", "===") 39 | string = string.replace("= =", "==") 40 | string = string.replace(" " + chr(176) + " ", chr(176)) 41 | string = string.replace(" \n", "\n") 42 | string = string.replace("\n ", "\n") 43 | string = string.replace(" N ", " 1 ") 44 | string = string.replace(" 's", "'s") 45 | 46 | return string 47 | 48 | 49 | def lambada_detokenizer(string): 50 | return string 51 | 52 | 53 | def get_detokenizer(dataset): 54 | return DETOKENIZERS[dataset] 55 | 56 | 57 | DETOKENIZERS = { 58 | 'ptb': ptb_detokenizer, 59 | 'wikitext': wikitext_detokenizer, 60 | 'lambada': lambada_detokenizer, 61 | } 62 | -------------------------------------------------------------------------------- /tasks/seq2seq/finetune.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Race.""" 17 | import torch 18 | import mpu 19 | import json 20 | import functools 21 | from tasks.eval_utils import accuracy_func_provider 22 | from finetune_glm import finetune 23 | from pretrain_glm import get_batch 24 | from collections import OrderedDict 25 | from tasks.seq2seq.dataset import Seq2SeqDataset, BlankLMDataset, ExtractionDataset, CustomizationDataset 26 | from tasks.seq2seq.evaluate import rouge_metric, DecoderEvaluater, BlankLMEvaluater 27 | from tasks.superglue.evaluate import squad_exact_match, squad_f1 28 | 29 | global_tokenizer = None 30 | 31 | 32 | def seq2seq_forward_step(data, model, args, timers, mems): 33 | """Forward step.""" 34 | 35 | # Get the batch. 36 | if timers is not None: 37 | timers('batch generator').start() 38 | tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data, args) 39 | if timers is not None: 40 | timers('batch generator').stop() 41 | # Forward model. 
42 | logits, *mems = model(tokens, position_ids, attention_mask, *mems) 43 | # logits, loss_mask = logits[:, args.src_seq_length:], loss_mask[:, args.src_seq_length:] 44 | # target_ids = target_ids[:, args.src_seq_length:] 45 | losses = mpu.vocab_parallel_cross_entropy(logits.contiguous().float(), labels) 46 | if args.label_smoothing > 0.0: 47 | epsilon = args.label_smoothing 48 | smooth_loss = -torch.nn.functional.log_softmax(logits, dim=-1).mean(dim=-1) 49 | losses = (1 - epsilon) * losses + epsilon * smooth_loss 50 | loss_mask = loss_mask.reshape(-1) 51 | # The loss is not normalized for fair comparison 52 | loss = torch.sum(losses.reshape(-1) * loss_mask) / loss_mask.sum() 53 | return loss, mems, 'bert' 54 | 55 | 56 | def train_valid_datasets_provider(args, tokenizer): 57 | """Provide train and validation datasets.""" 58 | if args.task.lower() == 'blank': 59 | train_dataset = BlankLMDataset(args, split='train', tokenizer=tokenizer) 60 | valid_dataset = None 61 | elif args.task.lower() == 'extraction': 62 | train_dataset = ExtractionDataset(args, split='train', tokenizer=tokenizer) 63 | valid_dataset = None 64 | elif args.task.lower() == 'customization': 65 | train_dataset = CustomizationDataset(args, split='train', tokenizer=tokenizer) 66 | valid_dataset = None 67 | else: 68 | train_dataset = Seq2SeqDataset(args, split='train', tokenizer=tokenizer) 69 | valid_dataset = None 70 | global global_tokenizer 71 | global_tokenizer = tokenizer 72 | return train_dataset, valid_dataset 73 | 74 | 75 | def metrics_func_provider(args, tokenizer, is_test): 76 | """Provide metrics callback function.""" 77 | 78 | def single_dataset_provider(split): 79 | if args.task.lower() == 'blank': 80 | return BlankLMDataset(args, split=split, tokenizer=tokenizer) 81 | elif args.task.lower() == 'extraction': 82 | return ExtractionDataset(args, split=split, tokenizer=tokenizer) 83 | elif args.task.lower() == 'customization': 84 | return CustomizationDataset(args, split=split, tokenizer=tokenizer) 85 | else: 86 | return Seq2SeqDataset(args, split=split, tokenizer=tokenizer) 87 | 88 | if args.task.lower() in ['blank', 'extraction']: 89 | evaluater = BlankLMEvaluater(args, tokenizer) 90 | eval_func = evaluater.evaluate 91 | metric_dict = {} 92 | else: 93 | evaluater = DecoderEvaluater(args, tokenizer) 94 | eval_func = evaluater.evaluate 95 | if args.tokenizer_type == "BertWordPieceTokenizer": 96 | dataset = 'cnn_dm' 97 | elif args.task.lower() == 'gigaword': 98 | dataset = 'gigaword' 99 | else: 100 | dataset = 'cnn_dm_org' 101 | if args.task.lower() in ['squad', 'squad_v1']: 102 | metric_dict = {"EM": squad_exact_match, "F1": squad_f1} 103 | else: 104 | metric_dict = OrderedDict({"rouge-1": functools.partial(rouge_metric, metric="rouge-1", dataset=dataset), 105 | "rouge-2": functools.partial(rouge_metric, metric="rouge-2", dataset=dataset), 106 | "rouge-l": functools.partial(rouge_metric, metric="rouge-l", dataset=dataset)}) 107 | 108 | def output_func(predictions, examples, output_file): 109 | if args.task.lower() in ['squad', 'squad_v1']: 110 | with open(output_file, "w", encoding='utf-8') as output: 111 | res = {} 112 | for prediction, example in zip(predictions, examples): 113 | idx = example.idx 114 | if prediction.lower().replace(' ', '') == 'n/a': 115 | prediction = '' 116 | if idx not in res or res[idx] == '': 117 | res[idx] = prediction 118 | json.dump(res, output) 119 | with open(output_file + ".refs", "w", encoding='utf-8') as output: 120 | for prediction, example in zip(predictions, examples): 121 | res = 
{'id': example.idx, 'pred': prediction, 'gold': example.meta['answers']} 122 | output.write(json.dumps(res) + '\n') 123 | return 124 | with open(output_file + ".hyps", "w", encoding='utf-8') as output: 125 | for prediction in predictions: 126 | output.write(prediction) 127 | output.write("\n") 128 | with open(output_file + ".refs", "w", encoding='utf-8') as output: 129 | for example in examples: 130 | output.write(example.meta["ref"]) 131 | output.write("\n") 132 | if args.task.lower() == 'squad_generation': 133 | with open(output_file + ".source", "w", encoding='utf-8') as output: 134 | for example in examples: 135 | output.write(example.text_a.replace("\n", " ") + " Answer: " + example.meta["answer"]) 136 | output.write("\n") 137 | 138 | return accuracy_func_provider(single_dataset_provider, metric_dict, args, is_test=is_test, eval_func=eval_func, 139 | output_func=output_func, only_rank0=False) 140 | 141 | 142 | def main(args): 143 | if args.src_seq_length > args.max_position_embeddings: 144 | args.max_position_embeddings = args.src_seq_length 145 | if args.task.lower() in ['cnn_dm', 'cnn_dm_original', 'gigaword', 'blank', 'squad_generation', 'xsum', 146 | 'squad', 'squad_v1', 'extraction', 'cmrc', 'customization']: 147 | finetune(args, train_valid_datasets_provider, {}, end_of_epoch_callback_provider=metrics_func_provider, 148 | forward_step=seq2seq_forward_step) 149 | else: 150 | raise NotImplementedError(args.task) 151 | -------------------------------------------------------------------------------- /tasks/superglue/README.md: -------------------------------------------------------------------------------- 1 | # Use GLM for your NLU tasks 2 | To use GLM for your own NLU tasks, you should implement a subclass of `DataProcessor` in [tasks/superglue/dataset.py](dataset.py) and a subclass of `PVP` in [tasks/superglue/pvp.py](pvp.py). You should also specify the evaluation metrics for your task in [tasks/superglue/finetune.py](finetune.py) (see step 4 below). We will take the RTE and ReCoRD tasks in SuperGLUE as examples. 3 | 4 | ## 1. Design your patterns 5 | RTE is an NLI task in which the model is required to predict text entailment between a premise and a hypothesis. The label can be `entailment` or `not_entailment`. One sample from the training set is: 6 | ``` 7 | premise: No Weapons of Mass Destruction Found in Iraq Yet. 8 | hypothesis: Weapons of Mass Destruction Found in Iraq. 9 | label: not_entailment 10 | ``` 11 | We design the pattern as 12 | ``` 13 | "`hypothesis`"?, [MASK], "`premise`" 14 | ``` 15 | GLM predicts "Yes" for `entailment` and "No" for `not_entailment`. "Yes" and "No" are called the verbalizers for `entailment` and `not_entailment`. 16 | 17 | ReCoRD is a multi-choice QA task. Each example consists of a news article and a cloze-style question about the article in which one entity is masked out. The system must predict the masked-out entity from a list of possible entities in the provided passage. We directly adopt the cloze-style question as our pattern and use GLM to predict the masked entity. 18 | 19 | ## 2. Implement a subclass of `DataProcessor` 20 | A subclass of `DataProcessor` should implement `get_train_examples`, `get_dev_examples` and `get_test_examples`, which return the examples of the train, dev, and test sets. The returned value is a list of `InputExample`. It should also implement `get_labels` to return the list of possible labels.
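For the RTE sample shown in step 1, the processor would produce an object along these lines (a hypothetical instance written for illustration; the mapping of premise to `text_a` and hypothesis to `text_b` mirrors the `_create_examples` method shown just below):
```python
from tasks.data_utils import InputExample

InputExample(guid="train-0",
             text_a="No Weapons of Mass Destruction Found in Iraq Yet.",  # premise
             text_b="Weapons of Mass Destruction Found in Iraq.",         # hypothesis
             label="not_entailment", idx=0)
```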
Here we take the `RteProcessor` as an example: 21 | ```python 22 | class RteProcessor(DataProcessor): 23 | """Processor for the RTE data set.""" 24 | 25 | def get_train_examples(self, data_dir): 26 | return self._create_examples(os.path.join(data_dir, "train.jsonl"), "train") 27 | 28 | def get_dev_examples(self, data_dir, for_train=False): 29 | return self._create_examples(os.path.join(data_dir, "val.jsonl"), "dev") 30 | 31 | def get_test_examples(self, data_dir): 32 | return self._create_examples(os.path.join(data_dir, "test.jsonl"), "test") 33 | 34 | def get_unlabeled_examples(self, data_dir): 35 | return self._create_examples(os.path.join(data_dir, "unlabeled.jsonl"), "unlabeled") 36 | 37 | def get_labels(self): 38 | return ["entailment", "not_entailment"] 39 | 40 | def _create_examples(self, path: str, set_type: str, hypothesis_name: str = "hypothesis", 41 | premise_name: str = "premise") -> List[InputExample]: 42 | examples = [] 43 | 44 | with open(path, encoding='utf8') as f: 45 | for line_idx, line in enumerate(f): 46 | example_json = json.loads(line) 47 | idx = example_json['idx'] 48 | if isinstance(idx, str): 49 | try: 50 | idx = int(idx) 51 | except ValueError: 52 | idx = line_idx 53 | label = example_json.get('label') 54 | guid = "%s-%s" % (set_type, idx) 55 | text_a = example_json[premise_name] 56 | text_b = example_json[hypothesis_name] 57 | 58 | example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, idx=idx) 59 | examples.append(example) 60 | 61 | return examples 62 | ``` 63 | After that, you should add the implemented class to ``PROCESSORS`` at the end of [tasks/superglue/dataset.py](dataset.py): 64 | ```python 65 | PROCESSORS = { 66 | ... 67 | "rte": RteProcessor 68 | } 69 | ``` 70 | 71 | ## 3. Implement a subclass of `PVP` 72 | To implement a subclass of `PVP`, you should first decide whether your verbalizers are single-token or multi-token. The verbalizers in RTE, "Yes" and "No", are single-token. In contrast, the verbalizers in ReCoRD are multi-token, as an entity can be tokenized into multiple tokens by a WordPiece or BPE tokenizer. 73 | 74 | For a single-token task, you should set `is_multi_token=False` in the class definition. You should implement `get_parts` to return the inputs to GLM given an example and `verbalize` to return the verbalizer given a label. Take `RtePVP` as an example: 75 | ```python 76 | class RtePVP(PVP): 77 | is_multi_token = False 78 | VERBALIZER = { 79 | "not_entailment": [" No"], 80 | "entailment": [" Yes"] 81 | } 82 | 83 | @property 84 | def spell_length(self): 85 | return self.pattern_id 86 | 87 | def get_parts(self, example: InputExample) -> FilledPattern: 88 | # switch text_a and text_b to get the correct order 89 | text_a = example.text_a 90 | text_b = example.text_b.rstrip(string.punctuation) 91 | return ['"', self.shortenable(text_b), '" ?'], [[self.mask], ', "', self.shortenable(text_a), '"'] 92 | 93 | def verbalize(self, label) -> List[str]: 94 | return RtePVP.VERBALIZER[label] 95 | ``` 96 | We use `PVP.shortenable` to mark the segments that can be truncated when the input exceeds the maximum sequence length. 97 | 98 | For a multi-token task, you should set `is_multi_token=True` in the class definition. You should implement `get_parts` to return the inputs to GLM given an example and `get_answers` to return the candidates.
Take `RecordPVP` as an example: 99 | ```python 100 | class RecordPVP(PVP): 101 | is_multi_token = True 102 | 103 | def get_answers(self, example: InputExample): 104 | choices = example.meta['candidates'] 105 | choices = [" " + choice for choice in choices] 106 | return choices 107 | 108 | def get_parts(self, example: InputExample) -> FilledPattern: 109 | premise = self.shortenable(example.text_a) 110 | 111 | assert '@placeholder' in example.text_b, f'question "{example.text_b}" does not contain a @placeholder token' 112 | question_a, question_b = example.text_b.split('@placeholder') 113 | return [premise, " " + question_a.rstrip(), [self.mask], question_b], [] 114 | ``` 115 | After that, you should add the implemented class to `PVPS` at the end of [tasks/superglue/pvp.py](pvp.py): 116 | ```python 117 | PVPS = { 118 | ... 119 | 'rte': RtePVP, 120 | 'record': RecordPVP 121 | } 122 | ``` 123 | ## 4. Run the experiment 124 | To run the experiment for your new task, you should create a config file like [config_tasks/task_rte.sh](/config_tasks/task_rte.sh). You should also specify the evaluation metrics for the task in `DEFAULT_METRICS` of [tasks/superglue/finetune.py](finetune.py): 125 | ```python 126 | DEFAULT_METRICS = { 127 | ... 128 | "record": [("EM", qa_exact_match), ("F1", qa_f1)], 129 | "rte": [("accuracy", accuracy_metric)] 130 | } 131 | ``` 132 | Then you can run the experiment with [finetune_superglue.sh](/scripts/finetune_superglue.sh): 133 | ```shell 134 | bash scripts/finetune_superglue.sh \ 135 | config_tasks/model_blocklm_large.sh \ 136 | config_tasks/task_rte.sh 137 | ``` -------------------------------------------------------------------------------- /tasks/superglue/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM/4f61ed7237a3b0187f4d62062429348276a78c84/tasks/superglue/__init__.py -------------------------------------------------------------------------------- /tasks/superglue/evaluate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Official evaluation script for ReCoRD v1.0. 3 | (Some functions are adopted from the SQuAD evaluation script.)
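This module provides answer normalization, token-level F1 and exact-match scoring, and the qa_* / squad_* / multirc_em aggregation helpers that tasks/superglue/finetune.py registers as SuperGLUE metrics.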
4 | """ 5 | 6 | from __future__ import print_function 7 | from collections import Counter 8 | import string 9 | import re 10 | from tasks.data_utils import InputExample 11 | from typing import List 12 | import functools 13 | from collections import defaultdict 14 | import unidecode 15 | 16 | 17 | def normalize_answer(s): 18 | """Lower text and remove punctuation, articles and extra whitespace.""" 19 | 20 | def remove_articles(text): 21 | return re.sub(r'\b(a|an|the)\b', ' ', text) 22 | 23 | def white_space_fix(text): 24 | return ' '.join(text.split()) 25 | 26 | def remove_punc(text): 27 | exclude = set(string.punctuation) 28 | return ''.join(ch for ch in text if ch not in exclude) 29 | 30 | def lower(text): 31 | return unidecode.unidecode(text.lower()) 32 | 33 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 34 | 35 | 36 | def f1_score(prediction, ground_truth): 37 | prediction_tokens = normalize_answer(prediction).split() 38 | ground_truth_tokens = normalize_answer(ground_truth).split() 39 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 40 | num_same = sum(common.values()) 41 | if num_same == 0: 42 | return 0 43 | precision = 1.0 * num_same / len(prediction_tokens) 44 | recall = 1.0 * num_same / len(ground_truth_tokens) 45 | f1 = (2 * precision * recall) / (precision + recall) 46 | return f1 47 | 48 | 49 | def exact_match_score(prediction, ground_truth): 50 | return normalize_answer(prediction) == normalize_answer(ground_truth) 51 | 52 | 53 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 54 | if not ground_truths: 55 | return 0.0 56 | scores_for_ground_truths = [] 57 | for ground_truth in ground_truths: 58 | score = metric_fn(prediction, ground_truth) 59 | scores_for_ground_truths.append(score) 60 | return max(scores_for_ground_truths) 61 | 62 | 63 | def qa_evaluate(predictions, labels, examples: List[InputExample], metric): 64 | assert len(examples) == len(predictions) 65 | score = 0.0 66 | for example, prediction in zip(examples, predictions): 67 | ground_truths = example.meta["answers"] 68 | prediction = example.meta["candidates"][prediction] 69 | if ground_truths: 70 | score += metric_max_over_ground_truths(metric, prediction, ground_truths) 71 | score = 100.0 * score / len(predictions) 72 | return score 73 | 74 | 75 | def squad_evaluate(predictions, labels, examples, metric): 76 | assert len(examples) == len(predictions) 77 | score = 0.0 78 | idx2predictions = {} 79 | idx2ground_truths = {} 80 | for example, prediction in zip(examples, predictions): 81 | idx = example.idx 82 | if idx not in idx2predictions: 83 | idx2predictions[idx] = [] 84 | idx2ground_truths[idx] = example.meta["answers"] 85 | idx2predictions[idx].append(prediction) 86 | # assert len(predictions) == len(idx2predictions) 87 | for idx, predictions in idx2predictions.items(): 88 | prediction = 'N/A' 89 | for i in range(len(predictions)): 90 | prediction = predictions[i] 91 | if prediction.lower().replace(' ', '') == 'n/a': 92 | prediction = 'N/A' 93 | else: 94 | break 95 | ground_truths = idx2ground_truths[idx] 96 | if len(ground_truths) == 1 and ground_truths[0] == 'N/A': 97 | score += (prediction == 'N/A') 98 | else: 99 | score += metric_max_over_ground_truths(metric, prediction, ground_truths) 100 | score = 100.0 * score / len(idx2predictions) 101 | return score 102 | 103 | 104 | def multirc_em(predictions, labels, examples: List[InputExample]): 105 | """Compute the exact match (EM) for a sequence of predictions and actual labels""" 106 | question_ids 
= [example.meta["question_idx"] for example in examples] 107 | unique_questions = set(question_ids) 108 | 109 | q_actuals = list(zip(question_ids, labels)) 110 | q_predictions = list(zip(question_ids, predictions)) 111 | 112 | actuals_per_question = defaultdict(list) 113 | predictions_per_question = defaultdict(list) 114 | 115 | for qid, val in q_actuals: 116 | actuals_per_question[qid].append(val) 117 | for qid, val in q_predictions: 118 | predictions_per_question[qid].append(val) 119 | 120 | em = 0 121 | for qid in unique_questions: 122 | if actuals_per_question[qid] == predictions_per_question[qid]: 123 | em += 1 124 | em /= len(unique_questions) 125 | return em 126 | 127 | 128 | qa_exact_match = functools.partial(qa_evaluate, metric=exact_match_score) 129 | qa_f1 = functools.partial(qa_evaluate, metric=f1_score) 130 | 131 | squad_exact_match = functools.partial(squad_evaluate, metric=exact_match_score) 132 | squad_f1 = functools.partial(squad_evaluate, metric=f1_score) 133 | -------------------------------------------------------------------------------- /tasks/superglue/finetune.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
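# Module note (descriptive comment, not in the original file): this is the SuperGLUE finetuning entry point.
# It wires SuperGlueDataset / MultiChoiceDataset, the PVP patterns in PVPS, and the metric functions listed in
# DEFAULT_METRICS below into the generic finetune() loop imported from finetune_glm.py.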
15 | 16 | """Race.""" 17 | 18 | from collections import OrderedDict 19 | from finetune_glm import finetune 20 | from tasks.superglue.dataset import SuperGlueDataset, MultiChoiceDataset, PROCESSORS, get_output_func 21 | from tasks.superglue.dataset import CLASSIFICATION_DATASETS, MULTI_CHOICE_DATASETS 22 | from tasks.superglue.evaluate import qa_exact_match, qa_f1, multirc_em, squad_exact_match, squad_f1 23 | from tasks.superglue.pvp import PVPS 24 | from tasks.eval_utils import accuracy_func_provider 25 | from tasks.eval_utils import accuracy_metric, f1_macro_metric, f1_metric 26 | from glob import glob 27 | 28 | DEFAULT_METRICS = { 29 | "record": [("EM", qa_exact_match), ("F1", qa_f1)], 30 | "copa": [("accuracy", accuracy_metric)], 31 | "rte": [("accuracy", accuracy_metric)], 32 | "boolq": [("accuracy", accuracy_metric)], 33 | "wic": [("accuracy", accuracy_metric)], 34 | "wsc": [("accuracy", accuracy_metric)], 35 | "cb": [("accuracy", accuracy_metric), ("f1-macro", f1_macro_metric)], 36 | "multirc": [("f1a", f1_metric), ("em", multirc_em), ("acc", accuracy_metric)], 37 | "mnli": [("accuracy", accuracy_metric)], 38 | "sst2": [("accuracy", accuracy_metric)], 39 | "qnli": [("accuracy", accuracy_metric)], 40 | "qqp": [("accuracy", accuracy_metric)], 41 | "mrpc": [("accuracy", accuracy_metric)], 42 | "cola": [("accuracy", accuracy_metric)], 43 | "squad": [("accuracy", accuracy_metric)], 44 | "afqmc": [("accuracy", accuracy_metric)], 45 | "tnews": [("accuracy", accuracy_metric)], 46 | "cluewsc": [("accuracy", accuracy_metric)], 47 | "cmrc": [("accuracy", accuracy_metric)], 48 | "multichoice": [("accuracy", accuracy_metric)] 49 | } 50 | 51 | 52 | def train_valid_datasets_provider(args, tokenizer, pattern_text=False): 53 | """Provide train and validation datasets.""" 54 | task_name = args.task.lower() 55 | data_dir = args.data_dir 56 | train_dataset = SuperGlueDataset(args, task_name, data_dir, args.seq_length, "train", tokenizer, 57 | pattern_text=pattern_text) 58 | valid_dataset = SuperGlueDataset(args, task_name, data_dir, args.seq_length, "dev", tokenizer, for_train=True, 59 | pattern_text=pattern_text) 60 | 61 | return train_dataset, valid_dataset 62 | 63 | 64 | def metrics_func_provider(args, tokenizer, is_test): 65 | """Privde metrics callback function.""" 66 | 67 | def single_dataset_provider(split): 68 | if args.task == "multichoice": 69 | return MultiChoiceDataset(args, split, tokenizer, args.seq_length) 70 | else: 71 | return SuperGlueDataset(args, args.task.lower(), args.data_dir, args.seq_length, split, tokenizer) 72 | 73 | output_func = get_output_func(args.task.lower(), args) 74 | eval_func = None 75 | if args.task.lower() == 'wsc' and args.cloze_eval and not args.wsc_negative: 76 | from tasks.language_model.finetune import classify_evaluate 77 | eval_func = classify_evaluate 78 | metric_dict = OrderedDict(DEFAULT_METRICS[args.task.lower()]) 79 | return accuracy_func_provider(single_dataset_provider, metric_dict, args, is_test=is_test, eval_func=eval_func, 80 | output_func=output_func, only_rank0=False, tokenizer=tokenizer) 81 | 82 | 83 | def main(args): 84 | model_kwargs = {} 85 | if args.task.lower() != "multichoice": 86 | processor = PROCESSORS[args.task.lower()](args) 87 | pvp = PVPS[args.task.lower()](args, None, processor.get_labels(), args.seq_length, 88 | pattern_id=args.pattern_id, is_multi_token=args.multi_token, 89 | num_prompt_tokens=args.num_prompt_tokens) 90 | else: 91 | patterns = args.test_data 92 | datapaths = [] 93 | for pattern in patterns: 94 | for path in 
glob(pattern, recursive=True): 95 | datapaths.append(path) 96 | args.test_data = datapaths 97 | if args.continuous_prompt: 98 | model_kwargs["spell_length"] = pvp.spell_length 99 | if args.task.lower() == 'wsc' and args.cloze_eval and not args.wsc_negative: 100 | from tasks.language_model.finetune import lm_forward_step 101 | finetune(args, train_valid_datasets_provider, model_kwargs, 102 | end_of_epoch_callback_provider=metrics_func_provider, forward_step=lm_forward_step) 103 | else: 104 | if args.task.lower() == "multichoice": 105 | multi_token = True 106 | elif args.cloze_eval: 107 | multi_token = pvp.is_multi_token 108 | else: 109 | multi_token = args.task.lower() in MULTI_CHOICE_DATASETS 110 | args.multi_token = multi_token 111 | if not multi_token: 112 | model_kwargs["model_type"] = "multiple_choice" if args.cloze_eval else "classification" 113 | model_kwargs["multi_token"] = False 114 | model_kwargs["num_labels"] = len(processor.get_labels()) 115 | else: 116 | model_kwargs["model_type"] = "multiple_choice" 117 | model_kwargs["multi_token"] = True 118 | model_kwargs["num_labels"] = 1 119 | finetune(args, train_valid_datasets_provider, model_kwargs, 120 | end_of_epoch_callback_provider=metrics_func_provider) 121 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM/4f61ed7237a3b0187f4d62062429348276a78c84/test/__init__.py -------------------------------------------------------------------------------- /test/test_block.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from blocklm_utils import ConstructBlockStrategy 4 | from argparse import Namespace 5 | 6 | 7 | # rng = random.Random() 8 | # span_lengths = [2, 3, 4, 2, 3, 4] 9 | # length = 100 10 | # 11 | # counts = np.array([0] * length) 12 | # for _ in range(10000): 13 | # rng.shuffle(span_lengths) 14 | # spans = ConstructBlockStrategy.sample_spans(span_lengths, length, rng) 15 | # for start, end in spans: 16 | # counts[start: end] += 1 17 | # print(counts) 18 | def main(): 19 | args = Namespace() 20 | args.seq_length = 10 21 | args.eod_token = 0 22 | 23 | strategy = ConstructBlockStrategy(args, None, bert_ratio=0.4, max_seq_length=128) 24 | counts = np.array([0] * 10) 25 | for _ in range(10000): 26 | spans = strategy.sample_span_in_document(np.array([1, 2, 3, 0, 4, 5, 6, 7, 9, 0], dtype=np.long), [1, 1], 27 | random.Random()) 28 | for start, end in spans: 29 | counts[start: end] += 1 30 | 31 | print(counts) 32 | -------------------------------------------------------------------------------- /test/test_rel_shift.py: -------------------------------------------------------------------------------- 1 | # import torch 2 | # from mpu.transformer import GPT2ParallelSelfAttention 3 | # 4 | # b = torch.arange(2) * 1000 5 | # h = torch.arange(3) * 100 6 | # pos_seq = torch.arange(9, -1, -1) 7 | # query = torch.arange(7) * 10 8 | # s = pos_seq.unsqueeze(0) + query.unsqueeze(1) 9 | # s = b.view(-1, 1, 1, 1) + h.view(1, -1, 1, 1) + s 10 | # s = GPT2ParallelSelfAttention._rel_shift(s) 11 | # print(s) 12 | 13 | from torch.nn.modules import Linear 14 | from torch.optim import Adam 15 | from learning_rates import AnnealingLR 16 | import matplotlib.pyplot as plt 17 | import numpy as np 18 | 19 | 20 | def main(): 21 | model = Linear(10, 10) 22 | optimizer = Adam(model.parameters()) 23 | lr_scheduler = 
AnnealingLR(optimizer, 24 | start_lr=0.00015, 25 | warmup_iter=3000, 26 | num_iters=300000, 27 | decay_style='cosine', 28 | decay_ratio=0.1) 29 | steps = np.arange(0, 400000, 10, dtype=np.long) 30 | rates = [] 31 | for step in steps: 32 | lr_scheduler.num_iters = step 33 | rates.append(lr_scheduler.get_lr()) 34 | print(rates) 35 | plt.plot(steps, rates) 36 | plt.savefig("lr.pdf", format='pdf') --------------------------------------------------------------------------------