├── mpu ├── tests │ ├── __init__.py │ ├── commons.py │ ├── test_data.py │ ├── test_initialize.py │ ├── test_cross_entropy.py │ └── test_random.py ├── __init__.py ├── utils.py ├── grads.py ├── data.py ├── mappings.py ├── cross_entropy.py ├── initialize.py └── random.py ├── requirements.txt ├── docker ├── README.md ├── requirements.txt └── Dockerfile ├── openwebtext ├── run_make_gpt2_dataset.sh ├── make_gpt2_sizes.py ├── tokenizer.py ├── merge_jsons.py ├── README.md ├── remove_group_duplicates.py ├── make_gpt2_dataset.py ├── group_duplicates_url.py ├── find_duplicates.py ├── cleanup_dataset.py └── blacklist_urls.py ├── scripts ├── presplit_sentences_json.py ├── generate_text.sh ├── pretrain_gpt2.sh ├── pretrain_bert.sh ├── pretrain_bert_sentencepiece.sh ├── pretrain_gpt2_distributed.sh ├── pretrain_gpt2_model_parallel.sh ├── pretrain_bert_distributed.sh ├── pretrain_bert_model_parallel.sh ├── pretrain_bert_tfrecords_distributed.sh ├── run_gpt2_eval.py └── split_json.py ├── model ├── __init__.py ├── model.py ├── gpt2_modeling.py └── distributed.py ├── fp16 ├── __init__.py ├── fp16util.py └── loss_scaler.py ├── data_utils ├── corpora.py ├── __init__.py ├── tf_dl.py ├── samplers.py ├── lazy_loader.py └── file_utils.py ├── detokenizer.py ├── learning_rates.py ├── gpt2_data_loader.py ├── configure_data.py └── generate_samples.py /mpu/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | nltk>=3.4 2 | numpy>=1.15.4 3 | pandas>=0.24.0 4 | sentencepiece>=0.1.8 5 | tensorflow>=1.12.0 6 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | Note that as of now you need to have PySOL cloned to the directory here before building the container. 2 | -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | google-cloud-language 3 | inflect 4 | nltk 5 | numpy 6 | pandas 7 | requests 8 | sentencepiece 9 | tensorflow 10 | tqdm 11 | -------------------------------------------------------------------------------- /openwebtext/run_make_gpt2_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "processing gpt2 data ..." 
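# Each worker launched below runs make_gpt2_dataset.py on one shard: it reads the
# corresponding file under $DIR/shards/, writes the tokenized output to
# $DIR/npys/, and logs to $DIR/logs/shard_<thread>.log (see make_gpt2_dataset.py).
# Adjust DIR and the {0..3} thread range below to match your downloaded shards.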
4 | DIR="/raid/mpatwary/redownload_v0/0-21" 5 | 6 | for thread in {0..3}; do 7 | echo " launching thread "$thread && python make_gpt2_dataset.py $DIR $thread > $DIR/logs/shard_$thread.log 2>&1 & 8 | done 9 | -------------------------------------------------------------------------------- /scripts/presplit_sentences_json.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python scripts/presplit_sentences_json.py 4 | """ 5 | 6 | import sys 7 | import json 8 | 9 | import nltk 10 | 11 | nltk.download('punkt') 12 | 13 | input_file = sys.argv[1] 14 | output_file = sys.argv[2] 15 | 16 | line_seperator = "\n" 17 | 18 | with open(input_file, 'r') as ifile: 19 | with open(output_file, "w") as ofile: 20 | for doc in ifile.readlines(): 21 | parsed = json.loads(doc) 22 | sent_list = [] 23 | for line in parsed['text'].split('\n'): 24 | if line != '\n': 25 | sent_list.extend(nltk.tokenize.sent_tokenize(line)) 26 | parsed['text'] = line_seperator.join(sent_list) 27 | ofile.write(json.dumps(parsed)+'\n') 28 | -------------------------------------------------------------------------------- /scripts/generate_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHECKPOINT_PATH=/path/to/checkpoint 4 | MPSIZE=1 5 | NLAYERS=24 6 | NHIDDEN=1024 7 | NATT=16 8 | MAXSEQLEN=1024 9 | 10 | #SAMPLING ARGS 11 | TEMP=0.9 12 | #If TOPK/TOPP are 0 it defaults to greedy sampling, top-k will also override top-p 13 | TOPK=0 14 | TOPP=0 15 | 16 | python generate_samples.py \ 17 | --model-parallel-size $MPSIZE \ 18 | --num-layers $NLAYERS \ 19 | --hidden-size $NHIDDEN \ 20 | --load $CHECKPOINT_PATH \ 21 | --num-attention-heads $NATT \ 22 | --max-position-embeddings 1024 \ 23 | --tokenizer-type GPT2BPETokenizer \ 24 | --fp16 \ 25 | --cache-dir cache \ 26 | --out-seq-length $MAXSEQLEN \ 27 | --temperature $TEMP \ 28 | --top_k $TOPK \ 29 | --top_p $TOPP 30 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .distributed import * 17 | from .gpt2_modeling import gpt2_get_params_for_weight_decay_optimization 18 | from .gpt2_modeling import GPT2Model 19 | from .model import BertModel 20 | from .model import get_params_for_weight_decay_optimization 21 | -------------------------------------------------------------------------------- /scripts/pretrain_gpt2.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | python pretrain_gpt2.py \ 9 | --num-layers 24 \ 10 | --hidden-size 1024 \ 11 | --num-attention-heads 16 \ 12 | --batch-size 8 \ 13 | --seq-length 1024 \ 14 | --max-position-embeddings 1024 \ 15 | --train-iters 320000 \ 16 | --save checkpoints/gpt2_345m \ 17 | --load checkpoints/gpt2_345m \ 18 | --resume-dataloader \ 19 | --train-data wikipedia \ 20 | --lazy-loader \ 21 | --tokenizer-type GPT2BPETokenizer \ 22 | --cache-dir cache \ 23 | --split 949,50,1 \ 24 | --distributed-backend nccl \ 25 | --lr 0.00015 \ 26 | --lr-decay-style cosine \ 27 | --weight-decay 1e-2 \ 28 | --clip-grad 1.0 \ 29 | --warmup .01 \ 30 | --checkpoint-activations \ 31 | --fp16 32 | 33 | 34 | set +x 35 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # =========== 2 | # base images 3 | # =========== 4 | FROM nvcr.io/nvidia/pytorch:19.05-py3 5 | 6 | 7 | # =============== 8 | # system packages 9 | # =============== 10 | RUN apt-get update && apt-get install -y \ 11 | bash-completion \ 12 | emacs \ 13 | git \ 14 | graphviz \ 15 | htop \ 16 | libopenexr-dev \ 17 | rsync \ 18 | wget \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | 22 | # ============ 23 | # pip packages 24 | # ============ 25 | RUN pip install --upgrade pip && \ 26 | pip install --upgrade setuptools 27 | COPY requirements.txt /tmp/ 28 | RUN pip install --upgrade --ignore-installed -r /tmp/requirements.txt 29 | 30 | 31 | # =========== 32 | # latest apex 33 | # =========== 34 | RUN pip uninstall -y apex && \ 35 | git clone https://github.com/NVIDIA/apex.git ~/apex && \ 36 | cd ~/apex && \ 37 | pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . 38 | 39 | -------------------------------------------------------------------------------- /scripts/pretrain_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | 6 | python pretrain_bert.py \ 7 | --num-layers 24 \ 8 | --hidden-size 1024 \ 9 | --num-attention-heads 16 \ 10 | --batch-size 4 \ 11 | --seq-length 512 \ 12 | --max-preds-per-seq 80 \ 13 | --max-position-embeddings 512 \ 14 | --train-iters 1000000 \ 15 | --save checkpoints/bert_345m \ 16 | --load checkpoints/bert_345m \ 17 | --resume-dataloader \ 18 | --train-data wikipedia \ 19 | --lazy-loader \ 20 | --tokenizer-type BertWordPieceTokenizer \ 21 | --tokenizer-model-type bert-large-uncased \ 22 | --presplit-sentences \ 23 | --cache-dir cache \ 24 | --split 949,50,1 \ 25 | --distributed-backend nccl \ 26 | --lr 0.0001 \ 27 | --lr-decay-style linear \ 28 | --lr-decay-iters 990000 \ 29 | --weight-decay 1e-2 \ 30 | --clip-grad 1.0 \ 31 | --warmup .01 \ 32 | --fp16 \ 33 | --fp32-layernorm \ 34 | --fp32-embedding 35 | -------------------------------------------------------------------------------- /fp16/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from .fp16util import ( 16 | BN_convert_float, 17 | network_to_half, 18 | prep_param_lists, 19 | model_grads_to_master_grads, 20 | master_params_to_model_params, 21 | tofp16, 22 | to_python_float, 23 | clip_grad_norm, 24 | convert_module, 25 | convert_network, 26 | FP16Model, 27 | ) 28 | 29 | from .fp16 import * 30 | from .loss_scaler import * 31 | -------------------------------------------------------------------------------- /scripts/pretrain_bert_sentencepiece.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | 6 | python pretrain_bert.py \ 7 | --num-layers 24 \ 8 | --hidden-size 1024 \ 9 | --num-attention-heads 16 \ 10 | --batch-size 4 \ 11 | --seq-length 512 \ 12 | --max-preds-per-seq 80 \ 13 | --max-position-embeddings 512 \ 14 | --train-iters 1000000 \ 15 | --save checkpoints/bert_345m \ 16 | --load checkpoints/bert_345m \ 17 | --resume-dataloader \ 18 | --train-data wikipedia \ 19 | --lazy-loader \ 20 | --tokenizer-type SentencePieceTokenizer \ 21 | --tokenizer-model-type bpe \ 22 | --tokenizer-path tokenizer.model \ 23 | --presplit-sentences \ 24 | --cache-dir cache \ 25 | --split 949,50,1 \ 26 | --distributed-backend nccl \ 27 | --lr 0.0001 \ 28 | --lr-decay-style linear \ 29 | --lr-decay-iters 990000 \ 30 | --weight-decay 1e-2 \ 31 | --clip-grad 1.0 \ 32 | --warmup .01 \ 33 | --fp16 \ 34 | --fp32-layernorm \ 35 | --fp32-embedding 36 | -------------------------------------------------------------------------------- /openwebtext/make_gpt2_sizes.py: -------------------------------------------------------------------------------- 1 | 2 | import glob 3 | import json 4 | import os 5 | import time 6 | import sys 7 | 8 | import numpy as np 9 | 10 | 11 | if __name__ == '__main__': 12 | 13 | print('building the shard sizes ...') 14 | 15 | path = sys.argv[1] 16 | print('> reading numpy files from {}'.format(path)) 17 | 18 | npy_files = glob.glob(path + '/*.npy') 19 | npy_files.sort() 20 | print(' found {} numpy files'.format(len(npy_files))) 21 | 22 | size_dict = {} 23 | counter = 0 24 | start_time = time.time() 25 | for filename in npy_files: 26 | data = np.load(filename, allow_pickle=True) 27 | size = np.hstack(data).size 28 | np_filename = os.path.basename(filename) 29 | size_dict[np_filename] = size 30 | counter += 1 31 | if counter % 10 == 0: 32 | print(' processed {} files in {:.2f} seconds'.format( 33 | counter, time.time() - start_time)) 34 | 35 | output_filename = os.path.join(path, 'sizes.txt') 36 | with open(output_filename, 'w') as f: 37 | json.dump(size_dict, f) 38 | print('> wrote sizes to {}'.format(output_filename)) 39 | -------------------------------------------------------------------------------- /scripts/pretrain_gpt2_distributed.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 14 | 15 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 16 | pretrain_gpt2.py \ 17 | --num-layers 24 \ 18 | --hidden-size 1024 \ 19 | --num-attention-heads 16 \ 20 | --batch-size 8 \ 21 | --seq-length 1024 \ 22 | --max-position-embeddings 1024 \ 23 | --train-iters 320000 \ 24 | --save checkpoints/gpt2_345m \ 25 | --load checkpoints/gpt2_345m \ 26 | --resume-dataloader \ 27 | --train-data wikipedia \ 28 | --lazy-loader \ 29 | --tokenizer-type GPT2BPETokenizer \ 30 | --cache-dir cache \ 31 | --split 949,50,1 \ 32 | --distributed-backend nccl \ 33 | --lr 0.00015 \ 34 | --lr-decay-style cosine \ 35 | --weight-decay 1e-2 \ 36 | --clip-grad 1.0 \ 37 | --warmup .01 \ 38 | --checkpoint-activations \ 39 | --fp16 40 | 41 | 42 | set +x 43 | -------------------------------------------------------------------------------- /scripts/pretrain_gpt2_model_parallel.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 14 | 15 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 16 | pretrain_gpt2.py \ 17 | --model-parallel-size 2 \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --num-attention-heads 16 \ 21 | --batch-size 8 \ 22 | --seq-length 1024 \ 23 | --max-position-embeddings 1024 \ 24 | --train-iters 320000 \ 25 | --save checkpoints/gpt2_345m_mp2 \ 26 | --load checkpoints/gpt2_345m_mp2 \ 27 | --resume-dataloader \ 28 | --train-data wikipedia \ 29 | --lazy-loader \ 30 | --tokenizer-type GPT2BPETokenizer \ 31 | --cache-dir cache \ 32 | --split 949,50,1 \ 33 | --distributed-backend nccl \ 34 | --lr 0.00015 \ 35 | --lr-decay-style cosine \ 36 | --weight-decay 1e-2 \ 37 | --clip-grad 1.0 \ 38 | --warmup .01 \ 39 | --checkpoint-activations \ 40 | --fp16 41 | 42 | 43 | set +x 44 | -------------------------------------------------------------------------------- /scripts/pretrain_bert_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 12 | 13 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 14 | pretrain_bert.py \ 15 | --num-layers 24 \ 16 | --hidden-size 1024 \ 17 | --num-attention-heads 16 \ 18 | --batch-size 4 \ 19 | --seq-length 512 \ 20 | --max-preds-per-seq 80 \ 21 | --max-position-embeddings 512 \ 22 | --train-iters 1000000 \ 23 | --save checkpoints/bert_345m \ 24 | --load checkpoints/bert_345m \ 25 | --resume-dataloader \ 26 | --train-data wikipedia \ 27 | --lazy-loader \ 28 | --tokenizer-type 
BertWordPieceTokenizer \ 29 | --tokenizer-model-type bert-large-uncased \ 30 | --presplit-sentences \ 31 | --cache-dir cache \ 32 | --split 949,50,1 \ 33 | --distributed-backend nccl \ 34 | --lr 0.0001 \ 35 | --lr-decay-style linear \ 36 | --lr-decay-iters 990000 \ 37 | --weight-decay 1e-2 \ 38 | --clip-grad 1.0 \ 39 | --warmup .01 \ 40 | --fp16 \ 41 | --fp32-layernorm \ 42 | --fp32-embedding 43 | 44 | -------------------------------------------------------------------------------- /scripts/pretrain_bert_model_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 12 | 13 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 14 | pretrain_bert.py \ 15 | --model-parallel-size 2 \ 16 | --num-layers 24 \ 17 | --hidden-size 1024 \ 18 | --num-attention-heads 16 \ 19 | --batch-size 4 \ 20 | --seq-length 512 \ 21 | --max-preds-per-seq 80 \ 22 | --max-position-embeddings 512 \ 23 | --train-iters 1000000 \ 24 | --save checkpoints/bert_345m_mp2 \ 25 | --load checkpoints/bert_345m_mp2 \ 26 | --resume-dataloader \ 27 | --train-data wikipedia \ 28 | --lazy-loader \ 29 | --tokenizer-type BertWordPieceTokenizer \ 30 | --tokenizer-model-type bert-large-uncased \ 31 | --presplit-sentences \ 32 | --cache-dir cache \ 33 | --split 949,50,1 \ 34 | --distributed-backend nccl \ 35 | --lr 0.0001 \ 36 | --lr-decay-style linear \ 37 | --lr-decay-iters 990000 \ 38 | --weight-decay 1e-2 \ 39 | --clip-grad 1.0 \ 40 | --warmup .01 \ 41 | --fp16 \ 42 | --fp32-layernorm \ 43 | --fp32-embedding 44 | 45 | -------------------------------------------------------------------------------- /scripts/pretrain_bert_tfrecords_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 12 | 13 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 14 | pretrain_bert.py \ 15 | --num-layers 24 \ 16 | --hidden-size 1024 \ 17 | --num-attention-heads 16 \ 18 | --batch-size 4 \ 19 | --seq-length 512 \ 20 | --max-preds-per-seq 80 \ 21 | --max-position-embeddings 512 \ 22 | --train-iters 1000000 \ 23 | --save checkpoints/bert_345m \ 24 | --load checkpoints/bert_345m \ 25 | --resume-dataloader \ 26 | --use-tfrecords \ 27 | --train-data \ 28 | --valid-data \ 29 | --test-data \ 30 | --tokenizer-type BertWordPieceTokenizer \ 31 | --tokenizer-model-type bert-large-uncased \ 32 | --presplit-sentences \ 33 | --cache-dir cache \ 34 | --split 949,50,1 \ 35 | --distributed-backend nccl \ 36 | --lr 0.0001 \ 37 | --lr-decay-style linear \ 38 | --lr-decay-iters 990000 \ 39 | --weight-decay 1e-2 \ 40 | --clip-grad 1.0 \ 41 | --warmup .01 \ 42 | --fp16 \ 43 | --fp32-layernorm \ 44 | --fp32-embedding 45 | -------------------------------------------------------------------------------- /openwebtext/tokenizer.py: -------------------------------------------------------------------------------- 1 | 
# coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import sys 17 | sys.path.append('..') 18 | 19 | from data_utils.tokenization_gpt2 import GPT2Tokenizer 20 | 21 | 22 | class Tokenizer: 23 | 24 | def __init__(self, cache_dir=None): 25 | self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2', 26 | cache_dir=cache_dir) 27 | self.tokenizer.max_len = int(1e12) 28 | self.eod_token = self.tokenizer.encoder['<|endoftext|>'] 29 | assert self.eod_token < 65535, 'vocab size will not fit in uint16' 30 | print('> GPT2 tokenizer with {} vocab size and eod token {} ...'.format( 31 | len(self.tokenizer.encoder), self.eod_token)) 32 | 33 | def tokenize_document(self, document): 34 | tokens = self.tokenizer.encode(document) 35 | tokens.append(self.eod_token) 36 | return tokens 37 | -------------------------------------------------------------------------------- /openwebtext/merge_jsons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import glob 18 | import sys 19 | import json 20 | import argparse 21 | 22 | if __name__ == '__main__': 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--json_path", type=str, default=".", 26 | help="path where all the json files are located") 27 | 28 | parser.add_argument("--output_file", type=str, default="merged_output.json", 29 | help="filename where the merged json should go") 30 | 31 | args = parser.parse_args() 32 | 33 | json_path = args.json_path 34 | out_file = args.output_file 35 | 36 | json_files = glob.glob(json_path + '/*.json') 37 | 38 | counter = 0 39 | 40 | with open(out_file, 'w') as outfile: 41 | for fname in json_files: 42 | counter += 1 43 | 44 | if counter % 1024 == 0: 45 | print("Merging at ", counter, flush=True) 46 | 47 | with open(fname, 'r') as infile: 48 | for row in infile: 49 | each_row = json.loads(row) 50 | outfile.write(row) 51 | 52 | 53 | print("Merged file", out_file, flush=True) 54 | 55 | 56 | -------------------------------------------------------------------------------- /data_utils/corpora.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """several datasets with preset arguments""" 16 | from .datasets import json_dataset, csv_dataset 17 | import os 18 | 19 | class wikipedia(json_dataset): 20 | """ 21 | dataset for wikipedia with arguments configured for convenience 22 | 23 | command line usage: `--train-data wikipedia` 24 | """ 25 | PATH = 'data/wikipedia/wikidump_lines.json' 26 | assert_str = "make sure to set PATH for wikipedia data_utils/corpora.py" 27 | def __init__(self, **kwargs): 28 | assert os.path.exists(wikipedia.PATH), \ 29 | wikipedia.assert_str 30 | if not kwargs: 31 | kwargs = {} 32 | kwargs['text_key'] = 'text' 33 | kwargs['loose_json'] = True 34 | super(wikipedia, self).__init__(wikipedia.PATH, **kwargs) 35 | 36 | 37 | class webtext(json_dataset): 38 | """ 39 | dataset for webtext with arguments configured for convenience 40 | 41 | command line usage: `--train-data webtext` 42 | """ 43 | PATH = 'data/webtext/data.json' 44 | assert_str = "make sure to set PATH for webtext data_utils/corpora.py" 45 | def __init__(self, **kwargs): 46 | assert os.path.exists(webtext.PATH), \ 47 | webtext.assert_str 48 | if not kwargs: 49 | kwargs = {} 50 | kwargs['text_key'] = 'text' 51 | kwargs['loose_json'] = True 52 | super(webtext, self).__init__(webtext.PATH, **kwargs) 53 | 54 | 55 | NAMED_CORPORA = { 56 | 'wikipedia': wikipedia, 57 | 'webtext': webtext, 58 | } 59 | -------------------------------------------------------------------------------- /detokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def ptb_detokenizer(string): 4 | string = string.replace(" '", "'") 5 | string = string.replace(" \n", "\n") 6 | string = string.replace("\n ", "\n") 7 | string = string.replace(" n't", "n't") 8 | string = string.replace(" N ","1 ") 9 | string = string.replace("$ 1", "$1") 10 | string = string.replace("# 1", "#1") 11 | return string 12 | 13 | 14 | def wikitext_detokenizer(string): 15 | #contractions 16 | string = string.replace("s '", "s'") 17 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 18 | # number separators 19 | string = string.replace(" @-@ ", "-") 20 | string = string.replace(" @,@ ", ",") 21 | string = string.replace(" @.@ ", ".") 22 | #punctuation 23 | string = string.replace(" : ", ": ") 24 | string = string.replace(" ; ", "; ") 25 | string = string.replace(" . ", ". ") 26 | string = string.replace(" ! ", "! ") 27 | string = string.replace(" ? ", "? 
") 28 | string = string.replace(" , ", ", ") 29 | # double brackets 30 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 31 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 32 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 33 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 34 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 35 | # miscellaneous 36 | string = string.replace("= = = =", "====") 37 | string = string.replace("= = =", "===") 38 | string = string.replace("= =", "==") 39 | string = string.replace(" "+chr(176)+" ", chr(176)) 40 | string = string.replace(" \n", "\n") 41 | string = string.replace("\n ", "\n") 42 | string = string.replace(" N ", " 1 ") 43 | string = string.replace(" 's", "'s") 44 | 45 | return string 46 | 47 | def lambada_detokenizer(string): 48 | return string 49 | 50 | def get_detokenizer(path): 51 | for key in DETOKENIZERS.keys(): 52 | if key in path: 53 | print(key) 54 | return DETOKENIZERS[key] 55 | 56 | DETOKENIZERS = { 57 | 'ptb': ptb_detokenizer, 58 | 'wikitext': wikitext_detokenizer, 59 | 'lambada': lambada_detokenizer, 60 | } 61 | -------------------------------------------------------------------------------- /mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .grads import clip_grad_norm 23 | 24 | from .initialize import destroy_model_parallel 25 | from .initialize import get_data_parallel_group 26 | from .initialize import get_data_parallel_rank 27 | from .initialize import get_data_parallel_world_size 28 | from .initialize import get_model_parallel_group 29 | from .initialize import get_model_parallel_rank 30 | from .initialize import get_model_parallel_src_rank 31 | from .initialize import get_model_parallel_world_size 32 | from .initialize import initialize_model_parallel 33 | from .initialize import model_parallel_is_initialized 34 | 35 | from .layers import ColumnParallelLinear 36 | from .layers import ParallelEmbedding 37 | from .layers import RowParallelLinear 38 | from .layers import VocabParallelEmbedding 39 | 40 | from .mappings import copy_to_model_parallel_region 41 | from .mappings import gather_from_model_parallel_region 42 | from .mappings import reduce_from_model_parallel_region 43 | from .mappings import scatter_to_model_parallel_region 44 | 45 | from .random import checkpoint 46 | from .random import get_cuda_rng_tracker 47 | from .random import model_parallel_cuda_manual_seed 48 | 49 | from .transformer import BertParallelSelfAttention 50 | from .transformer import BertParallelTransformerLayer 51 | from .transformer import GPT2ParallelTransformer 52 | from .transformer import LayerNorm 53 | -------------------------------------------------------------------------------- /openwebtext/README.md: -------------------------------------------------------------------------------- 1 | The following steps show how to prepare training dataset to train the mode. 2 | 3 | # Libraries to install 4 | 5 | ``` 6 | pip install ftfy langdetect numpy torch pandas nltk sentencepiece boto3 tqdm regex bs4 newspaper3k htmlmin tldextract 7 | git clone https://github.com/mattilyra/LSH 8 | cd LSH 9 | python setup.py install 10 | ``` 11 | 12 | # Download the dataset 13 | 14 | 1. Download the deduplicated URLs from [jcpeterson](https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQ!cc4RgQQZ) 15 | 2. Remove blacklisted URLs. 16 | ``` 17 | python blacklist_urls.py 18 | ``` 19 | 3. Download the content from the clean urls with [openwebtext's utilities](https://github.com/eukaryote31/openwebtext/blob/master/download.py). 20 | 21 | 4. Merge the contents into one loose json file with 1 json per newline of the format `{'text': text, 'url': unique_url}`. It is important for the url to be unique. 22 | 23 | # Prepare the data for GPT-2 training: 24 | 25 | 1. Perform ftfy, english detection and remove documents with less than 128 tokens. This step can be sharded and run on shards. 26 | ``` 27 | python cleanup_dataset.py 28 | ``` 29 | 2. Using LSH, find possible duplicates and store then in a file for later processing. This step can NOT be sharded and usually takes 12 to 24 hours for OpenWebText dataset. 30 | ``` 31 | python find_duplicates.py 32 | ``` 33 | 3. Based on similarity measure defind inside function `is_similar` (default: 0.9), group urls that are similar. Basically, for each group, only one url we should keep and remove the rest. 34 | ``` 35 | python group_duplicate_urls.py 36 | ``` 37 | 4. Remove similar documents that were detected in the last step. 38 | ``` 39 | python remove_group_duplicates.py 40 | ``` 41 | 42 | 5. Shuffle the dataset. 
43 | ``` 44 | shuf -o train_data.json 45 | ``` 46 | 47 | -------------------------------------------------------------------------------- /openwebtext/remove_group_duplicates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import json 18 | import time 19 | import sys 20 | 21 | 22 | if __name__ == '__main__': 23 | 24 | url_filename = sys.argv[1] 25 | data_filename = sys.argv[2] 26 | output_filename = sys.argv[3] 27 | 28 | urls = set() 29 | with open(url_filename, 'r') as f: 30 | for line in f: 31 | myjson = json.loads(line) 32 | for key in myjson: 33 | this_urls = myjson[key] 34 | for i in range(1, len(this_urls)): 35 | urls.add(this_urls[i]) 36 | print('will be removing {} urls'.format(len(urls)), flush=True) 37 | 38 | written_docs = 0 39 | removed_docs = 0 40 | removed_chars = 0 41 | start_time = time.time() 42 | with open(output_filename, 'wb') as fout: 43 | with open(data_filename, 'r') as fin: 44 | for line in fin: 45 | try: 46 | myjson = json.loads(line) 47 | url = myjson['url'] 48 | if url in urls: 49 | print('removing', myjson) 50 | removed_docs += 1 51 | removed_chars += len(myjson['text']) 52 | continue 53 | myjson = json.dumps(myjson, ensure_ascii=False) 54 | fout.write(myjson.encode('utf-8')) 55 | fout.write('\n'.encode('utf-8')) 56 | written_docs += 1 57 | if written_docs % 10000 == 0: 58 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 59 | '| removed: {} (char: {})'.format( 60 | time.time() - start_time, 61 | written_docs, removed_docs, removed_chars)) 62 | except Exception as e: 63 | print('[SKIPPING]', line, e) 64 | 65 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 66 | '| removed: {} (char: {})'.format( 67 | time.time() - start_time, 68 | written_docs, removed_docs, removed_chars)) 69 | print('done :-)') 70 | -------------------------------------------------------------------------------- /openwebtext/make_gpt2_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
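# Tokenizes one shard of the cleaned, deduplicated loose-json data: each input
# line is a json record such as (hypothetical values)
#   {"text": "some document text ...", "url": "http://example.com/a"}
# The 'text' field is encoded with the GPT-2 BPE tokenizer, terminated with the
# <|endoftext|> token, and stored as a uint16 numpy array; all documents of the
# shard are saved to <path>/npys/shard_<id>.npy. Run as
#   python make_gpt2_dataset.py <path> <shard>
# (see run_make_gpt2_dataset.sh); make_gpt2_sizes.py later records the shard sizes.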
15 | 16 | 17 | import json 18 | import numpy as np 19 | import time 20 | import os 21 | import sys 22 | 23 | from tokenizer import Tokenizer 24 | 25 | 26 | def tokenize_corpus(filename, np_filename, print_interval=10000): 27 | 28 | print(' > tokenizing {}'.format(filename)) 29 | 30 | tokenizer = Tokenizer(cache_dir='./cache') 31 | 32 | tokenized_docs = [] 33 | num_docs = 0 34 | num_tokens = 0 35 | start_time = time.time() 36 | with open(filename, 'r') as f: 37 | for line in f: 38 | try: 39 | myjson = json.loads(line) 40 | url = myjson['url'] 41 | sample = myjson['text'] 42 | tokens = tokenizer.tokenize_document(sample) 43 | tokenized_docs.append(np.array(tokens, dtype=np.uint16)) 44 | num_docs += 1 45 | num_tokens += len(tokens) 46 | if num_docs % print_interval == 0: 47 | print(' processed {:9d} documents in {:.2f} (s) so far'. 48 | format(num_docs, time.time() - start_time), 49 | flush=True) 50 | except Exception as e: 51 | print(' skipping ', line, e) 52 | 53 | print(' >> processed {} document with total of {} tokens ...'.format( 54 | num_docs, num_tokens)) 55 | 56 | tokenized_docs = np.array(tokenized_docs, dtype=object) 57 | np.save(np_filename, tokenized_docs, allow_pickle=True) 58 | print(' >> saved the tokenzed document to {} ...'.format(np_filename)) 59 | 60 | 61 | if __name__ == '__main__': 62 | 63 | print('building gpt2 dataset ...') 64 | 65 | path = sys.argv[1] 66 | shard = sys.argv[2] 67 | 68 | input_filename = os.path.join(path, 69 | 'shards/shard_{:04d}'.format(int(shard))) 70 | output_filename = os.path.join(path, 71 | 'npys/shard_{:04d}.npy'.format(int(shard))) 72 | print('will be reading {}'.format(input_filename)) 73 | print('and will write the results to {}'.format(output_filename)) 74 | 75 | tokenize_corpus(input_filename, output_filename) 76 | 77 | 78 | -------------------------------------------------------------------------------- /mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import argparse 17 | import os 18 | import random 19 | import numpy 20 | import torch 21 | 22 | import mpu 23 | 24 | 25 | class IdentityLayer(torch.nn.Module): 26 | def __init__(self, size, scale=1.0): 27 | super(IdentityLayer, self).__init__() 28 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 29 | def forward(self): 30 | return self.weight 31 | 32 | 33 | def set_random_seed(seed): 34 | """Set random seed for reproducability.""" 35 | random.seed(seed) 36 | numpy.random.seed(seed) 37 | torch.manual_seed(seed) 38 | mpu.model_parallel_cuda_manual_seed(seed) 39 | 40 | 41 | def initialize_distributed(backend='nccl'): 42 | """Initialize torch.distributed.""" 43 | # Get local rank in case it is provided. 
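# When launched via torch.distributed.launch, --local_rank selects the GPU for
# this process; otherwise the device defaults to rank % device count (below).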
44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('--local_rank', type=int, default=None, 46 | help='local rank passed from distributed launcher') 47 | args = parser.parse_args() 48 | local_rank = args.local_rank 49 | 50 | # Get rank and world size. 51 | rank = int(os.getenv('RANK', '0')) 52 | world_size = int(os.getenv("WORLD_SIZE", '1')) 53 | 54 | print('> initializing torch.distributed with local rank: {}, ' 55 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 56 | 57 | # Set the device id. 58 | device = rank % torch.cuda.device_count() 59 | if local_rank is not None: 60 | device = local_rank 61 | torch.cuda.set_device(device) 62 | 63 | # Call the init process. 64 | init_method = 'tcp://' 65 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 66 | master_port = os.getenv('MASTER_PORT', '6000') 67 | init_method += master_ip + ':' + master_port 68 | torch.distributed.init_process_group( 69 | backend=backend, 70 | world_size=world_size, 71 | rank=rank, 72 | init_method=init_method) 73 | 74 | 75 | def print_separator(message): 76 | torch.distributed.barrier() 77 | filler_len = (78 - len(message)) // 2 78 | filler = '-' * filler_len 79 | string = '\n' + filler + ' {} '.format(message) + filler 80 | if torch.distributed.get_rank() == 0: 81 | print(string, flush=True) 82 | torch.distributed.barrier() 83 | -------------------------------------------------------------------------------- /mpu/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | 20 | def ensure_divisibility(numerator, denominator): 21 | """Ensure that numerator is divisible by the denominator.""" 22 | assert numerator % denominator == 0, '{} is not divisible by {}'.format( 23 | numerator, denominator) 24 | 25 | 26 | def divide(numerator, denominator): 27 | """Ensure that numerator is divisible by the denominator and return 28 | the division value.""" 29 | ensure_divisibility(numerator, denominator) 30 | return numerator // denominator 31 | 32 | 33 | def split_tensor_along_last_dim(tensor, num_partitions, 34 | contiguous_split_chunks=False): 35 | """Split a tensor along its last dimension. 36 | Arguments: 37 | tensor: input tensor. 38 | num_partitions: number of partitions to split the tensor 39 | contiguous_split_chunks: If True, make each chunk contiguous 40 | in memory. 41 | """ 42 | # Get the size and dimension. 43 | last_dim = tensor.dim() - 1 44 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 45 | # Split. 46 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 47 | # Note: torch.split does not create contiguous tensors by default. 
48 | if contiguous_split_chunks: 49 | return tuple(chunk.contiguous() for chunk in tensor_list) 50 | 51 | return tensor_list 52 | 53 | 54 | class VocabUtility: 55 | """Split the vocabulary into `world_size` chunks amd return the 56 | first and last index of the vocabulary belonging to the `rank` 57 | partition: Note that indecies in [fist, last)""" 58 | 59 | @staticmethod 60 | def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, 61 | rank, world_size): 62 | index_f = rank * per_partition_vocab_size 63 | index_l = index_f + per_partition_vocab_size 64 | return index_f, index_l 65 | 66 | @staticmethod 67 | def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size): 68 | per_partition_vocab_size = divide(global_vocab_size, world_size) 69 | return VocabUtility.vocab_range_from_per_partition_vocab_size( 70 | per_partition_vocab_size, rank, world_size) 71 | -------------------------------------------------------------------------------- /mpu/grads.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | # Parts of the code here are adapted from PyTorch 18 | # repo: https://github.com/pytorch/pytorch 19 | 20 | 21 | import torch 22 | from torch._six import inf 23 | 24 | from .initialize import get_model_parallel_group 25 | from .initialize import get_model_parallel_rank 26 | 27 | 28 | def clip_grad_norm(parameters, max_norm, norm_type=2): 29 | """Clips gradient norm of an iterable of parameters. 30 | 31 | This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and 32 | added functionality to handle model parallel parameters. Note that 33 | the gradients are modified in place. 34 | 35 | Arguments: 36 | parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a 37 | single Tensor that will have gradients normalized 38 | max_norm (float or int): max norm of the gradients 39 | norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for 40 | infinity norm. 41 | 42 | Returns: 43 | Total norm of the parameters (viewed as a single vector). 44 | """ 45 | if isinstance(parameters, torch.Tensor): 46 | parameters = [parameters] 47 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 48 | max_norm = float(max_norm) 49 | norm_type = float(norm_type) 50 | if norm_type == inf: 51 | total_norm = max(p.grad.data.abs().max() for p in parameters) 52 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) 53 | # Take max across all GPUs. 
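# An all-reduce with MAX over the model-parallel group gives every rank the
# global infinity norm. The non-inf branch below instead sums ||g||^norm_type
# over the parameters each rank owns (shared, non-model-parallel parameters are
# counted only on rank 0 to avoid double counting) and all-reduces the partial
# sums before taking the norm_type-th root.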
54 | torch.distributed.all_reduce(total_norm_cuda, 55 | op=torch.distributed.ReduceOp.MAX, 56 | group=get_model_parallel_group()) 57 | total_norm = total_norm_cuda[0].item() 58 | else: 59 | total_norm = 0 60 | for p in parameters: 61 | if p.model_parallel or (get_model_parallel_rank() == 0): 62 | param_norm = p.grad.data.norm(norm_type) 63 | total_norm += param_norm.item() ** norm_type 64 | # Sum across all model parallel GPUs. 65 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) 66 | torch.distributed.all_reduce(total_norm_cuda, 67 | op=torch.distributed.ReduceOp.SUM, 68 | group=get_model_parallel_group()) 69 | total_norm = total_norm_cuda[0].item() ** (1. / norm_type) 70 | clip_coef = max_norm / (total_norm + 1e-6) 71 | if clip_coef < 1: 72 | for p in parameters: 73 | p.grad.data.mul_(clip_coef) 74 | return total_norm 75 | -------------------------------------------------------------------------------- /mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import functools 17 | import operator 18 | import sys 19 | sys.path.append("../..") 20 | 21 | import torch 22 | import mpu 23 | from mpu import data as data_utils 24 | 25 | from commons import initialize_distributed 26 | from commons import print_separator 27 | 28 | 29 | def test_boradcast_data(model_parallel_size): 30 | 31 | if torch.distributed.get_rank() == 0: 32 | print('> testing boradcast_data with model parallel size {} ...'. 
33 | format(model_parallel_size)) 34 | 35 | mpu.initialize_model_parallel(model_parallel_size) 36 | torch.manual_seed(1234 + mpu.get_data_parallel_rank()) 37 | model_parallel_size = mpu.get_model_parallel_world_size() 38 | 39 | key_size_t = {'key1': [7, 11], 40 | 'key2': [8, 2, 1], 41 | 'key3': [13], 42 | 'key4': [5, 1, 2], 43 | 'key5': [5, 12]} 44 | keys = list(key_size_t.keys()) 45 | 46 | data = {} 47 | data_t = {} 48 | for key in key_size_t: 49 | data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) 50 | data_t[key] = data[key].clone() 51 | data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) 52 | data_t['keyX'] = data['keyX'].clone() 53 | if mpu.get_model_parallel_rank() != 0: 54 | data = None 55 | 56 | data_utils._check_data_types(keys, data_t, torch.int64) 57 | key_size, key_numel, \ 58 | total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) 59 | for key in keys: 60 | assert key_size[key] == key_size_t[key] 61 | total_numel_t = 0 62 | for key in keys: 63 | target_size = functools.reduce(operator.mul, key_size_t[key], 1) 64 | assert key_numel[key] == target_size 65 | total_numel_t += target_size 66 | assert total_numel == total_numel_t 67 | 68 | data_b = data_utils.broadcast_data(keys, data, torch.int64) 69 | for key in keys: 70 | tensor = data_t[key].cuda() 71 | assert data_b[key].sub(tensor).abs().max() == 0 72 | 73 | # Reset groups 74 | mpu.destroy_model_parallel() 75 | 76 | torch.distributed.barrier() 77 | if torch.distributed.get_rank() == 0: 78 | print('>> passed the test :-)') 79 | 80 | 81 | if __name__ == '__main__': 82 | 83 | initialize_distributed() 84 | world_size = torch.distributed.get_world_size() 85 | 86 | model_parallel_size = 1 87 | while model_parallel_size <= world_size: 88 | print_separator('test test boradcast data') 89 | test_boradcast_data(model_parallel_size) 90 | model_parallel_size *= 2 91 | 92 | 93 | -------------------------------------------------------------------------------- /learning_rates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """PyTorch DataLoader for TFRecords""" 16 | 17 | import torch 18 | from torch.optim.lr_scheduler import _LRScheduler 19 | import math 20 | 21 | class AnnealingLR(_LRScheduler): 22 | """Anneals the learning rate from start to zero along a cosine curve.""" 23 | 24 | DECAY_STYLES = ['linear', 'cosine', 'exponential', 'constant', 'None'] 25 | 26 | def __init__(self, optimizer, start_lr, warmup_iter, num_iters, decay_style=None, last_iter=-1): 27 | self.optimizer = optimizer 28 | self.start_lr = start_lr 29 | self.warmup_iter = warmup_iter 30 | self.num_iters = last_iter + 1 31 | self.end_iter = num_iters 32 | self.decay_style = decay_style.lower() if isinstance(decay_style, str) else None 33 | self.step(self.num_iters) 34 | if torch.distributed.get_rank() == 0: 35 | print('learning rate decaying', decay_style) 36 | 37 | def get_lr(self): 38 | # https://openreview.net/pdf?id=BJYwwY9ll pg. 4 39 | if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter: 40 | return float(self.start_lr) * self.num_iters / self.warmup_iter 41 | else: 42 | if self.decay_style == self.DECAY_STYLES[0]: 43 | return self.start_lr*((self.end_iter-(self.num_iters-self.warmup_iter))/self.end_iter) 44 | elif self.decay_style == self.DECAY_STYLES[1]: 45 | return self.start_lr / 2.0 * (math.cos(math.pi * (self.num_iters - self.warmup_iter) / self.end_iter) + 1) 46 | elif self.decay_style == self.DECAY_STYLES[2]: 47 | #TODO: implement exponential decay 48 | return self.start_lr 49 | else: 50 | return self.start_lr 51 | 52 | def step(self, step_num=None): 53 | if step_num is None: 54 | step_num = self.num_iters + 1 55 | self.num_iters = step_num 56 | new_lr = self.get_lr() 57 | for group in self.optimizer.param_groups: 58 | group['lr'] = new_lr 59 | 60 | def state_dict(self): 61 | sd = { 62 | 'start_lr': self.start_lr, 63 | 'warmup_iter': self.warmup_iter, 64 | 'num_iters': self.num_iters, 65 | 'decay_style': self.decay_style, 66 | 'end_iter': self.end_iter 67 | } 68 | return sd 69 | 70 | def load_state_dict(self, sd): 71 | self.start_lr = sd['start_lr'] 72 | self.warmup_iter = sd['warmup_iter'] 73 | self.num_iters = sd['num_iters'] 74 | self.end_iter = sd['end_iter'] 75 | self.decay_style = sd['decay_style'] 76 | self.step(self.num_iters) 77 | -------------------------------------------------------------------------------- /openwebtext/group_duplicates_url.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import json 17 | import time 18 | import sys 19 | 20 | 21 | def is_similar(jaccard_similarity): 22 | return (js >= 0.9) 23 | 24 | 25 | if __name__ == '__main__': 26 | 27 | 28 | print('grouping duplicate urls ...') 29 | 30 | input = sys.argv[1] 31 | output = sys.argv[2] 32 | 33 | url_to_index = {} 34 | index_to_urls = [] 35 | counter = 0 36 | start_time = time.time() 37 | with open(input, 'r') as f: 38 | for line in f: 39 | counter += 1 40 | myjson = json.loads(line) 41 | urls = [] 42 | for main_url in myjson.keys(): 43 | urls.append(main_url) 44 | for value in myjson[main_url]: 45 | for other_url, js in value.items(): 46 | if is_similar(js): 47 | urls.append(other_url) 48 | current_index = -1 49 | other_indices = set() 50 | for url in urls: 51 | if url in url_to_index: 52 | if current_index == -1: 53 | current_index = url_to_index[url] 54 | elif current_index != url_to_index[url]: 55 | other_indices.add(url_to_index[url]) 56 | if current_index == -1: 57 | current_index = len(index_to_urls) 58 | index_to_urls.append(set()) 59 | for url in urls: 60 | url_to_index[url] = current_index 61 | index_to_urls[current_index].add(url) 62 | for index in other_indices: 63 | for url in index_to_urls[index]: 64 | index_to_urls[current_index].add(url) 65 | url_to_index[url] = current_index 66 | index_to_urls[index] = None 67 | 68 | if counter % 100000 == 0: 69 | print(' > processed {} lines in {} seconds ...'.format( 70 | counter, time.time() - start_time)) 71 | 72 | 73 | total_remove = 0 74 | total_remain = 0 75 | for urls in index_to_urls: 76 | if urls is not None: 77 | if len(urls) > 1: 78 | total_remove += (len(urls) - 1) 79 | total_remain += 1 80 | print('out of {} urls, only {} are unique and {} should be removed'.format( 81 | total_remove+total_remain, total_remain, total_remove)) 82 | 83 | with open(output, 'wb') as f: 84 | for i, urls in enumerate(index_to_urls): 85 | if urls is not None: 86 | if len(urls) > 1: 87 | myjson = json.dumps({str(i): list(urls)}, 88 | ensure_ascii=False) 89 | f.write(myjson.encode('utf-8')) 90 | f.write('\n'.encode('utf-8')) 91 | -------------------------------------------------------------------------------- /mpu/tests/test_initialize.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
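# Tests for mpu.initialize: for model-parallel sizes 1, 2, 4, ... up to the world
# size, checks that initialize_model_parallel() creates model- and data-parallel
# groups with the expected sizes and ranks, and that get_model_parallel_src_rank()
# returns the first rank of each model-parallel group. Run under a distributed
# launcher (e.g. torch.distributed.launch) on multiple GPUs.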
15 | 16 | import sys 17 | sys.path.append("../..") 18 | 19 | import torch 20 | import mpu 21 | 22 | from commons import initialize_distributed 23 | from commons import print_separator 24 | 25 | 26 | def test_initialize_model_parallel(model_parallel_size): 27 | 28 | if torch.distributed.get_rank() == 0: 29 | print('> testing initialize_model_parallel with size {} ...'.format( 30 | model_parallel_size)) 31 | model_parallel_size_ = min(model_parallel_size, 32 | torch.distributed.get_world_size()) 33 | assert not mpu.model_parallel_is_initialized() 34 | mpu.initialize_model_parallel(model_parallel_size_) 35 | assert mpu.model_parallel_is_initialized() 36 | 37 | # Checks. 38 | def check(group, world_size, rank): 39 | assert world_size == torch.distributed.get_world_size(group=group) 40 | assert rank == torch.distributed.get_rank(group=group) 41 | 42 | # Model parallel. 43 | world_size = model_parallel_size_ 44 | rank = torch.distributed.get_rank() % model_parallel_size_ 45 | assert world_size == mpu.get_model_parallel_world_size() 46 | assert rank == mpu.get_model_parallel_rank() 47 | check(mpu.get_model_parallel_group(), world_size, rank) 48 | 49 | 50 | # Data parallel. 51 | world_size = torch.distributed.get_world_size() // model_parallel_size_ 52 | rank = torch.distributed.get_rank() // model_parallel_size 53 | assert world_size == mpu.get_data_parallel_world_size() 54 | assert rank == mpu.get_data_parallel_rank() 55 | check(mpu.get_data_parallel_group(), world_size, rank) 56 | 57 | # Reset groups 58 | mpu.destroy_model_parallel() 59 | 60 | torch.distributed.barrier() 61 | if torch.distributed.get_rank() == 0: 62 | print('>> passed the test :-)') 63 | 64 | 65 | def test_get_model_parallel_src_rank(model_parallel_size_): 66 | 67 | if torch.distributed.get_rank() == 0: 68 | print('> testing get_model_parallel_src_rank with size {} ...'.format( 69 | model_parallel_size_)) 70 | model_parallel_size = min(model_parallel_size_, 71 | torch.distributed.get_world_size()) 72 | assert not mpu.model_parallel_is_initialized() 73 | mpu.initialize_model_parallel(model_parallel_size) 74 | assert mpu.model_parallel_is_initialized() 75 | 76 | # Checks 77 | src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank() 78 | assert mpu.get_model_parallel_src_rank() == src_rank 79 | 80 | # Reset groups 81 | mpu.destroy_model_parallel() 82 | 83 | torch.distributed.barrier() 84 | if torch.distributed.get_rank() == 0: 85 | print('>> passed the test :-)') 86 | 87 | 88 | if __name__ == '__main__': 89 | 90 | initialize_distributed() 91 | world_size = torch.distributed.get_world_size() 92 | model_parallel_size = 1 93 | while model_parallel_size <= world_size: 94 | print_separator('test initialize model parallel') 95 | test_initialize_model_parallel(model_parallel_size) 96 | print_separator('test model parallel source rank') 97 | test_get_model_parallel_src_rank(model_parallel_size) 98 | model_parallel_size *= 2 99 | -------------------------------------------------------------------------------- /scripts/run_gpt2_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | example usage: 3 | python scripts/run_gpt2_eval.py \ 4 | --model-parallel-size 1 \ 5 | --num-layers 12 \ 6 | --hidden-size 768 \ 7 | --num-attention-heads 12 \ 8 | --model-path \ 9 | --data-path \ 10 | --batch-size 16 \ 11 | --cache-dir 12 | """ 13 | import argparse 14 | import subprocess 15 | 16 | parser = argparse.ArgumentParser('run zero shot GPT2 eval') 17 | parser.add_argument('--model-path', 
type=str, required=True, 18 | help='Saved model path for evaluation') 19 | parser.add_argument('--batch-size', type=int, default=4, 20 | help='batch size to use for evaluation') 21 | parser.add_argument('--num-attention-heads', type=int, default=12, 22 | help='num of transformer attention heads') 23 | parser.add_argument('--hidden-size', type=int, default=768, 24 | help='tansformer hidden size') 25 | parser.add_argument('--num-layers', type=int, default=12, 26 | help='num decoder layers') 27 | parser.add_argument('--data-path', type=str, required=True, 28 | help='Data path for evaluation data') 29 | parser.add_argument('--cloze-eval', action='store_true', 30 | help='Run lambada cloze eval instead of perplexity eval.') 31 | parser.add_argument('--webtext-eval', action='store_true', 32 | help='Run webtext PPL eval instead of wikitext PPL eval.') 33 | parser.add_argument('--eval-iters', default=5000, type=int, 34 | help='number of iterations to run webtext evaluation') 35 | parser.add_argument('--model-parallel-size', type=int, default=1, 36 | help='model parallel size to use') 37 | parser.add_argument('--load-openai', action='store_true', 38 | help='Load weights from saved openai/hf checkpoints') 39 | parser.add_argument('--cache-dir', type=str, default='cache', 40 | help='directory to cache gpt2 tokenizers') 41 | args = parser.parse_args() 42 | 43 | multinode_args = '' 44 | if args.model_parallel_size > 1: 45 | multinode_args += ' -m torch.distributed.launch --nproc_per_node {} '.format(args.model_parallel_size) 46 | 47 | CMD = ' --model-parallel-size {model_par} \ 48 | --num-layers {nlayers} \ 49 | --hidden-size {hidden} \ 50 | --log-interval 100 \ 51 | --load {model} \ 52 | --eval-batch-size {batch} \ 53 | --num-attention-heads {natt} \ 54 | --seq-length 1024 \ 55 | --max-position-embeddings 1024 \ 56 | --tokenizer-type GPT2BPETokenizer \ 57 | --text-key text \ 58 | --distributed-backend nccl \ 59 | --hidden-dropout 0.1 \ 60 | --attention-dropout 0.1 \ 61 | --fp16 \ 62 | --overlapping-eval 32 \ 63 | --cache-dir {cache} '.format(model_par=args.model_parallel_size, 64 | nlayers=args.num_layers, 65 | hidden=args.hidden_size, 66 | model=args.model_path, 67 | batch=args.batch_size, 68 | natt=args.num_attention_heads, 69 | cache=args.cache_dir) 70 | 71 | if args.load_openai: 72 | CMD += ' --load-openai ' 73 | if args.cloze_eval: 74 | CMD += ' --cloze-eval ' 75 | CMD = 'evaluate_gpt2.py' + CMD 76 | print('Running Lambada Eval Command:', flush=True) 77 | elif args.webtext_eval: 78 | CMD += '--train-iters 0 --eval-iters {} --test-data {} --loose-json '.format(args.eval_iters, args.data_path) 79 | CMD = 'pretrain_gpt2.py' + CMD 80 | print('Running Webtext Eval Command:', flush=True) 81 | else: 82 | CMD += ' --valid-data {} '.format(args.data_path) 83 | CMD = 'evaluate_gpt2.py' + CMD 84 | print('Running PPL Eval Command:', flush=True) 85 | 86 | CMD = 'python3 '+multinode_args+CMD 87 | print(CMD, flush=True) 88 | 89 | subprocess.call(CMD.split()) 90 | -------------------------------------------------------------------------------- /model/model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Utilities for wrapping BertModel.""" 17 | 18 | import torch 19 | 20 | from .modeling import BertConfig 21 | from .modeling import BertForPreTraining, BertForMaskedLM 22 | from .modeling import BertLayerNorm 23 | 24 | 25 | def get_params_for_weight_decay_optimization(module): 26 | 27 | weight_decay_params = {'params': []} 28 | no_weight_decay_params = {'params': [], 'weight_decay': 0.0} 29 | for module_ in module.modules(): 30 | if isinstance(module_, (BertLayerNorm, torch.nn.LayerNorm)): 31 | no_weight_decay_params['params'].extend( 32 | [p for p in list(module_._parameters.values()) 33 | if p is not None]) 34 | else: 35 | weight_decay_params['params'].extend( 36 | [p for n, p in list(module_._parameters.items()) 37 | if p is not None and n != 'bias']) 38 | no_weight_decay_params['params'].extend( 39 | [p for n, p in list(module_._parameters.items()) 40 | if p is not None and n == 'bias']) 41 | 42 | return weight_decay_params, no_weight_decay_params 43 | 44 | 45 | class BertModel(torch.nn.Module): 46 | 47 | def __init__(self, args): 48 | super(BertModel, self).__init__() 49 | if args.pretrained_bert: 50 | self.model = BertForPreTraining.from_pretrained( 51 | args.tokenizer_model_type, 52 | cache_dir=args.cache_dir, 53 | fp32_layernorm=args.fp32_layernorm, 54 | fp32_embedding=args.fp32_embedding, 55 | layernorm_epsilon=args.layernorm_epsilon) 56 | else: 57 | if args.intermediate_size is None: 58 | intermediate_size = 4 * args.hidden_size 59 | else: 60 | intermediate_size = args.intermediate_size 61 | self.config = BertConfig( 62 | args.tokenizer_num_tokens, 63 | hidden_size=args.hidden_size, 64 | num_hidden_layers=args.num_layers, 65 | num_attention_heads=args.num_attention_heads, 66 | intermediate_size=intermediate_size, 67 | hidden_dropout_prob=args.hidden_dropout, 68 | attention_probs_dropout_prob=args.attention_dropout, 69 | max_position_embeddings=args.max_position_embeddings, 70 | type_vocab_size=args.tokenizer_num_type_tokens, 71 | fp32_layernorm=args.fp32_layernorm, 72 | fp32_embedding=args.fp32_embedding, 73 | fp32_tokentypes=args.fp32_tokentypes, 74 | layernorm_epsilon=args.layernorm_epsilon, 75 | deep_init=args.deep_init) 76 | self.model = BertForPreTraining(self.config) 77 | 78 | def forward(self, input_tokens, token_type_ids=None, 79 | attention_mask=None, checkpoint_activations=False): 80 | return self.model( 81 | input_tokens, token_type_ids, attention_mask, 82 | checkpoint_activations=checkpoint_activations) 83 | 84 | def state_dict(self, destination=None, prefix='', keep_vars=False): 85 | return self.model.state_dict(destination=destination, prefix=prefix, 86 | keep_vars=keep_vars) 87 | 88 | def load_state_dict(self, state_dict, strict=True): 89 | return self.model.load_state_dict(state_dict, strict=strict) 90 | 91 | -------------------------------------------------------------------------------- /openwebtext/find_duplicates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
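get_params_for_weight_decay_optimization in model/model.py above splits a module's parameters so that LayerNorm weights and all biases are exempt from weight decay. A hedged sketch of how two such groups are typically handed to a PyTorch optimizer follows; the helper mirrors the function above but uses only torch.nn.LayerNorm so it runs standalone, and the learning rate and decay values are placeholders rather than the ones used by the training scripts.

import torch

def split_weight_decay_params(module):
    # Mirrors get_params_for_weight_decay_optimization, restricted to torch.nn.LayerNorm.
    decay = {'params': []}
    no_decay = {'params': [], 'weight_decay': 0.0}
    for m in module.modules():
        if isinstance(m, torch.nn.LayerNorm):
            no_decay['params'].extend(p for p in m._parameters.values() if p is not None)
        else:
            decay['params'].extend(p for n, p in m._parameters.items()
                                   if p is not None and n != 'bias')
            no_decay['params'].extend(p for n, p in m._parameters.items()
                                      if p is not None and n == 'bias')
    return decay, no_decay

model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.LayerNorm(16))
param_groups = split_weight_decay_params(model)
# The optimizer-level weight_decay applies only to the first group; the second
# group overrides it with weight_decay=0.0. Values here are placeholders.
optimizer = torch.optim.Adam(list(param_groups), lr=1e-4, weight_decay=0.01)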
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import itertools 18 | import json 19 | from lsh import cache, minhash 20 | import time 21 | import sys 22 | 23 | 24 | # This function is adapted from: 25 | # https://github.com/mattilyra/LSH/blob/master/examples/Introduction.ipynb 26 | def shingles(text, char_ngram=5): 27 | return set(text[head:head + char_ngram] 28 | for head in range(0, len(text) - char_ngram)) 29 | 30 | 31 | # This function is adapted from: 32 | # https://github.com/mattilyra/LSH/blob/master/examples/Introduction.ipynb 33 | def jaccard(set_a, set_b): 34 | intersection = set_a & set_b 35 | union = set_a | set_b 36 | return len(intersection) / len(union) 37 | 38 | 39 | if __name__ == '__main__': 40 | 41 | print('finding possible duplicate content ...') 42 | 43 | input = sys.argv[1] 44 | output = sys.argv[2] 45 | 46 | hasher = minhash.MinHasher(seeds=100, char_ngram=5, hashbytes=4) 47 | lshcache = cache.Cache(bands=10, hasher=hasher) 48 | 49 | counter = 0 50 | url_doc = {} 51 | start_time = time.time() 52 | with open(input, 'r') as f: 53 | for line in f: 54 | try: 55 | myjson = json.loads(line) 56 | url = myjson['url'] 57 | text = myjson['text'] 58 | counter += 1 59 | url_doc[url] = text 60 | lshcache.add_fingerprint(hasher.fingerprint(text), url) 61 | except Exception as e: 62 | print('Error:', e) 63 | if counter % 10000 == 0: 64 | print(' [read]> processed {} documents in {:.2f} seconds ...'. 65 | format(counter, time.time() - start_time), flush=True) 66 | 67 | counter = 0 68 | start_time = time.time() 69 | deduped = 0 70 | with open(output, 'wb') as f: 71 | for b in lshcache.bins: 72 | for bucket_id in b: 73 | if len(b[bucket_id]) > 1: 74 | items = list(b[bucket_id]) 75 | main_url = items[0] 76 | main_shingles = shingles(url_doc[main_url]) 77 | remove_urls = [] 78 | for i in range(1, len(items)): 79 | counter += 1 80 | other_url = items[i] 81 | other_shingles = shingles(url_doc[other_url]) 82 | try: 83 | jaccard_sim = jaccard(main_shingles, other_shingles) 84 | except Exception as e: 85 | print('Error:', e) 86 | if jaccard_sim > 0.5: 87 | remove_urls.append({other_url: jaccard_sim}) 88 | deduped += 1 89 | if counter % 10000 == 0: 90 | print(' [write]> processed {} documents in {:.2f} ' 91 | 'seconds and deduped {} documents ...'. 92 | format(counter, time.time() - start_time, 93 | deduped), flush=True) 94 | if len(remove_urls) > 0: 95 | myjson = json.dumps({main_url: remove_urls}, 96 | ensure_ascii=False) 97 | f.write(myjson.encode('utf-8')) 98 | f.write('\n'.encode('utf-8')) 99 | 100 | print('done :-)') 101 | -------------------------------------------------------------------------------- /scripts/split_json.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes a corpus of files (specified by `--input_files`) with json data separated 3 | by newlines (loose json). Splits data into train.json, val.json, test.json files 4 | under `output_dir`.
5 | 6 | Note: This code has the potential to override files with the names 7 | train.json, val.json, test.json in `--output_dir`. 8 | """ 9 | import os 10 | import argparse 11 | import math 12 | import random 13 | 14 | parser = argparse.ArgumentParser('resplit loose json data into train/val/test') 15 | parser.add_argument('--input_files', nargs='+', required=True, 16 | help='whitespace separated list of input data files') 17 | parser.add_argument('--output_dir', required=True, 18 | help='output directory where to put files') 19 | parser.add_argument('--test_percent', type=float, nargs='+', default=[0.05, 0], 20 | help='percentage of available data to use for val/test dataset') 21 | args = parser.parse_args() 22 | 23 | def get_lines(filepath): 24 | lines = [] 25 | with open(filepath, 'r') as f: 26 | for i, l in enumerate(f.readlines()): 27 | l = l.strip() 28 | lines.append(l) 29 | return lines 30 | 31 | def get_splits(lines, line_counts): 32 | all_lines = [] 33 | line_idx = [] 34 | file_mappings = [] 35 | for i, l in enumerate(lines): 36 | all_lines.extend(l) 37 | line_idx.extend(list(range(len(l)))) 38 | file_mappings.extend([i]*len(l)) 39 | 40 | indices = list(range(len(all_lines))) 41 | random.shuffle(indices) 42 | all_lines = [all_lines[idx] for idx in indices] 43 | line_idx = [line_idx[idx] for idx in indices] 44 | file_mappings = [file_mappings[idx] for idx in indices] 45 | 46 | splits = [] 47 | mappings = [] 48 | start = 0 49 | for end in line_counts: 50 | end += start 51 | splits.append(all_lines[start:end]) 52 | mappings.append(format_mappings(line_idx[start:end], file_mappings[start:end])) 53 | start = end 54 | return splits, mappings 55 | 56 | def format_mappings(line_idx, file_mappings): 57 | lines = [] 58 | for m, l in zip(file_mappings, line_idx): 59 | lines.append(str(m).strip()+'\t'+str(l).strip()) 60 | return lines 61 | 62 | 63 | def get_filepaths(filepaths, output_dir): 64 | paths = [] 65 | train_path = 'train.json' 66 | dev_path = 'dev.json' 67 | test_path = 'test.json' 68 | paths.append(os.path.join(output_dir, train_path)) 69 | paths.append(os.path.join(output_dir, dev_path)) 70 | paths.append(os.path.join(output_dir, test_path)) 71 | return paths 72 | 73 | def write_files(lines, mappings, filepaths): 74 | for l, m, path in zip(lines, mappings, filepaths): 75 | write_file(l, path) 76 | write_mapping_file(m, path) 77 | 78 | def write_file(lines, path): 79 | print('Writing:', path) 80 | with open(path, 'w') as f: 81 | for l in lines: 82 | f.write(l+'\n') 83 | 84 | def write_mapping_file(m, path): 85 | path = path+'.map' 86 | m = [get_mapping_header()]+m 87 | write_file(m, path) 88 | 89 | def get_mapping_header(): 90 | return 'file\tline #' 91 | 92 | if not os.path.exists(args.output_dir): 93 | os.makedirs(args.output_dir) 94 | 95 | lines = [] 96 | 97 | for filepath in args.input_files: 98 | _lines = get_lines(filepath) 99 | lines.append(_lines) 100 | 101 | #calculate number of lines to use for each 102 | line_counts = [len(l) for l in lines] 103 | total_lines = sum(line_counts) 104 | dev_percent = args.test_percent[0] 105 | dev_lines = math.ceil(dev_percent*total_lines) 106 | test_percent = 0 107 | if len(args.test_percent)==2: 108 | test_percent=args.test_percent[1] 109 | test_lines = math.ceil(test_percent*total_lines) 110 | train_lines = total_lines-(test_lines+dev_lines) 111 | normed_lines = [train_lines, dev_lines, test_lines] 112 | normed_lines = [int(l) for l in normed_lines] 113 | 114 | 115 | splits, mappings = get_splits(lines, normed_lines) 116 | filepaths = 
get_filepaths(args.input_files, args.output_dir) 117 | print('Writing output to:', filepaths) 118 | write_files(splits, mappings, filepaths) 119 | 120 | -------------------------------------------------------------------------------- /mpu/tests/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import random 17 | import sys 18 | sys.path.append("../..") 19 | 20 | import torch 21 | import torch.nn.functional as F 22 | import mpu 23 | from mpu.cross_entropy import vocab_parallel_cross_entropy 24 | 25 | from commons import initialize_distributed 26 | from commons import print_separator 27 | from commons import IdentityLayer 28 | from commons import set_random_seed 29 | 30 | 31 | def torch_cross_entropy(batch_size, seq_length, vocab_size, 32 | logits_scale, seed): 33 | set_random_seed(seed) 34 | identity = IdentityLayer((batch_size, seq_length, vocab_size), 35 | scale=logits_scale).cuda() 36 | logits = identity() 37 | target = torch.cuda.LongTensor( 38 | size=(batch_size, seq_length)).random_(0, vocab_size) 39 | loss = F.cross_entropy(logits.view(-1, logits.size()[-1]), 40 | target.view(-1), 41 | reduction='none').view_as(target).mean() 42 | loss.backward() 43 | return loss, identity.weight.grad 44 | 45 | 46 | def mpu_cross_entropy(batch_size, seq_length, vocab_size, 47 | logits_scale, seed): 48 | set_random_seed(seed) 49 | identity = IdentityLayer((batch_size, seq_length, vocab_size), 50 | scale=logits_scale).cuda() 51 | logits = identity() 52 | logits_parallel = mpu.scatter_to_model_parallel_region(logits) 53 | target = torch.cuda.LongTensor( 54 | size=(batch_size, seq_length)).random_(0, vocab_size) 55 | loss = vocab_parallel_cross_entropy(logits_parallel, target).mean() 56 | loss.backward() 57 | return loss, identity.weight.grad 58 | 59 | 60 | def test_cross_entropy(model_parallel_size): 61 | 62 | if torch.distributed.get_rank() == 0: 63 | print('> testing cross entropy with model parallel size {} ...'. 
64 | format(model_parallel_size)) 65 | 66 | mpu.initialize_model_parallel(model_parallel_size) 67 | model_parallel_size = mpu.get_model_parallel_world_size() 68 | 69 | batch_size = 13 70 | seq_length = 17 71 | vocab_size_per_partition = 11 72 | logits_scale = 1000.0 73 | vocab_size = vocab_size_per_partition * model_parallel_size 74 | seed = 1234 75 | 76 | loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, 77 | vocab_size, logits_scale, 78 | seed) 79 | loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, 80 | vocab_size, logits_scale, 81 | seed) 82 | 83 | error = loss_torch.sub_(loss_mpu).abs().max() 84 | print(' max error in loss on global rank {}: {}'.format( 85 | torch.distributed.get_rank(), error)) 86 | assert error < 1.0e-6 87 | 88 | error = grad_torch.sub_(grad_mpu).abs().max() 89 | print(' max error in grad on global rank {}: {}'.format( 90 | torch.distributed.get_rank(), error)) 91 | assert error < 1.0e-6 92 | 93 | # Reset groups 94 | mpu.destroy_model_parallel() 95 | 96 | torch.distributed.barrier() 97 | if torch.distributed.get_rank() == 0: 98 | print('>> passed the test :-)') 99 | 100 | 101 | if __name__ == '__main__': 102 | 103 | initialize_distributed() 104 | world_size = torch.distributed.get_world_size() 105 | 106 | model_parallel_size = 1 107 | while model_parallel_size <= world_size: 108 | print_separator('test cross entropy') 109 | test_cross_entropy(model_parallel_size) 110 | model_parallel_size *= 2 111 | -------------------------------------------------------------------------------- /mpu/data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | from .initialize import get_model_parallel_group 19 | from .initialize import get_model_parallel_rank 20 | from .initialize import get_model_parallel_src_rank 21 | 22 | 23 | _MAX_DATA_DIM = 4 24 | 25 | 26 | def _check_data_types(keys, data, target_dtype): 27 | """Check that all the keys have the same target data type.""" 28 | for key in keys: 29 | assert data[key].dtype == target_dtype, '{} has data type {} which '\ 30 | 'is different than {}'.format(key, data[key].dtype, target_dtype) 31 | 32 | 33 | def _build_key_size_numel_dictionaries(keys, data): 34 | """Build the size on rank 0 and broadcast.""" 35 | max_dim = _MAX_DATA_DIM 36 | sizes = [0 for _ in range(max_dim) for _ in keys] 37 | 38 | # Pack the sizes on rank zero. 39 | if get_model_parallel_rank() == 0: 40 | offset = 0 41 | for key in keys: 42 | assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM' 43 | size = data[key].size() 44 | for i, s in enumerate(size): 45 | sizes[i + offset] = s 46 | offset += max_dim 47 | 48 | # Move to GPU and broadcast. 
49 | sizes_cuda = torch.cuda.LongTensor(sizes) 50 | torch.distributed.broadcast(sizes_cuda, get_model_parallel_src_rank(), 51 | group=get_model_parallel_group()) 52 | 53 | # Move back to cpu and unpack. 54 | sizes_cpu = sizes_cuda.cpu() 55 | key_size = {} 56 | key_numel = {} 57 | total_numel = 0 58 | offset = 0 59 | for key in keys: 60 | i = 0 61 | size = [] 62 | numel = 1 63 | while sizes_cpu[offset + i] > 0: 64 | this_size = sizes_cpu[offset + i] 65 | size.append(this_size) 66 | numel *= this_size 67 | i += 1 68 | key_size[key] = size 69 | key_numel[key] = numel 70 | total_numel += numel 71 | offset += max_dim 72 | 73 | return key_size, key_numel, total_numel 74 | 75 | 76 | def broadcast_data(keys, data, datatype): 77 | """Broadcast data from rank zero of each model parallel group to the 78 | members of the same model parallel group. 79 | 80 | Arguments: 81 | keys: list of keys in the data disctionary to be broadcasted 82 | data: data dictionary of string keys and cpu tensor values. 83 | datatype: torch data type of all tensors in data associated 84 | with keys. 85 | """ 86 | # Build (key, size) and (key, number of elements) dictionaries along 87 | # with the total number of elements on all ranks. 88 | key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, 89 | data) 90 | 91 | # Pack on rank zero. 92 | if get_model_parallel_rank() == 0: 93 | # Check that all keys have the same data type. 94 | _check_data_types(keys, data, datatype) 95 | # Flatten the data associated with the keys 96 | flatten_data = torch.cat( 97 | [data[key].contiguous().view(-1) for key in keys], dim=0).cuda() 98 | else: 99 | flatten_data = torch.empty(total_numel, 100 | device=torch.cuda.current_device(), 101 | dtype=datatype) 102 | 103 | # Boradcast 104 | torch.distributed.broadcast(flatten_data, get_model_parallel_src_rank(), 105 | group=get_model_parallel_group()) 106 | 107 | # Unpack 108 | output = {} 109 | offset = 0 110 | for key in keys: 111 | size = key_size[key] 112 | numel = key_numel[key] 113 | output[key] = flatten_data.narrow(0, offset, numel).view(size) 114 | offset += numel 115 | 116 | return output 117 | -------------------------------------------------------------------------------- /openwebtext/cleanup_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
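broadcast_data in mpu/data.py above packs the rank-0 tensors for the listed keys into one flat buffer, broadcasts it inside the model parallel group, and unpacks it on the other ranks. A hedged usage sketch follows; it assumes torch.distributed and mpu.initialize_model_parallel have already been set up elsewhere, and the key names and shapes are illustrative only.

import torch
import mpu

keys = ['text', 'types']
if mpu.get_model_parallel_rank() == 0:
    # Only the source rank needs real values; every key must share the dtype
    # passed as the third argument.
    data = {'text': torch.randint(0, 50000, (4, 1024), dtype=torch.int64),
            'types': torch.zeros(4, 1024, dtype=torch.int64)}
else:
    data = None  # ignored on non-source ranks
output = mpu.broadcast_data(keys, data, torch.int64)
# Every rank in the model parallel group now holds identical GPU tensors in
# output['text'] and output['types'].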
15 | 16 | 17 | import ftfy 18 | import json 19 | from langdetect import detect 20 | import numpy as np 21 | import time 22 | import os 23 | import sys 24 | 25 | from tokenizer import Tokenizer 26 | 27 | MIN_DOCUMENT_LENGHT = 128 28 | 29 | 30 | def print_progress(prefix, start_time, num_docs, num_fixed_text, 31 | num_non_english_docs, chars_non_english_docs, 32 | num_small_docs, chars_small_docs): 33 | 34 | string = prefix + ' | ' 35 | string += 'elapsed time: {:.2f} | '.format(time.time() - start_time) 36 | string += 'documents: {} | '.format(num_docs) 37 | string += 'fixed text: {} | '.format(num_fixed_text) 38 | string += 'non-english: {} | '.format(num_non_english_docs) 39 | string += 'non-english chars: {} | '.format(chars_non_english_docs) 40 | string += 'small docs: {} | '.format(num_small_docs) 41 | string += 'small docs chars: {}'.format(chars_small_docs) 42 | print(string, flush=True) 43 | 44 | 45 | def filter_corpus(filename, out_filename, print_interval=10000): 46 | 47 | print(' > filtering {}'.format(filename)) 48 | 49 | tokenizer = Tokenizer(cache_dir='./cache') 50 | 51 | num_docs = 0 52 | num_written_docs = 0 53 | num_small_docs = 0 54 | num_fixed_text = 0 55 | num_non_english_docs = 0 56 | chars_non_english_docs = 0 57 | chars_small_docs = 0 58 | start_time = time.time() 59 | with open(out_filename, 'wb') as f: 60 | with open(filename, 'r') as fin: 61 | for line in fin: 62 | try: 63 | num_docs += 1 64 | myjson = json.loads(line) 65 | # Fix text 66 | text = ftfy.fix_text(myjson['text']) 67 | if text != myjson['text']: 68 | num_fixed_text += 1 69 | myjson['text'] = text 70 | # Detect language. 71 | if detect(text) != 'en': 72 | print('[non-english text]', myjson) 73 | num_non_english_docs += 1 74 | chars_non_english_docs += len(text) 75 | continue 76 | # On average each token is 5 characters so 8 is an 77 | # upper bound. 78 | if len(text) < (8 * MIN_DOCUMENT_LENGHT): 79 | tokens = tokenizer.tokenize_document(text) 80 | if len(tokens) < MIN_DOCUMENT_LENGHT: 81 | print('[small document, skipping]:', myjson) 82 | num_small_docs += 1 83 | chars_small_docs += len(text) 84 | continue 85 | myjson = json.dumps(myjson, ensure_ascii=False) 86 | f.write(myjson.encode('utf-8')) 87 | f.write('\n'.encode('utf-8')) 88 | num_written_docs += 1 89 | if num_docs % print_interval == 0: 90 | print_progress('[PROGRESS]', start_time, num_docs, 91 | num_fixed_text, num_non_english_docs, 92 | chars_non_english_docs, 93 | num_small_docs, chars_small_docs) 94 | except Exception as e: 95 | print(' skipping ', line, e) 96 | 97 | print_progress('[FINAL]', start_time, num_docs, 98 | num_fixed_text, num_non_english_docs, 99 | chars_non_english_docs, 100 | num_small_docs, chars_small_docs) 101 | 102 | 103 | if __name__ == '__main__': 104 | 105 | print('building gpt2 dataset ...') 106 | 107 | input_filename = sys.argv[1] 108 | output_filename = sys.argv[2] 109 | 110 | print('will be reading {}'.format(input_filename)) 111 | print('and will write the results to {}'.format(output_filename)) 112 | 113 | filter_corpus(input_filename, output_filename) 114 | 115 | 116 | -------------------------------------------------------------------------------- /mpu/mappings.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
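cleanup_dataset.py above expects loose JSON, one document per line, with at least a 'text' field; any other fields in the record are passed through untouched. A hedged sketch of preparing such an input file and invoking the script (the file names and fields are placeholders):

import json

docs = [
    {"url": "http://example.com/a", "text": "A sufficiently long English document ..."},
    {"url": "http://example.com/b", "text": "too short"},  # likely dropped by the length filter
]
with open('input.json', 'w') as f:
    for doc in docs:
        f.write(json.dumps(doc, ensure_ascii=False) + '\n')

# Then, from the openwebtext directory:
#   python cleanup_dataset.py input.json output.json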
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | from .initialize import get_model_parallel_group 19 | from .utils import split_tensor_along_last_dim 20 | 21 | 22 | def _reduce(input_): 23 | """All-reduce the the input tensor across model parallel group.""" 24 | group = get_model_parallel_group() 25 | 26 | # Bypass the function if we are using only 1 GPU. 27 | if torch.distributed.get_world_size(group=group) == 1: 28 | return input_ 29 | 30 | # All-reduce. 31 | torch.distributed.all_reduce(input_, group=group) 32 | 33 | return input_ 34 | 35 | 36 | def _split(input_): 37 | """Split the tensor along its last dimension and keep the 38 | corresponding slice.""" 39 | group = get_model_parallel_group() 40 | 41 | # Bypass the function if we are using only 1 GPU. 42 | if torch.distributed.get_world_size(group=group) == 1: 43 | return input_ 44 | 45 | # Split along last dimension. 46 | world_size = torch.distributed.get_world_size(group=group) 47 | input_list = split_tensor_along_last_dim(input_, world_size) 48 | 49 | # Note: torch.split does not create contiguous tensors by default. 50 | rank = torch.distributed.get_rank(group=group) 51 | output = input_list[rank].contiguous() 52 | 53 | return output 54 | 55 | 56 | def _gather(input_): 57 | """Gather tensors and concatinate along the last dimension.""" 58 | group = get_model_parallel_group() 59 | 60 | # Bypass the function if we are using only 1 GPU. 61 | if torch.distributed.get_world_size(group=group) == 1: 62 | return input_ 63 | 64 | # Size and dimension. 65 | last_dim = input_.dim() - 1 66 | rank = torch.distributed.get_rank(group=group) 67 | world_size = torch.distributed.get_world_size(group=group) 68 | 69 | tensor_list = [torch.empty_like(input_) for _ in range(world_size)] 70 | tensor_list[rank] = input_ 71 | torch.distributed.all_gather(tensor_list, input_, group=group) 72 | 73 | # Note: torch.cat already creates a contiguous tensor. 
74 | output = torch.cat(tensor_list, dim=last_dim).contiguous() 75 | 76 | return output 77 | 78 | 79 | class _CopyToModelParallelRegion(torch.autograd.Function): 80 | """Pass the input to the model parallel region.""" 81 | 82 | @staticmethod 83 | def forward(ctx, input_): 84 | return input_ 85 | 86 | @staticmethod 87 | def backward(ctx, grad_output): 88 | return _reduce(grad_output) 89 | 90 | 91 | class _ReduceFromModelParallelRegion(torch.autograd.Function): 92 | """All-redcue the input from the model parallel region.""" 93 | 94 | @staticmethod 95 | def forward(ctx, input_): 96 | return _reduce(input_) 97 | 98 | @staticmethod 99 | def backward(ctx, grad_output): 100 | return grad_output 101 | 102 | 103 | class _ScatterToModelParallelRegion(torch.autograd.Function): 104 | """Split the input and keep only the corresponding chuck to the rank.""" 105 | 106 | @staticmethod 107 | def forward(ctx, input_): 108 | return _split(input_) 109 | 110 | @staticmethod 111 | def backward(ctx, grad_output): 112 | return _gather(grad_output) 113 | 114 | 115 | class _GatherFromModelParallelRegion(torch.autograd.Function): 116 | """Gather the input from model parallel region and concatinate.""" 117 | 118 | @staticmethod 119 | def forward(ctx, input_): 120 | return _gather(input_) 121 | 122 | @staticmethod 123 | def backward(ctx, grad_output): 124 | return _split(grad_output) 125 | 126 | 127 | # ----------------- 128 | # Helper functions. 129 | # ----------------- 130 | 131 | def copy_to_model_parallel_region(input_): 132 | return _CopyToModelParallelRegion.apply(input_) 133 | 134 | def reduce_from_model_parallel_region(input_): 135 | return _ReduceFromModelParallelRegion.apply(input_) 136 | 137 | def scatter_to_model_parallel_region(input_): 138 | return _ScatterToModelParallelRegion.apply(input_) 139 | 140 | def gather_from_model_parallel_region(input_): 141 | return _GatherFromModelParallelRegion.apply(input_) 142 | -------------------------------------------------------------------------------- /model/gpt2_modeling.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """GPT-2 model.""" 17 | 18 | import torch 19 | import torch.nn.functional as F 20 | 21 | import mpu 22 | 23 | 24 | def init_method_normal(std=0.02): 25 | """Init method based on normal distribution. 26 | 27 | This is only used for embeddings. The transformer has its 28 | own initializer. 29 | """ 30 | def init_(tensor): 31 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 32 | return init_ 33 | 34 | 35 | class GPT2Model(torch.nn.Module): 36 | """GPT-2 Language model. 37 | 38 | The output of the forward method are the logits (parallel or 39 | serial depending on the `parallel_output` flag. 
40 | """ 41 | 42 | def __init__(self, 43 | num_layers, 44 | vocab_size, 45 | hidden_size, 46 | num_attention_heads, 47 | embedding_dropout_prob, 48 | attention_dropout_prob, 49 | output_dropout_prob, 50 | max_sequence_length, 51 | checkpoint_activations, 52 | checkpoint_num_layers=1, 53 | parallel_output=True): 54 | 55 | super(GPT2Model, self).__init__() 56 | 57 | self.parallel_output = parallel_output 58 | 59 | init_method = init_method_normal(std=0.02) 60 | 61 | # Word embeddings (parallel). 62 | self.word_embeddings = mpu.VocabParallelEmbedding( 63 | vocab_size, hidden_size, init_method=init_method) 64 | 65 | # Position embedding (serial). 66 | self.position_embeddings = torch.nn.Embedding(max_sequence_length, 67 | hidden_size) 68 | # Initialize the position embeddings. 69 | init_method(self.position_embeddings.weight) 70 | 71 | # Embeddings dropout 72 | self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) 73 | 74 | # Transformer 75 | self.transformer = mpu.GPT2ParallelTransformer(num_layers, 76 | hidden_size, 77 | num_attention_heads, 78 | attention_dropout_prob, 79 | output_dropout_prob, 80 | checkpoint_activations, 81 | checkpoint_num_layers) 82 | 83 | def forward(self, input_ids, position_ids, attention_mask): 84 | 85 | # Embeddings. 86 | words_embeddings = self.word_embeddings(input_ids) 87 | position_embeddings = self.position_embeddings(position_ids) 88 | embeddings = words_embeddings + position_embeddings 89 | 90 | # Dropout. 91 | embeddings = self.embedding_dropout(embeddings) 92 | 93 | # Transformer. 94 | transformer_output = self.transformer(embeddings, attention_mask) 95 | 96 | # Parallel logits. 97 | transformer_output_parallel = mpu.copy_to_model_parallel_region( 98 | transformer_output) 99 | logits_parallel = F.linear(transformer_output_parallel, 100 | self.word_embeddings.weight) 101 | 102 | if self.parallel_output: 103 | return logits_parallel 104 | 105 | return mpu.gather_from_model_parallel_region(logits_parallel) 106 | 107 | 108 | def gpt2_get_params_for_weight_decay_optimization(module): 109 | 110 | weight_decay_params = {'params': []} 111 | no_weight_decay_params = {'params': [], 'weight_decay': 0.0} 112 | for module_ in module.modules(): 113 | if isinstance(module_, (mpu.LayerNorm, torch.nn.LayerNorm)): 114 | no_weight_decay_params['params'].extend( 115 | [p for p in list(module_._parameters.values()) 116 | if p is not None]) 117 | else: 118 | weight_decay_params['params'].extend( 119 | [p for n, p in list(module_._parameters.items()) 120 | if p is not None and n != 'bias']) 121 | no_weight_decay_params['params'].extend( 122 | [p for n, p in list(module_._parameters.items()) 123 | if p is not None and n == 'bias']) 124 | 125 | return weight_decay_params, no_weight_decay_params 126 | -------------------------------------------------------------------------------- /mpu/cross_entropy.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | from .initialize import get_model_parallel_group 20 | from .initialize import get_model_parallel_rank 21 | from .initialize import get_model_parallel_world_size 22 | from .utils import VocabUtility 23 | 24 | 25 | class _VocabParallelCrossEntropy(torch.autograd.Function): 26 | 27 | @staticmethod 28 | def forward(ctx, vocab_parallel_logits, target): 29 | 30 | # Copy so the input remains unchanged. 31 | logits = vocab_parallel_logits.clone() 32 | # Maximum value along vocab dimension across all GPUs. 33 | logits_max = torch.max(logits, dim=-1)[0] 34 | torch.distributed.all_reduce(logits_max, 35 | op=torch.distributed.ReduceOp.MAX, 36 | group=get_model_parallel_group()) 37 | # Subtract the maximum value. 38 | logits.sub_(logits_max.unsqueeze(dim=-1)) 39 | # Sum of exponential of logits along vocab dimension across all GPUs. 40 | exp_logits = logits.exp() 41 | sum_exp_logits = exp_logits.sum(dim=-1) 42 | torch.distributed.all_reduce(sum_exp_logits, 43 | op=torch.distributed.ReduceOp.SUM, 44 | group=get_model_parallel_group()) 45 | 46 | # Get the partition's vocab indecies 47 | get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size 48 | partition_vocab_size = vocab_parallel_logits.size()[-1] 49 | rank = get_model_parallel_rank() 50 | world_size = get_model_parallel_world_size() 51 | vocab_start_index, vocab_end_index = get_vocab_range( 52 | partition_vocab_size, rank, world_size) 53 | 54 | # Create a mask of valid vocab ids (1 means it needs to be masked). 55 | target_mask = (target < vocab_start_index) | (target >= vocab_end_index) 56 | masked_target = target.clone() - vocab_start_index 57 | masked_target[target_mask] = 0 58 | 59 | # Get predicted-logits = logits[target]. 60 | # For Simplicity, we convert logits to a 2-D tensor with size 61 | # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. 62 | logits_2d = logits.view(-1, partition_vocab_size) 63 | masked_target_1d = masked_target.view(-1) 64 | arange_1d = torch.arange(start=0, end=logits_2d.size()[0], 65 | device=logits_2d.device) 66 | predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] 67 | predicted_logits = predicted_logits_1d.view_as(target) 68 | predicted_logits[target_mask] = 0.0 69 | # All reduce is needed to get the chunks from other GPUs. 70 | torch.distributed.all_reduce(predicted_logits, 71 | op=torch.distributed.ReduceOp.SUM, 72 | group=get_model_parallel_group()) 73 | 74 | # Loss = log(sum(exp(logits))) - predicted-logit. 75 | loss = torch.log(sum_exp_logits) - predicted_logits 76 | 77 | # Store softmax, target-mask and masked-target for backward pass. 78 | exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) 79 | ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) 80 | 81 | return loss 82 | 83 | @staticmethod 84 | def backward(ctx, grad_output): 85 | 86 | # Retreive tensors from the forward path. 87 | softmax, target_mask, masked_target_1d = ctx.saved_tensors 88 | 89 | # All the inputs have softmax as thier gradient. 90 | grad_input = softmax 91 | # For simplicity, work with the 2D gradient. 92 | partition_vocab_size = softmax.size()[-1] 93 | grad_2d = grad_input.view(-1, partition_vocab_size) 94 | 95 | # Add the gradient from matching classes. 
96 | arange_1d = torch.arange(start=0, end=grad_2d.size()[0], 97 | device=grad_2d.device) 98 | grad_2d[arange_1d, masked_target_1d] -= ( 99 | 1.0 - target_mask.view(-1).float()) 100 | 101 | # Finally elementwise multiplication with the output gradients. 102 | grad_input.mul_(grad_output.unsqueeze(dim=-1)) 103 | 104 | return grad_input, None 105 | 106 | 107 | def vocab_parallel_cross_entropy(vocab_parallel_logits, target): 108 | """Helper function for the cross entropy.""" 109 | return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) 110 | -------------------------------------------------------------------------------- /model/distributed.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors 18 | import torch.distributed as dist 19 | from torch.nn.modules import Module 20 | from torch.autograd import Variable 21 | 22 | import mpu 23 | 24 | class DistributedDataParallel(Module): 25 | 26 | def __init__(self, module): 27 | super(DistributedDataParallel, self).__init__() 28 | self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False 29 | 30 | self.module = module 31 | self.data_parallel_group = mpu.get_data_parallel_group() 32 | src_rank = mpu.get_model_parallel_rank() 33 | for p in self.module.parameters(): 34 | if torch.is_tensor(p): 35 | dist.broadcast(p, src_rank, group=self.data_parallel_group) 36 | 37 | def allreduce_params(reduce_after=True, no_scale=False, fp32_allreduce=False): 38 | if(self.needs_reduction): 39 | self.needs_reduction = False 40 | buckets = {} 41 | for name, param in self.module.named_parameters(): 42 | if param.requires_grad and param.grad is not None: 43 | tp = (param.data.type()) 44 | if tp not in buckets: 45 | buckets[tp] = [] 46 | buckets[tp].append(param) 47 | if self.warn_on_half: 48 | if torch.cuda.HalfTensor in buckets: 49 | print("WARNING: gloo dist backend for half parameters may be extremely slow." 
+ 50 | " It is recommended to use the NCCL backend in this case.") 51 | self.warn_on_half = False 52 | for tp in buckets: 53 | bucket = buckets[tp] 54 | grads = [param.grad.data for param in bucket] 55 | coalesced = _flatten_dense_tensors(grads) 56 | if fp32_allreduce: 57 | coalesced = coalesced.float() 58 | if not no_scale and not reduce_after: 59 | coalesced /= dist.get_world_size(group=self.data_parallel_group) 60 | dist.all_reduce(coalesced, group=self.data_parallel_group) 61 | torch.cuda.synchronize() 62 | if not no_scale and reduce_after: 63 | coalesced /= dist.get_world_size(group=self.data_parallel_group) 64 | for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): 65 | buf.copy_(synced) 66 | self.hook_handles = [] 67 | self.hooks = [] 68 | for param in list(self.module.parameters()): 69 | def allreduce_hook(*unused): 70 | Variable._execution_engine.queue_callback(allreduce_params) 71 | # handle = param.register_hook(allreduce_hook) 72 | #self.hooks.append(allreduce_hook) 73 | #self.hook_handles.append(handle) 74 | self.allreduce_params = allreduce_params 75 | 76 | def forward(self, *inputs, **kwargs): 77 | self.needs_reduction = True 78 | return self.module(*inputs, **kwargs) 79 | 80 | def state_dict(self, destination=None, prefix='', keep_vars=False): 81 | #[h.remove() for h in self.hook_handles] 82 | sd = self.module.state_dict(destination, prefix, keep_vars) 83 | # for handle, hook in zip(self.hook_handles, self.hooks): 84 | # d = handle.hooks_dict_ref() 85 | # d[handle.id] = hook 86 | 87 | return sd 88 | 89 | def load_state_dict(self, state_dict, strict=True): 90 | self.module.load_state_dict(state_dict, strict=strict) 91 | 92 | ''' 93 | def _sync_buffers(self): 94 | buffers = list(self.module._all_buffers()) 95 | if len(buffers) > 0: 96 | # cross-node buffer sync 97 | flat_buffers = _flatten_dense_tensors(buffers) 98 | dist.broadcast(flat_buffers, 0) 99 | for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): 100 | buf.copy_(synced) 101 | def train(self, mode=True): 102 | # Clear NCCL communicator and CUDA event cache of the default group ID, 103 | # These cache will be recreated at the later call. This is currently a 104 | # work-around for a potential NCCL deadlock. 105 | if dist._backend == dist.dist_backend.NCCL: 106 | dist._clear_group_cache() 107 | super(DistributedDataParallel, self).train(mode) 108 | self.module.train(mode) 109 | ''' 110 | 111 | -------------------------------------------------------------------------------- /mpu/initialize.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | """Model and data parallel groups.""" 18 | 19 | import torch 20 | 21 | from .utils import ensure_divisibility 22 | 23 | 24 | # Model parallel group that the current rank belongs to. 
25 | _MODEL_PARALLEL_GROUP = None 26 | # Data parallel group that the current rank belongs to. 27 | _DATA_PARALLEL_GROUP = None 28 | 29 | 30 | def initialize_model_parallel(model_parallel_size_): 31 | """ 32 | Initialize model data parallel groups. 33 | 34 | Arguments: 35 | model_parallel_size: number of GPUs used to parallelize model. 36 | 37 | Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we 38 | use 2 GPUs to parallelize the model. The present function will 39 | create 4 model parallel groups and 2 data parallel grous as: 40 | 4 model parallel groups: 41 | [g0, g1], [g2, g3], [g4, g5], [g6, g7] 42 | 2 data parallel groups: 43 | [g0, g2, g4, g6], [g1, g3, g5, g7] 44 | Note that for efficiency, the caller should make sure adjacent ranks 45 | are on the same DGX box. For example if we are using 2 DGX-1 boxes 46 | with a total of 16 GPUs, rank 0 to 7 belong to the first box and 47 | ranks 8 to 15 belong to the second box. 48 | """ 49 | if torch.distributed.get_rank() == 0: 50 | print('> initializing model parallel with size {}'.format( 51 | model_parallel_size_)) 52 | # Get world size and rank. Ensure some consistencies. 53 | assert torch.distributed.is_initialized() 54 | world_size = torch.distributed.get_world_size() 55 | model_parallel_size = min(model_parallel_size_, world_size) 56 | ensure_divisibility(world_size, model_parallel_size) 57 | rank = torch.distributed.get_rank() 58 | 59 | # Build the data parallel groups. 60 | global _DATA_PARALLEL_GROUP 61 | assert _DATA_PARALLEL_GROUP is None, \ 62 | 'data parallel group is already initialized' 63 | for i in range(model_parallel_size): 64 | ranks = range(i, world_size, model_parallel_size) 65 | group = torch.distributed.new_group(ranks) 66 | if i == (rank % model_parallel_size): 67 | _DATA_PARALLEL_GROUP = group 68 | 69 | # Build the model parallel groups. 
70 | global _MODEL_PARALLEL_GROUP 71 | assert _MODEL_PARALLEL_GROUP is None, \ 72 | 'model parallel group is already initialized' 73 | for i in range(world_size // model_parallel_size): 74 | ranks = range(i * model_parallel_size, 75 | (i + 1) * model_parallel_size) 76 | group = torch.distributed.new_group(ranks) 77 | if i == (rank // model_parallel_size): 78 | _MODEL_PARALLEL_GROUP = group 79 | 80 | 81 | def model_parallel_is_initialized(): 82 | """Check if model and data parallel groups are initialized.""" 83 | if _MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None: 84 | return False 85 | return True 86 | 87 | 88 | def get_model_parallel_group(): 89 | """Get the model parallel group the caller rank belongs to.""" 90 | assert _MODEL_PARALLEL_GROUP is not None, \ 91 | 'model parallel group is not initialized' 92 | return _MODEL_PARALLEL_GROUP 93 | 94 | 95 | def get_data_parallel_group(): 96 | """Get the data parallel group the caller rank belongs to.""" 97 | assert _DATA_PARALLEL_GROUP is not None, \ 98 | 'data parallel group is not initialized' 99 | return _DATA_PARALLEL_GROUP 100 | 101 | 102 | def get_model_parallel_world_size(): 103 | """Return world size for the model parallel group.""" 104 | return torch.distributed.get_world_size(group=get_model_parallel_group()) 105 | 106 | 107 | def get_model_parallel_rank(): 108 | """Return my rank for the model parallel group.""" 109 | return torch.distributed.get_rank(group=get_model_parallel_group()) 110 | 111 | 112 | def get_model_parallel_src_rank(): 113 | """Calculate the global rank corresponding to a local rank zeor 114 | in the model parallel group.""" 115 | global_rank = torch.distributed.get_rank() 116 | local_world_size = get_model_parallel_world_size() 117 | return (global_rank // local_world_size) * local_world_size 118 | 119 | 120 | def get_data_parallel_world_size(): 121 | """Return world size for the data parallel group.""" 122 | return torch.distributed.get_world_size(group=get_data_parallel_group()) 123 | 124 | 125 | def get_data_parallel_rank(): 126 | """Return my rank for the data parallel group.""" 127 | return torch.distributed.get_rank(group=get_data_parallel_group()) 128 | 129 | 130 | def destroy_model_parallel(): 131 | """Set the groups to none.""" 132 | global _MODEL_PARALLEL_GROUP 133 | _MODEL_PARALLEL_GROUP = None 134 | global _DATA_PARALLEL_GROUP 135 | _DATA_PARALLEL_GROUP = None 136 | -------------------------------------------------------------------------------- /data_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
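The group layout built by initialize_model_parallel in mpu/initialize.py above is pure arithmetic on the global rank. The small sketch below reproduces the 8-GPU, model_parallel_size=2 example from the docstring, including the source rank computed by get_model_parallel_src_rank; it needs no torch.distributed and is only a bookkeeping illustration.

world_size = 8
model_parallel_size = 2

for rank in range(world_size):
    # Model parallel groups are contiguous chunks of ranks.
    model_group = [r for r in range(world_size)
                   if r // model_parallel_size == rank // model_parallel_size]
    # Data parallel groups stride across those chunks.
    data_group = [r for r in range(world_size)
                  if r % model_parallel_size == rank % model_parallel_size]
    src_rank = (rank // model_parallel_size) * model_parallel_size
    print(rank, model_group, data_group, src_rank)

# e.g. rank 5 -> model group [4, 5], data group [1, 3, 5, 7], src rank 4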
15 | """utils for creating datasets""" 16 | import os 17 | import math 18 | 19 | from .samplers import DistributedBatchSampler 20 | from .datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, bert_sentencepair_dataset, GPT2Dataset 21 | from .lazy_loader import exists_lazy, make_lazy, lazy_array_loader 22 | from .tokenization import Tokenization, CommandToken, Tokenizer, CharacterLevelTokenizer, BertWordPieceTokenizer, GPT2BPETokenizer, make_tokenizer 23 | from . import corpora 24 | 25 | TRAIN_DATA = 0 26 | VAL_DATA = 1 27 | TEST_DATA = 2 28 | 29 | def should_split(split): 30 | """ 31 | given split proportions checks if should split 32 | Examples: 33 | >>> should_split([10,0,0]) 34 | False 35 | >>> should_split([1,.1,.2]) 36 | True 37 | """ 38 | return max(split)/sum(split) != 1. 39 | 40 | def get_ext(path): 41 | """gets path extension""" 42 | return os.path.splitext(path)[1] 43 | 44 | def get_dataset(path, **kwargs): 45 | """gets dataset object based on keyword args and file at `path`""" 46 | if supported_corpus(path): 47 | return corpora.NAMED_CORPORA[path](**kwargs) 48 | ext = get_ext(path) 49 | if '.json' in ext: 50 | text = json_dataset(path, **kwargs) 51 | elif ext in ['.csv', '.tsv']: 52 | text = csv_dataset(path, **kwargs) 53 | else: 54 | raise NotImplementedError('data file type %s is not supported'%(ext)) 55 | return text 56 | 57 | def supported_corpus(corpus_name): 58 | """checks if corpus name is defined in `corpora.py`""" 59 | return corpus_name in corpora.NAMED_CORPORA 60 | 61 | def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.], 62 | delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None, 63 | tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None, 64 | model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None, **kwargs): 65 | """function to create datasets+tokenizers for common options""" 66 | if isinstance(process_fn, str): 67 | process_fn = eval(process_fn) 68 | if non_binary_cols is not None: 69 | # multilabel dataset support (only for csvs) 70 | label_key = non_binary_cols 71 | def get_dataset_from_path(path_): 72 | if lazy: 73 | # get lazily loaded dataset 74 | named_corpora = False 75 | if supported_corpus(path_): 76 | named_corpora = True 77 | name = path_ 78 | path_ = corpora.NAMED_CORPORA[path_].PATH 79 | if not exists_lazy(path_, data_type='data'): 80 | # create cached version of dataset for lazy loading if it doesn't exist 81 | text = get_dataset(name if named_corpora else path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent, 82 | delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose) 83 | make_lazy(path_, text.X, data_type='data') 84 | text = lazy_array_loader(path_, data_type='data', map_fn=process_fn) 85 | else: 86 | # get dataset 87 | text = get_dataset(path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent, 88 | delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose, preprocess_fn=process_fn) 89 | return text 90 | # get one or multiple datasets and concatenate 91 | if isinstance(path, str): 92 | path = [path] 93 | datasets = [get_dataset_from_path(p) for p in path] 94 | if len(datasets) == 1: 95 | ds = datasets[0] 96 | else: 97 | ds = ConcatDataset(datasets) 98 | # make tokenizer for dataset 99 | if tokenizer is None: 100 | tokenizer = make_tokenizer(tokenizer_type, ds, tokenizer_model_path, vocab_size, model_type, 101 | pad_token, character_converage, 
**kwargs) 102 | 103 | ds_type = '' 104 | if 'ds_type' in kwargs: 105 | ds_type = kwargs['ds_type'] 106 | ds.SetTokenizer(tokenizer) 107 | # Split dataset into train/val/test (and wrap bert dataset) 108 | if should_split(split): 109 | ds = split_ds(ds, split) 110 | if ds_type.lower() == 'bert': 111 | presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False 112 | ds = [bert_sentencepair_dataset(d, max_seq_len=seq_length, presplit_sentences=presplit_sentences) if d is not None else None for d in ds] 113 | elif ds_type.lower() == 'gpt2': 114 | ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds] 115 | else: 116 | if ds_type.lower() == 'bert': 117 | presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False 118 | ds = bert_sentencepair_dataset(ds, max_seq_len=seq_length, presplit_sentences=presplit_sentences) 119 | elif ds_type.lower() == 'gpt2': 120 | ds = GPT2Dataset(ds, max_seq_len=seq_length) 121 | return ds, tokenizer 122 | -------------------------------------------------------------------------------- /data_utils/tf_dl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
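make_dataset in data_utils/__init__.py above ties together the corpus loaders, tokenizer construction, and train/val/test splitting. A hedged call sketch follows; the path, split proportions, and tokenizer settings are placeholders, and because should_split returns True for this split, a list of three datasets comes back, each wrapped as a GPT2Dataset.

from data_utils import make_dataset

(train, val, test), tokenizer = make_dataset(
    path='data/my_corpus.json',    # placeholder: loose json, one {'text': ...} record per line
    seq_length=1024,
    text_key='text',
    label_key='label',
    lazy=True,
    loose=True,
    split=[0.98, 0.01, 0.01],
    tokenizer_type='GPT2BPETokenizer',
    ds_type='gpt2')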
15 | """PyTorch DataLoader for TFRecords""" 16 | 17 | import queue 18 | import threading 19 | 20 | import tensorflow as tf 21 | tf.enable_eager_execution() 22 | import torch 23 | import numpy as np 24 | 25 | class TFRecordDataLoader(object): 26 | def __init__(self, records, batch_size, max_seq_len, max_preds_per_seq, train, num_workers=2, seed=1, threaded_dl=False): 27 | assert max_preds_per_seq is not None, "--max-preds-per-seq MUST BE SPECIFIED when using tfrecords" 28 | tf.set_random_seed(seed) 29 | if isinstance(records, str): 30 | records = [records] 31 | 32 | self.record_converter = Record2Example({"input_ids": tf.FixedLenFeature([max_seq_len], tf.int64), 33 | "input_mask": tf.FixedLenFeature([max_seq_len], tf.int64), 34 | "segment_ids": tf.FixedLenFeature([max_seq_len], tf.int64), 35 | "masked_lm_positions": tf.FixedLenFeature([max_preds_per_seq], tf.int64), 36 | "masked_lm_ids": tf.FixedLenFeature([max_preds_per_seq], tf.int64), 37 | "masked_lm_weights": tf.FixedLenFeature([max_preds_per_seq], tf.float32), 38 | "next_sentence_labels": tf.FixedLenFeature([1], tf.int64)}) 39 | 40 | #Instantiate dataset according to original BERT implementation 41 | if train: 42 | self.dataset = tf.data.Dataset.from_tensor_slices(tf.constant(records)) 43 | self.dataset = self.dataset.repeat() 44 | self.dataset = self.dataset.shuffle(buffer_size=len(records)) 45 | 46 | # use sloppy tfrecord dataset 47 | self.dataset = self.dataset.apply( 48 | tf.contrib.data.parallel_interleave( 49 | tf.data.TFRecordDataset, 50 | sloppy=train, 51 | cycle_length=min(num_workers, len(records)))) 52 | self.dataset = self.dataset.shuffle(buffer_size=100) 53 | else: 54 | self.dataset = tf.data.TFRecordDataset(records) 55 | self.dataset = self.dataset.repeat() 56 | 57 | # Instantiate dataloader (do not drop remainder for eval) 58 | loader_args = {'batch_size': batch_size, 59 | 'num_parallel_batches': num_workers, 60 | 'drop_remainder': train} 61 | self.dataloader = self.dataset.apply(tf.contrib.data.map_and_batch(self.record_converter, **loader_args)) 62 | self.threaded_dl = threaded_dl 63 | self.num_workers = num_workers 64 | 65 | def __iter__(self): 66 | if self.threaded_dl: 67 | data_iter = iter(MultiprocessLoader(self.dataloader, self.num_workers)) 68 | for item in data_iter: 69 | yield item 70 | else: 71 | data_iter = iter(self.dataloader) 72 | for item in data_iter: 73 | yield convert_tf_example_to_torch_tensors(item) 74 | 75 | class Record2Example(object): 76 | def __init__(self, feature_map): 77 | self.feature_map = feature_map 78 | 79 | def __call__(self, record): 80 | """Decodes a BERT TF record to a TF example.""" 81 | example = tf.parse_single_example(record, self.feature_map) 82 | for k, v in list(example.items()): 83 | if v.dtype == tf.int64: 84 | example[k] = tf.to_int32(v) 85 | return example 86 | 87 | def convert_tf_example_to_torch_tensors(example): 88 | item = {k: (v.numpy()) for k,v in example.items()} 89 | mask = np.zeros_like(item['input_ids']) 90 | mask_labels = np.ones_like(item['input_ids'])*-1 91 | for b, row in enumerate(item['masked_lm_positions'].astype(int)): 92 | for i, idx in enumerate(row): 93 | if item['masked_lm_weights'][b, i] != 0: 94 | mask[b, idx] = 1 95 | mask_labels[b, idx] = item['masked_lm_ids'][b, i] 96 | output = {'text': item['input_ids'], 'types': item['segment_ids'],'is_random': item['next_sentence_labels'], 97 | 'pad_mask': 1-item['input_mask'], 'mask': mask, 'mask_labels': mask_labels} 98 | return {k: torch.from_numpy(v) for k,v in output.items()} 99 | 100 | class 
MultiprocessLoader(object): 101 | def __init__(self, dataloader, num_workers=2): 102 | self.dl = dataloader 103 | self.queue_size = 2*num_workers 104 | 105 | def __iter__(self): 106 | output_queue = queue.Queue(self.queue_size) 107 | output_thread = threading.Thread(target=_multiproc_iter, 108 | args=(self.dl, output_queue)) 109 | output_thread.daemon = True 110 | output_thread.start() 111 | 112 | while output_thread.is_alive(): 113 | yield output_queue.get(block=True) 114 | else: 115 | print(RuntimeError('TF record data loader thread exited unexpectedly')) 116 | 117 | def _multiproc_iter(dl, output_queue): 118 | data_iter = iter(dl) 119 | for item in data_iter: 120 | tensors = convert_tf_example_to_torch_tensors(item) 121 | output_queue.put(tensors, block=True) -------------------------------------------------------------------------------- /data_utils/samplers.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """batch samplers that work with either random or sequential data samplers""" 16 | import math 17 | import os 18 | import sys 19 | 20 | import torch 21 | from torch.utils import data 22 | import numpy as np 23 | 24 | class RandomSampler(data.sampler.Sampler): 25 | r""" 26 | Based off of pytorch RandomSampler and DistributedSampler. Essentially a RandomSampler, 27 | but this class lets the user set an epoch like DistributedSampler 28 | Samples elements randomly. If without replacement, then sample from a shuffled dataset. 29 | If with replacement, then user can specify ``num_samples`` to draw. 
30 | Arguments: 31 | data_source (Dataset): dataset to sample from 32 | num_samples (int): number of samples to draw, default=len(dataset) 33 | replacement (bool): samples are drawn with replacement if ``True``, default=False 34 | """ 35 | 36 | def __init__(self, data_source, replacement=False, num_samples=None): 37 | self.data_source = data_source 38 | self.replacement = replacement 39 | self._num_samples = num_samples 40 | self.epoch = -1 41 | 42 | if self._num_samples is not None and replacement is False: 43 | raise ValueError("With replacement=False, num_samples should not be specified, " 44 | "since a random permute will be performed.") 45 | 46 | if not isinstance(self.num_samples, int) or self.num_samples <= 0: 47 | raise ValueError("num_samples should be a positive integer " 48 | "value, but got num_samples={}".format(self.num_samples)) 49 | if not isinstance(self.replacement, bool): 50 | raise ValueError("replacement should be a boolean value, but got " 51 | "replacement={}".format(self.replacement)) 52 | 53 | @property 54 | def num_samples(self): 55 | # dataset size might change at runtime 56 | if self._num_samples is None: 57 | return len(self.data_source) 58 | return self._num_samples 59 | 60 | def __iter__(self): 61 | n = len(self.data_source) 62 | g = torch.Generator() 63 | if self.epoch >= 0: 64 | g.manual_seed(self.epoch) 65 | if self.replacement: 66 | return iter(torch.randint(high=n, size=(self.num_samples,), dtype=torch.int64, generator=g).tolist()) 67 | return iter(torch.randperm(n, generator=g).tolist()) 68 | 69 | def __len__(self): 70 | return self.num_samples 71 | 72 | def set_epoch(self, epoch): 73 | self.epoch = epoch 74 | 75 | class DistributedBatchSampler(data.sampler.BatchSampler): 76 | """ 77 | similar to normal implementation of distributed sampler, except implementation is at the 78 | batch sampler level, instead of just the sampler level. This allows wrapping of arbitrary 79 | data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler. 
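    A minimal usage sketch (mirroring make_data_loader() in configure_data.py;
    ``dataset``, ``batch_size``, ``train_iters``, ``rank``, and ``world_size``
    are placeholders)::

        sampler = RandomSampler(dataset, replacement=True,
                                num_samples=batch_size * train_iters)
        batch_sampler = DistributedBatchSampler(sampler, batch_size, drop_last=True,
                                                rank=rank, world_size=world_size)
        loader = torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler,
                                             num_workers=2, pin_memory=True)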
80 | """ 81 | def __init__(self, sampler, batch_size, drop_last, rank=-1, world_size=2, wrap_last=False): 82 | super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last) 83 | if rank == -1: 84 | assert False, 'should not be here' 85 | rank = torch.distributed.get_rank() 86 | self.rank = rank 87 | self.world_size = world_size 88 | self.sampler.wrap_around = 0 89 | self.wrap_around = 0 90 | self.wrap_last = wrap_last 91 | self.start_iter = 0 92 | 93 | def __iter__(self): 94 | batch = [] 95 | last_batch = None 96 | i = 0 97 | for idx in self.data_iterator(self.sampler, wrap_around=False): 98 | batch.append(idx) 99 | if len(batch) == self.batch_size: 100 | tbatch = self._batch(batch) 101 | if i >= self.start_iter: 102 | yield tbatch 103 | self.start_iter = 0 104 | i += 1 105 | last_batch = np.array(list(tbatch)) 106 | batch = [] 107 | batch_len = len(batch) 108 | if batch_len > 0 and not self.drop_last: 109 | if self.wrap_last: 110 | self.sampler.wrap_around -= (self.batch_size) 111 | self.wrap_around += (len(batch)) 112 | self.wrap_around %= self.batch_size 113 | if isinstance(self.sampler, TransposedSampler): 114 | for i, idx in enumerate(self.data_iterator(self.sampler, wrap_around=True)): 115 | if i == 0: 116 | continue 117 | batch.append(idx) 118 | new_batch_len = len(batch) 119 | if len(batch) == self.batch_size: 120 | break 121 | yield self._batch(batch) 122 | if self.wrap_last: 123 | self.sampler.wrap_around += self.batch_size 124 | 125 | def data_iterator(self, _iter, wrap_around=False): 126 | """iterates through data and handles wrap around""" 127 | for i, idx in enumerate(_iter): 128 | if i < self.wrap_around%self.batch_size: 129 | continue 130 | if wrap_around: 131 | self.wrap_around += 1 132 | self.wrap_around %= self.batch_size 133 | yield idx 134 | 135 | def _batch(self, batch): 136 | """extracts samples only pertaining to this worker's batch""" 137 | start = self.rank*self.batch_size//self.world_size 138 | end = (self.rank+1)*self.batch_size//self.world_size 139 | return batch[start:end] 140 | -------------------------------------------------------------------------------- /mpu/tests/test_random.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import sys 17 | sys.path.append("../..") 18 | 19 | import torch 20 | import mpu 21 | 22 | from commons import initialize_distributed 23 | from commons import print_separator 24 | 25 | 26 | def test_set_cuda_rng_state(model_parallel_size): 27 | 28 | if torch.distributed.get_rank() == 0: 29 | print('> testing set_rng_state with size {} ...'. 
30 | format(model_parallel_size)) 31 | 32 | mpu.initialize_model_parallel(model_parallel_size) 33 | model_parallel_size = mpu.get_model_parallel_world_size() 34 | 35 | size = 123 36 | seed = 1234 37 | torch.cuda.manual_seed(1234) 38 | tensor = torch.cuda.FloatTensor(size) 39 | 40 | # Get the state 41 | rng_state = torch.cuda.get_rng_state() 42 | rng_state_copy = rng_state.clone() 43 | 44 | # Do some stuff. 45 | for _ in range(5): 46 | torch.randn(size, out=tensor) 47 | result_1 = tensor.clone() 48 | 49 | assert rng_state.sub(rng_state_copy).max() == 0 50 | assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0 51 | 52 | # State should be different. 53 | new_rng_state = torch.cuda.get_rng_state() 54 | max_diff = new_rng_state.sub(rng_state).max() 55 | print(' max diff in rng state (should be non-zero) on global rank {}: {}'. 56 | format(torch.distributed.get_rank(), max_diff)) 57 | assert max_diff > 0 58 | 59 | # Reset the rng state and do the same stuff. 60 | mpu.random._set_cuda_rng_state(rng_state) 61 | for _ in range(5): 62 | torch.randn(size, out=tensor) 63 | mpu.random._set_cuda_rng_state(rng_state) 64 | for _ in range(5): 65 | torch.randn(size, out=tensor) 66 | result_2 = tensor.clone() 67 | 68 | # Results should be the same 69 | error = result_2.sub(result_1).abs().max() 70 | print(' max error in generated tensors (should be zero) on ' 71 | 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) 72 | assert error < 1.0e-6 73 | 74 | # Input state should have remained intact. 75 | error = rng_state.sub(rng_state_copy).max() 76 | print(' max error in rng state (should be zero) on global rank {}: {}'. 77 | format(torch.distributed.get_rank(), error)) 78 | assert error == 0 79 | 80 | # Reset groups 81 | mpu.destroy_model_parallel() 82 | 83 | torch.distributed.barrier() 84 | if torch.distributed.get_rank() == 0: 85 | print('>> passed the test :-)') 86 | 87 | 88 | def test_cuda_rng_tracker(model_parallel_size): 89 | 90 | if torch.distributed.get_rank() == 0: 91 | print('> testing cuda rng tracker with size {} ...'. 92 | format(model_parallel_size)) 93 | 94 | mpu.initialize_model_parallel(model_parallel_size) 95 | model_parallel_size = mpu.get_model_parallel_world_size() 96 | 97 | seed_1 = 1234 98 | seed_2 = 4321 99 | size = [12, 21] 100 | tensor = torch.cuda.FloatTensor(size) 101 | 102 | # Set to seed_1 and generate two tensors. 103 | torch.cuda.manual_seed(seed_1) 104 | torch.randn(size, out=tensor) 105 | target_11 = tensor.clone() 106 | torch.randn(size, out=tensor) 107 | target_12 = tensor.clone() 108 | 109 | # Set to seed_2 and generate two tensors. 
110 | torch.cuda.manual_seed(seed_2) 111 | torch.randn(size, out=tensor) 112 | target_21 = tensor.clone() 113 | torch.randn(size, out=tensor) 114 | target_22 = tensor.clone() 115 | 116 | # Now if we interleave seed_1 and seed_2, 117 | # we should still get the same tensors 118 | torch.cuda.manual_seed(seed_1) 119 | mpu.get_cuda_rng_tracker().add('test', seed_2) 120 | 121 | torch.randn(size, out=tensor) 122 | result_11 = tensor.clone() 123 | 124 | with mpu.get_cuda_rng_tracker().fork('test'): 125 | torch.randn(size, out=tensor) 126 | result_21 = tensor.clone() 127 | 128 | torch.randn(size, out=tensor) 129 | result_12 = tensor.clone() 130 | 131 | with mpu.get_cuda_rng_tracker().fork('test'): 132 | torch.randn(size, out=tensor) 133 | result_22 = tensor.clone() 134 | 135 | diff = result_11.sub(result_21).abs().max() 136 | diff = min(diff, result_12.sub(result_22).abs().max()) 137 | print(' max diff in generated tensors (should be non-zero) on ' 138 | 'global rank {}: {}'.format(torch.distributed.get_rank(), diff)) 139 | assert diff > 1.0e-6 140 | error = max(result_11.sub(target_11).abs().max(), 141 | result_12.sub(target_12).abs().max()) 142 | error = max(error, result_21.sub(target_21).abs().max()) 143 | error = max(error, result_22.sub(target_22).abs().max()) 144 | print(' max error in generated tensors (should be zero) on ' 145 | 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) 146 | assert error < 1.0e-6 147 | 148 | # Reset the tracker 149 | mpu.get_cuda_rng_tracker().reset() 150 | 151 | # Reset groups 152 | mpu.destroy_model_parallel() 153 | 154 | torch.distributed.barrier() 155 | if torch.distributed.get_rank() == 0: 156 | print('>> passed the test :-)') 157 | 158 | 159 | def test_model_parallel_cuda_manual_seed(model_parallel_size): 160 | 161 | if torch.distributed.get_rank() == 0: 162 | print('> testing model parallel cuda manual seed with size {} ...'. 
163 | format(model_parallel_size)) 164 | 165 | mpu.initialize_model_parallel(model_parallel_size) 166 | model_parallel_size = mpu.get_model_parallel_world_size() 167 | 168 | mpu.model_parallel_cuda_manual_seed(12345) 169 | assert torch.cuda.initial_seed() == 12345 170 | with mpu.get_cuda_rng_tracker().fork(): 171 | assert torch.cuda.initial_seed() == (12345 + 2718 + 172 | mpu.get_model_parallel_rank()) 173 | 174 | # Reset the tracker 175 | mpu.get_cuda_rng_tracker().reset() 176 | 177 | # Reset groups 178 | mpu.destroy_model_parallel() 179 | 180 | torch.distributed.barrier() 181 | if torch.distributed.get_rank() == 0: 182 | print('>> passed the test :-)') 183 | 184 | 185 | if __name__ == '__main__': 186 | 187 | initialize_distributed() 188 | world_size = torch.distributed.get_world_size() 189 | 190 | model_parallel_size = 1 191 | while model_parallel_size <= world_size: 192 | print_separator('test set rng state') 193 | test_set_cuda_rng_state(model_parallel_size) 194 | model_parallel_size *= 2 195 | 196 | model_parallel_size = 1 197 | while model_parallel_size <= world_size: 198 | print_separator('test cuda rng tracker') 199 | test_cuda_rng_tracker(model_parallel_size) 200 | model_parallel_size *= 2 201 | 202 | model_parallel_size = 1 203 | while model_parallel_size <= world_size: 204 | print_separator('test model parallel cuda manual seed') 205 | test_model_parallel_cuda_manual_seed(model_parallel_size) 206 | model_parallel_size *= 2 207 | 208 | -------------------------------------------------------------------------------- /data_utils/lazy_loader.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """utils for loading text from disk""" 16 | import os 17 | import mmap 18 | import pickle as pkl 19 | import time 20 | from itertools import accumulate 21 | 22 | import torch 23 | from torch.multiprocessing import Lock 24 | 25 | def get_lazy_path(path): 26 | """ 27 | Gets directory path where lazy files are stored. 28 | """ 29 | return os.path.splitext(path)[0]+'.lazy' 30 | 31 | def exists_lazy(path, data_type='data'): 32 | """ 33 | Check if we've already made a lazy version of this file for the `data_type` field. 34 | """ 35 | if not os.path.exists(get_lazy_path(path)): 36 | return False 37 | contents = os.listdir(get_lazy_path(path)) 38 | if data_type not in contents: 39 | return False 40 | if data_type+'.len.pkl' not in contents: 41 | return False 42 | return True 43 | 44 | def make_lazy(path, strs, data_type='data'): 45 | """ 46 | Make lazy version of `data_type` field of the file. Byte offsets 47 | corresponding to data indices are stored in a `.len.pkl` data file. 
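    A small sketch (the file name and strings are hypothetical)::

        make_lazy('corpus.json', ['first doc', 'second doc'], data_type='data')
        # -> writes corpus.lazy/data and corpus.lazy/data.len.pkl
        loader = lazy_array_loader('corpus.json', data_type='data')
        assert loader[1] == 'second doc'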
48 | """ 49 | lazypath = get_lazy_path(path) 50 | if not os.path.exists(lazypath): 51 | os.makedirs(lazypath) 52 | datapath = os.path.join(lazypath, data_type) 53 | lenpath = os.path.join(lazypath, data_type+'.len.pkl') 54 | if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: 55 | with open(datapath, 'wb') as f: 56 | str_lens = [] 57 | str_cnt = 0 58 | for s in strs: 59 | if isinstance(s, dict): 60 | s = s['text'] 61 | encoded = s.encode('utf-8') 62 | f.write(encoded) 63 | str_cnt = len(encoded) 64 | str_lens.append(str_cnt) 65 | pkl.dump(str_lens, open(lenpath, 'wb')) 66 | else: 67 | while not os.path.exists(lenpath): 68 | time.sleep(1) 69 | 70 | def split_strings(strings, start, chr_lens): 71 | """ 72 | Split strings based on string lengths and given start. 73 | """ 74 | return [strings[i-start:j-start] for i, j in zip([start]+chr_lens[:-1], chr_lens)] 75 | 76 | class ProcessorTokenizer: 77 | """ 78 | callable class that runs a preprocessing, as well as tokenization step, 79 | on input text. 80 | """ 81 | def __init__(self, tokenizer, process_fn=None): 82 | self.tokenizer = tokenizer 83 | self.process_fn = process_fn 84 | 85 | def __call__(self, string): 86 | if self.tokenizer is not None: 87 | string = self.tokenizer(string, process_fn=self.process_fn) 88 | elif self.process_fn is not None: 89 | string = self.process_fn(string) 90 | return string 91 | 92 | class lazy_array_loader(object): 93 | """ 94 | Arguments: 95 | path: path to directory where array entries are concatenated into one big string file 96 | and the .len file are located 97 | data_type (str): Some datsets have multiple fields that are stored in different paths. 98 | `data_type` specifies which of these fields to load in this class 99 | mem_map (boolean): Specifies whether to memory map file `path` 100 | map_fn (callable): Fetched strings are passed through map_fn before being returned. 101 | 102 | Example of lazy loader directory structure: 103 | file.json 104 | file.lazy/ 105 | data_type1 106 | data_type1.len.pkl 107 | data_type2 108 | data_type2.len.pkl 109 | """ 110 | def __init__(self, path, data_type='data', mem_map=False, map_fn=None): 111 | lazypath = get_lazy_path(path) 112 | datapath = os.path.join(lazypath, data_type) 113 | #get file where array entries are concatenated into one big string 114 | self._file = open(datapath, 'rb') 115 | self.file = self._file 116 | #memory map file if necessary 117 | self.mem_map = mem_map 118 | if self.mem_map: 119 | self.file = mmap.mmap(self.file.fileno(), 0, prot=mmap.PROT_READ) 120 | lenpath = os.path.join(lazypath, data_type+'.len.pkl') 121 | self.lens = pkl.load(open(lenpath, 'rb')) 122 | self.ends = list(accumulate(self.lens)) 123 | self.dumb_ends = list(self.ends) 124 | self.read_lock = Lock() 125 | self.process_fn = map_fn 126 | self.map_fn = map_fn 127 | self._tokenizer = None 128 | 129 | def SetTokenizer(self, tokenizer): 130 | """ 131 | logic to set and remove (set to None) tokenizer. 132 | combines preprocessing/tokenization into one callable. 
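        A small sketch (``tokenizer`` stands in for any tokenizer callable used
        elsewhere in data_utils)::

            loader = lazy_array_loader('corpus.json', data_type='data')
            loader.SetTokenizer(tokenizer)   # loader[i] now returns tokenized text
            assert loader.GetTokenizer() is tokenizer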
133 | """ 134 | if tokenizer is None: 135 | if not hasattr(self, '_tokenizer'): 136 | self._tokenizer = tokenizer 137 | else: 138 | self._tokenizer = tokenizer 139 | self.map_fn = ProcessorTokenizer(tokenizer, self.process_fn) 140 | 141 | def GetTokenizer(self): 142 | return self._tokenizer 143 | 144 | def __getitem__(self, index): 145 | """ 146 | read file and splice strings based on string ending array `self.ends` 147 | """ 148 | if not isinstance(index, slice): 149 | if index == 0: 150 | start = 0 151 | else: 152 | start = self.ends[index-1] 153 | end = self.ends[index] 154 | rtn = self.file_read(start, end) 155 | if self.map_fn is not None: 156 | return self.map_fn(rtn) 157 | else: 158 | # if slice, fetch strings with 1 diskread and then splice in memory 159 | chr_lens = self.ends[index] 160 | if index.start == 0 or index.start is None: 161 | start = 0 162 | else: 163 | start = self.ends[index.start-1] 164 | stop = chr_lens[-1] 165 | strings = self.file_read(start, stop) 166 | rtn = split_strings(strings, start, chr_lens) 167 | if self.map_fn is not None: 168 | return self.map_fn([s for s in rtn]) 169 | return rtn 170 | 171 | def __len__(self): 172 | return len(self.ends) 173 | 174 | def file_read(self, start=0, end=None): 175 | """read specified portion of file""" 176 | 177 | # atomic reads to avoid race conditions with multiprocess dataloader 178 | self.read_lock.acquire() 179 | # seek to start of file read 180 | self.file.seek(start) 181 | # read to end of file if no end point provided 182 | if end is None: 183 | rtn = self.file.read() 184 | #else read amount needed to reach end point 185 | else: 186 | rtn = self.file.read(end-start) 187 | self.read_lock.release() 188 | #TODO: @raulp figure out mem map byte string bug 189 | #if mem map'd need to decode byte string to string 190 | rtn = rtn.decode('utf-8', 'ignore') 191 | # rtn = str(rtn) 192 | if self.mem_map: 193 | rtn = rtn.decode('unicode_escape') 194 | return rtn 195 | 196 | -------------------------------------------------------------------------------- /fp16/fp16util.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | import torch.nn as nn 18 | from torch.autograd import Variable 19 | from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors 20 | 21 | import mpu 22 | 23 | 24 | class tofp16(nn.Module): 25 | """ 26 | Utility module that implements:: 27 | 28 | def forward(self, input): 29 | return input.half() 30 | """ 31 | 32 | def __init__(self): 33 | super(tofp16, self).__init__() 34 | 35 | def forward(self, input): 36 | return input.half() 37 | 38 | 39 | def BN_convert_float(module): 40 | """ 41 | Utility function for network_to_half(). 42 | 43 | Retained for legacy purposes. 
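    A small sketch (the toy model is hypothetical)::

        model = torch.nn.Sequential(torch.nn.Linear(16, 16),
                                    torch.nn.BatchNorm1d(16)).cuda()
        model = BN_convert_float(model.half())  # fp16 weights, batchnorm kept in fp32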
44 | """ 45 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: 46 | module.float() 47 | for child in module.children(): 48 | BN_convert_float(child) 49 | return module 50 | 51 | 52 | def network_to_half(network): 53 | """ 54 | Convert model to half precision in a batchnorm-safe way. 55 | 56 | Retained for legacy purposes. It is recommended to use FP16Model. 57 | """ 58 | return nn.Sequential(tofp16(), BN_convert_float(network.half())) 59 | 60 | 61 | def convert_module(module, dtype): 62 | """ 63 | Converts a module's immediate parameters and buffers to dtype. 64 | """ 65 | for param in module.parameters(recurse=False): 66 | if param is not None: 67 | if param.data.dtype.is_floating_point: 68 | param.data = param.data.to(dtype=dtype) 69 | if param._grad is not None and param._grad.data.dtype.is_floating_point: 70 | param._grad.data = param._grad.data.to(dtype=dtype) 71 | 72 | for buf in module.buffers(recurse=False): 73 | if buf is not None and buf.data.dtype.is_floating_point: 74 | buf.data = buf.data.to(dtype=dtype) 75 | 76 | 77 | def convert_network(network, dtype): 78 | """ 79 | Converts a network's parameters and buffers to dtype. 80 | """ 81 | for module in network.modules(): 82 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: 83 | continue 84 | convert_module(module, dtype) 85 | return network 86 | 87 | 88 | class FP16Model(nn.Module): 89 | """ 90 | Convert model to half precision in a batchnorm-safe way. 91 | """ 92 | 93 | def __init__(self, network): 94 | super(FP16Model, self).__init__() 95 | self.network = convert_network(network, dtype=torch.half) 96 | 97 | def forward(self, *inputs): 98 | inputs = tuple(t.half() for t in inputs) 99 | return self.network(*inputs) 100 | 101 | 102 | def backwards_debug_hook(grad): 103 | raise RuntimeError("master_params recieved a gradient in the backward pass!") 104 | 105 | def prep_param_lists(model, flat_master=False): 106 | """ 107 | Creates a list of FP32 master parameters for a given model, as in 108 | `Training Neural Networks with Mixed Precision: Real Examples`_. 109 | 110 | Args: 111 | model (torch.nn.Module): Existing Pytorch model 112 | flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. 113 | Returns: 114 | A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. 115 | 116 | Example:: 117 | 118 | model_params, master_params = prep_param_lists(model) 119 | 120 | .. warning:: 121 | Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. 122 | 123 | .. _`Training Neural Networks with Mixed Precision: Real Examples`: 124 | http://on-demand.gputechconf.com/gtc/2018/video/S81012/ 125 | """ 126 | model_params = [param for param in model.parameters() if param.requires_grad] 127 | 128 | if flat_master: 129 | # Give the user some more useful error messages 130 | try: 131 | # flatten_dense_tensors returns a contiguous flat array. 
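            # Illustrative: two fp16 params of shapes (2, 3) and (4,) become a single
            # contiguous fp32 master tensor of 10 elements after the .float() below.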
132 | # http://pytorch.org/docs/master/_modules/torch/_utils.html 133 | master_params = _flatten_dense_tensors([param.data for param in model_params]).float() 134 | except: 135 | print("Error in prep_param_lists: model may contain a mixture of parameters " 136 | "of different types. Use flat_master=False, or use F16_Optimizer.") 137 | raise 138 | master_params = torch.nn.Parameter(master_params) 139 | master_params.requires_grad = True 140 | # master_params.register_hook(backwards_debug_hook) 141 | if master_params.grad is None: 142 | master_params.grad = master_params.new(*master_params.size()) 143 | return model_params, [master_params] 144 | else: 145 | master_params = [param.clone().float().detach() for param in model_params] 146 | for param in master_params: 147 | param.requires_grad = True 148 | return model_params, master_params 149 | 150 | 151 | def model_grads_to_master_grads(model_params, master_params, flat_master=False): 152 | """ 153 | Copy model gradients to master gradients. 154 | 155 | Args: 156 | model_params: List of model parameters created by :func:`prep_param_lists`. 157 | master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. 158 | """ 159 | if flat_master: 160 | # The flattening may incur one more deep copy than is necessary. 161 | master_params[0].grad.data.copy_( 162 | _flatten_dense_tensors([p.grad.data for p in model_params])) 163 | else: 164 | for model, master in zip(model_params, master_params): 165 | if model.grad is not None: 166 | if master.grad is None: 167 | master.grad = Variable(master.data.new(*master.data.size())) 168 | master.grad.data.copy_(model.grad.data) 169 | else: 170 | master.grad = None 171 | 172 | 173 | def master_params_to_model_params(model_params, master_params, flat_master=False): 174 | """ 175 | Copy master parameters to model parameters. 176 | 177 | Args: 178 | model_params: List of model parameters created by :func:`prep_param_lists`. 179 | master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. 180 | """ 181 | if flat_master: 182 | for model, master in zip(model_params, 183 | _unflatten_dense_tensors(master_params[0].data, model_params)): 184 | model.data.copy_(master) 185 | else: 186 | for model, master in zip(model_params, master_params): 187 | model.data.copy_(master.data) 188 | 189 | # Backward compatibility fixes 190 | 191 | def to_python_float(t): 192 | if hasattr(t, 'item'): 193 | return t.item() 194 | else: 195 | return t[0] 196 | 197 | TORCH_MAJOR = int(torch.__version__.split('.')[0]) 198 | TORCH_MINOR = int(torch.__version__.split('.')[1]) 199 | 200 | clip_grad_norm = mpu.clip_grad_norm 201 | #elif TORCH_MAJOR == 0 and TORCH_MINOR <= 4: 202 | # clip_grad_norm = torch.nn.utils.clip_grad_norm 203 | #else: 204 | # clip_grad_norm = torch.nn.utils.clip_grad_norm_ 205 | -------------------------------------------------------------------------------- /openwebtext/blacklist_urls.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import glob 18 | import re 19 | import time 20 | import tldextract 21 | import sys 22 | 23 | 24 | # List of the domains to blacklist. 25 | domain_blacklist = set([ 26 | '500px', 27 | 'aapks', 28 | 'akamaihd', 29 | 'amazon', 30 | 'apple', 31 | 'artifactfire', 32 | 'artstation', 33 | 'awwni', 34 | 'bandcamp', 35 | 'battleforthenet', 36 | 'coinscalendar', 37 | 'dailymotion', 38 | 'deviantart', 39 | 'discord', 40 | 'discordapp', 41 | 'dlapkandroid', 42 | 'dropbox', 43 | 'e621', 44 | 'ebay', 45 | 'edealinfo', 46 | 'erome', 47 | 'eroshare', 48 | 'explosm', 49 | 'facebook', 50 | 'fbcdn', 51 | 'flickr', 52 | 'furaffinity', 53 | 'futhead', 54 | 'gatopardo', 55 | 'gfycat', 56 | 'gifsound', 57 | 'gifsoup', 58 | 'giphy', 59 | 'github', 60 | 'google', 61 | 'gunprime', 62 | 'gyazo', 63 | 'hotdealstar', 64 | 'imagefap', 65 | 'imageshack', 66 | 'imgflip', 67 | 'imgur', 68 | 'instagram', 69 | 'karmadecay', 70 | 'kryptocal', 71 | 'kym-cdn', 72 | 'liveleak', 73 | 'livememe', 74 | 'lmgtfy', 75 | 'magaimg', 76 | 'memegenerator', 77 | 'minorplanetcenter', 78 | 'minus', 79 | 'mobafire', 80 | 'morejpeg', 81 | 'nocookie', 82 | 'pcpartpicker', 83 | 'photobucket', 84 | 'pinimg', 85 | 'pinterest', 86 | 'pixiv', 87 | 'pornhub', 88 | 'prntscr', 89 | 'puu', 90 | 'qkme', 91 | 'quickmeme', 92 | 'radd', 93 | 'redd', 94 | 'reddit', 95 | 'reddit-stream', 96 | 'redditlog', 97 | 'redditmedia', 98 | 'reddituploads', 99 | 'redtube', 100 | 'reupp', 101 | 'reverb', 102 | 'roanoke', 103 | 'rollingstone', 104 | 'sli', 105 | 'soundcloud', 106 | 'soundgasm', 107 | 'spankbang', 108 | 'spotify', 109 | 'strawpoll', 110 | 'streamable', 111 | 'timeanddate', 112 | 'tinypic', 113 | 'touhouradio', 114 | 'tumblr', 115 | 'twimg', 116 | 'twitch', 117 | 'twitter', 118 | 'vid', 119 | 'vimeo', 120 | 'vine', 121 | 'vkaao', 122 | 'vocaroo', 123 | 'voyagefusion', 124 | 'walmart', 125 | 'wciu', 126 | 'wikimedia', 127 | 'wikipedia', 128 | 'xhamster', 129 | 'xkcd', 130 | 'xvideos', 131 | 'youtu', 132 | 'youtube', 133 | 'youtubedoubler', 134 | 'ytimg', 135 | 'zillexplorer', 136 | ]) 137 | 138 | def domain_is_in_blacklist(url): 139 | domain = tldextract.extract(url).domain 140 | return domain in domain_blacklist 141 | 142 | 143 | # List of extentions to blacklist. 
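# Note: str.endswith() accepts a tuple, so extention_is_in_blacklist() below checks
# every entry in one call, e.g. (illustrative URL):
#   'http://example.org/a.jpg?s=1'.split('?')[0].lower().endswith(extentions_blacklist) -> True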
144 | extentions_blacklist = (
145 |     '.3gp',
146 |     '.7z',
147 |     '.ai',
148 |     '.aif',
149 |     '.apk',
150 |     '.app',
151 |     '.avi',
152 |     '.bin',
153 |     '.bmp',
154 |     '.bz2',
155 |     '.css',
156 |     '.csv',
157 |     '.dat',
158 |     '.deb',
159 |     '.dmg',
160 |     '.doc',
161 |     '.docx',
162 |     '.exe',
163 |     '.gif',
164 |     '.gifv',
165 |     '.gz',
166 |     '.iso',
167 |     '.jar',
168 |     '.jpeg',
169 |     '.jpg',
170 |     '.js',
171 |     '.log',
172 |     '.mid',
173 |     '.midi',
174 |     '.mkv',
175 |     '.mov',
176 |     '.mp3',
177 |     '.mp4',
178 |     '.mpeg',
179 |     '.mpg',
180 |     '.ogg',
181 |     '.ogv',
182 |     '.otf',
183 |     '.pdf',
184 |     '.pkg',
185 |     '.png',
186 |     '.pps',
187 |     '.ppt',
188 |     '.pptx',
189 |     '.psd',
190 |     '.py',
191 |     '.qt',
192 |     '.ram',
193 |     '.rar',
194 |     '.sql',
195 |     '.svg',
196 |     '.swf',
197 |     '.tar.gz',
198 |     '.tar',
199 |     '.tgz',
200 |     '.tiff',
201 |     '.ttf',
202 |     '.txt',
203 |     '.wav',
204 |     '.webm',
205 |     '.wma',
206 |     '.wmv',
207 |     '.xls',
208 |     '.xlsx',
209 |     '.xml',
210 |     '.xz',
211 |     '.zip',
212 | )
213 | 
214 | def extention_is_in_blacklist(url):
215 |     if url.split('?')[0].lower().endswith(extentions_blacklist):
216 |         return True
217 |     return False
218 | 
219 | 
220 | # Malformed urls.
221 | # This function is adapted from:
222 | # https://stackoverflow.com/questions/7160737/python-how-to-validate-a-url-in-python-malformed-or-not
223 | url_regex = re.compile(
224 |     r'^(?:http)s?://' # http:// or https://
225 |     r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
226 |     r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
227 |     r'(?::\d+)?' # optional port
228 |     r'(?:/?|[/?]\S+)$', re.IGNORECASE)
229 | def url_is_malformed(url):
230 |     return re.match(url_regex, url) is None
231 | 
232 | 
233 | def print_progress(prefix, start_time, urls_counter,
234 |                    domain_blacklist_counter,
235 |                    extention_blacklist_counter,
236 |                    short_url_counter, malformed_url_counter,
237 |                    duplicate_url_counter):
238 |     string = prefix + ' | '
239 |     string += 'time elapsed (s): {:.2f} | '.format(time.time() - start_time)
240 |     string += 'number of urls: {} | '.format(urls_counter)
241 |     string += 'domain blacklisted: {} | '.format(domain_blacklist_counter)
242 |     string += 'extention blacklisted: {} | '.format(extention_blacklist_counter)
243 |     string += 'short urls (<=8): {} | '.format(short_url_counter)
244 |     string += 'malformed urls: {} | '.format(malformed_url_counter)
245 |     string += 'duplicate urls: {}'.format(duplicate_url_counter)
246 |     print(string, flush=True)
247 | 
248 | 
249 | if __name__ == '__main__':
250 | 
251 | 
252 |     print('remove blacklisted urls ..')
253 | 
254 |     # Path to the url files.
255 |     path = sys.argv[1]
256 |     # Output url file.
257 |     output = sys.argv[2]
258 | 
259 |     # Get the list of url files.
260 | files = glob.glob(path + '/*.txt') 261 | print('> found {} files'.format(len(files))) 262 | 263 | urls = set() 264 | urls_counter = 0 265 | domain_blacklist_counter = 0 266 | extention_blacklist_counter = 0 267 | short_url_counter = 0 268 | malformed_url_counter = 0 269 | duplicate_url_counter = 0 270 | start_time = time.time() 271 | for filename in files: 272 | with open(filename, 'r') as f: 273 | for line in f: 274 | url = line.strip() 275 | urls_counter += 1 276 | if domain_is_in_blacklist(url): 277 | print('[DOMAIN BLACKLIST]: {}'.format(url), flush=True) 278 | domain_blacklist_counter += 1 279 | elif extention_is_in_blacklist(url): 280 | print('[EXTENTION BLACKLIST]: {}'.format(url), flush=True) 281 | extention_blacklist_counter += 1 282 | elif len(url) <= 8: 283 | print('[SHORT URL]: {}'.format(url), flush=True) 284 | short_url_counter += 1 285 | elif url_is_malformed(url): 286 | print('[MALFORMED URL]: {}'.format(url), flush=True) 287 | malformed_url_counter += 1 288 | elif url in urls: 289 | print('[DUPLICATE URL]: {}'.format(url), flush=True) 290 | duplicate_url_counter += 1 291 | else: 292 | urls.add(url) 293 | if urls_counter % 100000 == 0: 294 | print_progress('PROGRESS', start_time, urls_counter, 295 | domain_blacklist_counter, 296 | extention_blacklist_counter, 297 | short_url_counter, malformed_url_counter, 298 | duplicate_url_counter) 299 | 300 | print_progress('FINAL', start_time, urls_counter, 301 | domain_blacklist_counter, 302 | extention_blacklist_counter, 303 | short_url_counter, malformed_url_counter, 304 | duplicate_url_counter) 305 | 306 | # Write the final set of urls. 307 | print('> writing cleaned up url list to {}'.format(output)) 308 | with open(output, 'w') as f: 309 | for url in urls: 310 | f.write(url + '\n') 311 | 312 | print('done :-)') 313 | -------------------------------------------------------------------------------- /gpt2_data_loader.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import json 17 | import os 18 | 19 | import numpy as np 20 | import torch 21 | from torch.multiprocessing import Lock 22 | from torch.utils.data import Dataset 23 | 24 | import mpu 25 | from data_utils.samplers import DistributedBatchSampler 26 | from data_utils.tokenization_gpt2 import GPT2Tokenizer 27 | 28 | 29 | def make_gpt2_dataloaders(args): 30 | 31 | # Input parameters. 32 | input_data_sizes_file = args.input_data_sizes_file 33 | seq_length = args.seq_length 34 | initial_seed = args.seed 35 | 36 | # Data parallel arguments. 37 | world_size = mpu.get_data_parallel_world_size() 38 | rank = mpu.get_data_parallel_rank() 39 | global_batch_size = args.batch_size * world_size 40 | num_workers = args.num_workers 41 | 42 | def make_data_loader_(data_path): 43 | # Build the dataset. 
44 |         dataset = GPT2Dataset(data_path, input_data_sizes_file,
45 |                               seq_length, initial_seed)
46 |         # Use a simple sampler with distributed batch sampler.
47 |         sampler = torch.utils.data.SequentialSampler(dataset)
48 |         batch_sampler = DistributedBatchSampler(sampler=sampler,
49 |                                                 batch_size=global_batch_size,
50 |                                                 drop_last=True,
51 |                                                 rank=rank,
52 |                                                 world_size=world_size)
53 |         # Torch dataloader.
54 |         return torch.utils.data.DataLoader(dataset,
55 |                                            batch_sampler=batch_sampler,
56 |                                            num_workers=num_workers,
57 |                                            pin_memory=True)
58 | 
59 |     train = make_data_loader_(args.train_data_path)
60 |     valid = make_data_loader_(args.val_data_path)
61 |     test = make_data_loader_(args.test_data_path)
62 | 
63 |     args.do_train = False
64 |     args.do_valid = False
65 |     args.do_test = False
66 | 
67 |     if train is not None:
68 |         args.do_train = True
69 |     if valid is not None:
70 |         args.do_valid = True
71 |     if test is not None:
72 |         args.do_test = True
73 | 
74 |     # Tokenizer.
75 |     tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=args.cache_dir)
76 |     eod_token = tokenizer.encoder['<|endoftext|>']
77 |     num_tokens = eod_token + 1
78 | 
79 |     return (train, valid, test), num_tokens, eod_token
80 | 
81 | 
82 | class GPT2Dataset(Dataset):
83 | 
84 |     def __init__(self, data_path, sizes_filename, seq_length,
85 |                  initial_seed, max_epochs=100):
86 |         # Input parameters.
87 |         self.data_path = data_path
88 |         self.sizes_filename = sizes_filename
89 |         self.seq_length = seq_length
90 |         self.initial_seed = initial_seed
91 |         self.max_epochs = max_epochs
92 |         # Lock for building the dataset.
93 |         self.lock = Lock()
94 | 
95 |         # Shard stuff.
96 |         # Dictionary from shard name to its size (number of elements).
97 |         self.master_shard_size_dict = None
98 |         # Dictionary from shard name to modified size so it is
99 |         # divisible by self.seq_length.
100 |         self.shard_size_dict = None
101 |         # Long array (self.max_epochs * num-shards) populated
102 |         # randomly with shard names.
103 |         self.shards_name = None
104 |         # Start index of the data for a shard.
105 |         self.shards_start_index = None
106 |         self.build_shard_mappings_()
107 |         self.data_length = self.shards_start_index[-1]
108 | 
109 |         # Data.
110 |         self.shards_data = [None]*self.shards_name.size
111 |         self.shards_sample_index = [None]*self.shards_name.size
112 | 
113 |     def __len__(self):
114 |         return self.data_length
115 | 
116 |     def __getitem__(self, idx):
117 |         # Find which shard we need.
118 |         shard_index = np.searchsorted(self.shards_start_index,
119 |                                       idx, side='right') - 1
120 |         # data index in the shard.
121 |         data_idx = idx - self.shards_start_index[shard_index]
122 |         # Load the shard if it is not in memory.
123 |         #self.lock.acquire()
124 |         if self.shards_data[shard_index] is None:
125 |             print('global rank {} is building data for shard index {} ...'.
126 |                   format(torch.distributed.get_rank(), shard_index))
127 |             self.build_dataset_(shard_index)
128 |         #assert self.shards_data[shard_index] is not None
129 |         #self.lock.release()
130 |         # Start index.
131 |         start_index = self.shards_sample_index[shard_index][data_idx]
132 |         # Add one for label shift.
133 |         end_index = start_index + self.seq_length + 1
134 |         data = self.shards_data[shard_index][start_index:end_index]
135 |         return {'text': np.array(data, dtype=np.int64)}
136 | 
137 |     def build_dataset_(self, shard_index):
138 |         # Garbage collect so we don't use a lot of memory.
139 |         # Leave the last one in case other threads have not caught up yet.
140 | #for i in range(shard_index - 1): 141 | for i in range(shard_index): 142 | self.shards_data[i] = None 143 | self.shards_sample_index[i] = None 144 | # Read the shard. 145 | filename = os.path.join(self.data_path, self.shards_name[shard_index]) 146 | print('loading {}'.format(filename)) 147 | data = np.load(filename, allow_pickle=True) 148 | # Shuffle the data 149 | rng = np.random.RandomState(self.initial_seed + shard_index) 150 | rng.shuffle(data) 151 | # Flatten. 152 | data = np.hstack(data) 153 | size = (data.shape[0] - 1) // self.seq_length 154 | last_index = size * self.seq_length + 1 155 | data = data[0:last_index] 156 | self.shards_data[shard_index] = data 157 | indices = np.arange(size) * self.seq_length 158 | rng.shuffle(indices) 159 | self.shards_sample_index[shard_index] = indices 160 | 161 | def build_shard_mappings_(self): 162 | # Load the sizes file. 163 | sizes_filename = os.path.join(self.data_path, self.sizes_filename) 164 | if torch.distributed.get_rank() == 0: 165 | print(' > loading sizes from {}'.format(sizes_filename)) 166 | with open(sizes_filename, 'r') as f: 167 | self.master_shard_size_dict = json.load(f) 168 | if torch.distributed.get_rank() == 0: 169 | print(' found {} shards'.format(len(self.master_shard_size_dict))) 170 | # Adjust sizes to be a multiple of seq_length. 171 | self.shard_size_dict = self.master_shard_size_dict.copy() 172 | total_samples = 0 173 | for shard in self.shard_size_dict: 174 | size = self.shard_size_dict[shard] 175 | size = ((size - 1) // self.seq_length) * self.seq_length 176 | total_samples += size // self.seq_length 177 | self.shard_size_dict[shard] = size 178 | if torch.distributed.get_rank() == 0: 179 | print(' found {} samples in the dataset'.format(total_samples)) 180 | # Build a list of shards. 181 | shards_ = np.sort(np.array(list(self.shard_size_dict.keys()))) 182 | rng = np.random.RandomState(self.initial_seed) 183 | self.shards_name = np.copy(shards_) 184 | rng.shuffle(self.shards_name) 185 | for i in range(1, self.max_epochs): 186 | shards_c = np.copy(shards_) 187 | rng.shuffle(shards_c) 188 | self.shards_name = np.append(self.shards_name, shards_c) 189 | # Build the global indexing. 190 | self.shards_start_index = np.zeros(self.shards_name.size, dtype=np.int) 191 | self.shards_start_index[0] = 0 192 | for i in range(1, self.shards_name.size): 193 | shard = str(self.shards_name[i-1]) 194 | size = self.shard_size_dict[shard] 195 | self.shards_start_index[i] = self.shards_start_index[i-1] + \ 196 | size // self.seq_length 197 | 198 | ''' 199 | if __name__ == '__main__': 200 | 201 | print('gpt2 data loader ...') 202 | path = '/raid/mshoeybi/data/gpt2/adlr/reddit_all_ftfy_lg200/npys' 203 | 204 | dataset = GPT2Dataset(path, 'sizes.txt', 1024, 1234, 100) 205 | print('dataset contains {} samples'.format(dataset.data_length)) 206 | 207 | for i in range(len(dataset)): 208 | if i % 512000 == 0: 209 | print(i) 210 | data = dataset[i] 211 | ''' 212 | -------------------------------------------------------------------------------- /data_utils/file_utils.py: -------------------------------------------------------------------------------- 1 | # This file is provided as is from: 2 | # https://github.com/huggingface/pytorch-pretrained-BERT 3 | # Please refer to their repository for copyright. 4 | 5 | """ 6 | Utilities for working with the local dataset cache. 7 | This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp 8 | Copyright by the AllenNLP authors. 
9 | """ 10 | from __future__ import (absolute_import, division, print_function, unicode_literals) 11 | 12 | import json 13 | import logging 14 | import os 15 | import shutil 16 | import tempfile 17 | from functools import wraps 18 | from hashlib import sha256 19 | import sys 20 | from io import open 21 | 22 | import boto3 23 | import requests 24 | from botocore.exceptions import ClientError 25 | from tqdm import tqdm 26 | 27 | try: 28 | from urllib.parse import urlparse 29 | except ImportError: 30 | from urlparse import urlparse 31 | 32 | try: 33 | from pathlib import Path 34 | PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 35 | Path.home() / '.pytorch_pretrained_bert')) 36 | except (AttributeError, ImportError): 37 | PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 38 | os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert')) 39 | 40 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 41 | 42 | 43 | def url_to_filename(url, etag=None): 44 | """ 45 | Convert `url` into a hashed filename in a repeatable way. 46 | If `etag` is specified, append its hash to the url's, delimited 47 | by a period. 48 | """ 49 | url_bytes = url.encode('utf-8') 50 | url_hash = sha256(url_bytes) 51 | filename = url_hash.hexdigest() 52 | 53 | if etag: 54 | etag_bytes = etag.encode('utf-8') 55 | etag_hash = sha256(etag_bytes) 56 | filename += '.' + etag_hash.hexdigest() 57 | 58 | return filename 59 | 60 | 61 | def filename_to_url(filename, cache_dir=None): 62 | """ 63 | Return the url and etag (which may be ``None``) stored for `filename`. 64 | Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. 65 | """ 66 | if cache_dir is None: 67 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 68 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 69 | cache_dir = str(cache_dir) 70 | 71 | cache_path = os.path.join(cache_dir, filename) 72 | if not os.path.exists(cache_path): 73 | raise EnvironmentError("file {} not found".format(cache_path)) 74 | 75 | meta_path = cache_path + '.json' 76 | if not os.path.exists(meta_path): 77 | raise EnvironmentError("file {} not found".format(meta_path)) 78 | 79 | with open(meta_path, encoding="utf-8") as meta_file: 80 | metadata = json.load(meta_file) 81 | url = metadata['url'] 82 | etag = metadata['etag'] 83 | 84 | return url, etag 85 | 86 | 87 | def cached_path(url_or_filename, cache_dir=None): 88 | """ 89 | Given something that might be a URL (or might be a local path), 90 | determine which. If it's a URL, download the file and cache it, and 91 | return the path to the cached file. If it's already a local path, 92 | make sure the file exists and then return the path. 93 | """ 94 | if cache_dir is None: 95 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 96 | if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): 97 | url_or_filename = str(url_or_filename) 98 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 99 | cache_dir = str(cache_dir) 100 | 101 | parsed = urlparse(url_or_filename) 102 | 103 | if parsed.scheme in ('http', 'https', 's3'): 104 | # URL, so get it from the cache (downloading if necessary) 105 | return get_from_cache(url_or_filename, cache_dir) 106 | elif os.path.exists(url_or_filename): 107 | # File, and it exists. 108 | return url_or_filename 109 | elif parsed.scheme == '': 110 | # File, but it doesn't exist. 
111 | raise EnvironmentError("file {} not found".format(url_or_filename)) 112 | else: 113 | # Something unknown 114 | raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) 115 | 116 | 117 | def split_s3_path(url): 118 | """Split a full s3 path into the bucket name and path.""" 119 | parsed = urlparse(url) 120 | if not parsed.netloc or not parsed.path: 121 | raise ValueError("bad s3 path {}".format(url)) 122 | bucket_name = parsed.netloc 123 | s3_path = parsed.path 124 | # Remove '/' at beginning of path. 125 | if s3_path.startswith("/"): 126 | s3_path = s3_path[1:] 127 | return bucket_name, s3_path 128 | 129 | 130 | def s3_request(func): 131 | """ 132 | Wrapper function for s3 requests in order to create more helpful error 133 | messages. 134 | """ 135 | 136 | @wraps(func) 137 | def wrapper(url, *args, **kwargs): 138 | try: 139 | return func(url, *args, **kwargs) 140 | except ClientError as exc: 141 | if int(exc.response["Error"]["Code"]) == 404: 142 | raise EnvironmentError("file {} not found".format(url)) 143 | else: 144 | raise 145 | 146 | return wrapper 147 | 148 | 149 | @s3_request 150 | def s3_etag(url): 151 | """Check ETag on S3 object.""" 152 | s3_resource = boto3.resource("s3") 153 | bucket_name, s3_path = split_s3_path(url) 154 | s3_object = s3_resource.Object(bucket_name, s3_path) 155 | return s3_object.e_tag 156 | 157 | 158 | @s3_request 159 | def s3_get(url, temp_file): 160 | """Pull a file directly from S3.""" 161 | s3_resource = boto3.resource("s3") 162 | bucket_name, s3_path = split_s3_path(url) 163 | s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) 164 | 165 | 166 | def http_get(url, temp_file): 167 | req = requests.get(url, stream=True) 168 | content_length = req.headers.get('Content-Length') 169 | total = int(content_length) if content_length is not None else None 170 | progress = tqdm(unit="B", total=total) 171 | for chunk in req.iter_content(chunk_size=1024): 172 | if chunk: # filter out keep-alive new chunks 173 | progress.update(len(chunk)) 174 | temp_file.write(chunk) 175 | progress.close() 176 | 177 | 178 | def get_from_cache(url, cache_dir=None): 179 | """ 180 | Given a URL, look for the corresponding dataset in the local cache. 181 | If it's not there, download it. Then return the path to the cached file. 182 | """ 183 | if cache_dir is None: 184 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 185 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 186 | cache_dir = str(cache_dir) 187 | 188 | if not os.path.exists(cache_dir): 189 | os.makedirs(cache_dir) 190 | 191 | # Get eTag to add to filename, if it exists. 192 | if url.startswith("s3://"): 193 | etag = s3_etag(url) 194 | else: 195 | response = requests.head(url, allow_redirects=True) 196 | if response.status_code != 200: 197 | raise IOError("HEAD request failed for url {} with status code {}" 198 | .format(url, response.status_code)) 199 | etag = response.headers.get("ETag") 200 | 201 | filename = url_to_filename(url, etag) 202 | 203 | # get cache path to put the file 204 | cache_path = os.path.join(cache_dir, filename) 205 | 206 | if not os.path.exists(cache_path): 207 | # Download to temporary file, then copy to cache dir once finished. 208 | # Otherwise you get corrupt cache entries if the download gets interrupted. 
209 | with tempfile.NamedTemporaryFile() as temp_file: 210 | logger.info("%s not found in cache, downloading to %s", url, temp_file.name) 211 | 212 | # GET file object 213 | if url.startswith("s3://"): 214 | s3_get(url, temp_file) 215 | else: 216 | http_get(url, temp_file) 217 | 218 | # we are copying the file before closing it, so flush to avoid truncation 219 | temp_file.flush() 220 | # shutil.copyfileobj() starts at the current position, so go to the start 221 | temp_file.seek(0) 222 | 223 | logger.info("copying %s to cache at %s", temp_file.name, cache_path) 224 | with open(cache_path, 'wb') as cache_file: 225 | shutil.copyfileobj(temp_file, cache_file) 226 | 227 | logger.info("creating metadata file for %s", cache_path) 228 | meta = {'url': url, 'etag': etag} 229 | meta_path = cache_path + '.json' 230 | with open(meta_path, 'w', encoding="utf-8") as meta_file: 231 | json.dump(meta, meta_file) 232 | 233 | logger.info("removing temp file %s", temp_file.name) 234 | 235 | return cache_path 236 | 237 | 238 | def read_set_from_file(filename): 239 | ''' 240 | Extract a de-duped collection (set) of text from a file. 241 | Expected file format is one item per line. 242 | ''' 243 | collection = set() 244 | with open(filename, 'r', encoding='utf-8') as file_: 245 | for line in file_: 246 | collection.add(line.rstrip()) 247 | return collection 248 | 249 | 250 | def get_file_extension(path, dot=True, lower=True): 251 | ext = os.path.splitext(path)[1] 252 | ext = ext if dot else ext[1:] 253 | return ext.lower() if lower else ext 254 | -------------------------------------------------------------------------------- /configure_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
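# Rough usage sketch (how a pretraining script might drive this module; the
# exact defaults passed here are placeholders):
#
#   data_config = configure_data()
#   data_config.set_defaults(data_set_type='BERT', transpose=False)
#   (train_data, val_data, test_data), tokenizer = data_config.apply(args)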
15 | 16 | """parses arguments and preps data loader""" 17 | 18 | import copy 19 | import torch 20 | import data_utils 21 | 22 | import mpu 23 | 24 | class DataConfig: 25 | 26 | def __init__(self, defaults={}): 27 | super(DataConfig, self).__init__() 28 | self.defaults = defaults 29 | 30 | def apply(self, args): 31 | if torch.distributed.get_rank() == 0: 32 | print('configuring data') 33 | self.apply_defaults(args) 34 | return make_loaders(args) 35 | 36 | def set_defaults(self, **kwargs): 37 | for k, v in kwargs.items(): 38 | self.defaults[k] = v 39 | 40 | def apply_defaults(self, args): 41 | for k, v in self.defaults.items(): 42 | k = k.replace('-', '_') 43 | if not hasattr(args, k): 44 | setattr(args, k, v) 45 | 46 | 47 | def make_data_loader(dataset, batch_size, args): 48 | 49 | shuffle = args.shuffle 50 | if shuffle: 51 | sampler = data_utils.samplers.RandomSampler(dataset, replacement=True, num_samples=batch_size*args.train_iters) 52 | else: 53 | sampler = torch.utils.data.SequentialSampler(dataset) 54 | world_size = torch.distributed.get_world_size( 55 | group=mpu.get_data_parallel_group()) 56 | rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group()) 57 | distributed = world_size > 1 58 | drop_last = distributed 59 | 60 | if distributed: 61 | batch_sampler = data_utils.samplers.DistributedBatchSampler(sampler, 62 | batch_size, 63 | drop_last, 64 | rank, 65 | world_size) 66 | else: 67 | batch_sampler = torch.utils.data.BatchSampler(sampler, 68 | batch_size, 69 | drop_last) 70 | 71 | data_loader = torch.utils.data.DataLoader(dataset, 72 | batch_sampler=batch_sampler, 73 | num_workers=args.num_workers, 74 | pin_memory=True) 75 | 76 | return data_loader 77 | 78 | 79 | def make_tfrecord_loaders(args): 80 | """Load train/val/test dataset from shuffled TFRecords""" 81 | 82 | import data_utils.tf_dl 83 | data_set_args = {'batch_size': args.batch_size, 84 | 'max_seq_len': args.seq_length, 85 | 'max_preds_per_seq': args.max_preds_per_seq, 86 | 'train': True, 87 | 'num_workers': max(args.num_workers, 1), 88 | 'seed': args.seed + args.rank + 1, 89 | 'threaded_dl': args.num_workers > 0 90 | } 91 | train = data_utils.tf_dl.TFRecordDataLoader(args.train_data, 92 | **data_set_args) 93 | data_set_args['train'] = False 94 | if args.eval_seq_length is not None: 95 | data_set_args['max_seq_len'] = args.eval_seq_length 96 | if args.eval_max_preds_per_seq is not None: 97 | data_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq 98 | valid = None 99 | if args.valid_data is not None: 100 | valid = data_utils.tf_dl.TFRecordDataLoader(args.valid_data, 101 | **data_set_args) 102 | test = None 103 | if args.test_data is not None: 104 | test = data_utils.tf_dl.TFRecordDataLoader(args.test_data, 105 | **data_set_args) 106 | tokenizer = data_utils.make_tokenizer(args.tokenizer_type, 107 | train, 108 | args.tokenizer_path, 109 | args.vocab_size, 110 | args.tokenizer_model_type, 111 | cache_dir=args.cache_dir) 112 | 113 | return (train, valid, test), tokenizer 114 | 115 | 116 | def make_loaders(args): 117 | """makes training/val/test""" 118 | 119 | if args.use_tfrecords: 120 | return make_tfrecord_loaders(args) 121 | world_size = torch.distributed.get_world_size( 122 | group=mpu.get_data_parallel_group()) 123 | batch_size = args.batch_size * world_size 124 | eval_batch_size = batch_size 125 | if args.eval_batch_size is not None: 126 | eval_batch_size = args.eval_batch_size * world_size 127 | seq_length = args.seq_length 128 | if seq_length < 0: 129 | seq_length = seq_length * world_size 130 | 
eval_seq_length = args.eval_seq_length 131 | if eval_seq_length is not None and eval_seq_length < 0: 132 | eval_seq_length = eval_seq_length * world_size 133 | split = get_split(args) 134 | data_set_args = { 135 | 'path': args.train_data, 136 | 'seq_length': seq_length, 137 | 'lazy': args.lazy_loader, 138 | 'delim': args.delim, 139 | 'text_key': args.text_key, 140 | 'label_key': 'label', 141 | 'non_binary_cols': None, 142 | 'ds_type': args.data_set_type, 143 | 'split': split, 144 | 'loose': args.loose_json, 145 | 'tokenizer_type': args.tokenizer_type, 146 | 'tokenizer_model_path': args.tokenizer_path, 147 | 'vocab_size': args.vocab_size, 148 | 'model_type': args.tokenizer_model_type, 149 | 'cache_dir': args.cache_dir, 150 | 'max_preds_per_seq': args.max_preds_per_seq, 151 | 'presplit_sentences': args.presplit_sentences} 152 | 153 | eval_set_args = copy.copy(data_set_args) 154 | eval_set_args['split'] = [1.] 155 | # if optional eval args were set then replace their 156 | # equivalent values in the arg dict 157 | if eval_seq_length: 158 | eval_set_args['seq_length'] = eval_seq_length 159 | if args.eval_max_preds_per_seq: 160 | eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq 161 | if args.eval_text_key is not None: 162 | eval_set_args['text_key'] = args.eval_text_key 163 | 164 | # make datasets splits and tokenizer 165 | train = None 166 | valid = None 167 | test = None 168 | 169 | if args.train_data is not None: 170 | train, tokenizer = data_utils.make_dataset(**data_set_args) 171 | if data_utils.should_split(split): 172 | train, valid, test = train 173 | eval_set_args['tokenizer'] = tokenizer 174 | 175 | # make training and val dataset if necessary 176 | if valid is None and args.valid_data is not None: 177 | eval_set_args['path'] = args.valid_data 178 | valid, tokenizer = data_utils.make_dataset(**eval_set_args) 179 | eval_set_args['tokenizer'] = tokenizer 180 | if test is None and args.test_data is not None: 181 | eval_set_args['path'] = args.test_data 182 | test, tokenizer = data_utils.make_dataset(**eval_set_args) 183 | 184 | # wrap datasets with data loader 185 | if train is not None and args.batch_size > 0: 186 | train = make_data_loader(train, batch_size, args) 187 | args.do_train = True 188 | else: 189 | args.do_train = False 190 | eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size 191 | if valid is not None: 192 | valid = make_data_loader(valid, eval_batch_size, args) 193 | args.do_valid = True 194 | else: 195 | args.do_valid = False 196 | if test is not None: 197 | test = make_data_loader(test, eval_batch_size, args) 198 | args.do_test = True 199 | else: 200 | args.do_test = False 201 | 202 | return (train, valid, test), tokenizer 203 | 204 | def get_split(args): 205 | """ 206 | Get dataset splits from comma separated string list 207 | """ 208 | splits = [] 209 | if args.split.find(',') != -1: 210 | splits = [float(s) for s in args.split.split(',')] 211 | elif args.split.find('/') != -1: 212 | splits = [float(s) for s in args.split.split('/')] 213 | else: 214 | splits = [float(args.split)] 215 | split_total = sum(splits) 216 | if split_total < 1.: 217 | splits.append(1-split_total) 218 | while len(splits) < 3: 219 | splits.append(0.) 220 | splits = splits[:3] 221 | if args.valid_data is not None: 222 | splits[1] = 0. 223 | if args.test_data is not None: 224 | splits[2] = 0. 
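    # (Editor's note: an illustrative walk-through, not part of the original source,
    # assuming args.valid_data and args.test_data are not set.) With
    # args.split == "95,3,2" the list becomes [95.0, 3.0, 2.0] and the
    # normalization below yields [0.95, 0.03, 0.02]; with args.split == "0.9"
    # the remainder 0.1 is appended and the list is padded to [0.9, 0.1, 0.0]
    # before normalizing.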
225 |     final_sum = sum(splits)
226 |     return [s/final_sum for s in splits]
227 | 
228 | def configure_data():
229 | 
230 |     """add cmdline flags for configuring datasets"""
231 |     # These are options that are used by data_utils, but are either
232 |     # deprecated or not meant to be exposed to the command line user.
233 |     # These options are intended to be set in code by specific scripts.
234 |     defaults = {
235 |         'world_size': 1,
236 |         'rank': -1,
237 |         'persist_state': 0,
238 |         'lazy': False,
239 |         'transpose': False,
240 |         'data_set_type': 'supervised',
241 |         'seq_length': 256,
242 |         'eval_seq_length': 256,
243 |         'samples_per_shard': 100
244 |     }
245 | 
246 |     return DataConfig(defaults=defaults)
247 | 
--------------------------------------------------------------------------------
/mpu/random.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | # Parts of the code here are adapted from PyTorch
18 | # repo: https://github.com/pytorch/pytorch
19 | 
20 | import contextlib
21 | 
22 | import torch
23 | from torch import _C
24 | from torch.cuda import _lazy_call, device as device_ctx_manager
25 | from torch.utils.checkpoint import detach_variable
26 | 
27 | from .initialize import get_data_parallel_rank
28 | from .initialize import get_model_parallel_rank
29 | 
30 | 
31 | # Default name for the model parallel rng tracker.
32 | _MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
33 | 
34 | 
35 | def _set_cuda_rng_state(new_state, device=-1):
36 |     """Sets the random number generator state of the current GPU.
37 | 
38 |     Arguments:
39 |         new_state (torch.ByteTensor): The desired state
40 |     This function is adapted from PyTorch repo (torch.cuda.set_rng_state)
41 |     with a single change: the input state is not cloned. Cloning caused
42 |     major performance issues for 4+ GPU cases.
43 |     """
44 |     if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState):
45 |         # older PyTorch
46 |         def cb():
47 |             with device_ctx_manager(device):
48 |                 _C._cuda_setRNGState(new_state)
49 |     else:
50 |         # newer PyTorch
51 |         if device == -1:
52 |             device = torch.device('cuda')
53 |         elif isinstance(device, str):
54 |             device = torch.device(device)
55 |         elif isinstance(device, int):
56 |             device = torch.device('cuda', device)
57 | 
58 |         def cb():
59 |             idx = device.index
60 |             if idx is None:
61 |                 idx = torch.cuda.current_device()
62 |             default_generator = torch.cuda.default_generators[idx]
63 |             default_generator.set_state(new_state)
64 | 
65 |     _lazy_call(cb)
66 | 
67 | 
68 | class CudaRNGStatesTracker:
69 |     """Tracker for the cuda RNG states.
70 | 
71 |     Using the `add` method, a cuda rng state is initialized based on
72 |     the input `seed` and is assigned to `name`. Later, by forking the
73 |     rng state, we can perform operations and return to our starting
74 |     cuda state.
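    Example (an editor's sketch, not in the original source, assuming a
    CUDA device is available):

        tracker = CudaRNGStatesTracker()
        tracker.add('my-rng', seed=1234)
        with tracker.fork('my-rng'):
            torch.randn(2, 2, device='cuda')  # consumes the tracked state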
75 |     """
76 |     def __init__(self):
77 |         # Map from a string name to the cuda rng state.
78 |         self.states_ = {}
79 |         # Seeds are just for bookkeeping and to ensure no seed is set twice.
80 |         self.seeds_ = set()
81 | 
82 |     def reset(self):
83 |         """Set to the initial state (no tracker)."""
84 |         self.states_ = {}
85 |         self.seeds_ = set()
86 | 
87 |     def get_states(self):
88 |         """Get rng states. Copy the dictionary so we have direct
89 |         pointers to the states, not just a pointer to the dictionary."""
90 |         states = {}
91 |         for name in self.states_:
92 |             states[name] = self.states_[name]
93 |         return states
94 | 
95 |     def set_states(self, states):
96 |         """Set the rng states. For efficiency purposes, we do not check
97 |         the size of seed for compatibility."""
98 |         self.states_ = states
99 | 
100 |     def add(self, name, seed):
101 |         """Track the rng state."""
102 |         # Check seed is not already used.
103 |         if seed in self.seeds_:
104 |             raise Exception('seed {} already exists'.format(seed))
105 |         self.seeds_.add(seed)
106 |         # Check that state is not already defined.
107 |         if name in self.states_:
108 |             raise Exception('cuda rng state {} already exists'.format(name))
109 |         # Get the current rng state.
110 |         orig_rng_state = torch.cuda.get_rng_state()
111 |         # Set the new state and store it.
112 |         torch.cuda.manual_seed(seed)
113 |         self.states_[name] = torch.cuda.get_rng_state()
114 |         # Reset rng state to what it was.
115 |         _set_cuda_rng_state(orig_rng_state)
116 | 
117 |     @contextlib.contextmanager
118 |     def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
119 |         """Fork the cuda rng state, perform operations, and exit with
120 |         the original state."""
121 |         # Check if we have added the state
122 |         if name not in self.states_:
123 |             raise Exception('cuda rng state {} is not added'.format(name))
124 |         # Store current rng state.
125 |         orig_cuda_rng_state = torch.cuda.get_rng_state()
126 |         # Set rng state to the desired one
127 |         _set_cuda_rng_state(self.states_[name])
128 |         # Do the stuff we wanted to do.
129 |         try:
130 |             yield
131 |         finally:
132 |             # Update the current rng state for later use.
133 |             self.states_[name] = torch.cuda.get_rng_state()
134 |             # And set the state to the original state we started with.
135 |             _set_cuda_rng_state(orig_cuda_rng_state)
136 | 
137 | 
138 | # RNG tracker object.
139 | _CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
140 | 
141 | 
142 | def get_cuda_rng_tracker():
143 |     """Get cuda rng tracker."""
144 |     return _CUDA_RNG_STATE_TRACKER
145 | 
146 | 
147 | def model_parallel_cuda_manual_seed(seed):
148 |     """Initialize model parallel cuda seed.
149 | 
150 |     This function should be called after the model parallel is
151 |     initialized. Also, no torch.cuda.manual_seed should be called
152 |     after this function. Basically, this is a replacement for that
153 |     function.
154 |     Two sets of RNG states are tracked:
155 |         default state: This is for data parallelism and is the same among a
156 |                        set of model parallel GPUs but different across
157 |                        different model parallel groups. This is used for
158 |                        example for dropout in the non-model-parallel regions.
159 |         model-parallel state: This state is different among a set of model
160 |                        parallel GPUs, but the same across data parallel
161 |                        groups. This is used for example for dropout in
162 |                        model parallel regions.
163 |     """
164 |     # 2718 is just for fun and any POSITIVE value will work.
165 |     offset = seed + 2718
166 |     model_parallel_seed = offset + get_model_parallel_rank()
167 |     # Data parallel gets the original seed.
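    # (Editor's illustration, not in the original source.) With seed=1234 the
    # offset is 3952, so model parallel ranks 0 and 1 seed their model parallel
    # state with 3952 and 3953 respectively, while the default (data parallel)
    # state below is seeded with the original 1234 on every rank.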
168 | data_parallel_seed = seed 169 | 170 | if torch.distributed.get_rank() == 0: 171 | print('> initializing model parallel cuda seeds on global rank {}, ' 172 | 'model parallel rank {}, and data parallel rank {} with ' 173 | 'model parallel seed: {} and data parallel seed: {}'.format( 174 | torch.distributed.get_rank(), get_model_parallel_rank(), 175 | get_data_parallel_rank(), model_parallel_seed, 176 | data_parallel_seed), flush=True) 177 | _CUDA_RNG_STATE_TRACKER.reset() 178 | # Set the default state. 179 | torch.cuda.manual_seed(data_parallel_seed) 180 | # and model parallel state. 181 | _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, 182 | model_parallel_seed) 183 | 184 | 185 | class CheckpointFunction(torch.autograd.Function): 186 | """This function is adapted from torch.utils.checkpoint with 187 | two main changes: 188 | 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` 189 | 2) the states in the model parallel tracker are also properly 190 | tracked/set/reset. 191 | """ 192 | @staticmethod 193 | def forward(ctx, run_function, *args): 194 | ctx.run_function = run_function 195 | 196 | # Copy the rng states. 197 | ctx.fwd_cpu_rng_state = torch.get_rng_state() 198 | ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state() 199 | ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() 200 | 201 | ctx.save_for_backward(*args) 202 | with torch.no_grad(): 203 | outputs = run_function(*args) 204 | return outputs 205 | 206 | @staticmethod 207 | def backward(ctx, *args): 208 | if not torch.autograd._is_checkpoint_valid(): 209 | raise RuntimeError("Checkpointing is not compatible with .grad(), " 210 | "please use .backward() if possible") 211 | inputs = ctx.saved_tensors 212 | 213 | # Store the current states. 214 | bwd_cpu_rng_state = torch.get_rng_state() 215 | bwd_cuda_rng_state = torch.cuda.get_rng_state() 216 | bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() 217 | 218 | # Set the states to what it used to be before the forward pass. 219 | torch.set_rng_state(ctx.fwd_cpu_rng_state) 220 | _set_cuda_rng_state(ctx.fwd_cuda_rng_state) 221 | get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker) 222 | 223 | # Compute the forward pass. 224 | detached_inputs = detach_variable(inputs) 225 | with torch.enable_grad(): 226 | outputs = ctx.run_function(*detached_inputs) 227 | 228 | # Set the states back to what it was at the start of this function. 229 | torch.set_rng_state(bwd_cpu_rng_state) 230 | _set_cuda_rng_state(bwd_cuda_rng_state) 231 | get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker) 232 | 233 | if isinstance(outputs, torch.Tensor): 234 | outputs = (outputs,) 235 | torch.autograd.backward(outputs, args) 236 | return (None,) + tuple(inp.grad for inp in detached_inputs) 237 | 238 | 239 | def checkpoint(function, *args): 240 | """Checkpoint a model or part of the model. 241 | This has been directly copied from torch.utils.checkpoint.""" 242 | return CheckpointFunction.apply(function, *args) 243 | -------------------------------------------------------------------------------- /fp16/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import torch
17 | import mpu
18 | 
19 | # item() is a recent addition, so this helps with backward compatibility.
20 | def to_python_float(t):
21 |     if hasattr(t, 'item'):
22 |         return t.item()
23 |     else:
24 |         return t[0]
25 | 
26 | class LossScaler:
27 |     """
28 |     Class that manages a static loss scale. This class is intended to interact with
29 |     :class:`FP16_Optimizer`, and should not be directly manipulated by the user.
30 | 
31 |     Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to
32 |     :class:`FP16_Optimizer`'s constructor.
33 | 
34 |     Args:
35 |         scale (float, optional, default=1.0): The loss scale.
36 |     """
37 | 
38 |     def __init__(self, scale=1):
39 |         self.cur_scale = scale
40 | 
41 |     # `params` is a list / generator of torch.Variable
42 |     def has_overflow(self, params):
43 |         return False
44 | 
45 |     # `x` is a torch.Tensor
46 |     def _has_inf_or_nan(x):
47 |         return False
48 | 
49 |     def update_scale(self, overflow):
50 |         pass
51 | 
52 |     @property
53 |     def loss_scale(self):
54 |         return self.cur_scale
55 | 
56 |     def scale_gradient(self, module, grad_in, grad_out):
57 |         return tuple(self.loss_scale * g for g in grad_in)
58 | 
59 |     def backward(self, loss, retain_graph=False):
60 |         scaled_loss = loss*self.loss_scale
61 |         scaled_loss.backward(retain_graph=retain_graph)
62 | 
63 | class DynamicLossScaler:
64 |     """
65 |     Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler`
66 |     indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of
67 |     :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler`
68 |     operates, because the default options can be changed using
69 |     the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor.
70 | 
71 |     Loss scaling is designed to combat the problem of underflowing gradients encountered when
72 |     training fp16 networks for long periods. Dynamic loss scaling begins by attempting a very high loss
73 |     scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are
74 |     encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has
75 |     occurred.
76 |     :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch,
77 |     and :class:`DynamicLossScaler` adjusts the loss scale to a lower value.
78 |     If a certain number of iterations occur without overflowing gradients detected,
79 |     :class:`DynamicLossScaler` increases the loss scale once more.
80 |     In this way :class:`DynamicLossScaler` attempts to "ride the edge" of
81 |     always using the highest loss scale possible without incurring overflow.
82 | 
83 |     Args:
84 |         init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler`.
85 |         scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``.
If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. 86 | scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. 87 | """ 88 | 89 | def __init__(self, 90 | init_scale=2**32, 91 | scale_factor=2., 92 | scale_window=1000, 93 | min_scale=1, 94 | delayed_shift=1, 95 | consecutive_hysteresis=False): 96 | self.cur_scale = init_scale 97 | self.cur_iter = 0 98 | self.last_overflow_iter = -1 99 | self.scale_factor = scale_factor 100 | self.scale_window = scale_window 101 | self.min_scale = min_scale 102 | self.delayed_shift = delayed_shift 103 | self.cur_hysteresis = delayed_shift 104 | self.consecutive_hysteresis = consecutive_hysteresis 105 | 106 | # `params` is a list / generator of torch.Variable 107 | def has_overflow_serial(self, params): 108 | for p in params: 109 | if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data): 110 | return True 111 | 112 | return False 113 | 114 | def has_overflow(self, params): 115 | overflow = self.has_overflow_serial(params) 116 | # Since each model parallel GPU carries only part of the model, 117 | # make sure overflow flag is synced across all the model parallel GPUs 118 | overflow_gpu = torch.cuda.ByteTensor([overflow]) 119 | torch.distributed.all_reduce(overflow_gpu, 120 | op=torch.distributed.ReduceOp.MAX, 121 | group=mpu.get_model_parallel_group()) 122 | overflow = overflow_gpu[0].item() 123 | return bool(overflow) 124 | 125 | 126 | # `x` is a torch.Tensor 127 | def _has_inf_or_nan(x): 128 | try: 129 | # if x is half, the .float() incurs an additional deep copy, but it's necessary if 130 | # Pytorch's .sum() creates a one-element tensor of the same type as x 131 | # (which is true for some recent version of pytorch). 132 | cpu_sum = float(x.float().sum()) 133 | # More efficient version that can be used if .sum() returns a Python scalar 134 | # cpu_sum = float(x.sum()) 135 | except RuntimeError as instance: 136 | # We want to check if inst is actually an overflow exception. 137 | # RuntimeError could come from a different error. 138 | # If so, we still want the exception to propagate. 
139 | if "value cannot be converted" not in instance.args[0]: 140 | raise 141 | return True 142 | else: 143 | if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: 144 | return True 145 | return False 146 | 147 | # `overflow` is boolean indicating whether the gradient overflowed 148 | def update_scale(self, overflow): 149 | 150 | if not hasattr(self, 'min_scale'): 151 | self.min_scale = 1 152 | if not hasattr(self, 'delayed_shift'): 153 | self.delayed_shift = 1 154 | if not hasattr(self, 'cur_hysteresis'): 155 | self.cur_hysteresis = 1 156 | if not hasattr(self, 'consecutive_hysteresis'): 157 | self.consecutive_hysteresis = True 158 | if overflow: 159 | # self.cur_scale /= self.scale_factor 160 | if self.delayed_shift == 1 or self.cur_hysteresis == 1: 161 | self.cur_scale = max(self.cur_scale/self.scale_factor, self.min_scale) 162 | else: 163 | self.cur_hysteresis -= 1 164 | self.last_overflow_iter = self.cur_iter 165 | else: 166 | if self.consecutive_hysteresis: 167 | self.cur_hysteresis = self.delayed_shift 168 | if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0: 169 | if not self.consecutive_hysteresis: 170 | self.cur_hysteresis = self.delayed_shift 171 | self.cur_scale *= self.scale_factor 172 | self.cur_iter += 1 173 | 174 | @property 175 | def loss_scale(self): 176 | return self.cur_scale 177 | 178 | def scale_gradient(self, module, grad_in, grad_out): 179 | return tuple(self.loss_scale * g for g in grad_in) 180 | 181 | def backward(self, loss, retain_graph=False): 182 | scaled_loss = loss*self.loss_scale 183 | scaled_loss.backward(retain_graph=retain_graph) 184 | 185 | ############################################################## 186 | # Example usage below here -- assuming it's in a separate file 187 | ############################################################## 188 | """ 189 | TO-DO separate out into an example. 190 | if __name__ == "__main__": 191 | import torch 192 | from torch.autograd import Variable 193 | from dynamic_loss_scaler import DynamicLossScaler 194 | 195 | # N is batch size; D_in is input dimension; 196 | # H is hidden dimension; D_out is output dimension. 197 | N, D_in, H, D_out = 64, 1000, 100, 10 198 | 199 | # Create random Tensors to hold inputs and outputs, and wrap them in Variables. 200 | x = Variable(torch.randn(N, D_in), requires_grad=False) 201 | y = Variable(torch.randn(N, D_out), requires_grad=False) 202 | 203 | w1 = Variable(torch.randn(D_in, H), requires_grad=True) 204 | w2 = Variable(torch.randn(H, D_out), requires_grad=True) 205 | parameters = [w1, w2] 206 | 207 | learning_rate = 1e-6 208 | optimizer = torch.optim.SGD(parameters, lr=learning_rate) 209 | loss_scaler = DynamicLossScaler() 210 | 211 | for t in range(500): 212 | y_pred = x.mm(w1).clamp(min=0).mm(w2) 213 | loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale 214 | print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) 215 | print('Iter {} scaled loss: {}'.format(t, loss.data[0])) 216 | print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) 217 | 218 | # Run backprop 219 | optimizer.zero_grad() 220 | loss.backward() 221 | 222 | # Check for overflow 223 | has_overflow = DynamicLossScaler.has_overflow(parameters) 224 | 225 | # If no overflow, unscale grad and update as usual 226 | if not has_overflow: 227 | for param in parameters: 228 | param.grad.data.mul_(1. 
/ loss_scaler.loss_scale) 229 | optimizer.step() 230 | # Otherwise, don't do anything -- ie, skip iteration 231 | else: 232 | print('OVERFLOW!') 233 | 234 | # Update loss scale for next iteration 235 | loss_scaler.update_scale(has_overflow) 236 | 237 | """ 238 | -------------------------------------------------------------------------------- /generate_samples.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Sample Generate GPT2""" 17 | 18 | import os 19 | import random 20 | import numpy as np 21 | import torch 22 | import torch.nn.functional as F 23 | import argparse 24 | import time 25 | from arguments import get_args 26 | from utils import Timers 27 | from pretrain_gpt2 import initialize_distributed 28 | from pretrain_gpt2 import set_random_seed 29 | from pretrain_gpt2 import get_train_val_test_data 30 | from pretrain_gpt2 import get_masks_and_position_ids 31 | from utils import load_checkpoint 32 | from data_utils import make_tokenizer 33 | from configure_data import configure_data 34 | import mpu 35 | 36 | from fp16 import FP16_Module 37 | from model import GPT2Model 38 | from model import DistributedDataParallel as DDP 39 | from utils import print_rank_0 40 | 41 | def get_model(args): 42 | """Build the model.""" 43 | 44 | print_rank_0('building GPT2 model ...') 45 | model = GPT2Model(num_layers=args.num_layers, 46 | vocab_size=args.vocab_size, 47 | hidden_size=args.hidden_size, 48 | num_attention_heads=args.num_attention_heads, 49 | embedding_dropout_prob=args.hidden_dropout, 50 | attention_dropout_prob=args.attention_dropout, 51 | output_dropout_prob=args.hidden_dropout, 52 | max_sequence_length=args.max_position_embeddings, 53 | checkpoint_activations=args.checkpoint_activations, 54 | checkpoint_num_layers=args.checkpoint_num_layers, 55 | parallel_output=False) 56 | 57 | if mpu.get_data_parallel_rank() == 0: 58 | print(' > number of parameters on model parallel rank {}: {}'.format( 59 | mpu.get_model_parallel_rank(), 60 | sum([p.nelement() for p in model.parameters()])), flush=True) 61 | 62 | # GPU allocation. 63 | model.cuda(torch.cuda.current_device()) 64 | 65 | # Fp16 conversion. 66 | if args.fp16: 67 | model = FP16_Module(model) 68 | 69 | # Wrap model for distributed training. 70 | model = DDP(model) 71 | 72 | return model 73 | 74 | def setup_model(args): 75 | """Setup model and optimizer.""" 76 | 77 | model = get_model(args) 78 | 79 | if args.load is not None: 80 | _ = load_checkpoint( 81 | model, None, None, args) 82 | 83 | return model 84 | 85 | 86 | def get_batch(context_tokens, device, args): 87 | tokens = context_tokens 88 | tokens = tokens.view(args.batch_size, -1).contiguous() 89 | tokens = tokens.to(device) 90 | 91 | # Get the masks and postition ids. 
92 | attention_mask, loss_mask, position_ids = get_masks_and_position_ids( 93 | tokens, 94 | args.eod_token, 95 | args.reset_position_ids, 96 | args.reset_attention_mask) 97 | 98 | return tokens, attention_mask, position_ids 99 | 100 | def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): 101 | # This function has been mostly taken from huggingface conversational ai code at 102 | # https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313 103 | 104 | if top_k > 0: 105 | # Remove all tokens with a probability less than the last token of the top-k 106 | indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] 107 | logits[indices_to_remove] = filter_value 108 | 109 | if top_p > 0.0: 110 | #convert to 1D 111 | logits=logits.view(logits.size()[1]).contiguous() 112 | sorted_logits, sorted_indices = torch.sort(logits, descending=True) 113 | cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) 114 | 115 | # Remove tokens with cumulative probability above the threshold 116 | sorted_indices_to_remove = cumulative_probs > top_p 117 | # Shift the indices to the right to keep also the first token above the threshold 118 | sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() 119 | sorted_indices_to_remove[..., 0] = 0 120 | indices_to_remove = sorted_indices[sorted_indices_to_remove] 121 | logits[indices_to_remove] = filter_value 122 | #going back to 2D 123 | logits=logits.view(1, -1).contiguous() 124 | 125 | return logits 126 | 127 | 128 | def generate_samples(model, tokenizer, args, device): 129 | 130 | context_count=0 131 | model.eval() 132 | with torch.no_grad(): 133 | while True: 134 | torch.distributed.barrier(group=mpu.get_model_parallel_group()) 135 | terminate_runs=0 136 | 137 | if mpu.get_model_parallel_rank() == 0: 138 | raw_text = input("\nContext prompt (stop to exit) >>> ") 139 | while not raw_text: 140 | print('Prompt should not be empty!') 141 | raw_text = input("\nContext prompt (stop to exit) >>> ") 142 | 143 | if "stop" in raw_text: 144 | terminate_runs = 1 145 | else: 146 | context_tokens = tokenizer.EncodeAsIds(raw_text).tokenization 147 | context_length = len(context_tokens) 148 | 149 | if context_length >=args.seq_length//2: 150 | print("\nContext length", context_length, \ 151 | "\nPlease give smaller context (half of the sequence length)!") 152 | continue 153 | else: 154 | context_tokens = tokenizer.EncodeAsIds("EMPTY TEXT").tokenization 155 | context_length = len(context_tokens) 156 | 157 | terminate_runs_tensor = torch.cuda.LongTensor([terminate_runs]) 158 | torch.distributed.broadcast(terminate_runs_tensor, mpu.get_model_parallel_src_rank(), group=mpu.get_model_parallel_group()) 159 | terminate_runs = terminate_runs_tensor[0].item() 160 | 161 | if terminate_runs == 1: 162 | return 163 | 164 | pad_id = tokenizer.get_command('pad').Id 165 | if context_length < args.seq_length: 166 | context_tokens.extend([pad_id] * (args.seq_length - context_length)) 167 | 168 | context_tokens_tensor = torch.cuda.LongTensor(context_tokens) 169 | context_length_tensor = torch.cuda.LongTensor([context_length]) 170 | 171 | torch.distributed.broadcast(context_length_tensor, mpu.get_model_parallel_src_rank(), group=mpu.get_model_parallel_group()) 172 | torch.distributed.broadcast(context_tokens_tensor, mpu.get_model_parallel_src_rank(), group=mpu.get_model_parallel_group()) 173 | 174 | context_length = context_length_tensor[0].item() 175 | tokens, 
attention_mask, position_ids=get_batch(context_tokens_tensor, device, args) 176 | 177 | start_time = time.time() 178 | 179 | counter = 0 180 | org_context_length = context_length 181 | 182 | while counter < (org_context_length + args.out_seq_length): 183 | logits = model(tokens, position_ids, attention_mask) 184 | logits = logits[:, context_length - 1, :] / args.temperature 185 | logits = top_k_logits(logits, top_k=args.top_k, top_p=args.top_p) 186 | log_probs = F.softmax(logits, dim=-1) 187 | prev = torch.multinomial(log_probs, num_samples=1) 188 | tokens[0, context_length] = prev[0] 189 | context_length += 1 190 | counter += 1 191 | 192 | output_tokens_list = tokens.view(-1).contiguous() 193 | decode_tokens = tokenizer.DecodeIds(output_tokens_list.tolist()) 194 | token_end = decode_tokens.find("<|endoftext|>") 195 | 196 | 197 | if mpu.get_model_parallel_rank() == 0 and (counter % 16 == 0 or token_end != -1): 198 | os.system('clear') 199 | print("\nTaken time {:.2f}\n".format(time.time() - start_time), flush=True) 200 | print("\nContext:", raw_text, flush=True) 201 | trim_decode_tokens = decode_tokens[len(raw_text):decode_tokens.find("<|endoftext|>")] 202 | print("\nGPT2:", trim_decode_tokens, flush=True) 203 | if token_end != -1: 204 | break 205 | 206 | if mpu.get_model_parallel_rank() == 0: 207 | os.system('clear') 208 | print("\nTaken time {:.2f}\n".format(time.time() - start_time), flush=True) 209 | print("\nContext:", raw_text, flush=True) 210 | output_tokens_list = tokens.view(-1).contiguous() 211 | decode_tokens = tokenizer.DecodeIds(output_tokens_list.tolist()) 212 | trim_decode_tokens = decode_tokens[len(raw_text):decode_tokens.find("<|endoftext|>")] 213 | print("\nGPT2:", trim_decode_tokens, flush=True) 214 | raw_text = None 215 | 216 | torch.distributed.barrier(group=mpu.get_model_parallel_group()) 217 | context_count += 1 218 | 219 | def prepare_tokenizer(args): 220 | 221 | tokenizer_args = { 222 | 'tokenizer_type': args.tokenizer_type, 223 | 'corpus': None, 224 | 'model_path': args.tokenizer_path, 225 | 'vocab_size': args.vocab_size, 226 | 'model_type': args.tokenizer_model_type, 227 | 'cache_dir': args.cache_dir} 228 | tokenizer = make_tokenizer(**tokenizer_args) 229 | 230 | args.tokenizer_num_tokens = tokenizer.num_tokens 231 | args.tokenizer_num_type_tokens = tokenizer.num_type_tokens 232 | args.eod_token = tokenizer.get_command('eos').Id 233 | 234 | after = tokenizer.num_tokens 235 | while after % mpu.get_model_parallel_world_size() != 0: 236 | after += 1 237 | 238 | args.vocab_size = after 239 | print("prepare tokenizer done", flush=True) 240 | 241 | return tokenizer 242 | 243 | def main(): 244 | """Main training program.""" 245 | 246 | print('Generate Samples') 247 | 248 | # Disable CuDNN. 249 | torch.backends.cudnn.enabled = False 250 | 251 | # Timer. 252 | timers = Timers() 253 | 254 | # Arguments. 255 | args = get_args() 256 | 257 | # Pytorch distributed. 258 | initialize_distributed(args) 259 | 260 | # Random seeds for reproducability. 261 | set_random_seed(args.seed) 262 | 263 | #get the tokenizer 264 | tokenizer = prepare_tokenizer(args) 265 | 266 | # Model, optimizer, and learning rate. 267 | model = setup_model(args) 268 | 269 | #setting default batch size to 1 270 | args.batch_size = 1 271 | 272 | #generate samples 273 | generate_samples(model, tokenizer, args, torch.cuda.current_device()) 274 | 275 | 276 | if __name__ == "__main__": 277 | main() 278 | 279 | 280 | 281 | --------------------------------------------------------------------------------
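Editor's note: the snippet below is an illustrative, self-contained sketch of the top-k/top-p filtering idea used by top_k_logits() in generate_samples.py above. It is a re-implementation written for this edit, not part of the repository, and the name filter_logits is made up for the example.

import torch
import torch.nn.functional as F

def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """Mask a 1D logits tensor with top-k and/or nucleus (top-p) filtering."""
    logits = logits.clone()
    if top_k > 0:
        # Mask every logit smaller than the k-th largest one.
        kth_value = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_value] = filter_value
    if top_p > 0.0:
        # Mask the low-probability tail whose cumulative mass exceeds top_p.
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        to_remove = cumulative_probs > top_p
        # Shift right so the first token above the threshold is kept.
        to_remove[..., 1:] = to_remove[..., :-1].clone()
        to_remove[..., 0] = False
        logits[sorted_indices[to_remove]] = filter_value
    return logits

if __name__ == '__main__':
    toy_logits = torch.tensor([2.0, 1.0, 0.5, -1.0])
    print(filter_logits(toy_logits, top_k=2))    # only the two largest logits survive
    print(filter_logits(toy_logits, top_p=0.9))  # the lowest-probability token is masked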