├── hostfile ├── megatron ├── mpu │ ├── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_cross_entropy.py │ │ └── test_random.py │ ├── __init__.py │ ├── utils.py │ ├── data.py │ ├── mappings.py │ ├── cross_entropy.py │ └── grads.py ├── data │ ├── __init__.py │ ├── helpers.cpython-38-x86_64-linux-gnu.so │ ├── test │ │ ├── test_preprocess_data.sh │ │ └── test_indexed_dataset.py │ ├── Makefile │ ├── ict_dataset.py │ ├── samplers.py │ └── realm_dataset_utils.py ├── tokenizer │ ├── __init__.py │ └── t5_tokenization.py ├── deprecated_data_utils │ ├── scripts │ │ ├── presplit_sentences_json.py │ │ ├── split_json.py │ │ └── split_gpt2_json.py │ ├── corpora.py │ ├── tf_dl.py │ ├── samplers.py │ ├── __init__.py │ └── lazy_loader.py ├── model │ ├── __init__.py │ ├── utils.py │ ├── fused_bias_gelu.py │ ├── classification.py │ ├── multiple_choice.py │ ├── distributed.py │ ├── fused_softmax.py │ └── bert_model.py ├── fp16 │ └── __init__.py ├── module.py ├── package_info.py ├── __init__.py ├── fused_kernels │ ├── scaled_upper_triang_masked_softmax.cpp │ ├── scaled_masked_softmax.cpp │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ ├── scaled_masked_softmax_cuda.cu │ └── __init__.py ├── indexer.py ├── learning_rates.py └── memory.py ├── .gitignore ├── doc └── release-note.md ├── ds_config_gpt2.json ├── ds_config_t5.json ├── LICENSE ├── run_t5.sh ├── run_gpt2.sh ├── pretrain_gpt2.py ├── pretrain_t5.py └── tds └── __init__.py /hostfile: -------------------------------------------------------------------------------- 1 | node1 slots=8 2 | -------------------------------------------------------------------------------- /megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import indexed_dataset 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | upload.sh 3 | results/ 4 | results-local 5 | __pycache__ 6 | 7 | -------------------------------------------------------------------------------- /megatron/data/helpers.cpython-38-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TsinghuaAI/CPM-1-Pretrain/HEAD/megatron/data/helpers.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /doc/release-note.md: -------------------------------------------------------------------------------- 1 | ## v0.1.0 2 | 3 | ### Functions 4 | 5 | - Support multi-node, multi-GPU pre-training of GPT-2 and T5 6 | - Support data parallelism, model parallelism, pipeline parallelism, and mixed-precision computation 7 | 8 | ### Performance 9 | 10 | - GPT-2 and T5 run stably 11 | -------------------------------------------------------------------------------- /ds_config_gpt2.json: -------------------------------------------------------------------------------- 1 | {"train_batch_size": 64, "train_micro_batch_size_per_gpu": 4, "steps_per_print": 10, "gradient_clipping": 1.0, "fp16": {"enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "hysteresis": 2, "initial_scale_power": 16}, "zero_allow_untested_optimizer": true, "wall_clock_breakdown": true} -------------------------------------------------------------------------------- /ds_config_t5.json: -------------------------------------------------------------------------------- 1 | {"train_batch_size": 128, "train_micro_batch_size_per_gpu": 4, "steps_per_print": 10, "gradient_clipping": 1.0, "fp16": {"enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "hysteresis": 2, "initial_scale_power": 16}, "zero_allow_untested_optimizer": true, "wall_clock_breakdown": true} -------------------------------------------------------------------------------- /megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | -------------------------------------------------------------------------------- /megatron/deprecated_data_utils/scripts/presplit_sentences_json.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python scripts/presplit_sentences_json.py 4 | """ 5 | 6 | import sys 7 | import json 8 | 9 | import nltk 10 | 11 | nltk.download('punkt') 12 | 13 | input_file = sys.argv[1] 14 | output_file = sys.argv[2] 15 | 16 | line_seperator = "\n" 17 | 18 | with open(input_file, 'r') as ifile: 19 | with open(output_file, "w") as ofile: 20 | for doc in ifile.readlines(): 21 | parsed = json.loads(doc) 22 | sent_list = [] 23 | for line in parsed['text'].split('\n'): 24 | if line != '\n': 25 | sent_list.extend(nltk.tokenize.sent_tokenize(line)) 26 | parsed['text'] = line_seperator.join(sent_list) 27 | ofile.write(json.dumps(parsed) + '\n') 28 | -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .distributed import * 17 | from .bert_model import BertModel 18 | from .realm_model import ICTBertModel 19 | from .gpt2_model import GPT2Model, GPT2ModelPipe 20 | from .utils import get_params_for_weight_decay_optimization 21 | from .language_model import get_language_model 22 | from .t5_model import T5ModelPipe, T5Model -------------------------------------------------------------------------------- /megatron/fp16/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from .fp16util import ( 16 | BN_convert_float, 17 | network_to_half, 18 | prep_param_lists, 19 | model_grads_to_master_grads, 20 | master_params_to_model_params, 21 | tofp16, 22 | to_python_float, 23 | clip_grad_norm, 24 | convert_module, 25 | convert_network, 26 | FP16Model, 27 | ) 28 | 29 | from .fp16 import * 30 | from .loss_scaler import * 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Tsinghua AI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /megatron/module.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Megatron Module""" 17 | 18 | import torch 19 | 20 | 21 | class MegatronModule(torch.nn.Module): 22 | """Megatron specific extentions of torch Module.""" 23 | 24 | def __init__(self): 25 | super(MegatronModule, self).__init__() 26 | 27 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 28 | keep_vars=False): 29 | """Use this function to override the state dict for 30 | saving checkpoints.""" 31 | return self.state_dict(destination, prefix, keep_vars) 32 | -------------------------------------------------------------------------------- /megatron/package_info.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | MAJOR = 1 17 | MINOR = 1.5 18 | 19 | # Use the following formatting: (major, minor) 20 | VERSION = (MAJOR, MINOR) 21 | 22 | __version__ = '.'.join(map(str, VERSION)) 23 | __package_name__ = 'megatron-lm' 24 | __contact_names__ = 'NVIDIA INC' 25 | __url__ = 'https://github.com/NVIDIA/Megatron-LM' 26 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' 27 | __description__ = 'Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism.' 28 | __license__ = 'See https://github.com/NVIDIA/Megatron-LM/blob/master/LICENSE' 29 | __keywords__ = 'deep learning, Megatron, gpu, NLP, nvidia, pytorch, torch, language' 30 | 31 | -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | 17 | from .package_info import ( 18 | __description__, 19 | __contact_names__, 20 | __url__, 21 | __download_url__, 22 | __keywords__, 23 | __license__, 24 | __package_name__, 25 | __version__, 26 | ) 27 | 28 | from .global_vars import get_args 29 | from .global_vars import get_tokenizer 30 | from .global_vars import get_tensorboard_writer 31 | from .global_vars import get_adlr_autoresume 32 | from .global_vars import get_timers 33 | from .initialize import initialize_megatron 34 | 35 | def print_rank_0(message): 36 | """If distributed is initialized print only on rank 0.""" 37 | if torch.distributed.is_initialized(): 38 | if torch.distributed.get_rank() == 0: 39 | print(message, flush=True) 40 | else: 41 | print(message, flush=True) 42 | -------------------------------------------------------------------------------- /megatron/deprecated_data_utils/corpora.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """several datasets with preset arguments""" 16 | from .datasets import json_dataset, csv_dataset 17 | import os 18 | 19 | 20 | class wikipedia(json_dataset): 21 | """ 22 | dataset for wikipedia with arguments configured for convenience 23 | 24 | command line usage: `--train-data wikipedia` 25 | """ 26 | PATH = 'data/wikipedia/wikidump_lines.json' 27 | assert_str = "make sure to set PATH for wikipedia data_utils/corpora.py" 28 | 29 | def __init__(self, **kwargs): 30 | assert os.path.exists(wikipedia.PATH), \ 31 | wikipedia.assert_str 32 | if not kwargs: 33 | kwargs = {} 34 | kwargs['text_key'] = 'text' 35 | kwargs['loose_json'] = True 36 | super(wikipedia, self).__init__(wikipedia.PATH, **kwargs) 37 | 38 | 39 | class webtext(json_dataset): 40 | """ 41 | dataset for webtext with arguments configured for convenience 42 | 43 | command line usage: `--train-data webtext` 44 | """ 45 | PATH = 'data/webtext/data.json' 46 | assert_str = "make sure to set PATH for webtext data_utils/corpora.py" 47 | 48 | def __init__(self, **kwargs): 49 | assert os.path.exists(webtext.PATH), \ 50 | webtext.assert_str 51 | if not kwargs: 52 | kwargs = {} 53 | kwargs['text_key'] = 'text' 54 | kwargs['loose_json'] = True 55 | super(webtext, self).__init__(webtext.PATH, **kwargs) 56 | 57 | 58 | NAMED_CORPORA = { 59 | 'wikipedia': wikipedia, 60 | 'webtext': webtext, 61 | } 62 | -------------------------------------------------------------------------------- /megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .grads import clip_grad_norm 23 | 24 | from .initialize import is_unitialized 25 | from .initialize import destroy_model_parallel 26 | from .initialize import get_data_parallel_group 27 | from .initialize import get_data_parallel_rank 28 | from .initialize import get_data_parallel_world_size 29 | from .initialize import get_model_parallel_group 30 | from .initialize import get_model_parallel_rank, set_model_parallel_rank 31 | from .initialize import get_model_parallel_src_rank 32 | from .initialize import get_model_parallel_world_size, set_model_parallel_world_size 33 | from .initialize import get_topology 34 | from .initialize import get_pipe_parallel_group 35 | from .initialize import get_pipe_parallel_rank 36 | from .initialize import get_pipe_parallel_world_size 37 | from .initialize import get_io_parallel_group 38 | from .initialize import initialize_model_parallel 39 | from .initialize import model_parallel_is_initialized 40 | 41 | from .layers import LayerNorm 42 | from .layers import ColumnParallelLinear 43 | from .layers import RowParallelLinear 44 | from .layers import VocabParallelEmbedding 45 | 46 | from .mappings import copy_to_model_parallel_region 47 | from .mappings import gather_from_model_parallel_region 48 | from .mappings import reduce_from_model_parallel_region 49 | from .mappings import scatter_to_model_parallel_region 50 | 51 | from .random import checkpoint 52 | from .random import get_cuda_rng_tracker 53 | from .random import init_checkpointed_activations_memory_buffer 54 | from .random import model_parallel_cuda_manual_seed 55 | from .random import reset_checkpointed_activations_memory_buffer 56 | 57 | from .utils import divide 58 | from .utils import split_tensor_along_last_dim 59 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | namespace multihead_attn { 22 | namespace fused_softmax { 23 | namespace scaled_upper_triang_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda( 26 | torch::Tensor const& input, 27 | float scale_factor); 28 | 29 | torch::Tensor bwd_cuda( 30 | torch::Tensor const& output_grads, 31 | torch::Tensor const& softmax_results, 32 | float scale_factor); 33 | 34 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) { 35 | AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); 36 | AT_ASSERTM(input.scalar_type() == at::ScalarType::Half, 37 | "Only HALF is supported"); 38 | 39 | return fwd_cuda(input, scale_factor); 40 | } 41 | 42 | torch::Tensor bwd( 43 | torch::Tensor const& output_grads, 44 | torch::Tensor const& softmax_results, 45 | float scale_factor) { 46 | 47 | AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); 48 | AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); 49 | 50 | AT_ASSERTM(output_grads.scalar_type() == at::ScalarType::Half, 51 | "Only HALF is supported"); 52 | AT_ASSERTM(softmax_results.scalar_type() == at::ScalarType::Half, 53 | "Only HALF is supported"); 54 | 55 | return bwd_cuda(output_grads, softmax_results, scale_factor); 56 | } 57 | 58 | } // end namespace scaled_upper_triang_masked_softmax 59 | } // end namespace fused_softmax 60 | } // end namespace multihead_attn 61 | 62 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 63 | m.def("forward", 64 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, 65 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 66 | m.def("backward", 67 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, 68 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 69 | } 70 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | namespace multihead_attn { 22 | namespace fused_softmax { 23 | namespace scaled_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda( 26 | torch::Tensor const& input, 27 | torch::Tensor const& mask, 28 | float scale_factor); 29 | 30 | torch::Tensor bwd_cuda( 31 | torch::Tensor const& output_grads, 32 | torch::Tensor const& softmax_results, 33 | float scale_factor); 34 | 35 | torch::Tensor fwd( 36 | torch::Tensor const& input, 37 | torch::Tensor const& mask, 38 | float scale_factor) { 39 | AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); 40 | AT_ASSERTM(input.scalar_type() == at::ScalarType::Half, 41 | "Only HALF is supported"); 42 | AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); 43 | 44 | return fwd_cuda(input, mask, scale_factor); 45 | } 46 | 47 | torch::Tensor bwd( 48 | torch::Tensor const& output_grads, 49 | torch::Tensor const& softmax_results, 50 | float scale_factor) { 51 | 52 | AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); 53 | AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); 54 | 55 | AT_ASSERTM(output_grads.scalar_type() == at::ScalarType::Half, 56 | "Only HALF is supported"); 57 | AT_ASSERTM(softmax_results.scalar_type() == at::ScalarType::Half, 58 | "Only HALF is supported"); 59 | 60 | return bwd_cuda(output_grads, softmax_results, scale_factor); 61 | } 62 | 63 | } // end namespace scaled_masked_softmax 64 | } // end namespace fused_softmax 65 | } // end namespace multihead_attn 66 | 67 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 68 | m.def("forward", 69 | &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, 70 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 71 | m.def("backward", 72 | &multihead_attn::fused_softmax::scaled_masked_softmax::bwd, 73 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 74 | } 75 | -------------------------------------------------------------------------------- /megatron/mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import argparse 17 | import os 18 | import random 19 | import numpy 20 | import torch 21 | 22 | import mpu 23 | 24 | 25 | class IdentityLayer(torch.nn.Module): 26 | def __init__(self, size, scale=1.0): 27 | super(IdentityLayer, self).__init__() 28 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 29 | 30 | def forward(self): 31 | return self.weight 32 | 33 | 34 | def set_random_seed(seed): 35 | """Set random seed for reproducability.""" 36 | random.seed(seed) 37 | numpy.random.seed(seed) 38 | torch.manual_seed(seed) 39 | mpu.model_parallel_cuda_manual_seed(seed) 40 | 41 | 42 | def initialize_distributed(backend='nccl'): 43 | """Initialize torch.distributed.""" 44 | # Get local rank in case it is provided. 
45 | parser = argparse.ArgumentParser() 46 | parser.add_argument('--local_rank', type=int, default=None, 47 | help='local rank passed from distributed launcher') 48 | args = parser.parse_args() 49 | local_rank = args.local_rank 50 | 51 | # Get rank and world size. 52 | rank = int(os.getenv('RANK', '0')) 53 | world_size = int(os.getenv("WORLD_SIZE", '1')) 54 | 55 | print('> initializing torch.distributed with local rank: {}, ' 56 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 57 | 58 | # Set the device id. 59 | device = rank % torch.cuda.device_count() 60 | if local_rank is not None: 61 | device = local_rank 62 | torch.cuda.set_device(device) 63 | 64 | # Call the init process. 65 | init_method = 'tcp://' 66 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 67 | master_port = os.getenv('MASTER_PORT', '6000') 68 | init_method += master_ip + ':' + master_port 69 | torch.distributed.init_process_group( 70 | backend=backend, 71 | world_size=world_size, 72 | rank=rank, 73 | init_method=init_method) 74 | 75 | 76 | def print_separator(message): 77 | torch.distributed.barrier() 78 | filler_len = (78 - len(message)) // 2 79 | filler = '-' * filler_len 80 | string = '\n' + filler + ' {} '.format(message) + filler 81 | if torch.distributed.get_rank() == 0: 82 | print(string, flush=True) 83 | torch.distributed.barrier() 84 | -------------------------------------------------------------------------------- /megatron/mpu/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | 20 | def ensure_divisibility(numerator, denominator): 21 | """Ensure that numerator is divisible by the denominator.""" 22 | assert numerator % denominator == 0, '{} is not divisible by {}'.format( 23 | numerator, denominator) 24 | 25 | 26 | def divide(numerator, denominator): 27 | """Ensure that numerator is divisible by the denominator and return 28 | the division value.""" 29 | ensure_divisibility(numerator, denominator) 30 | return numerator // denominator 31 | 32 | 33 | def split_tensor_along_last_dim(tensor, num_partitions, 34 | contiguous_split_chunks=False): 35 | """Split a tensor along its last dimension. 36 | Arguments: 37 | tensor: input tensor. 38 | num_partitions: number of partitions to split the tensor 39 | contiguous_split_chunks: If True, make each chunk contiguous 40 | in memory. 41 | """ 42 | # Get the size and dimension. 43 | last_dim = tensor.dim() - 1 44 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 45 | # Split. 46 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 47 | # Note: torch.split does not create contiguous tensors by default. 
48 | if contiguous_split_chunks: 49 | return tuple(chunk.contiguous() for chunk in tensor_list) 50 | 51 | return tensor_list 52 | 53 | 54 | class VocabUtility: 55 | """Split the vocabulary into `world_size` chunks amd return the 56 | first and last index of the vocabulary belonging to the `rank` 57 | partition: Note that indecies in [fist, last)""" 58 | 59 | @staticmethod 60 | def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, 61 | rank, world_size): 62 | index_f = rank * per_partition_vocab_size 63 | index_l = index_f + per_partition_vocab_size 64 | return index_f, index_l 65 | 66 | @staticmethod 67 | def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size): 68 | per_partition_vocab_size = divide(global_vocab_size, world_size) 69 | return VocabUtility.vocab_range_from_per_partition_vocab_size( 70 | per_partition_vocab_size, rank, world_size) 71 | -------------------------------------------------------------------------------- /megatron/model/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Utilities for models.""" 17 | 18 | import math 19 | 20 | import torch 21 | 22 | from .transformer import LayerNorm 23 | 24 | 25 | def init_method_normal(sigma): 26 | """Init method based on N(0, sigma).""" 27 | def init_(tensor): 28 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 29 | 30 | return init_ 31 | 32 | 33 | def scaled_init_method_normal(sigma, num_layers): 34 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 35 | std = sigma / math.sqrt(2.0 * num_layers) 36 | 37 | def init_(tensor): 38 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 39 | 40 | return init_ 41 | 42 | 43 | def get_linear_layer(rows, columns, init_method): 44 | """Simple linear layer with weight initialization.""" 45 | layer = torch.nn.Linear(rows, columns) 46 | init_method(layer.weight) 47 | with torch.no_grad(): 48 | layer.bias.zero_() 49 | return layer 50 | 51 | @torch.jit.script 52 | def gelu_impl(x): 53 | """OpenAI's gelu implementation.""" 54 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * 55 | (1.0 + 0.044715 * x * x))) 56 | def openai_gelu(x): 57 | return gelu_impl(x) 58 | 59 | #This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter 60 | @torch.jit.script 61 | def erf_gelu(x): 62 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) 63 | 64 | def get_params_for_weight_decay_optimization(module): 65 | """Divide params into with-weight-decay and without-weight-decay groups. 66 | Layernorms and baises will have no weight decay but the rest will. 
67 | """ 68 | weight_decay_params = {'params': []} 69 | no_weight_decay_params = {'params': [], 'weight_decay': 0.0} 70 | for module_ in module.modules(): 71 | if isinstance(module_, LayerNorm): 72 | no_weight_decay_params['params'].extend( 73 | [p for p in list(module_._parameters.values()) 74 | if p is not None]) 75 | else: 76 | weight_decay_params['params'].extend( 77 | [p for n, p in list(module_._parameters.items()) 78 | if p is not None and n != 'bias']) 79 | no_weight_decay_params['params'].extend( 80 | [p for n, p in list(module_._parameters.items()) 81 | if p is not None and n == 'bias']) 82 | 83 | return weight_decay_params, no_weight_decay_params 84 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "THC/THC.h" 23 | #include 24 | #include 25 | #include "scaled_upper_triang_masked_softmax.h" 26 | 27 | namespace multihead_attn { 28 | namespace fused_softmax { 29 | namespace scaled_upper_triang_masked_softmax { 30 | 31 | torch::Tensor fwd_cuda( 32 | torch::Tensor const& input, 33 | float scale_factor) 34 | { 35 | // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 36 | const int attn_batches = input.size(0); 37 | const int seq_len = input.size(1); 38 | TORCH_INTERNAL_ASSERT(seq_len <= 2048); 39 | 40 | // Output 41 | auto act_options = input.options().requires_grad(false); 42 | torch::Tensor softmax_results = 43 | torch::empty({attn_batches, seq_len, seq_len}, act_options); 44 | 45 | // Softmax Intermediate Result Ptr 46 | void* input_ptr = static_cast(input.data_ptr()); 47 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 48 | 49 | dispatch_scaled_upper_triang_masked_softmax_forward( 50 | reinterpret_cast(softmax_results_ptr), 51 | reinterpret_cast(input_ptr), 52 | scale_factor, 53 | seq_len, 54 | seq_len, 55 | attn_batches); 56 | return softmax_results; 57 | } 58 | 59 | torch::Tensor bwd_cuda( 60 | torch::Tensor const& output_grads_, 61 | torch::Tensor const& softmax_results_, 62 | float scale_factor) { 63 | 64 | auto output_grads = output_grads_.contiguous(); 65 | auto softmax_results = softmax_results_.contiguous(); 66 | 67 | //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 68 | const int attn_batches = output_grads.size(0); 69 | const int seq_len = output_grads.size(1); 70 | TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2)); 71 | 72 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 73 | 74 | //Softmax Grad 75 | dispatch_scaled_upper_triang_masked_softmax_backward( 76 | reinterpret_cast(output_grads_ptr), 77 | 
reinterpret_cast(output_grads_ptr), 78 | reinterpret_cast(softmax_results.data_ptr()), 79 | scale_factor, 80 | seq_len, 81 | seq_len, 82 | attn_batches); 83 | 84 | //backward pass is completely in-place 85 | return output_grads; 86 | } 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import print_separator 17 | from commons import initialize_distributed 18 | from mpu import data as data_utils 19 | import mpu 20 | import torch 21 | import functools 22 | import operator 23 | import sys 24 | sys.path.append("../..") 25 | 26 | 27 | def test_boradcast_data(model_parallel_size): 28 | 29 | if torch.distributed.get_rank() == 0: 30 | print('> testing boradcast_data with model parallel size {} ...'. 31 | format(model_parallel_size)) 32 | 33 | mpu.initialize_model_parallel(model_parallel_size) 34 | torch.manual_seed(1234 + mpu.get_data_parallel_rank()) 35 | model_parallel_size = mpu.get_model_parallel_world_size() 36 | 37 | key_size_t = {'key1': [7, 11], 38 | 'key2': [8, 2, 1], 39 | 'key3': [13], 40 | 'key4': [5, 1, 2], 41 | 'key5': [5, 12]} 42 | keys = list(key_size_t.keys()) 43 | 44 | data = {} 45 | data_t = {} 46 | for key in key_size_t: 47 | data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) 48 | data_t[key] = data[key].clone() 49 | data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) 50 | data_t['keyX'] = data['keyX'].clone() 51 | if mpu.get_model_parallel_rank() != 0: 52 | data = None 53 | 54 | data_utils._check_data_types(keys, data_t, torch.int64) 55 | key_size, key_numel, \ 56 | total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) 57 | for key in keys: 58 | assert key_size[key] == key_size_t[key] 59 | total_numel_t = 0 60 | for key in keys: 61 | target_size = functools.reduce(operator.mul, key_size_t[key], 1) 62 | assert key_numel[key] == target_size 63 | total_numel_t += target_size 64 | assert total_numel == total_numel_t 65 | 66 | data_b = data_utils.broadcast_data(keys, data, torch.int64) 67 | for key in keys: 68 | tensor = data_t[key].cuda() 69 | assert data_b[key].sub(tensor).abs().max() == 0 70 | 71 | # Reset groups 72 | mpu.destroy_model_parallel() 73 | 74 | torch.distributed.barrier() 75 | if torch.distributed.get_rank() == 0: 76 | print('>> passed the test :-)') 77 | 78 | 79 | if __name__ == '__main__': 80 | 81 | initialize_distributed() 82 | world_size = torch.distributed.get_world_size() 83 | 84 | model_parallel_size = 1 85 | while model_parallel_size <= world_size: 86 | print_separator('test test boradcast data') 87 | test_boradcast_data(model_parallel_size) 88 | model_parallel_size *= 2 89 | -------------------------------------------------------------------------------- 
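Editor's note (not a file from this repository): the test above exercises mpu.broadcast_data, and the sketch below shows the usage pattern it verifies inside a training step. Only the broadcast_data(keys, data, datatype) call and the rank-0 convention are taken from the test; the helper name get_batch, the data_iterator argument, and the 'text' key are illustrative assumptions.

# Minimal usage sketch for mpu.broadcast_data (assumes torch.distributed and
# the mpu model-parallel groups are already initialized, as in the test above).
import torch
import mpu

def get_batch(data_iterator):  # hypothetical helper, not part of the repo
    keys = ['text']  # illustrative key; any dict of tensors of one dtype works
    if mpu.get_model_parallel_rank() == 0:
        # Only rank 0 of each model-parallel group reads from the dataset.
        data = next(data_iterator)  # e.g. {'text': LongTensor of token ids}
    else:
        data = None
    # Rank 0 broadcasts the tensor sizes and values to its model-parallel group.
    data_b = mpu.broadcast_data(keys, data, torch.int64)
    return data_b['text'].long()

Reading the data on a single rank per model-parallel group and broadcasting it keeps every rank of that group working on the same batch without duplicating dataset I/O.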
/megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | torch._C._jit_set_profiling_mode(False) 19 | torch._C._jit_set_profiling_executor(False) 20 | torch._C._jit_override_can_fuse_on_cpu(True) 21 | torch._C._jit_override_can_fuse_on_gpu(True) 22 | 23 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 24 | # 1/sqrt(2*pi)-> 0.3989423 25 | # 1/sqrt(2) -> 0.70710678 26 | # sqrt(2/pi) -> 0.79788456 27 | # this function is tanh approximation of gelu 28 | # actual gelu is: 29 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 30 | 31 | @torch.jit.script 32 | def bias_gelu(bias, y): 33 | x = bias + y 34 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 35 | 36 | # gradient of tanh approximation of gelu 37 | # gradient of actual gelu is: 38 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 39 | @torch.jit.script 40 | def bias_gelu_back(g, bias, y): 41 | x = bias + y 42 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 43 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 44 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 45 | return ff*g 46 | 47 | @torch.jit.script 48 | def gelu(x): 49 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 50 | 51 | # gradient of tanh approximation of gelu 52 | # gradient of actual gelu is: 53 | # 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 54 | @torch.jit.script 55 | def gelu_back(g, x): 56 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 57 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 58 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 59 | return ff*g 60 | 61 | class GeLUFunction(torch.autograd.Function): 62 | @staticmethod 63 | # bias is an optional argument 64 | def forward(ctx, input, bias): 65 | ctx.save_for_backward(input, bias) 66 | return bias_gelu(bias, input) 67 | 68 | @staticmethod 69 | def backward(ctx, grad_output): 70 | input, bias = ctx.saved_tensors 71 | tmp = bias_gelu_back(grad_output, bias, input) 72 | return tmp, tmp 73 | 74 | class GeLUFunctionWithoutBias(torch.autograd.Function): 75 | @staticmethod 76 | # bias is an optional argument 77 | def forward(ctx, input): 78 | ctx.save_for_backward(input) 79 | return gelu(input) 80 | 81 | @staticmethod 82 | def backward(ctx, grad_output): 83 | input = ctx.saved_tensors 84 | tmp = gelu_back(grad_output, input) 85 | return tmp 86 | 87 | bias_gelu_impl = GeLUFunction.apply 88 | gelu_impl = GeLUFunctionWithoutBias.apply 89 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_initialize.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import print_separator 17 | from commons import initialize_distributed 18 | import mpu 19 | import torch 20 | import sys 21 | sys.path.append("../..") 22 | 23 | 24 | def test_initialize_model_parallel(model_parallel_size): 25 | 26 | if torch.distributed.get_rank() == 0: 27 | print('> testing initialize_model_parallel with size {} ...'.format( 28 | model_parallel_size)) 29 | model_parallel_size_ = min(model_parallel_size, 30 | torch.distributed.get_world_size()) 31 | assert not mpu.model_parallel_is_initialized() 32 | mpu.initialize_model_parallel(model_parallel_size_) 33 | assert mpu.model_parallel_is_initialized() 34 | 35 | # Checks. 36 | def check(group, world_size, rank): 37 | assert world_size == torch.distributed.get_world_size(group=group) 38 | assert rank == torch.distributed.get_rank(group=group) 39 | 40 | # Model parallel. 41 | world_size = model_parallel_size_ 42 | rank = torch.distributed.get_rank() % model_parallel_size_ 43 | assert world_size == mpu.get_model_parallel_world_size() 44 | assert rank == mpu.get_model_parallel_rank() 45 | check(mpu.get_model_parallel_group(), world_size, rank) 46 | 47 | # Data parallel. 
48 | world_size = torch.distributed.get_world_size() // model_parallel_size_ 49 | rank = torch.distributed.get_rank() // model_parallel_size 50 | assert world_size == mpu.get_data_parallel_world_size() 51 | assert rank == mpu.get_data_parallel_rank() 52 | check(mpu.get_data_parallel_group(), world_size, rank) 53 | 54 | # Reset groups 55 | mpu.destroy_model_parallel() 56 | 57 | torch.distributed.barrier() 58 | if torch.distributed.get_rank() == 0: 59 | print('>> passed the test :-)') 60 | 61 | 62 | def test_get_model_parallel_src_rank(model_parallel_size_): 63 | 64 | if torch.distributed.get_rank() == 0: 65 | print('> testing get_model_parallel_src_rank with size {} ...'.format( 66 | model_parallel_size_)) 67 | model_parallel_size = min(model_parallel_size_, 68 | torch.distributed.get_world_size()) 69 | assert not mpu.model_parallel_is_initialized() 70 | mpu.initialize_model_parallel(model_parallel_size) 71 | assert mpu.model_parallel_is_initialized() 72 | 73 | # Checks 74 | src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank() 75 | assert mpu.get_model_parallel_src_rank() == src_rank 76 | 77 | # Reset groups 78 | mpu.destroy_model_parallel() 79 | 80 | torch.distributed.barrier() 81 | if torch.distributed.get_rank() == 0: 82 | print('>> passed the test :-)') 83 | 84 | 85 | if __name__ == '__main__': 86 | 87 | initialize_distributed() 88 | world_size = torch.distributed.get_world_size() 89 | model_parallel_size = 1 90 | while model_parallel_size <= world_size: 91 | print_separator('test initialize model parallel') 92 | test_initialize_model_parallel(model_parallel_size) 93 | print_separator('test model parallel source rank') 94 | test_get_model_parallel_src_rank(model_parallel_size) 95 | model_parallel_size *= 2 96 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "THC/THC.h" 23 | #include 24 | #include 25 | #include "scaled_masked_softmax.h" 26 | 27 | namespace multihead_attn { 28 | namespace fused_softmax { 29 | namespace scaled_masked_softmax { 30 | 31 | torch::Tensor fwd_cuda( 32 | torch::Tensor const& input, 33 | torch::Tensor const& mask, 34 | float scale_factor) 35 | { 36 | // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 37 | const int batches = input.size(0); 38 | const int pad_batches = mask.size(0); 39 | const int attn_heads = input.size(1); 40 | const int seq_len = input.size(2); 41 | TORCH_INTERNAL_ASSERT(seq_len <= 2048); 42 | TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); 43 | TORCH_INTERNAL_ASSERT(mask.size(1) == 1); 44 | TORCH_INTERNAL_ASSERT(mask.size(2) == seq_len); 45 | TORCH_INTERNAL_ASSERT(mask.size(3) == seq_len); 46 | 47 | // Output 48 | auto act_options = input.options().requires_grad(false); 49 | torch::Tensor softmax_results = 50 | torch::empty({batches, attn_heads, seq_len, seq_len}, act_options); 51 | 52 | // Softmax Intermediate Result Ptr 53 | void* input_ptr = static_cast(input.data_ptr()); 54 | void* mask_ptr = static_cast(mask.data_ptr()); 55 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 56 | 57 | dispatch_scaled_masked_softmax_forward( 58 | reinterpret_cast(softmax_results_ptr), 59 | reinterpret_cast(input_ptr), 60 | reinterpret_cast(mask_ptr), 61 | scale_factor, 62 | seq_len, 63 | seq_len, 64 | batches, 65 | attn_heads, 66 | pad_batches); 67 | return softmax_results; 68 | } 69 | 70 | torch::Tensor bwd_cuda( 71 | torch::Tensor const& output_grads_, 72 | torch::Tensor const& softmax_results_, 73 | float scale_factor) { 74 | 75 | auto output_grads = output_grads_.contiguous(); 76 | auto softmax_results = softmax_results_.contiguous(); 77 | 78 | //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 79 | const int batches = output_grads.size(0); 80 | const int attn_heads = output_grads.size(1); 81 | const int seq_len = output_grads.size(2); 82 | TORCH_INTERNAL_ASSERT(output_grads.size(2) == output_grads.size(3)); 83 | 84 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 85 | 86 | //Softmax Grad 87 | dispatch_scaled_masked_softmax_backward( 88 | reinterpret_cast(output_grads_ptr), 89 | reinterpret_cast(output_grads_ptr), 90 | reinterpret_cast(softmax_results.data_ptr()), 91 | scale_factor, 92 | seq_len, 93 | seq_len, 94 | batches, 95 | attn_heads); 96 | 97 | //backward pass is completely in-place 98 | return output_grads; 99 | } 100 | } 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /run_t5.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | GPUS_PER_NODE=4 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=8888 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | export DLWS_NUM_WORKER=${NNODES} 12 | export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE} 13 | 14 | WORKING_DIR=${HOME}/Megatron-LM-3D 15 | DATA_DIR=${HOME}/t5_data 16 | 17 | DATA_PATH="${DATA_DIR}/pretrain_data/baike_small_document" 18 | VOCAB_PATH="${DATA_DIR}/bpe_new" 19 | CHECKPOINT_PATH=checkpoints/t5_test 20 | config_json="${WORKING_DIR}/ds_config_t5.json" 21 | 22 | # Megatron Model Parallelism 23 | mp_size=2 24 | # DeepSpeed Pipeline parallelism 25 | pp_size=1 26 | 27 | NLAYERS=2 28 | NHIDDEN=128 29 | BATCHSIZE=4 30 | GAS=16 31 | LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${pp_size}pp_${mp_size}mp_${BATCHSIZE}b_ds4" 32 | 33 | #Activation Checkpointing and Contiguous Memory 34 | checkpoint_activations=false 35 | chkp_layers=1 36 | PA=true 37 | PA_CPU=false 38 | CC=true 39 | SYNCHRONIZE=true 40 | PROFILE=false 41 | 42 | t5_options=" \ 43 | --model-parallel-size ${mp_size} \ 44 | --pipe-parallel-size ${pp_size} \ 45 | --num-layers $NLAYERS \ 46 | --hidden-size $NHIDDEN \ 47 | --kv-hidden-size 16 \ 48 | --ff-hidden-size 256 \ 49 | --num-attention-heads 8 \ 50 | --enc-seq-length 1024 \ 51 | --dec-seq-length 384 \ 52 | --max-position-embeddings 1024 \ 53 | --batch-size $BATCHSIZE \ 54 | --gas $GAS \ 55 | --train-iters 320000 \ 56 | --lr-decay-iters 320000 \ 57 | --save $CHECKPOINT_PATH \ 58 | --data-path $DATA_PATH \ 59 | --vocab-file $VOCAB_PATH \ 60 | --data-impl mmap \ 61 | --split 949,50,1 \ 62 | --distributed-backend nccl \ 63 | --lr 1.5e-4 \ 64 | --lr-decay-style cosine \ 65 | --min-lr 1.0e-5 \ 66 | --weight-decay 1e-2 \ 67 | --clip-grad 1.0 \ 68 | --warmup 0.01 \ 69 | --log-interval 1 \ 70 | --save-interval 500 \ 71 | --eval-interval 100 \ 72 | --eval-iters 10 \ 73 | --fp16 \ 74 | --tensorboard-dir ${LOGDIR} 75 | " 76 | 77 | deepspeed_options=" \ 78 | --deepspeed \ 79 | --deepspeed_config ${config_json} \ 80 | " 81 | 82 | if [ "${contigious_gradients}" = "true" ]; then 83 | deepspeed_options="${deepspeed_options} \ 84 | --zero-contigious-gradients" 85 | fi 86 | 87 | if [ "${reduce_scatter}" = "true" ]; then 88 | deepspeed_options="${deepspeed_options} \ 89 | --zero-reduce-scatter" 90 | fi 91 | 92 | if [ "${checkpoint_activations}" = "true" ]; then 93 | 94 | chkp_opt=" \ 95 | --checkpoint-activations \ 96 | --checkpoint-num-layers ${chkp_layers}" 97 | 98 | if [ "${PA}" = "true" ]; then 99 | chkp_opt="${chkp_opt} \ 100 | --partition-activations" 101 | fi 102 | 103 | if [ "${PA_CPU}" = "true" ]; then 104 | chkp_opt="${chkp_opt} \ 105 | --checkpoint-in-cpu" 106 | fi 107 | 108 | if [ "${SYNCHRONIZE}" = "true" ]; then 109 | chkp_opt="${chkp_opt} \ 110 | --synchronize-each-layer" 111 | fi 112 | 113 | if [ "${CC}" = "true" ]; then 114 | chkp_opt="${chkp_opt} \ 115 | --contigious-checkpointing" 116 | fi 117 | 118 | if [ "${PROFILE}" = "true" ]; then 119 | chkp_opt="${chkp_opt} \ 120 | --profile-backward" 121 | fi 122 | else 123 | chkp_opt=" " 124 | fi 125 | 126 | full_options="${t5_options} ${deepspeed_options} ${chkp_opt}" 127 | 128 | run_cmd="deepspeed --master_port ${MASTER_PORT} -i node1:4,5,6,7 --hostfile hostfile pretrain_t5.py $@ ${full_options}" 129 | echo ${run_cmd} 130 | eval ${run_cmd} 131 | 132 | set +x 133 | -------------------------------------------------------------------------------- /run_gpt2.sh:
-------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | GPUS_PER_NODE=4 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=8888 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | export DLWS_NUM_WORKER=${NNODES} 12 | export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE} 13 | 14 | WORKING_DIR=${HOME}/Megatron-LM-3D 15 | DATA_DIR=${HOME}/gpt2_data 16 | 17 | DATA_PATH="${DATA_DIR}/CPM-train/data" 18 | # VOCAB_PATH=data/gpt2-vocab.json 19 | # MERGE_PATH=data/gpt2-merges.txt 20 | TOKENIZER_PATH="${DATA_DIR}/bpe_3w_new/vocab.json" 21 | CHECKPOINT_PATH=checkpoints/gpt2_345m_ds 22 | config_json="${WORKING_DIR}/ds_config_gpt2.json" 23 | 24 | # Megatron Model Parallelism 25 | mp_size=2 26 | # DeepSpeed Pipeline parallelism 27 | pp_size=2 28 | 29 | NLAYERS=2 30 | NHIDDEN=256 31 | BATCHSIZE=4 32 | GAS=16 33 | LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${pp_size}pp_${mp_size}mp_${BATCHSIZE}b_ds4" 34 | 35 | 36 | #Activation Checkpointing and Contiguous Memory 37 | checkpoint_activations=false 38 | chkp_layers=1 39 | PA=true 40 | PA_CPU=false 41 | CC=true 42 | SYNCHRONIZE=true 43 | PROFILE=false 44 | 45 | gpt_options=" \ 46 | --model-parallel-size ${mp_size} \ 47 | --pipe-parallel-size ${pp_size} \ 48 | --num-layers $NLAYERS \ 49 | --hidden-size $NHIDDEN \ 50 | --num-attention-heads 16 \ 51 | --kv-hidden-size 16 \ 52 | --ff-hidden-size 1024 \ 53 | --num-attention-heads 16 \ 54 | --seq-length 1024 \ 55 | --max-position-embeddings 1024 \ 56 | --batch-size $BATCHSIZE \ 57 | --gas $GAS \ 58 | --train-iters 320000 \ 59 | --lr-decay-iters 320000 \ 60 | --save $CHECKPOINT_PATH \ 61 | --data-path $DATA_PATH \ 62 | --data-impl mmap \ 63 | --vocab-file $TOKENIZER_PATH \ 64 | --split 949,50,1 \ 65 | --distributed-backend nccl \ 66 | --lr 1.5e-4 \ 67 | --lr-decay-style cosine \ 68 | --min-lr 1.0e-5 \ 69 | --weight-decay 1e-2 \ 70 | --clip-grad 1.0 \ 71 | --warmup 0.01 \ 72 | --checkpoint-activations \ 73 | --log-interval 1 \ 74 | --save-interval 500 \ 75 | --eval-interval 100 \ 76 | --eval-iters 10 \ 77 | --fp16 \ 78 | --hidden-bias \ 79 | --tensorboard-dir ${LOGDIR} 80 | " 81 | 82 | deepspeed_options=" \ 83 | --deepspeed \ 84 | --deepspeed_config ${config_json} \ 85 | " 86 | 87 | if [ "${contigious_gradients}" = "true" ]; then 88 | deepspeed_options="${deepspeed_options} \ 89 | --zero-contigious-gradients" 90 | fi 91 | 92 | if [ "${reduce_scatter}" = "true" ]; then 93 | deepspeed_options="${deepspeed_options} \ 94 | --zero-reduce-scatter" 95 | fi 96 | 97 | if [ "${checkpoint_activations}" = "true" ]; then 98 | 99 | chkp_opt=" \ 100 | --checkpoint-activations \ 101 | --checkpoint-num-layers ${chkp_layers}" 102 | 103 | if [ "${PA}" = "true" ]; then 104 | chkp_opt="${chkp_opt} \ 105 | --partition-activations" 106 | fi 107 | 108 | if [ "${PA_CPU}" = "true" ]; then 109 | chkp_opt="${chkp_opt} \ 110 | --checkpoint-in-cpu" 111 | fi 112 | 113 | if [ "${SYNCHRONIZE}" = "true" ]; then 114 | chkp_opt="${chkp_opt} \ 115 | --synchronize-each-layer" 116 | fi 117 | 118 | if [ "${CC}" = "true" ]; then 119 | chkp_opt="${chkp_opt} \ 120 | --contigious-checkpointing" 121 | fi 122 | 123 | if [ "${PROFILE}" = "true" ]; then 124 | chkp_opt="${chkp_opt} \ 125 | --profile-backward" 126 | fi 127 | else 128 | chkp_opt=" " 129 | fi 130 | 131 | full_options="${gpt_options} ${deepspeed_options} ${chkp_opt}" 132 | 133 | run_cmd="deepspeed --master_port ${MASTER_PORT} -i node1:4,5,6,7 --hostfile hostfile
pretrain_gpt2.py $@ ${full_options}" 134 | echo ${run_cmd} 135 | eval ${run_cmd} 136 | 137 | set +x 138 | -------------------------------------------------------------------------------- /megatron/deprecated_data_utils/scripts/split_json.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes a corpora of files (specified by `--input_files`) with json data separated 3 | by newlines (loose json). Splits data into train.json, val.json, test.json files 4 | under `output_dir`. 5 | 6 | Note: This code has the potential to override files with the names 7 | train.json, val.json, test.json in `--output_dir`. 8 | """ 9 | import os 10 | import argparse 11 | import math 12 | import random 13 | 14 | parser = argparse.ArgumentParser('resplit loose json data into train/val/test') 15 | parser.add_argument('--input_files', nargs='+', required=True, 16 | help='whitespace separated list of input data files') 17 | parser.add_argument('--output_dir', required=True, 18 | help='output directory where to put files') 19 | parser.add_argument('--test_percent', type=float, nargs='+', default=[0.05, 0], 20 | help='percentage of available data to use for val/test dataset') 21 | args = parser.parse_args() 22 | 23 | 24 | def get_lines(filepath): 25 | lines = [] 26 | with open(filepath, 'r') as f: 27 | for i, l in enumerate(f.readlines()): 28 | l = l.strip() 29 | lines.append(l) 30 | return lines 31 | 32 | 33 | def get_splits(lines, line_counts): 34 | all_lines = [] 35 | line_idx = [] 36 | file_mappings = [] 37 | for i, l in enumerate(lines): 38 | all_lines.extend(l) 39 | line_idx.extend(list(range(len(l)))) 40 | file_mappings.extend([i] * len(l)) 41 | 42 | indices = list(range(len(all_lines))) 43 | random.shuffle(indices) 44 | all_lines = [all_lines[idx] for idx in indices] 45 | line_idx = [line_idx[idx] for idx in indices] 46 | file_mappings = [file_mappings[idx] for idx in indices] 47 | 48 | splits = [] 49 | mappings = [] 50 | start = 0 51 | for end in line_counts: 52 | end += start 53 | splits.append(all_lines[start:end]) 54 | mappings.append(format_mappings(line_idx[start:end], file_mappings[start:end])) 55 | start = end 56 | return splits, mappings 57 | 58 | 59 | def format_mappings(line_idx, file_mappings): 60 | lines = [] 61 | for m, l in zip(file_mappings, line_idx): 62 | lines.append(str(m).strip() + '\t' + str(l).strip()) 63 | return lines 64 | 65 | 66 | def get_filepaths(filepaths, output_dir): 67 | paths = [] 68 | train_path = 'train.json' 69 | dev_path = 'dev.json' 70 | test_path = 'test.json' 71 | paths.append(os.path.join(output_dir, train_path)) 72 | paths.append(os.path.join(output_dir, dev_path)) 73 | paths.append(os.path.join(output_dir, test_path)) 74 | return paths 75 | 76 | 77 | def write_files(lines, mappings, filepaths): 78 | for l, m, path in zip(lines, mappings, filepaths): 79 | write_file(l, path) 80 | write_mapping_file(m, path) 81 | 82 | 83 | def write_file(lines, path): 84 | print('Writing:', path) 85 | with open(path, 'w') as f: 86 | for l in lines: 87 | f.write(l + '\n') 88 | 89 | 90 | def write_mapping_file(m, path): 91 | path = path + '.map' 92 | m = [get_mapping_header()] + m 93 | write_file(m, path) 94 | 95 | 96 | def get_mapping_header(): 97 | return 'file\tline #' 98 | 99 | 100 | if not os.path.exists(args.output_dir): 101 | os.makedirs(args.output_dir) 102 | 103 | lines = [] 104 | 105 | for filepath in args.input_files: 106 | _lines = get_lines(filepath) 107 | lines.append(_lines) 108 | 109 | # calculate number of lines to use for 
each 110 | line_counts = [len(l) for l in lines] 111 | total_lines = sum(line_counts) 112 | dev_percent = args.test_percent[0] 113 | dev_lines = math.ceil(dev_percent * total_lines) 114 | test_percent = 0 115 | if len(args.test_percent) == 2: 116 | test_percent = args.test_percent[1] 117 | test_lines = math.ceil(test_percent * total_lines) 118 | train_lines = total_lines - (test_lines + dev_lines) 119 | normed_lines = [train_lines, dev_lines, test_lines] 120 | normed_lines = [int(l) for l in normed_lines] 121 | 122 | 123 | splits, mappings = get_splits(lines, normed_lines) 124 | filepaths = get_filepaths(args.input_files, args.output_dir) 125 | print('Writing output to:', filepaths) 126 | write_files(splits, mappings, filepaths) 127 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import set_random_seed 17 | from commons import IdentityLayer 18 | from commons import print_separator 19 | from commons import initialize_distributed 20 | from mpu.cross_entropy import vocab_parallel_cross_entropy 21 | import mpu 22 | import torch.nn.functional as F 23 | import torch 24 | import random 25 | import sys 26 | sys.path.append("../..") 27 | 28 | 29 | def torch_cross_entropy(batch_size, seq_length, vocab_size, 30 | logits_scale, seed): 31 | set_random_seed(seed) 32 | identity = IdentityLayer((batch_size, seq_length, vocab_size), 33 | scale=logits_scale).cuda() 34 | logits = identity() 35 | target = torch.cuda.LongTensor( 36 | size=(batch_size, seq_length)).random_(0, vocab_size) 37 | loss = F.cross_entropy(logits.view(-1, logits.size()[-1]), 38 | target.view(-1), 39 | reduction='none').view_as(target).mean() 40 | loss.backward() 41 | return loss, identity.weight.grad 42 | 43 | 44 | def mpu_cross_entropy(batch_size, seq_length, vocab_size, 45 | logits_scale, seed): 46 | set_random_seed(seed) 47 | identity = IdentityLayer((batch_size, seq_length, vocab_size), 48 | scale=logits_scale).cuda() 49 | logits = identity() 50 | logits_parallel = mpu.scatter_to_model_parallel_region(logits) 51 | target = torch.cuda.LongTensor( 52 | size=(batch_size, seq_length)).random_(0, vocab_size) 53 | loss = vocab_parallel_cross_entropy(logits_parallel, target).mean() 54 | loss.backward() 55 | return loss, identity.weight.grad 56 | 57 | 58 | def test_cross_entropy(model_parallel_size): 59 | 60 | if torch.distributed.get_rank() == 0: 61 | print('> testing cross entropy with model parallel size {} ...'. 
62 | format(model_parallel_size)) 63 | 64 | mpu.initialize_model_parallel(model_parallel_size) 65 | model_parallel_size = mpu.get_model_parallel_world_size() 66 | 67 | batch_size = 13 68 | seq_length = 17 69 | vocab_size_per_partition = 11 70 | logits_scale = 1000.0 71 | vocab_size = vocab_size_per_partition * model_parallel_size 72 | seed = 1234 73 | 74 | loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, 75 | vocab_size, logits_scale, 76 | seed) 77 | loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, 78 | vocab_size, logits_scale, 79 | seed) 80 | 81 | error = loss_torch.sub_(loss_mpu).abs().max() 82 | print(' max error in loss on global rank {}: {}'.format( 83 | torch.distributed.get_rank(), error)) 84 | assert error < 1.0e-6 85 | 86 | error = grad_torch.sub_(grad_mpu).abs().max() 87 | print(' max error in grad on global rank {}: {}'.format( 88 | torch.distributed.get_rank(), error)) 89 | assert error < 1.0e-6 90 | 91 | # Reset groups 92 | mpu.destroy_model_parallel() 93 | 94 | torch.distributed.barrier() 95 | if torch.distributed.get_rank() == 0: 96 | print('>> passed the test :-)') 97 | 98 | 99 | if __name__ == '__main__': 100 | 101 | initialize_distributed() 102 | world_size = torch.distributed.get_world_size() 103 | 104 | model_parallel_size = 1 105 | while model_parallel_size <= world_size: 106 | print_separator('test cross entropy') 107 | test_cross_entropy(model_parallel_size) 108 | model_parallel_size *= 2 109 | -------------------------------------------------------------------------------- /megatron/indexer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | 4 | from megatron import get_args 5 | from megatron import mpu 6 | from megatron.checkpointing import load_ict_checkpoint 7 | from megatron.data.ict_dataset import get_ict_dataset 8 | from megatron.data.realm_dataset_utils import get_one_epoch_dataloader 9 | from megatron.data.realm_index import detach, BlockData 10 | from megatron.data.realm_dataset_utils import get_ict_batch 11 | from megatron.model.realm_model import general_ict_model_provider 12 | from megatron.training import get_model 13 | 14 | 15 | class IndexBuilder(object): 16 | """Object for taking one pass over a dataset and creating a BlockData of its embeddings""" 17 | def __init__(self): 18 | args = get_args() 19 | self.model = None 20 | self.dataloader = None 21 | self.block_data = None 22 | 23 | # need to know whether we're using a REALM checkpoint (args.load) or ICT checkpoint 24 | assert not (args.load and args.ict_load) 25 | self.using_realm_chkpt = args.ict_load is None 26 | 27 | self.log_interval = args.indexer_log_interval 28 | self.batch_size = args.indexer_batch_size 29 | 30 | self.load_attributes() 31 | self.is_main_builder = mpu.get_data_parallel_rank() == 0 32 | self.num_total_builders = mpu.get_data_parallel_world_size() 33 | self.iteration = self.total_processed = 0 34 | 35 | def load_attributes(self): 36 | """Load the necessary attributes: model, dataloader and empty BlockData""" 37 | model = get_model(lambda: general_ict_model_provider(only_block_model=True)) 38 | self.model = load_ict_checkpoint(model, only_block_model=True, from_realm_chkpt=self.using_realm_chkpt) 39 | self.model.eval() 40 | self.dataset = get_ict_dataset() 41 | self.dataloader = iter(get_one_epoch_dataloader(self.dataset, self.batch_size)) 42 | self.block_data = BlockData(load_from_path=False) 43 | 44 | def track_and_report_progress(self, batch_size): 45 
| """Utility function for tracking progress""" 46 | self.iteration += 1 47 | self.total_processed += batch_size * self.num_total_builders 48 | if self.is_main_builder and self.iteration % self.log_interval == 0: 49 | print('Batch {:10d} | Total {:10d}'.format(self.iteration, self.total_processed), flush=True) 50 | 51 | def build_and_save_index(self): 52 | """Goes through one epoch of the dataloader and adds all data to this instance's BlockData. 53 | 54 | The copy of BlockData is saved as a shard, which when run in a distributed setting will be 55 | consolidated by the rank 0 process and saved as a final pickled BlockData. 56 | """ 57 | 58 | while True: 59 | try: 60 | # batch also has query_tokens and query_pad_data 61 | _, _, block_tokens, block_pad_mask, block_sample_data = get_ict_batch(self.dataloader) 62 | except (StopIteration, IndexError): 63 | break 64 | 65 | unwrapped_model = self.model 66 | while not hasattr(unwrapped_model, 'embed_block'): 67 | unwrapped_model = unwrapped_model.module 68 | 69 | # detach, separate fields and add to BlockData 70 | block_logits = detach(unwrapped_model.embed_block(block_tokens, block_pad_mask)) 71 | detached_data = detach(block_sample_data) 72 | 73 | # block_sample_data is a 2D array [batch x 4] 74 | # with columns [start_idx, end_idx, doc_idx, block_idx] same as class BlockSampleData 75 | block_indices = detached_data[:, 3] 76 | block_metas = detached_data[:, :3] 77 | 78 | self.block_data.add_block_data(block_indices, block_logits, block_metas) 79 | self.track_and_report_progress(batch_size=block_tokens.shape[0]) 80 | 81 | # This process signals to finalize its shard and then synchronize with the other processes 82 | self.block_data.save_shard() 83 | torch.distributed.barrier() 84 | del self.model 85 | 86 | # rank 0 process builds the final copy 87 | if self.is_main_builder: 88 | self.block_data.merge_shards_and_save() 89 | # make sure that every single piece of data was embedded 90 | assert len(self.block_data.embed_data) == len(self.dataset) 91 | self.block_data.clear() 92 | -------------------------------------------------------------------------------- /megatron/fused_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import pathlib 17 | import subprocess 18 | import os 19 | from torch.utils import cpp_extension 20 | 21 | # Setting this param to a list has a problem of generating 22 | # different compilation commands (with diferent order of architectures) 23 | # and leading to recompilation of fused kernels. 
24 | # set it to empty string to avoid recompilation 25 | # and assign arch flags explicity in extra_cuda_cflags below 26 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 27 | 28 | def get_cuda_bare_metal_version(cuda_dir): 29 | raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], 30 | universal_newlines=True) 31 | output = raw_output.split() 32 | release_idx = output.index("release") + 1 33 | release = output[release_idx].split(".") 34 | bare_metal_major = release[0] 35 | bare_metal_minor = release[1][0] 36 | 37 | return raw_output, bare_metal_major, bare_metal_minor 38 | 39 | def create_build_dir(buildpath): 40 | try: 41 | os.mkdir(buildpath) 42 | except OSError: 43 | if not os.path.isdir(buildpath): 44 | print(f"Creation of the build directory {buildpath} failed") 45 | 46 | def load_scaled_upper_triang_masked_softmax_fusion_kernel(): 47 | 48 | # Check, if CUDA11 is installed for compute capability 8.0 49 | cc_flag = [] 50 | _, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 51 | if int(bare_metal_major) >= 11: 52 | cc_flag.append('-gencode') 53 | cc_flag.append('arch=compute_80,code=sm_80') 54 | 55 | srcpath = pathlib.Path(__file__).parent.absolute() 56 | buildpath = srcpath / 'build' 57 | 58 | create_build_dir(buildpath) 59 | 60 | scaled_upper_triang_masked_softmax_cuda = cpp_extension.load( 61 | name='scaled_upper_triang_masked_softmax_cuda', 62 | sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp', 63 | srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu'], 64 | build_directory=buildpath, 65 | extra_cflags=['-O3',], 66 | extra_cuda_cflags=['-O3', 67 | '-gencode', 'arch=compute_70,code=sm_70', 68 | '-U__CUDA_NO_HALF_OPERATORS__', 69 | '-U__CUDA_NO_HALF_CONVERSIONS__', 70 | '--expt-relaxed-constexpr', 71 | '--expt-extended-lambda', 72 | '--use_fast_math'] + cc_flag) 73 | 74 | def load_scaled_masked_softmax_fusion_kernel(): 75 | 76 | # Check, if CUDA11 is installed for compute capability 8.0 77 | cc_flag = [] 78 | _, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 79 | if int(bare_metal_major) >= 11: 80 | cc_flag.append('-gencode') 81 | cc_flag.append('arch=compute_80,code=sm_80') 82 | 83 | srcpath = pathlib.Path(__file__).parent.absolute() 84 | buildpath = srcpath / 'build' 85 | 86 | create_build_dir(buildpath) 87 | 88 | scaled_upper_triang_masked_softmax_cuda = cpp_extension.load( 89 | name='scaled_masked_softmax_cuda', 90 | sources=[srcpath / 'scaled_masked_softmax.cpp', 91 | srcpath / 'scaled_masked_softmax_cuda.cu'], 92 | build_directory=buildpath, 93 | extra_cflags=['-O3',], 94 | extra_cuda_cflags=['-O3', 95 | '-gencode', 'arch=compute_70,code=sm_70', 96 | '-U__CUDA_NO_HALF_OPERATORS__', 97 | '-U__CUDA_NO_HALF_CONVERSIONS__', 98 | '--expt-relaxed-constexpr', 99 | '--expt-extended-lambda', 100 | '--use_fast_math'] + cc_flag) 101 | -------------------------------------------------------------------------------- /megatron/model/classification.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Classification model.""" 17 | 18 | import torch 19 | 20 | from megatron import get_args, print_rank_0 21 | from megatron.model.bert_model import bert_attention_mask_func, bert_extended_attention_mask, bert_position_ids 22 | from megatron.model.language_model import get_language_model 23 | from megatron.model.utils import get_linear_layer 24 | from megatron.model.utils import init_method_normal 25 | from megatron.model.utils import scaled_init_method_normal 26 | from megatron.module import MegatronModule 27 | 28 | 29 | class Classification(MegatronModule): 30 | 31 | def __init__(self, num_classes, num_tokentypes=2): 32 | super(Classification, self).__init__() 33 | args = get_args() 34 | 35 | self.num_classes = num_classes 36 | init_method = init_method_normal(args.init_method_std) 37 | 38 | self.language_model, self._language_model_key = get_language_model( 39 | attention_mask_func=bert_attention_mask_func, 40 | num_tokentypes=num_tokentypes, 41 | add_pooler=True, 42 | init_method=init_method, 43 | scaled_init_method=scaled_init_method_normal(args.init_method_std, 44 | args.num_layers)) 45 | 46 | # Classification head. 47 | self.classification_dropout = torch.nn.Dropout(args.hidden_dropout) 48 | self.classification_head = get_linear_layer(args.hidden_size, 49 | self.num_classes, 50 | init_method) 51 | self._classification_head_key = 'classification_head' 52 | 53 | def forward(self, input_ids, attention_mask, tokentype_ids): 54 | 55 | extended_attention_mask = bert_extended_attention_mask( 56 | attention_mask, next(self.language_model.parameters()).dtype) 57 | position_ids = bert_position_ids(input_ids) 58 | 59 | _, pooled_output = self.language_model(input_ids, 60 | position_ids, 61 | extended_attention_mask, 62 | tokentype_ids=tokentype_ids) 63 | 64 | # Output. 65 | classification_output = self.classification_dropout(pooled_output) 66 | classification_logits = self.classification_head(classification_output) 67 | 68 | # Reshape logits.
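# (Editor's note, shape sketch assuming the standard BERT-style pooler:
#  pooled_output          -> [batch, hidden_size]
#  classification_output  -> [batch, hidden_size]   after dropout
#  classification_logits  -> [batch, num_classes]
#  so the view(-1, num_classes) below is effectively a no-op for 2-D input.)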
69 | classification_logits = classification_logits.view(-1, self.num_classes) 70 | 71 | return classification_logits 72 | 73 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 74 | keep_vars=False): 75 | """For easy load when model is combined with other heads, 76 | add an extra key.""" 77 | 78 | state_dict_ = {} 79 | state_dict_[self._language_model_key] \ 80 | = self.language_model.state_dict_for_save_checkpoint( 81 | destination, prefix, keep_vars) 82 | state_dict_[self._classification_head_key] \ 83 | = self.classification_head.state_dict( 84 | destination, prefix, keep_vars) 85 | return state_dict_ 86 | 87 | def load_state_dict(self, state_dict, strict=True): 88 | """Customized load.""" 89 | 90 | self.language_model.load_state_dict( 91 | state_dict[self._language_model_key], strict=strict) 92 | if self._classification_head_key in state_dict: 93 | self.classification_head.load_state_dict( 94 | state_dict[self._classification_head_key], strict=strict) 95 | else: 96 | print_rank_0('***WARNING*** could not find {} in the checkpoint, ' 97 | 'initializing to random'.format( 98 | self._classification_head_key)) 99 | -------------------------------------------------------------------------------- /megatron/mpu/data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | from .initialize import get_model_parallel_group 19 | from .initialize import get_model_parallel_rank 20 | from .initialize import get_model_parallel_src_rank 21 | 22 | 23 | _MAX_DATA_DIM = 4 24 | 25 | 26 | def _check_data_types(keys, data, target_dtype): 27 | """Check that all the keys have the same target data type.""" 28 | for key in keys: 29 | assert data[key].dtype == target_dtype, '{} has data type {} which '\ 30 | 'is different than {}'.format(key, data[key].dtype, target_dtype) 31 | 32 | 33 | def _build_key_size_numel_dictionaries(keys, data): 34 | """Build the size on rank 0 and broadcast.""" 35 | max_dim = _MAX_DATA_DIM 36 | sizes = [0 for _ in range(max_dim) for _ in keys] 37 | 38 | # Pack the sizes on rank zero. 39 | if get_model_parallel_rank() == 0: 40 | offset = 0 41 | for key in keys: 42 | assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM' 43 | size = data[key].size() 44 | for i, s in enumerate(size): 45 | sizes[i + offset] = s 46 | offset += max_dim 47 | 48 | # Move to GPU and broadcast. 49 | sizes_cuda = torch.cuda.LongTensor(sizes) 50 | torch.distributed.broadcast(sizes_cuda, get_model_parallel_src_rank(), 51 | group=get_model_parallel_group()) 52 | 53 | # Move back to cpu and unpack. 
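# (Editor's note, worked example -- not part of the original file: with
#  keys = ['text', 'types'] and _MAX_DATA_DIM = 4, rank 0 would pack, e.g.,
#      sizes = [8, 512, 0, 0,  8, 512, 0, 0]
#  i.e. one max_dim-wide slot per key, zero-padded; the unpacking loop below
#  therefore stops at the first non-positive entry of each slot.)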
54 | sizes_cpu = sizes_cuda.cpu() 55 | key_size = {} 56 | key_numel = {} 57 | total_numel = 0 58 | offset = 0 59 | for key in keys: 60 | i = 0 61 | size = [] 62 | numel = 1 63 | while sizes_cpu[offset + i] > 0: 64 | this_size = sizes_cpu[offset + i] 65 | size.append(this_size) 66 | numel *= this_size 67 | i += 1 68 | key_size[key] = size 69 | key_numel[key] = numel 70 | total_numel += numel 71 | offset += max_dim 72 | 73 | return key_size, key_numel, total_numel 74 | 75 | 76 | def broadcast_data(keys, data, datatype): 77 | """Broadcast data from rank zero of each model parallel group to the 78 | members of the same model parallel group. 79 | 80 | Arguments: 81 | keys: list of keys in the data disctionary to be broadcasted 82 | data: data dictionary of string keys and cpu tensor values. 83 | datatype: torch data type of all tensors in data associated 84 | with keys. 85 | """ 86 | # Build (key, size) and (key, number of elements) dictionaries along 87 | # with the total number of elements on all ranks. 88 | key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, 89 | data) 90 | 91 | # Pack on rank zero. 92 | if get_model_parallel_rank() == 0: 93 | # Check that all keys have the same data type. 94 | _check_data_types(keys, data, datatype) 95 | # Flatten the data associated with the keys 96 | flatten_data = torch.cat( 97 | [data[key].contiguous().view(-1) for key in keys], dim=0).cuda() 98 | else: 99 | flatten_data = torch.empty(total_numel, 100 | device=torch.cuda.current_device(), 101 | dtype=datatype) 102 | 103 | # Boradcast 104 | torch.distributed.broadcast(flatten_data, get_model_parallel_src_rank(), 105 | group=get_model_parallel_group()) 106 | 107 | # Unpack 108 | output = {} 109 | offset = 0 110 | for key in keys: 111 | size = key_size[key] 112 | numel = key_numel[key] 113 | output[key] = flatten_data.narrow(0, offset, numel).view(size) 114 | offset += numel 115 | 116 | return output 117 | -------------------------------------------------------------------------------- /megatron/deprecated_data_utils/scripts/split_gpt2_json.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """ 17 | Takes a corpora of files (specified by `--input_files`) with json data separated 18 | by newlines (loose json). Splits data into train.json, val.json, test.json files 19 | under `output_dir`. 20 | 21 | Note: This code has the potential to override files with the names 22 | train.json, val.json, test.json in `--output_dir`. 
23 | """ 24 | import os 25 | import argparse 26 | import math 27 | import random 28 | 29 | parser = argparse.ArgumentParser('resplit loose json data into train/val/test') 30 | parser.add_argument('--input_files', nargs='+', required=True, 31 | help='whitespace separated list of input data files') 32 | parser.add_argument('--output_dir', required=True, 33 | help='output directory where to put files') 34 | parser.add_argument('--test_percent', type=float, nargs='+', default=[0.05, 0], 35 | help='percentage of available data to use for val/test dataset') 36 | args = parser.parse_args() 37 | 38 | 39 | def get_lines(filepath): 40 | lines = [] 41 | with open(filepath, 'r') as f: 42 | for i, l in enumerate(f.readlines()): 43 | l = l.strip() 44 | lines.append(l) 45 | return lines 46 | 47 | 48 | def get_splits(lines, line_counts): 49 | all_lines = [] 50 | line_idx = [] 51 | file_mappings = [] 52 | for i, l in enumerate(lines): 53 | all_lines.extend(l) 54 | line_idx.extend(list(range(len(l)))) 55 | file_mappings.extend([i] * len(l)) 56 | 57 | indices = list(range(len(all_lines))) 58 | random.shuffle(indices) 59 | all_lines = [all_lines[idx] for idx in indices] 60 | line_idx = [line_idx[idx] for idx in indices] 61 | file_mappings = [file_mappings[idx] for idx in indices] 62 | 63 | splits = [] 64 | mappings = [] 65 | start = 0 66 | for end in line_counts: 67 | end += start 68 | splits.append(all_lines[start:end]) 69 | mappings.append(format_mappings(line_idx[start:end], file_mappings[start:end])) 70 | start = end 71 | return splits, mappings 72 | 73 | 74 | def format_mappings(line_idx, file_mappings): 75 | lines = [] 76 | for m, l in zip(file_mappings, line_idx): 77 | lines.append(str(m).strip() + '\t' + str(l).strip()) 78 | return lines 79 | 80 | 81 | def get_filepaths(filepaths, output_dir): 82 | paths = [] 83 | train_path = 'train.json' 84 | dev_path = 'dev.json' 85 | test_path = 'test.json' 86 | paths.append(os.path.join(output_dir, train_path)) 87 | paths.append(os.path.join(output_dir, dev_path)) 88 | paths.append(os.path.join(output_dir, test_path)) 89 | return paths 90 | 91 | 92 | def write_files(lines, mappings, filepaths): 93 | for l, m, path in zip(lines, mappings, filepaths): 94 | write_file(l, path) 95 | write_mapping_file(m, path) 96 | 97 | 98 | def write_file(lines, path): 99 | print('Writing:', path) 100 | with open(path, 'w') as f: 101 | for l in lines: 102 | f.write(l + '\n') 103 | 104 | 105 | def write_mapping_file(m, path): 106 | path = path + '.map' 107 | m = [get_mapping_header()] + m 108 | write_file(m, path) 109 | 110 | 111 | def get_mapping_header(): 112 | return 'file\tline #' 113 | 114 | 115 | if not os.path.exists(args.output_dir): 116 | os.makedirs(args.output_dir) 117 | 118 | lines = [] 119 | 120 | for filepath in args.input_files: 121 | _lines = get_lines(filepath) 122 | lines.append(_lines) 123 | 124 | # calculate number of lines to use for each 125 | line_counts = [len(l) for l in lines] 126 | total_lines = sum(line_counts) 127 | dev_percent = args.test_percent[0] 128 | dev_lines = math.ceil(dev_percent * total_lines) 129 | test_percent = 0 130 | if len(args.test_percent) == 2: 131 | test_percent = args.test_percent[1] 132 | test_lines = math.ceil(test_percent * total_lines) 133 | train_lines = total_lines - (test_lines + dev_lines) 134 | normed_lines = [train_lines, dev_lines, test_lines] 135 | normed_lines = [int(l) for l in normed_lines] 136 | 137 | 138 | splits, mappings = get_splits(lines, normed_lines) 139 | filepaths = get_filepaths(args.input_files, 
args.output_dir) 140 | print('Writing output to:', filepaths) 141 | write_files(splits, mappings, filepaths) 142 | -------------------------------------------------------------------------------- /megatron/mpu/mappings.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | from .initialize import get_model_parallel_group, get_model_parallel_world_size, get_model_parallel_rank 19 | from .utils import split_tensor_along_last_dim 20 | 21 | 22 | def _reduce(input_): 23 | """All-reduce the the input tensor across model parallel group.""" 24 | 25 | # Bypass the function if we are using only 1 GPU. 26 | if get_model_parallel_world_size()==1: 27 | return input_ 28 | 29 | # All-reduce. 30 | torch.distributed.all_reduce(input_, group=get_model_parallel_group()) 31 | 32 | return input_ 33 | 34 | 35 | def _split(input_): 36 | """Split the tensor along its last dimension and keep the 37 | corresponding slice.""" 38 | 39 | world_size = get_model_parallel_world_size() 40 | # Bypass the function if we are using only 1 GPU. 41 | if world_size==1: 42 | return input_ 43 | 44 | # Split along last dimension. 45 | input_list = split_tensor_along_last_dim(input_, world_size) 46 | 47 | # Note: torch.split does not create contiguous tensors by default. 48 | rank = get_model_parallel_rank() 49 | output = input_list[rank].contiguous() 50 | 51 | return output 52 | 53 | 54 | def _gather(input_): 55 | """Gather tensors and concatinate along the last dimension.""" 56 | 57 | world_size = get_model_parallel_world_size() 58 | # Bypass the function if we are using only 1 GPU. 59 | if world_size==1: 60 | return input_ 61 | 62 | # Size and dimension. 63 | last_dim = input_.dim() - 1 64 | rank = get_model_parallel_rank() 65 | 66 | tensor_list = [torch.empty_like(input_) for _ in range(world_size)] 67 | tensor_list[rank] = input_ 68 | torch.distributed.all_gather(tensor_list, input_, group=get_model_parallel_group()) 69 | 70 | # Note: torch.cat already creates a contiguous tensor. 
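# (Editor's note, illustrative only: with world_size = 2 and a per-rank
#  tensor of shape [batch, seq, hidden/2], all_gather fills tensor_list with
#  both slices and the concatenation below restores the full
#  [batch, seq, hidden] tensor on every rank.)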
71 | output = torch.cat(tensor_list, dim=last_dim).contiguous() 72 | 73 | return output 74 | 75 | 76 | class _CopyToModelParallelRegion(torch.autograd.Function): 77 | """Pass the input to the model parallel region.""" 78 | 79 | @staticmethod 80 | def symbolic(graph, input_): 81 | return input_ 82 | 83 | @staticmethod 84 | def forward(ctx, input_): 85 | return input_ 86 | 87 | @staticmethod 88 | def backward(ctx, grad_output): 89 | return _reduce(grad_output) 90 | 91 | 92 | class _ReduceFromModelParallelRegion(torch.autograd.Function): 93 | """All-redcue the input from the model parallel region.""" 94 | 95 | @staticmethod 96 | def symbolic(graph, input_): 97 | return _reduce(input_) 98 | 99 | @staticmethod 100 | def forward(ctx, input_): 101 | return _reduce(input_) 102 | 103 | @staticmethod 104 | def backward(ctx, grad_output): 105 | return grad_output 106 | 107 | 108 | class _ScatterToModelParallelRegion(torch.autograd.Function): 109 | """Split the input and keep only the corresponding chuck to the rank.""" 110 | 111 | @staticmethod 112 | def symbolic(graph, input_): 113 | return _split(input_) 114 | 115 | @staticmethod 116 | def forward(ctx, input_): 117 | return _split(input_) 118 | 119 | @staticmethod 120 | def backward(ctx, grad_output): 121 | return _gather(grad_output) 122 | 123 | 124 | class _GatherFromModelParallelRegion(torch.autograd.Function): 125 | """Gather the input from model parallel region and concatinate.""" 126 | 127 | @staticmethod 128 | def symbolic(graph, input_): 129 | return _gather(input_) 130 | 131 | @staticmethod 132 | def forward(ctx, input_): 133 | return _gather(input_) 134 | 135 | @staticmethod 136 | def backward(ctx, grad_output): 137 | return _split(grad_output) 138 | 139 | 140 | # ----------------- 141 | # Helper functions. 142 | # ----------------- 143 | 144 | def copy_to_model_parallel_region(input_): 145 | return _CopyToModelParallelRegion.apply(input_) 146 | 147 | 148 | def reduce_from_model_parallel_region(input_): 149 | return _ReduceFromModelParallelRegion.apply(input_) 150 | 151 | 152 | def scatter_to_model_parallel_region(input_): 153 | return _ScatterToModelParallelRegion.apply(input_) 154 | 155 | 156 | def gather_from_model_parallel_region(input_): 157 | return _GatherFromModelParallelRegion.apply(input_) 158 | -------------------------------------------------------------------------------- /megatron/model/multiple_choice.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
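# (Editor's note added for clarity: this head scores each candidate answer
#  independently -- inputs of shape [batch, choices, seq] are flattened to
#  [batch * choices, seq], run through the shared language model, and the
#  per-choice scores are reshaped back to [batch, choices].)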
15 | 16 | """Multiple choice model.""" 17 | 18 | import torch 19 | 20 | from megatron import get_args, print_rank_0 21 | from megatron.model.bert_model import bert_attention_mask_func, bert_extended_attention_mask, bert_position_ids 22 | from megatron.model.language_model import get_language_model 23 | from megatron.model.utils import get_linear_layer 24 | from megatron.model.utils import init_method_normal 25 | from megatron.model.utils import scaled_init_method_normal 26 | from megatron.module import MegatronModule 27 | 28 | 29 | class MultipleChoice(MegatronModule): 30 | 31 | def __init__(self, num_tokentypes=2): 32 | super(MultipleChoice, self).__init__() 33 | args = get_args() 34 | 35 | init_method = init_method_normal(args.init_method_std) 36 | 37 | self.language_model, self._language_model_key = get_language_model( 38 | attention_mask_func=bert_attention_mask_func, 39 | num_tokentypes=num_tokentypes, 40 | add_pooler=True, 41 | init_method=init_method, 42 | scaled_init_method=scaled_init_method_normal(args.init_method_std, 43 | args.num_layers)) 44 | 45 | # Multi-choice head. 46 | self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout) 47 | self.multichoice_head = get_linear_layer(args.hidden_size, 1, 48 | init_method) 49 | self._multichoice_head_key = 'multichoice_head' 50 | 51 | def forward(self, input_ids, attention_mask, tokentype_ids): 52 | 53 | # [batch, choices, sequence] --> [batch * choices, sequence] --> 54 | # transformer --> [batch, choices] --> softmax 55 | 56 | # Ensure the shape is [batch-size, choices, sequence] 57 | assert len(input_ids.shape) == 3 58 | assert len(attention_mask.shape) == 3 59 | assert len(tokentype_ids.shape) == 3 60 | 61 | # Reshape and treat choice dimension the same as batch. 62 | num_choices = input_ids.shape[1] 63 | input_ids = input_ids.view(-1, input_ids.size(-1)) 64 | attention_mask = attention_mask.view(-1, attention_mask.size(-1)) 65 | tokentype_ids = tokentype_ids.view(-1, tokentype_ids.size(-1)) 66 | 67 | extended_attention_mask = bert_extended_attention_mask( 68 | attention_mask, next(self.language_model.parameters()).dtype) 69 | position_ids = bert_position_ids(input_ids) 70 | 71 | _, pooled_output = self.language_model(input_ids, 72 | position_ids, 73 | extended_attention_mask, 74 | tokentype_ids=tokentype_ids) 75 | 76 | # Output. 77 | multichoice_output = self.multichoice_dropout(pooled_output) 78 | multichoice_logits = self.multichoice_head(multichoice_output) 79 | 80 | # Reshape back to separate choices. 
81 | multichoice_logits = multichoice_logits.view(-1, num_choices) 82 | 83 | return multichoice_logits 84 | 85 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 86 | keep_vars=False): 87 | """For easy load when model is combined with other heads, 88 | add an extra key.""" 89 | 90 | state_dict_ = {} 91 | state_dict_[self._language_model_key] \ 92 | = self.language_model.state_dict_for_save_checkpoint( 93 | destination, prefix, keep_vars) 94 | state_dict_[self._multichoice_head_key] \ 95 | = self.multichoice_head.state_dict( 96 | destination, prefix, keep_vars) 97 | return state_dict_ 98 | 99 | def load_state_dict(self, state_dict, strict=True): 100 | """Customized load.""" 101 | 102 | self.language_model.load_state_dict( 103 | state_dict[self._language_model_key], strict=strict) 104 | if self._multichoice_head_key in state_dict: 105 | self.multichoice_head.load_state_dict( 106 | state_dict[self._multichoice_head_key], strict=strict) 107 | else: 108 | print_rank_0('***WARNING*** could not find {} in the checkpoint, ' 109 | 'initializing to random'.format( 110 | self._multichoice_head_key)) 111 | -------------------------------------------------------------------------------- /megatron/data/test/test_indexed_dataset.py: -------------------------------------------------------------------------------- 1 | # This file isn't really a formal automated test, it's just a place to 2 | # put some code used during development and manual testing of 3 | # indexed_dataset. 4 | 5 | from megatron.data import indexed_dataset 6 | from megatron.tokenizer import build_tokenizer 7 | import argparse 8 | import os 9 | import sys 10 | 11 | import torch 12 | 13 | script_dir = os.path.dirname(os.path.realpath(__file__)) 14 | sys.path.append(os.path.join(script_dir, "../../../")) 15 | 16 | 17 | def test_indexed_dataset(args): 18 | ds = indexed_dataset.make_dataset(args.data, args.dataset_impl) 19 | tokenizer = build_tokenizer(args) 20 | print(len(ds.doc_idx)) 21 | print(len(ds)) 22 | print(ds.doc_idx[-1]) 23 | if ds.supports_prefetch: 24 | # just prefetch the whole thing in test (so assume it is small) 25 | ds.prefetch(range(len(ds))) 26 | if args.count > len(ds.doc_idx) - 1: 27 | args.count = len(ds.doc_idx) - 1 28 | 29 | for i in range(args.count): 30 | start = ds.doc_idx[i] 31 | end = ds.doc_idx[i + 1] 32 | ids = ds[start:end] 33 | print(f"Document {i}:") 34 | print("--------------") 35 | for s in ids: 36 | assert len(s) > 0 37 | l = s.data.tolist() 38 | text = tokenizer.detokenize(l) 39 | print(text) 40 | print("---") 41 | 42 | 43 | def test_indexed_dataset_get(args): 44 | ds = indexed_dataset.make_dataset(args.data, args.dataset_impl) 45 | tokenizer = build_tokenizer(args) 46 | size = ds.sizes[0] 47 | print(f"size: {size}") 48 | full = ds.get(0) 49 | print(full) 50 | # print(tokenizer.detokenize(full.data.tolist())) 51 | print("---") 52 | end = ds.get(0, offset=size - 10) 53 | print(end) 54 | # print(tokenizer.detokenize(end.data.tolist())) 55 | 56 | start = ds.get(0, length=10) 57 | print(start) 58 | # print(tokenizer.detokenize(start.data.tolist())) 59 | 60 | part = ds.get(0, offset=2, length=8) 61 | print(part) 62 | # print(tokenizer.detokenize(part.data.tolist())) 63 | 64 | # def test_albert_dataset(args): 65 | # # tokenizer = FullBertTokenizer(args.vocab, do_lower_case=True) 66 | # # idataset = indexed_dataset.make_dataset(args.data, args.dataset_impl) 67 | # # ds = AlbertDataset(idataset, tokenizer) 68 | # ds = AlbertDataset.from_paths(args.vocab, args.data, args.dataset_impl, 69 | # 
args.epochs, args.max_num_samples, 70 | # args.masked_lm_prob, args.seq_length, 71 | # args.short_seq_prob, args.seed) 72 | # truncated = 0 73 | # total = 0 74 | # for i, s in enumerate(ds): 75 | # ids = s['text'] 76 | # tokens = ds.tokenizer.convert_ids_to_tokens(ids) 77 | # print(tokens) 78 | # if i >= args.count-1: 79 | # exit() 80 | 81 | 82 | def main(): 83 | parser = argparse.ArgumentParser() 84 | parser.add_argument('--data', type=str, help='prefix to data files') 85 | parser.add_argument('--dataset-impl', type=str, default='infer', 86 | choices=['lazy', 'cached', 'mmap', 'infer']) 87 | parser.add_argument('--count', type=int, default=10, 88 | help='Number of samples/documents to print') 89 | 90 | group = parser.add_argument_group(title='tokenizer') 91 | group.add_argument('--tokenizer-type', type=str, required=True, 92 | choices=['BertWordPieceLowerCase', 93 | 'GPT2BPETokenizer'], 94 | help='What type of tokenizer to use.') 95 | group.add_argument('--vocab-file', type=str, default=None, 96 | help='Path to the vocab file') 97 | group.add_argument('--merge-file', type=str, default=None, 98 | help='Path to the BPE merge file (if necessary).') 99 | 100 | parser.add_argument('--epochs', type=int, default=5, 101 | help='Number of epochs to plan for') 102 | parser.add_argument('--max-num-samples', type=int, default=None, 103 | help='Maximum number of samples to plan for') 104 | parser.add_argument('--masked-lm-prob', type=float, default=0.15, 105 | help='probability of masking tokens') 106 | parser.add_argument('--seq-length', type=int, default=512, 107 | help='maximum sequence length') 108 | parser.add_argument('--short-seq-prob', type=float, default=0.1, 109 | help='probability of creating a short sequence') 110 | parser.add_argument('--seed', type=int, default=1234, 111 | help='random seed') 112 | args = parser.parse_args() 113 | args.rank = 0 114 | args.make_vocab_size_divisible_by = 128 115 | args.model_parallel_size = 1 116 | 117 | if args.dataset_impl == "infer": 118 | args.dataset_impl = indexed_dataset.infer_dataset_impl(args.data) 119 | 120 | # test_albert_dataset(args) 121 | test_indexed_dataset_get(args) 122 | 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /megatron/mpu/cross_entropy.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | from .initialize import get_model_parallel_group 20 | from .initialize import get_model_parallel_rank 21 | from .initialize import get_model_parallel_world_size 22 | from .utils import VocabUtility 23 | 24 | 25 | class _VocabParallelCrossEntropy(torch.autograd.Function): 26 | 27 | @staticmethod 28 | def forward(ctx, vocab_parallel_logits, target): 29 | 30 | # Maximum value along vocab dimension across all GPUs. 
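# (Editor's note, derivation -- not part of the original file: the loss
#  computed in this forward pass is the usual log-softmax cross entropy,
#      loss = log(sum_j exp(logit_j)) - logit_target,
#  evaluated vocab-parallel: each rank owns a slice of the vocabulary, so the
#  max (for numerical stability), the target logit and the sum of
#  exponentials are each combined across ranks with an all-reduce.)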
31 | logits_max = torch.max(vocab_parallel_logits, dim=-1)[0] 32 | torch.distributed.all_reduce(logits_max, 33 | op=torch.distributed.ReduceOp.MAX, 34 | group=get_model_parallel_group()) 35 | # Subtract the maximum value. 36 | vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1)) 37 | 38 | # Get the partition's vocab indecies 39 | get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size 40 | partition_vocab_size = vocab_parallel_logits.size()[-1] 41 | rank = get_model_parallel_rank() 42 | world_size = get_model_parallel_world_size() 43 | vocab_start_index, vocab_end_index = get_vocab_range( 44 | partition_vocab_size, rank, world_size) 45 | 46 | # Create a mask of valid vocab ids (1 means it needs to be masked). 47 | target_mask = (target < vocab_start_index) | (target >= vocab_end_index) 48 | masked_target = target.clone() - vocab_start_index 49 | masked_target[target_mask] = 0 50 | 51 | # Get predicted-logits = logits[target]. 52 | # For Simplicity, we convert logits to a 2-D tensor with size 53 | # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. 54 | logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size) 55 | masked_target_1d = masked_target.view(-1) 56 | arange_1d = torch.arange(start=0, end=logits_2d.size()[0], 57 | device=logits_2d.device) 58 | predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] 59 | predicted_logits_1d = predicted_logits_1d.clone().contiguous() 60 | predicted_logits = predicted_logits_1d.view_as(target) 61 | predicted_logits[target_mask] = 0.0 62 | # All reduce is needed to get the chunks from other GPUs. 63 | torch.distributed.all_reduce(predicted_logits, 64 | op=torch.distributed.ReduceOp.SUM, 65 | group=get_model_parallel_group()) 66 | 67 | # Sum of exponential of logits along vocab dimension across all GPUs. 68 | exp_logits = vocab_parallel_logits 69 | torch.exp(vocab_parallel_logits, out=exp_logits) 70 | sum_exp_logits = exp_logits.sum(dim=-1) 71 | torch.distributed.all_reduce(sum_exp_logits, 72 | op=torch.distributed.ReduceOp.SUM, 73 | group=get_model_parallel_group()) 74 | 75 | # Loss = log(sum(exp(logits))) - predicted-logit. 76 | loss = torch.log(sum_exp_logits) - predicted_logits 77 | 78 | # Store softmax, target-mask and masked-target for backward pass. 79 | exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) 80 | ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) 81 | 82 | return loss 83 | 84 | @staticmethod 85 | def backward(ctx, grad_output): 86 | 87 | # Retreive tensors from the forward path. 88 | softmax, target_mask, masked_target_1d = ctx.saved_tensors 89 | 90 | # All the inputs have softmax as thier gradient. 91 | grad_input = softmax 92 | # For simplicity, work with the 2D gradient. 93 | partition_vocab_size = softmax.size()[-1] 94 | grad_2d = grad_input.view(-1, partition_vocab_size) 95 | 96 | # Add the gradient from matching classes. 97 | arange_1d = torch.arange(start=0, end=grad_2d.size()[0], 98 | device=grad_2d.device) 99 | grad_2d[arange_1d, masked_target_1d] -= ( 100 | 1.0 - target_mask.view(-1).float()) 101 | 102 | # Finally elementwise multiplication with the output gradients. 
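# (Editor's note: the update above realizes
#      d(loss)/d(logit_j) = softmax_j - 1[j == target]
#  restricted to the locally owned vocab slice; the multiplication below then
#  scales it by the incoming grad_output, as required by the chain rule.)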
103 | grad_input.mul_(grad_output.unsqueeze(dim=-1)) 104 | 105 | return grad_input, None 106 | 107 | 108 | def vocab_parallel_cross_entropy(vocab_parallel_logits, target): 109 | """Helper function for the cross entropy.""" 110 | return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) 111 | -------------------------------------------------------------------------------- /megatron/mpu/grads.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | # Parts of the code here are adapted from PyTorch 18 | # repo: https://github.com/pytorch/pytorch 19 | 20 | 21 | import torch 22 | from torch._six import inf 23 | 24 | try: 25 | from apex.multi_tensor_apply import multi_tensor_applier 26 | import amp_C 27 | 28 | except Exception as e: 29 | print('WARNING: APEX is not installed, multi_tensor_applier will not be available.') 30 | 31 | from .initialize import get_model_parallel_group 32 | from .initialize import get_model_parallel_rank 33 | 34 | 35 | def l2_grad_clipper(parameters, max_norm): 36 | """Efficient L2 norm gradient clipping.""" 37 | 38 | overflow_buf = torch.zeros(1, dtype=torch.int, device='cuda') 39 | # Make sure we have an iterable. 40 | if isinstance(parameters, torch.Tensor): 41 | parameters = [parameters] 42 | # Filter parameters with gradients. 43 | parameters_with_grads = list(filter( 44 | lambda p: p.grad is not None, parameters)) 45 | # Filter parameters for norm calculations. 46 | mp_rank_is_zero = (get_model_parallel_rank() == 0) 47 | parameters_for_norm = list(filter( 48 | lambda p: p.model_parallel or mp_rank_is_zero, parameters_with_grads)) 49 | # Calculate L2 norm. 50 | norm, _ = multi_tensor_applier( 51 | amp_C.multi_tensor_l2norm, 52 | overflow_buf, 53 | [parameters_for_norm], 54 | False # no per-parameter norm 55 | ) 56 | # Sum across all model parallel GPUs. 57 | norm_2 = norm * norm 58 | torch.distributed.all_reduce(norm_2, 59 | op=torch.distributed.ReduceOp.SUM, 60 | group=get_model_parallel_group()) 61 | total_norm = norm_2.item() ** 0.5 62 | # Scale to get max_norm. 63 | clip_coef = float(max_norm) / (total_norm + 1.0e-6) 64 | grads = [p.grad for p in parameters_with_grads] 65 | if clip_coef < 1.0: 66 | multi_tensor_applier( 67 | amp_C.multi_tensor_scale, 68 | overflow_buf, 69 | [grads, grads], 70 | clip_coef) 71 | return total_norm 72 | 73 | 74 | def clip_grad_norm(parameters, max_norm, norm_type=2): 75 | """Clips gradient norm of an iterable of parameters. 76 | 77 | This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and 78 | added functionality to handle model parallel parameters. Note that 79 | the gradients are modified in place. 
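    For norm_type 2 the total norm is total_norm = (sum_p ||p.grad||_2^2)^(1/2),
    with the per-parameter contributions summed across model-parallel ranks;
    every gradient is then scaled by max_norm / (total_norm + 1e-6) whenever
    that coefficient is below one. (Editor's note added for clarity.)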
80 | 81 | Arguments: 82 | parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a 83 | single Tensor that will have gradients normalized 84 | max_norm (float or int): max norm of the gradients 85 | norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for 86 | infinity norm. 87 | 88 | Returns: 89 | Total norm of the parameters (viewed as a single vector). 90 | """ 91 | if isinstance(parameters, torch.Tensor): 92 | parameters = [parameters] 93 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 94 | max_norm = float(max_norm) 95 | norm_type = float(norm_type) 96 | if norm_type == inf: 97 | total_norm = max(p.grad.data.abs().max() for p in parameters) 98 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) 99 | # Take max across all GPUs. 100 | torch.distributed.all_reduce(total_norm_cuda, 101 | op=torch.distributed.ReduceOp.MAX, 102 | group=get_model_parallel_group()) 103 | total_norm = total_norm_cuda[0].item() 104 | clip_coef = max_norm / (total_norm + 1e-6) 105 | if clip_coef < 1: 106 | for p in parameters: 107 | p.grad.data.mul_(clip_coef) 108 | #elif norm_type == 2: 109 | # total_norm = l2_grad_clipper(parameters, max_norm) 110 | 111 | else: 112 | total_norm = 0 113 | for p in parameters: 114 | if p.model_parallel or (get_model_parallel_rank() == 0): 115 | param_norm = p.grad.data.norm(norm_type) 116 | total_norm += param_norm.item() ** norm_type 117 | # Sum across all model parallel GPUs. 118 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) 119 | torch.distributed.all_reduce(total_norm_cuda, 120 | op=torch.distributed.ReduceOp.SUM, 121 | group=get_model_parallel_group()) 122 | total_norm = total_norm_cuda[0].item() ** (1. / norm_type) 123 | clip_coef = max_norm / (total_norm + 1e-6) 124 | if clip_coef < 1: 125 | for p in parameters: 126 | p.grad.data.mul_(clip_coef) 127 | return total_norm 128 | -------------------------------------------------------------------------------- /megatron/learning_rates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Learning rate decay functions.""" 17 | 18 | import math 19 | 20 | from megatron import print_rank_0 21 | 22 | 23 | class AnnealingLR(object): 24 | """Anneals the learning rate.""" 25 | 26 | def __init__(self, optimizer, start_lr, 27 | warmup_iter, total_iters, 28 | decay_style, last_iter, min_lr=0.0, 29 | use_checkpoint_lr_scheduler=True, 30 | override_lr_scheduler=False): 31 | 32 | # Class values. 
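# (Editor's note, worked example -- not original code: with start_lr=1.5e-4,
#  warmup_iter=3200, total_iters=320000 and decay_style='cosine', the first
#  3200 steps ramp the lr linearly up to start_lr, after which
#      lr(t) = 0.5 * start_lr * (cos(pi * (t - warmup_iter) / total_iters) + 1),
#  floored at min_lr.)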
33 | self.optimizer = optimizer 34 | self.start_lr = start_lr 35 | self.min_lr = min_lr 36 | self.warmup_iter = warmup_iter 37 | self.num_iters = last_iter 38 | self.end_iter = total_iters 39 | assert self.end_iter > 0 40 | self.decay_style = decay_style 41 | self.override_lr_scheduler = override_lr_scheduler 42 | self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler 43 | if self.override_lr_scheduler: 44 | assert not self.use_checkpoint_lr_scheduler, 'both override and '\ 45 | 'use-checkpoint are set.' 46 | # Set the learning rate 47 | self.step(self.num_iters) 48 | 49 | print_rank_0('> learning rate decay style: {}'.format(self.decay_style)) 50 | 51 | def get_lr(self): 52 | """Learning rate decay functions from: 53 | https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" 54 | 55 | num_iters_ = min(self.num_iters, self.end_iter - self.warmup_iter) 56 | # Warmup. 57 | if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter: 58 | return float(self.start_lr) * num_iters_ / self.warmup_iter 59 | 60 | num_iters_ = num_iters_ - self.warmup_iter 61 | if self.decay_style == 'linear': 62 | lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter 63 | elif self.decay_style == 'cosine': 64 | lr = self.start_lr / 2.0 * (math.cos( 65 | math.pi * num_iters_ / self.end_iter) + 1) 66 | elif self.decay_style == 'exponential': 67 | # exp(-0.693) = 1/2 68 | lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter) 69 | else: 70 | lr = self.start_lr 71 | return max(lr, self.min_lr) 72 | 73 | def step(self, step_num=None): 74 | """Set lr for all parameters groups.""" 75 | if step_num is None: 76 | step_num = self.num_iters + 1 77 | self.num_iters = step_num 78 | new_lr = self.get_lr() 79 | for group in self.optimizer.param_groups: 80 | group['lr'] = new_lr 81 | 82 | def state_dict(self): 83 | state_dict = { 84 | 'start_lr': self.start_lr, 85 | 'warmup_iter': self.warmup_iter, 86 | 'num_iters': self.num_iters, 87 | 'decay_style': self.decay_style, 88 | 'end_iter': self.end_iter, 89 | 'min_lr': self.min_lr 90 | } 91 | return state_dict 92 | 93 | def _check_and_set(self, cls_value, sd_value, name): 94 | """Auxiliary function for checking the values in the checkpoint and 95 | setting them.""" 96 | if self.override_lr_scheduler: 97 | print_rank_0(' > overriding {} value to {}'.format(name, cls_value)) 98 | return cls_value 99 | 100 | if not self.use_checkpoint_lr_scheduler: 101 | assert cls_value == sd_value, 'AnnealingLR: class input value' \ 102 | 'and checkpoint values for {} do not match'.format(name) 103 | print_rank_0(' > using checkpoint value {} for {}'.format(sd_value, 104 | name)) 105 | return sd_value 106 | 107 | def load_state_dict(self, sd): 108 | 109 | self.start_lr = self._check_and_set(self.start_lr, sd['start_lr'], 110 | 'learning rate') 111 | self.min_lr = self._check_and_set(self.min_lr, sd['min_lr'], 112 | 'minimum learning rate') 113 | self.warmup_iter = self._check_and_set(self.warmup_iter, 114 | sd['warmup_iter'], 115 | 'warmup iterations') 116 | self.end_iter = self._check_and_set(self.end_iter, sd['end_iter'], 117 | 'total number of iterations') 118 | self.decay_style = self._check_and_set(self.decay_style, 119 | sd['decay_style'], 120 | 'decay style') 121 | 122 | self.num_iters = sd['num_iters'] 123 | self.step(self.num_iters) 124 | -------------------------------------------------------------------------------- /megatron/model/distributed.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 
2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors 18 | import torch.distributed as dist 19 | from torch.nn.modules import Module 20 | from torch.autograd import Variable 21 | 22 | from megatron import mpu 23 | from megatron.module import MegatronModule 24 | 25 | 26 | class DistributedDataParallel(MegatronModule): 27 | 28 | def __init__(self, module): 29 | super(DistributedDataParallel, self).__init__() 30 | self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False 31 | 32 | self.module = module 33 | self.data_parallel_group = mpu.get_data_parallel_group() 34 | 35 | def allreduce_params(reduce_after=True, no_scale=False, fp32_allreduce=False): 36 | if(self.needs_reduction): 37 | self.needs_reduction = False 38 | buckets = {} 39 | for name, param in self.module.named_parameters(): 40 | if param.requires_grad and param.grad is not None: 41 | tp = (param.data.type()) 42 | if tp not in buckets: 43 | buckets[tp] = [] 44 | buckets[tp].append(param) 45 | if self.warn_on_half: 46 | if torch.cuda.HalfTensor in buckets: 47 | print("WARNING: gloo dist backend for half parameters may be extremely slow." 
+ 48 | " It is recommended to use the NCCL backend in this case.") 49 | self.warn_on_half = False 50 | for tp in buckets: 51 | bucket = buckets[tp] 52 | grads = [param.grad.data for param in bucket] 53 | coalesced = _flatten_dense_tensors(grads) 54 | if fp32_allreduce: 55 | coalesced = coalesced.float() 56 | if not no_scale and not reduce_after: 57 | coalesced /= dist.get_world_size(group=self.data_parallel_group) 58 | dist.all_reduce(coalesced, group=self.data_parallel_group) 59 | torch.cuda.synchronize() 60 | if not no_scale and reduce_after: 61 | coalesced /= dist.get_world_size(group=self.data_parallel_group) 62 | for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): 63 | buf.copy_(synced) 64 | self.hook_handles = [] 65 | self.hooks = [] 66 | for param in list(self.module.parameters()): 67 | def allreduce_hook(*unused): 68 | Variable._execution_engine.queue_callback(allreduce_params) 69 | # handle = param.register_hook(allreduce_hook) 70 | # self.hooks.append(allreduce_hook) 71 | # self.hook_handles.append(handle) 72 | self.allreduce_params = allreduce_params 73 | 74 | def forward(self, *inputs, **kwargs): 75 | self.needs_reduction = True 76 | return self.module(*inputs, **kwargs) 77 | 78 | def state_dict(self, destination=None, prefix='', keep_vars=False): 79 | #[h.remove() for h in self.hook_handles] 80 | sd = self.module.state_dict(destination, prefix, keep_vars) 81 | # for handle, hook in zip(self.hook_handles, self.hooks): 82 | # d = handle.hooks_dict_ref() 83 | # d[handle.id] = hook 84 | 85 | return sd 86 | 87 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 88 | keep_vars=False): 89 | return self.module.state_dict_for_save_checkpoint(destination, prefix, 90 | keep_vars) 91 | 92 | def load_state_dict(self, state_dict, strict=True): 93 | self.module.load_state_dict(state_dict, strict=strict) 94 | 95 | ''' 96 | def _sync_buffers(self): 97 | buffers = list(self.module._all_buffers()) 98 | if len(buffers) > 0: 99 | # cross-node buffer sync 100 | flat_buffers = _flatten_dense_tensors(buffers) 101 | dist.broadcast(flat_buffers, 0) 102 | for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): 103 | buf.copy_(synced) 104 | def train(self, mode=True): 105 | # Clear NCCL communicator and CUDA event cache of the default group ID, 106 | # These cache will be recreated at the later call. This is currently a 107 | # work-around for a potential NCCL deadlock. 108 | if dist._backend == dist.dist_backend.NCCL: 109 | dist._clear_group_cache() 110 | super(DistributedDataParallel, self).train(mode) 111 | self.module.train(mode) 112 | ''' 113 | -------------------------------------------------------------------------------- /megatron/model/fused_softmax.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import torch 17 | 18 | class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function) : 19 | """ 20 | Fused operation which performs following three operations in sequence 21 | 1. Scale the tensor. 22 | 2. Apply upper triangular mask (typically used in gpt models). 23 | 3. Perform softmax. 24 | """ 25 | @staticmethod 26 | def forward(ctx, inputs, scale): 27 | import scaled_upper_triang_masked_softmax_cuda 28 | scale_t = torch.tensor([scale]) 29 | 30 | softmax_results = \ 31 | scaled_upper_triang_masked_softmax_cuda.forward(inputs, scale_t[0]) 32 | ctx.save_for_backward(softmax_results, scale_t) 33 | return softmax_results 34 | 35 | @staticmethod 36 | def backward(ctx, output_grads): 37 | import scaled_upper_triang_masked_softmax_cuda 38 | softmax_results, scale_t = ctx.saved_tensors 39 | 40 | input_grads = \ 41 | scaled_upper_triang_masked_softmax_cuda.backward(output_grads, 42 | softmax_results, 43 | scale_t[0]) 44 | return input_grads, None 45 | 46 | class ScaledMaskedSoftmax(torch.autograd.Function) : 47 | """ 48 | Fused operation which performs following three operations in sequence 49 | 1. Scale the tensor. 50 | 2. Apply the mask. 51 | 3. Perform softmax. 52 | """ 53 | @staticmethod 54 | def forward(ctx, inputs, mask, scale): 55 | import scaled_masked_softmax_cuda 56 | scale_t = torch.tensor([scale]) 57 | 58 | softmax_results = \ 59 | scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0]) 60 | ctx.save_for_backward(softmax_results, scale_t) 61 | return softmax_results 62 | 63 | @staticmethod 64 | def backward(ctx, output_grads): 65 | import scaled_masked_softmax_cuda 66 | softmax_results, scale_t = ctx.saved_tensors 67 | 68 | input_grads = \ 69 | scaled_masked_softmax_cuda.backward(output_grads, 70 | softmax_results, 71 | scale_t[0]) 72 | return input_grads, None, None 73 | 74 | class FusedScaleMaskSoftmax(torch.nn.Module): 75 | """ 76 | fused operation: scaling + mask + softmax 77 | Arguments: 78 | input_in_fp16: flag to indicate if input in fp16 data format. 79 | upper_triang_mask: if true, apply upper triangular masking. 80 | (used in gpt family networks) 81 | mask_func: mask function to be applied. 82 | softmax_in_fp32: if true, softmax in performed at fp32 precision. 83 | scale: scaling factor used in input tensor scaling. 
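        Note: the fused CUDA kernel path in forward() is used only when the input
        is fp16, one of the two fusion flags is set, the softmax dimension is at
        most 2048, and the last two dimensions are equal (square attention scores);
        otherwise the unfused mask_func + torch.nn.Softmax fallback below is taken.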
84 | 85 | """ 86 | def __init__(self, input_in_fp16, upper_triang_mask_fusion, 87 | general_mask_fusion, mask_func, softmax_in_fp32, scale): 88 | super(FusedScaleMaskSoftmax, self).__init__() 89 | self.input_in_fp16 = input_in_fp16 90 | self.upper_triang_mask_fusion = upper_triang_mask_fusion 91 | self.general_mask_fusion = general_mask_fusion 92 | self.mask_func = mask_func 93 | self.softmax_in_fp32 = softmax_in_fp32 94 | self.scale = scale 95 | 96 | assert self.scale is None or softmax_in_fp32, \ 97 | 'softmax should be in fp32 when scaled' 98 | 99 | def forward(self, input, mask): 100 | # [b, np, s, s] 101 | data_size = input.size() 102 | assert input.dim() == 4 103 | 104 | # invoke custom kernel 105 | if self.input_in_fp16 and data_size[-1] <= 2048 and \ 106 | (self.upper_triang_mask_fusion or self.general_mask_fusion) and \ 107 | input.size()[2] == input.size()[3]: 108 | scale = self.scale if self.scale is not None else 1.0 109 | if self.upper_triang_mask_fusion: 110 | input = input.view(-1, data_size[2], data_size[3]) 111 | probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale) 112 | probs = probs.view(*data_size) 113 | else: 114 | probs = ScaledMaskedSoftmax.apply(input, mask, scale) 115 | else: 116 | if self.input_in_fp16 and self.softmax_in_fp32: 117 | input = input.float() 118 | 119 | if self.scale is not None: 120 | input = input * self.scale 121 | mask_output = self.mask_func(input, mask) 122 | probs = torch.nn.Softmax(dim=-1)(mask_output) 123 | 124 | if self.input_in_fp16 and self.softmax_in_fp32: 125 | probs = probs.half() 126 | 127 | return probs 128 | -------------------------------------------------------------------------------- /megatron/memory.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | 20 | # A dictionary of all the memory buffers allocated. 21 | _MEM_BUFFS = dict() 22 | 23 | 24 | def allocate_mem_buff(name, numel, dtype, track_usage): 25 | """Allocate a memory buffer.""" 26 | assert name not in _MEM_BUFFS, \ 27 | 'memory buffer {} already allocated.'.format(name) 28 | _MEM_BUFFS[name] = MemoryBuffer(name, numel, dtype, track_usage) 29 | return _MEM_BUFFS[name] 30 | 31 | 32 | def get_mem_buff(name): 33 | """Get the memory buffer.""" 34 | return _MEM_BUFFS[name] 35 | 36 | 37 | class MemoryBuffer: 38 | """Contiguous memory buffer. 39 | Allocate a contiguous memory of type `dtype` and size `numel`. It is 40 | used to reduce memory fragmentation. 41 | 42 | Usage: After the allocation, the `_start` index is set tot the first 43 | index of the memory. A memory chunk starting from `_start` index 44 | can be `allocated` for an input tensor, with the elements of the 45 | tensor being coppied. The buffer can be reused by resetting the 46 | `_start` index. 
47 | 48 | """ 49 | def __init__(self, name, numel, dtype, track_usage): 50 | if torch.distributed.get_rank() == 0: 51 | element_size = torch.tensor([], dtype=dtype).element_size() 52 | print('> building the {} memory buffer with {} num elements ' 53 | 'and {} dtype ({:.1f} MB)...'.format( 54 | name, numel, dtype, numel*element_size/1024/1024), 55 | flush=True) 56 | self.name = name 57 | self.numel = numel 58 | self.dtype = dtype 59 | self.data = torch.empty(self.numel, 60 | dtype=self.dtype, 61 | device=torch.cuda.current_device(), 62 | requires_grad=False) 63 | 64 | # Index tracking the start of the free memory. 65 | self._start = 0 66 | 67 | # Values used for tracking usage. 68 | self.track_usage = track_usage 69 | if self.track_usage: 70 | self.in_use_value = 0.0 71 | self.total_value = 0.0 72 | 73 | 74 | def reset(self): 75 | """Reset the buffer start index to the beginning of the buffer.""" 76 | self._start = 0 77 | 78 | 79 | def is_in_use(self): 80 | """Whether the current buffer hold on to any memory.""" 81 | return self._start > 0 82 | 83 | 84 | def numel_in_use(self): 85 | """Return number of elements in use.""" 86 | return self._start 87 | 88 | 89 | def add(self, tensor): 90 | """Allocate a chunk of memory from the buffer to tensor and copy 91 | the values.""" 92 | assert tensor.dtype == self.dtype, \ 93 | 'Input tensor type {} different from buffer type {}'.format( 94 | tensor.dtype, self.dtype) 95 | # Number of elements of the input tensor. 96 | tensor_numel = torch.numel(tensor) 97 | new_start = self._start + tensor_numel 98 | assert new_start <= self.numel, \ 99 | 'Not enough memory left in the buffer ({} > {})'.format( 100 | tensor_numel, self.numel - self._start) 101 | # New tensor is a view into the memory. 102 | new_tensor = self.data[self._start:new_start] 103 | self._start = new_start 104 | new_tensor = new_tensor.view(tensor.shape) 105 | new_tensor.copy_(tensor) 106 | # Return a pointer to the new tensor. 107 | return new_tensor 108 | 109 | 110 | def get_data(self): 111 | """Return the data currently in use.""" 112 | if self.track_usage: 113 | self.in_use_value += float(self._start) 114 | self.total_value += float(self.numel) 115 | return self.data[:self._start] 116 | 117 | 118 | def print_average_usage(self): 119 | """Print memory usage average over time. We would like this value 120 | to be as high as possible.""" 121 | assert self.track_usage, 'You need to enable track usage.' 122 | if torch.distributed.get_rank() == 0: 123 | print(' > usage of {} memory buffer: {:.2f} %'.format( 124 | self.name, self.in_use_value * 100.0 / self.total_value), 125 | flush=True) 126 | 127 | 128 | 129 | class RingMemBuffer: 130 | """A ring of memory buffers.""" 131 | 132 | def __init__(self, name, num_buffers, numel, dtype, track_usage): 133 | self.num_buffers = num_buffers 134 | self.buffers = [ 135 | allocate_mem_buff(name+' {}'.format(i), numel, dtype, track_usage) 136 | for i in range(num_buffers)] 137 | self._index = -1 138 | 139 | 140 | def get_next_buffer(self): 141 | self._index += 1 142 | self._index = self._index % self.num_buffers 143 | buff = self.buffers[self._index] 144 | assert not buff.is_in_use(), 'buffer is already in use.' 145 | return buff 146 | -------------------------------------------------------------------------------- /pretrain_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Pretrain GPT2""" 17 | 18 | import torch 19 | 20 | from megatron import get_args 21 | from megatron import print_rank_0 22 | from megatron import get_timers 23 | from megatron import get_tokenizer 24 | from megatron import mpu 25 | from megatron.data.gpt2_dataset import build_train_valid_test_datasets 26 | from megatron.model import GPT2Model, GPT2ModelPipe 27 | from megatron.training import pretrain 28 | from megatron.utils import get_ltor_masks_and_position_ids 29 | from megatron.utils import reduce_losses 30 | from megatron.fp16 import fp32_to_fp16 31 | 32 | # pretend this is a great DeepSpeed change too 33 | 34 | def model_provider(): 35 | """Build the model.""" 36 | args = get_args() 37 | print_rank_0('building GPT2 model ...') 38 | if args.pipe_parallel_size == 0: 39 | model = GPT2Model(num_tokentypes=0, parallel_output=True) 40 | else: 41 | model = GPT2ModelPipe(num_tokentypes=0, parallel_output=True, topology=mpu.get_topology()) 42 | model._megatron_batch_fn = get_batch_pipe 43 | model._input_grad = [True, False] 44 | model._input_type = ['float', 'bool'] 45 | model._input_pipe_partitioned = [True, False] 46 | return model 47 | 48 | def get_batch(data_iterator): 49 | """Generate a batch""" 50 | args = get_args() 51 | tokenizer = get_tokenizer() 52 | 53 | # Items and their type. 54 | keys = ['text'] 55 | datatype = torch.int64 56 | 57 | # Broadcast data. 58 | if data_iterator is not None: 59 | data = next(data_iterator) 60 | else: 61 | data = None 62 | data_b = mpu.broadcast_data(keys, data, datatype) 63 | 64 | # Unpack. 65 | tokens_ = data_b['text'].long() 66 | labels = tokens_[:, 1:].contiguous() 67 | tokens = tokens_[:, :-1].contiguous() 68 | 69 | # Get the masks and postition ids. 70 | attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( 71 | tokens, 72 | tokenizer.eod, 73 | args.reset_position_ids, 74 | args.reset_attention_mask, 75 | args.eod_mask_loss) 76 | 77 | return tokens, labels, loss_mask, attention_mask, position_ids 78 | 79 | def get_batch_pipe(data): 80 | """A modification of get_batch() to work with the latest batch instead of an iterator. """ 81 | args = get_args() 82 | tokenizer = get_tokenizer() 83 | 84 | # Items and their type. 85 | keys = ['text'] 86 | datatype = torch.int64 87 | 88 | # Broadcast data. 89 | data_b = mpu.broadcast_data(keys, data, datatype) 90 | 91 | # Unpack. 92 | tokens_ = data_b['text'].long() 93 | labels = tokens_[:, 1:].contiguous() 94 | tokens = tokens_[:, :-1].contiguous() 95 | 96 | # Get the masks and postition ids. 97 | attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( 98 | tokens, 99 | tokenizer.eod, 100 | args.reset_position_ids, 101 | args.reset_attention_mask, 102 | args.eod_mask_loss) 103 | 104 | # unpack data 105 | if args.fp16: 106 | # cast to fp16 because pipeline parallelism skips the FP16 wrapper. 
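            # The pipeline engine consumes the tuples returned here directly, stage
            # to stage, so any float tensors must already be half precision here.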
107 | return fp32_to_fp16((tokens, position_ids, attention_mask)), fp32_to_fp16((labels, loss_mask)) 108 | else: 109 | return (tokens, position_ids, attention_mask), (labels, loss_mask) 110 | 111 | 112 | def forward_step(data_iterator, model): 113 | """Forward step.""" 114 | args = get_args() 115 | timers = get_timers() 116 | 117 | # Get the batch. 118 | timers('batch generator').start() 119 | tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data_iterator) 120 | timers('batch generator').stop() 121 | # Forward model. 122 | losses = model(tokens, position_ids, attention_mask, labels=labels) 123 | loss_mask = loss_mask.view(-1) 124 | loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() 125 | 126 | # Reduce loss for logging. 127 | reduced_loss = reduce_losses([loss]) 128 | 129 | return loss, {'lm loss': reduced_loss[0]} 130 | 131 | 132 | def train_valid_test_datasets_provider(train_val_test_num_samples): 133 | """Build train, valid, and test datasets.""" 134 | args = get_args() 135 | 136 | print_rank_0('> building train, validation, and test datasets ' 137 | 'for GPT2 ...') 138 | train_ds, valid_ds, test_ds = build_train_valid_test_datasets( 139 | data_prefix=args.data_path, 140 | data_impl=args.data_impl, 141 | splits_string=args.split, 142 | train_valid_test_num_samples=train_val_test_num_samples, 143 | seq_length=args.seq_length, 144 | seed=args.seed, 145 | skip_warmup=(not args.mmap_warmup)) 146 | print_rank_0("> finished creating GPT2 datasets ...") 147 | return train_ds, valid_ds, test_ds 148 | 149 | 150 | if __name__ == "__main__": 151 | pretrain(train_valid_test_datasets_provider, model_provider, forward_step, 152 | args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) 153 | -------------------------------------------------------------------------------- /megatron/deprecated_data_utils/tf_dl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
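#
# Usage sketch (illustrative; the record path and sizes are placeholders):
#
#     loader = TFRecordDataLoader(['/path/to/bert_shard.tfrecord'],
#                                 batch_size=32, max_seq_len=512,
#                                 max_preds_per_seq=80, train=True)
#     for batch in loader:
#         # each batch is a dict of torch tensors with keys:
#         # 'text', 'types', 'is_random', 'pad_mask', 'mask', 'mask_labels'
#         ...
#
# Note: this module targets the TensorFlow 1.x API (eager mode enabled explicitly,
# tf.contrib.data) and lives under deprecated_data_utils.
#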
15 | """PyTorch DataLoader for TFRecords""" 16 | 17 | import numpy as np 18 | import torch 19 | import queue 20 | import threading 21 | 22 | import tensorflow as tf 23 | tf.enable_eager_execution() 24 | 25 | 26 | class TFRecordDataLoader(object): 27 | def __init__(self, records, batch_size, max_seq_len, max_preds_per_seq, 28 | train, num_workers=2, seed=1, threaded_dl=False): 29 | assert max_preds_per_seq is not None, "--max-preds-per-seq MUST BE SPECIFIED when using tfrecords" 30 | tf.set_random_seed(seed) 31 | if isinstance(records, str): 32 | records = [records] 33 | 34 | self.record_converter = Record2Example({"input_ids": tf.FixedLenFeature([max_seq_len], tf.int64), 35 | "input_mask": tf.FixedLenFeature([max_seq_len], tf.int64), 36 | "segment_ids": tf.FixedLenFeature([max_seq_len], tf.int64), 37 | "masked_lm_positions": tf.FixedLenFeature([max_preds_per_seq], tf.int64), 38 | "masked_lm_ids": tf.FixedLenFeature([max_preds_per_seq], tf.int64), 39 | "masked_lm_weights": tf.FixedLenFeature([max_preds_per_seq], tf.float32), 40 | "next_sentence_labels": tf.FixedLenFeature([1], tf.int64)}) 41 | 42 | # Instantiate dataset according to original BERT implementation 43 | if train: 44 | self.dataset = tf.data.Dataset.from_tensor_slices(tf.constant(records)) 45 | self.dataset = self.dataset.repeat() 46 | self.dataset = self.dataset.shuffle(buffer_size=len(records)) 47 | 48 | # use sloppy tfrecord dataset 49 | self.dataset = self.dataset.apply( 50 | tf.contrib.data.parallel_interleave( 51 | tf.data.TFRecordDataset, 52 | sloppy=train, 53 | cycle_length=min(num_workers, len(records)))) 54 | self.dataset = self.dataset.shuffle(buffer_size=100) 55 | else: 56 | self.dataset = tf.data.TFRecordDataset(records) 57 | self.dataset = self.dataset.repeat() 58 | 59 | # Instantiate dataloader (do not drop remainder for eval) 60 | loader_args = {'batch_size': batch_size, 61 | 'num_parallel_batches': num_workers, 62 | 'drop_remainder': train} 63 | self.dataloader = self.dataset.apply( 64 | tf.contrib.data.map_and_batch( 65 | self.record_converter, **loader_args)) 66 | self.threaded_dl = threaded_dl 67 | self.num_workers = num_workers 68 | 69 | def __iter__(self): 70 | if self.threaded_dl: 71 | data_iter = iter(MultiprocessLoader(self.dataloader, self.num_workers)) 72 | for item in data_iter: 73 | yield item 74 | else: 75 | data_iter = iter(self.dataloader) 76 | for item in data_iter: 77 | yield convert_tf_example_to_torch_tensors(item) 78 | 79 | 80 | class Record2Example(object): 81 | def __init__(self, feature_map): 82 | self.feature_map = feature_map 83 | 84 | def __call__(self, record): 85 | """Decodes a BERT TF record to a TF example.""" 86 | example = tf.parse_single_example(record, self.feature_map) 87 | for k, v in list(example.items()): 88 | if v.dtype == tf.int64: 89 | example[k] = tf.to_int32(v) 90 | return example 91 | 92 | 93 | def convert_tf_example_to_torch_tensors(example): 94 | item = {k: (v.numpy()) for k, v in example.items()} 95 | mask = np.zeros_like(item['input_ids']) 96 | mask_labels = np.ones_like(item['input_ids']) * -1 97 | for b, row in enumerate(item['masked_lm_positions'].astype(int)): 98 | for i, idx in enumerate(row): 99 | if item['masked_lm_weights'][b, i] != 0: 100 | mask[b, idx] = 1 101 | mask_labels[b, idx] = item['masked_lm_ids'][b, i] 102 | output = {'text': item['input_ids'], 'types': item['segment_ids'], 'is_random': item['next_sentence_labels'], 103 | 'pad_mask': 1 - item['input_mask'], 'mask': mask, 'mask_labels': mask_labels} 104 | return {k: torch.from_numpy(v) for k, v in 
output.items()} 105 | 106 | 107 | class MultiprocessLoader(object): 108 | def __init__(self, dataloader, num_workers=2): 109 | self.dl = dataloader 110 | self.queue_size = 2 * num_workers 111 | 112 | def __iter__(self): 113 | output_queue = queue.Queue(self.queue_size) 114 | output_thread = threading.Thread(target=_multiproc_iter, 115 | args=(self.dl, output_queue)) 116 | output_thread.daemon = True 117 | output_thread.start() 118 | 119 | while output_thread.is_alive(): 120 | yield output_queue.get(block=True) 121 | else: 122 | print(RuntimeError('TF record data loader thread exited unexpectedly')) 123 | 124 | 125 | def _multiproc_iter(dl, output_queue): 126 | data_iter = iter(dl) 127 | for item in data_iter: 128 | tensors = convert_tf_example_to_torch_tensors(item) 129 | output_queue.put(tensors, block=True) 130 | -------------------------------------------------------------------------------- /megatron/data/ict_dataset.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import random 3 | 4 | import numpy as np 5 | from torch.utils.data import Dataset 6 | 7 | from megatron import get_tokenizer 8 | from megatron import get_args 9 | from megatron.data.dataset_utils import get_indexed_dataset_ 10 | from megatron.data.realm_dataset_utils import get_block_samples_mapping 11 | 12 | 13 | def get_ict_dataset(use_titles=True, query_in_block_prob=1): 14 | """Get a dataset which uses block samples mappings to get ICT/block indexing data (via get_block()) 15 | rather than for training, since it is only built with a single epoch sample mapping. 16 | """ 17 | args = get_args() 18 | block_dataset = get_indexed_dataset_(args.data_path, 'mmap', True) 19 | titles_dataset = get_indexed_dataset_(args.titles_data_path, 'mmap', True) 20 | 21 | kwargs = dict( 22 | name='full', 23 | block_dataset=block_dataset, 24 | title_dataset=titles_dataset, 25 | data_prefix=args.data_path, 26 | num_epochs=1, 27 | max_num_samples=None, 28 | max_seq_length=args.seq_length, 29 | seed=1, 30 | query_in_block_prob=query_in_block_prob, 31 | use_titles=use_titles, 32 | use_one_sent_docs=args.use_one_sent_docs 33 | ) 34 | dataset = ICTDataset(**kwargs) 35 | return dataset 36 | 37 | 38 | class ICTDataset(Dataset): 39 | """Dataset containing sentences and their blocks for an inverse cloze task.""" 40 | def __init__(self, name, block_dataset, title_dataset, data_prefix, 41 | num_epochs, max_num_samples, max_seq_length, query_in_block_prob, 42 | seed, use_titles=True, use_one_sent_docs=False): 43 | self.name = name 44 | self.seed = seed 45 | self.max_seq_length = max_seq_length 46 | self.query_in_block_prob = query_in_block_prob 47 | self.block_dataset = block_dataset 48 | self.title_dataset = title_dataset 49 | self.rng = random.Random(self.seed) 50 | self.use_titles = use_titles 51 | self.use_one_sent_docs = use_one_sent_docs 52 | 53 | self.samples_mapping = get_block_samples_mapping( 54 | block_dataset, title_dataset, data_prefix, num_epochs, 55 | max_num_samples, max_seq_length, seed, name, use_one_sent_docs) 56 | self.tokenizer = get_tokenizer() 57 | self.vocab_id_list = list(self.tokenizer.inv_vocab.keys()) 58 | self.vocab_id_to_token_list = self.tokenizer.inv_vocab 59 | self.cls_id = self.tokenizer.cls 60 | self.sep_id = self.tokenizer.sep 61 | self.mask_id = self.tokenizer.mask 62 | self.pad_id = self.tokenizer.pad 63 | 64 | def __len__(self): 65 | return len(self.samples_mapping) 66 | 67 | def __getitem__(self, idx): 68 | """Get an ICT example of a pseudo-query and the 
block of text from which it was extracted""" 69 | sample_data = self.samples_mapping[idx] 70 | start_idx, end_idx, doc_idx, block_idx = sample_data.as_tuple() 71 | 72 | if self.use_titles: 73 | title = self.title_dataset[int(doc_idx)] 74 | title_pad_offset = 3 + len(title) 75 | else: 76 | title = None 77 | title_pad_offset = 2 78 | block = [self.block_dataset[i] for i in range(start_idx, end_idx)] 79 | assert len(block) > 1 or self.use_one_sent_docs or self.query_in_block_prob == 1 80 | 81 | # randint() is inclusive for Python rng 82 | rand_sent_idx = self.rng.randint(0, len(block) - 1) 83 | 84 | # keep the query in the context query_in_block_prob fraction of the time. 85 | if self.rng.random() < self.query_in_block_prob: 86 | query = block[rand_sent_idx].copy() 87 | else: 88 | query = block.pop(rand_sent_idx) 89 | 90 | # still need to truncate because blocks are concluded when 91 | # the sentence lengths have exceeded max_seq_length. 92 | query = query[:self.max_seq_length - 2] 93 | block = list(itertools.chain(*block))[:self.max_seq_length - title_pad_offset] 94 | 95 | query_tokens, query_pad_mask = self.concat_and_pad_tokens(query) 96 | block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title) 97 | block_data = sample_data.as_array() 98 | 99 | sample = { 100 | 'query_tokens': query_tokens, 101 | 'query_pad_mask': query_pad_mask, 102 | 'block_tokens': block_tokens, 103 | 'block_pad_mask': block_pad_mask, 104 | 'block_data': block_data, 105 | } 106 | 107 | return sample 108 | 109 | def get_block(self, start_idx, end_idx, doc_idx): 110 | """Get the IDs for an evidence block plus the title of the corresponding document""" 111 | block = [self.block_dataset[i] for i in range(start_idx, end_idx)] 112 | title = self.title_dataset[int(doc_idx)] 113 | 114 | block = list(itertools.chain(*block))[:self.max_seq_length - (3 + len(title))] 115 | block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title) 116 | 117 | return block_tokens, block_pad_mask 118 | 119 | def get_null_block(self): 120 | """Get empty block and title - used in REALM pretraining""" 121 | block, title = [], [] 122 | block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title) 123 | 124 | return block_tokens, block_pad_mask 125 | 126 | def concat_and_pad_tokens(self, tokens, title=None): 127 | """Concat with special tokens and pad sequence to self.max_seq_length""" 128 | tokens = list(tokens) 129 | if title is None: 130 | tokens = [self.cls_id] + tokens + [self.sep_id] 131 | else: 132 | title = list(title) 133 | tokens = [self.cls_id] + title + [self.sep_id] + tokens + [self.sep_id] 134 | assert len(tokens) <= self.max_seq_length 135 | 136 | num_pad = self.max_seq_length - len(tokens) 137 | pad_mask = [1] * len(tokens) + [0] * num_pad 138 | tokens += [self.pad_id] * num_pad 139 | 140 | return np.array(tokens), np.array(pad_mask) 141 | -------------------------------------------------------------------------------- /megatron/deprecated_data_utils/samplers.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """batch samplers that work with either random or sequential data samplers""" 16 | import math 17 | import os 18 | import sys 19 | 20 | import torch 21 | from torch.utils import data 22 | import numpy as np 23 | 24 | 25 | class RandomSampler(data.sampler.Sampler): 26 | r""" 27 | Based off of pytorch RandomSampler and DistributedSampler. Essentially a RandomSampler, 28 | but this class lets the user set an epoch like DistributedSampler 29 | Samples elements randomly. If without replacement, then sample from a shuffled dataset. 30 | If with replacement, then user can specify ``num_samples`` to draw. 31 | Arguments: 32 | data_source (Dataset): dataset to sample from 33 | num_samples (int): number of samples to draw, default=len(dataset) 34 | replacement (bool): samples are drawn with replacement if ``True``, default=False 35 | """ 36 | 37 | def __init__(self, data_source, replacement=False, num_samples=None): 38 | self.data_source = data_source 39 | self.replacement = replacement 40 | self._num_samples = num_samples 41 | self.epoch = -1 42 | 43 | if self._num_samples is not None and replacement is False: 44 | raise ValueError("With replacement=False, num_samples should not be specified, " 45 | "since a random permute will be performed.") 46 | 47 | if not isinstance(self.num_samples, int) or self.num_samples <= 0: 48 | raise ValueError("num_samples should be a positive integer " 49 | "value, but got num_samples={}".format(self.num_samples)) 50 | if not isinstance(self.replacement, bool): 51 | raise ValueError("replacement should be a boolean value, but got " 52 | "replacement={}".format(self.replacement)) 53 | 54 | @property 55 | def num_samples(self): 56 | # dataset size might change at runtime 57 | if self._num_samples is None: 58 | return len(self.data_source) 59 | return self._num_samples 60 | 61 | def __iter__(self): 62 | n = len(self.data_source) 63 | g = torch.Generator() 64 | if self.epoch >= 0: 65 | g.manual_seed(self.epoch) 66 | if self.replacement: 67 | return iter(torch.randint(high=n, size=(self.num_samples,), 68 | dtype=torch.int64, generator=g).tolist()) 69 | return iter(torch.randperm(n, generator=g).tolist()) 70 | 71 | def __len__(self): 72 | return self.num_samples 73 | 74 | def set_epoch(self, epoch): 75 | self.epoch = epoch 76 | 77 | 78 | class DistributedBatchSampler(data.sampler.BatchSampler): 79 | """ 80 | similar to normal implementation of distributed sampler, except implementation is at the 81 | batch sampler level, instead of just the sampler level. This allows wrapping of arbitrary 82 | data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler. 
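    Each global batch of ``batch_size`` indices is split contiguously across ranks:
    rank ``r`` receives ``batch[r * batch_size // world_size : (r + 1) * batch_size // world_size]``
    (see ``_batch`` below).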
83 | """ 84 | 85 | def __init__(self, sampler, batch_size, drop_last, rank=-1, world_size=2, wrap_last=False): 86 | super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last) 87 | if rank == -1: 88 | assert False, 'should not be here' 89 | rank = torch.distributed.get_rank() 90 | self.rank = rank 91 | self.world_size = world_size 92 | self.sampler.wrap_around = 0 93 | self.wrap_around = 0 94 | self.wrap_last = wrap_last 95 | self.start_iter = 0 96 | 97 | def __iter__(self): 98 | batch = [] 99 | last_batch = None 100 | i = 0 101 | for idx in self.data_iterator(self.sampler, wrap_around=False): 102 | batch.append(idx) 103 | if len(batch) == self.batch_size: 104 | tbatch = self._batch(batch) 105 | if i >= self.start_iter: 106 | yield tbatch 107 | self.start_iter = 0 108 | i += 1 109 | last_batch = np.array(list(tbatch)) 110 | batch = [] 111 | batch_len = len(batch) 112 | if batch_len > 0 and not self.drop_last: 113 | if self.wrap_last: 114 | self.sampler.wrap_around -= (self.batch_size) 115 | self.wrap_around += (len(batch)) 116 | self.wrap_around %= self.batch_size 117 | if isinstance(self.sampler, TransposedSampler): 118 | for i, idx in enumerate(self.data_iterator(self.sampler, wrap_around=True)): 119 | if i == 0: 120 | continue 121 | batch.append(idx) 122 | new_batch_len = len(batch) 123 | if len(batch) == self.batch_size: 124 | break 125 | yield self._batch(batch) 126 | if self.wrap_last: 127 | self.sampler.wrap_around += self.batch_size 128 | 129 | def data_iterator(self, _iter, wrap_around=False): 130 | """iterates through data and handles wrap around""" 131 | for i, idx in enumerate(_iter): 132 | if i < self.wrap_around % self.batch_size: 133 | continue 134 | if wrap_around: 135 | self.wrap_around += 1 136 | self.wrap_around %= self.batch_size 137 | yield idx 138 | 139 | def _batch(self, batch): 140 | """extracts samples only pertaining to this worker's batch""" 141 | start = self.rank * self.batch_size // self.world_size 142 | end = (self.rank + 1) * self.batch_size // self.world_size 143 | return batch[start:end] 144 | -------------------------------------------------------------------------------- /megatron/deprecated_data_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """utils for creating datasets""" 16 | import os 17 | import math 18 | 19 | import torch 20 | 21 | from .samplers import DistributedBatchSampler 22 | from .datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, bert_sentencepair_dataset, GPT2Dataset 23 | from .lazy_loader import exists_lazy, make_lazy, lazy_array_loader 24 | from .tokenization import Tokenization, CommandToken, Tokenizer, CharacterLevelTokenizer, BertWordPieceTokenizer, GPT2BPETokenizer, make_tokenizer 25 | from . 
import corpora 26 | 27 | TRAIN_DATA = 0 28 | VAL_DATA = 1 29 | TEST_DATA = 2 30 | 31 | 32 | def should_split(split): 33 | """ 34 | given split proportions checks if should split 35 | Examples: 36 | >>> should_split([10,0,0]) 37 | False 38 | >>> should_split([1,.1,.2]) 39 | True 40 | """ 41 | return max(split) / sum(split) != 1. 42 | 43 | 44 | def get_ext(path): 45 | """gets path extension""" 46 | return os.path.splitext(path)[1] 47 | 48 | 49 | def get_dataset(path, **kwargs): 50 | """gets dataset object based on keyword args and file at `path`""" 51 | if supported_corpus(path): 52 | return corpora.NAMED_CORPORA[path](**kwargs) 53 | ext = get_ext(path) 54 | if '.json' in ext: 55 | text = json_dataset(path, **kwargs) 56 | elif ext in ['.csv', '.tsv']: 57 | text = csv_dataset(path, **kwargs) 58 | else: 59 | raise NotImplementedError('data file type %s is not supported' % (ext)) 60 | return text 61 | 62 | 63 | def supported_corpus(corpus_name): 64 | """checks if corpus name is defined in `corpora.py`""" 65 | return corpus_name in corpora.NAMED_CORPORA 66 | 67 | 68 | def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.], 69 | delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None, 70 | tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None, 71 | model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None, 72 | parallel_group=None, **kwargs): 73 | """function to create datasets+tokenizers for common options""" 74 | if isinstance(process_fn, str): 75 | process_fn = eval(process_fn) 76 | if non_binary_cols is not None: 77 | # multilabel dataset support (only for csvs) 78 | label_key = non_binary_cols 79 | 80 | def get_dataset_from_path(path_): 81 | if lazy: 82 | # get lazily loaded dataset 83 | named_corpora = False 84 | if supported_corpus(path_): 85 | named_corpora = True 86 | name = path_ 87 | path_ = corpora.NAMED_CORPORA[path_].PATH 88 | if torch.distributed.get_rank() == 0 and not exists_lazy(path_, data_type='data'): 89 | # create cached version of dataset for lazy loading if it doesn't exist 90 | text = get_dataset(name if named_corpora else path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent, 91 | delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose) 92 | make_lazy(path_, text.X, data_type='data') 93 | # This should be a barrier but nccl barrier assumes 94 | # device_index=rank which is not the case for model 95 | # parallel case 96 | counts = torch.cuda.LongTensor([1]) 97 | torch.distributed.all_reduce(counts, group=parallel_group) 98 | assert counts[0].item() == torch.distributed.get_world_size( 99 | group=parallel_group) 100 | 101 | text = lazy_array_loader(path_, data_type='data', map_fn=process_fn) 102 | else: 103 | # get dataset 104 | text = get_dataset(path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent, 105 | delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose, preprocess_fn=process_fn) 106 | return text 107 | # get one or multiple datasets and concatenate 108 | if isinstance(path, str): 109 | path = [path] 110 | datasets = [get_dataset_from_path(p) for p in path] 111 | if len(datasets) == 1: 112 | ds = datasets[0] 113 | else: 114 | ds = ConcatDataset(datasets) 115 | # make tokenizer for dataset 116 | if tokenizer is None: 117 | tokenizer = make_tokenizer(tokenizer_type, ds, tokenizer_model_path, vocab_size, model_type, 118 | pad_token, character_converage, **kwargs) 119 | 120 | ds_type = '' 121 | if 
'ds_type' in kwargs: 122 | ds_type = kwargs['ds_type'] 123 | ds.SetTokenizer(tokenizer) 124 | # Split dataset into train/val/test (and wrap bert dataset) 125 | if should_split(split): 126 | ds = split_ds(ds, split) 127 | if 'bert' in ds_type.lower(): 128 | presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False 129 | dstype = bert_sentencepair_dataset 130 | ds = [dstype(d, max_seq_len=seq_length, presplit_sentences=presplit_sentences) 131 | if d is not None else None for d in ds] 132 | elif ds_type.lower() == 'gpt2': 133 | ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds] 134 | else: 135 | if 'bert' in ds_type.lower(): 136 | presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False 137 | dstype = bert_sentencepair_dataset 138 | ds = dstype(ds, max_seq_len=seq_length, presplit_sentences=presplit_sentences) 139 | elif ds_type.lower() == 'gpt2': 140 | ds = GPT2Dataset(ds, max_seq_len=seq_length) 141 | return ds, tokenizer 142 | -------------------------------------------------------------------------------- /megatron/data/samplers.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Batch samplers that work with either random or sequential data samplers.""" 17 | 18 | import torch 19 | from torch.utils import data 20 | 21 | 22 | class RandomSampler(data.sampler.Sampler): 23 | """Based off of pytorch RandomSampler and DistributedSampler. Essentially 24 | a RandomSampler, but this class lets the user set an epoch like 25 | DistributedSampler Samples elements randomly. If without replacement, then 26 | sample from a shuffled dataset. If with replacement, then user can 27 | specify ``num_samples`` to draw. 
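    Calling ``set_epoch`` before each epoch seeds the shuffle generator with the
    epoch number, making the sample order reproducible across restarts; with the
    default epoch of -1 the generator is left unseeded.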
28 | Arguments: 29 | data_source (Dataset): dataset to sample from 30 | num_samples (int): number of samples to draw, default=len(dataset) 31 | replacement (bool): samples are drawn with replacement if ``True``, 32 | default=False 33 | """ 34 | 35 | def __init__(self, data_source, replacement=False, num_samples=None): 36 | self.data_source = data_source 37 | self.replacement = replacement 38 | self._num_samples = num_samples 39 | self.epoch = -1 40 | 41 | if self._num_samples is not None and replacement is False: 42 | raise ValueError("With replacement=False, num_samples should not " 43 | "be specified, since a random permute will be " 44 | "performed.") 45 | 46 | if not isinstance(self.num_samples, int) or self.num_samples <= 0: 47 | raise ValueError("num_samples should be a positive integer " 48 | "value, but got num_samples={}".format( 49 | self.num_samples)) 50 | if not isinstance(self.replacement, bool): 51 | raise ValueError("replacement should be a boolean value, but got " 52 | "replacement={}".format(self.replacement)) 53 | 54 | @property 55 | def num_samples(self): 56 | # dataset size might change at runtime 57 | if self._num_samples is None: 58 | return len(self.data_source) 59 | return self._num_samples 60 | 61 | def __iter__(self): 62 | n = len(self.data_source) 63 | g = torch.Generator() 64 | if self.epoch >= 0: 65 | g.manual_seed(self.epoch) 66 | if self.replacement: 67 | return iter(torch.randint(high=n, size=(self.num_samples,), 68 | dtype=torch.int64, generator=g).tolist()) 69 | return iter(torch.randperm(n, generator=g).tolist()) 70 | 71 | def __len__(self): 72 | return self.num_samples 73 | 74 | def set_epoch(self, epoch): 75 | self.epoch = epoch 76 | 77 | 78 | class DistributedBatchSampler(data.sampler.BatchSampler): 79 | """Similar to normal implementation of distributed sampler, except 80 | implementation is at the batch sampler level, instead of just the 81 | sampler level. This allows wrapping of arbitrary data samplers 82 | (sequential, random, WeightedRandomSampler, etc.) with this batch 83 | sampler. 84 | 85 | The `interleave` argument specifies how to distribute a batch. A value 86 | of True combined with the above random sampler is equivalent to pytorch's 87 | torch.utils.data.distributed.DistributedSampler. 
88 | 89 | For the following batch [0,1,2,3,4,5,6,7] and data parallelism of 2 90 | specifying True will result in the following samples for each gpu: 91 | GPU0: [0,2,4,6] GPU1: [1,3,5,7] 92 | specifying False will result in the following samples: 93 | GPU0: [0,1,2,3] GPU1: [4,5,6,7]""" 94 | 95 | def __init__(self, sampler, batch_size, drop_last, rank=-1, 96 | world_size=2, wrap_last=False, interleave=False): 97 | super(DistributedBatchSampler, self).__init__(sampler, batch_size, 98 | drop_last) 99 | if rank == -1: 100 | assert False, 'should not be here' 101 | rank = torch.distributed.get_rank() 102 | self.rank = rank 103 | self.world_size = world_size 104 | self.sampler.wrap_around = 0 105 | self.wrap_around = 0 106 | self.wrap_last = wrap_last 107 | self.start_iter = 0 108 | self.interleave = interleave 109 | 110 | def __iter__(self): 111 | batch = [] 112 | i = 0 113 | for idx in self.data_iterator(self.sampler, wrap_around=False): 114 | batch.append(idx) 115 | if len(batch) == self.batch_size: 116 | tbatch = self._batch(batch) 117 | if i >= self.start_iter: 118 | yield tbatch 119 | self.start_iter = 0 120 | i += 1 121 | batch = [] 122 | batch_len = len(batch) 123 | if batch_len > 0 and not self.drop_last: 124 | if self.wrap_last: 125 | self.sampler.wrap_around -= (self.batch_size) 126 | self.wrap_around += (len(batch)) 127 | self.wrap_around %= self.batch_size 128 | yield self._batch(batch) 129 | if self.wrap_last: 130 | self.sampler.wrap_around += self.batch_size 131 | 132 | def data_iterator(self, _iter, wrap_around=False): 133 | """iterates through data and handles wrap around""" 134 | for i, idx in enumerate(_iter): 135 | if i < self.wrap_around % self.batch_size: 136 | continue 137 | if wrap_around: 138 | self.wrap_around += 1 139 | self.wrap_around %= self.batch_size 140 | yield idx 141 | 142 | def _batch(self, batch): 143 | """extracts samples only pertaining to this worker's batch""" 144 | if self.interleave: 145 | return batch[self.rank:self.batch_size:self.world_size] 146 | start = self.rank * self.batch_size // self.world_size 147 | end = (self.rank + 1) * self.batch_size // self.world_size 148 | return batch[start:end] 149 | -------------------------------------------------------------------------------- /pretrain_t5.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
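#
# Structure note (sketch): like pretrain_gpt2.py, this script only wires three
# callbacks into megatron.training.pretrain(): a model provider, a dataset provider
# and a forward step; the shared training loop drives everything else. forward_step
# below averages the per-token losses over unmasked positions only. For example,
# with losses = [2.0, 1.0, 3.0] and loss_mask = [1, 1, 0] the reported lm loss is
# (2.0 + 1.0) / 2 = 1.5, so padded or ignored positions never dilute the average.
#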
15 | 16 | """Pretrain T5""" 17 | 18 | import torch 19 | 20 | from megatron import get_args 21 | from megatron import print_rank_0 22 | from megatron import get_timers 23 | from megatron import get_tokenizer 24 | from megatron import mpu 25 | from megatron.data.T5_dataset import build_train_valid_test_datasets 26 | from megatron.model import T5ModelPipe, T5Model 27 | from megatron.training import pretrain 28 | from megatron.utils import get_masks_and_position_ids_for_t5 29 | from megatron.utils import reduce_losses 30 | from megatron.fp16 import fp32_to_fp16 31 | 32 | def model_provider(): 33 | """Build the model.""" 34 | args = get_args() 35 | print_rank_0('building T5 model ...') 36 | if args.pipe_parallel_size == 0 or args.pipe_parallel_size == 1: 37 | model = T5Model(num_tokentypes=0, parallel_output=True) 38 | else: 39 | model = T5ModelPipe(num_tokentypes=0, parallel_output=True, topology=mpu.get_topology()) 40 | model._megatron_batch_fn = get_batch_pipe 41 | model._input_grad = [True, False, True, False, False] 42 | model._input_type = ['float', 'int', 'float', 'int', 'int'] 43 | model._input_pipe_partitioned = [True, False, True, False, False] 44 | return model 45 | 46 | def get_batch(data_iterator): 47 | args = get_args() 48 | tokenizer = get_tokenizer() 49 | 50 | # Items and their type. 51 | keys = [ 52 | "contexts", 53 | "targets", 54 | "labels", 55 | "ctx_eod_mask", 56 | ] 57 | datatype = torch.int64 58 | 59 | if data_iterator is not None: 60 | data = next(data_iterator) 61 | else: 62 | data = None 63 | 64 | # Broadcast data. 65 | data_b = mpu.broadcast_data(keys, data, datatype) 66 | 67 | # Unpack. 68 | contexts = data_b['contexts'].long() 69 | targets = data_b['targets'].long() 70 | labels = data_b['labels'].long() 71 | ctx_eod_mask = data_b['ctx_eod_mask'].long() 72 | 73 | # Unpack. 74 | enc_token_ids = contexts 75 | dec_token_ids = targets 76 | 77 | # Get the masks and postition ids. 78 | enc_attn_mask, enc_pos_ids, dec_attn_mask, dec_pos_ids, cross_attn_mask, loss_mask = get_masks_and_position_ids_for_t5( 79 | args, 80 | tokenizer, 81 | contexts, 82 | targets, 83 | labels, 84 | ctx_eod_mask, 85 | args.reset_position_ids, 86 | args.reset_attention_mask) 87 | 88 | if args.fp16: 89 | # cast to fp16 because pipeline parallelism skips the FP16 wrapper. 90 | return fp32_to_fp16((enc_token_ids, enc_pos_ids, enc_attn_mask, 91 | dec_token_ids, dec_pos_ids, dec_attn_mask, 92 | cross_attn_mask)), fp32_to_fp16((labels, loss_mask)) 93 | else: 94 | return (enc_token_ids, enc_pos_ids, enc_attn_mask, 95 | dec_token_ids, dec_pos_ids, dec_attn_mask, 96 | cross_attn_mask), (labels, loss_mask) 97 | 98 | def get_batch_pipe(data): 99 | args = get_args() 100 | tokenizer = get_tokenizer() 101 | 102 | # Items and their type. 103 | keys = [ 104 | "contexts", 105 | "targets", 106 | "labels", 107 | "ctx_eod_mask", 108 | ] 109 | datatype = torch.int64 110 | 111 | # Broadcast data. 112 | data_b = mpu.broadcast_data(keys, data, datatype) 113 | 114 | # Unpack. 115 | contexts = data_b['contexts'].long() 116 | targets = data_b['targets'].long() 117 | labels = data_b['labels'].long() 118 | ctx_eod_mask = data_b['ctx_eod_mask'].long() 119 | 120 | # Unpack. 121 | enc_token_ids = contexts 122 | dec_token_ids = targets 123 | 124 | # Get the masks and postition ids. 
125 | enc_attn_mask, enc_pos_ids, dec_attn_mask, dec_pos_ids, cross_attn_mask, loss_mask = get_masks_and_position_ids_for_t5( 126 | args, 127 | tokenizer, 128 | contexts, 129 | targets, 130 | labels, 131 | ctx_eod_mask, 132 | args.reset_position_ids, 133 | args.reset_attention_mask) 134 | 135 | if args.fp16: 136 | # cast to fp16 because pipeline parallelism skips the FP16 wrapper. 137 | return fp32_to_fp16((enc_token_ids, enc_pos_ids, enc_attn_mask, 138 | dec_token_ids, dec_pos_ids, dec_attn_mask, 139 | cross_attn_mask)), fp32_to_fp16((labels, loss_mask)) 140 | else: 141 | return (enc_token_ids, enc_pos_ids, enc_attn_mask, 142 | dec_token_ids, dec_pos_ids, dec_attn_mask, 143 | cross_attn_mask), (labels, loss_mask) 144 | 145 | def forward_step(data_iterator, model): 146 | """Forward step.""" 147 | args = get_args() 148 | timers = get_timers() 149 | 150 | # Get the batch. 151 | timers('batch generator').start() 152 | 153 | (enc_token_ids, enc_pos_ids, enc_attn_mask, 154 | dec_token_ids, dec_pos_ids, dec_attn_mask, 155 | cross_attn_mask), (labels, loss_mask) = get_batch(data_iterator) 156 | 157 | timers('batch generator').stop() 158 | 159 | # Forward model. 160 | losses = model(enc_token_ids, enc_pos_ids, enc_attn_mask, 161 | dec_token_ids, dec_pos_ids, dec_attn_mask, cross_attn_mask, 162 | labels=labels) 163 | 164 | loss_mask = loss_mask.view(-1) 165 | loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() 166 | 167 | # Reduce loss for logging. 168 | reduced_loss = reduce_losses([loss]) 169 | 170 | return loss, {'lm loss': reduced_loss[0]} 171 | 172 | 173 | 174 | def train_valid_test_datasets_provider(train_val_test_num_samples): 175 | """Build train, valid, and test datasets.""" 176 | args = get_args() 177 | tokenizer = get_tokenizer() 178 | 179 | print_rank_0('> building train, validation, and test datasets ' 180 | 'for Enc-Dec ...') 181 | train_ds, valid_ds, test_ds = build_train_valid_test_datasets( 182 | tokenizer=tokenizer, 183 | data_prefix=args.data_path, 184 | data_impl=args.data_impl, 185 | splits_string=args.split, 186 | train_valid_test_num_samples=train_val_test_num_samples, 187 | enc_seq_length=args.enc_seq_length, 188 | dec_seq_length=args.dec_seq_length, 189 | seed=args.seed, 190 | skip_warmup=(not args.mmap_warmup)) 191 | print_rank_0("> finished creating Enc-Dec datasets ...") 192 | 193 | return train_ds, valid_ds, test_ds 194 | 195 | 196 | if __name__ == "__main__": 197 | 198 | pretrain(train_valid_test_datasets_provider, model_provider, forward_step, 199 | args_defaults={'tokenizer_type': 'T5Tokenizer'}) 200 | -------------------------------------------------------------------------------- /megatron/tokenizer/t5_tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
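#
# Usage sketch (illustrative; the vocab path is a placeholder):
#
#     tokenizer = T5Tokenizer('vocab.json')
#     ids = tokenizer.encode('今天天气不错')   # jieba word split, then wordpiece lookup
#     text = tokenizer.decode(ids)
#
# Spaces and newlines are rewritten to the placeholder characters \u2582 and \u2583
# before vocabulary lookup, and decode() maps them back, so whitespace survives the
# round trip.
#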
15 | 16 | """Tokenization classes for OpenAI T5.""" 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import json 22 | from io import open 23 | import jieba 24 | import collections 25 | import six 26 | 27 | 28 | try: 29 | from functools import lru_cache 30 | except ImportError: 31 | # Just a dummy decorator to get the checks to run on python2 32 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 33 | def lru_cache(): 34 | return lambda func: func 35 | 36 | 37 | def convert_to_unicode(text): 38 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 39 | if six.PY3: 40 | if isinstance(text, str): 41 | return text 42 | elif isinstance(text, bytes): 43 | return text.decode("utf-8", "ignore") 44 | else: 45 | raise ValueError("Unsupported string type: %s" % (type(text))) 46 | elif six.PY2: 47 | if isinstance(text, str): 48 | return text.decode("utf-8", "ignore") 49 | elif isinstance(text, unicode): 50 | return text 51 | else: 52 | raise ValueError("Unsupported string type: %s" % (type(text))) 53 | else: 54 | raise ValueError("Not running on Python2 or Python 3?") 55 | 56 | 57 | def load_vocab(vocab_file): 58 | """Loads a vocabulary file into a dictionary.""" 59 | if vocab_file.find(".json")!=-1: 60 | with open(vocab_file, "r") as reader: 61 | vocab = json.loads(reader.read()) 62 | else: 63 | vocab = collections.OrderedDict() 64 | index = 0 65 | with open(vocab_file, "r") as reader: 66 | while True: 67 | token = convert_to_unicode(reader.readline()) 68 | if not token: 69 | break 70 | token = token.strip() 71 | vocab[token] = index 72 | index += 1 73 | return vocab 74 | 75 | 76 | class WordpieceTokenizer(object): 77 | 78 | def __init__(self, vocab, unk_token="", max_input_chars_per_word=200): 79 | self.vocab = vocab 80 | self.unk_token = unk_token 81 | self.max_input_chars_per_word = max_input_chars_per_word 82 | 83 | def tokenize(self, token): 84 | 85 | token = convert_to_unicode(token) 86 | 87 | chars = list(token) 88 | if len(chars) > self.max_input_chars_per_word: 89 | return [self.unk_token] 90 | 91 | start = 0 92 | sub_tokens = [] 93 | while start < len(chars): 94 | end = len(chars) 95 | cur_substr = None 96 | while start < end: 97 | substr = "".join(chars[start:end]) 98 | if substr in self.vocab: 99 | cur_substr = substr 100 | break 101 | end -= 1 102 | if cur_substr is None: 103 | sub_tokens.append(self.unk_token) 104 | start += 1 105 | continue 106 | sub_tokens.append(cur_substr) 107 | start = end 108 | 109 | return sub_tokens 110 | 111 | 112 | class T5Tokenizer(object): 113 | 114 | def __init__(self, vocab_file, max_len=None, max_sentinels=190): 115 | self.max_len = max_len if max_len is not None else int(1e12) 116 | self.encoder = load_vocab(vocab_file) 117 | self.decoder = {v:k for k,v in self.encoder.items()} 118 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder) 119 | 120 | self.translator = str.maketrans(" \n", "\u2582\u2583") 121 | 122 | self.sentinel_list = [self.encoder[''.format(i)] for i in range(max_sentinels)] 123 | 124 | @property 125 | def vocab_size(self): 126 | return len(self.encoder) 127 | 128 | def __len__(self): 129 | return len(self.encoder) 130 | 131 | @property 132 | def eod_id(self): 133 | return self.encoder[self.eod_token] 134 | 135 | @property 136 | def pad_id(self): 137 | return self.encoder[self.pad_token] 138 | 139 | @property 140 | def eod_token(self): 141 | return '' 142 | 143 | @property 144 | def 
pad_token(self): 145 | return '' 146 | 147 | def get_sentinel_num(self): 148 | return len(self.sentinel_list) 149 | 150 | def get_sentinel_id(self, idx): 151 | return self.sentinel_list[idx] 152 | 153 | def tokenize(self, text): 154 | """ Tokenize a string. """ 155 | output_tokens = [] 156 | for x in jieba.cut(text, cut_all=False): 157 | x = x.translate(self.translator) 158 | output_tokens.extend(self.wordpiece_tokenizer.tokenize(x)) 159 | return output_tokens 160 | 161 | def encode(self, text): 162 | res = [self.encoder[x] for x in self.tokenize(text)] 163 | return res 164 | 165 | def decode(self, tokens): 166 | text = ''.join([self.decoder[x] for x in tokens]) 167 | text = text.replace('\u2582', ' ').replace('\u2583', '\n') 168 | return text 169 | 170 | class GPT2TokenizerwoMerge(object): 171 | 172 | def __init__(self, vocab_file, max_len=None, max_sentinels=190): 173 | self.max_len = max_len if max_len is not None else int(1e12) 174 | self.encoder = load_vocab(vocab_file) 175 | self.decoder = {v:k for k,v in self.encoder.items()} 176 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder) 177 | self.translator = str.maketrans(" \n", "\u2582\u2583") 178 | 179 | @property 180 | def vocab_size(self): 181 | return len(self.encoder) 182 | 183 | def __len__(self): 184 | return len(self.encoder) 185 | 186 | @property 187 | def eod_id(self): 188 | return self.encoder[self.eod_token] 189 | 190 | @property 191 | def pad_id(self): 192 | return self.encoder[self.pad_token] 193 | 194 | @property 195 | def eod_token(self): 196 | return '' 197 | 198 | @property 199 | def pad_token(self): 200 | return '' 201 | 202 | def tokenize(self, text): 203 | """ Tokenize a string. """ 204 | output_tokens = [] 205 | for x in jieba.cut(text, cut_all=False): 206 | x = x.translate(self.translator) 207 | output_tokens.extend(self.wordpiece_tokenizer.tokenize(x)) 208 | return output_tokens 209 | 210 | def encode(self, text): 211 | res = [self.encoder[x] for x in self.tokenize(text)] 212 | return res 213 | 214 | def decode(self, tokens): 215 | text = ''.join([self.decoder[x] for x in tokens]) 216 | text = text.replace('\u2582', ' ').replace('\u2583', '\n') 217 | return text 218 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_random.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import print_separator 17 | from commons import initialize_distributed 18 | import mpu 19 | import torch 20 | import sys 21 | sys.path.append("../..") 22 | 23 | 24 | def test_set_cuda_rng_state(model_parallel_size): 25 | 26 | if torch.distributed.get_rank() == 0: 27 | print('> testing set_rng_state with size {} ...'. 
28 | format(model_parallel_size)) 29 | 30 | mpu.initialize_model_parallel(model_parallel_size) 31 | model_parallel_size = mpu.get_model_parallel_world_size() 32 | 33 | size = 123 34 | seed = 1234 35 | torch.cuda.manual_seed(1234) 36 | tensor = torch.cuda.FloatTensor(size) 37 | 38 | # Get the state 39 | rng_state = torch.cuda.get_rng_state() 40 | rng_state_copy = rng_state.clone() 41 | 42 | # Do some stuff. 43 | for _ in range(5): 44 | torch.randn(size, out=tensor) 45 | result_1 = tensor.clone() 46 | 47 | assert rng_state.sub(rng_state_copy).max() == 0 48 | assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0 49 | 50 | # State should be different. 51 | new_rng_state = torch.cuda.get_rng_state() 52 | max_diff = new_rng_state.sub(rng_state).max() 53 | print(' max diff in rng state (should be non-zero) on global rank {}: {}'. 54 | format(torch.distributed.get_rank(), max_diff)) 55 | assert max_diff > 0 56 | 57 | # Reset the rng state and do the same stuff. 58 | mpu.random._set_cuda_rng_state(rng_state) 59 | for _ in range(5): 60 | torch.randn(size, out=tensor) 61 | mpu.random._set_cuda_rng_state(rng_state) 62 | for _ in range(5): 63 | torch.randn(size, out=tensor) 64 | result_2 = tensor.clone() 65 | 66 | # Results should be the same 67 | error = result_2.sub(result_1).abs().max() 68 | print(' max error in generated tensors (should be zero) on ' 69 | 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) 70 | assert error < 1.0e-6 71 | 72 | # Input state should have remained intact. 73 | error = rng_state.sub(rng_state_copy).max() 74 | print(' max error in rng state (should be zero) on global rank {}: {}'. 75 | format(torch.distributed.get_rank(), error)) 76 | assert error == 0 77 | 78 | # Reset groups 79 | mpu.destroy_model_parallel() 80 | 81 | torch.distributed.barrier() 82 | if torch.distributed.get_rank() == 0: 83 | print('>> passed the test :-)') 84 | 85 | 86 | def test_cuda_rng_tracker(model_parallel_size): 87 | 88 | if torch.distributed.get_rank() == 0: 89 | print('> testing cuda rng tracker with size {} ...'. 90 | format(model_parallel_size)) 91 | 92 | mpu.initialize_model_parallel(model_parallel_size) 93 | model_parallel_size = mpu.get_model_parallel_world_size() 94 | 95 | seed_1 = 1234 96 | seed_2 = 4321 97 | size = [12, 21] 98 | tensor = torch.cuda.FloatTensor(size) 99 | 100 | # Set to seed_1 and generate two tensors. 101 | torch.cuda.manual_seed(seed_1) 102 | torch.randn(size, out=tensor) 103 | target_11 = tensor.clone() 104 | torch.randn(size, out=tensor) 105 | target_12 = tensor.clone() 106 | 107 | # Set to seed_2 and generate two tensors. 
108 | torch.cuda.manual_seed(seed_2) 109 | torch.randn(size, out=tensor) 110 | target_21 = tensor.clone() 111 | torch.randn(size, out=tensor) 112 | target_22 = tensor.clone() 113 | 114 | # Now if we interleave seed_1 and seed_2, 115 | # we should still get the same tensors 116 | torch.cuda.manual_seed(seed_1) 117 | mpu.get_cuda_rng_tracker().add('test', seed_2) 118 | 119 | torch.randn(size, out=tensor) 120 | result_11 = tensor.clone() 121 | 122 | with mpu.get_cuda_rng_tracker().fork('test'): 123 | torch.randn(size, out=tensor) 124 | result_21 = tensor.clone() 125 | 126 | torch.randn(size, out=tensor) 127 | result_12 = tensor.clone() 128 | 129 | with mpu.get_cuda_rng_tracker().fork('test'): 130 | torch.randn(size, out=tensor) 131 | result_22 = tensor.clone() 132 | 133 | diff = result_11.sub(result_21).abs().max() 134 | diff = min(diff, result_12.sub(result_22).abs().max()) 135 | print(' max diff in generated tensors (should be non-zero) on ' 136 | 'global rank {}: {}'.format(torch.distributed.get_rank(), diff)) 137 | assert diff > 1.0e-6 138 | error = max(result_11.sub(target_11).abs().max(), 139 | result_12.sub(target_12).abs().max()) 140 | error = max(error, result_21.sub(target_21).abs().max()) 141 | error = max(error, result_22.sub(target_22).abs().max()) 142 | print(' max error in generated tensors (should be zero) on ' 143 | 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) 144 | assert error < 1.0e-6 145 | 146 | # Reset the tracker 147 | mpu.get_cuda_rng_tracker().reset() 148 | 149 | # Reset groups 150 | mpu.destroy_model_parallel() 151 | 152 | torch.distributed.barrier() 153 | if torch.distributed.get_rank() == 0: 154 | print('>> passed the test :-)') 155 | 156 | 157 | def test_model_parallel_cuda_manual_seed(model_parallel_size): 158 | 159 | if torch.distributed.get_rank() == 0: 160 | print('> testing model parallel cuda manual seed with size {} ...'. 
161 | format(model_parallel_size)) 162 | 163 | mpu.initialize_model_parallel(model_parallel_size) 164 | model_parallel_size = mpu.get_model_parallel_world_size() 165 | 166 | mpu.model_parallel_cuda_manual_seed(12345) 167 | assert torch.cuda.initial_seed() == 12345 168 | with mpu.get_cuda_rng_tracker().fork(): 169 | assert torch.cuda.initial_seed() == (12345 + 2718 + 170 | mpu.get_model_parallel_rank()) 171 | 172 | # Reset the tracker 173 | mpu.get_cuda_rng_tracker().reset() 174 | 175 | # Reset groups 176 | mpu.destroy_model_parallel() 177 | 178 | torch.distributed.barrier() 179 | if torch.distributed.get_rank() == 0: 180 | print('>> passed the test :-)') 181 | 182 | 183 | if __name__ == '__main__': 184 | 185 | initialize_distributed() 186 | world_size = torch.distributed.get_world_size() 187 | 188 | model_parallel_size = 1 189 | while model_parallel_size <= world_size: 190 | print_separator('test set rng state') 191 | test_set_cuda_rng_state(model_parallel_size) 192 | model_parallel_size *= 2 193 | 194 | model_parallel_size = 1 195 | while model_parallel_size <= world_size: 196 | print_separator('test cuda rng tracker') 197 | test_cuda_rng_tracker(model_parallel_size) 198 | model_parallel_size *= 2 199 | 200 | model_parallel_size = 1 201 | while model_parallel_size <= world_size: 202 | print_separator('test model parallel cuda manual seed') 203 | test_model_parallel_cuda_manual_seed(model_parallel_size) 204 | model_parallel_size *= 2 205 | -------------------------------------------------------------------------------- /megatron/deprecated_data_utils/lazy_loader.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """utils for loading text from disk""" 16 | import os 17 | import mmap 18 | import pickle as pkl 19 | import time 20 | from itertools import accumulate 21 | 22 | import torch 23 | from torch.multiprocessing import Lock 24 | 25 | 26 | def get_lazy_path(path): 27 | """ 28 | Gets directory path where lazy files are stored. 29 | """ 30 | return os.path.splitext(path)[0] + '.lazy' 31 | 32 | 33 | def exists_lazy(path, data_type='data'): 34 | """ 35 | Check if we've already made a lazy version of this file for the `data_type` field. 36 | """ 37 | if not os.path.exists(get_lazy_path(path)): 38 | return False 39 | contents = os.listdir(get_lazy_path(path)) 40 | if data_type not in contents: 41 | return False 42 | if data_type + '.len.pkl' not in contents: 43 | return False 44 | return True 45 | 46 | 47 | def make_lazy(path, strs, data_type='data'): 48 | """ 49 | Make lazy version of `data_type` field of the file. Byte offsets 50 | corresponding to data indices are stored in a `.len.pkl` data file. 
51 | """ 52 | lazypath = get_lazy_path(path) 53 | if not os.path.exists(lazypath): 54 | os.makedirs(lazypath) 55 | datapath = os.path.join(lazypath, data_type) 56 | lenpath = os.path.join(lazypath, data_type + '.len.pkl') 57 | if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: 58 | with open(datapath, 'wb') as f: 59 | str_lens = [] 60 | str_cnt = 0 61 | for s in strs: 62 | if isinstance(s, dict): 63 | s = s['text'] 64 | encoded = s.encode('utf-8') 65 | f.write(encoded) 66 | str_cnt = len(encoded) 67 | str_lens.append(str_cnt) 68 | pkl.dump(str_lens, open(lenpath, 'wb')) 69 | else: 70 | while not os.path.exists(lenpath): 71 | time.sleep(1) 72 | 73 | 74 | def split_strings(strings, start, chr_lens): 75 | """ 76 | Split strings based on string lengths and given start. 77 | """ 78 | return [strings[i - start:j - start] for i, j in zip([start] + chr_lens[:-1], chr_lens)] 79 | 80 | 81 | class ProcessorTokenizer: 82 | """ 83 | callable class that runs a preprocessing, as well as tokenization step, 84 | on input text. 85 | """ 86 | 87 | def __init__(self, tokenizer, process_fn=None): 88 | self.tokenizer = tokenizer 89 | self.process_fn = process_fn 90 | 91 | def __call__(self, string): 92 | if self.tokenizer is not None: 93 | string = self.tokenizer(string, process_fn=self.process_fn) 94 | elif self.process_fn is not None: 95 | string = self.process_fn(string) 96 | return string 97 | 98 | 99 | class lazy_array_loader(object): 100 | """ 101 | Arguments: 102 | path: path to directory where array entries are concatenated into one big string file 103 | and the .len file are located 104 | data_type (str): Some datsets have multiple fields that are stored in different paths. 105 | `data_type` specifies which of these fields to load in this class 106 | mem_map (boolean): Specifies whether to memory map file `path` 107 | map_fn (callable): Fetched strings are passed through map_fn before being returned. 108 | 109 | Example of lazy loader directory structure: 110 | file.json 111 | file.lazy/ 112 | data_type1 113 | data_type1.len.pkl 114 | data_type2 115 | data_type2.len.pkl 116 | """ 117 | 118 | def __init__(self, path, data_type='data', mem_map=False, map_fn=None): 119 | lazypath = get_lazy_path(path) 120 | datapath = os.path.join(lazypath, data_type) 121 | # get file where array entries are concatenated into one big string 122 | self._file = open(datapath, 'rb', buffering=0) 123 | self.file = self._file 124 | # memory map file if necessary 125 | self.mem_map = mem_map 126 | if self.mem_map: 127 | self.file = mmap.mmap(self.file.fileno(), 0, prot=mmap.PROT_READ) 128 | lenpath = os.path.join(lazypath, data_type + '.len.pkl') 129 | self.lens = pkl.load(open(lenpath, 'rb')) 130 | self.ends = list(accumulate(self.lens)) 131 | self.dumb_ends = list(self.ends) 132 | self.read_lock = Lock() 133 | self.process_fn = map_fn 134 | self.map_fn = map_fn 135 | self._tokenizer = None 136 | 137 | def SetTokenizer(self, tokenizer): 138 | """ 139 | logic to set and remove (set to None) tokenizer. 140 | combines preprocessing/tokenization into one callable. 
141 | """ 142 | if tokenizer is None: 143 | if not hasattr(self, '_tokenizer'): 144 | self._tokenizer = tokenizer 145 | else: 146 | self._tokenizer = tokenizer 147 | self.map_fn = ProcessorTokenizer(tokenizer, self.process_fn) 148 | 149 | def GetTokenizer(self): 150 | return self._tokenizer 151 | 152 | def __getitem__(self, index): 153 | """ 154 | read file and splice strings based on string ending array `self.ends` 155 | """ 156 | if not isinstance(index, slice): 157 | if index == 0: 158 | start = 0 159 | else: 160 | start = self.ends[index - 1] 161 | end = self.ends[index] 162 | rtn = self.file_read(start, end) 163 | if self.map_fn is not None: 164 | return self.map_fn(rtn) 165 | else: 166 | # if slice, fetch strings with 1 diskread and then splice in memory 167 | chr_lens = self.ends[index] 168 | if index.start == 0 or index.start is None: 169 | start = 0 170 | else: 171 | start = self.ends[index.start - 1] 172 | stop = chr_lens[-1] 173 | strings = self.file_read(start, stop) 174 | rtn = split_strings(strings, start, chr_lens) 175 | if self.map_fn is not None: 176 | return self.map_fn([s for s in rtn]) 177 | return rtn 178 | 179 | def __len__(self): 180 | return len(self.ends) 181 | 182 | def file_read(self, start=0, end=None): 183 | """read specified portion of file""" 184 | 185 | # atomic reads to avoid race conditions with multiprocess dataloader 186 | self.read_lock.acquire() 187 | # seek to start of file read 188 | self.file.seek(start) 189 | # read to end of file if no end point provided 190 | if end is None: 191 | rtn = self.file.read() 192 | # else read amount needed to reach end point 193 | else: 194 | rtn = self.file.read(end - start) 195 | self.read_lock.release() 196 | # TODO: @raulp figure out mem map byte string bug 197 | # if mem map'd need to decode byte string to string 198 | rtn = rtn.decode('utf-8', 'ignore') 199 | # rtn = str(rtn) 200 | if self.mem_map: 201 | rtn = rtn.decode('unicode_escape') 202 | return rtn 203 | -------------------------------------------------------------------------------- /megatron/data/realm_dataset_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import numpy as np 5 | import torch 6 | 7 | from megatron import mpu, print_rank_0 8 | from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy 9 | from megatron.data.samplers import DistributedBatchSampler 10 | from megatron import get_args, get_tokenizer, print_rank_0, mpu 11 | 12 | 13 | def get_one_epoch_dataloader(dataset, batch_size=None): 14 | """Specifically one epoch to be used in an indexing job.""" 15 | args = get_args() 16 | 17 | world_size = mpu.get_data_parallel_world_size() 18 | rank = mpu.get_data_parallel_rank() 19 | if batch_size is None: 20 | batch_size = args.batch_size 21 | global_batch_size = batch_size * world_size 22 | num_workers = args.num_workers 23 | 24 | sampler = torch.utils.data.SequentialSampler(dataset) 25 | # importantly, drop_last must be False to get all the data. 26 | batch_sampler = DistributedBatchSampler(sampler, 27 | batch_size=global_batch_size, 28 | drop_last=False, 29 | rank=rank, 30 | world_size=world_size) 31 | 32 | return torch.utils.data.DataLoader(dataset, 33 | batch_sampler=batch_sampler, 34 | num_workers=num_workers, 35 | pin_memory=True) 36 | 37 | 38 | def get_ict_batch(data_iterator): 39 | # Items and their type. 
40 | keys = ['query_tokens', 'query_pad_mask', 41 | 'block_tokens', 'block_pad_mask', 'block_data'] 42 | datatype = torch.int64 43 | 44 | # Broadcast data. 45 | if data_iterator is None: 46 | data = None 47 | else: 48 | data = next(data_iterator) 49 | data_b = mpu.broadcast_data(keys, data, datatype) 50 | 51 | # Unpack. 52 | query_tokens = data_b['query_tokens'].long() 53 | query_pad_mask = data_b['query_pad_mask'].long() 54 | block_tokens = data_b['block_tokens'].long() 55 | block_pad_mask = data_b['block_pad_mask'].long() 56 | block_indices = data_b['block_data'].long() 57 | 58 | return query_tokens, query_pad_mask,\ 59 | block_tokens, block_pad_mask, block_indices 60 | 61 | 62 | def join_str_list(str_list): 63 | """Join a list of strings, handling spaces appropriately""" 64 | result = "" 65 | for s in str_list: 66 | if s.startswith("##"): 67 | result += s[2:] 68 | else: 69 | result += " " + s 70 | return result 71 | 72 | 73 | class BlockSampleData(object): 74 | """A struct for fully describing a fixed-size block of data as used in REALM 75 | 76 | :param start_idx: for first sentence of the block 77 | :param end_idx: for last sentence of the block (may be partially truncated in sample construction) 78 | :param doc_idx: the index of the document from which the block comes in the original indexed dataset 79 | :param block_idx: a unique integer identifier given to every block. 80 | """ 81 | def __init__(self, start_idx, end_idx, doc_idx, block_idx): 82 | self.start_idx = start_idx 83 | self.end_idx = end_idx 84 | self.doc_idx = doc_idx 85 | self.block_idx = block_idx 86 | 87 | def as_array(self): 88 | return np.array([self.start_idx, self.end_idx, self.doc_idx, self.block_idx]).astype(np.int64) 89 | 90 | def as_tuple(self): 91 | return self.start_idx, self.end_idx, self.doc_idx, self.block_idx 92 | 93 | 94 | class BlockSamplesMapping(object): 95 | def __init__(self, mapping_array): 96 | # make sure that the array is compatible with BlockSampleData 97 | assert mapping_array.shape[1] == 4 98 | self.mapping_array = mapping_array 99 | 100 | def __len__(self): 101 | return self.mapping_array.shape[0] 102 | 103 | def __getitem__(self, idx): 104 | """Get the data associated with an indexed sample.""" 105 | sample_data = BlockSampleData(*self.mapping_array[idx]) 106 | return sample_data 107 | 108 | 109 | def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epochs, 110 | max_num_samples, max_seq_length, seed, name, use_one_sent_docs=False): 111 | """Get samples mapping for a dataset over fixed size blocks. This function also requires 112 | a dataset of the titles for the source documents since their lengths must be taken into account. 
113 | 114 | :return: samples_mapping (BlockSamplesMapping) 115 | """ 116 | 117 | if not num_epochs: 118 | if not max_num_samples: 119 | raise ValueError("Need to specify either max_num_samples " 120 | "or num_epochs") 121 | num_epochs = np.iinfo(np.int32).max - 1 122 | if not max_num_samples: 123 | max_num_samples = np.iinfo(np.int64).max - 1 124 | 125 | # Filename of the index mapping 126 | indexmap_filename = data_prefix 127 | indexmap_filename += '_{}_indexmap'.format(name) 128 | if num_epochs != (np.iinfo(np.int32).max - 1): 129 | indexmap_filename += '_{}ep'.format(num_epochs) 130 | if max_num_samples != (np.iinfo(np.int64).max - 1): 131 | indexmap_filename += '_{}mns'.format(max_num_samples) 132 | indexmap_filename += '_{}msl'.format(max_seq_length) 133 | indexmap_filename += '_{}s'.format(seed) 134 | if use_one_sent_docs: 135 | indexmap_filename += '_1sentok' 136 | indexmap_filename += '.npy' 137 | 138 | # Build the indexed mapping if not exist. 139 | if mpu.get_data_parallel_rank() == 0 and \ 140 | not os.path.isfile(indexmap_filename): 141 | print(' > WARNING: could not find index map file {}, building ' 142 | 'the indices on rank 0 ...'.format(indexmap_filename)) 143 | 144 | # Make sure the types match the helpers input types. 145 | assert block_dataset.doc_idx.dtype == np.int64 146 | assert block_dataset.sizes.dtype == np.int32 147 | 148 | # Build samples mapping 149 | verbose = torch.distributed.get_rank() == 0 150 | start_time = time.time() 151 | print_rank_0(' > building samples index mapping for {} ...'.format( 152 | name)) 153 | 154 | # compile/bind the C++ helper code 155 | from megatron.data.dataset_utils import compile_helper 156 | compile_helper() 157 | 158 | from megatron.data import helpers 159 | mapping_array = helpers.build_blocks_mapping( 160 | block_dataset.doc_idx, 161 | block_dataset.sizes, 162 | title_dataset.sizes, 163 | num_epochs, 164 | max_num_samples, 165 | max_seq_length - 3, # account for added tokens 166 | seed, 167 | verbose, 168 | use_one_sent_docs) 169 | 170 | 171 | print_rank_0(' > done building samples index mapping') 172 | np.save(indexmap_filename, mapping_array, allow_pickle=True) 173 | print_rank_0(' > saved the index mapping in {}'.format( 174 | indexmap_filename)) 175 | # Make sure all the ranks have built the mapping 176 | print_rank_0(' > elapsed time to build and save samples mapping ' 177 | '(seconds): {:4f}'.format( 178 | time.time() - start_time)) 179 | 180 | # This should be a barrier but nccl barrier assumes 181 | # device_index=rank which is not the case for model 182 | # parallel case 183 | counts = torch.cuda.LongTensor([1]) 184 | torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) 185 | assert counts[0].item() == torch.distributed.get_world_size( 186 | group=mpu.get_data_parallel_group()) 187 | 188 | # Load indexed dataset. 
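# The mapping saved above is re-opened with np.load(..., mmap_mode='r'), so
# each rank lazily memory-maps the same .npy file rather than holding its own
# in-memory copy of the samples mapping.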
189 | print_rank_0(' > loading indexed mapping from {}'.format( 190 | indexmap_filename)) 191 | start_time = time.time() 192 | 193 | mapping_array = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r') 194 | samples_mapping = BlockSamplesMapping(mapping_array) 195 | 196 | print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( 197 | time.time() - start_time)) 198 | print_rank_0(' total number of samples: {}'.format( 199 | mapping_array.shape[0])) 200 | 201 | return samples_mapping 202 | -------------------------------------------------------------------------------- /megatron/model/bert_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """BERT model.""" 17 | 18 | import torch 19 | 20 | from megatron import get_args 21 | from megatron import mpu 22 | from megatron.model.language_model import parallel_lm_logits 23 | from megatron.model.language_model import get_language_model 24 | from megatron.model.transformer import LayerNorm 25 | from megatron.model.utils import openai_gelu, erf_gelu 26 | from megatron.model.utils import get_linear_layer 27 | from megatron.model.utils import init_method_normal 28 | from megatron.model.utils import scaled_init_method_normal 29 | from megatron.module import MegatronModule 30 | 31 | def bert_attention_mask_func(attention_scores, attention_mask): 32 | attention_scores.masked_fill_(attention_mask, -10000.0) 33 | return attention_scores 34 | 35 | def bert_extended_attention_mask(attention_mask): 36 | # We create a 3D attention mask from a 2D tensor mask. 37 | # [b, 1, s] 38 | attention_mask_b1s = attention_mask.unsqueeze(1) 39 | # [b, s, 1] 40 | attention_mask_bs1 = attention_mask.unsqueeze(2) 41 | # [b, s, s] 42 | attention_mask_bss = attention_mask_b1s * attention_mask_bs1 43 | # [b, 1, s, s] 44 | extended_attention_mask = attention_mask_bss.unsqueeze(1) 45 | 46 | # Convert attention mask to binary: 47 | extended_attention_mask = (extended_attention_mask < 0.5) 48 | 49 | return extended_attention_mask 50 | 51 | def bert_position_ids(token_ids): 52 | # Create position ids 53 | seq_length = token_ids.size(1) 54 | position_ids = torch.arange(seq_length, dtype=torch.long, 55 | device=token_ids.device) 56 | position_ids = position_ids.unsqueeze(0).expand_as(token_ids) 57 | 58 | return position_ids 59 | 60 | 61 | class BertLMHead(MegatronModule): 62 | """Masked LM head for Bert 63 | 64 | Arguments: 65 | mpu_vocab_size: model parallel size of vocabulary. 66 | hidden_size: hidden size 67 | init_method: init method for weight initialization 68 | layernorm_epsilon: tolerance for layer norm divisions 69 | parallel_output: whether output logits being distributed or not. 
70 | """ 71 | 72 | def __init__(self, mpu_vocab_size, hidden_size, init_method, 73 | layernorm_epsilon, parallel_output): 74 | 75 | super(BertLMHead, self).__init__() 76 | 77 | args = get_args() 78 | 79 | self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size)) 80 | self.bias.model_parallel = True 81 | self.bias.partition_dim = 0 82 | self.bias.stride = 1 83 | self.parallel_output = parallel_output 84 | 85 | self.dense = get_linear_layer(hidden_size, hidden_size, init_method) 86 | self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) 87 | self.gelu = torch.nn.functional.gelu 88 | if args.openai_gelu: 89 | self.gelu = openai_gelu 90 | elif args.onnx_safe: 91 | self.gelu = erf_gelu 92 | 93 | def forward(self, hidden_states, word_embeddings_weight): 94 | hidden_states = self.dense(hidden_states) 95 | hidden_states = self.gelu(hidden_states) 96 | hidden_states = self.layernorm(hidden_states) 97 | output = parallel_lm_logits(hidden_states, 98 | word_embeddings_weight, 99 | self.parallel_output, 100 | bias=self.bias) 101 | return output 102 | 103 | 104 | class BertModel(MegatronModule): 105 | """Bert Language model.""" 106 | 107 | def __init__(self, num_tokentypes=2, add_binary_head=True, 108 | parallel_output=True): 109 | super(BertModel, self).__init__() 110 | args = get_args() 111 | 112 | self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy 113 | self.add_binary_head = add_binary_head 114 | self.parallel_output = parallel_output 115 | init_method = init_method_normal(args.init_method_std) 116 | scaled_init_method = scaled_init_method_normal(args.init_method_std, 117 | args.num_layers) 118 | 119 | self.language_model, self._language_model_key = get_language_model( 120 | attention_mask_func=bert_attention_mask_func, 121 | num_tokentypes=num_tokentypes, 122 | add_pooler=self.add_binary_head, 123 | init_method=init_method, 124 | scaled_init_method=scaled_init_method) 125 | 126 | self.lm_head = BertLMHead( 127 | self.language_model.embedding.word_embeddings.weight.size(0), 128 | args.hidden_size, init_method, args.layernorm_epsilon, parallel_output) 129 | self._lm_head_key = 'lm_head' 130 | if self.add_binary_head: 131 | self.binary_head = get_linear_layer(args.hidden_size, 2, 132 | init_method) 133 | self._binary_head_key = 'binary_head' 134 | 135 | def forward(self, input_ids, attention_mask, 136 | tokentype_ids=None, lm_labels=None): 137 | 138 | extended_attention_mask = bert_extended_attention_mask(attention_mask) 139 | position_ids = bert_position_ids(input_ids) 140 | 141 | if self.add_binary_head: 142 | lm_output, pooled_output = self.language_model( 143 | input_ids, 144 | position_ids, 145 | extended_attention_mask, 146 | tokentype_ids=tokentype_ids) 147 | else: 148 | lm_output = self.language_model( 149 | input_ids, 150 | position_ids, 151 | extended_attention_mask, 152 | tokentype_ids=tokentype_ids) 153 | 154 | # Output. 
155 | lm_logits = self.lm_head( 156 | lm_output, self.language_model.embedding.word_embeddings.weight) 157 | 158 | binary_logits = None 159 | if self.add_binary_head: 160 | binary_logits = self.binary_head(pooled_output) 161 | 162 | if lm_labels is None: 163 | return lm_logits, binary_logits 164 | else: 165 | if self.fp16_lm_cross_entropy: 166 | assert lm_logits.dtype == torch.half 167 | lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits, lm_labels) 168 | else: 169 | lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits.float(), 170 | lm_labels) 171 | return lm_loss, binary_logits 172 | 173 | 174 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 175 | keep_vars=False): 176 | """For easy load when model is combined with other heads, 177 | add an extra key.""" 178 | 179 | state_dict_ = {} 180 | state_dict_[self._language_model_key] \ 181 | = self.language_model.state_dict_for_save_checkpoint( 182 | destination, prefix, keep_vars) 183 | state_dict_[self._lm_head_key] \ 184 | = self.lm_head.state_dict_for_save_checkpoint( 185 | destination, prefix, keep_vars) 186 | if self.add_binary_head: 187 | state_dict_[self._binary_head_key] \ 188 | = self.binary_head.state_dict(destination, prefix, keep_vars) 189 | return state_dict_ 190 | 191 | def load_state_dict(self, state_dict, strict=True): 192 | """Customized load.""" 193 | 194 | self.language_model.load_state_dict( 195 | state_dict[self._language_model_key], strict=strict) 196 | self.lm_head.load_state_dict( 197 | state_dict[self._lm_head_key], strict=strict) 198 | if self.add_binary_head: 199 | self.binary_head.load_state_dict( 200 | state_dict[self._binary_head_key], strict=strict) 201 | -------------------------------------------------------------------------------- /tds/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright 2020 The Microsoft DeepSpeed Team 3 | ''' 4 | import sys 5 | import types 6 | 7 | import deepspeed 8 | from deepspeed import ops 9 | from deepspeed import pipe 10 | from deepspeed import runtime 11 | from deepspeed.runtime.engine import DeepSpeedEngine 12 | from deepspeed.runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER 13 | from deepspeed.runtime.lr_schedules import add_tuning_arguments 14 | from deepspeed.runtime.config import DeepSpeedConfig, DeepSpeedConfigError 15 | from deepspeed.runtime.activation_checkpointing import checkpointing 16 | from deepspeed.ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig 17 | from deepspeed.utils import log_dist 18 | from deepspeed.utils.distributed import init_distributed 19 | from deepspeed.pipe import PipelineModule 20 | from deepspeed.git_version_info import version, git_hash, git_branch 21 | 22 | from .TPipelineEngine import PipelineEngine 23 | 24 | def _parse_version(version_str): 25 | '''Parse a version string and extract the major, minor, and patch versions.''' 26 | import re 27 | matched = re.search('^(\d+)\.(\d+)\.(\d+)', version_str) 28 | return int(matched.group(1)), int(matched.group(2)), int(matched.group(3)) 29 | 30 | # Export version information 31 | __version__ = version 32 | __version_major__, __version_minor__, __version_patch__ = _parse_version(__version__) 33 | __git_hash__ = git_hash 34 | __git_branch__ = git_branch 35 | 36 | # Provide backwards compatability with old deepspeed.pt module structure, should hopefully not be used 37 | pt = types.ModuleType('pt', 'dummy pt module for backwards compatability') 38 | deepspeed = sys.modules[__name__] 39 | 
setattr(deepspeed, 'pt', pt) 40 | setattr(deepspeed.pt, 'deepspeed_utils', deepspeed.runtime.utils) 41 | sys.modules['deepspeed.pt'] = deepspeed.pt 42 | sys.modules['deepspeed.pt.deepspeed_utils'] = deepspeed.runtime.utils 43 | setattr(deepspeed.pt, 'deepspeed_config', deepspeed.runtime.config) 44 | sys.modules['deepspeed.pt.deepspeed_config'] = deepspeed.runtime.config 45 | setattr(deepspeed.pt, 'loss_scaler', deepspeed.runtime.fp16.loss_scaler) 46 | sys.modules['deepspeed.pt.loss_scaler'] = deepspeed.runtime.fp16.loss_scaler 47 | 48 | 49 | def initialize(args, 50 | model, 51 | optimizer=None, 52 | model_parameters=None, 53 | training_data=None, 54 | lr_scheduler=None, 55 | mpu=None, 56 | dist_init_required=None, 57 | collate_fn=None, 58 | config_params=None): 59 | """Initialize the DeepSpeed Engine. 60 | 61 | Arguments: 62 | args: a dictionary containing local_rank and deepspeed_config 63 | file location 64 | 65 | model: Required: nn.module class before apply any wrappers 66 | 67 | optimizer: Optional: a user defined optimizer, this is typically used instead of defining 68 | an optimizer in the DeepSpeed json config. 69 | 70 | model_parameters: Optional: An iterable of torch.Tensors or dicts. 71 | Specifies what Tensors should be optimized. 72 | 73 | training_data: Optional: Dataset of type torch.utils.data.Dataset 74 | 75 | lr_scheduler: Optional: Learning Rate Scheduler Object. It should define a get_lr(), 76 | step(), state_dict(), and load_state_dict() methods 77 | 78 | mpu: Optional: A model parallelism unit object that implements 79 | get_{model,data}_parallel_{rank,group,world_size}() 80 | 81 | dist_init_required: Optional: None will auto-initialize torch.distributed if needed, 82 | otherwise the user can force it to be initialized or not via boolean. 83 | 84 | collate_fn: Optional: Merges a list of samples to form a 85 | mini-batch of Tensor(s). Used when using batched loading from a 86 | map-style dataset. 87 | 88 | Returns: 89 | A tuple of ``engine``, ``optimizer``, ``training_dataloader``, ``lr_scheduler`` 90 | 91 | * ``engine``: DeepSpeed runtime engine which wraps the client model for distributed training. 92 | 93 | * ``optimizer``: Wrapped optimizer if a user defined ``optimizer`` is supplied, or if 94 | optimizer is specified in json config else ``None``. 95 | 96 | * ``training_dataloader``: DeepSpeed dataloader if ``training_data`` was supplied, 97 | otherwise ``None``. 98 | 99 | * ``lr_scheduler``: Wrapped lr scheduler if user ``lr_scheduler`` is passed, or 100 | if ``lr_scheduler`` specified in JSON configuration. Otherwise ``None``. 
101 | """ 102 | log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format( 103 | __version__, 104 | __git_hash__, 105 | __git_branch__), 106 | ranks=[0]) 107 | 108 | if not isinstance(model, PipelineModule): 109 | engine = DeepSpeedEngine(args=args, 110 | model=model, 111 | optimizer=optimizer, 112 | model_parameters=model_parameters, 113 | training_data=training_data, 114 | lr_scheduler=lr_scheduler, 115 | mpu=mpu, 116 | dist_init_required=dist_init_required, 117 | collate_fn=collate_fn, 118 | config_params=config_params) 119 | else: 120 | assert mpu is None, "mpu must be None with pipeline parallelism" 121 | engine = PipelineEngine(args=args, 122 | model=model, 123 | optimizer=optimizer, 124 | model_parameters=model_parameters, 125 | training_data=training_data, 126 | lr_scheduler=lr_scheduler, 127 | mpu=model.mpu(), 128 | dist_init_required=dist_init_required, 129 | collate_fn=collate_fn, 130 | config_params=config_params) 131 | 132 | return_items = [ 133 | engine, 134 | engine.optimizer, 135 | engine.training_dataloader, 136 | engine.lr_scheduler 137 | ] 138 | return tuple(return_items) 139 | 140 | 141 | def _add_core_arguments(parser): 142 | r"""Helper (internal) function to update an argument parser with an argument group of the core DeepSpeed arguments. 143 | The core set of DeepSpeed arguments include the following: 144 | 1) --deepspeed: boolean flag to enable DeepSpeed 145 | 2) --deepspeed_config : path of a json configuration file to configure DeepSpeed runtime. 146 | 147 | This is a helper function to the public add_config_arguments() 148 | 149 | Arguments: 150 | parser: argument parser 151 | Return: 152 | parser: Updated Parser 153 | """ 154 | group = parser.add_argument_group('DeepSpeed', 'DeepSpeed configurations') 155 | 156 | group.add_argument( 157 | '--deepspeed', 158 | default=False, 159 | action='store_true', 160 | help= 161 | 'Enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)') 162 | 163 | group.add_argument('--deepspeed_config', 164 | default=None, 165 | type=str, 166 | help='DeepSpeed json configuration file.') 167 | 168 | group.add_argument( 169 | '--deepscale', 170 | default=False, 171 | action='store_true', 172 | help= 173 | 'Deprecated enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)' 174 | ) 175 | 176 | group.add_argument('--deepscale_config', 177 | default=None, 178 | type=str, 179 | help='Deprecated DeepSpeed json configuration file.') 180 | 181 | group.add_argument( 182 | '--deepspeed_mpi', 183 | default=False, 184 | action='store_true', 185 | help= 186 | "Run via MPI, this will attempt to discover the necessary variables to initialize torch " 187 | "distributed from the MPI environment") 188 | 189 | return parser 190 | 191 | 192 | def add_config_arguments(parser): 193 | r"""Update the argument parser to enabling parsing of DeepSpeed command line arguments. 194 | The set of DeepSpeed arguments include the following: 195 | 1) --deepspeed: boolean flag to enable DeepSpeed 196 | 2) --deepspeed_config : path of a json configuration file to configure DeepSpeed runtime. 197 | 198 | Arguments: 199 | parser: argument parser 200 | Return: 201 | parser: Updated Parser 202 | """ 203 | parser = _add_core_arguments(parser) 204 | 205 | return parser 206 | --------------------------------------------------------------------------------