├── hostfile ├── megatron ├── mpu │ ├── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_cross_entropy.py │ │ └── test_random.py │ ├── __init__.py │ ├── utils.py │ ├── data.py │ ├── mappings.py │ ├── cross_entropy.py │ └── grads.py ├── data │ ├── __init__.py │ ├── helpers.cpython-38-x86_64-linux-gnu.so │ ├── test │ │ ├── test_preprocess_data.sh │ │ └── test_indexed_dataset.py │ ├── Makefile │ ├── ict_dataset.py │ ├── samplers.py │ └── realm_dataset_utils.py ├── tokenizer │ ├── __init__.py │ └── t5_tokenization.py ├── deprecated_data_utils │ ├── scripts │ │ ├── presplit_sentences_json.py │ │ ├── split_json.py │ │ └── split_gpt2_json.py │ ├── corpora.py │ ├── tf_dl.py │ ├── samplers.py │ ├── __init__.py │ └── lazy_loader.py ├── model │ ├── __init__.py │ ├── utils.py │ ├── fused_bias_gelu.py │ ├── classification.py │ ├── multiple_choice.py │ ├── distributed.py │ ├── fused_softmax.py │ └── bert_model.py ├── fp16 │ └── __init__.py ├── module.py ├── package_info.py ├── __init__.py ├── fused_kernels │ ├── scaled_upper_triang_masked_softmax.cpp │ ├── scaled_masked_softmax.cpp │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ ├── scaled_masked_softmax_cuda.cu │ └── __init__.py ├── indexer.py ├── learning_rates.py └── memory.py ├── .gitignore ├── doc └── release-note.md ├── ds_config_gpt2.json ├── ds_config_t5.json ├── LICENSE ├── run_t5.sh ├── run_gpt2.sh ├── pretrain_gpt2.py ├── pretrain_t5.py └── tds └── __init__.py /hostfile: -------------------------------------------------------------------------------- 1 | node1 slots=8 2 | -------------------------------------------------------------------------------- /megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import indexed_dataset 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | upload.sh 3 | results/ 4 | results-local 5 | __pycache__ 6 | 7 | -------------------------------------------------------------------------------- /megatron/data/helpers.cpython-38-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TsinghuaAI/CPM-1-Pretrain/HEAD/megatron/data/helpers.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /doc/release-note.md: -------------------------------------------------------------------------------- 1 | ## v0.1.0 2 | 3 | ### Functions 4 | 5 | - Support multi-node, multi-GPU pre-training of GPT-2 and T5 6 | - Support data parallelism, model parallelism, pipeline parallelism, and mixed-precision computation 7 | 8 | ### Performance 9 | 10 | - GPT-2 and T5 run stably 11 | -------------------------------------------------------------------------------- /ds_config_gpt2.json: -------------------------------------------------------------------------------- 1 | {"train_batch_size": 64, "train_micro_batch_size_per_gpu": 4, "steps_per_print": 10, "gradient_clipping": 1.0, "fp16": {"enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "hysteresis": 2, "initial_scale_power": 16}, "zero_allow_untested_optimizer": true, "wall_clock_breakdown": true} -------------------------------------------------------------------------------- /ds_config_t5.json: -------------------------------------------------------------------------------- 1 | {"train_batch_size": 128, "train_micro_batch_size_per_gpu": 4, "steps_per_print": 10, "gradient_clipping": 1.0, "fp16": {"enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "hysteresis": 2, "initial_scale_power": 16}, "zero_allow_untested_optimizer": true, "wall_clock_breakdown": true} -------------------------------------------------------------------------------- /megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | -------------------------------------------------------------------------------- /megatron/deprecated_data_utils/scripts/presplit_sentences_json.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python scripts/presplit_sentences_json.py 4 | """ 5 | 6 | import sys 7 | import json 8 | 9 | import nltk 10 | 11 | nltk.download('punkt') 12 | 13 | input_file = sys.argv[1] 14 | output_file = sys.argv[2] 15 | 16 | line_seperator = "\n" 17 | 18 | with open(input_file, 'r') as ifile: 19 | with open(output_file, "w") as ofile: 20 | for doc in ifile.readlines(): 21 | parsed = json.loads(doc) 22 | sent_list = [] 23 | for line in parsed['text'].split('\n'): 24 | if line != '\n': 25 | sent_list.extend(nltk.tokenize.sent_tokenize(line)) 26 | parsed['text'] = line_seperator.join(sent_list) 27 | ofile.write(json.dumps(parsed) + '\n') 28 | -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .distributed import * 17 | from .bert_model import BertModel 18 | from .realm_model import ICTBertModel 19 | from .gpt2_model import GPT2Model, GPT2ModelPipe 20 | from .utils import get_params_for_weight_decay_optimization 21 | from .language_model import get_language_model 22 | from .t5_model import T5ModelPipe, T5Model -------------------------------------------------------------------------------- /megatron/fp16/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from .fp16util import ( 16 | BN_convert_float, 17 | network_to_half, 18 | prep_param_lists, 19 | model_grads_to_master_grads, 20 | master_params_to_model_params, 21 | tofp16, 22 | to_python_float, 23 | clip_grad_norm, 24 | convert_module, 25 | convert_network, 26 | FP16Model, 27 | ) 28 | 29 | from .fp16 import * 30 | from .loss_scaler import * 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Tsinghua AI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /megatron/module.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Megatron Module""" 17 | 18 | import torch 19 | 20 | 21 | class MegatronModule(torch.nn.Module): 22 | """Megatron specific extentions of torch Module.""" 23 | 24 | def __init__(self): 25 | super(MegatronModule, self).__init__() 26 | 27 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 28 | keep_vars=False): 29 | """Use this function to override the state dict for 30 | saving checkpoints.""" 31 | return self.state_dict(destination, prefix, keep_vars) 32 | -------------------------------------------------------------------------------- /megatron/package_info.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | MAJOR = 1 17 | MINOR = 1.5 18 | 19 | # Use the following formatting: (major, minor) 20 | VERSION = (MAJOR, MINOR) 21 | 22 | __version__ = '.'.join(map(str, VERSION)) 23 | __package_name__ = 'megatron-lm' 24 | __contact_names__ = 'NVIDIA INC' 25 | __url__ = 'https://github.com/NVIDIA/Megatron-LM' 26 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' 27 | __description__ = 'Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism.' 28 | __license__ = 'See https://github.com/NVIDIA/Megatron-LM/blob/master/LICENSE' 29 | __keywords__ = 'deep learning, Megatron, gpu, NLP, nvidia, pytorch, torch, language' 30 | 31 | -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | 17 | from .package_info import ( 18 | __description__, 19 | __contact_names__, 20 | __url__, 21 | __download_url__, 22 | __keywords__, 23 | __license__, 24 | __package_name__, 25 | __version__, 26 | ) 27 | 28 | from .global_vars import get_args 29 | from .global_vars import get_tokenizer 30 | from .global_vars import get_tensorboard_writer 31 | from .global_vars import get_adlr_autoresume 32 | from .global_vars import get_timers 33 | from .initialize import initialize_megatron 34 | 35 | def print_rank_0(message): 36 | """If distributed is initialized print only on rank 0.""" 37 | if torch.distributed.is_initialized(): 38 | if torch.distributed.get_rank() == 0: 39 | print(message, flush=True) 40 | else: 41 | print(message, flush=True) 42 | -------------------------------------------------------------------------------- /megatron/deprecated_data_utils/corpora.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """several datasets with preset arguments""" 16 | from .datasets import json_dataset, csv_dataset 17 | import os 18 | 19 | 20 | class wikipedia(json_dataset): 21 | """ 22 | dataset for wikipedia with arguments configured for convenience 23 | 24 | command line usage: `--train-data wikipedia` 25 | """ 26 | PATH = 'data/wikipedia/wikidump_lines.json' 27 | assert_str = "make sure to set PATH for wikipedia data_utils/corpora.py" 28 | 29 | def __init__(self, **kwargs): 30 | assert os.path.exists(wikipedia.PATH), \ 31 | wikipedia.assert_str 32 | if not kwargs: 33 | kwargs = {} 34 | kwargs['text_key'] = 'text' 35 | kwargs['loose_json'] = True 36 | super(wikipedia, self).__init__(wikipedia.PATH, **kwargs) 37 | 38 | 39 | class webtext(json_dataset): 40 | """ 41 | dataset for webtext with arguments configured for convenience 42 | 43 | command line usage: `--train-data webtext` 44 | """ 45 | PATH = 'data/webtext/data.json' 46 | assert_str = "make sure to set PATH for webtext data_utils/corpora.py" 47 | 48 | def __init__(self, **kwargs): 49 | assert os.path.exists(webtext.PATH), \ 50 | webtext.assert_str 51 | if not kwargs: 52 | kwargs = {} 53 | kwargs['text_key'] = 'text' 54 | kwargs['loose_json'] = True 55 | super(webtext, self).__init__(webtext.PATH, **kwargs) 56 | 57 | 58 | NAMED_CORPORA = { 59 | 'wikipedia': wikipedia, 60 | 'webtext': webtext, 61 | } 62 | -------------------------------------------------------------------------------- /megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .grads import clip_grad_norm 23 | 24 | from .initialize import is_unitialized 25 | from .initialize import destroy_model_parallel 26 | from .initialize import get_data_parallel_group 27 | from .initialize import get_data_parallel_rank 28 | from .initialize import get_data_parallel_world_size 29 | from .initialize import get_model_parallel_group 30 | from .initialize import get_model_parallel_rank, set_model_parallel_rank 31 | from .initialize import get_model_parallel_src_rank 32 | from .initialize import get_model_parallel_world_size, set_model_parallel_world_size 33 | from .initialize import get_topology 34 | from .initialize import get_pipe_parallel_group 35 | from .initialize import get_pipe_parallel_rank 36 | from .initialize import get_pipe_parallel_world_size 37 | from .initialize import get_io_parallel_group 38 | from .initialize import initialize_model_parallel 39 | from .initialize import model_parallel_is_initialized 40 | 41 | from .layers import LayerNorm 42 | from .layers import ColumnParallelLinear 43 | from .layers import RowParallelLinear 44 | from .layers import VocabParallelEmbedding 45 | 46 | from .mappings import copy_to_model_parallel_region 47 | from .mappings import gather_from_model_parallel_region 48 | from .mappings import reduce_from_model_parallel_region 49 | from .mappings import scatter_to_model_parallel_region 50 | 51 | from .random import checkpoint 52 | from .random import get_cuda_rng_tracker 53 | from .random import init_checkpointed_activations_memory_buffer 54 | from .random import model_parallel_cuda_manual_seed 55 | from .random import reset_checkpointed_activations_memory_buffer 56 | 57 | from .utils import divide 58 | from .utils import split_tensor_along_last_dim 59 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | namespace multihead_attn { 22 | namespace fused_softmax { 23 | namespace scaled_upper_triang_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda( 26 | torch::Tensor const& input, 27 | float scale_factor); 28 | 29 | torch::Tensor bwd_cuda( 30 | torch::Tensor const& output_grads, 31 | torch::Tensor const& softmax_results, 32 | float scale_factor); 33 | 34 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) { 35 | AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); 36 | AT_ASSERTM(input.scalar_type() == at::ScalarType::Half, 37 | "Only HALF is supported"); 38 | 39 | return fwd_cuda(input, scale_factor); 40 | } 41 | 42 | torch::Tensor bwd( 43 | torch::Tensor const& output_grads, 44 | torch::Tensor const& softmax_results, 45 | float scale_factor) { 46 | 47 | AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); 48 | AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); 49 | 50 | AT_ASSERTM(output_grads.scalar_type() == at::ScalarType::Half, 51 | "Only HALF is supported"); 52 | AT_ASSERTM(softmax_results.scalar_type() == at::ScalarType::Half, 53 | "Only HALF is supported"); 54 | 55 | return bwd_cuda(output_grads, softmax_results, scale_factor); 56 | } 57 | 58 | } // end namespace scaled_upper_triang_masked_softmax 59 | } // end namespace fused_softmax 60 | } // end namespace multihead_attn 61 | 62 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 63 | m.def("forward", 64 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, 65 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 66 | m.def("backward", 67 | &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, 68 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 69 | } 70 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_masked_softmax.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | namespace multihead_attn { 22 | namespace fused_softmax { 23 | namespace scaled_masked_softmax { 24 | 25 | torch::Tensor fwd_cuda( 26 | torch::Tensor const& input, 27 | torch::Tensor const& mask, 28 | float scale_factor); 29 | 30 | torch::Tensor bwd_cuda( 31 | torch::Tensor const& output_grads, 32 | torch::Tensor const& softmax_results, 33 | float scale_factor); 34 | 35 | torch::Tensor fwd( 36 | torch::Tensor const& input, 37 | torch::Tensor const& mask, 38 | float scale_factor) { 39 | AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); 40 | AT_ASSERTM(input.scalar_type() == at::ScalarType::Half, 41 | "Only HALF is supported"); 42 | AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); 43 | 44 | return fwd_cuda(input, mask, scale_factor); 45 | } 46 | 47 | torch::Tensor bwd( 48 | torch::Tensor const& output_grads, 49 | torch::Tensor const& softmax_results, 50 | float scale_factor) { 51 | 52 | AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); 53 | AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); 54 | 55 | AT_ASSERTM(output_grads.scalar_type() == at::ScalarType::Half, 56 | "Only HALF is supported"); 57 | AT_ASSERTM(softmax_results.scalar_type() == at::ScalarType::Half, 58 | "Only HALF is supported"); 59 | 60 | return bwd_cuda(output_grads, softmax_results, scale_factor); 61 | } 62 | 63 | } // end namespace scaled_masked_softmax 64 | } // end namespace fused_softmax 65 | } // end namespace multihead_attn 66 | 67 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 68 | m.def("forward", 69 | &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, 70 | "Self Multihead Attention scaled, time masked softmax -- Forward."); 71 | m.def("backward", 72 | &multihead_attn::fused_softmax::scaled_masked_softmax::bwd, 73 | "Self Multihead Attention scaled, time masked softmax -- Backward."); 74 | } 75 | -------------------------------------------------------------------------------- /megatron/mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import argparse 17 | import os 18 | import random 19 | import numpy 20 | import torch 21 | 22 | import mpu 23 | 24 | 25 | class IdentityLayer(torch.nn.Module): 26 | def __init__(self, size, scale=1.0): 27 | super(IdentityLayer, self).__init__() 28 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 29 | 30 | def forward(self): 31 | return self.weight 32 | 33 | 34 | def set_random_seed(seed): 35 | """Set random seed for reproducability.""" 36 | random.seed(seed) 37 | numpy.random.seed(seed) 38 | torch.manual_seed(seed) 39 | mpu.model_parallel_cuda_manual_seed(seed) 40 | 41 | 42 | def initialize_distributed(backend='nccl'): 43 | """Initialize torch.distributed.""" 44 | # Get local rank in case it is provided. 
45 | parser = argparse.ArgumentParser() 46 | parser.add_argument('--local_rank', type=int, default=None, 47 | help='local rank passed from distributed launcher') 48 | args = parser.parse_args() 49 | local_rank = args.local_rank 50 | 51 | # Get rank and world size. 52 | rank = int(os.getenv('RANK', '0')) 53 | world_size = int(os.getenv("WORLD_SIZE", '1')) 54 | 55 | print('> initializing torch.distributed with local rank: {}, ' 56 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 57 | 58 | # Set the device id. 59 | device = rank % torch.cuda.device_count() 60 | if local_rank is not None: 61 | device = local_rank 62 | torch.cuda.set_device(device) 63 | 64 | # Call the init process. 65 | init_method = 'tcp://' 66 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 67 | master_port = os.getenv('MASTER_PORT', '6000') 68 | init_method += master_ip + ':' + master_port 69 | torch.distributed.init_process_group( 70 | backend=backend, 71 | world_size=world_size, 72 | rank=rank, 73 | init_method=init_method) 74 | 75 | 76 | def print_separator(message): 77 | torch.distributed.barrier() 78 | filler_len = (78 - len(message)) // 2 79 | filler = '-' * filler_len 80 | string = '\n' + filler + ' {} '.format(message) + filler 81 | if torch.distributed.get_rank() == 0: 82 | print(string, flush=True) 83 | torch.distributed.barrier() 84 | -------------------------------------------------------------------------------- /megatron/mpu/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | 20 | def ensure_divisibility(numerator, denominator): 21 | """Ensure that numerator is divisible by the denominator.""" 22 | assert numerator % denominator == 0, '{} is not divisible by {}'.format( 23 | numerator, denominator) 24 | 25 | 26 | def divide(numerator, denominator): 27 | """Ensure that numerator is divisible by the denominator and return 28 | the division value.""" 29 | ensure_divisibility(numerator, denominator) 30 | return numerator // denominator 31 | 32 | 33 | def split_tensor_along_last_dim(tensor, num_partitions, 34 | contiguous_split_chunks=False): 35 | """Split a tensor along its last dimension. 36 | Arguments: 37 | tensor: input tensor. 38 | num_partitions: number of partitions to split the tensor 39 | contiguous_split_chunks: If True, make each chunk contiguous 40 | in memory. 41 | """ 42 | # Get the size and dimension. 43 | last_dim = tensor.dim() - 1 44 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 45 | # Split. 46 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 47 | # Note: torch.split does not create contiguous tensors by default. 
48 | if contiguous_split_chunks: 49 | return tuple(chunk.contiguous() for chunk in tensor_list) 50 | 51 | return tensor_list 52 | 53 | 54 | class VocabUtility: 55 | """Split the vocabulary into `world_size` chunks amd return the 56 | first and last index of the vocabulary belonging to the `rank` 57 | partition: Note that indecies in [fist, last)""" 58 | 59 | @staticmethod 60 | def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, 61 | rank, world_size): 62 | index_f = rank * per_partition_vocab_size 63 | index_l = index_f + per_partition_vocab_size 64 | return index_f, index_l 65 | 66 | @staticmethod 67 | def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size): 68 | per_partition_vocab_size = divide(global_vocab_size, world_size) 69 | return VocabUtility.vocab_range_from_per_partition_vocab_size( 70 | per_partition_vocab_size, rank, world_size) 71 | -------------------------------------------------------------------------------- /megatron/model/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Utilities for models.""" 17 | 18 | import math 19 | 20 | import torch 21 | 22 | from .transformer import LayerNorm 23 | 24 | 25 | def init_method_normal(sigma): 26 | """Init method based on N(0, sigma).""" 27 | def init_(tensor): 28 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 29 | 30 | return init_ 31 | 32 | 33 | def scaled_init_method_normal(sigma, num_layers): 34 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 35 | std = sigma / math.sqrt(2.0 * num_layers) 36 | 37 | def init_(tensor): 38 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 39 | 40 | return init_ 41 | 42 | 43 | def get_linear_layer(rows, columns, init_method): 44 | """Simple linear layer with weight initialization.""" 45 | layer = torch.nn.Linear(rows, columns) 46 | init_method(layer.weight) 47 | with torch.no_grad(): 48 | layer.bias.zero_() 49 | return layer 50 | 51 | @torch.jit.script 52 | def gelu_impl(x): 53 | """OpenAI's gelu implementation.""" 54 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * 55 | (1.0 + 0.044715 * x * x))) 56 | def openai_gelu(x): 57 | return gelu_impl(x) 58 | 59 | #This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter 60 | @torch.jit.script 61 | def erf_gelu(x): 62 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) 63 | 64 | def get_params_for_weight_decay_optimization(module): 65 | """Divide params into with-weight-decay and without-weight-decay groups. 66 | Layernorms and baises will have no weight decay but the rest will. 
67 | """ 68 | weight_decay_params = {'params': []} 69 | no_weight_decay_params = {'params': [], 'weight_decay': 0.0} 70 | for module_ in module.modules(): 71 | if isinstance(module_, LayerNorm): 72 | no_weight_decay_params['params'].extend( 73 | [p for p in list(module_._parameters.values()) 74 | if p is not None]) 75 | else: 76 | weight_decay_params['params'].extend( 77 | [p for n, p in list(module_._parameters.items()) 78 | if p is not None and n != 'bias']) 79 | no_weight_decay_params['params'].extend( 80 | [p for n, p in list(module_._parameters.items()) 81 | if p is not None and n == 'bias']) 82 | 83 | return weight_decay_params, no_weight_decay_params 84 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "THC/THC.h" 23 | #include 24 | #include 25 | #include "scaled_upper_triang_masked_softmax.h" 26 | 27 | namespace multihead_attn { 28 | namespace fused_softmax { 29 | namespace scaled_upper_triang_masked_softmax { 30 | 31 | torch::Tensor fwd_cuda( 32 | torch::Tensor const& input, 33 | float scale_factor) 34 | { 35 | // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 36 | const int attn_batches = input.size(0); 37 | const int seq_len = input.size(1); 38 | TORCH_INTERNAL_ASSERT(seq_len <= 2048); 39 | 40 | // Output 41 | auto act_options = input.options().requires_grad(false); 42 | torch::Tensor softmax_results = 43 | torch::empty({attn_batches, seq_len, seq_len}, act_options); 44 | 45 | // Softmax Intermediate Result Ptr 46 | void* input_ptr = static_cast(input.data_ptr()); 47 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 48 | 49 | dispatch_scaled_upper_triang_masked_softmax_forward( 50 | reinterpret_cast(softmax_results_ptr), 51 | reinterpret_cast(input_ptr), 52 | scale_factor, 53 | seq_len, 54 | seq_len, 55 | attn_batches); 56 | return softmax_results; 57 | } 58 | 59 | torch::Tensor bwd_cuda( 60 | torch::Tensor const& output_grads_, 61 | torch::Tensor const& softmax_results_, 62 | float scale_factor) { 63 | 64 | auto output_grads = output_grads_.contiguous(); 65 | auto softmax_results = softmax_results_.contiguous(); 66 | 67 | //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] 68 | const int attn_batches = output_grads.size(0); 69 | const int seq_len = output_grads.size(1); 70 | TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2)); 71 | 72 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 73 | 74 | //Softmax Grad 75 | dispatch_scaled_upper_triang_masked_softmax_backward( 76 | reinterpret_cast(output_grads_ptr), 77 | 
reinterpret_cast(output_grads_ptr), 78 | reinterpret_cast(softmax_results.data_ptr()), 79 | scale_factor, 80 | seq_len, 81 | seq_len, 82 | attn_batches); 83 | 84 | //backward pass is completely in-place 85 | return output_grads; 86 | } 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import print_separator 17 | from commons import initialize_distributed 18 | from mpu import data as data_utils 19 | import mpu 20 | import torch 21 | import functools 22 | import operator 23 | import sys 24 | sys.path.append("../..") 25 | 26 | 27 | def test_boradcast_data(model_parallel_size): 28 | 29 | if torch.distributed.get_rank() == 0: 30 | print('> testing boradcast_data with model parallel size {} ...'. 31 | format(model_parallel_size)) 32 | 33 | mpu.initialize_model_parallel(model_parallel_size) 34 | torch.manual_seed(1234 + mpu.get_data_parallel_rank()) 35 | model_parallel_size = mpu.get_model_parallel_world_size() 36 | 37 | key_size_t = {'key1': [7, 11], 38 | 'key2': [8, 2, 1], 39 | 'key3': [13], 40 | 'key4': [5, 1, 2], 41 | 'key5': [5, 12]} 42 | keys = list(key_size_t.keys()) 43 | 44 | data = {} 45 | data_t = {} 46 | for key in key_size_t: 47 | data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) 48 | data_t[key] = data[key].clone() 49 | data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) 50 | data_t['keyX'] = data['keyX'].clone() 51 | if mpu.get_model_parallel_rank() != 0: 52 | data = None 53 | 54 | data_utils._check_data_types(keys, data_t, torch.int64) 55 | key_size, key_numel, \ 56 | total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) 57 | for key in keys: 58 | assert key_size[key] == key_size_t[key] 59 | total_numel_t = 0 60 | for key in keys: 61 | target_size = functools.reduce(operator.mul, key_size_t[key], 1) 62 | assert key_numel[key] == target_size 63 | total_numel_t += target_size 64 | assert total_numel == total_numel_t 65 | 66 | data_b = data_utils.broadcast_data(keys, data, torch.int64) 67 | for key in keys: 68 | tensor = data_t[key].cuda() 69 | assert data_b[key].sub(tensor).abs().max() == 0 70 | 71 | # Reset groups 72 | mpu.destroy_model_parallel() 73 | 74 | torch.distributed.barrier() 75 | if torch.distributed.get_rank() == 0: 76 | print('>> passed the test :-)') 77 | 78 | 79 | if __name__ == '__main__': 80 | 81 | initialize_distributed() 82 | world_size = torch.distributed.get_world_size() 83 | 84 | model_parallel_size = 1 85 | while model_parallel_size <= world_size: 86 | print_separator('test test boradcast data') 87 | test_boradcast_data(model_parallel_size) 88 | model_parallel_size *= 2 89 | -------------------------------------------------------------------------------- 
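Editor's note (not a file from this repository): the test above exercises mpu.broadcast_data, and the sketch below shows the usage pattern it verifies inside a training step. Only the broadcast_data(keys, data, datatype) call and the rank-0 convention are taken from the test; the helper name get_batch, the data_iterator argument, and the 'text' key are illustrative assumptions.

# Minimal usage sketch for mpu.broadcast_data (assumes torch.distributed and
# the mpu model-parallel groups are already initialized, as in the test above).
import torch
import mpu

def get_batch(data_iterator):  # hypothetical helper, not part of the repo
    keys = ['text']  # illustrative key; any dict of tensors of one dtype works
    if mpu.get_model_parallel_rank() == 0:
        # Only rank 0 of each model-parallel group reads from the dataset.
        data = next(data_iterator)  # e.g. {'text': LongTensor of token ids}
    else:
        data = None
    # Rank 0 broadcasts the tensor sizes and values to its model-parallel group.
    data_b = mpu.broadcast_data(keys, data, torch.int64)
    return data_b['text'].long()

Reading the data on a single rank per model-parallel group and broadcasting it keeps every rank of that group working on the same batch without duplicating dataset I/O.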
/megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | torch._C._jit_set_profiling_mode(False) 19 | torch._C._jit_set_profiling_executor(False) 20 | torch._C._jit_override_can_fuse_on_cpu(True) 21 | torch._C._jit_override_can_fuse_on_gpu(True) 22 | 23 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 24 | # 1/sqrt(2*pi)-> 0.3989423 25 | # 1/sqrt(2) -> 0.70710678 26 | # sqrt(2/pi) -> 0.79788456 27 | # this function is tanh approximation of gelu 28 | # actual gelu is: 29 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 30 | 31 | @torch.jit.script 32 | def bias_gelu(bias, y): 33 | x = bias + y 34 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 35 | 36 | # gradient of tanh approximation of gelu 37 | # gradient of actual gelu is: 38 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 39 | @torch.jit.script 40 | def bias_gelu_back(g, bias, y): 41 | x = bias + y 42 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 43 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 44 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 45 | return ff*g 46 | 47 | @torch.jit.script 48 | def gelu(x): 49 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 50 | 51 | # gradient of tanh approximation of gelu 52 | # gradient of actual gelu is: 53 | # 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 54 | @torch.jit.script 55 | def gelu_back(g, x): 56 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 57 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 58 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 59 | return ff*g 60 | 61 | class GeLUFunction(torch.autograd.Function): 62 | @staticmethod 63 | # bias is an optional argument 64 | def forward(ctx, input, bias): 65 | ctx.save_for_backward(input, bias) 66 | return bias_gelu(bias, input) 67 | 68 | @staticmethod 69 | def backward(ctx, grad_output): 70 | input, bias = ctx.saved_tensors 71 | tmp = bias_gelu_back(grad_output, bias, input) 72 | return tmp, tmp 73 | 74 | class GeLUFunctionWithoutBias(torch.autograd.Function): 75 | @staticmethod 76 | # bias is an optional argument 77 | def forward(ctx, input): 78 | ctx.save_for_backward(input) 79 | return gelu(input) 80 | 81 | @staticmethod 82 | def backward(ctx, grad_output): 83 | input = ctx.saved_tensors 84 | tmp = gelu_back(grad_output, input) 85 | return tmp 86 | 87 | bias_gelu_impl = GeLUFunction.apply 88 | gelu_impl = GeLUFunctionWithoutBias.apply 89 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_initialize.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import print_separator 17 | from commons import initialize_distributed 18 | import mpu 19 | import torch 20 | import sys 21 | sys.path.append("../..") 22 | 23 | 24 | def test_initialize_model_parallel(model_parallel_size): 25 | 26 | if torch.distributed.get_rank() == 0: 27 | print('> testing initialize_model_parallel with size {} ...'.format( 28 | model_parallel_size)) 29 | model_parallel_size_ = min(model_parallel_size, 30 | torch.distributed.get_world_size()) 31 | assert not mpu.model_parallel_is_initialized() 32 | mpu.initialize_model_parallel(model_parallel_size_) 33 | assert mpu.model_parallel_is_initialized() 34 | 35 | # Checks. 36 | def check(group, world_size, rank): 37 | assert world_size == torch.distributed.get_world_size(group=group) 38 | assert rank == torch.distributed.get_rank(group=group) 39 | 40 | # Model parallel. 41 | world_size = model_parallel_size_ 42 | rank = torch.distributed.get_rank() % model_parallel_size_ 43 | assert world_size == mpu.get_model_parallel_world_size() 44 | assert rank == mpu.get_model_parallel_rank() 45 | check(mpu.get_model_parallel_group(), world_size, rank) 46 | 47 | # Data parallel. 
48 | world_size = torch.distributed.get_world_size() // model_parallel_size_ 49 | rank = torch.distributed.get_rank() // model_parallel_size 50 | assert world_size == mpu.get_data_parallel_world_size() 51 | assert rank == mpu.get_data_parallel_rank() 52 | check(mpu.get_data_parallel_group(), world_size, rank) 53 | 54 | # Reset groups 55 | mpu.destroy_model_parallel() 56 | 57 | torch.distributed.barrier() 58 | if torch.distributed.get_rank() == 0: 59 | print('>> passed the test :-)') 60 | 61 | 62 | def test_get_model_parallel_src_rank(model_parallel_size_): 63 | 64 | if torch.distributed.get_rank() == 0: 65 | print('> testing get_model_parallel_src_rank with size {} ...'.format( 66 | model_parallel_size_)) 67 | model_parallel_size = min(model_parallel_size_, 68 | torch.distributed.get_world_size()) 69 | assert not mpu.model_parallel_is_initialized() 70 | mpu.initialize_model_parallel(model_parallel_size) 71 | assert mpu.model_parallel_is_initialized() 72 | 73 | # Checks 74 | src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank() 75 | assert mpu.get_model_parallel_src_rank() == src_rank 76 | 77 | # Reset groups 78 | mpu.destroy_model_parallel() 79 | 80 | torch.distributed.barrier() 81 | if torch.distributed.get_rank() == 0: 82 | print('>> passed the test :-)') 83 | 84 | 85 | if __name__ == '__main__': 86 | 87 | initialize_distributed() 88 | world_size = torch.distributed.get_world_size() 89 | model_parallel_size = 1 90 | while model_parallel_size <= world_size: 91 | print_separator('test initialize model parallel') 92 | test_initialize_model_parallel(model_parallel_size) 93 | print_separator('test model parallel source rank') 94 | test_get_model_parallel_src_rank(model_parallel_size) 95 | model_parallel_size *= 2 96 | -------------------------------------------------------------------------------- /megatron/fused_kernels/scaled_masked_softmax_cuda.cu: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "THC/THC.h" 23 | #include 24 | #include 25 | #include "scaled_masked_softmax.h" 26 | 27 | namespace multihead_attn { 28 | namespace fused_softmax { 29 | namespace scaled_masked_softmax { 30 | 31 | torch::Tensor fwd_cuda( 32 | torch::Tensor const& input, 33 | torch::Tensor const& mask, 34 | float scale_factor) 35 | { 36 | // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 37 | const int batches = input.size(0); 38 | const int pad_batches = mask.size(0); 39 | const int attn_heads = input.size(1); 40 | const int seq_len = input.size(2); 41 | TORCH_INTERNAL_ASSERT(seq_len <= 2048); 42 | TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); 43 | TORCH_INTERNAL_ASSERT(mask.size(1) == 1); 44 | TORCH_INTERNAL_ASSERT(mask.size(2) == seq_len); 45 | TORCH_INTERNAL_ASSERT(mask.size(3) == seq_len); 46 | 47 | // Output 48 | auto act_options = input.options().requires_grad(false); 49 | torch::Tensor softmax_results = 50 | torch::empty({batches, attn_heads, seq_len, seq_len}, act_options); 51 | 52 | // Softmax Intermediate Result Ptr 53 | void* input_ptr = static_cast(input.data_ptr()); 54 | void* mask_ptr = static_cast(mask.data_ptr()); 55 | void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); 56 | 57 | dispatch_scaled_masked_softmax_forward( 58 | reinterpret_cast(softmax_results_ptr), 59 | reinterpret_cast(input_ptr), 60 | reinterpret_cast(mask_ptr), 61 | scale_factor, 62 | seq_len, 63 | seq_len, 64 | batches, 65 | attn_heads, 66 | pad_batches); 67 | return softmax_results; 68 | } 69 | 70 | torch::Tensor bwd_cuda( 71 | torch::Tensor const& output_grads_, 72 | torch::Tensor const& softmax_results_, 73 | float scale_factor) { 74 | 75 | auto output_grads = output_grads_.contiguous(); 76 | auto softmax_results = softmax_results_.contiguous(); 77 | 78 | //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] 79 | const int batches = output_grads.size(0); 80 | const int attn_heads = output_grads.size(1); 81 | const int seq_len = output_grads.size(2); 82 | TORCH_INTERNAL_ASSERT(output_grads.size(2) == output_grads.size(3)); 83 | 84 | void* output_grads_ptr = static_cast(output_grads.data_ptr()); 85 | 86 | //Softmax Grad 87 | dispatch_scaled_masked_softmax_backward( 88 | reinterpret_cast(output_grads_ptr), 89 | reinterpret_cast(output_grads_ptr), 90 | reinterpret_cast(softmax_results.data_ptr()), 91 | scale_factor, 92 | seq_len, 93 | seq_len, 94 | batches, 95 | attn_heads); 96 | 97 | //backward pass is completely in-place 98 | return output_grads; 99 | } 100 | } 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /run_t5.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | GPUS_PER_NODE=4 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=8888 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | export DLWS_NUM_WORKER=${NNODES} 12 | export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE} 13 | 14 | WORKING_DIR=${HOME}/Megatron-LM-3D 15 | DATA_DIR=${HOME}/t5_data 16 | 17 | DATA_PATH="${DATA_DIR}/pretrain_data/baike_small_document" 18 | VOCAB_PATH="${DATA_DIR}/bpe_new" 19 | CHECKPOINT_PATH=checkpoints/t5_test 20 | config_json="${WORKING_DIR}/ds_config_t5.json" 21 | 22 | # Megatron Model Parallelism 23 | mp_size=2 24 | # DeepSpeed Pipeline parallelism 25 | pp_size=1 26 | 27 | NLAYERS=2 28 | NHIDDEN=128 29 | BATCHSIZE=4 30 | GAS=16 31 | LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${pp_size}pp_${mp_size}mp_${BATCHSIZE}b_ds4" 32 | 33 | #Activation Checkpointing and Contiguous Memory 34 | checkpoint_activations=false 35 | chkp_layers=1 36 | PA=true 37 | PA_CPU=false 38 | CC=true 39 | SYNCHRONIZE=true 40 | PROFILE=false 41 | 42 | t5_options=" \ 43 | --model-parallel-size ${mp_size} \ 44 | --pipe-parallel-size ${pp_size} \ 45 | --num-layers $NLAYERS \ 46 | --hidden-size $NHIDDEN \ 47 | --kv-hidden-size 16 \ 48 | --ff-hidden-size 256 \ 49 | --num-attention-heads 8 \ 50 | --enc-seq-length 1024 \ 51 | --dec-seq-length 384 \ 52 | --max-position-embeddings 1024 \ 53 | --batch-size $BATCHSIZE \ 54 | --gas $GAS \ 55 | --train-iters 320000 \ 56 | --lr-decay-iters 320000 \ 57 | --save $CHECKPOINT_PATH \ 58 | --data-path $DATA_PATH \ 59 | --vocab-file $VOCAB_PATH \ 60 | --data-impl mmap \ 61 | --split 949,50,1 \ 62 | --distributed-backend nccl \ 63 | --lr 1.5e-4 \ 64 | --lr-decay-style cosine \ 65 | --min-lr 1.0e-5 \ 66 | --weight-decay 1e-2 \ 67 | --clip-grad 1.0 \ 68 | --warmup 0.01 \ 69 | --log-interval 1 \ 70 | --save-interval 500 \ 71 | --eval-interval 100 \ 72 | --eval-iters 10 \ 73 | --fp16 \ 74 | --tensorboard-dir ${LOGDIR} 75 | " 76 | 77 | deepspeed_options=" \ 78 | --deepspeed \ 79 | --deepspeed_config ${config_json} \ 80 | " 81 | 82 | if [ "${contigious_gradients}" = "true" ]; then 83 | deepspeed_options="${deepspeed_options} \ 84 | --zero-contigious-gradients" 85 | fi 86 | 87 | if [ "${reduce_scatter}" = "true" ]; then 88 | deepspeed_options="${deepspeed_options} \ 89 | --zero-reduce-scatter" 90 | fi 91 | 92 | if [ "${checkpoint_activations}" = "true" ]; then 93 | 94 | chkp_opt=" \ 95 | --checkpoint-activations \ 96 | --checkpoint-num-layers ${chkp_layers}" 97 | 98 | if [ "${PA}" = "true" ]; then 99 | chkp_opt="${chkp_opt} \ 100 | --partition-activations" 101 | fi 102 | 103 | if [ "${PA_CPU}" = "true" ]; then 104 | chkp_opt="${chkp_opt} \ 105 | --checkpoint-in-cpu" 106 | fi 107 | 108 | if [ "${SYNCHRONIZE}" = "true" ]; then 109 | chkp_opt="${chkp_opt} \ 110 | --synchronize-each-layer" 111 | fi 112 | 113 | if [ "${CC}" = "true" ]; then 114 | chkp_opt="${chkp_opt} \ 115 | --contigious-checkpointing" 116 | fi 117 | 118 | if [ "${PROFILE}" = "true" ]; then 119 | chkp_opt="${chkp_opt} \ 120 | --profile-backward" 121 | fi 122 | else 123 | chkp_opt=" " 124 | fi 125 | 126 | full_options="${t5_options} ${deepspeed_options} ${chkp_opt}" 127 | 128 | run_cmd="deepspeed --master_port ${MASTER_PORT} -i node1:4,5,6,7 --hostfile hostfile pretrain_t5.py $@ ${full_options}" 129 | echo ${run_cmd} 130 | eval ${run_cmd} 131 | 132 | set +x 133 | -------------------------------------------------------------------------------- /run_gpt2.sh:
-------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | GPUS_PER_NODE=4 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=8888 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | export DLWS_NUM_WORKER=${NNODES} 12 | export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE} 13 | 14 | WORKING_DIR=${HOME}/Megatron-LM-3D 15 | DATA_DIR=${HOME}/gpt2_data 16 | 17 | DATA_PATH="${DATA_DIR}/CPM-train/data" 18 | # VOCAB_PATH=data/gpt2-vocab.json 19 | # MERGE_PATH=data/gpt2-merges.txt 20 | TOKENIZER_PATH="${DATA_DIR}/bpe_3w_new/vocab.json" 21 | CHECKPOINT_PATH=checkpoints/gpt2_345m_ds 22 | config_json="${WORKING_DIR}/ds_config_gpt2.json" 23 | 24 | # Megatron Model Parallelism 25 | mp_size=2 26 | # DeepSpeed Pipeline parallelism 27 | pp_size=2 28 | 29 | NLAYERS=2 30 | NHIDDEN=256 31 | BATCHSIZE=4 32 | GAS=16 33 | LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${pp_size}pp_${mp_size}mp_${BATCHSIZE}b_ds4" 34 | 35 | 36 | #Activation Checkpointing and Contiguous Memory 37 | checkpoint_activations=false 38 | chkp_layers=1 39 | PA=true 40 | PA_CPU=false 41 | CC=true 42 | SYNCHRONIZE=true 43 | PROFILE=false 44 | 45 | gpt_options=" \ 46 | --model-parallel-size ${mp_size} \ 47 | --pipe-parallel-size ${pp_size} \ 48 | --num-layers $NLAYERS \ 49 | --hidden-size $NHIDDEN \ 50 | --num-attention-heads 16 \ 51 | --kv-hidden-size 16 \ 52 | --ff-hidden-size 1024 \ 53 | --num-attention-heads 16 \ 54 | --seq-length 1024 \ 55 | --max-position-embeddings 1024 \ 56 | --batch-size $BATCHSIZE \ 57 | --gas $GAS \ 58 | --train-iters 320000 \ 59 | --lr-decay-iters 320000 \ 60 | --save $CHECKPOINT_PATH \ 61 | --data-path $DATA_PATH \ 62 | --data-impl mmap \ 63 | --vocab-file $TOKENIZER_PATH \ 64 | --split 949,50,1 \ 65 | --distributed-backend nccl \ 66 | --lr 1.5e-4 \ 67 | --lr-decay-style cosine \ 68 | --min-lr 1.0e-5 \ 69 | --weight-decay 1e-2 \ 70 | --clip-grad 1.0 \ 71 | --warmup 0.01 \ 72 | --checkpoint-activations \ 73 | --log-interval 1 \ 74 | --save-interval 500 \ 75 | --eval-interval 100 \ 76 | --eval-iters 10 \ 77 | --fp16 \ 78 | --hidden-bias \ 79 | --tensorboard-dir ${LOGDIR} 80 | " 81 | 82 | deepspeed_options=" \ 83 | --deepspeed \ 84 | --deepspeed_config ${config_json} \ 85 | " 86 | 87 | if [ "${contigious_gradients}" = "true" ]; then 88 | deepspeed_options="${deepspeed_options} \ 89 | --zero-contigious-gradients" 90 | fi 91 | 92 | if [ "${reduce_scatter}" = "true" ]; then 93 | deepspeed_options="${deepspeed_options} \ 94 | --zero-reduce-scatter" 95 | fi 96 | 97 | if [ "${checkpoint_activations}" = "true" ]; then 98 | 99 | chkp_opt=" \ 100 | --checkpoint-activations \ 101 | --checkpoint-num-layers ${chkp_layers}" 102 | 103 | if [ "${PA}" = "true" ]; then 104 | chkp_opt="${chkp_opt} \ 105 | --partition-activations" 106 | fi 107 | 108 | if [ "${PA_CPU}" = "true" ]; then 109 | chkp_opt="${chkp_opt} \ 110 | --checkpoint-in-cpu" 111 | fi 112 | 113 | if [ "${SYNCHRONIZE}" = "true" ]; then 114 | chkp_opt="${chkp_opt} \ 115 | --synchronize-each-layer" 116 | fi 117 | 118 | if [ "${CC}" = "true" ]; then 119 | chkp_opt="${chkp_opt} \ 120 | --contigious-checkpointing" 121 | fi 122 | 123 | if [ "${PROFILE}" = "true" ]; then 124 | chkp_opt="${chkp_opt} \ 125 | --profile-backward" 126 | fi 127 | else 128 | chkp_opt=" " 129 | fi 130 | 131 | full_options="${gpt_options} ${deepspeed_options} ${chkp_opt}" 132 | 133 | run_cmd="deepspeed --master_port ${MASTER_PORT} -i node1:4,5,6,7 --hostfile hostfile
pretrain_gpt2.py $@ ${full_options}" 134 | echo ${run_cmd} 135 | eval ${run_cmd} 136 | 137 | set +x 138 | -------------------------------------------------------------------------------- /megatron/deprecated_data_utils/scripts/split_json.py: -------------------------------------------------------------------------------- 1 | """ 2 | Takes a corpora of files (specified by `--input_files`) with json data separated 3 | by newlines (loose json). Splits data into train.json, val.json, test.json files 4 | under `output_dir`. 5 | 6 | Note: This code has the potential to override files with the names 7 | train.json, val.json, test.json in `--output_dir`. 8 | """ 9 | import os 10 | import argparse 11 | import math 12 | import random 13 | 14 | parser = argparse.ArgumentParser('resplit loose json data into train/val/test') 15 | parser.add_argument('--input_files', nargs='+', required=True, 16 | help='whitespace separated list of input data files') 17 | parser.add_argument('--output_dir', required=True, 18 | help='output directory where to put files') 19 | parser.add_argument('--test_percent', type=float, nargs='+', default=[0.05, 0], 20 | help='percentage of available data to use for val/test dataset') 21 | args = parser.parse_args() 22 | 23 | 24 | def get_lines(filepath): 25 | lines = [] 26 | with open(filepath, 'r') as f: 27 | for i, l in enumerate(f.readlines()): 28 | l = l.strip() 29 | lines.append(l) 30 | return lines 31 | 32 | 33 | def get_splits(lines, line_counts): 34 | all_lines = [] 35 | line_idx = [] 36 | file_mappings = [] 37 | for i, l in enumerate(lines): 38 | all_lines.extend(l) 39 | line_idx.extend(list(range(len(l)))) 40 | file_mappings.extend([i] * len(l)) 41 | 42 | indices = list(range(len(all_lines))) 43 | random.shuffle(indices) 44 | all_lines = [all_lines[idx] for idx in indices] 45 | line_idx = [line_idx[idx] for idx in indices] 46 | file_mappings = [file_mappings[idx] for idx in indices] 47 | 48 | splits = [] 49 | mappings = [] 50 | start = 0 51 | for end in line_counts: 52 | end += start 53 | splits.append(all_lines[start:end]) 54 | mappings.append(format_mappings(line_idx[start:end], file_mappings[start:end])) 55 | start = end 56 | return splits, mappings 57 | 58 | 59 | def format_mappings(line_idx, file_mappings): 60 | lines = [] 61 | for m, l in zip(file_mappings, line_idx): 62 | lines.append(str(m).strip() + '\t' + str(l).strip()) 63 | return lines 64 | 65 | 66 | def get_filepaths(filepaths, output_dir): 67 | paths = [] 68 | train_path = 'train.json' 69 | dev_path = 'dev.json' 70 | test_path = 'test.json' 71 | paths.append(os.path.join(output_dir, train_path)) 72 | paths.append(os.path.join(output_dir, dev_path)) 73 | paths.append(os.path.join(output_dir, test_path)) 74 | return paths 75 | 76 | 77 | def write_files(lines, mappings, filepaths): 78 | for l, m, path in zip(lines, mappings, filepaths): 79 | write_file(l, path) 80 | write_mapping_file(m, path) 81 | 82 | 83 | def write_file(lines, path): 84 | print('Writing:', path) 85 | with open(path, 'w') as f: 86 | for l in lines: 87 | f.write(l + '\n') 88 | 89 | 90 | def write_mapping_file(m, path): 91 | path = path + '.map' 92 | m = [get_mapping_header()] + m 93 | write_file(m, path) 94 | 95 | 96 | def get_mapping_header(): 97 | return 'file\tline #' 98 | 99 | 100 | if not os.path.exists(args.output_dir): 101 | os.makedirs(args.output_dir) 102 | 103 | lines = [] 104 | 105 | for filepath in args.input_files: 106 | _lines = get_lines(filepath) 107 | lines.append(_lines) 108 | 109 | # calculate number of lines to use for 
each 110 | line_counts = [len(l) for l in lines] 111 | total_lines = sum(line_counts) 112 | dev_percent = args.test_percent[0] 113 | dev_lines = math.ceil(dev_percent * total_lines) 114 | test_percent = 0 115 | if len(args.test_percent) == 2: 116 | test_percent = args.test_percent[1] 117 | test_lines = math.ceil(test_percent * total_lines) 118 | train_lines = total_lines - (test_lines + dev_lines) 119 | normed_lines = [train_lines, dev_lines, test_lines] 120 | normed_lines = [int(l) for l in normed_lines] 121 | 122 | 123 | splits, mappings = get_splits(lines, normed_lines) 124 | filepaths = get_filepaths(args.input_files, args.output_dir) 125 | print('Writing output to:', filepaths) 126 | write_files(splits, mappings, filepaths) 127 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import set_random_seed 17 | from commons import IdentityLayer 18 | from commons import print_separator 19 | from commons import initialize_distributed 20 | from mpu.cross_entropy import vocab_parallel_cross_entropy 21 | import mpu 22 | import torch.nn.functional as F 23 | import torch 24 | import random 25 | import sys 26 | sys.path.append("../..") 27 | 28 | 29 | def torch_cross_entropy(batch_size, seq_length, vocab_size, 30 | logits_scale, seed): 31 | set_random_seed(seed) 32 | identity = IdentityLayer((batch_size, seq_length, vocab_size), 33 | scale=logits_scale).cuda() 34 | logits = identity() 35 | target = torch.cuda.LongTensor( 36 | size=(batch_size, seq_length)).random_(0, vocab_size) 37 | loss = F.cross_entropy(logits.view(-1, logits.size()[-1]), 38 | target.view(-1), 39 | reduction='none').view_as(target).mean() 40 | loss.backward() 41 | return loss, identity.weight.grad 42 | 43 | 44 | def mpu_cross_entropy(batch_size, seq_length, vocab_size, 45 | logits_scale, seed): 46 | set_random_seed(seed) 47 | identity = IdentityLayer((batch_size, seq_length, vocab_size), 48 | scale=logits_scale).cuda() 49 | logits = identity() 50 | logits_parallel = mpu.scatter_to_model_parallel_region(logits) 51 | target = torch.cuda.LongTensor( 52 | size=(batch_size, seq_length)).random_(0, vocab_size) 53 | loss = vocab_parallel_cross_entropy(logits_parallel, target).mean() 54 | loss.backward() 55 | return loss, identity.weight.grad 56 | 57 | 58 | def test_cross_entropy(model_parallel_size): 59 | 60 | if torch.distributed.get_rank() == 0: 61 | print('> testing cross entropy with model parallel size {} ...'. 
62 | format(model_parallel_size)) 63 | 64 | mpu.initialize_model_parallel(model_parallel_size) 65 | model_parallel_size = mpu.get_model_parallel_world_size() 66 | 67 | batch_size = 13 68 | seq_length = 17 69 | vocab_size_per_partition = 11 70 | logits_scale = 1000.0 71 | vocab_size = vocab_size_per_partition * model_parallel_size 72 | seed = 1234 73 | 74 | loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, 75 | vocab_size, logits_scale, 76 | seed) 77 | loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, 78 | vocab_size, logits_scale, 79 | seed) 80 | 81 | error = loss_torch.sub_(loss_mpu).abs().max() 82 | print(' max error in loss on global rank {}: {}'.format( 83 | torch.distributed.get_rank(), error)) 84 | assert error < 1.0e-6 85 | 86 | error = grad_torch.sub_(grad_mpu).abs().max() 87 | print(' max error in grad on global rank {}: {}'.format( 88 | torch.distributed.get_rank(), error)) 89 | assert error < 1.0e-6 90 | 91 | # Reset groups 92 | mpu.destroy_model_parallel() 93 | 94 | torch.distributed.barrier() 95 | if torch.distributed.get_rank() == 0: 96 | print('>> passed the test :-)') 97 | 98 | 99 | if __name__ == '__main__': 100 | 101 | initialize_distributed() 102 | world_size = torch.distributed.get_world_size() 103 | 104 | model_parallel_size = 1 105 | while model_parallel_size <= world_size: 106 | print_separator('test cross entropy') 107 | test_cross_entropy(model_parallel_size) 108 | model_parallel_size *= 2 109 | -------------------------------------------------------------------------------- /megatron/indexer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | 4 | from megatron import get_args 5 | from megatron import mpu 6 | from megatron.checkpointing import load_ict_checkpoint 7 | from megatron.data.ict_dataset import get_ict_dataset 8 | from megatron.data.realm_dataset_utils import get_one_epoch_dataloader 9 | from megatron.data.realm_index import detach, BlockData 10 | from megatron.data.realm_dataset_utils import get_ict_batch 11 | from megatron.model.realm_model import general_ict_model_provider 12 | from megatron.training import get_model 13 | 14 | 15 | class IndexBuilder(object): 16 | """Object for taking one pass over a dataset and creating a BlockData of its embeddings""" 17 | def __init__(self): 18 | args = get_args() 19 | self.model = None 20 | self.dataloader = None 21 | self.block_data = None 22 | 23 | # need to know whether we're using a REALM checkpoint (args.load) or ICT checkpoint 24 | assert not (args.load and args.ict_load) 25 | self.using_realm_chkpt = args.ict_load is None 26 | 27 | self.log_interval = args.indexer_log_interval 28 | self.batch_size = args.indexer_batch_size 29 | 30 | self.load_attributes() 31 | self.is_main_builder = mpu.get_data_parallel_rank() == 0 32 | self.num_total_builders = mpu.get_data_parallel_world_size() 33 | self.iteration = self.total_processed = 0 34 | 35 | def load_attributes(self): 36 | """Load the necessary attributes: model, dataloader and empty BlockData""" 37 | model = get_model(lambda: general_ict_model_provider(only_block_model=True)) 38 | self.model = load_ict_checkpoint(model, only_block_model=True, from_realm_chkpt=self.using_realm_chkpt) 39 | self.model.eval() 40 | self.dataset = get_ict_dataset() 41 | self.dataloader = iter(get_one_epoch_dataloader(self.dataset, self.batch_size)) 42 | self.block_data = BlockData(load_from_path=False) 43 | 44 | def track_and_report_progress(self, batch_size): 45 
| """Utility function for tracking progress""" 46 | self.iteration += 1 47 | self.total_processed += batch_size * self.num_total_builders 48 | if self.is_main_builder and self.iteration % self.log_interval == 0: 49 | print('Batch {:10d} | Total {:10d}'.format(self.iteration, self.total_processed), flush=True) 50 | 51 | def build_and_save_index(self): 52 | """Goes through one epoch of the dataloader and adds all data to this instance's BlockData. 53 | 54 | The copy of BlockData is saved as a shard, which when run in a distributed setting will be 55 | consolidated by the rank 0 process and saved as a final pickled BlockData. 56 | """ 57 | 58 | while True: 59 | try: 60 | # batch also has query_tokens and query_pad_data 61 | _, _, block_tokens, block_pad_mask, block_sample_data = get_ict_batch(self.dataloader) 62 | except (StopIteration, IndexError): 63 | break 64 | 65 | unwrapped_model = self.model 66 | while not hasattr(unwrapped_model, 'embed_block'): 67 | unwrapped_model = unwrapped_model.module 68 | 69 | # detach, separate fields and add to BlockData 70 | block_logits = detach(unwrapped_model.embed_block(block_tokens, block_pad_mask)) 71 | detached_data = detach(block_sample_data) 72 | 73 | # block_sample_data is a 2D array [batch x 4] 74 | # with columns [start_idx, end_idx, doc_idx, block_idx] same as class BlockSampleData 75 | block_indices = detached_data[:, 3] 76 | block_metas = detached_data[:, :3] 77 | 78 | self.block_data.add_block_data(block_indices, block_logits, block_metas) 79 | self.track_and_report_progress(batch_size=block_tokens.shape[0]) 80 | 81 | # This process signals to finalize its shard and then synchronize with the other processes 82 | self.block_data.save_shard() 83 | torch.distributed.barrier() 84 | del self.model 85 | 86 | # rank 0 process builds the final copy 87 | if self.is_main_builder: 88 | self.block_data.merge_shards_and_save() 89 | # make sure that every single piece of data was embedded 90 | assert len(self.block_data.embed_data) == len(self.dataset) 91 | self.block_data.clear() 92 | -------------------------------------------------------------------------------- /megatron/fused_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import pathlib 17 | import subprocess 18 | import os 19 | from torch.utils import cpp_extension 20 | 21 | # Setting this param to a list has a problem of generating 22 | # different compilation commands (with diferent order of architectures) 23 | # and leading to recompilation of fused kernels. 
24 | # set it to empty string to avoid recompilation 25 | # and assign arch flags explicity in extra_cuda_cflags below 26 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 27 | 28 | def get_cuda_bare_metal_version(cuda_dir): 29 | raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], 30 | universal_newlines=True) 31 | output = raw_output.split() 32 | release_idx = output.index("release") + 1 33 | release = output[release_idx].split(".") 34 | bare_metal_major = release[0] 35 | bare_metal_minor = release[1][0] 36 | 37 | return raw_output, bare_metal_major, bare_metal_minor 38 | 39 | def create_build_dir(buildpath): 40 | try: 41 | os.mkdir(buildpath) 42 | except OSError: 43 | if not os.path.isdir(buildpath): 44 | print(f"Creation of the build directory {buildpath} failed") 45 | 46 | def load_scaled_upper_triang_masked_softmax_fusion_kernel(): 47 | 48 | # Check, if CUDA11 is installed for compute capability 8.0 49 | cc_flag = [] 50 | _, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 51 | if int(bare_metal_major) >= 11: 52 | cc_flag.append('-gencode') 53 | cc_flag.append('arch=compute_80,code=sm_80') 54 | 55 | srcpath = pathlib.Path(__file__).parent.absolute() 56 | buildpath = srcpath / 'build' 57 | 58 | create_build_dir(buildpath) 59 | 60 | scaled_upper_triang_masked_softmax_cuda = cpp_extension.load( 61 | name='scaled_upper_triang_masked_softmax_cuda', 62 | sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp', 63 | srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu'], 64 | build_directory=buildpath, 65 | extra_cflags=['-O3',], 66 | extra_cuda_cflags=['-O3', 67 | '-gencode', 'arch=compute_70,code=sm_70', 68 | '-U__CUDA_NO_HALF_OPERATORS__', 69 | '-U__CUDA_NO_HALF_CONVERSIONS__', 70 | '--expt-relaxed-constexpr', 71 | '--expt-extended-lambda', 72 | '--use_fast_math'] + cc_flag) 73 | 74 | def load_scaled_masked_softmax_fusion_kernel(): 75 | 76 | # Check, if CUDA11 is installed for compute capability 8.0 77 | cc_flag = [] 78 | _, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 79 | if int(bare_metal_major) >= 11: 80 | cc_flag.append('-gencode') 81 | cc_flag.append('arch=compute_80,code=sm_80') 82 | 83 | srcpath = pathlib.Path(__file__).parent.absolute() 84 | buildpath = srcpath / 'build' 85 | 86 | create_build_dir(buildpath) 87 | 88 | scaled_upper_triang_masked_softmax_cuda = cpp_extension.load( 89 | name='scaled_masked_softmax_cuda', 90 | sources=[srcpath / 'scaled_masked_softmax.cpp', 91 | srcpath / 'scaled_masked_softmax_cuda.cu'], 92 | build_directory=buildpath, 93 | extra_cflags=['-O3',], 94 | extra_cuda_cflags=['-O3', 95 | '-gencode', 'arch=compute_70,code=sm_70', 96 | '-U__CUDA_NO_HALF_OPERATORS__', 97 | '-U__CUDA_NO_HALF_CONVERSIONS__', 98 | '--expt-relaxed-constexpr', 99 | '--expt-extended-lambda', 100 | '--use_fast_math'] + cc_flag) 101 | -------------------------------------------------------------------------------- /megatron/model/classification.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Classification model.""" 17 | 18 | import torch 19 | 20 | from megatron import get_args, print_rank_0 21 | from megatron.model.bert_model import bert_attention_mask_func, bert_extended_attention_mask, bert_position_ids 22 | from megatron.model.language_model import get_language_model 23 | from megatron.model.utils import get_linear_layer 24 | from megatron.model.utils import init_method_normal 25 | from megatron.model.utils import scaled_init_method_normal 26 | from megatron.module import MegatronModule 27 | 28 | 29 | class Classification(MegatronModule): 30 | 31 | def __init__(self, num_classes, num_tokentypes=2): 32 | super(Classification, self).__init__() 33 | args = get_args() 34 | 35 | self.num_classes = num_classes 36 | init_method = init_method_normal(args.init_method_std) 37 | 38 | self.language_model, self._language_model_key = get_language_model( 39 | attention_mask_func=bert_attention_mask_func, 40 | num_tokentypes=num_tokentypes, 41 | add_pooler=True, 42 | init_method=init_method, 43 | scaled_init_method=scaled_init_method_normal(args.init_method_std, 44 | args.num_layers)) 45 | 46 | # Classification head. 47 | self.classification_dropout = torch.nn.Dropout(args.hidden_dropout) 48 | self.classification_head = get_linear_layer(args.hidden_size, 49 | self.num_classes, 50 | init_method) 51 | self._classification_head_key = 'classification_head' 52 | 53 | def forward(self, input_ids, attention_mask, tokentype_ids): 54 | 55 | extended_attention_mask = bert_extended_attention_mask( 56 | attention_mask, next(self.language_model.parameters()).dtype) 57 | position_ids = bert_position_ids(input_ids) 58 | 59 | _, pooled_output = self.language_model(input_ids, 60 | position_ids, 61 | extended_attention_mask, 62 | tokentype_ids=tokentype_ids) 63 | 64 | # Output. 65 | classification_output = self.classification_dropout(pooled_output) 66 | classification_logits = self.classification_head(classification_output) 67 | 68 | # Reshape logits.
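# (Editor's note, shape sketch assuming the standard BERT-style pooler:
#  pooled_output          -> [batch, hidden_size]
#  classification_output  -> [batch, hidden_size]   after dropout
#  classification_logits  -> [batch, num_classes]
#  so the view(-1, num_classes) below is effectively a no-op for 2-D input.)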
69 | classification_logits = classification_logits.view(-1, self.num_classes) 70 | 71 | return classification_logits 72 | 73 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 74 | keep_vars=False): 75 | """For easy load when model is combined with other heads, 76 | add an extra key.""" 77 | 78 | state_dict_ = {} 79 | state_dict_[self._language_model_key] \ 80 | = self.language_model.state_dict_for_save_checkpoint( 81 | destination, prefix, keep_vars) 82 | state_dict_[self._classification_head_key] \ 83 | = self.classification_head.state_dict( 84 | destination, prefix, keep_vars) 85 | return state_dict_ 86 | 87 | def load_state_dict(self, state_dict, strict=True): 88 | """Customized load.""" 89 | 90 | self.language_model.load_state_dict( 91 | state_dict[self._language_model_key], strict=strict) 92 | if self._classification_head_key in state_dict: 93 | self.classification_head.load_state_dict( 94 | state_dict[self._classification_head_key], strict=strict) 95 | else: 96 | print_rank_0('***WARNING*** could not find {} in the checkpoint, ' 97 | 'initializing to random'.format( 98 | self._classification_head_key)) 99 | -------------------------------------------------------------------------------- /megatron/mpu/data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | from .initialize import get_model_parallel_group 19 | from .initialize import get_model_parallel_rank 20 | from .initialize import get_model_parallel_src_rank 21 | 22 | 23 | _MAX_DATA_DIM = 4 24 | 25 | 26 | def _check_data_types(keys, data, target_dtype): 27 | """Check that all the keys have the same target data type.""" 28 | for key in keys: 29 | assert data[key].dtype == target_dtype, '{} has data type {} which '\ 30 | 'is different than {}'.format(key, data[key].dtype, target_dtype) 31 | 32 | 33 | def _build_key_size_numel_dictionaries(keys, data): 34 | """Build the size on rank 0 and broadcast.""" 35 | max_dim = _MAX_DATA_DIM 36 | sizes = [0 for _ in range(max_dim) for _ in keys] 37 | 38 | # Pack the sizes on rank zero. 39 | if get_model_parallel_rank() == 0: 40 | offset = 0 41 | for key in keys: 42 | assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM' 43 | size = data[key].size() 44 | for i, s in enumerate(size): 45 | sizes[i + offset] = s 46 | offset += max_dim 47 | 48 | # Move to GPU and broadcast. 49 | sizes_cuda = torch.cuda.LongTensor(sizes) 50 | torch.distributed.broadcast(sizes_cuda, get_model_parallel_src_rank(), 51 | group=get_model_parallel_group()) 52 | 53 | # Move back to cpu and unpack. 
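# (Editor's note, worked example -- not part of the original file: with
#  keys = ['text', 'types'] and _MAX_DATA_DIM = 4, rank 0 would pack, e.g.,
#      sizes = [8, 512, 0, 0,  8, 512, 0, 0]
#  i.e. one max_dim-wide slot per key, zero-padded; the unpacking loop below
#  therefore stops at the first non-positive entry of each slot.)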
54 | sizes_cpu = sizes_cuda.cpu() 55 | key_size = {} 56 | key_numel = {} 57 | total_numel = 0 58 | offset = 0 59 | for key in keys: 60 | i = 0 61 | size = [] 62 | numel = 1 63 | while sizes_cpu[offset + i] > 0: 64 | this_size = sizes_cpu[offset + i] 65 | size.append(this_size) 66 | numel *= this_size 67 | i += 1 68 | key_size[key] = size 69 | key_numel[key] = numel 70 | total_numel += numel 71 | offset += max_dim 72 | 73 | return key_size, key_numel, total_numel 74 | 75 | 76 | def broadcast_data(keys, data, datatype): 77 | """Broadcast data from rank zero of each model parallel group to the 78 | members of the same model parallel group. 79 | 80 | Arguments: 81 | keys: list of keys in the data disctionary to be broadcasted 82 | data: data dictionary of string keys and cpu tensor values. 83 | datatype: torch data type of all tensors in data associated 84 | with keys. 85 | """ 86 | # Build (key, size) and (key, number of elements) dictionaries along 87 | # with the total number of elements on all ranks. 88 | key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, 89 | data) 90 | 91 | # Pack on rank zero. 92 | if get_model_parallel_rank() == 0: 93 | # Check that all keys have the same data type. 94 | _check_data_types(keys, data, datatype) 95 | # Flatten the data associated with the keys 96 | flatten_data = torch.cat( 97 | [data[key].contiguous().view(-1) for key in keys], dim=0).cuda() 98 | else: 99 | flatten_data = torch.empty(total_numel, 100 | device=torch.cuda.current_device(), 101 | dtype=datatype) 102 | 103 | # Boradcast 104 | torch.distributed.broadcast(flatten_data, get_model_parallel_src_rank(), 105 | group=get_model_parallel_group()) 106 | 107 | # Unpack 108 | output = {} 109 | offset = 0 110 | for key in keys: 111 | size = key_size[key] 112 | numel = key_numel[key] 113 | output[key] = flatten_data.narrow(0, offset, numel).view(size) 114 | offset += numel 115 | 116 | return output 117 | -------------------------------------------------------------------------------- /megatron/deprecated_data_utils/scripts/split_gpt2_json.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """ 17 | Takes a corpora of files (specified by `--input_files`) with json data separated 18 | by newlines (loose json). Splits data into train.json, val.json, test.json files 19 | under `output_dir`. 20 | 21 | Note: This code has the potential to override files with the names 22 | train.json, val.json, test.json in `--output_dir`. 
23 | """ 24 | import os 25 | import argparse 26 | import math 27 | import random 28 | 29 | parser = argparse.ArgumentParser('resplit loose json data into train/val/test') 30 | parser.add_argument('--input_files', nargs='+', required=True, 31 | help='whitespace separated list of input data files') 32 | parser.add_argument('--output_dir', required=True, 33 | help='output directory where to put files') 34 | parser.add_argument('--test_percent', type=float, nargs='+', default=[0.05, 0], 35 | help='percentage of available data to use for val/test dataset') 36 | args = parser.parse_args() 37 | 38 | 39 | def get_lines(filepath): 40 | lines = [] 41 | with open(filepath, 'r') as f: 42 | for i, l in enumerate(f.readlines()): 43 | l = l.strip() 44 | lines.append(l) 45 | return lines 46 | 47 | 48 | def get_splits(lines, line_counts): 49 | all_lines = [] 50 | line_idx = [] 51 | file_mappings = [] 52 | for i, l in enumerate(lines): 53 | all_lines.extend(l) 54 | line_idx.extend(list(range(len(l)))) 55 | file_mappings.extend([i] * len(l)) 56 | 57 | indices = list(range(len(all_lines))) 58 | random.shuffle(indices) 59 | all_lines = [all_lines[idx] for idx in indices] 60 | line_idx = [line_idx[idx] for idx in indices] 61 | file_mappings = [file_mappings[idx] for idx in indices] 62 | 63 | splits = [] 64 | mappings = [] 65 | start = 0 66 | for end in line_counts: 67 | end += start 68 | splits.append(all_lines[start:end]) 69 | mappings.append(format_mappings(line_idx[start:end], file_mappings[start:end])) 70 | start = end 71 | return splits, mappings 72 | 73 | 74 | def format_mappings(line_idx, file_mappings): 75 | lines = [] 76 | for m, l in zip(file_mappings, line_idx): 77 | lines.append(str(m).strip() + '\t' + str(l).strip()) 78 | return lines 79 | 80 | 81 | def get_filepaths(filepaths, output_dir): 82 | paths = [] 83 | train_path = 'train.json' 84 | dev_path = 'dev.json' 85 | test_path = 'test.json' 86 | paths.append(os.path.join(output_dir, train_path)) 87 | paths.append(os.path.join(output_dir, dev_path)) 88 | paths.append(os.path.join(output_dir, test_path)) 89 | return paths 90 | 91 | 92 | def write_files(lines, mappings, filepaths): 93 | for l, m, path in zip(lines, mappings, filepaths): 94 | write_file(l, path) 95 | write_mapping_file(m, path) 96 | 97 | 98 | def write_file(lines, path): 99 | print('Writing:', path) 100 | with open(path, 'w') as f: 101 | for l in lines: 102 | f.write(l + '\n') 103 | 104 | 105 | def write_mapping_file(m, path): 106 | path = path + '.map' 107 | m = [get_mapping_header()] + m 108 | write_file(m, path) 109 | 110 | 111 | def get_mapping_header(): 112 | return 'file\tline #' 113 | 114 | 115 | if not os.path.exists(args.output_dir): 116 | os.makedirs(args.output_dir) 117 | 118 | lines = [] 119 | 120 | for filepath in args.input_files: 121 | _lines = get_lines(filepath) 122 | lines.append(_lines) 123 | 124 | # calculate number of lines to use for each 125 | line_counts = [len(l) for l in lines] 126 | total_lines = sum(line_counts) 127 | dev_percent = args.test_percent[0] 128 | dev_lines = math.ceil(dev_percent * total_lines) 129 | test_percent = 0 130 | if len(args.test_percent) == 2: 131 | test_percent = args.test_percent[1] 132 | test_lines = math.ceil(test_percent * total_lines) 133 | train_lines = total_lines - (test_lines + dev_lines) 134 | normed_lines = [train_lines, dev_lines, test_lines] 135 | normed_lines = [int(l) for l in normed_lines] 136 | 137 | 138 | splits, mappings = get_splits(lines, normed_lines) 139 | filepaths = get_filepaths(args.input_files, 
args.output_dir) 140 | print('Writing output to:', filepaths) 141 | write_files(splits, mappings, filepaths) 142 | -------------------------------------------------------------------------------- /megatron/mpu/mappings.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | from .initialize import get_model_parallel_group, get_model_parallel_world_size, get_model_parallel_rank 19 | from .utils import split_tensor_along_last_dim 20 | 21 | 22 | def _reduce(input_): 23 | """All-reduce the the input tensor across model parallel group.""" 24 | 25 | # Bypass the function if we are using only 1 GPU. 26 | if get_model_parallel_world_size()==1: 27 | return input_ 28 | 29 | # All-reduce. 30 | torch.distributed.all_reduce(input_, group=get_model_parallel_group()) 31 | 32 | return input_ 33 | 34 | 35 | def _split(input_): 36 | """Split the tensor along its last dimension and keep the 37 | corresponding slice.""" 38 | 39 | world_size = get_model_parallel_world_size() 40 | # Bypass the function if we are using only 1 GPU. 41 | if world_size==1: 42 | return input_ 43 | 44 | # Split along last dimension. 45 | input_list = split_tensor_along_last_dim(input_, world_size) 46 | 47 | # Note: torch.split does not create contiguous tensors by default. 48 | rank = get_model_parallel_rank() 49 | output = input_list[rank].contiguous() 50 | 51 | return output 52 | 53 | 54 | def _gather(input_): 55 | """Gather tensors and concatinate along the last dimension.""" 56 | 57 | world_size = get_model_parallel_world_size() 58 | # Bypass the function if we are using only 1 GPU. 59 | if world_size==1: 60 | return input_ 61 | 62 | # Size and dimension. 63 | last_dim = input_.dim() - 1 64 | rank = get_model_parallel_rank() 65 | 66 | tensor_list = [torch.empty_like(input_) for _ in range(world_size)] 67 | tensor_list[rank] = input_ 68 | torch.distributed.all_gather(tensor_list, input_, group=get_model_parallel_group()) 69 | 70 | # Note: torch.cat already creates a contiguous tensor. 
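# (Editor's note, illustrative only: with world_size = 2 and a per-rank
#  tensor of shape [batch, seq, hidden/2], all_gather fills tensor_list with
#  both slices and the concatenation below restores the full
#  [batch, seq, hidden] tensor on every rank.)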
71 | output = torch.cat(tensor_list, dim=last_dim).contiguous() 72 | 73 | return output 74 | 75 | 76 | class _CopyToModelParallelRegion(torch.autograd.Function): 77 | """Pass the input to the model parallel region.""" 78 | 79 | @staticmethod 80 | def symbolic(graph, input_): 81 | return input_ 82 | 83 | @staticmethod 84 | def forward(ctx, input_): 85 | return input_ 86 | 87 | @staticmethod 88 | def backward(ctx, grad_output): 89 | return _reduce(grad_output) 90 | 91 | 92 | class _ReduceFromModelParallelRegion(torch.autograd.Function): 93 | """All-redcue the input from the model parallel region.""" 94 | 95 | @staticmethod 96 | def symbolic(graph, input_): 97 | return _reduce(input_) 98 | 99 | @staticmethod 100 | def forward(ctx, input_): 101 | return _reduce(input_) 102 | 103 | @staticmethod 104 | def backward(ctx, grad_output): 105 | return grad_output 106 | 107 | 108 | class _ScatterToModelParallelRegion(torch.autograd.Function): 109 | """Split the input and keep only the corresponding chuck to the rank.""" 110 | 111 | @staticmethod 112 | def symbolic(graph, input_): 113 | return _split(input_) 114 | 115 | @staticmethod 116 | def forward(ctx, input_): 117 | return _split(input_) 118 | 119 | @staticmethod 120 | def backward(ctx, grad_output): 121 | return _gather(grad_output) 122 | 123 | 124 | class _GatherFromModelParallelRegion(torch.autograd.Function): 125 | """Gather the input from model parallel region and concatinate.""" 126 | 127 | @staticmethod 128 | def symbolic(graph, input_): 129 | return _gather(input_) 130 | 131 | @staticmethod 132 | def forward(ctx, input_): 133 | return _gather(input_) 134 | 135 | @staticmethod 136 | def backward(ctx, grad_output): 137 | return _split(grad_output) 138 | 139 | 140 | # ----------------- 141 | # Helper functions. 142 | # ----------------- 143 | 144 | def copy_to_model_parallel_region(input_): 145 | return _CopyToModelParallelRegion.apply(input_) 146 | 147 | 148 | def reduce_from_model_parallel_region(input_): 149 | return _ReduceFromModelParallelRegion.apply(input_) 150 | 151 | 152 | def scatter_to_model_parallel_region(input_): 153 | return _ScatterToModelParallelRegion.apply(input_) 154 | 155 | 156 | def gather_from_model_parallel_region(input_): 157 | return _GatherFromModelParallelRegion.apply(input_) 158 | -------------------------------------------------------------------------------- /megatron/model/multiple_choice.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
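# (Editor's note added for clarity: this head scores each candidate answer
#  independently -- inputs of shape [batch, choices, seq] are flattened to
#  [batch * choices, seq], run through the shared language model, and the
#  per-choice scores are reshaped back to [batch, choices].)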
15 | 16 | """Multiple choice model.""" 17 | 18 | import torch 19 | 20 | from megatron import get_args, print_rank_0 21 | from megatron.model.bert_model import bert_attention_mask_func, bert_extended_attention_mask, bert_position_ids 22 | from megatron.model.language_model import get_language_model 23 | from megatron.model.utils import get_linear_layer 24 | from megatron.model.utils import init_method_normal 25 | from megatron.model.utils import scaled_init_method_normal 26 | from megatron.module import MegatronModule 27 | 28 | 29 | class MultipleChoice(MegatronModule): 30 | 31 | def __init__(self, num_tokentypes=2): 32 | super(MultipleChoice, self).__init__() 33 | args = get_args() 34 | 35 | init_method = init_method_normal(args.init_method_std) 36 | 37 | self.language_model, self._language_model_key = get_language_model( 38 | attention_mask_func=bert_attention_mask_func, 39 | num_tokentypes=num_tokentypes, 40 | add_pooler=True, 41 | init_method=init_method, 42 | scaled_init_method=scaled_init_method_normal(args.init_method_std, 43 | args.num_layers)) 44 | 45 | # Multi-choice head. 46 | self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout) 47 | self.multichoice_head = get_linear_layer(args.hidden_size, 1, 48 | init_method) 49 | self._multichoice_head_key = 'multichoice_head' 50 | 51 | def forward(self, input_ids, attention_mask, tokentype_ids): 52 | 53 | # [batch, choices, sequence] --> [batch * choices, sequence] --> 54 | # transformer --> [batch, choices] --> softmax 55 | 56 | # Ensure the shape is [batch-size, choices, sequence] 57 | assert len(input_ids.shape) == 3 58 | assert len(attention_mask.shape) == 3 59 | assert len(tokentype_ids.shape) == 3 60 | 61 | # Reshape and treat choice dimension the same as batch. 62 | num_choices = input_ids.shape[1] 63 | input_ids = input_ids.view(-1, input_ids.size(-1)) 64 | attention_mask = attention_mask.view(-1, attention_mask.size(-1)) 65 | tokentype_ids = tokentype_ids.view(-1, tokentype_ids.size(-1)) 66 | 67 | extended_attention_mask = bert_extended_attention_mask( 68 | attention_mask, next(self.language_model.parameters()).dtype) 69 | position_ids = bert_position_ids(input_ids) 70 | 71 | _, pooled_output = self.language_model(input_ids, 72 | position_ids, 73 | extended_attention_mask, 74 | tokentype_ids=tokentype_ids) 75 | 76 | # Output. 77 | multichoice_output = self.multichoice_dropout(pooled_output) 78 | multichoice_logits = self.multichoice_head(multichoice_output) 79 | 80 | # Reshape back to separate choices. 
81 | multichoice_logits = multichoice_logits.view(-1, num_choices) 82 | 83 | return multichoice_logits 84 | 85 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 86 | keep_vars=False): 87 | """For easy load when model is combined with other heads, 88 | add an extra key.""" 89 | 90 | state_dict_ = {} 91 | state_dict_[self._language_model_key] \ 92 | = self.language_model.state_dict_for_save_checkpoint( 93 | destination, prefix, keep_vars) 94 | state_dict_[self._multichoice_head_key] \ 95 | = self.multichoice_head.state_dict( 96 | destination, prefix, keep_vars) 97 | return state_dict_ 98 | 99 | def load_state_dict(self, state_dict, strict=True): 100 | """Customized load.""" 101 | 102 | self.language_model.load_state_dict( 103 | state_dict[self._language_model_key], strict=strict) 104 | if self._multichoice_head_key in state_dict: 105 | self.multichoice_head.load_state_dict( 106 | state_dict[self._multichoice_head_key], strict=strict) 107 | else: 108 | print_rank_0('***WARNING*** could not find {} in the checkpoint, ' 109 | 'initializing to random'.format( 110 | self._multichoice_head_key)) 111 | -------------------------------------------------------------------------------- /megatron/data/test/test_indexed_dataset.py: -------------------------------------------------------------------------------- 1 | # This file isn't really a formal automated test, it's just a place to 2 | # put some code used during development and manual testing of 3 | # indexed_dataset. 4 | 5 | from megatron.data import indexed_dataset 6 | from megatron.tokenizer import build_tokenizer 7 | import argparse 8 | import os 9 | import sys 10 | 11 | import torch 12 | 13 | script_dir = os.path.dirname(os.path.realpath(__file__)) 14 | sys.path.append(os.path.join(script_dir, "../../../")) 15 | 16 | 17 | def test_indexed_dataset(args): 18 | ds = indexed_dataset.make_dataset(args.data, args.dataset_impl) 19 | tokenizer = build_tokenizer(args) 20 | print(len(ds.doc_idx)) 21 | print(len(ds)) 22 | print(ds.doc_idx[-1]) 23 | if ds.supports_prefetch: 24 | # just prefetch the whole thing in test (so assume it is small) 25 | ds.prefetch(range(len(ds))) 26 | if args.count > len(ds.doc_idx) - 1: 27 | args.count = len(ds.doc_idx) - 1 28 | 29 | for i in range(args.count): 30 | start = ds.doc_idx[i] 31 | end = ds.doc_idx[i + 1] 32 | ids = ds[start:end] 33 | print(f"Document {i}:") 34 | print("--------------") 35 | for s in ids: 36 | assert len(s) > 0 37 | l = s.data.tolist() 38 | text = tokenizer.detokenize(l) 39 | print(text) 40 | print("---") 41 | 42 | 43 | def test_indexed_dataset_get(args): 44 | ds = indexed_dataset.make_dataset(args.data, args.dataset_impl) 45 | tokenizer = build_tokenizer(args) 46 | size = ds.sizes[0] 47 | print(f"size: {size}") 48 | full = ds.get(0) 49 | print(full) 50 | # print(tokenizer.detokenize(full.data.tolist())) 51 | print("---") 52 | end = ds.get(0, offset=size - 10) 53 | print(end) 54 | # print(tokenizer.detokenize(end.data.tolist())) 55 | 56 | start = ds.get(0, length=10) 57 | print(start) 58 | # print(tokenizer.detokenize(start.data.tolist())) 59 | 60 | part = ds.get(0, offset=2, length=8) 61 | print(part) 62 | # print(tokenizer.detokenize(part.data.tolist())) 63 | 64 | # def test_albert_dataset(args): 65 | # # tokenizer = FullBertTokenizer(args.vocab, do_lower_case=True) 66 | # # idataset = indexed_dataset.make_dataset(args.data, args.dataset_impl) 67 | # # ds = AlbertDataset(idataset, tokenizer) 68 | # ds = AlbertDataset.from_paths(args.vocab, args.data, args.dataset_impl, 69 | # 
args.epochs, args.max_num_samples, 70 | # args.masked_lm_prob, args.seq_length, 71 | # args.short_seq_prob, args.seed) 72 | # truncated = 0 73 | # total = 0 74 | # for i, s in enumerate(ds): 75 | # ids = s['text'] 76 | # tokens = ds.tokenizer.convert_ids_to_tokens(ids) 77 | # print(tokens) 78 | # if i >= args.count-1: 79 | # exit() 80 | 81 | 82 | def main(): 83 | parser = argparse.ArgumentParser() 84 | parser.add_argument('--data', type=str, help='prefix to data files') 85 | parser.add_argument('--dataset-impl', type=str, default='infer', 86 | choices=['lazy', 'cached', 'mmap', 'infer']) 87 | parser.add_argument('--count', type=int, default=10, 88 | help='Number of samples/documents to print') 89 | 90 | group = parser.add_argument_group(title='tokenizer') 91 | group.add_argument('--tokenizer-type', type=str, required=True, 92 | choices=['BertWordPieceLowerCase', 93 | 'GPT2BPETokenizer'], 94 | help='What type of tokenizer to use.') 95 | group.add_argument('--vocab-file', type=str, default=None, 96 | help='Path to the vocab file') 97 | group.add_argument('--merge-file', type=str, default=None, 98 | help='Path to the BPE merge file (if necessary).') 99 | 100 | parser.add_argument('--epochs', type=int, default=5, 101 | help='Number of epochs to plan for') 102 | parser.add_argument('--max-num-samples', type=int, default=None, 103 | help='Maximum number of samples to plan for') 104 | parser.add_argument('--masked-lm-prob', type=float, default=0.15, 105 | help='probability of masking tokens') 106 | parser.add_argument('--seq-length', type=int, default=512, 107 | help='maximum sequence length') 108 | parser.add_argument('--short-seq-prob', type=float, default=0.1, 109 | help='probability of creating a short sequence') 110 | parser.add_argument('--seed', type=int, default=1234, 111 | help='random seed') 112 | args = parser.parse_args() 113 | args.rank = 0 114 | args.make_vocab_size_divisible_by = 128 115 | args.model_parallel_size = 1 116 | 117 | if args.dataset_impl == "infer": 118 | args.dataset_impl = indexed_dataset.infer_dataset_impl(args.data) 119 | 120 | # test_albert_dataset(args) 121 | test_indexed_dataset_get(args) 122 | 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /megatron/mpu/cross_entropy.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | from .initialize import get_model_parallel_group 20 | from .initialize import get_model_parallel_rank 21 | from .initialize import get_model_parallel_world_size 22 | from .utils import VocabUtility 23 | 24 | 25 | class _VocabParallelCrossEntropy(torch.autograd.Function): 26 | 27 | @staticmethod 28 | def forward(ctx, vocab_parallel_logits, target): 29 | 30 | # Maximum value along vocab dimension across all GPUs. 
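# (Editor's note, derivation -- not part of the original file: the loss
#  computed in this forward pass is the usual log-softmax cross entropy,
#      loss = log(sum_j exp(logit_j)) - logit_target,
#  evaluated vocab-parallel: each rank owns a slice of the vocabulary, so the
#  max (for numerical stability), the target logit and the sum of
#  exponentials are each combined across ranks with an all-reduce.)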
31 | logits_max = torch.max(vocab_parallel_logits, dim=-1)[0] 32 | torch.distributed.all_reduce(logits_max, 33 | op=torch.distributed.ReduceOp.MAX, 34 | group=get_model_parallel_group()) 35 | # Subtract the maximum value. 36 | vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1)) 37 | 38 | # Get the partition's vocab indecies 39 | get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size 40 | partition_vocab_size = vocab_parallel_logits.size()[-1] 41 | rank = get_model_parallel_rank() 42 | world_size = get_model_parallel_world_size() 43 | vocab_start_index, vocab_end_index = get_vocab_range( 44 | partition_vocab_size, rank, world_size) 45 | 46 | # Create a mask of valid vocab ids (1 means it needs to be masked). 47 | target_mask = (target < vocab_start_index) | (target >= vocab_end_index) 48 | masked_target = target.clone() - vocab_start_index 49 | masked_target[target_mask] = 0 50 | 51 | # Get predicted-logits = logits[target]. 52 | # For Simplicity, we convert logits to a 2-D tensor with size 53 | # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. 54 | logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size) 55 | masked_target_1d = masked_target.view(-1) 56 | arange_1d = torch.arange(start=0, end=logits_2d.size()[0], 57 | device=logits_2d.device) 58 | predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] 59 | predicted_logits_1d = predicted_logits_1d.clone().contiguous() 60 | predicted_logits = predicted_logits_1d.view_as(target) 61 | predicted_logits[target_mask] = 0.0 62 | # All reduce is needed to get the chunks from other GPUs. 63 | torch.distributed.all_reduce(predicted_logits, 64 | op=torch.distributed.ReduceOp.SUM, 65 | group=get_model_parallel_group()) 66 | 67 | # Sum of exponential of logits along vocab dimension across all GPUs. 68 | exp_logits = vocab_parallel_logits 69 | torch.exp(vocab_parallel_logits, out=exp_logits) 70 | sum_exp_logits = exp_logits.sum(dim=-1) 71 | torch.distributed.all_reduce(sum_exp_logits, 72 | op=torch.distributed.ReduceOp.SUM, 73 | group=get_model_parallel_group()) 74 | 75 | # Loss = log(sum(exp(logits))) - predicted-logit. 76 | loss = torch.log(sum_exp_logits) - predicted_logits 77 | 78 | # Store softmax, target-mask and masked-target for backward pass. 79 | exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) 80 | ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) 81 | 82 | return loss 83 | 84 | @staticmethod 85 | def backward(ctx, grad_output): 86 | 87 | # Retreive tensors from the forward path. 88 | softmax, target_mask, masked_target_1d = ctx.saved_tensors 89 | 90 | # All the inputs have softmax as thier gradient. 91 | grad_input = softmax 92 | # For simplicity, work with the 2D gradient. 93 | partition_vocab_size = softmax.size()[-1] 94 | grad_2d = grad_input.view(-1, partition_vocab_size) 95 | 96 | # Add the gradient from matching classes. 97 | arange_1d = torch.arange(start=0, end=grad_2d.size()[0], 98 | device=grad_2d.device) 99 | grad_2d[arange_1d, masked_target_1d] -= ( 100 | 1.0 - target_mask.view(-1).float()) 101 | 102 | # Finally elementwise multiplication with the output gradients. 
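# (Editor's note: the update above realizes
#      d(loss)/d(logit_j) = softmax_j - 1[j == target]
#  restricted to the locally owned vocab slice; the multiplication below then
#  scales it by the incoming grad_output, as required by the chain rule.)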
103 | grad_input.mul_(grad_output.unsqueeze(dim=-1)) 104 | 105 | return grad_input, None 106 | 107 | 108 | def vocab_parallel_cross_entropy(vocab_parallel_logits, target): 109 | """Helper function for the cross entropy.""" 110 | return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) 111 | -------------------------------------------------------------------------------- /megatron/mpu/grads.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | # Parts of the code here are adapted from PyTorch 18 | # repo: https://github.com/pytorch/pytorch 19 | 20 | 21 | import torch 22 | from torch._six import inf 23 | 24 | try: 25 | from apex.multi_tensor_apply import multi_tensor_applier 26 | import amp_C 27 | 28 | except Exception as e: 29 | print('WARNING: APEX is not installed, multi_tensor_applier will not be available.') 30 | 31 | from .initialize import get_model_parallel_group 32 | from .initialize import get_model_parallel_rank 33 | 34 | 35 | def l2_grad_clipper(parameters, max_norm): 36 | """Efficient L2 norm gradient clipping.""" 37 | 38 | overflow_buf = torch.zeros(1, dtype=torch.int, device='cuda') 39 | # Make sure we have an iterable. 40 | if isinstance(parameters, torch.Tensor): 41 | parameters = [parameters] 42 | # Filter parameters with gradients. 43 | parameters_with_grads = list(filter( 44 | lambda p: p.grad is not None, parameters)) 45 | # Filter parameters for norm calculations. 46 | mp_rank_is_zero = (get_model_parallel_rank() == 0) 47 | parameters_for_norm = list(filter( 48 | lambda p: p.model_parallel or mp_rank_is_zero, parameters_with_grads)) 49 | # Calculate L2 norm. 50 | norm, _ = multi_tensor_applier( 51 | amp_C.multi_tensor_l2norm, 52 | overflow_buf, 53 | [parameters_for_norm], 54 | False # no per-parameter norm 55 | ) 56 | # Sum across all model parallel GPUs. 57 | norm_2 = norm * norm 58 | torch.distributed.all_reduce(norm_2, 59 | op=torch.distributed.ReduceOp.SUM, 60 | group=get_model_parallel_group()) 61 | total_norm = norm_2.item() ** 0.5 62 | # Scale to get max_norm. 63 | clip_coef = float(max_norm) / (total_norm + 1.0e-6) 64 | grads = [p.grad for p in parameters_with_grads] 65 | if clip_coef < 1.0: 66 | multi_tensor_applier( 67 | amp_C.multi_tensor_scale, 68 | overflow_buf, 69 | [grads, grads], 70 | clip_coef) 71 | return total_norm 72 | 73 | 74 | def clip_grad_norm(parameters, max_norm, norm_type=2): 75 | """Clips gradient norm of an iterable of parameters. 76 | 77 | This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and 78 | added functionality to handle model parallel parameters. Note that 79 | the gradients are modified in place. 
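    For norm_type 2 the total norm is total_norm = (sum_p ||p.grad||_2^2)^(1/2),
    with the per-parameter contributions summed across model-parallel ranks;
    every gradient is then scaled by max_norm / (total_norm + 1e-6) whenever
    that coefficient is below one. (Editor's note added for clarity.)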
80 | 81 | Arguments: 82 | parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a 83 | single Tensor that will have gradients normalized 84 | max_norm (float or int): max norm of the gradients 85 | norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for 86 | infinity norm. 87 | 88 | Returns: 89 | Total norm of the parameters (viewed as a single vector). 90 | """ 91 | if isinstance(parameters, torch.Tensor): 92 | parameters = [parameters] 93 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 94 | max_norm = float(max_norm) 95 | norm_type = float(norm_type) 96 | if norm_type == inf: 97 | total_norm = max(p.grad.data.abs().max() for p in parameters) 98 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) 99 | # Take max across all GPUs. 100 | torch.distributed.all_reduce(total_norm_cuda, 101 | op=torch.distributed.ReduceOp.MAX, 102 | group=get_model_parallel_group()) 103 | total_norm = total_norm_cuda[0].item() 104 | clip_coef = max_norm / (total_norm + 1e-6) 105 | if clip_coef < 1: 106 | for p in parameters: 107 | p.grad.data.mul_(clip_coef) 108 | #elif norm_type == 2: 109 | # total_norm = l2_grad_clipper(parameters, max_norm) 110 | 111 | else: 112 | total_norm = 0 113 | for p in parameters: 114 | if p.model_parallel or (get_model_parallel_rank() == 0): 115 | param_norm = p.grad.data.norm(norm_type) 116 | total_norm += param_norm.item() ** norm_type 117 | # Sum across all model parallel GPUs. 118 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) 119 | torch.distributed.all_reduce(total_norm_cuda, 120 | op=torch.distributed.ReduceOp.SUM, 121 | group=get_model_parallel_group()) 122 | total_norm = total_norm_cuda[0].item() ** (1. / norm_type) 123 | clip_coef = max_norm / (total_norm + 1e-6) 124 | if clip_coef < 1: 125 | for p in parameters: 126 | p.grad.data.mul_(clip_coef) 127 | return total_norm 128 | -------------------------------------------------------------------------------- /megatron/learning_rates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Learning rate decay functions.""" 17 | 18 | import math 19 | 20 | from megatron import print_rank_0 21 | 22 | 23 | class AnnealingLR(object): 24 | """Anneals the learning rate.""" 25 | 26 | def __init__(self, optimizer, start_lr, 27 | warmup_iter, total_iters, 28 | decay_style, last_iter, min_lr=0.0, 29 | use_checkpoint_lr_scheduler=True, 30 | override_lr_scheduler=False): 31 | 32 | # Class values. 
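# (Editor's note, worked example -- not original code: with start_lr=1.5e-4,
#  warmup_iter=3200, total_iters=320000 and decay_style='cosine', the first
#  3200 steps ramp the lr linearly up to start_lr, after which
#      lr(t) = 0.5 * start_lr * (cos(pi * (t - warmup_iter) / total_iters) + 1),
#  floored at min_lr.)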
33 | self.optimizer = optimizer 34 | self.start_lr = start_lr 35 | self.min_lr = min_lr 36 | self.warmup_iter = warmup_iter 37 | self.num_iters = last_iter 38 | self.end_iter = total_iters 39 | assert self.end_iter > 0 40 | self.decay_style = decay_style 41 | self.override_lr_scheduler = override_lr_scheduler 42 | self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler 43 | if self.override_lr_scheduler: 44 | assert not self.use_checkpoint_lr_scheduler, 'both override and '\ 45 | 'use-checkpoint are set.' 46 | # Set the learning rate 47 | self.step(self.num_iters) 48 | 49 | print_rank_0('> learning rate decay style: {}'.format(self.decay_style)) 50 | 51 | def get_lr(self): 52 | """Learning rate decay functions from: 53 | https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" 54 | 55 | num_iters_ = min(self.num_iters, self.end_iter - self.warmup_iter) 56 | # Warmup. 57 | if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter: 58 | return float(self.start_lr) * num_iters_ / self.warmup_iter 59 | 60 | num_iters_ = num_iters_ - self.warmup_iter 61 | if self.decay_style == 'linear': 62 | lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter 63 | elif self.decay_style == 'cosine': 64 | lr = self.start_lr / 2.0 * (math.cos( 65 | math.pi * num_iters_ / self.end_iter) + 1) 66 | elif self.decay_style == 'exponential': 67 | # exp(-0.693) = 1/2 68 | lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter) 69 | else: 70 | lr = self.start_lr 71 | return max(lr, self.min_lr) 72 | 73 | def step(self, step_num=None): 74 | """Set lr for all parameters groups.""" 75 | if step_num is None: 76 | step_num = self.num_iters + 1 77 | self.num_iters = step_num 78 | new_lr = self.get_lr() 79 | for group in self.optimizer.param_groups: 80 | group['lr'] = new_lr 81 | 82 | def state_dict(self): 83 | state_dict = { 84 | 'start_lr': self.start_lr, 85 | 'warmup_iter': self.warmup_iter, 86 | 'num_iters': self.num_iters, 87 | 'decay_style': self.decay_style, 88 | 'end_iter': self.end_iter, 89 | 'min_lr': self.min_lr 90 | } 91 | return state_dict 92 | 93 | def _check_and_set(self, cls_value, sd_value, name): 94 | """Auxiliary function for checking the values in the checkpoint and 95 | setting them.""" 96 | if self.override_lr_scheduler: 97 | print_rank_0(' > overriding {} value to {}'.format(name, cls_value)) 98 | return cls_value 99 | 100 | if not self.use_checkpoint_lr_scheduler: 101 | assert cls_value == sd_value, 'AnnealingLR: class input value' \ 102 | 'and checkpoint values for {} do not match'.format(name) 103 | print_rank_0(' > using checkpoint value {} for {}'.format(sd_value, 104 | name)) 105 | return sd_value 106 | 107 | def load_state_dict(self, sd): 108 | 109 | self.start_lr = self._check_and_set(self.start_lr, sd['start_lr'], 110 | 'learning rate') 111 | self.min_lr = self._check_and_set(self.min_lr, sd['min_lr'], 112 | 'minimum learning rate') 113 | self.warmup_iter = self._check_and_set(self.warmup_iter, 114 | sd['warmup_iter'], 115 | 'warmup iterations') 116 | self.end_iter = self._check_and_set(self.end_iter, sd['end_iter'], 117 | 'total number of iterations') 118 | self.decay_style = self._check_and_set(self.decay_style, 119 | sd['decay_style'], 120 | 'decay style') 121 | 122 | self.num_iters = sd['num_iters'] 123 | self.step(self.num_iters) 124 | -------------------------------------------------------------------------------- /megatron/model/distributed.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 
2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors 18 | import torch.distributed as dist 19 | from torch.nn.modules import Module 20 | from torch.autograd import Variable 21 | 22 | from megatron import mpu 23 | from megatron.module import MegatronModule 24 | 25 | 26 | class DistributedDataParallel(MegatronModule): 27 | 28 | def __init__(self, module): 29 | super(DistributedDataParallel, self).__init__() 30 | self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False 31 | 32 | self.module = module 33 | self.data_parallel_group = mpu.get_data_parallel_group() 34 | 35 | def allreduce_params(reduce_after=True, no_scale=False, fp32_allreduce=False): 36 | if(self.needs_reduction): 37 | self.needs_reduction = False 38 | buckets = {} 39 | for name, param in self.module.named_parameters(): 40 | if param.requires_grad and param.grad is not None: 41 | tp = (param.data.type()) 42 | if tp not in buckets: 43 | buckets[tp] = [] 44 | buckets[tp].append(param) 45 | if self.warn_on_half: 46 | if torch.cuda.HalfTensor in buckets: 47 | print("WARNING: gloo dist backend for half parameters may be extremely slow." 
+ 48 | " It is recommended to use the NCCL backend in this case.") 49 | self.warn_on_half = False 50 | for tp in buckets: 51 | bucket = buckets[tp] 52 | grads = [param.grad.data for param in bucket] 53 | coalesced = _flatten_dense_tensors(grads) 54 | if fp32_allreduce: 55 | coalesced = coalesced.float() 56 | if not no_scale and not reduce_after: 57 | coalesced /= dist.get_world_size(group=self.data_parallel_group) 58 | dist.all_reduce(coalesced, group=self.data_parallel_group) 59 | torch.cuda.synchronize() 60 | if not no_scale and reduce_after: 61 | coalesced /= dist.get_world_size(group=self.data_parallel_group) 62 | for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): 63 | buf.copy_(synced) 64 | self.hook_handles = [] 65 | self.hooks = [] 66 | for param in list(self.module.parameters()): 67 | def allreduce_hook(*unused): 68 | Variable._execution_engine.queue_callback(allreduce_params) 69 | # handle = param.register_hook(allreduce_hook) 70 | # self.hooks.append(allreduce_hook) 71 | # self.hook_handles.append(handle) 72 | self.allreduce_params = allreduce_params 73 | 74 | def forward(self, *inputs, **kwargs): 75 | self.needs_reduction = True 76 | return self.module(*inputs, **kwargs) 77 | 78 | def state_dict(self, destination=None, prefix='', keep_vars=False): 79 | #[h.remove() for h in self.hook_handles] 80 | sd = self.module.state_dict(destination, prefix, keep_vars) 81 | # for handle, hook in zip(self.hook_handles, self.hooks): 82 | # d = handle.hooks_dict_ref() 83 | # d[handle.id] = hook 84 | 85 | return sd 86 | 87 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 88 | keep_vars=False): 89 | return self.module.state_dict_for_save_checkpoint(destination, prefix, 90 | keep_vars) 91 | 92 | def load_state_dict(self, state_dict, strict=True): 93 | self.module.load_state_dict(state_dict, strict=strict) 94 | 95 | ''' 96 | def _sync_buffers(self): 97 | buffers = list(self.module._all_buffers()) 98 | if len(buffers) > 0: 99 | # cross-node buffer sync 100 | flat_buffers = _flatten_dense_tensors(buffers) 101 | dist.broadcast(flat_buffers, 0) 102 | for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): 103 | buf.copy_(synced) 104 | def train(self, mode=True): 105 | # Clear NCCL communicator and CUDA event cache of the default group ID, 106 | # These cache will be recreated at the later call. This is currently a 107 | # work-around for a potential NCCL deadlock. 108 | if dist._backend == dist.dist_backend.NCCL: 109 | dist._clear_group_cache() 110 | super(DistributedDataParallel, self).train(mode) 111 | self.module.train(mode) 112 | ''' 113 | -------------------------------------------------------------------------------- /megatron/model/fused_softmax.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import torch 17 | 18 | class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function) : 19 | """ 20 | Fused operation which performs following three operations in sequence 21 | 1. Scale the tensor. 22 | 2. Apply upper triangular mask (typically used in gpt models). 23 | 3. Perform softmax. 24 | """ 25 | @staticmethod 26 | def forward(ctx, inputs, scale): 27 | import scaled_upper_triang_masked_softmax_cuda 28 | scale_t = torch.tensor([scale]) 29 | 30 | softmax_results = \ 31 | scaled_upper_triang_masked_softmax_cuda.forward(inputs, scale_t[0]) 32 | ctx.save_for_backward(softmax_results, scale_t) 33 | return softmax_results 34 | 35 | @staticmethod 36 | def backward(ctx, output_grads): 37 | import scaled_upper_triang_masked_softmax_cuda 38 | softmax_results, scale_t = ctx.saved_tensors 39 | 40 | input_grads = \ 41 | scaled_upper_triang_masked_softmax_cuda.backward(output_grads, 42 | softmax_results, 43 | scale_t[0]) 44 | return input_grads, None 45 | 46 | class ScaledMaskedSoftmax(torch.autograd.Function) : 47 | """ 48 | Fused operation which performs following three operations in sequence 49 | 1. Scale the tensor. 50 | 2. Apply the mask. 51 | 3. Perform softmax. 52 | """ 53 | @staticmethod 54 | def forward(ctx, inputs, mask, scale): 55 | import scaled_masked_softmax_cuda 56 | scale_t = torch.tensor([scale]) 57 | 58 | softmax_results = \ 59 | scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0]) 60 | ctx.save_for_backward(softmax_results, scale_t) 61 | return softmax_results 62 | 63 | @staticmethod 64 | def backward(ctx, output_grads): 65 | import scaled_masked_softmax_cuda 66 | softmax_results, scale_t = ctx.saved_tensors 67 | 68 | input_grads = \ 69 | scaled_masked_softmax_cuda.backward(output_grads, 70 | softmax_results, 71 | scale_t[0]) 72 | return input_grads, None, None 73 | 74 | class FusedScaleMaskSoftmax(torch.nn.Module): 75 | """ 76 | fused operation: scaling + mask + softmax 77 | Arguments: 78 | input_in_fp16: flag to indicate if input in fp16 data format. 79 | upper_triang_mask: if true, apply upper triangular masking. 80 | (used in gpt family networks) 81 | mask_func: mask function to be applied. 82 | softmax_in_fp32: if true, softmax in performed at fp32 precision. 83 | scale: scaling factor used in input tensor scaling. 
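        Note: the fused CUDA kernel path in forward() is used only when the input
        is fp16, one of the two fusion flags is set, the softmax dimension is at
        most 2048, and the last two dimensions are equal (square attention scores);
        otherwise the unfused mask_func + torch.nn.Softmax fallback below is taken.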
84 | 85 | """ 86 | def __init__(self, input_in_fp16, upper_triang_mask_fusion, 87 | general_mask_fusion, mask_func, softmax_in_fp32, scale): 88 | super(FusedScaleMaskSoftmax, self).__init__() 89 | self.input_in_fp16 = input_in_fp16 90 | self.upper_triang_mask_fusion = upper_triang_mask_fusion 91 | self.general_mask_fusion = general_mask_fusion 92 | self.mask_func = mask_func 93 | self.softmax_in_fp32 = softmax_in_fp32 94 | self.scale = scale 95 | 96 | assert self.scale is None or softmax_in_fp32, \ 97 | 'softmax should be in fp32 when scaled' 98 | 99 | def forward(self, input, mask): 100 | # [b, np, s, s] 101 | data_size = input.size() 102 | assert input.dim() == 4 103 | 104 | # invoke custom kernel 105 | if self.input_in_fp16 and data_size[-1] <= 2048 and \ 106 | (self.upper_triang_mask_fusion or self.general_mask_fusion) and \ 107 | input.size()[2] == input.size()[3]: 108 | scale = self.scale if self.scale is not None else 1.0 109 | if self.upper_triang_mask_fusion: 110 | input = input.view(-1, data_size[2], data_size[3]) 111 | probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale) 112 | probs = probs.view(*data_size) 113 | else: 114 | probs = ScaledMaskedSoftmax.apply(input, mask, scale) 115 | else: 116 | if self.input_in_fp16 and self.softmax_in_fp32: 117 | input = input.float() 118 | 119 | if self.scale is not None: 120 | input = input * self.scale 121 | mask_output = self.mask_func(input, mask) 122 | probs = torch.nn.Softmax(dim=-1)(mask_output) 123 | 124 | if self.input_in_fp16 and self.softmax_in_fp32: 125 | probs = probs.half() 126 | 127 | return probs 128 | -------------------------------------------------------------------------------- /megatron/memory.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | 20 | # A dictionary of all the memory buffers allocated. 21 | _MEM_BUFFS = dict() 22 | 23 | 24 | def allocate_mem_buff(name, numel, dtype, track_usage): 25 | """Allocate a memory buffer.""" 26 | assert name not in _MEM_BUFFS, \ 27 | 'memory buffer {} already allocated.'.format(name) 28 | _MEM_BUFFS[name] = MemoryBuffer(name, numel, dtype, track_usage) 29 | return _MEM_BUFFS[name] 30 | 31 | 32 | def get_mem_buff(name): 33 | """Get the memory buffer.""" 34 | return _MEM_BUFFS[name] 35 | 36 | 37 | class MemoryBuffer: 38 | """Contiguous memory buffer. 39 | Allocate a contiguous memory of type `dtype` and size `numel`. It is 40 | used to reduce memory fragmentation. 41 | 42 | Usage: After the allocation, the `_start` index is set tot the first 43 | index of the memory. A memory chunk starting from `_start` index 44 | can be `allocated` for an input tensor, with the elements of the 45 | tensor being coppied. The buffer can be reused by resetting the 46 | `_start` index. 
47 | 48 | """ 49 | def __init__(self, name, numel, dtype, track_usage): 50 | if torch.distributed.get_rank() == 0: 51 | element_size = torch.tensor([], dtype=dtype).element_size() 52 | print('> building the {} memory buffer with {} num elements ' 53 | 'and {} dtype ({:.1f} MB)...'.format( 54 | name, numel, dtype, numel*element_size/1024/1024), 55 | flush=True) 56 | self.name = name 57 | self.numel = numel 58 | self.dtype = dtype 59 | self.data = torch.empty(self.numel, 60 | dtype=self.dtype, 61 | device=torch.cuda.current_device(), 62 | requires_grad=False) 63 | 64 | # Index tracking the start of the free memory. 65 | self._start = 0 66 | 67 | # Values used for tracking usage. 68 | self.track_usage = track_usage 69 | if self.track_usage: 70 | self.in_use_value = 0.0 71 | self.total_value = 0.0 72 | 73 | 74 | def reset(self): 75 | """Reset the buffer start index to the beginning of the buffer.""" 76 | self._start = 0 77 | 78 | 79 | def is_in_use(self): 80 | """Whether the current buffer hold on to any memory.""" 81 | return self._start > 0 82 | 83 | 84 | def numel_in_use(self): 85 | """Return number of elements in use.""" 86 | return self._start 87 | 88 | 89 | def add(self, tensor): 90 | """Allocate a chunk of memory from the buffer to tensor and copy 91 | the values.""" 92 | assert tensor.dtype == self.dtype, \ 93 | 'Input tensor type {} different from buffer type {}'.format( 94 | tensor.dtype, self.dtype) 95 | # Number of elements of the input tensor. 96 | tensor_numel = torch.numel(tensor) 97 | new_start = self._start + tensor_numel 98 | assert new_start <= self.numel, \ 99 | 'Not enough memory left in the buffer ({} > {})'.format( 100 | tensor_numel, self.numel - self._start) 101 | # New tensor is a view into the memory. 102 | new_tensor = self.data[self._start:new_start] 103 | self._start = new_start 104 | new_tensor = new_tensor.view(tensor.shape) 105 | new_tensor.copy_(tensor) 106 | # Return a pointer to the new tensor. 107 | return new_tensor 108 | 109 | 110 | def get_data(self): 111 | """Return the data currently in use.""" 112 | if self.track_usage: 113 | self.in_use_value += float(self._start) 114 | self.total_value += float(self.numel) 115 | return self.data[:self._start] 116 | 117 | 118 | def print_average_usage(self): 119 | """Print memory usage average over time. We would like this value 120 | to be as high as possible.""" 121 | assert self.track_usage, 'You need to enable track usage.' 122 | if torch.distributed.get_rank() == 0: 123 | print(' > usage of {} memory buffer: {:.2f} %'.format( 124 | self.name, self.in_use_value * 100.0 / self.total_value), 125 | flush=True) 126 | 127 | 128 | 129 | class RingMemBuffer: 130 | """A ring of memory buffers.""" 131 | 132 | def __init__(self, name, num_buffers, numel, dtype, track_usage): 133 | self.num_buffers = num_buffers 134 | self.buffers = [ 135 | allocate_mem_buff(name+' {}'.format(i), numel, dtype, track_usage) 136 | for i in range(num_buffers)] 137 | self._index = -1 138 | 139 | 140 | def get_next_buffer(self): 141 | self._index += 1 142 | self._index = self._index % self.num_buffers 143 | buff = self.buffers[self._index] 144 | assert not buff.is_in_use(), 'buffer is already in use.' 145 | return buff 146 | -------------------------------------------------------------------------------- /pretrain_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Pretrain GPT2""" 17 | 18 | import torch 19 | 20 | from megatron import get_args 21 | from megatron import print_rank_0 22 | from megatron import get_timers 23 | from megatron import get_tokenizer 24 | from megatron import mpu 25 | from megatron.data.gpt2_dataset import build_train_valid_test_datasets 26 | from megatron.model import GPT2Model, GPT2ModelPipe 27 | from megatron.training import pretrain 28 | from megatron.utils import get_ltor_masks_and_position_ids 29 | from megatron.utils import reduce_losses 30 | from megatron.fp16 import fp32_to_fp16 31 | 32 | # pretend this is a great DeepSpeed change too 33 | 34 | def model_provider(): 35 | """Build the model.""" 36 | args = get_args() 37 | print_rank_0('building GPT2 model ...') 38 | if args.pipe_parallel_size == 0: 39 | model = GPT2Model(num_tokentypes=0, parallel_output=True) 40 | else: 41 | model = GPT2ModelPipe(num_tokentypes=0, parallel_output=True, topology=mpu.get_topology()) 42 | model._megatron_batch_fn = get_batch_pipe 43 | model._input_grad = [True, False] 44 | model._input_type = ['float', 'bool'] 45 | model._input_pipe_partitioned = [True, False] 46 | return model 47 | 48 | def get_batch(data_iterator): 49 | """Generate a batch""" 50 | args = get_args() 51 | tokenizer = get_tokenizer() 52 | 53 | # Items and their type. 54 | keys = ['text'] 55 | datatype = torch.int64 56 | 57 | # Broadcast data. 58 | if data_iterator is not None: 59 | data = next(data_iterator) 60 | else: 61 | data = None 62 | data_b = mpu.broadcast_data(keys, data, datatype) 63 | 64 | # Unpack. 65 | tokens_ = data_b['text'].long() 66 | labels = tokens_[:, 1:].contiguous() 67 | tokens = tokens_[:, :-1].contiguous() 68 | 69 | # Get the masks and postition ids. 70 | attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( 71 | tokens, 72 | tokenizer.eod, 73 | args.reset_position_ids, 74 | args.reset_attention_mask, 75 | args.eod_mask_loss) 76 | 77 | return tokens, labels, loss_mask, attention_mask, position_ids 78 | 79 | def get_batch_pipe(data): 80 | """A modification of get_batch() to work with the latest batch instead of an iterator. """ 81 | args = get_args() 82 | tokenizer = get_tokenizer() 83 | 84 | # Items and their type. 85 | keys = ['text'] 86 | datatype = torch.int64 87 | 88 | # Broadcast data. 89 | data_b = mpu.broadcast_data(keys, data, datatype) 90 | 91 | # Unpack. 92 | tokens_ = data_b['text'].long() 93 | labels = tokens_[:, 1:].contiguous() 94 | tokens = tokens_[:, :-1].contiguous() 95 | 96 | # Get the masks and postition ids. 97 | attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( 98 | tokens, 99 | tokenizer.eod, 100 | args.reset_position_ids, 101 | args.reset_attention_mask, 102 | args.eod_mask_loss) 103 | 104 | # unpack data 105 | if args.fp16: 106 | # cast to fp16 because pipeline parallelism skips the FP16 wrapper. 
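            # The pipeline engine consumes the tuples returned here directly, stage
            # to stage, so any float tensors must already be half precision here.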
107 | return fp32_to_fp16((tokens, position_ids, attention_mask)), fp32_to_fp16((labels, loss_mask)) 108 | else: 109 | return (tokens, position_ids, attention_mask), (labels, loss_mask) 110 | 111 | 112 | def forward_step(data_iterator, model): 113 | """Forward step.""" 114 | args = get_args() 115 | timers = get_timers() 116 | 117 | # Get the batch. 118 | timers('batch generator').start() 119 | tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data_iterator) 120 | timers('batch generator').stop() 121 | # Forward model. 122 | losses = model(tokens, position_ids, attention_mask, labels=labels) 123 | loss_mask = loss_mask.view(-1) 124 | loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() 125 | 126 | # Reduce loss for logging. 127 | reduced_loss = reduce_losses([loss]) 128 | 129 | return loss, {'lm loss': reduced_loss[0]} 130 | 131 | 132 | def train_valid_test_datasets_provider(train_val_test_num_samples): 133 | """Build train, valid, and test datasets.""" 134 | args = get_args() 135 | 136 | print_rank_0('> building train, validation, and test datasets ' 137 | 'for GPT2 ...') 138 | train_ds, valid_ds, test_ds = build_train_valid_test_datasets( 139 | data_prefix=args.data_path, 140 | data_impl=args.data_impl, 141 | splits_string=args.split, 142 | train_valid_test_num_samples=train_val_test_num_samples, 143 | seq_length=args.seq_length, 144 | seed=args.seed, 145 | skip_warmup=(not args.mmap_warmup)) 146 | print_rank_0("> finished creating GPT2 datasets ...") 147 | return train_ds, valid_ds, test_ds 148 | 149 | 150 | if __name__ == "__main__": 151 | pretrain(train_valid_test_datasets_provider, model_provider, forward_step, 152 | args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) 153 | -------------------------------------------------------------------------------- /megatron/deprecated_data_utils/tf_dl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
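#
# Usage sketch (illustrative; the record path and sizes are placeholders):
#
#     loader = TFRecordDataLoader(['/path/to/bert_shard.tfrecord'],
#                                 batch_size=32, max_seq_len=512,
#                                 max_preds_per_seq=80, train=True)
#     for batch in loader:
#         # each batch is a dict of torch tensors with keys:
#         # 'text', 'types', 'is_random', 'pad_mask', 'mask', 'mask_labels'
#         ...
#
# Note: this module targets the TensorFlow 1.x API (eager mode enabled explicitly,
# tf.contrib.data) and lives under deprecated_data_utils.
#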
15 | """PyTorch DataLoader for TFRecords""" 16 | 17 | import numpy as np 18 | import torch 19 | import queue 20 | import threading 21 | 22 | import tensorflow as tf 23 | tf.enable_eager_execution() 24 | 25 | 26 | class TFRecordDataLoader(object): 27 | def __init__(self, records, batch_size, max_seq_len, max_preds_per_seq, 28 | train, num_workers=2, seed=1, threaded_dl=False): 29 | assert max_preds_per_seq is not None, "--max-preds-per-seq MUST BE SPECIFIED when using tfrecords" 30 | tf.set_random_seed(seed) 31 | if isinstance(records, str): 32 | records = [records] 33 | 34 | self.record_converter = Record2Example({"input_ids": tf.FixedLenFeature([max_seq_len], tf.int64), 35 | "input_mask": tf.FixedLenFeature([max_seq_len], tf.int64), 36 | "segment_ids": tf.FixedLenFeature([max_seq_len], tf.int64), 37 | "masked_lm_positions": tf.FixedLenFeature([max_preds_per_seq], tf.int64), 38 | "masked_lm_ids": tf.FixedLenFeature([max_preds_per_seq], tf.int64), 39 | "masked_lm_weights": tf.FixedLenFeature([max_preds_per_seq], tf.float32), 40 | "next_sentence_labels": tf.FixedLenFeature([1], tf.int64)}) 41 | 42 | # Instantiate dataset according to original BERT implementation 43 | if train: 44 | self.dataset = tf.data.Dataset.from_tensor_slices(tf.constant(records)) 45 | self.dataset = self.dataset.repeat() 46 | self.dataset = self.dataset.shuffle(buffer_size=len(records)) 47 | 48 | # use sloppy tfrecord dataset 49 | self.dataset = self.dataset.apply( 50 | tf.contrib.data.parallel_interleave( 51 | tf.data.TFRecordDataset, 52 | sloppy=train, 53 | cycle_length=min(num_workers, len(records)))) 54 | self.dataset = self.dataset.shuffle(buffer_size=100) 55 | else: 56 | self.dataset = tf.data.TFRecordDataset(records) 57 | self.dataset = self.dataset.repeat() 58 | 59 | # Instantiate dataloader (do not drop remainder for eval) 60 | loader_args = {'batch_size': batch_size, 61 | 'num_parallel_batches': num_workers, 62 | 'drop_remainder': train} 63 | self.dataloader = self.dataset.apply( 64 | tf.contrib.data.map_and_batch( 65 | self.record_converter, **loader_args)) 66 | self.threaded_dl = threaded_dl 67 | self.num_workers = num_workers 68 | 69 | def __iter__(self): 70 | if self.threaded_dl: 71 | data_iter = iter(MultiprocessLoader(self.dataloader, self.num_workers)) 72 | for item in data_iter: 73 | yield item 74 | else: 75 | data_iter = iter(self.dataloader) 76 | for item in data_iter: 77 | yield convert_tf_example_to_torch_tensors(item) 78 | 79 | 80 | class Record2Example(object): 81 | def __init__(self, feature_map): 82 | self.feature_map = feature_map 83 | 84 | def __call__(self, record): 85 | """Decodes a BERT TF record to a TF example.""" 86 | example = tf.parse_single_example(record, self.feature_map) 87 | for k, v in list(example.items()): 88 | if v.dtype == tf.int64: 89 | example[k] = tf.to_int32(v) 90 | return example 91 | 92 | 93 | def convert_tf_example_to_torch_tensors(example): 94 | item = {k: (v.numpy()) for k, v in example.items()} 95 | mask = np.zeros_like(item['input_ids']) 96 | mask_labels = np.ones_like(item['input_ids']) * -1 97 | for b, row in enumerate(item['masked_lm_positions'].astype(int)): 98 | for i, idx in enumerate(row): 99 | if item['masked_lm_weights'][b, i] != 0: 100 | mask[b, idx] = 1 101 | mask_labels[b, idx] = item['masked_lm_ids'][b, i] 102 | output = {'text': item['input_ids'], 'types': item['segment_ids'], 'is_random': item['next_sentence_labels'], 103 | 'pad_mask': 1 - item['input_mask'], 'mask': mask, 'mask_labels': mask_labels} 104 | return {k: torch.from_numpy(v) for k, v in 
output.items()} 105 | 106 | 107 | class MultiprocessLoader(object): 108 | def __init__(self, dataloader, num_workers=2): 109 | self.dl = dataloader 110 | self.queue_size = 2 * num_workers 111 | 112 | def __iter__(self): 113 | output_queue = queue.Queue(self.queue_size) 114 | output_thread = threading.Thread(target=_multiproc_iter, 115 | args=(self.dl, output_queue)) 116 | output_thread.daemon = True 117 | output_thread.start() 118 | 119 | while output_thread.is_alive(): 120 | yield output_queue.get(block=True) 121 | else: 122 | print(RuntimeError('TF record data loader thread exited unexpectedly')) 123 | 124 | 125 | def _multiproc_iter(dl, output_queue): 126 | data_iter = iter(dl) 127 | for item in data_iter: 128 | tensors = convert_tf_example_to_torch_tensors(item) 129 | output_queue.put(tensors, block=True) 130 | -------------------------------------------------------------------------------- /megatron/data/ict_dataset.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import random 3 | 4 | import numpy as np 5 | from torch.utils.data import Dataset 6 | 7 | from megatron import get_tokenizer 8 | from megatron import get_args 9 | from megatron.data.dataset_utils import get_indexed_dataset_ 10 | from megatron.data.realm_dataset_utils import get_block_samples_mapping 11 | 12 | 13 | def get_ict_dataset(use_titles=True, query_in_block_prob=1): 14 | """Get a dataset which uses block samples mappings to get ICT/block indexing data (via get_block()) 15 | rather than for training, since it is only built with a single epoch sample mapping. 16 | """ 17 | args = get_args() 18 | block_dataset = get_indexed_dataset_(args.data_path, 'mmap', True) 19 | titles_dataset = get_indexed_dataset_(args.titles_data_path, 'mmap', True) 20 | 21 | kwargs = dict( 22 | name='full', 23 | block_dataset=block_dataset, 24 | title_dataset=titles_dataset, 25 | data_prefix=args.data_path, 26 | num_epochs=1, 27 | max_num_samples=None, 28 | max_seq_length=args.seq_length, 29 | seed=1, 30 | query_in_block_prob=query_in_block_prob, 31 | use_titles=use_titles, 32 | use_one_sent_docs=args.use_one_sent_docs 33 | ) 34 | dataset = ICTDataset(**kwargs) 35 | return dataset 36 | 37 | 38 | class ICTDataset(Dataset): 39 | """Dataset containing sentences and their blocks for an inverse cloze task.""" 40 | def __init__(self, name, block_dataset, title_dataset, data_prefix, 41 | num_epochs, max_num_samples, max_seq_length, query_in_block_prob, 42 | seed, use_titles=True, use_one_sent_docs=False): 43 | self.name = name 44 | self.seed = seed 45 | self.max_seq_length = max_seq_length 46 | self.query_in_block_prob = query_in_block_prob 47 | self.block_dataset = block_dataset 48 | self.title_dataset = title_dataset 49 | self.rng = random.Random(self.seed) 50 | self.use_titles = use_titles 51 | self.use_one_sent_docs = use_one_sent_docs 52 | 53 | self.samples_mapping = get_block_samples_mapping( 54 | block_dataset, title_dataset, data_prefix, num_epochs, 55 | max_num_samples, max_seq_length, seed, name, use_one_sent_docs) 56 | self.tokenizer = get_tokenizer() 57 | self.vocab_id_list = list(self.tokenizer.inv_vocab.keys()) 58 | self.vocab_id_to_token_list = self.tokenizer.inv_vocab 59 | self.cls_id = self.tokenizer.cls 60 | self.sep_id = self.tokenizer.sep 61 | self.mask_id = self.tokenizer.mask 62 | self.pad_id = self.tokenizer.pad 63 | 64 | def __len__(self): 65 | return len(self.samples_mapping) 66 | 67 | def __getitem__(self, idx): 68 | """Get an ICT example of a pseudo-query and the 
block of text from which it was extracted""" 69 | sample_data = self.samples_mapping[idx] 70 | start_idx, end_idx, doc_idx, block_idx = sample_data.as_tuple() 71 | 72 | if self.use_titles: 73 | title = self.title_dataset[int(doc_idx)] 74 | title_pad_offset = 3 + len(title) 75 | else: 76 | title = None 77 | title_pad_offset = 2 78 | block = [self.block_dataset[i] for i in range(start_idx, end_idx)] 79 | assert len(block) > 1 or self.use_one_sent_docs or self.query_in_block_prob == 1 80 | 81 | # randint() is inclusive for Python rng 82 | rand_sent_idx = self.rng.randint(0, len(block) - 1) 83 | 84 | # keep the query in the context query_in_block_prob fraction of the time. 85 | if self.rng.random() < self.query_in_block_prob: 86 | query = block[rand_sent_idx].copy() 87 | else: 88 | query = block.pop(rand_sent_idx) 89 | 90 | # still need to truncate because blocks are concluded when 91 | # the sentence lengths have exceeded max_seq_length. 92 | query = query[:self.max_seq_length - 2] 93 | block = list(itertools.chain(*block))[:self.max_seq_length - title_pad_offset] 94 | 95 | query_tokens, query_pad_mask = self.concat_and_pad_tokens(query) 96 | block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title) 97 | block_data = sample_data.as_array() 98 | 99 | sample = { 100 | 'query_tokens': query_tokens, 101 | 'query_pad_mask': query_pad_mask, 102 | 'block_tokens': block_tokens, 103 | 'block_pad_mask': block_pad_mask, 104 | 'block_data': block_data, 105 | } 106 | 107 | return sample 108 | 109 | def get_block(self, start_idx, end_idx, doc_idx): 110 | """Get the IDs for an evidence block plus the title of the corresponding document""" 111 | block = [self.block_dataset[i] for i in range(start_idx, end_idx)] 112 | title = self.title_dataset[int(doc_idx)] 113 | 114 | block = list(itertools.chain(*block))[:self.max_seq_length - (3 + len(title))] 115 | block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title) 116 | 117 | return block_tokens, block_pad_mask 118 | 119 | def get_null_block(self): 120 | """Get empty block and title - used in REALM pretraining""" 121 | block, title = [], [] 122 | block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title) 123 | 124 | return block_tokens, block_pad_mask 125 | 126 | def concat_and_pad_tokens(self, tokens, title=None): 127 | """Concat with special tokens and pad sequence to self.max_seq_length""" 128 | tokens = list(tokens) 129 | if title is None: 130 | tokens = [self.cls_id] + tokens + [self.sep_id] 131 | else: 132 | title = list(title) 133 | tokens = [self.cls_id] + title + [self.sep_id] + tokens + [self.sep_id] 134 | assert len(tokens) <= self.max_seq_length 135 | 136 | num_pad = self.max_seq_length - len(tokens) 137 | pad_mask = [1] * len(tokens) + [0] * num_pad 138 | tokens += [self.pad_id] * num_pad 139 | 140 | return np.array(tokens), np.array(pad_mask) 141 | -------------------------------------------------------------------------------- /megatron/deprecated_data_utils/samplers.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """batch samplers that work with either random or sequential data samplers""" 16 | import math 17 | import os 18 | import sys 19 | 20 | import torch 21 | from torch.utils import data 22 | import numpy as np 23 | 24 | 25 | class RandomSampler(data.sampler.Sampler): 26 | r""" 27 | Based off of pytorch RandomSampler and DistributedSampler. Essentially a RandomSampler, 28 | but this class lets the user set an epoch like DistributedSampler 29 | Samples elements randomly. If without replacement, then sample from a shuffled dataset. 30 | If with replacement, then user can specify ``num_samples`` to draw. 31 | Arguments: 32 | data_source (Dataset): dataset to sample from 33 | num_samples (int): number of samples to draw, default=len(dataset) 34 | replacement (bool): samples are drawn with replacement if ``True``, default=False 35 | """ 36 | 37 | def __init__(self, data_source, replacement=False, num_samples=None): 38 | self.data_source = data_source 39 | self.replacement = replacement 40 | self._num_samples = num_samples 41 | self.epoch = -1 42 | 43 | if self._num_samples is not None and replacement is False: 44 | raise ValueError("With replacement=False, num_samples should not be specified, " 45 | "since a random permute will be performed.") 46 | 47 | if not isinstance(self.num_samples, int) or self.num_samples <= 0: 48 | raise ValueError("num_samples should be a positive integer " 49 | "value, but got num_samples={}".format(self.num_samples)) 50 | if not isinstance(self.replacement, bool): 51 | raise ValueError("replacement should be a boolean value, but got " 52 | "replacement={}".format(self.replacement)) 53 | 54 | @property 55 | def num_samples(self): 56 | # dataset size might change at runtime 57 | if self._num_samples is None: 58 | return len(self.data_source) 59 | return self._num_samples 60 | 61 | def __iter__(self): 62 | n = len(self.data_source) 63 | g = torch.Generator() 64 | if self.epoch >= 0: 65 | g.manual_seed(self.epoch) 66 | if self.replacement: 67 | return iter(torch.randint(high=n, size=(self.num_samples,), 68 | dtype=torch.int64, generator=g).tolist()) 69 | return iter(torch.randperm(n, generator=g).tolist()) 70 | 71 | def __len__(self): 72 | return self.num_samples 73 | 74 | def set_epoch(self, epoch): 75 | self.epoch = epoch 76 | 77 | 78 | class DistributedBatchSampler(data.sampler.BatchSampler): 79 | """ 80 | similar to normal implementation of distributed sampler, except implementation is at the 81 | batch sampler level, instead of just the sampler level. This allows wrapping of arbitrary 82 | data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler. 
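    Each global batch of ``batch_size`` indices is split contiguously across ranks:
    rank ``r`` receives ``batch[r * batch_size // world_size : (r + 1) * batch_size // world_size]``
    (see ``_batch`` below).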
83 | """ 84 | 85 | def __init__(self, sampler, batch_size, drop_last, rank=-1, world_size=2, wrap_last=False): 86 | super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last) 87 | if rank == -1: 88 | assert False, 'should not be here' 89 | rank = torch.distributed.get_rank() 90 | self.rank = rank 91 | self.world_size = world_size 92 | self.sampler.wrap_around = 0 93 | self.wrap_around = 0 94 | self.wrap_last = wrap_last 95 | self.start_iter = 0 96 | 97 | def __iter__(self): 98 | batch = [] 99 | last_batch = None 100 | i = 0 101 | for idx in self.data_iterator(self.sampler, wrap_around=False): 102 | batch.append(idx) 103 | if len(batch) == self.batch_size: 104 | tbatch = self._batch(batch) 105 | if i >= self.start_iter: 106 | yield tbatch 107 | self.start_iter = 0 108 | i += 1 109 | last_batch = np.array(list(tbatch)) 110 | batch = [] 111 | batch_len = len(batch) 112 | if batch_len > 0 and not self.drop_last: 113 | if self.wrap_last: 114 | self.sampler.wrap_around -= (self.batch_size) 115 | self.wrap_around += (len(batch)) 116 | self.wrap_around %= self.batch_size 117 | if isinstance(self.sampler, TransposedSampler): 118 | for i, idx in enumerate(self.data_iterator(self.sampler, wrap_around=True)): 119 | if i == 0: 120 | continue 121 | batch.append(idx) 122 | new_batch_len = len(batch) 123 | if len(batch) == self.batch_size: 124 | break 125 | yield self._batch(batch) 126 | if self.wrap_last: 127 | self.sampler.wrap_around += self.batch_size 128 | 129 | def data_iterator(self, _iter, wrap_around=False): 130 | """iterates through data and handles wrap around""" 131 | for i, idx in enumerate(_iter): 132 | if i < self.wrap_around % self.batch_size: 133 | continue 134 | if wrap_around: 135 | self.wrap_around += 1 136 | self.wrap_around %= self.batch_size 137 | yield idx 138 | 139 | def _batch(self, batch): 140 | """extracts samples only pertaining to this worker's batch""" 141 | start = self.rank * self.batch_size // self.world_size 142 | end = (self.rank + 1) * self.batch_size // self.world_size 143 | return batch[start:end] 144 | -------------------------------------------------------------------------------- /megatron/deprecated_data_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """utils for creating datasets""" 16 | import os 17 | import math 18 | 19 | import torch 20 | 21 | from .samplers import DistributedBatchSampler 22 | from .datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, bert_sentencepair_dataset, GPT2Dataset 23 | from .lazy_loader import exists_lazy, make_lazy, lazy_array_loader 24 | from .tokenization import Tokenization, CommandToken, Tokenizer, CharacterLevelTokenizer, BertWordPieceTokenizer, GPT2BPETokenizer, make_tokenizer 25 | from . 
import corpora 26 | 27 | TRAIN_DATA = 0 28 | VAL_DATA = 1 29 | TEST_DATA = 2 30 | 31 | 32 | def should_split(split): 33 | """ 34 | given split proportions checks if should split 35 | Examples: 36 | >>> should_split([10,0,0]) 37 | False 38 | >>> should_split([1,.1,.2]) 39 | True 40 | """ 41 | return max(split) / sum(split) != 1. 42 | 43 | 44 | def get_ext(path): 45 | """gets path extension""" 46 | return os.path.splitext(path)[1] 47 | 48 | 49 | def get_dataset(path, **kwargs): 50 | """gets dataset object based on keyword args and file at `path`""" 51 | if supported_corpus(path): 52 | return corpora.NAMED_CORPORA[path](**kwargs) 53 | ext = get_ext(path) 54 | if '.json' in ext: 55 | text = json_dataset(path, **kwargs) 56 | elif ext in ['.csv', '.tsv']: 57 | text = csv_dataset(path, **kwargs) 58 | else: 59 | raise NotImplementedError('data file type %s is not supported' % (ext)) 60 | return text 61 | 62 | 63 | def supported_corpus(corpus_name): 64 | """checks if corpus name is defined in `corpora.py`""" 65 | return corpus_name in corpora.NAMED_CORPORA 66 | 67 | 68 | def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.], 69 | delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None, 70 | tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None, 71 | model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None, 72 | parallel_group=None, **kwargs): 73 | """function to create datasets+tokenizers for common options""" 74 | if isinstance(process_fn, str): 75 | process_fn = eval(process_fn) 76 | if non_binary_cols is not None: 77 | # multilabel dataset support (only for csvs) 78 | label_key = non_binary_cols 79 | 80 | def get_dataset_from_path(path_): 81 | if lazy: 82 | # get lazily loaded dataset 83 | named_corpora = False 84 | if supported_corpus(path_): 85 | named_corpora = True 86 | name = path_ 87 | path_ = corpora.NAMED_CORPORA[path_].PATH 88 | if torch.distributed.get_rank() == 0 and not exists_lazy(path_, data_type='data'): 89 | # create cached version of dataset for lazy loading if it doesn't exist 90 | text = get_dataset(name if named_corpora else path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent, 91 | delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose) 92 | make_lazy(path_, text.X, data_type='data') 93 | # This should be a barrier but nccl barrier assumes 94 | # device_index=rank which is not the case for model 95 | # parallel case 96 | counts = torch.cuda.LongTensor([1]) 97 | torch.distributed.all_reduce(counts, group=parallel_group) 98 | assert counts[0].item() == torch.distributed.get_world_size( 99 | group=parallel_group) 100 | 101 | text = lazy_array_loader(path_, data_type='data', map_fn=process_fn) 102 | else: 103 | # get dataset 104 | text = get_dataset(path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent, 105 | delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose, preprocess_fn=process_fn) 106 | return text 107 | # get one or multiple datasets and concatenate 108 | if isinstance(path, str): 109 | path = [path] 110 | datasets = [get_dataset_from_path(p) for p in path] 111 | if len(datasets) == 1: 112 | ds = datasets[0] 113 | else: 114 | ds = ConcatDataset(datasets) 115 | # make tokenizer for dataset 116 | if tokenizer is None: 117 | tokenizer = make_tokenizer(tokenizer_type, ds, tokenizer_model_path, vocab_size, model_type, 118 | pad_token, character_converage, **kwargs) 119 | 120 | ds_type = '' 121 | if 
'ds_type' in kwargs: 122 | ds_type = kwargs['ds_type'] 123 | ds.SetTokenizer(tokenizer) 124 | # Split dataset into train/val/test (and wrap bert dataset) 125 | if should_split(split): 126 | ds = split_ds(ds, split) 127 | if 'bert' in ds_type.lower(): 128 | presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False 129 | dstype = bert_sentencepair_dataset 130 | ds = [dstype(d, max_seq_len=seq_length, presplit_sentences=presplit_sentences) 131 | if d is not None else None for d in ds] 132 | elif ds_type.lower() == 'gpt2': 133 | ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds] 134 | else: 135 | if 'bert' in ds_type.lower(): 136 | presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False 137 | dstype = bert_sentencepair_dataset 138 | ds = dstype(ds, max_seq_len=seq_length, presplit_sentences=presplit_sentences) 139 | elif ds_type.lower() == 'gpt2': 140 | ds = GPT2Dataset(ds, max_seq_len=seq_length) 141 | return ds, tokenizer 142 | -------------------------------------------------------------------------------- /megatron/data/samplers.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Batch samplers that work with either random or sequential data samplers.""" 17 | 18 | import torch 19 | from torch.utils import data 20 | 21 | 22 | class RandomSampler(data.sampler.Sampler): 23 | """Based off of pytorch RandomSampler and DistributedSampler. Essentially 24 | a RandomSampler, but this class lets the user set an epoch like 25 | DistributedSampler Samples elements randomly. If without replacement, then 26 | sample from a shuffled dataset. If with replacement, then user can 27 | specify ``num_samples`` to draw. 
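    Calling ``set_epoch`` before each epoch seeds the shuffle generator with the
    epoch number, making the sample order reproducible across restarts; with the
    default epoch of -1 the generator is left unseeded.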
28 | Arguments: 29 | data_source (Dataset): dataset to sample from 30 | num_samples (int): number of samples to draw, default=len(dataset) 31 | replacement (bool): samples are drawn with replacement if ``True``, 32 | default=False 33 | """ 34 | 35 | def __init__(self, data_source, replacement=False, num_samples=None): 36 | self.data_source = data_source 37 | self.replacement = replacement 38 | self._num_samples = num_samples 39 | self.epoch = -1 40 | 41 | if self._num_samples is not None and replacement is False: 42 | raise ValueError("With replacement=False, num_samples should not " 43 | "be specified, since a random permute will be " 44 | "performed.") 45 | 46 | if not isinstance(self.num_samples, int) or self.num_samples <= 0: 47 | raise ValueError("num_samples should be a positive integer " 48 | "value, but got num_samples={}".format( 49 | self.num_samples)) 50 | if not isinstance(self.replacement, bool): 51 | raise ValueError("replacement should be a boolean value, but got " 52 | "replacement={}".format(self.replacement)) 53 | 54 | @property 55 | def num_samples(self): 56 | # dataset size might change at runtime 57 | if self._num_samples is None: 58 | return len(self.data_source) 59 | return self._num_samples 60 | 61 | def __iter__(self): 62 | n = len(self.data_source) 63 | g = torch.Generator() 64 | if self.epoch >= 0: 65 | g.manual_seed(self.epoch) 66 | if self.replacement: 67 | return iter(torch.randint(high=n, size=(self.num_samples,), 68 | dtype=torch.int64, generator=g).tolist()) 69 | return iter(torch.randperm(n, generator=g).tolist()) 70 | 71 | def __len__(self): 72 | return self.num_samples 73 | 74 | def set_epoch(self, epoch): 75 | self.epoch = epoch 76 | 77 | 78 | class DistributedBatchSampler(data.sampler.BatchSampler): 79 | """Similar to normal implementation of distributed sampler, except 80 | implementation is at the batch sampler level, instead of just the 81 | sampler level. This allows wrapping of arbitrary data samplers 82 | (sequential, random, WeightedRandomSampler, etc.) with this batch 83 | sampler. 84 | 85 | The `interleave` argument specifies how to distribute a batch. A value 86 | of True combined with the above random sampler is equivalent to pytorch's 87 | torch.utils.data.distributed.DistributedSampler. 
88 | 89 | For the following batch [0,1,2,3,4,5,6,7] and data parallelism of 2 90 | specifying True will result in the following samples for each gpu: 91 | GPU0: [0,2,4,6] GPU1: [1,3,5,7] 92 | specifying False will result in the following samples: 93 | GPU0: [0,1,2,3] GPU1: [4,5,6,7]""" 94 | 95 | def __init__(self, sampler, batch_size, drop_last, rank=-1, 96 | world_size=2, wrap_last=False, interleave=False): 97 | super(DistributedBatchSampler, self).__init__(sampler, batch_size, 98 | drop_last) 99 | if rank == -1: 100 | assert False, 'should not be here' 101 | rank = torch.distributed.get_rank() 102 | self.rank = rank 103 | self.world_size = world_size 104 | self.sampler.wrap_around = 0 105 | self.wrap_around = 0 106 | self.wrap_last = wrap_last 107 | self.start_iter = 0 108 | self.interleave = interleave 109 | 110 | def __iter__(self): 111 | batch = [] 112 | i = 0 113 | for idx in self.data_iterator(self.sampler, wrap_around=False): 114 | batch.append(idx) 115 | if len(batch) == self.batch_size: 116 | tbatch = self._batch(batch) 117 | if i >= self.start_iter: 118 | yield tbatch 119 | self.start_iter = 0 120 | i += 1 121 | batch = [] 122 | batch_len = len(batch) 123 | if batch_len > 0 and not self.drop_last: 124 | if self.wrap_last: 125 | self.sampler.wrap_around -= (self.batch_size) 126 | self.wrap_around += (len(batch)) 127 | self.wrap_around %= self.batch_size 128 | yield self._batch(batch) 129 | if self.wrap_last: 130 | self.sampler.wrap_around += self.batch_size 131 | 132 | def data_iterator(self, _iter, wrap_around=False): 133 | """iterates through data and handles wrap around""" 134 | for i, idx in enumerate(_iter): 135 | if i < self.wrap_around % self.batch_size: 136 | continue 137 | if wrap_around: 138 | self.wrap_around += 1 139 | self.wrap_around %= self.batch_size 140 | yield idx 141 | 142 | def _batch(self, batch): 143 | """extracts samples only pertaining to this worker's batch""" 144 | if self.interleave: 145 | return batch[self.rank:self.batch_size:self.world_size] 146 | start = self.rank * self.batch_size // self.world_size 147 | end = (self.rank + 1) * self.batch_size // self.world_size 148 | return batch[start:end] 149 | -------------------------------------------------------------------------------- /pretrain_t5.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
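#
# Structure note (sketch): like pretrain_gpt2.py, this script only wires three
# callbacks into megatron.training.pretrain(): a model provider, a dataset provider
# and a forward step; the shared training loop drives everything else. forward_step
# below averages the per-token losses over unmasked positions only. For example,
# with losses = [2.0, 1.0, 3.0] and loss_mask = [1, 1, 0] the reported lm loss is
# (2.0 + 1.0) / 2 = 1.5, so padded or ignored positions never dilute the average.
#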
15 | 16 | """Pretrain T5""" 17 | 18 | import torch 19 | 20 | from megatron import get_args 21 | from megatron import print_rank_0 22 | from megatron import get_timers 23 | from megatron import get_tokenizer 24 | from megatron import mpu 25 | from megatron.data.T5_dataset import build_train_valid_test_datasets 26 | from megatron.model import T5ModelPipe, T5Model 27 | from megatron.training import pretrain 28 | from megatron.utils import get_masks_and_position_ids_for_t5 29 | from megatron.utils import reduce_losses 30 | from megatron.fp16 import fp32_to_fp16 31 | 32 | def model_provider(): 33 | """Build the model.""" 34 | args = get_args() 35 | print_rank_0('building T5 model ...') 36 | if args.pipe_parallel_size == 0 or args.pipe_parallel_size == 1: 37 | model = T5Model(num_tokentypes=0, parallel_output=True) 38 | else: 39 | model = T5ModelPipe(num_tokentypes=0, parallel_output=True, topology=mpu.get_topology()) 40 | model._megatron_batch_fn = get_batch_pipe 41 | model._input_grad = [True, False, True, False, False] 42 | model._input_type = ['float', 'int', 'float', 'int', 'int'] 43 | model._input_pipe_partitioned = [True, False, True, False, False] 44 | return model 45 | 46 | def get_batch(data_iterator): 47 | args = get_args() 48 | tokenizer = get_tokenizer() 49 | 50 | # Items and their type. 51 | keys = [ 52 | "contexts", 53 | "targets", 54 | "labels", 55 | "ctx_eod_mask", 56 | ] 57 | datatype = torch.int64 58 | 59 | if data_iterator is not None: 60 | data = next(data_iterator) 61 | else: 62 | data = None 63 | 64 | # Broadcast data. 65 | data_b = mpu.broadcast_data(keys, data, datatype) 66 | 67 | # Unpack. 68 | contexts = data_b['contexts'].long() 69 | targets = data_b['targets'].long() 70 | labels = data_b['labels'].long() 71 | ctx_eod_mask = data_b['ctx_eod_mask'].long() 72 | 73 | # Unpack. 74 | enc_token_ids = contexts 75 | dec_token_ids = targets 76 | 77 | # Get the masks and postition ids. 78 | enc_attn_mask, enc_pos_ids, dec_attn_mask, dec_pos_ids, cross_attn_mask, loss_mask = get_masks_and_position_ids_for_t5( 79 | args, 80 | tokenizer, 81 | contexts, 82 | targets, 83 | labels, 84 | ctx_eod_mask, 85 | args.reset_position_ids, 86 | args.reset_attention_mask) 87 | 88 | if args.fp16: 89 | # cast to fp16 because pipeline parallelism skips the FP16 wrapper. 90 | return fp32_to_fp16((enc_token_ids, enc_pos_ids, enc_attn_mask, 91 | dec_token_ids, dec_pos_ids, dec_attn_mask, 92 | cross_attn_mask)), fp32_to_fp16((labels, loss_mask)) 93 | else: 94 | return (enc_token_ids, enc_pos_ids, enc_attn_mask, 95 | dec_token_ids, dec_pos_ids, dec_attn_mask, 96 | cross_attn_mask), (labels, loss_mask) 97 | 98 | def get_batch_pipe(data): 99 | args = get_args() 100 | tokenizer = get_tokenizer() 101 | 102 | # Items and their type. 103 | keys = [ 104 | "contexts", 105 | "targets", 106 | "labels", 107 | "ctx_eod_mask", 108 | ] 109 | datatype = torch.int64 110 | 111 | # Broadcast data. 112 | data_b = mpu.broadcast_data(keys, data, datatype) 113 | 114 | # Unpack. 115 | contexts = data_b['contexts'].long() 116 | targets = data_b['targets'].long() 117 | labels = data_b['labels'].long() 118 | ctx_eod_mask = data_b['ctx_eod_mask'].long() 119 | 120 | # Unpack. 121 | enc_token_ids = contexts 122 | dec_token_ids = targets 123 | 124 | # Get the masks and postition ids. 
125 | enc_attn_mask, enc_pos_ids, dec_attn_mask, dec_pos_ids, cross_attn_mask, loss_mask = get_masks_and_position_ids_for_t5( 126 | args, 127 | tokenizer, 128 | contexts, 129 | targets, 130 | labels, 131 | ctx_eod_mask, 132 | args.reset_position_ids, 133 | args.reset_attention_mask) 134 | 135 | if args.fp16: 136 | # cast to fp16 because pipeline parallelism skips the FP16 wrapper. 137 | return fp32_to_fp16((enc_token_ids, enc_pos_ids, enc_attn_mask, 138 | dec_token_ids, dec_pos_ids, dec_attn_mask, 139 | cross_attn_mask)), fp32_to_fp16((labels, loss_mask)) 140 | else: 141 | return (enc_token_ids, enc_pos_ids, enc_attn_mask, 142 | dec_token_ids, dec_pos_ids, dec_attn_mask, 143 | cross_attn_mask), (labels, loss_mask) 144 | 145 | def forward_step(data_iterator, model): 146 | """Forward step.""" 147 | args = get_args() 148 | timers = get_timers() 149 | 150 | # Get the batch. 151 | timers('batch generator').start() 152 | 153 | (enc_token_ids, enc_pos_ids, enc_attn_mask, 154 | dec_token_ids, dec_pos_ids, dec_attn_mask, 155 | cross_attn_mask), (labels, loss_mask) = get_batch(data_iterator) 156 | 157 | timers('batch generator').stop() 158 | 159 | # Forward model. 160 | losses = model(enc_token_ids, enc_pos_ids, enc_attn_mask, 161 | dec_token_ids, dec_pos_ids, dec_attn_mask, cross_attn_mask, 162 | labels=labels) 163 | 164 | loss_mask = loss_mask.view(-1) 165 | loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() 166 | 167 | # Reduce loss for logging. 168 | reduced_loss = reduce_losses([loss]) 169 | 170 | return loss, {'lm loss': reduced_loss[0]} 171 | 172 | 173 | 174 | def train_valid_test_datasets_provider(train_val_test_num_samples): 175 | """Build train, valid, and test datasets.""" 176 | args = get_args() 177 | tokenizer = get_tokenizer() 178 | 179 | print_rank_0('> building train, validation, and test datasets ' 180 | 'for Enc-Dec ...') 181 | train_ds, valid_ds, test_ds = build_train_valid_test_datasets( 182 | tokenizer=tokenizer, 183 | data_prefix=args.data_path, 184 | data_impl=args.data_impl, 185 | splits_string=args.split, 186 | train_valid_test_num_samples=train_val_test_num_samples, 187 | enc_seq_length=args.enc_seq_length, 188 | dec_seq_length=args.dec_seq_length, 189 | seed=args.seed, 190 | skip_warmup=(not args.mmap_warmup)) 191 | print_rank_0("> finished creating Enc-Dec datasets ...") 192 | 193 | return train_ds, valid_ds, test_ds 194 | 195 | 196 | if __name__ == "__main__": 197 | 198 | pretrain(train_valid_test_datasets_provider, model_provider, forward_step, 199 | args_defaults={'tokenizer_type': 'T5Tokenizer'}) 200 | -------------------------------------------------------------------------------- /megatron/tokenizer/t5_tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
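#
# Usage sketch (illustrative; the vocab path is a placeholder):
#
#     tokenizer = T5Tokenizer('vocab.json')
#     ids = tokenizer.encode('今天天气不错')   # jieba word split, then wordpiece lookup
#     text = tokenizer.decode(ids)
#
# Spaces and newlines are rewritten to the placeholder characters \u2582 and \u2583
# before vocabulary lookup, and decode() maps them back, so whitespace survives the
# round trip.
#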
15 | 16 | """Tokenization classes for OpenAI T5.""" 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import json 22 | from io import open 23 | import jieba 24 | import collections 25 | import six 26 | 27 | 28 | try: 29 | from functools import lru_cache 30 | except ImportError: 31 | # Just a dummy decorator to get the checks to run on python2 32 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 33 | def lru_cache(): 34 | return lambda func: func 35 | 36 | 37 | def convert_to_unicode(text): 38 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 39 | if six.PY3: 40 | if isinstance(text, str): 41 | return text 42 | elif isinstance(text, bytes): 43 | return text.decode("utf-8", "ignore") 44 | else: 45 | raise ValueError("Unsupported string type: %s" % (type(text))) 46 | elif six.PY2: 47 | if isinstance(text, str): 48 | return text.decode("utf-8", "ignore") 49 | elif isinstance(text, unicode): 50 | return text 51 | else: 52 | raise ValueError("Unsupported string type: %s" % (type(text))) 53 | else: 54 | raise ValueError("Not running on Python2 or Python 3?") 55 | 56 | 57 | def load_vocab(vocab_file): 58 | """Loads a vocabulary file into a dictionary.""" 59 | if vocab_file.find(".json")!=-1: 60 | with open(vocab_file, "r") as reader: 61 | vocab = json.loads(reader.read()) 62 | else: 63 | vocab = collections.OrderedDict() 64 | index = 0 65 | with open(vocab_file, "r") as reader: 66 | while True: 67 | token = convert_to_unicode(reader.readline()) 68 | if not token: 69 | break 70 | token = token.strip() 71 | vocab[token] = index 72 | index += 1 73 | return vocab 74 | 75 | 76 | class WordpieceTokenizer(object): 77 | 78 | def __init__(self, vocab, unk_token="", max_input_chars_per_word=200): 79 | self.vocab = vocab 80 | self.unk_token = unk_token 81 | self.max_input_chars_per_word = max_input_chars_per_word 82 | 83 | def tokenize(self, token): 84 | 85 | token = convert_to_unicode(token) 86 | 87 | chars = list(token) 88 | if len(chars) > self.max_input_chars_per_word: 89 | return [self.unk_token] 90 | 91 | start = 0 92 | sub_tokens = [] 93 | while start < len(chars): 94 | end = len(chars) 95 | cur_substr = None 96 | while start < end: 97 | substr = "".join(chars[start:end]) 98 | if substr in self.vocab: 99 | cur_substr = substr 100 | break 101 | end -= 1 102 | if cur_substr is None: 103 | sub_tokens.append(self.unk_token) 104 | start += 1 105 | continue 106 | sub_tokens.append(cur_substr) 107 | start = end 108 | 109 | return sub_tokens 110 | 111 | 112 | class T5Tokenizer(object): 113 | 114 | def __init__(self, vocab_file, max_len=None, max_sentinels=190): 115 | self.max_len = max_len if max_len is not None else int(1e12) 116 | self.encoder = load_vocab(vocab_file) 117 | self.decoder = {v:k for k,v in self.encoder.items()} 118 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder) 119 | 120 | self.translator = str.maketrans(" \n", "\u2582\u2583") 121 | 122 | self.sentinel_list = [self.encoder[''.format(i)] for i in range(max_sentinels)] 123 | 124 | @property 125 | def vocab_size(self): 126 | return len(self.encoder) 127 | 128 | def __len__(self): 129 | return len(self.encoder) 130 | 131 | @property 132 | def eod_id(self): 133 | return self.encoder[self.eod_token] 134 | 135 | @property 136 | def pad_id(self): 137 | return self.encoder[self.pad_token] 138 | 139 | @property 140 | def eod_token(self): 141 | return '' 142 | 143 | @property 144 | def 
pad_token(self): 145 | return '' 146 | 147 | def get_sentinel_num(self): 148 | return len(self.sentinel_list) 149 | 150 | def get_sentinel_id(self, idx): 151 | return self.sentinel_list[idx] 152 | 153 | def tokenize(self, text): 154 | """ Tokenize a string. """ 155 | output_tokens = [] 156 | for x in jieba.cut(text, cut_all=False): 157 | x = x.translate(self.translator) 158 | output_tokens.extend(self.wordpiece_tokenizer.tokenize(x)) 159 | return output_tokens 160 | 161 | def encode(self, text): 162 | res = [self.encoder[x] for x in self.tokenize(text)] 163 | return res 164 | 165 | def decode(self, tokens): 166 | text = ''.join([self.decoder[x] for x in tokens]) 167 | text = text.replace('\u2582', ' ').replace('\u2583', '\n') 168 | return text 169 | 170 | class GPT2TokenizerwoMerge(object): 171 | 172 | def __init__(self, vocab_file, max_len=None, max_sentinels=190): 173 | self.max_len = max_len if max_len is not None else int(1e12) 174 | self.encoder = load_vocab(vocab_file) 175 | self.decoder = {v:k for k,v in self.encoder.items()} 176 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder) 177 | self.translator = str.maketrans(" \n", "\u2582\u2583") 178 | 179 | @property 180 | def vocab_size(self): 181 | return len(self.encoder) 182 | 183 | def __len__(self): 184 | return len(self.encoder) 185 | 186 | @property 187 | def eod_id(self): 188 | return self.encoder[self.eod_token] 189 | 190 | @property 191 | def pad_id(self): 192 | return self.encoder[self.pad_token] 193 | 194 | @property 195 | def eod_token(self): 196 | return '' 197 | 198 | @property 199 | def pad_token(self): 200 | return '' 201 | 202 | def tokenize(self, text): 203 | """ Tokenize a string. """ 204 | output_tokens = [] 205 | for x in jieba.cut(text, cut_all=False): 206 | x = x.translate(self.translator) 207 | output_tokens.extend(self.wordpiece_tokenizer.tokenize(x)) 208 | return output_tokens 209 | 210 | def encode(self, text): 211 | res = [self.encoder[x] for x in self.tokenize(text)] 212 | return res 213 | 214 | def decode(self, tokens): 215 | text = ''.join([self.decoder[x] for x in tokens]) 216 | text = text.replace('\u2582', ' ').replace('\u2583', '\n') 217 | return text 218 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_random.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import print_separator 17 | from commons import initialize_distributed 18 | import mpu 19 | import torch 20 | import sys 21 | sys.path.append("../..") 22 | 23 | 24 | def test_set_cuda_rng_state(model_parallel_size): 25 | 26 | if torch.distributed.get_rank() == 0: 27 | print('> testing set_rng_state with size {} ...'. 
28 | format(model_parallel_size)) 29 | 30 | mpu.initialize_model_parallel(model_parallel_size) 31 | model_parallel_size = mpu.get_model_parallel_world_size() 32 | 33 | size = 123 34 | seed = 1234 35 | torch.cuda.manual_seed(1234) 36 | tensor = torch.cuda.FloatTensor(size) 37 | 38 | # Get the state 39 | rng_state = torch.cuda.get_rng_state() 40 | rng_state_copy = rng_state.clone() 41 | 42 | # Do some stuff. 43 | for _ in range(5): 44 | torch.randn(size, out=tensor) 45 | result_1 = tensor.clone() 46 | 47 | assert rng_state.sub(rng_state_copy).max() == 0 48 | assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0 49 | 50 | # State should be different. 51 | new_rng_state = torch.cuda.get_rng_state() 52 | max_diff = new_rng_state.sub(rng_state).max() 53 | print(' max diff in rng state (should be non-zero) on global rank {}: {}'. 54 | format(torch.distributed.get_rank(), max_diff)) 55 | assert max_diff > 0 56 | 57 | # Reset the rng state and do the same stuff. 58 | mpu.random._set_cuda_rng_state(rng_state) 59 | for _ in range(5): 60 | torch.randn(size, out=tensor) 61 | mpu.random._set_cuda_rng_state(rng_state) 62 | for _ in range(5): 63 | torch.randn(size, out=tensor) 64 | result_2 = tensor.clone() 65 | 66 | # Results should be the same 67 | error = result_2.sub(result_1).abs().max() 68 | print(' max error in generated tensors (should be zero) on ' 69 | 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) 70 | assert error < 1.0e-6 71 | 72 | # Input state should have remained intact. 73 | error = rng_state.sub(rng_state_copy).max() 74 | print(' max error in rng state (should be zero) on global rank {}: {}'. 75 | format(torch.distributed.get_rank(), error)) 76 | assert error == 0 77 | 78 | # Reset groups 79 | mpu.destroy_model_parallel() 80 | 81 | torch.distributed.barrier() 82 | if torch.distributed.get_rank() == 0: 83 | print('>> passed the test :-)') 84 | 85 | 86 | def test_cuda_rng_tracker(model_parallel_size): 87 | 88 | if torch.distributed.get_rank() == 0: 89 | print('> testing cuda rng tracker with size {} ...'. 90 | format(model_parallel_size)) 91 | 92 | mpu.initialize_model_parallel(model_parallel_size) 93 | model_parallel_size = mpu.get_model_parallel_world_size() 94 | 95 | seed_1 = 1234 96 | seed_2 = 4321 97 | size = [12, 21] 98 | tensor = torch.cuda.FloatTensor(size) 99 | 100 | # Set to seed_1 and generate two tensors. 101 | torch.cuda.manual_seed(seed_1) 102 | torch.randn(size, out=tensor) 103 | target_11 = tensor.clone() 104 | torch.randn(size, out=tensor) 105 | target_12 = tensor.clone() 106 | 107 | # Set to seed_2 and generate two tensors. 
108 | torch.cuda.manual_seed(seed_2) 109 | torch.randn(size, out=tensor) 110 | target_21 = tensor.clone() 111 | torch.randn(size, out=tensor) 112 | target_22 = tensor.clone() 113 | 114 | # Now if we interleave seed_1 and seed_2, 115 | # we should still get the same tensors 116 | torch.cuda.manual_seed(seed_1) 117 | mpu.get_cuda_rng_tracker().add('test', seed_2) 118 | 119 | torch.randn(size, out=tensor) 120 | result_11 = tensor.clone() 121 | 122 | with mpu.get_cuda_rng_tracker().fork('test'): 123 | torch.randn(size, out=tensor) 124 | result_21 = tensor.clone() 125 | 126 | torch.randn(size, out=tensor) 127 | result_12 = tensor.clone() 128 | 129 | with mpu.get_cuda_rng_tracker().fork('test'): 130 | torch.randn(size, out=tensor) 131 | result_22 = tensor.clone() 132 | 133 | diff = result_11.sub(result_21).abs().max() 134 | diff = min(diff, result_12.sub(result_22).abs().max()) 135 | print(' max diff in generated tensors (should be non-zero) on ' 136 | 'global rank {}: {}'.format(torch.distributed.get_rank(), diff)) 137 | assert diff > 1.0e-6 138 | error = max(result_11.sub(target_11).abs().max(), 139 | result_12.sub(target_12).abs().max()) 140 | error = max(error, result_21.sub(target_21).abs().max()) 141 | error = max(error, result_22.sub(target_22).abs().max()) 142 | print(' max error in generated tensors (should be zero) on ' 143 | 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) 144 | assert error < 1.0e-6 145 | 146 | # Reset the tracker 147 | mpu.get_cuda_rng_tracker().reset() 148 | 149 | # Reset groups 150 | mpu.destroy_model_parallel() 151 | 152 | torch.distributed.barrier() 153 | if torch.distributed.get_rank() == 0: 154 | print('>> passed the test :-)') 155 | 156 | 157 | def test_model_parallel_cuda_manual_seed(model_parallel_size): 158 | 159 | if torch.distributed.get_rank() == 0: 160 | print('> testing model parallel cuda manual seed with size {} ...'. 
161 | format(model_parallel_size)) 162 | 163 | mpu.initialize_model_parallel(model_parallel_size) 164 | model_parallel_size = mpu.get_model_parallel_world_size() 165 | 166 | mpu.model_parallel_cuda_manual_seed(12345) 167 | assert torch.cuda.initial_seed() == 12345 168 | with mpu.get_cuda_rng_tracker().fork(): 169 | assert torch.cuda.initial_seed() == (12345 + 2718 + 170 | mpu.get_model_parallel_rank()) 171 | 172 | # Reset the tracker 173 | mpu.get_cuda_rng_tracker().reset() 174 | 175 | # Reset groups 176 | mpu.destroy_model_parallel() 177 | 178 | torch.distributed.barrier() 179 | if torch.distributed.get_rank() == 0: 180 | print('>> passed the test :-)') 181 | 182 | 183 | if __name__ == '__main__': 184 | 185 | initialize_distributed() 186 | world_size = torch.distributed.get_world_size() 187 | 188 | model_parallel_size = 1 189 | while model_parallel_size <= world_size: 190 | print_separator('test set rng state') 191 | test_set_cuda_rng_state(model_parallel_size) 192 | model_parallel_size *= 2 193 | 194 | model_parallel_size = 1 195 | while model_parallel_size <= world_size: 196 | print_separator('test cuda rng tracker') 197 | test_cuda_rng_tracker(model_parallel_size) 198 | model_parallel_size *= 2 199 | 200 | model_parallel_size = 1 201 | while model_parallel_size <= world_size: 202 | print_separator('test model parallel cuda manual seed') 203 | test_model_parallel_cuda_manual_seed(model_parallel_size) 204 | model_parallel_size *= 2 205 | -------------------------------------------------------------------------------- /megatron/deprecated_data_utils/lazy_loader.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """utils for loading text from disk""" 16 | import os 17 | import mmap 18 | import pickle as pkl 19 | import time 20 | from itertools import accumulate 21 | 22 | import torch 23 | from torch.multiprocessing import Lock 24 | 25 | 26 | def get_lazy_path(path): 27 | """ 28 | Gets directory path where lazy files are stored. 29 | """ 30 | return os.path.splitext(path)[0] + '.lazy' 31 | 32 | 33 | def exists_lazy(path, data_type='data'): 34 | """ 35 | Check if we've already made a lazy version of this file for the `data_type` field. 36 | """ 37 | if not os.path.exists(get_lazy_path(path)): 38 | return False 39 | contents = os.listdir(get_lazy_path(path)) 40 | if data_type not in contents: 41 | return False 42 | if data_type + '.len.pkl' not in contents: 43 | return False 44 | return True 45 | 46 | 47 | def make_lazy(path, strs, data_type='data'): 48 | """ 49 | Make lazy version of `data_type` field of the file. Byte offsets 50 | corresponding to data indices are stored in a `.len.pkl` data file. 
51 | """ 52 | lazypath = get_lazy_path(path) 53 | if not os.path.exists(lazypath): 54 | os.makedirs(lazypath) 55 | datapath = os.path.join(lazypath, data_type) 56 | lenpath = os.path.join(lazypath, data_type + '.len.pkl') 57 | if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: 58 | with open(datapath, 'wb') as f: 59 | str_lens = [] 60 | str_cnt = 0 61 | for s in strs: 62 | if isinstance(s, dict): 63 | s = s['text'] 64 | encoded = s.encode('utf-8') 65 | f.write(encoded) 66 | str_cnt = len(encoded) 67 | str_lens.append(str_cnt) 68 | pkl.dump(str_lens, open(lenpath, 'wb')) 69 | else: 70 | while not os.path.exists(lenpath): 71 | time.sleep(1) 72 | 73 | 74 | def split_strings(strings, start, chr_lens): 75 | """ 76 | Split strings based on string lengths and given start. 77 | """ 78 | return [strings[i - start:j - start] for i, j in zip([start] + chr_lens[:-1], chr_lens)] 79 | 80 | 81 | class ProcessorTokenizer: 82 | """ 83 | callable class that runs a preprocessing, as well as tokenization step, 84 | on input text. 85 | """ 86 | 87 | def __init__(self, tokenizer, process_fn=None): 88 | self.tokenizer = tokenizer 89 | self.process_fn = process_fn 90 | 91 | def __call__(self, string): 92 | if self.tokenizer is not None: 93 | string = self.tokenizer(string, process_fn=self.process_fn) 94 | elif self.process_fn is not None: 95 | string = self.process_fn(string) 96 | return string 97 | 98 | 99 | class lazy_array_loader(object): 100 | """ 101 | Arguments: 102 | path: path to directory where array entries are concatenated into one big string file 103 | and the .len file are located 104 | data_type (str): Some datsets have multiple fields that are stored in different paths. 105 | `data_type` specifies which of these fields to load in this class 106 | mem_map (boolean): Specifies whether to memory map file `path` 107 | map_fn (callable): Fetched strings are passed through map_fn before being returned. 108 | 109 | Example of lazy loader directory structure: 110 | file.json 111 | file.lazy/ 112 | data_type1 113 | data_type1.len.pkl 114 | data_type2 115 | data_type2.len.pkl 116 | """ 117 | 118 | def __init__(self, path, data_type='data', mem_map=False, map_fn=None): 119 | lazypath = get_lazy_path(path) 120 | datapath = os.path.join(lazypath, data_type) 121 | # get file where array entries are concatenated into one big string 122 | self._file = open(datapath, 'rb', buffering=0) 123 | self.file = self._file 124 | # memory map file if necessary 125 | self.mem_map = mem_map 126 | if self.mem_map: 127 | self.file = mmap.mmap(self.file.fileno(), 0, prot=mmap.PROT_READ) 128 | lenpath = os.path.join(lazypath, data_type + '.len.pkl') 129 | self.lens = pkl.load(open(lenpath, 'rb')) 130 | self.ends = list(accumulate(self.lens)) 131 | self.dumb_ends = list(self.ends) 132 | self.read_lock = Lock() 133 | self.process_fn = map_fn 134 | self.map_fn = map_fn 135 | self._tokenizer = None 136 | 137 | def SetTokenizer(self, tokenizer): 138 | """ 139 | logic to set and remove (set to None) tokenizer. 140 | combines preprocessing/tokenization into one callable. 
141 | """ 142 | if tokenizer is None: 143 | if not hasattr(self, '_tokenizer'): 144 | self._tokenizer = tokenizer 145 | else: 146 | self._tokenizer = tokenizer 147 | self.map_fn = ProcessorTokenizer(tokenizer, self.process_fn) 148 | 149 | def GetTokenizer(self): 150 | return self._tokenizer 151 | 152 | def __getitem__(self, index): 153 | """ 154 | read file and splice strings based on string ending array `self.ends` 155 | """ 156 | if not isinstance(index, slice): 157 | if index == 0: 158 | start = 0 159 | else: 160 | start = self.ends[index - 1] 161 | end = self.ends[index] 162 | rtn = self.file_read(start, end) 163 | if self.map_fn is not None: 164 | return self.map_fn(rtn) 165 | else: 166 | # if slice, fetch strings with 1 diskread and then splice in memory 167 | chr_lens = self.ends[index] 168 | if index.start == 0 or index.start is None: 169 | start = 0 170 | else: 171 | start = self.ends[index.start - 1] 172 | stop = chr_lens[-1] 173 | strings = self.file_read(start, stop) 174 | rtn = split_strings(strings, start, chr_lens) 175 | if self.map_fn is not None: 176 | return self.map_fn([s for s in rtn]) 177 | return rtn 178 | 179 | def __len__(self): 180 | return len(self.ends) 181 | 182 | def file_read(self, start=0, end=None): 183 | """read specified portion of file""" 184 | 185 | # atomic reads to avoid race conditions with multiprocess dataloader 186 | self.read_lock.acquire() 187 | # seek to start of file read 188 | self.file.seek(start) 189 | # read to end of file if no end point provided 190 | if end is None: 191 | rtn = self.file.read() 192 | # else read amount needed to reach end point 193 | else: 194 | rtn = self.file.read(end - start) 195 | self.read_lock.release() 196 | # TODO: @raulp figure out mem map byte string bug 197 | # if mem map'd need to decode byte string to string 198 | rtn = rtn.decode('utf-8', 'ignore') 199 | # rtn = str(rtn) 200 | if self.mem_map: 201 | rtn = rtn.decode('unicode_escape') 202 | return rtn 203 | -------------------------------------------------------------------------------- /megatron/data/realm_dataset_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import numpy as np 5 | import torch 6 | 7 | from megatron import mpu, print_rank_0 8 | from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy 9 | from megatron.data.samplers import DistributedBatchSampler 10 | from megatron import get_args, get_tokenizer, print_rank_0, mpu 11 | 12 | 13 | def get_one_epoch_dataloader(dataset, batch_size=None): 14 | """Specifically one epoch to be used in an indexing job.""" 15 | args = get_args() 16 | 17 | world_size = mpu.get_data_parallel_world_size() 18 | rank = mpu.get_data_parallel_rank() 19 | if batch_size is None: 20 | batch_size = args.batch_size 21 | global_batch_size = batch_size * world_size 22 | num_workers = args.num_workers 23 | 24 | sampler = torch.utils.data.SequentialSampler(dataset) 25 | # importantly, drop_last must be False to get all the data. 26 | batch_sampler = DistributedBatchSampler(sampler, 27 | batch_size=global_batch_size, 28 | drop_last=False, 29 | rank=rank, 30 | world_size=world_size) 31 | 32 | return torch.utils.data.DataLoader(dataset, 33 | batch_sampler=batch_sampler, 34 | num_workers=num_workers, 35 | pin_memory=True) 36 | 37 | 38 | def get_ict_batch(data_iterator): 39 | # Items and their type. 
40 | keys = ['query_tokens', 'query_pad_mask', 41 | 'block_tokens', 'block_pad_mask', 'block_data'] 42 | datatype = torch.int64 43 | 44 | # Broadcast data. 45 | if data_iterator is None: 46 | data = None 47 | else: 48 | data = next(data_iterator) 49 | data_b = mpu.broadcast_data(keys, data, datatype) 50 | 51 | # Unpack. 52 | query_tokens = data_b['query_tokens'].long() 53 | query_pad_mask = data_b['query_pad_mask'].long() 54 | block_tokens = data_b['block_tokens'].long() 55 | block_pad_mask = data_b['block_pad_mask'].long() 56 | block_indices = data_b['block_data'].long() 57 | 58 | return query_tokens, query_pad_mask,\ 59 | block_tokens, block_pad_mask, block_indices 60 | 61 | 62 | def join_str_list(str_list): 63 | """Join a list of strings, handling spaces appropriately""" 64 | result = "" 65 | for s in str_list: 66 | if s.startswith("##"): 67 | result += s[2:] 68 | else: 69 | result += " " + s 70 | return result 71 | 72 | 73 | class BlockSampleData(object): 74 | """A struct for fully describing a fixed-size block of data as used in REALM 75 | 76 | :param start_idx: for first sentence of the block 77 | :param end_idx: for last sentence of the block (may be partially truncated in sample construction) 78 | :param doc_idx: the index of the document from which the block comes in the original indexed dataset 79 | :param block_idx: a unique integer identifier given to every block. 80 | """ 81 | def __init__(self, start_idx, end_idx, doc_idx, block_idx): 82 | self.start_idx = start_idx 83 | self.end_idx = end_idx 84 | self.doc_idx = doc_idx 85 | self.block_idx = block_idx 86 | 87 | def as_array(self): 88 | return np.array([self.start_idx, self.end_idx, self.doc_idx, self.block_idx]).astype(np.int64) 89 | 90 | def as_tuple(self): 91 | return self.start_idx, self.end_idx, self.doc_idx, self.block_idx 92 | 93 | 94 | class BlockSamplesMapping(object): 95 | def __init__(self, mapping_array): 96 | # make sure that the array is compatible with BlockSampleData 97 | assert mapping_array.shape[1] == 4 98 | self.mapping_array = mapping_array 99 | 100 | def __len__(self): 101 | return self.mapping_array.shape[0] 102 | 103 | def __getitem__(self, idx): 104 | """Get the data associated with an indexed sample.""" 105 | sample_data = BlockSampleData(*self.mapping_array[idx]) 106 | return sample_data 107 | 108 | 109 | def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epochs, 110 | max_num_samples, max_seq_length, seed, name, use_one_sent_docs=False): 111 | """Get samples mapping for a dataset over fixed size blocks. This function also requires 112 | a dataset of the titles for the source documents since their lengths must be taken into account. 
113 | 114 | :return: samples_mapping (BlockSamplesMapping) 115 | """ 116 | 117 | if not num_epochs: 118 | if not max_num_samples: 119 | raise ValueError("Need to specify either max_num_samples " 120 | "or num_epochs") 121 | num_epochs = np.iinfo(np.int32).max - 1 122 | if not max_num_samples: 123 | max_num_samples = np.iinfo(np.int64).max - 1 124 | 125 | # Filename of the index mapping 126 | indexmap_filename = data_prefix 127 | indexmap_filename += '_{}_indexmap'.format(name) 128 | if num_epochs != (np.iinfo(np.int32).max - 1): 129 | indexmap_filename += '_{}ep'.format(num_epochs) 130 | if max_num_samples != (np.iinfo(np.int64).max - 1): 131 | indexmap_filename += '_{}mns'.format(max_num_samples) 132 | indexmap_filename += '_{}msl'.format(max_seq_length) 133 | indexmap_filename += '_{}s'.format(seed) 134 | if use_one_sent_docs: 135 | indexmap_filename += '_1sentok' 136 | indexmap_filename += '.npy' 137 | 138 | # Build the indexed mapping if not exist. 139 | if mpu.get_data_parallel_rank() == 0 and \ 140 | not os.path.isfile(indexmap_filename): 141 | print(' > WARNING: could not find index map file {}, building ' 142 | 'the indices on rank 0 ...'.format(indexmap_filename)) 143 | 144 | # Make sure the types match the helpers input types. 145 | assert block_dataset.doc_idx.dtype == np.int64 146 | assert block_dataset.sizes.dtype == np.int32 147 | 148 | # Build samples mapping 149 | verbose = torch.distributed.get_rank() == 0 150 | start_time = time.time() 151 | print_rank_0(' > building samples index mapping for {} ...'.format( 152 | name)) 153 | 154 | # compile/bind the C++ helper code 155 | from megatron.data.dataset_utils import compile_helper 156 | compile_helper() 157 | 158 | from megatron.data import helpers 159 | mapping_array = helpers.build_blocks_mapping( 160 | block_dataset.doc_idx, 161 | block_dataset.sizes, 162 | title_dataset.sizes, 163 | num_epochs, 164 | max_num_samples, 165 | max_seq_length - 3, # account for added tokens 166 | seed, 167 | verbose, 168 | use_one_sent_docs) 169 | 170 | 171 | print_rank_0(' > done building samples index mapping') 172 | np.save(indexmap_filename, mapping_array, allow_pickle=True) 173 | print_rank_0(' > saved the index mapping in {}'.format( 174 | indexmap_filename)) 175 | # Make sure all the ranks have built the mapping 176 | print_rank_0(' > elapsed time to build and save samples mapping ' 177 | '(seconds): {:4f}'.format( 178 | time.time() - start_time)) 179 | 180 | # This should be a barrier but nccl barrier assumes 181 | # device_index=rank which is not the case for model 182 | # parallel case 183 | counts = torch.cuda.LongTensor([1]) 184 | torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) 185 | assert counts[0].item() == torch.distributed.get_world_size( 186 | group=mpu.get_data_parallel_group()) 187 | 188 | # Load indexed dataset. 
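# The mapping saved above is re-opened with np.load(..., mmap_mode='r'), so
# each rank lazily memory-maps the same .npy file rather than holding its own
# in-memory copy of the samples mapping.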
189 | print_rank_0(' > loading indexed mapping from {}'.format( 190 | indexmap_filename)) 191 | start_time = time.time() 192 | 193 | mapping_array = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r') 194 | samples_mapping = BlockSamplesMapping(mapping_array) 195 | 196 | print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( 197 | time.time() - start_time)) 198 | print_rank_0(' total number of samples: {}'.format( 199 | mapping_array.shape[0])) 200 | 201 | return samples_mapping 202 | -------------------------------------------------------------------------------- /megatron/model/bert_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """BERT model.""" 17 | 18 | import torch 19 | 20 | from megatron import get_args 21 | from megatron import mpu 22 | from megatron.model.language_model import parallel_lm_logits 23 | from megatron.model.language_model import get_language_model 24 | from megatron.model.transformer import LayerNorm 25 | from megatron.model.utils import openai_gelu, erf_gelu 26 | from megatron.model.utils import get_linear_layer 27 | from megatron.model.utils import init_method_normal 28 | from megatron.model.utils import scaled_init_method_normal 29 | from megatron.module import MegatronModule 30 | 31 | def bert_attention_mask_func(attention_scores, attention_mask): 32 | attention_scores.masked_fill_(attention_mask, -10000.0) 33 | return attention_scores 34 | 35 | def bert_extended_attention_mask(attention_mask): 36 | # We create a 3D attention mask from a 2D tensor mask. 37 | # [b, 1, s] 38 | attention_mask_b1s = attention_mask.unsqueeze(1) 39 | # [b, s, 1] 40 | attention_mask_bs1 = attention_mask.unsqueeze(2) 41 | # [b, s, s] 42 | attention_mask_bss = attention_mask_b1s * attention_mask_bs1 43 | # [b, 1, s, s] 44 | extended_attention_mask = attention_mask_bss.unsqueeze(1) 45 | 46 | # Convert attention mask to binary: 47 | extended_attention_mask = (extended_attention_mask < 0.5) 48 | 49 | return extended_attention_mask 50 | 51 | def bert_position_ids(token_ids): 52 | # Create position ids 53 | seq_length = token_ids.size(1) 54 | position_ids = torch.arange(seq_length, dtype=torch.long, 55 | device=token_ids.device) 56 | position_ids = position_ids.unsqueeze(0).expand_as(token_ids) 57 | 58 | return position_ids 59 | 60 | 61 | class BertLMHead(MegatronModule): 62 | """Masked LM head for Bert 63 | 64 | Arguments: 65 | mpu_vocab_size: model parallel size of vocabulary. 66 | hidden_size: hidden size 67 | init_method: init method for weight initialization 68 | layernorm_epsilon: tolerance for layer norm divisions 69 | parallel_output: whether output logits being distributed or not. 
70 | """ 71 | 72 | def __init__(self, mpu_vocab_size, hidden_size, init_method, 73 | layernorm_epsilon, parallel_output): 74 | 75 | super(BertLMHead, self).__init__() 76 | 77 | args = get_args() 78 | 79 | self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size)) 80 | self.bias.model_parallel = True 81 | self.bias.partition_dim = 0 82 | self.bias.stride = 1 83 | self.parallel_output = parallel_output 84 | 85 | self.dense = get_linear_layer(hidden_size, hidden_size, init_method) 86 | self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) 87 | self.gelu = torch.nn.functional.gelu 88 | if args.openai_gelu: 89 | self.gelu = openai_gelu 90 | elif args.onnx_safe: 91 | self.gelu = erf_gelu 92 | 93 | def forward(self, hidden_states, word_embeddings_weight): 94 | hidden_states = self.dense(hidden_states) 95 | hidden_states = self.gelu(hidden_states) 96 | hidden_states = self.layernorm(hidden_states) 97 | output = parallel_lm_logits(hidden_states, 98 | word_embeddings_weight, 99 | self.parallel_output, 100 | bias=self.bias) 101 | return output 102 | 103 | 104 | class BertModel(MegatronModule): 105 | """Bert Language model.""" 106 | 107 | def __init__(self, num_tokentypes=2, add_binary_head=True, 108 | parallel_output=True): 109 | super(BertModel, self).__init__() 110 | args = get_args() 111 | 112 | self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy 113 | self.add_binary_head = add_binary_head 114 | self.parallel_output = parallel_output 115 | init_method = init_method_normal(args.init_method_std) 116 | scaled_init_method = scaled_init_method_normal(args.init_method_std, 117 | args.num_layers) 118 | 119 | self.language_model, self._language_model_key = get_language_model( 120 | attention_mask_func=bert_attention_mask_func, 121 | num_tokentypes=num_tokentypes, 122 | add_pooler=self.add_binary_head, 123 | init_method=init_method, 124 | scaled_init_method=scaled_init_method) 125 | 126 | self.lm_head = BertLMHead( 127 | self.language_model.embedding.word_embeddings.weight.size(0), 128 | args.hidden_size, init_method, args.layernorm_epsilon, parallel_output) 129 | self._lm_head_key = 'lm_head' 130 | if self.add_binary_head: 131 | self.binary_head = get_linear_layer(args.hidden_size, 2, 132 | init_method) 133 | self._binary_head_key = 'binary_head' 134 | 135 | def forward(self, input_ids, attention_mask, 136 | tokentype_ids=None, lm_labels=None): 137 | 138 | extended_attention_mask = bert_extended_attention_mask(attention_mask) 139 | position_ids = bert_position_ids(input_ids) 140 | 141 | if self.add_binary_head: 142 | lm_output, pooled_output = self.language_model( 143 | input_ids, 144 | position_ids, 145 | extended_attention_mask, 146 | tokentype_ids=tokentype_ids) 147 | else: 148 | lm_output = self.language_model( 149 | input_ids, 150 | position_ids, 151 | extended_attention_mask, 152 | tokentype_ids=tokentype_ids) 153 | 154 | # Output. 
155 | lm_logits = self.lm_head( 156 | lm_output, self.language_model.embedding.word_embeddings.weight) 157 | 158 | binary_logits = None 159 | if self.add_binary_head: 160 | binary_logits = self.binary_head(pooled_output) 161 | 162 | if lm_labels is None: 163 | return lm_logits, binary_logits 164 | else: 165 | if self.fp16_lm_cross_entropy: 166 | assert lm_logits.dtype == torch.half 167 | lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits, lm_labels) 168 | else: 169 | lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits.float(), 170 | lm_labels) 171 | return lm_loss, binary_logits 172 | 173 | 174 | def state_dict_for_save_checkpoint(self, destination=None, prefix='', 175 | keep_vars=False): 176 | """For easy load when model is combined with other heads, 177 | add an extra key.""" 178 | 179 | state_dict_ = {} 180 | state_dict_[self._language_model_key] \ 181 | = self.language_model.state_dict_for_save_checkpoint( 182 | destination, prefix, keep_vars) 183 | state_dict_[self._lm_head_key] \ 184 | = self.lm_head.state_dict_for_save_checkpoint( 185 | destination, prefix, keep_vars) 186 | if self.add_binary_head: 187 | state_dict_[self._binary_head_key] \ 188 | = self.binary_head.state_dict(destination, prefix, keep_vars) 189 | return state_dict_ 190 | 191 | def load_state_dict(self, state_dict, strict=True): 192 | """Customized load.""" 193 | 194 | self.language_model.load_state_dict( 195 | state_dict[self._language_model_key], strict=strict) 196 | self.lm_head.load_state_dict( 197 | state_dict[self._lm_head_key], strict=strict) 198 | if self.add_binary_head: 199 | self.binary_head.load_state_dict( 200 | state_dict[self._binary_head_key], strict=strict) 201 | -------------------------------------------------------------------------------- /tds/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright 2020 The Microsoft DeepSpeed Team 3 | ''' 4 | import sys 5 | import types 6 | 7 | import deepspeed 8 | from deepspeed import ops 9 | from deepspeed import pipe 10 | from deepspeed import runtime 11 | from deepspeed.runtime.engine import DeepSpeedEngine 12 | from deepspeed.runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER 13 | from deepspeed.runtime.lr_schedules import add_tuning_arguments 14 | from deepspeed.runtime.config import DeepSpeedConfig, DeepSpeedConfigError 15 | from deepspeed.runtime.activation_checkpointing import checkpointing 16 | from deepspeed.ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig 17 | from deepspeed.utils import log_dist 18 | from deepspeed.utils.distributed import init_distributed 19 | from deepspeed.pipe import PipelineModule 20 | from deepspeed.git_version_info import version, git_hash, git_branch 21 | 22 | from .TPipelineEngine import PipelineEngine 23 | 24 | def _parse_version(version_str): 25 | '''Parse a version string and extract the major, minor, and patch versions.''' 26 | import re 27 | matched = re.search('^(\d+)\.(\d+)\.(\d+)', version_str) 28 | return int(matched.group(1)), int(matched.group(2)), int(matched.group(3)) 29 | 30 | # Export version information 31 | __version__ = version 32 | __version_major__, __version_minor__, __version_patch__ = _parse_version(__version__) 33 | __git_hash__ = git_hash 34 | __git_branch__ = git_branch 35 | 36 | # Provide backwards compatability with old deepspeed.pt module structure, should hopefully not be used 37 | pt = types.ModuleType('pt', 'dummy pt module for backwards compatability') 38 | deepspeed = sys.modules[__name__] 39 | 
setattr(deepspeed, 'pt', pt) 40 | setattr(deepspeed.pt, 'deepspeed_utils', deepspeed.runtime.utils) 41 | sys.modules['deepspeed.pt'] = deepspeed.pt 42 | sys.modules['deepspeed.pt.deepspeed_utils'] = deepspeed.runtime.utils 43 | setattr(deepspeed.pt, 'deepspeed_config', deepspeed.runtime.config) 44 | sys.modules['deepspeed.pt.deepspeed_config'] = deepspeed.runtime.config 45 | setattr(deepspeed.pt, 'loss_scaler', deepspeed.runtime.fp16.loss_scaler) 46 | sys.modules['deepspeed.pt.loss_scaler'] = deepspeed.runtime.fp16.loss_scaler 47 | 48 | 49 | def initialize(args, 50 | model, 51 | optimizer=None, 52 | model_parameters=None, 53 | training_data=None, 54 | lr_scheduler=None, 55 | mpu=None, 56 | dist_init_required=None, 57 | collate_fn=None, 58 | config_params=None): 59 | """Initialize the DeepSpeed Engine. 60 | 61 | Arguments: 62 | args: a dictionary containing local_rank and deepspeed_config 63 | file location 64 | 65 | model: Required: nn.module class before apply any wrappers 66 | 67 | optimizer: Optional: a user defined optimizer, this is typically used instead of defining 68 | an optimizer in the DeepSpeed json config. 69 | 70 | model_parameters: Optional: An iterable of torch.Tensors or dicts. 71 | Specifies what Tensors should be optimized. 72 | 73 | training_data: Optional: Dataset of type torch.utils.data.Dataset 74 | 75 | lr_scheduler: Optional: Learning Rate Scheduler Object. It should define a get_lr(), 76 | step(), state_dict(), and load_state_dict() methods 77 | 78 | mpu: Optional: A model parallelism unit object that implements 79 | get_{model,data}_parallel_{rank,group,world_size}() 80 | 81 | dist_init_required: Optional: None will auto-initialize torch.distributed if needed, 82 | otherwise the user can force it to be initialized or not via boolean. 83 | 84 | collate_fn: Optional: Merges a list of samples to form a 85 | mini-batch of Tensor(s). Used when using batched loading from a 86 | map-style dataset. 87 | 88 | Returns: 89 | A tuple of ``engine``, ``optimizer``, ``training_dataloader``, ``lr_scheduler`` 90 | 91 | * ``engine``: DeepSpeed runtime engine which wraps the client model for distributed training. 92 | 93 | * ``optimizer``: Wrapped optimizer if a user defined ``optimizer`` is supplied, or if 94 | optimizer is specified in json config else ``None``. 95 | 96 | * ``training_dataloader``: DeepSpeed dataloader if ``training_data`` was supplied, 97 | otherwise ``None``. 98 | 99 | * ``lr_scheduler``: Wrapped lr scheduler if user ``lr_scheduler`` is passed, or 100 | if ``lr_scheduler`` specified in JSON configuration. Otherwise ``None``. 
101 | """ 102 | log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format( 103 | __version__, 104 | __git_hash__, 105 | __git_branch__), 106 | ranks=[0]) 107 | 108 | if not isinstance(model, PipelineModule): 109 | engine = DeepSpeedEngine(args=args, 110 | model=model, 111 | optimizer=optimizer, 112 | model_parameters=model_parameters, 113 | training_data=training_data, 114 | lr_scheduler=lr_scheduler, 115 | mpu=mpu, 116 | dist_init_required=dist_init_required, 117 | collate_fn=collate_fn, 118 | config_params=config_params) 119 | else: 120 | assert mpu is None, "mpu must be None with pipeline parallelism" 121 | engine = PipelineEngine(args=args, 122 | model=model, 123 | optimizer=optimizer, 124 | model_parameters=model_parameters, 125 | training_data=training_data, 126 | lr_scheduler=lr_scheduler, 127 | mpu=model.mpu(), 128 | dist_init_required=dist_init_required, 129 | collate_fn=collate_fn, 130 | config_params=config_params) 131 | 132 | return_items = [ 133 | engine, 134 | engine.optimizer, 135 | engine.training_dataloader, 136 | engine.lr_scheduler 137 | ] 138 | return tuple(return_items) 139 | 140 | 141 | def _add_core_arguments(parser): 142 | r"""Helper (internal) function to update an argument parser with an argument group of the core DeepSpeed arguments. 143 | The core set of DeepSpeed arguments include the following: 144 | 1) --deepspeed: boolean flag to enable DeepSpeed 145 | 2) --deepspeed_config : path of a json configuration file to configure DeepSpeed runtime. 146 | 147 | This is a helper function to the public add_config_arguments() 148 | 149 | Arguments: 150 | parser: argument parser 151 | Return: 152 | parser: Updated Parser 153 | """ 154 | group = parser.add_argument_group('DeepSpeed', 'DeepSpeed configurations') 155 | 156 | group.add_argument( 157 | '--deepspeed', 158 | default=False, 159 | action='store_true', 160 | help= 161 | 'Enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)') 162 | 163 | group.add_argument('--deepspeed_config', 164 | default=None, 165 | type=str, 166 | help='DeepSpeed json configuration file.') 167 | 168 | group.add_argument( 169 | '--deepscale', 170 | default=False, 171 | action='store_true', 172 | help= 173 | 'Deprecated enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)' 174 | ) 175 | 176 | group.add_argument('--deepscale_config', 177 | default=None, 178 | type=str, 179 | help='Deprecated DeepSpeed json configuration file.') 180 | 181 | group.add_argument( 182 | '--deepspeed_mpi', 183 | default=False, 184 | action='store_true', 185 | help= 186 | "Run via MPI, this will attempt to discover the necessary variables to initialize torch " 187 | "distributed from the MPI environment") 188 | 189 | return parser 190 | 191 | 192 | def add_config_arguments(parser): 193 | r"""Update the argument parser to enabling parsing of DeepSpeed command line arguments. 194 | The set of DeepSpeed arguments include the following: 195 | 1) --deepspeed: boolean flag to enable DeepSpeed 196 | 2) --deepspeed_config : path of a json configuration file to configure DeepSpeed runtime. 197 | 198 | Arguments: 199 | parser: argument parser 200 | Return: 201 | parser: Updated Parser 202 | """ 203 | parser = _add_core_arguments(parser) 204 | 205 | return parser 206 | --------------------------------------------------------------------------------