├── .gitignore ├── LICENSE ├── README.md ├── SECURITY.md ├── docs ├── artifacts.md ├── bert-intro.md └── dataprep.md ├── finetune ├── PyTorch │ ├── azureml_bert_util.py │ ├── dockerfile │ ├── notebooks │ │ ├── BERT_Eval_GLUE.ipynb │ │ ├── BERT_Eval_SQUAD.ipynb │ │ ├── Pretrained-BERT-GLUE.ipynb │ │ └── Pretrained-BERT-NER.ipynb │ └── run_classifier_azureml.py ├── README.md ├── TensorFlow │ ├── download_model_and_dataset.py │ ├── notebooks │ │ └── Tensorflow-BERT-AzureML.ipynb │ └── run_classifier.py ├── evaluate_squad.py ├── run_classifier_azureml.py └── run_squad_azureml.py └── pretrain ├── PyTorch ├── README.md ├── azureml_adapter.py ├── benchmark.py ├── checkpoint.py ├── configuration.py ├── dataprep │ ├── create_pretraining.py │ ├── sentence_segmentation.py │ ├── single_line_doc_file_creation.py │ └── split_data_into_files.py ├── dataset.py ├── distributed_apex.py ├── logger.py ├── models.py ├── notebooks │ └── BERT_Pretrain.ipynb ├── optimization.py ├── sources.py ├── text.py ├── train.py └── utils.py ├── README.md └── configs ├── bert-base-single-node.json ├── bert-base.json ├── bert-large-single-node.json └── bert-large.json /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # JetBrains Rider 107 | .idea/ 108 | *.sln.iml 109 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BERT on Azure Machine Learning Service 2 | This repo contains end-to-end recipes to [pretrain](#pretrain) and [finetune](#finetune) the [BERT](https://arxiv.org/abs/1810.04805) (Bidirectional Encoder Representations from Transformers) language representation model using [Azure Machine Learning service](https://azure.microsoft.com/en-us/services/machine-learning-service/). 3 | 4 | **Update on 7/7/2020**: 🛑 A more recent implementation for BERT pretraining available at https://github.com/microsoft/onnxruntime-training-examples/tree/master/nvidia-bert is significantly faster than the implementation in this repo. That implementation uses [ONNX Runtime](https://github.com/microsoft/onnxruntime) to accelerate training and it can be used in environments with GPU including Azure Machine Learning service. Details on using ONNX Runtime for training and accelerating training of Transformer models like [BERT](https://arxiv.org/abs/1810.04805) and [GPT-2](https://openai.com/blog/better-language-models/) are available in the blog at [ONNX Runtime Training Technical Deep Dive](https://techcommunity.microsoft.com/t5/azure-ai/onnx-runtime-training-technical-deep-dive/ba-p/1398310). 5 | 6 | ## BERT 7 | BERT is a language representation model that is distinguished by its capacity to effectively capture deep and subtle textual relationships in a corpus. In the original paper, the authors demonstrate that the BERT model could be easily adapted to build state-of-the-art models for a number of NLP tasks, including text classification, named entity recognition and question answering. In this repo, we provide notebooks that allow a developer to pretrain a BERT model from scratch on a corpus, as well as to fine-tune an existing BERT model to solve a specialized task. A brief [introduction to BERT](docs/bert-intro.md) is available in this repo for a quick start on BERT. 8 | 9 | ### Pretrain 10 | ###### Challenges in BERT Pretraining 11 | Pretraining a BERT language representation model to the desired level of accuracy is quite challenging; as a result, most developers start from a BERT model that was pre-trained on a standard corpus (such as Wikipedia), instead of training it from scratch. 
This strategy works well if the final model is being trained on a corpus that is similar to the corpus used in the pre-train step; however, if the problem involves a specialized corpus that's quite different from the standard corpus, the results won't be optimal. Additionally, to advance language representation beyond BERT’s accuracy, users will need to change the model architecture, training data, cost function, tasks, and optimization routines. All these changes need to be explored at large parameter and training data sizes. In the case of BERT-large, this could be quite substantial as it has 340 million parameters and trained over a very large document corpus. To support this with GPUs, machine learning engineers will need distributed training support to train these large models. However, due to the complexity and fragility of configuring these distributed environments, even expert tweaking can end up with inferior results from the trained models. 12 | 13 | To address these issues, this repo is publishing a workflow for pretraining BERT-large models. Developers can now build their own language representation models like BERT using their domain-specific data on GPUs, either with their own hardware or using Azure Machine Learning service. The pretrain recipe in this repo includes the dataset and preprocessing scripts so anyone can experiment with building their own general purpose language representation models beyond BERT. Overall this is a stable, predictable recipe that converges to a good optimum for researchers to try explorations on their own. 14 | 15 | ###### Implementation 16 | The pretraining recipe in this repo is based on the [PyTorch Pretrained BERT v0.6.2](https://github.com/huggingface/pytorch-transformers/tree/v0.6.2) package from [Hugging Face](https://huggingface.co/). The implementation in this pretraining recipe includes optimization techniques such as `gradient accumulation` (gradients are accumulated for smaller mini-batches before updating model weights) and [`mixed precision training`](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html). The notebook and python modules for pretraining are available at [pretrain](./pretrain/) directory. 17 | 18 | ###### Data Preprocessing 19 | Data preparation is one of the important steps in any Machine Learning project. For BERT pretraining, document-level corpus is needed. The quality of the data used for pretraining directly impacts the quality of the trained models. To make the data preprocessing easier and for repeatability of results, data preprocessing code is included in the repo. It may be used to pre-process Wikipedia corpus or other datasets for pretraining. Refer to additional information at [data preparation for pretraining](docs/dataprep.md) for details on that. 20 | 21 | ### Finetune 22 | The finetuning recipe in this repo shows how to finetune the BERT language representation model using Azure Machine Learning service. The notebooks and python modules for finetuning are available at [finetune](./finetune/) directory. We finetune and evaluate our pretrained checkpoints against the following: 23 | 24 | ###### GLUE benchmark 25 | The [General Language Understanding Evaluation (GLUE) benchmark](https://gluebenchmark.com/) is a collection of nine sentence- or sentence-pair language understanding tasks for evaluating and analyzing natural language understanding systems. 
The [BERT_Eval_GLUE.ipynb](./finetune/PyTorch/notebooks/BERT_Eval_GLUE.ipynb) jupyter notebook allows the user to run one of the pretrained checkpoints against these tasks on Azure ML. 26 | 27 | ## Azure Machine Learning service 28 | [Azure Machine Learning service](https://azure.microsoft.com/en-us/services/machine-learning-service/) provides a cloud-based environment to prep data, train, test, deploy, manage, and track machine learning models. This service fully supports open-source technologies such as PyTorch, TensorFlow, and scikit-learn and can be used for any kind of machine learning, from classical ML to deep learning, supervised and unsupervised learning. 29 | 30 | #### Notebooks 31 | Jupyter notebooks can be used to use AzureML Python SDK and submit pretrain and finetune jobs. This repo contains the following notebooks for different activities. 32 | 33 | ###### PyTorch Notebooks 34 | |Activity |Notebook | 35 | |:---|:------| 36 | |Pretrain | [BERT_Pretrain.ipynb](./pretrain/PyTorch/notebooks/BERT_Pretrain.ipynb) | 37 | | [GLUE](https://www.nyu.edu/projects/bowman/glue.pdf) finetune/evaluate | [BERT_Eval_GLUE.ipynb](./finetune/PyTorch/notebooks/BERT_Eval_GLUE.ipynb) | 38 | 39 | ###### TensorFlow Notebooks 40 | |Activity |Notebook | 41 | |:---|:------| 42 | | [GLUE](https://www.nyu.edu/projects/bowman/glue.pdf) finetune/evaluate | [Tensorflow-BERT-AzureML.ipynb](finetune/TensorFlow/notebooks/Tensorflow-BERT-AzureML.ipynb) | 43 | 44 | 45 | ## Code of Conduct 46 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 47 | 48 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. 
Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /docs/artifacts.md: -------------------------------------------------------------------------------- 1 | # Artifacts for pretrain and finetune 2 | 3 | The following artifacts are made available to make pretraining and finetuning of BERT models easier: 4 | * Preprocessed data 5 | * Pretrained BERT-base and BERT-large model checkpoints 6 | 7 | ## Preprocessed Data 8 | The Wikipedia corpus used for BERT pretraining is preprocessed following the [data prep instructions](dataprep.md) and uploaded to https://bertonazuremlwestus2.blob.core.windows.net/public2/bert_data.tar.gz (66 GB). The data files have the sequence length of 512. The directory structure is as follows and this directory hierarchy is assumed in the implementation in [train.py](../pretrain/pytorch/train.py). 9 | ``` 10 | bert_data 11 | │ bert-base.json 12 | │ bert-large.json 13 | │ bert-base-single-node.json 14 | │ bert-large-single-node.json 15 | │ 16 | └───512 17 | │ │ 18 | │ └───wiki_pretrain 19 | │ │ wikipedia_segmented_part_0.bin 20 | │ │ wikipedia_segmented_part_1.bin 21 | │ │ ... 22 | │ │ wikipedia_segmented_part_98.bin 23 | ``` 24 | 25 | Individual data files from wiki_pretrain directory are available at the following urls: 26 | * [wikipedia_segmented_part_0.bin](https://bertonazuremlwestus2.blob.core.windows.net/public2/data/preprocessed/512/wiki_pretrain/wikipedia_segmented_part_0.bin) 27 | * [wikipedia_segmented_part_1.bin](https://bertonazuremlwestus2.blob.core.windows.net/public2/data/preprocessed/512/wiki_pretrain/wikipedia_segmented_part_1.bin) 28 | * [wikipedia_segmented_part_2.bin](https://bertonazuremlwestus2.blob.core.windows.net/public2/data/preprocessed/512/wiki_pretrain/wikipedia_segmented_part_2.bin) 29 | * ... 30 | * [wikipedia_segmented_part_98.bin](https://bertonazuremlwestus2.blob.core.windows.net/public2/data/preprocessed/512/wiki_pretrain/wikipedia_segmented_part_98.bin) 31 | 32 | Use below script to transfer data to your private blob `azcopy copy "https://bertonazuremlwestus2.blob.core.windows.net/public2" "https://.blob.core.windows.net/?" --recursive`. 
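For illustration, the same transfer can also be scripted; this is a minimal sketch, assuming `azcopy` is installed locally, and the destination account, container, and SAS token are hypothetical placeholders to replace with your own values:

```python
import subprocess

# Source: the public container holding the preprocessed data (see above).
source = "https://bertonazuremlwestus2.blob.core.windows.net/public2"
# Destination: your own storage account, container, and SAS token (placeholders).
destination = "https://<your-account>.blob.core.windows.net/<your-container>?<sas-token>"

# Recursively copy everything in the container using azcopy (assumed to be on PATH).
subprocess.run(["azcopy", "copy", source, destination, "--recursive"], check=True)
```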
See more about [Azure Blob Shared Access Signature](https://docs.microsoft.com/en-us/azure/storage/common/storage-dotnet-shared-access-signature-part-1) and [azcopy](https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-blobs). 33 | 34 | ## Pretrained BERT Model Checkpoints 35 | The models pretrained in AzureML based on the original BERT implementation are available at the following locations: 36 | * [BERT-Large, Uncased (original)](https://bertonazuremlwestus2.blob.core.windows.net/public/models/bert_large_uncased_original/bert_encoder_epoch_200.pt) 37 | * [BERT-Base, Uncased (original)](https://bertonazuremlwestus2.blob.core.windows.net/public/models/bert_base_uncased_original/bert_encoder_epoch_0300.pt) 38 | -------------------------------------------------------------------------------- /docs/bert-intro.md: -------------------------------------------------------------------------------- 1 | ## **Natural Language Processing** 2 | 3 | In the natural language processing (NLP) domain, pre-trained language representations have traditionally been a key topic for a few important use cases, such as [named entity recognition](https://arxiv.org/pdf/cs/0306050.pdf) (Sang and Meulder, 2003), [question answering](https://arxiv.org/pdf/1606.05250.pdf) (Rajpurkar et al., 2016), and [syntactic parsing](https://nlp.stanford.edu/~mcclosky/papers/dmcc-naacl-2010.pdf) (McClosky et al., 2010). 4 | 5 | The intuition for utilizing a pre-trained model is simple: A deep neural network that is trained on large corpus, say the entire Wikipedia dataset, should have enough knowledge about the underlying relationships between different words and sentences. One should then be able to adapt this DNN to be used on a different corpus, such as medical documents or financial documents, resulting in a model with better performance than one could obtain by training purely on the specialized corpus. 6 | 7 | Recently, a paper called "[BERT: Bidirectional Encoder Representations from Transformers](https://arxiv.org/abs/1810.04805)" was published by Devlin et al., which achieves new state-of-the-art results on 11 NLP tasks, using the pre-trained approach mentioned above. In this repo, we want to show how customers can efficiently and easily pretrain and then fine-tune BERT for their custom applications using Azure Machine Learning Services. We open sourced the code on [GitHub](https://github.com/Microsoft/AzureML-BERT). 8 | 9 | ## **Intuition behind BERT** 10 | 11 | The intuition behind the new language model, BERT, is simple yet powerful. Researchers believe that a large enough deep neural network model, with large enough training corpus, should capture the contextual relations in the corpus. In NLP domain, it is hard to get a large annotated corpus, so researchers used a novel technique to get a lot of training data. Instead of having human beings label the corpus and feed it into neural networks, researchers use the large Internet available corpus such as English Wikipedia with 2,500M words. Two approaches, each for different language tasks, are used to generate the labels for the language model. 12 | 13 | - **Masked language model:** To understand the relationship between words. The key idea is to mask some of the words in the sentence (around 15 percent) and use those masked words as labels to force the models to learn the relationship between words. For example, the original sentence would be: 14 | 15 | ``` 16 | The man went to the store. He bought a gallon of milk. 
17 | ``` 18 | 19 | And the input/label pair to the language model is: 20 | 21 | ``` 22 | Input: The man went to the [MASK1]. He bought a [MASK2] of milk. 23 | 24 | Labels: [MASK1] = store; [MASK2] = gallon 25 | ``` 26 | 27 | - **Sentence prediction task:** To understand the relationship between sentences. This task helps the model predict whether sentence B is likely to be the next sentence following a given sentence A. Using the same example above, we can generate training data like: 28 | 29 | ``` 30 | Sentence A: The man went to the store. 31 | 32 | Sentence B: He bought a gallon of milk. 33 | 34 | Label: IsNextSentence 35 | ``` 36 | 37 | ## **Applying BERT to a customized dataset** 38 | 39 | After BERT is trained on a large corpus (say all the available English Wikipedia) using the above steps, the assumption is that because the dataset is huge, the model can inherit a lot of knowledge about the English language. The next step is to fine-tune the model on different tasks, hoping the model can adapt to a new domain more quickly. The key idea is to use the large BERT model trained above and add different input/output layers for different types of tasks. For example, you might want to do sentiment analysis for a customer support department. This is a classification problem, so you might need to add an output classification layer (as shown on the left in the figure below) and structure your input. For a different task, say question answering, you might need to use a different input/output layer, where the input is the question and the corresponding paragraph, while the output is the start/end answer span for the question (see the figure on the right). In each case, BERT is designed so that data scientists can easily plug in different layers, allowing it to be adapted to different tasks. 40 | 41 | ![Adapting BERT for different tasks](https://azurecomcdn.azureedge.net/mediahandler/acomblog/media/Default/blog/39717ecf-8274-46c4-862d-21ca377b1957.png) 42 | 43 | _Figure 1. Adapting BERT for different tasks (_[_Source_](https://arxiv.org/pdf/1810.04805.pdf)_)_ 44 | 45 | The image below shows the results on one of the most popular datasets in the NLP field, the [Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer/). 46 | 47 | ![Reported BERT performance on SQuAD 1.1 dataset](https://azurecomcdn.azureedge.net/mediahandler/acomblog/media/Default/blog/c37ee936-a5d2-4878-b8e2-ffc02a2797f2.png) 48 | 49 | _Figure 2. Reported BERT performance on SQuAD 1.1 dataset (_[_Source_](https://arxiv.org/pdf/1810.04805.pdf)_)._ 50 | 51 | In the GitHub repository, we demonstrate the [General Language Understanding Evaluation (GLUE)](https://gluebenchmark.com/) (Wang et al., 2018) tasks. 52 | -------------------------------------------------------------------------------- /docs/dataprep.md: -------------------------------------------------------------------------------- 1 | # Data Preparation for BERT Pretraining 2 | The following steps prepare the Wikipedia corpus for pretraining. However, these steps can be used with little or no modification to preprocess other datasets as well: 3 | 4 | 1. Download the wiki dump file from https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2. 5 | This is a bzip2-compressed archive and needs to be decompressed. 6 | 2.
Clone [Wikiextractor](https://github.com/attardi/wikiextractor), and run it: 7 | ``` 8 | git clone https://github.com/attardi/wikiextractor 9 | python3 wikiextractor/WikiExtractor.py -o out -b 1000M enwiki-latest-pages-articles.xml 10 | ``` 11 | Running time can be 5-10 minutes/GB. 12 | _output:_ `out` directory 13 | 3. Run: 14 | ``` 15 | ln -s out out2 16 | python3 AzureML-BERT/pretrain/PyTorch/dataprep/single_line_doc_file_creation.py 17 | ``` 18 | This script removes html tags and empty lines and outputs to one file where each line is a paragraph. 19 | (`pip install tqdm` if needed.) 20 | _output:_ `wikipedia.txt` 21 | 4. Run: 22 | ``` 23 | python3 AzureML-BERT/pretrain/PyTorch/dataprep/sentence_segmentation.py wikipedia.txt wikipedia.segmented.nltk.txt 24 | ``` 25 | This script converts `wikipedia.txt` to one file where each line is a sentence. 26 | (`pip install nltk` if needed.) 27 | _output:_ `wikipedia.segmented.nltk.txt` 28 | 5. Split the above output file into ~100 files by line with: 29 | ``` 30 | mkdir data_shards 31 | python3 AzureML-BERT/pretrain/PyTorch/dataprep/split_data_into_files.py 32 | ``` 33 | _output:_ `data_shards` directory 34 | 6. Run: 35 | ``` 36 | python3 AzureML-BERT/pretrain/PyTorch/dataprep/create_pretraining.py --input_dir=data_shards --output_dir=pickled_pretrain_data --do_lower_case=true 37 | ``` 38 | This script will convert each file into pickled `.bin` file. 39 | _output:_ `pickled_pretrain_data` directory 40 | 41 | -------------------------------------------------------------------------------- /finetune/PyTorch/azureml_bert_util.py: -------------------------------------------------------------------------------- 1 | from horovod.torch.mpi_ops import allreduce, allreduce_async_, synchronize 2 | from horovod.torch.compression import Compression 3 | import horovod.torch as hvd 4 | import torch 5 | import time 6 | 7 | from collections import OrderedDict 8 | try: 9 | from apex_C import flatten 10 | from apex_C import unflatten 11 | except ImportError: 12 | try: 13 | _ = warned_flatten 14 | except NameError: 15 | print("Warning: apex was installed without --cpp_ext. 
Falling back to Python flatten and unflatten.") 16 | warned_flatten = True 17 | from torch._utils import _flatten_dense_tensors as flatten 18 | from torch._utils import _unflatten_dense_tensors as unflatten 19 | 20 | 21 | def warmup_linear(x, warmup=0.002): 22 | if x < warmup: 23 | return x/warmup 24 | return 1.0 - x 25 | 26 | 27 | def adjust_gradient_accumulation_steps(x, initial_steps, target_steps, warmup): 28 | return min(max(int(x/warmup*target_steps), initial_steps), target_steps) 29 | 30 | 31 | class DistributedCommunicator: 32 | def __init__(self, accumulation_step=1): 33 | hvd.init() 34 | self.local_rank = hvd.local_rank() 35 | self.world_size = hvd.size() 36 | self.rank = hvd.rank() 37 | self.n_gpu = torch.cuda.device_count() 38 | self.node_count = self.world_size // self.n_gpu 39 | self.accumulation_step = accumulation_step 40 | self.count_down = accumulation_step - 1 41 | self._multi_node = self.node_count > 1 42 | if not self._multi_node: 43 | # use PyTorch build-in NCCL backend for single node training 44 | torch.distributed.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:6000', 45 | world_size=self.n_gpu, rank=self.local_rank) 46 | 47 | 48 | def register_model(self, model, fp16): 49 | # broadcast model parameters 50 | if self.node_count > 1: 51 | hvd.broadcast_parameters(model.state_dict(), root_rank=0) 52 | else: 53 | for param in model.parameters(): 54 | torch.distributed.broadcast_multigpu([param], 0) 55 | 56 | # register hook for reduce when backpropagate 57 | self._parameter_names = {v: k for k, v in sorted(model.named_parameters())} 58 | self._handles = {} 59 | self._requires_update = set() 60 | self._grad_accs = [] 61 | self._grad = [] 62 | self._compression = hvd.Compression.fp16 if fp16 else hvd.Compression.none 63 | for p in model.parameters(): 64 | if p.requires_grad: 65 | p.grad = p.data.new(p.size()).zero_() 66 | self._requires_update.add(p) 67 | p_tmp = p.expand_as(p) 68 | grad_acc = p_tmp.grad_fn.next_functions[0][0] 69 | grad_acc.register_hook(self._make_hook(p)) 70 | self._grad_accs.append(grad_acc) 71 | 72 | 73 | def _allreduce_tensor(self, p): 74 | assert p not in self._handles 75 | assert not p.grad.requires_grad 76 | tensor = p.grad 77 | name = self._parameter_names.get(p) 78 | if self._multi_node: 79 | tensor_compressed, ctx = self._compression.compress(tensor) 80 | handle = allreduce_async_(tensor_compressed, average=True, name=name) 81 | self._handles[p] = (handle, ctx) 82 | else: 83 | self._handles[p] = tensor 84 | 85 | 86 | def _make_hook(self, p): 87 | def hook(*ignore): 88 | if self.count_down == 0: 89 | self._allreduce_tensor(p) 90 | return hook 91 | 92 | 93 | def synchronize(self): 94 | synced = False 95 | if self.count_down == 0: 96 | missing_p = self._requires_update - set(self._handles.keys()) 97 | for p in missing_p: 98 | self._allreduce_tensor(p) 99 | 100 | if self._multi_node: 101 | for p, value in self._handles.items(): 102 | handle, ctx = value 103 | output = synchronize(handle) 104 | p.grad.set_(self._compression.decompress(output, ctx) / self.accumulation_step) 105 | else: 106 | buckets = OrderedDict() 107 | for tensor in self._handles.values(): 108 | tp = tensor.type() 109 | if tp not in buckets: 110 | buckets[tp] = [] 111 | buckets[tp].append(tensor) 112 | for tp in buckets: 113 | bucket = buckets[tp] 114 | coalesced = flatten(bucket) / self.world_size / self.accumulation_step 115 | torch.distributed.all_reduce_multigpu([coalesced]) 116 | for buf, synced in zip(bucket, unflatten(coalesced, bucket)): 117 | 
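                        # Copy each averaged, unflattened gradient back into its original grad buffer.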
buf.copy_(synced) 118 | self._handles.clear() 119 | synced = True 120 | self.count_down = self.accumulation_step 121 | 122 | self.count_down -= 1 123 | return synced 124 | 125 | def set_accumulation_step(self, accumulation_step): 126 | self.accumulation_step = accumulation_step 127 | self.count_down = self.accumulation_step - 1 -------------------------------------------------------------------------------- /finetune/PyTorch/dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/azureml/base-gpu:0.2.1 2 | 3 | RUN apt update && apt install git -y && rm -rf /var/lib/apt/lists/* 4 | 5 | RUN pip install numpy torch boto3 tqdm 6 | 7 | RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext 8 | 9 | RUN pip install horovod 10 | 11 | RUN pip install azureml-sdk -------------------------------------------------------------------------------- /finetune/PyTorch/notebooks/BERT_Eval_GLUE.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Copyright (c) Microsoft Corporation. All rights reserved.\n", 8 | "\n", 9 | "Licensed under the MIT License." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# PyTorch Pretrained BERT on AzureML with GLUE Dataset\n", 17 | "\n", 18 | "In this notebook, you will find the following contents:\n", 19 | "- Download GLUE dataset on the remote compute and store them in Azure storage\n", 20 | "- Speed-up fine-tuning BERT for GLUE dataset on AzureML GPU clusters" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Prerequisites\n", 28 | "Follow instructions in BERT_pretraining.ipynb notebook for setting up AzureML" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Check core SDK version number\n", 38 | "import azureml.core\n", 39 | "\n", 40 | "print(\"SDK version:\", azureml.core.VERSION)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## Initialize workspace\n", 48 | "\n", 49 | "To create or access an Azure ML Workspace, you will need to import the AML library and the following information:\n", 50 | "* A name for your workspace\n", 51 | "* Your subscription id\n", 52 | "* The resource group name\n", 53 | "\n", 54 | "Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step or create a new one. " 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "from azureml.core.workspace import Workspace\n", 64 | "ws = Workspace.setup()\n", 65 | "ws_details = ws.get_details()\n", 66 | "print('Name:\\t\\t{}\\nLocation:\\t{}'\n", 67 | " .format(ws_details['name'],\n", 68 | " ws_details['location']))\n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### Create an experiment\n", 76 | "Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed PyTorch tutorial. 
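A minimal sketch of what that looks like with the AzureML SDK is below (the experiment is actually created later in this notebook, just before submitting the run; the name used here is only an example):

```python
from azureml.core import Experiment

# 'ws' is the Workspace object initialized earlier in this notebook.
experiment = Experiment(ws, name="bert-glue-finetune")  # example name
```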
" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## Download GLUE dataset on the remote compute\n", 84 | "\n", 85 | "Before we start to fine-tune the pretained BERT model, we need to download the [GLUE data](https://gluebenchmark.com/tasks) by running the [script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) and unpack it to an Azure Blob container." 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Define AzureML datastore to collect training dataset\n", 93 | "\n", 94 | "To make data accessible for remote training, AML provides a convenient way to do so via a [Datastore](https://docs.microsoft.com/azure/machine-learning/service/how-to-access-data). The datastore provides a mechanism for you to upload/download data to Azure Storage, and interact with it from your remote compute targets.\n", 95 | "\n", 96 | "Each workspace is associated with a default Azure Blob datastore named `'workspaceblobstore'`. In this work, we use this default datastore to collect the GLUE training dataset ." 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "from azureml.core import Datastore\n", 106 | "ds = ws.get_default_datastore()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "### Create a project directory\n", 114 | "Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on." 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "import os\n", 124 | "import os.path as path\n", 125 | "project_root = path.abspath(path.join(os.getcwd(),\"../../../\"))" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "Download GLUE dataset in BingBert/ directory" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "ds.upload(src_dir=os.path.join(project_root,'data','glue_data'), target_path='glue_data')" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "Create a folder named \"bert-large-checkpoints\" which contains the .pt bert checkpoint file against which you want to run your eval tasks. The following code will upload the folder to the datastore. 
The URL for the checkpoint is: https://bertonazuremlwestus2.blob.core.windows.net/public/models/bert_large_uncased_original/bert_encoder_epoch_200.pt" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "ds.upload(src_dir=os.path.join(project_root,'data','bert-large-checkpoints') , target_path='bert-large-checkpoints')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "Uploading bert-large config file to datastore" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "scrolled": true 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "ds.upload(src_dir=os.path.join(project_root,'pretrain','configs'), target_path='config')" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "**Remove /data folder to avoid uploading folder greater than 300MB.**" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "## Fine-tuning BERT with Distributed Training\n", 190 | "As our `GLUE` dataset are ready in Azure storage, we can start the fine-tune the model by exploting the power of distributed training. " 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "### Create a GPU remote compute target\n", 198 | "\n", 199 | "We need to create a GPU [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) to perform the fine-tuning. In this example, we create an AmlCompute cluster as our training compute resource.\n", 200 | "\n", 201 | "This code creates a cluster for you if it does not already exist in your workspace." 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "from azureml.core.compute import ComputeTarget, AmlCompute\n", 211 | "from azureml.core.compute_target import ComputeTargetException\n", 212 | "\n", 213 | "# choose a name for your cluster\n", 214 | "gpu_cluster_name = \"bertcodetesting\"\n", 215 | "\n", 216 | "try:\n", 217 | " gpu_compute_target = ComputeTarget(workspace=ws, name=gpu_cluster_name)\n", 218 | " print('Found existing compute target.')\n", 219 | "except ComputeTargetException:\n", 220 | " print('Creating a new compute target...')\n", 221 | " compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC24', max_nodes=4)\n", 222 | "\n", 223 | " # create the cluster\n", 224 | " gpu_compute_target = ComputeTarget.create(ws, gpu_cluster_name, compute_config)\n", 225 | " gpu_compute_target.wait_for_completion(show_output=True)\n", 226 | "\n", 227 | "# Use the 'status' property to get a detailed status for the current cluster. \n", 228 | "print(gpu_compute_target.status.serialize())" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "### Create a PyTorch estimator for fine-tuning\n", 236 | "Let us create a new PyTorch estimator to run the fine-tuning script `run_classifier.py`, that is already provided at [the original repository](https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_classifier.py). Please refer [here](https://github.com/huggingface/pytorch-pretrained-BERT#fine-tuning-with-bert-running-the-examples) for more detail about the script. 
\n", 237 | "\n", 238 | "The original `run_classifier.py` script uses PyTorch distributed launch untility to launch multiple processes across nodes and GPUs. We prepared a modified version [run_classifier_azureml.py](./run_classifier_azureml.py) so that we can launch it based on AzureML build-in MPI backend.\n", 239 | "\n", 240 | "To use AML's tracking and metrics capabilities, we need to add a small amount of AzureML code inside the training script.\n", 241 | "\n", 242 | "In `run_classifier_azureml.py`, we will log some metrics to our AML run. To do so, we will access the AML run object within the script:\n", 243 | "```Python\n", 244 | "from azureml.core.run import Run\n", 245 | "run = Run.get_context()\n", 246 | "```\n", 247 | "Further within `run_classifier_azureml.py`, we log learning rate, training loss and evaluation accuracy the model achieves as:\n", 248 | "```Python\n", 249 | "run.log('lr', np.float(args.learning_rate))\n", 250 | "...\n", 251 | "\n", 252 | "for step, batch in enumerate(tqdm(train_dataloader, desc=\"Iteration\")): \n", 253 | " ...\n", 254 | " run.log('train_loss', np.float(loss))\n", 255 | "\n", 256 | "...\n", 257 | "\n", 258 | "result = {'eval_loss': eval_loss,\n", 259 | " 'eval_accuracy': eval_accuracy}\n", 260 | "for key in sorted(result.keys()):\n", 261 | " run.log(key, str(result[key]))\n", 262 | "```" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "The following code runs GLUE RTE task against a bert-large checkpoint with the parameters used by Huggingface for finetuning.\n", 270 | "- num_train_epochs = 3\n", 271 | "- max_seq_length = 128\n", 272 | "- train_batch_size = 8\n", 273 | "- learning_rate = 2e-5\n", 274 | "- grad_accumulation_step = 2" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "from azureml.train.dnn import PyTorch\n", 284 | "from azureml.core.runconfig import RunConfiguration\n", 285 | "from azureml.core.container_registry import ContainerRegistry\n", 286 | "\n", 287 | "run_user_managed = RunConfiguration()\n", 288 | "run_user_managed.environment.python.user_managed_dependencies = True\n", 289 | "\n", 290 | "# Using a pre-defined public docker image published on AzureML\n", 291 | "image_name = 'mcr.microsoft.com/azureml/bert:pretrain-openmpi3.1.2-cuda10.0-cudnn7-ubuntu16.04'\n", 292 | "\n", 293 | "estimator = PyTorch(source_directory='../../../',\n", 294 | " compute_target=gpu_compute_target,\n", 295 | " #Docker image\n", 296 | " use_docker=True,\n", 297 | " custom_docker_image=image_name,\n", 298 | " user_managed=True,\n", 299 | " \n", 300 | " script_params = {\n", 301 | " '--bert_model':'bert-large-uncased',\n", 302 | " \"--model_file_location\": ds.path('bert-large-checkpoints/').as_mount(),\n", 303 | " '--task_name': 'RTE',\n", 304 | " '--data_dir': ds.path('glue_data/RTE/').as_mount(),\n", 305 | " '--do_train' : '',\n", 306 | " '--do_eval': '', \n", 307 | " '--do_lower_case': '',\n", 308 | " '--max_seq_length': 128,\n", 309 | " '--train_batch_size': 8,\n", 310 | " '--gradient_accumulation_steps': 2,\n", 311 | " '--learning_rate': 2e-5,\n", 312 | " '--num_train_epochs': 3.0,\n", 313 | " '--output_dir': ds.path('output/').as_mount(),\n", 314 | " '--model_file': 'bert_encoder_epoch_245.pt',\n", 315 | " '--fp16': \"\"\n", 316 | " },\n", 317 | " entry_script='./finetune/run_classifier_azureml.py',\n", 318 | " node_count=1,\n", 319 | " process_count_per_node=4,\n", 320 | " 
distributed_backend='mpi',\n", 321 | " use_gpu=True)\n", 322 | "\n", 323 | "# path to the Python environment in the custom Docker image\n", 324 | "estimator._estimator_config.environment.python.interpreter_path = '/opt/miniconda/envs/amlbert/bin/python'" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "### Submit and Monitor your run" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "from azureml.core import Experiment\n", 341 | "\n", 342 | "experiment_name = 'bert-large-RTE'\n", 343 | "experiment = Experiment(ws, name=experiment_name)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "run = experiment.submit(estimator)\n", 353 | "from azureml.widgets import RunDetails\n", 354 | "RunDetails(run).show()" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "#run.cancel()" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [] 372 | } 373 | ], 374 | "metadata": { 375 | "authors": [ 376 | { 377 | "name": "aagarg" 378 | } 379 | ], 380 | "kernelspec": { 381 | "display_name": "Python 3", 382 | "language": "python", 383 | "name": "python3" 384 | }, 385 | "language_info": { 386 | "codemirror_mode": { 387 | "name": "ipython", 388 | "version": 3 389 | }, 390 | "file_extension": ".py", 391 | "mimetype": "text/x-python", 392 | "name": "python", 393 | "nbconvert_exporter": "python", 394 | "pygments_lexer": "ipython3", 395 | "version": "3.6.7" 396 | }, 397 | "msauthor": "aagarg" 398 | }, 399 | "nbformat": 4, 400 | "nbformat_minor": 2 401 | } 402 | -------------------------------------------------------------------------------- /finetune/PyTorch/notebooks/BERT_Eval_SQUAD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Copyright (c) Microsoft Corporation. All rights reserved.\n", 8 | "\n", 9 | "Licensed under the MIT License." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# PyTorch Pretrained BERT on AzureML with SQuAD Dataset\n", 17 | "\n", 18 | "In this notebook, you will find the following contents:\n", 19 | "- Download SQuAD dataset on the remote compute and store them in Azure storage\n", 20 | "- Speed-up fine-tuning BERT for SQuAD dataset on AzureML GPU clusters" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Prerequisites\n", 28 | "Follow instructions in BERT_pretraining.ipynb notebook for setting up AzureML" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Check core SDK version number\n", 38 | "import azureml.core\n", 39 | "\n", 40 | "print(\"SDK version:\", azureml.core.VERSION)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## Initialize workspace\n", 48 | "\n", 49 | "To create or access an Azure ML Workspace, you will need to import the AML library and the following information:\n", 50 | "* A name for your workspace\n", 51 | "* Your subscription id\n", 52 | "* The resource group name\n", 53 | "\n", 54 | "Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace or create a new one. " 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "from azureml.core.workspace import Workspace\n", 64 | "ws = Workspace.setup()\n", 65 | "ws_details = ws.get_details()\n", 66 | "print('Name:\\t\\t{}\\nLocation:\\t{}'\n", 67 | " .format(ws_details['name'],\n", 68 | " ws_details['location']))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### Create a project directory\n", 76 | "Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "import os\n", 86 | "import os.path as path\n", 87 | "project_root = path.abspath(path.join(os.getcwd(),\"../../../\"))" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "### Define AzureML datastore to collect training dataset\n", 95 | "\n", 96 | "To make data accessible for remote training, AML provides a convenient way to do so via a [Datastore](https://docs.microsoft.com/azure/machine-learning/service/how-to-access-data). The datastore provides a mechanism for you to upload/download data to Azure Storage, and interact with it from your remote compute targets.\n", 97 | "\n", 98 | "Each workspace is associated with a default Azure Blob datastore named `'workspaceblobstore'`. In this work, we use this default datastore to collect the SQuAD training data ." 
99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "from azureml.core import Datastore\n", 108 | "ds = ws.get_default_datastore()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "The data for SQuAD can be downloaded with the following links and should be saved in a blob storage.\n", 116 | "- [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)\n", 117 | "- [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)\n", 118 | "- [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "The following code will upload the training data to the path ./squad on the default datastore." 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "import os\n", 135 | "ds.upload(src_dir=project_root+'\\data\\squad', target_path='squad')\n", 136 | "ds.upload(src_dir=os.path.join(project_root,'data','bert-large-checkpoints') , target_path='bert-large-checkpoints')" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "### Create an experiment\n", 144 | "Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed PyTorch tutorial. " 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "from azureml.core import Experiment\n", 154 | "\n", 155 | "experiment_name = 'BERT-SQuAD'\n", 156 | "experiment = Experiment(ws, name=experiment_name)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "## Fine-tuning BERT with Distributed Training\n", 164 | "As our `SQuAD` dataset are ready in Azure storage, we can start the fine-tune the model by exploting the power of distributed training. " 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "### Create a GPU remote compute target\n", 172 | "\n", 173 | "We need to create a GPU [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) to perform the fine-tuning. In this example, we create an AmlCompute cluster as our training compute resource.\n", 174 | "\n", 175 | "This code creates a cluster for you if it does not already exist in your workspace." 
176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "from azureml.core.compute import ComputeTarget, AmlCompute\n", 185 | "from azureml.core.compute_target import ComputeTargetException\n", 186 | "\n", 187 | "# choose a name for your cluster\n", 188 | "gpu_cluster_name = \"bertcodetesting\"\n", 189 | "\n", 190 | "try:\n", 191 | " gpu_compute_target = ComputeTarget(workspace=ws, name=gpu_cluster_name)\n", 192 | " print('Found existing compute target.')\n", 193 | "except ComputeTargetException:\n", 194 | " print('Creating a new compute target...')\n", 195 | " \n", 196 | " compute_config = AmlCompute.provisioning_configuration(vm_size=\"STANDARD_NC24s_v3\", max_nodes=4)\n", 197 | "\n", 198 | " # create the cluster\n", 199 | " gpu_compute_target = AmlCompute.create(ws, gpu_cluster_name, compute_config)\n", 200 | " gpu_compute_target.wait_for_completion(show_output=True)\n", 201 | "\n", 202 | " # Use the 'status' property to get a detailed status for the current cluster. \n", 203 | " print(gpu_compute_target.status.serialize())" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "### Create a PyTorch estimator for fine-tuning\n", 211 | "Let us create a new PyTorch estimator to run the fine-tuning script `run_squad.py`, that is already provided at [the original repository](https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_squad.py). Please refer [here](https://github.com/huggingface/pytorch-pretrained-BERT#fine-tuning-with-bert-running-the-examples) for more detail about the script. \n", 212 | "\n", 213 | "The original `run_squad.py` script uses PyTorch distributed launch untility to launch multiple processes across nodes and GPUs. We prepared a modified version [run_squad_azureml.py](./run_squad_azureml.py) so that we can launch it based on AzureML build-in MPI backend.\n", 214 | "\n", 215 | "To use AML's tracking and metrics capabilities, we need to add a small amount of AzureML code inside the training script.\n", 216 | "\n", 217 | "In `run_squad_azureml.py`, we will log some metrics to our AML run. 
To do so, we will access the AML run object within the script:\n", 218 | "```Python\n", 219 | "from azureml.core.run import Run\n", 220 | "run = Run.get_context()\n", 221 | "```\n", 222 | "Further within `run_squad_azureml.py`, we log learning rate, training loss and prediction scores the model achieves as:\n", 223 | "```Python\n", 224 | "run.log('lr', np.float(args.learning_rate))\n", 225 | "...\n", 226 | "\n", 227 | "for step, batch in enumerate(tqdm(train_dataloader, desc=\"Iteration\")): \n", 228 | " ...\n", 229 | " run.log('train_loss', np.float(loss))\n", 230 | "\n", 231 | "..\n", 232 | "```" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "from azureml.train.dnn import PyTorch\n", 242 | "from azureml.core.runconfig import RunConfiguration\n", 243 | "from azureml.core.container_registry import ContainerRegistry\n", 244 | "\n", 245 | "run_user_managed = RunConfiguration()\n", 246 | "run_user_managed.environment.python.user_managed_dependencies = True\n", 247 | "\n", 248 | "# Define custom Docker image info\n", 249 | "image_name = 'mcr.microsoft.com/azureml/bert:pretrain-openmpi3.1.2-cuda10.0-cudnn7-ubuntu16.04'\n", 250 | "\n", 251 | "estimator = PyTorch(source_directory='../../../',\n", 252 | " compute_target=gpu_compute_target,\n", 253 | " #Docker image\n", 254 | " use_docker=True,\n", 255 | " custom_docker_image=image_name,\n", 256 | " user_managed=True,\n", 257 | " script_params = {\n", 258 | " '--bert_model':'bert-large-uncased',\n", 259 | " \"--model_file_location\": ds.path('bert-large-checkpoints/').as_mount(),\n", 260 | " '--model_file': 'bert_encoder_epoch_245.pt',\n", 261 | " '--do_train' : '',\n", 262 | " '--do_predict': '',\n", 263 | " '--train_file': ds.path('squad/train-v1.1.json').as_mount(),\n", 264 | " '--predict_file': ds.path('squad/dev-v1.1.json').as_mount(),\n", 265 | " '--max_seq_length': 512,\n", 266 | " '--train_batch_size': 8,\n", 267 | " '--learning_rate': 3e-5,\n", 268 | " '--num_train_epochs': 2.0,\n", 269 | " '--doc_stride': 128,\n", 270 | " '--seed': 32,\n", 271 | " '--gradient_accumulation_steps':4,\n", 272 | " '--warmup_proportion':0.25,\n", 273 | " '--output_dir': './outputs',\n", 274 | " '--fp16':'',\n", 275 | " #'--loss_scale':128,\n", 276 | " },\n", 277 | " entry_script='./finetune/run_squad_azureml.py',\n", 278 | " node_count=1,\n", 279 | " process_count_per_node=4,\n", 280 | " distributed_backend='mpi',\n", 281 | " use_gpu=True)\n", 282 | "\n", 283 | "# path to the Python environment in the custom Docker image\n", 284 | "estimator._estimator_config.environment.python.interpreter_path = '/opt/miniconda/envs/amlbert/bin/python'" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "### Submit and Monitor your run" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": { 298 | "scrolled": false 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "run = experiment.submit(estimator)\n", 303 | "from azureml.widgets import RunDetails\n", 304 | "RunDetails(run).show()" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "To achieve over **90.5 F1 score** and **83.5 Exact-Match** with `SQuAD v1.1` dataset, it requires **2** epochs when fine-tune with `BERT large` model. Below please find the elapsed time using deferent Azure GPU VMs and configures. 
\n", 312 | "\n", 313 | "The default configuration in this notebook uses 2 `STANDARD_NC24rs_v3` (8 x V100) with `fp16` enabled. The training phase should take **22 mins** to complete 2 epochs. \n", 314 | "\n", 315 | "| GPU counts \t| 1 GPU \t| 2 GPU \t| 4 GPU \t| 8 GPU \t|\n", 316 | "|------------:\t|:-----------:\t|--------------:\t|------------\t|------------\t|\n", 317 | "| NCv3-series \t| 340 mins | 180 mins \t | 80 mins \t| 48 mins \t|\n", 318 | "| NCv3 with fp16| 140 mins | 79 mins \t | 38 mins \t| 22 mins \t|" 319 | ] 320 | } 321 | ], 322 | "metadata": { 323 | "authors": [ 324 | { 325 | "name": "aagarg" 326 | } 327 | ], 328 | "kernelspec": { 329 | "display_name": "Python 3", 330 | "language": "python", 331 | "name": "python3" 332 | }, 333 | "language_info": { 334 | "codemirror_mode": { 335 | "name": "ipython", 336 | "version": 3 337 | }, 338 | "file_extension": ".py", 339 | "mimetype": "text/x-python", 340 | "name": "python", 341 | "nbconvert_exporter": "python", 342 | "pygments_lexer": "ipython3", 343 | "version": "3.6.7" 344 | }, 345 | "msauthor": "aagarg" 346 | }, 347 | "nbformat": 4, 348 | "nbformat_minor": 2 349 | } -------------------------------------------------------------------------------- /finetune/PyTorch/notebooks/Pretrained-BERT-GLUE.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Copyright (c) Microsoft Corporation. All rights reserved.\n", 8 | "\n", 9 | "Licensed under the MIT License." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# PyTorch Pretrained BERT on AzureML with GLUE Dataset\n", 17 | "This notebook contains an end-to-end walkthrough of using Azure Machine Learning Service to run [PyTorch reimplementation](https://github.com/huggingface/pytorch-pretrained-BERT) of [Google's TensorFlow repository for the BERT model](https://github.com/google-research/bert) developed by Hugging Face.\n", 18 | "\n", 19 | "You will find the following contents:\n", 20 | "- Download GLUE dataset on the remote compute and store them in Azure storage\n", 21 | "- Speep-up fine-tuning BERT for GLUE dataset on AzureML GPU clusters\n", 22 | "- Further fine-tune BERT wtih AzureML hyperparameter optimizer " 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Prerequisites\n", 30 | "- Understand the [architecture and terms](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture) introduced by Azure Machine Learning (AML)\n", 31 | "\n", 32 | "- Install the Python SDK: make sure to install notebook, and contrib\n", 33 | "```\n", 34 | "conda create -n azureml -y Python=3.6\n", 35 | "source activate azureml\n", 36 | "pip install --upgrade azureml-sdk[notebooks,contrib] \n", 37 | "conda install ipywidgets\n", 38 | "jupyter nbextension install --py --user azureml.widgets\n", 39 | "jupyter nbextension enable azureml.widgets --user --py\n", 40 | "```\n", 41 | "\n", 42 | "You will need to restart jupyter after this\n", 43 | "Detailed instructions are here: https://docs.microsoft.com/en-us/azure/machine-learning/service/quickstart-create-workspace-with-python " 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "# Check core SDK version number\n", 53 | "import azureml.core\n", 54 | "\n", 55 | "print(\"SDK version:\", 
azureml.core.VERSION)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Initialize workspace\n", 63 | "\n", 64 | "To create or access an Azure ML Workspace, you will need to import the AML library and the following information:\n", 65 | "* A name for your workspace\n", 66 | "* Your subscription id\n", 67 | "* The resource group name\n", 68 | "\n", 69 | "Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step or create a new one. " 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "from azureml.core.workspace import Workspace\n", 79 | "\n", 80 | "workspace_name = ''\n", 81 | "subscription_id = ''\n", 82 | "resource_group_name = ''\n", 83 | "location = ''\n", 84 | "\n", 85 | "ws = Workspace._get_or_create(workspace_name,\n", 86 | " subscription_id=subscription_id,\n", 87 | " resource_group=resource_group_name,\n", 88 | " location=location)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "### Create an experiment\n", 96 | "Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed PyTorch tutorial. " 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "from azureml.core import Experiment\n", 106 | "\n", 107 | "experiment_name = 'BERT-GLUE'\n", 108 | "experiment = Experiment(ws, name=experiment_name)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "## Download GLUE dataset on the remote compute\n", 116 | "\n", 117 | "Before we start to fine-tune the pretained BERT model, we need to download the [GLUE data](https://gluebenchmark.com/tasks) by running the [script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) and unpack it to an Azure Blob container." 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "### Define AzureML datastore to collect training dataset\n", 125 | "\n", 126 | "To make data accessible for remote training, AML provides a convenient way to do so via a [Datastore](https://docs.microsoft.com/azure/machine-learning/service/how-to-access-data). The datastore provides a mechanism for you to upload/download data to Azure Storage, and interact with it from your remote compute targets.\n", 127 | "\n", 128 | "Each workspace is associated with a default Azure Blob datastore named `'workspaceblobstore'`. In this work, we use this default datastore to collect the GLUE training dataset ." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "from azureml.core import Datastore\n", 138 | "ds = Datastore(ws, 'workspaceblobstore')" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "### Create a project directory\n", 146 | "Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on." 
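When you submit a run, everything under this project directory is snapshotted and uploaded with the job, so it is worth keeping the folder small. If large local files (for example downloaded datasets or checkpoints) end up inside it, you can exclude them with an `.amlignore` file placed at the root of the project directory. A minimal, purely illustrative sketch; the entries are assumptions and should be adjusted to whatever you actually keep locally:

```
# .amlignore (illustrative entries)
.git/
glue/
*.ckpt
*.tar
```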
147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "import os\n", 156 | "\n", 157 | "project_folder = './pytorch-pretrained-BERT'" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "Make a local clone of the original [PyTorch reimplementation](https://github.com/huggingface/pytorch-pretrained-BERT) repository" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "!git clone -b v0.4.0 https://github.com/huggingface/pytorch-pretrained-BERT.git" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "We need to run the [script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) to download the [GLUE data](https://gluebenchmark.com/tasks) in the mounted Azure Blob container. In our example, we only download `MRPC` dataset" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "import urllib, os\n", 190 | "urllib.request.urlretrieve( 'https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/becd574dd938f045ea5bd3cb77d1d506541b5345/download_glue_data.py', filename='./download_glue_data.py')\n", 191 | "import download_glue_data\n", 192 | "download_glue_data.main([\"--tasks\", \"MRPC\", \"--data_dir\",\"./glue\"])" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "Please note that, if you receive `UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 1183: character maps to `. Please modify all `'with open(...)'` operations in the downloaded `download_glue_data.py` to `'open(.., encoding=\"utf8\")'`" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "The following code will upload the training data to the path ./glue on the default datastore." 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "ds.upload(src_dir='./glue', target_path='./glue')" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "## Fine-tuning BERT with Distributed Training\n", 223 | "As our `GLUE` dataset are ready in Azure storage, we can start the fine-tune the model by exploting the power of distributed training. " 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "### Create a GPU remote compute target\n", 231 | "\n", 232 | "We need to create a GPU [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) to perform the fine-tuning. In this example, we create an AmlCompute cluster as our training compute resource. 
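If you would like the cluster to scale back down to zero nodes while it is idle, so that you are not billed between experiments, the provisioning call can also pass `min_nodes=0`. A minimal sketch, assuming the same NC-series VM size used later in this notebook (the size and node limits are illustrative):

```Python
# Hedged sketch: autoscaling AmlCompute provisioning; VM size and limits are illustrative
compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC24',
                                                       min_nodes=0,   # scale to zero when idle
                                                       max_nodes=4)
```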
Please find the information of Azure VM size in below table.\n", 233 | "\n", 234 | "\n", 235 | "| VM Size \t| CPU \t| GPU \t| Storage (SSD) \t| GPU memory \t| InfiniBand \t|\n", 236 | "|:-------------:\t|:---:\t|:-------:\t|:-------------:\t|:----------:\t|:----------:\t|\n", 237 | "| Standard_NC6 \t| 6 \t| 1 x K80 \t| 340 GiB \t| 8 GiB \t| No \t|\n", 238 | "| Standard_NC12 \t| 12 \t| 2 x K80 \t| 680 GiB \t| 16 GiB \t| No \t|\n", 239 | "| Standard_NC24 \t| 24 \t| 4 x K80 \t| 1440 GiB \t| 32 GiB \t| No \t|\n", 240 | "| Standard_NC24r \t| 24 \t| 4 x K80 \t| 1440 GiB \t| 32 GiB \t| Yes \t|\n", 241 | "| Standard_NC6s_v3 \t| 6 \t| 1 x V100 \t| 736 GiB \t| 16 GiB \t| No \t|\n", 242 | "| Standard_NC12s_v3 | 12 \t| 2 x V100 \t| 1474 GiB \t| 32 GiB \t| No \t|\n", 243 | "| Standard_NC24s_v3 | 24 \t| 4 x V100 \t| 2948 GiB \t| 64 GiB \t| No \t|\n", 244 | "| Standard_NC24rs_v3| 24 \t| 4 x V100 \t| 2948 GiB \t| 64 GiB \t| Yes \t|\n", 245 | "\n", 246 | "\n", 247 | "***Note that*** you need to request NCv3-serie quota if you would like to use NVIDIA Tesla V100 \n", 248 | "\n", 249 | "This code creates a cluster for you if it does not already exist in your workspace." 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "from azureml.core.compute import ComputeTarget, AmlCompute\n", 259 | "from azureml.core.compute_target import ComputeTargetException\n", 260 | "\n", 261 | "# choose a name for your cluster\n", 262 | "gpu_cluster_name = \"nc24Cluster\"\n", 263 | "\n", 264 | "try:\n", 265 | " gpu_compute_target = ComputeTarget(workspace=ws, name=gpu_cluster_name)\n", 266 | " print('Found existing compute target.')\n", 267 | "except ComputeTargetException:\n", 268 | " print('Creating a new compute target...')\n", 269 | " compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC24', max_nodes=4)\n", 270 | "\n", 271 | " # create the cluster\n", 272 | " gpu_compute_target = ComputeTarget.create(ws, gpu_cluster_name, compute_config)\n", 273 | " gpu_compute_target.wait_for_completion(show_output=True)\n", 274 | "\n", 275 | "# Use the 'status' property to get a detailed status for the current cluster. \n", 276 | "print(gpu_compute_target.status.serialize())" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "### Create a PyTorch estimator for fine-tuning\n", 284 | "Let us create a new PyTorch estimator to run the fine-tuning script `run_classifier.py`, that is already provided at [the original repository](https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_classifier.py). Please refer [here](https://github.com/huggingface/pytorch-pretrained-BERT#fine-tuning-with-bert-running-the-examples) for more detail about the script. \n", 285 | "\n", 286 | "The original `run_classifier.py` script uses PyTorch distributed launch untility to launch multiple processes across nodes and GPUs. We prepared a modified version [run_classifier_azureml.py](./run_classifier_azureml.py) so that we can launch it based on AzureML build-in MPI backend.\n", 287 | "\n", 288 | "To use AML's tracking and metrics capabilities, we need to add a small amount of AzureML code inside the training script.\n", 289 | "\n", 290 | "In `run_classifier_azureml.py`, we will log some metrics to our AML run. 
To do so, we will access the AML run object within the script:\n", 291 | "```Python\n", 292 | "from azureml.core.run import Run\n", 293 | "run = Run.get_context()\n", 294 | "```\n", 295 | "Further within `run_classifier_azureml.py`, we log learning rate, training loss and evaluation accuracy the model achieves as:\n", 296 | "```Python\n", 297 | "run.log('lr', np.float(args.learning_rate))\n", 298 | "...\n", 299 | "\n", 300 | "for step, batch in enumerate(tqdm(train_dataloader, desc=\"Iteration\")): \n", 301 | " ...\n", 302 | " run.log('train_loss', np.float(loss))\n", 303 | "\n", 304 | "...\n", 305 | "\n", 306 | "result = {'eval_loss': eval_loss,\n", 307 | " 'eval_accuracy': eval_accuracy}\n", 308 | "for key in sorted(result.keys()):\n", 309 | " run.log(key, str(result[key]))\n", 310 | "```\n", 311 | "These run metrics will become particularly important when we begin hyperparameter tuning our model in the \"Tune model hyperparameters\" section.\n", 312 | "\n", 313 | "Let's first copy the training script `run_classifier_azureml.py` into our project directory." 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "import shutil\n", 323 | "shutil.copy('run_classifier_azureml.py', project_folder)\n", 324 | "shutil.copy('azureml_bert_util.py', project_folder)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "Then, AzureML PyTorch estimator can be defined as below. We use `azuremlsamples/bert:torch-1.0.0-apex-cuda9` as the base docker image with [dockerfile](./dockerfile)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "from azureml.train.dnn import PyTorch\n", 341 | "\n", 342 | "estimator = PyTorch(source_directory=project_folder,\n", 343 | " compute_target=gpu_compute_target,\n", 344 | " script_params = {\n", 345 | " '--bert_model':'bert-base-cased',\n", 346 | " '--task_name': 'MRPC',\n", 347 | " '--data_dir': ds.path('glue/MRPC/').as_mount(),\n", 348 | " '--do_train' : '',\n", 349 | " '--do_eval': '',\n", 350 | " '--max_seq_length': 128,\n", 351 | " '--train_batch_size': 32,\n", 352 | " '--learning_rate': 2e-5,\n", 353 | " '--num_train_epochs': 3.0,\n", 354 | " '--output_dir': './outputs',\n", 355 | " '--seed':16\n", 356 | " },\n", 357 | " custom_docker_base_image='azuremlsamples/bert:torch-1.0.0-apex-cuda9',\n", 358 | " entry_script='run_classifier_azureml.py',\n", 359 | " node_count=1,\n", 360 | " process_count_per_node=4,\n", 361 | " distributed_backend='mpi',\n", 362 | " use_gpu=True)\n", 363 | "\n", 364 | "estimator._estimator_config.environment.python.user_managed_dependencies=True" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "### Submit and Monitor your run" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "run = experiment.submit(estimator)\n", 381 | "from azureml.widgets import RunDetails\n", 382 | "RunDetails(run).show()" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "To achieve an average of **85% evaluation accuracy** with `MRPC dataset`, it requires **3** epochs when fine-tune with `BERT base` model. 
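Once the run has finished, you can pull the logged metrics back into the notebook to confirm this. A minimal sketch using the `run` handle from the submission cell above; it assumes the `eval_accuracy` metric logged by `run_classifier_azureml.py` as shown earlier:

```Python
# Hedged sketch: read back metrics logged by run_classifier_azureml.py
run.wait_for_completion(show_output=False)
metrics = run.get_metrics()
print('Final evaluation accuracy:', metrics['eval_accuracy'])
```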
Below please find the elapsed time per epoch using deferent Azure GPU VMs with above hyperparameters\n", 390 | "\n", 391 | "| GPU counts \t| 1 GPU \t| 2 GPU \t| 4 GPU \t|\n", 392 | "|------------:\t|:-----------:\t|--------------:\t|------------\t|\n", 393 | "| NC-series \t| 191 s/epoch \t| 105 s/epoch \t| 60 s/epoch \t|\n", 394 | "| NCv3-series \t| 36 s/epoch \t| 22 s/epoch \t| 13 s/epoch \t|\n", 395 | "| NCv3 with fp16| 32 s/epoch \t| 18 s/epoch \t| 12 s/epoch \t|" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "## Fine-Tuning BERT with Hyperparameter Tuning\n", 403 | "\n", 404 | "We would also like to optimize our hyperparameter, `learning rate`, using Azure Machine Learning's hyperparameter tuning capabilities." 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "### Start a hyperparameter sweep\n", 412 | "First, we will define the hyperparameter space to sweep over. In this example we will use random sampling to try different configuration sets of hyperparameter to minimize our primary metric, the evaluation accuracy (`eval_accuracy`)." 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "from azureml.train.hyperdrive import *\n", 422 | "import math\n", 423 | "\n", 424 | "param_sampling = RandomParameterSampling( {\n", 425 | " 'learning_rate': loguniform(math.log(1e-4), math.log(1e-6)),\n", 426 | " }\n", 427 | ")\n", 428 | "\n", 429 | "hyperdrive_run_config = HyperDriveRunConfig(estimator=estimator,\n", 430 | " hyperparameter_sampling=param_sampling, \n", 431 | " primary_metric_name='eval_accuracy',\n", 432 | " primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,\n", 433 | " max_total_runs=16,\n", 434 | " max_concurrent_runs=4)" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "Finally, lauch the hyperparameter tuning job." 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "hyperdrive_run = experiment.submit(hyperdrive_run_config)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "metadata": {}, 456 | "source": [ 457 | "### Monitor HyperDrive runs\n", 458 | "We can monitor the progress of the runs with the following Jupyter widget. " 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": { 465 | "scrolled": false 466 | }, 467 | "outputs": [], 468 | "source": [ 469 | "from azureml.widgets import RunDetails\n", 470 | "\n", 471 | "RunDetails(hyperdrive_run).show()" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "### Find and register the best model\n", 479 | "Once all the runs complete, we can find the run that produced the model with the highest evaluation accuracy." 
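The next cell retrieves that best run. If you also want to keep its trained model in the workspace model registry, so it can be versioned and deployed later, a minimal sketch might look like the following; it assumes the training script wrote its files under `./outputs` (the `--output_dir` used above), and the model name is illustrative:

```Python
# Hedged sketch: register the model produced by the best HyperDrive child run
best_run = hyperdrive_run.get_best_run_by_primary_metric()
model = best_run.register_model(model_name='bert-base-mrpc',
                                model_path='outputs')
print(model.name, model.version)
```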
480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "best_run = hyperdrive_run.get_best_run_by_primary_metric()\n", 489 | "best_run_metrics = best_run.get_metrics()\n", 490 | "print(best_run)\n", 491 | "print('Best Run is:\\n accuracy: {0:.5f} \\n Learning rate: {1:.8f}'.format(\n", 492 | " best_run_metrics['eval_accuracy'][-1],\n", 493 | " best_run_metrics['lr']\n", 494 | " ))" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": {}, 500 | "source": [ 501 | "You can compare the resulting optimal `learning_rate` with the value suggested by the [original implementation hyper-parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks): 2e-5" 502 | ] 503 | } 504 | ], 505 | "metadata": { 506 | "authors": [ 507 | { 508 | "name": "minxia" 509 | } 510 | ], 511 | "kernelspec": { 512 | "display_name": "Python [default]", 513 | "language": "python", 514 | "name": "python3" 515 | }, 516 | "language_info": { 517 | "codemirror_mode": { 518 | "name": "ipython", 519 | "version": 3 520 | }, 521 | "file_extension": ".py", 522 | "mimetype": "text/x-python", 523 | "name": "python", 524 | "nbconvert_exporter": "python", 525 | "pygments_lexer": "ipython3", 526 | "version": "3.6.6" 527 | }, 528 | "msauthor": "minxia" 529 | }, 530 | "nbformat": 4, 531 | "nbformat_minor": 2 532 | } 533 | -------------------------------------------------------------------------------- /finetune/README.md: -------------------------------------------------------------------------------- 1 | # Finetune natural language processing models using Azure Machine Learning service 2 | 3 | This part of the repo contains a walkthrough of using [Azure Machine Learning Service](https://docs.microsoft.com/en-us/azure/machine-learning/service/) to finetune [BERT model](https://github.com/google-research/bert). See more details in this blogpost: https://azure.microsoft.com/en-us/blog/fine-tune-natural-language-processing-models-using-azure-machine-learning-service/ 4 | 5 | We provide two set of notebooks: one for PyTorch, and another one for TensorFlow. Please follow the notebooks below for more information: 6 | - [GLUE eval using BERT](PyTorch/notebooks/BERT_Eval_GLUE.ipynb) 7 | - [Tensorflow-BERT-AzureML](TensorFlow/notebooks/Tensorflow-BERT-AzureML.ipynb) 8 | - [Named Entity Recognition using BERT](PyTorch/notebooks/Pretrained-BERT-NER.ipynb) (Updated on 6/17/2019) 9 | 10 | 11 | ## **Using the Azure Machine Learning Service** 12 | 13 | We are going to demonstrate different experiments on different datasets. In addition to tuning different hyperparameters for various use cases, Azure Machine Learning service can be used to manage the entire lifecycle of the experiments. Azure Machine Learning service provides an end-to-end cloud-based machine learning environment, so customers can develop, train, test, deploy, manage, and track machine learning models, as shown below. It also has full support for open-source technologies, such as PyTorch and TensorFlow which we will be using later. 14 | 15 | ![Azure Machine Learning Service Overview](https://azurecomcdn.azureedge.net/mediahandler/acomblog/media/Default/blog/07ebbbb6-0fd4-40a6-b4e6-c9d0b11cf159.png) 16 | _Figure 3. 
Azure Machine Learning Service Overview_ 17 | 18 | ## **What is in the notebook** 19 | 20 | ### **Defining the right model for specific task** 21 | 22 | To fine-tune the BERT model, the first step is to define the right input and output layer. In the GLUE example, it is defined as a classification task, and the code snippet shows how to create a language classification model using BERT pre-trained models: 23 | ``` 24 | model = modeling.BertModel( 25 | config=bert_config, 26 | is_training=is_training, 27 | input_ids=input_ids, 28 | input_mask=input_mask, 29 | token_type_ids=segment_ids, 30 | use_one_hot_embeddings=use_one_hot_embeddings) 31 | 32 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 33 | logits = tf.nn.bias_add(logits, output_bias) 34 | probabilities = tf.nn.softmax(logits, axis=-1) 35 | log_probs = tf.nn.log_softmax(logits, axis=-1) 36 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 37 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 38 | loss = tf.reduce_mean(per_example_loss) 39 | 40 | ``` 41 | 42 | ### **Set up training environment using Azure Machine Learning service** 43 | 44 | Depending on the size of the dataset, training the model on the actual dataset might be time-consuming. Azure Machine Learning Compute provides access to GPUs either for a single node or multiple nodes to accelerate the training process. Creating a cluster with one or multiple nodes on Azure Machine Learning Compute is very intuitive, as below: 45 | 46 | ``` 47 | compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC24s_v3', 48 | min_nodes=0, 49 | max_nodes=8) 50 | # create the cluster 51 | gpu_compute_target = ComputeTarget.create(ws, gpu_cluster_name, compute_config) 52 | gpu_compute_target.wait_for_completion(show_output=True) 53 | estimator = PyTorch(source_directory=project_folder, 54 | compute_target=gpu_compute_target, 55 | script_params = {...}, 56 | entry_script='run_squad.azureml.py', 57 | conda_packages=['tensorflow', 'boto3', 'tqdm'], 58 | node_count=node_count, 59 | process_count_per_node=process_count_per_node, 60 | distributed_backend='mpi', 61 | use_gpu=True) 62 | ``` 63 | Azure Machine Learning is greatly simplifying the work involved in setting up and running a distributed training job. As you can see, scaling the job to multiple workers is done by just changing the number of nodes in the configuration and providing a distributed backend. For distributed backends, Azure Machine Learning supports popular frameworks such as TensorFlow Parameter server as well as MPI with Horovod, and it ties in with the Azure hardware such as InfiniBand to connect the different worker nodes to achieve optimal performance. We will have a follow up blogpost on how to use the distributed training capability on Azure Machine Learning service to fine-tune NLP models. 64 | 65 | For more information on how to create and set up compute targets for model training, please visit our [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets). 66 | 67 | ### **Hyper Parameter Tuning** 68 | 69 | For a given customer's specific use case, model performance depends heavily on the hyperparameter values selected. Hyperparameters can have a big search space, and exploring each option can be very expensive. 
Azure Machine Learning Services provide an automated machine learning service, which provides hyperparameter tuning capabilities and can search across various hyperparameter configurations to find a configuration that results in the best performance. 70 | 71 | In the provided example, random sampling is used, in which case hyperparameter values are randomly selected from the defined search space. In the example below, we explored the learning rate space from 1e-4 to 1e-6 in log uniform manner, so the learning rate might be 2 values around 1e-4, 2 values around 1e-5, and 2 values around 1e-6. 72 | 73 | Customers can also select which metric to optimize. Validation loss, accuracy score, and F1 score are some popular metrics that could be selected for optimization. 74 | 75 | ``` 76 | from azureml.train.hyperdrive import * 77 | import math 78 | 79 | param_sampling = RandomParameterSampling( { 80 | 'learning_rate': loguniform(math.log(1e-4), math.log(1e-6)), 81 | }) 82 | 83 | hyperdrive_run_config = HyperDriveRunConfig( 84 | estimator=estimator, 85 | hyperparameter_sampling=param_sampling, 86 | primary_metric_name='f1', 87 | primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, 88 | max_total_runs=16, 89 | max_concurrent_runs=4) 90 | ``` 91 | 92 | For each experiment, customers can watch the progress for different hyperparameter combinations. For example, the picture below shows the mean loss over time using different hyperparameter combinations. Some of the experiments can be terminated early if the training loss doesn't meet expectations (like the top red curve). 93 | 94 | ![Mean loss for training data for different runs, as well as early termination](https://azurecomcdn.azureedge.net/mediahandler/acomblog/media/Default/blog/bdbe13c8-0011-49de-a019-4731cd3951cb.png) 95 | _Figure 4. Mean loss for training data for different runs, as well as early termination_ 96 | 97 | For more information on how to use the Azure ML's automated hyperparameter tuning feature, please visit our documentation on [tuning hyperparameters](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters). And for how to track all the experiments, please visit the documentation on [how to track experiments and metrics](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-track-experiments). 98 | 99 | ## **Visualizing the result** 100 | 101 | Using the Azure Machine Learning service, customers can achieve 85 percent evaluation accuracy when fine-tuning MRPC in GLUE dataset (it requires 3 epochs for BERT base model), which is close to the state-of-the-art result. Using multiple GPUs can shorten the training time and using more powerful GPUs (say V100) can also improve the training time. For one of the specific experiments, the details are as below: 102 | 103 | | **GPU#** | **1** | **2** | **4** | 104 | | --- | --- | --- | --- | 105 | | **K80 (NC Family)** | 191 s/epoch | 105 s/epoch | 60 s/epoch | 106 | | **V100 (NCv3 Family)** | 36 s/epoch | 22 s/epoch | 13 s/epoch | 107 | 108 | _Table 1. Training time per epoch for MRPC in GLUE dataset_ 109 | 110 | After all the experiments are done, the Azure Machine Learning service SDK also provides a summary visualization on the selected metrics and the corresponding hyperparameter(s). Below is an example on how learning rate affects validation loss. 
Throughout the experiments, the learning rate has been changed from around 7e-6 (the far left) to around 1e-3 (the far right), and the best learning rate with lowest validation loss is around 3.1e-4. This chart can also be leveraged to evaluate other metrics that customers want to optimize. 111 | 112 | ![Learning rate versus validation loss](https://azurecomcdn.azureedge.net/mediahandler/acomblog/media/Default/blog/189651c7-05e1-4381-81b7-32d871b360b7.png) 113 | _Figure 5. Learning rate versus validation loss_ 114 | 115 | ## **Summary** 116 | 117 | In this repo, we showed how customers can fine-tune BERT easily using the Azure Machine Learning service, as well as topics such as using distributed settings and tuning hyperparameters for the corresponding dataset. We also showed some preliminary results to demonstrate how to use Azure Machine Learning service to fine tune the NLP models. All the code is [available on the GitHub repository](https://github.com/Microsoft/AzureML-BERT). Please let us know if there are any questions or comments by raising an issue in the GitHub repo. 118 | 119 | ### **References** 120 | 121 | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/pdf/1810.04805.pdf) and its [GitHub site](https://github.com/google-research/bert). 122 | 123 | - Visit the [Azure Machine Learning service](https://azure.microsoft.com/en-us/free/services/machine-learning/) homepage today to get started with your free-trial. 124 | - Learn more about [Azure Machine Learning service](https://azure.microsoft.com/en-us/services/machine-learning-service/). 125 | -------------------------------------------------------------------------------- /finetune/TensorFlow/download_model_and_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import sys 4 | import os 5 | import shutil 6 | import zipfile 7 | import urllib 8 | 9 | parser = argparse.ArgumentParser() 10 | 11 | ## Required parameters 12 | parser.add_argument("--bert_model_name", 13 | default = None, 14 | type = str, 15 | required = True, 16 | help = "Name of pretrained BERT model. 
Possible values: " 17 | "uncased_L-12_H-768_A-12,uncased_L-24_H-1024_A-16,cased_L-12_H-768_A-12," 18 | "multilingual_L-12_H-768_A-12,chinese_L-12_H-768_A-12") 19 | 20 | parser.add_argument("--model_dump_path", 21 | default = None, 22 | type = str, 23 | required = True, 24 | help = "Path to the output model.") 25 | 26 | parser.add_argument("--glue_data_path", 27 | default = None, 28 | type = str, 29 | required = True, 30 | help = "Path to store downloaded GLUE dataset") 31 | 32 | args = parser.parse_args() 33 | 34 | bert_model_url_map = { 35 | 'uncased_L-12_H-768_A-12': 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip', 36 | 'uncased_L-24_H-1024_A-16': 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip', 37 | 'cased_L-12_H-768_A-12': 'https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip', 38 | 'multilingual_L-12_H-768_A-12': 'https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 39 | 'chinese_L-12_H-768_A-12': 'https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip' 40 | } 41 | 42 | if args.bert_model_name not in bert_model_url_map: 43 | sys.stderr.write('Unknown BERT model name ' + args.bert_model_name) 44 | sys.exit(1) 45 | 46 | pretrained_model_url = bert_model_url_map.get(args.bert_model_name) 47 | 48 | # make local directory for pretrained tensorflow BERT model 49 | tensorflow_model_dir = './tensorflow_model' 50 | if not os.path.exists(tensorflow_model_dir): 51 | os.makedirs(tensorflow_model_dir) 52 | 53 | # download and extract pretrained tensorflow BERT model 54 | download_file_name = 'tensorflow_model.zip' 55 | urllib.request.urlretrieve(pretrained_model_url, filename=download_file_name) 56 | print('Extracting pretrained model...') 57 | with zipfile.ZipFile(download_file_name, 'r') as z: 58 | z.extractall(tensorflow_model_dir) 59 | 60 | # make destination path 61 | if not os.path.exists(args.model_dump_path): 62 | os.makedirs(args.model_dump_path) 63 | 64 | files = ['bert_model.ckpt.meta', 'bert_model.ckpt.index', 'bert_model.ckpt.data-00000-of-00001', 'bert_config.json', 'vocab.txt'] 65 | for file in files: 66 | shutil.copy(os.path.join(tensorflow_model_dir, args.bert_model_name, file), os.path.join(args.model_dump_path, file)) 67 | 68 | print('Start to download GLUE dataset...\n') 69 | urllib.request.urlretrieve( 70 | 'https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py', 71 | filename='download_glue_data.py') 72 | if os.system('python download_glue_data.py --data_dir {0} --tasks all'.format(args.glue_data_path)) != 0: sys.exit(1) -------------------------------------------------------------------------------- /finetune/evaluate_squad.py: -------------------------------------------------------------------------------- 1 | """ Official evaluation script for v1.1 of the SQuAD dataset. 
""" 2 | from __future__ import print_function 3 | from collections import Counter 4 | import string 5 | import re 6 | import argparse 7 | import json 8 | import sys 9 | 10 | 11 | def normalize_answer(s): 12 | """Lower text and remove punctuation, articles and extra whitespace.""" 13 | def remove_articles(text): 14 | return re.sub(r'\b(a|an|the)\b', ' ', text) 15 | 16 | def white_space_fix(text): 17 | return ' '.join(text.split()) 18 | 19 | def remove_punc(text): 20 | exclude = set(string.punctuation) 21 | return ''.join(ch for ch in text if ch not in exclude) 22 | 23 | def lower(text): 24 | return text.lower() 25 | 26 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 27 | 28 | 29 | def f1_score(prediction, ground_truth): 30 | prediction_tokens = normalize_answer(prediction).split() 31 | ground_truth_tokens = normalize_answer(ground_truth).split() 32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 33 | num_same = sum(common.values()) 34 | if num_same == 0: 35 | return 0 36 | precision = 1.0 * num_same / len(prediction_tokens) 37 | recall = 1.0 * num_same / len(ground_truth_tokens) 38 | f1 = (2 * precision * recall) / (precision + recall) 39 | return f1 40 | 41 | 42 | def exact_match_score(prediction, ground_truth): 43 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 44 | 45 | 46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 47 | scores_for_ground_truths = [] 48 | for ground_truth in ground_truths: 49 | score = metric_fn(prediction, ground_truth) 50 | scores_for_ground_truths.append(score) 51 | return max(scores_for_ground_truths) 52 | 53 | 54 | def evaluate(dataset, predictions): 55 | f1 = exact_match = total = 0 56 | for article in dataset: 57 | for paragraph in article['paragraphs']: 58 | for qa in paragraph['qas']: 59 | total += 1 60 | if qa['id'] not in predictions: 61 | message = 'Unanswered question ' + qa['id'] + \ 62 | ' will receive score 0.' 
63 | print(message, file=sys.stderr) 64 | continue 65 | ground_truths = list(map(lambda x: x['text'], qa['answers'])) 66 | prediction = predictions[qa['id']] 67 | exact_match += metric_max_over_ground_truths( 68 | exact_match_score, prediction, ground_truths) 69 | f1 += metric_max_over_ground_truths( 70 | f1_score, prediction, ground_truths) 71 | 72 | exact_match = 100.0 * exact_match / total 73 | f1 = 100.0 * f1 / total 74 | 75 | return {'exact_match': exact_match, 'f1': f1} 76 | 77 | 78 | if __name__ == '__main__': 79 | expected_version = '1.1' 80 | parser = argparse.ArgumentParser( 81 | description='Evaluation for SQuAD ' + expected_version) 82 | parser.add_argument('dataset_file', help='Dataset file') 83 | parser.add_argument('prediction_file', help='Prediction File') 84 | args = parser.parse_args() 85 | with open(args.dataset_file) as dataset_file: 86 | dataset_json = json.load(dataset_file) 87 | if (dataset_json['version'] != expected_version): 88 | print('Evaluation expects v-' + expected_version + 89 | ', but got dataset with v-' + dataset_json['version'], 90 | file=sys.stderr) 91 | dataset = dataset_json['data'] 92 | with open(args.prediction_file) as prediction_file: 93 | predictions = json.load(prediction_file) 94 | print(json.dumps(evaluate(dataset, predictions))) -------------------------------------------------------------------------------- /pretrain/PyTorch/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Pretrain BERT Source Code 2 | This folder contains source code and instructions on pretraining BERT model on Azure Machine Learning. 3 | 4 | The sub folder structure is as follows: 5 | - [docker](./docker/) folder for docker file and instruction to use Azure Container Registry 6 | - [dataprep](./dataprep/) folder for data preparation instructions 7 | -------------------------------------------------------------------------------- /pretrain/PyTorch/azureml_adapter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def set_environment_variables_for_nccl_backend(single_node=False, master_port=6105): 5 | os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK'] 6 | os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE'] 7 | 8 | if not single_node: 9 | master_node_params = os.environ['AZ_BATCH_MASTER_NODE'].split(':') 10 | os.environ['MASTER_ADDR'] = master_node_params[0] 11 | 12 | # Do not overwrite master port with that defined in AZ_BATCH_MASTER_NODE 13 | if 'MASTER_PORT' not in os.environ: 14 | os.environ['MASTER_PORT'] = str(master_port) 15 | else: 16 | os.environ['MASTER_ADDR'] = os.environ['AZ_BATCHAI_MPI_MASTER_NODE'] 17 | os.environ['MASTER_PORT'] = '54965' 18 | print('NCCL_SOCKET_IFNAME original value = {}'.format(os.environ['NCCL_SOCKET_IFNAME'])) 19 | # TODO make this parameterizable 20 | os.environ['NCCL_SOCKET_IFNAME'] = '^docker0,lo' 21 | 22 | print('RANK = {}'.format(os.environ['RANK'])) 23 | print('WORLD_SIZE = {}'.format(os.environ['WORLD_SIZE'])) 24 | print('MASTER_ADDR = {}'.format(os.environ['MASTER_ADDR'])) 25 | print('MASTER_PORT = {}'.format(os.environ['MASTER_PORT'])) 26 | # print('MASTER_NODE = {}'.format(os.environ['MASTER_NODE'])) 27 | print('NCCL_SOCKET_IFNAME new value = {}'.format(os.environ['NCCL_SOCKET_IFNAME'])) 28 | 29 | def get_local_rank(): 30 | return int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) 31 | 32 | def get_global_size(): 33 | return int(os.environ['OMPI_COMM_WORLD_SIZE']) 34 | 35 | def get_local_size(): 36 | 
return int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE']) 37 | 38 | def get_world_size(): 39 | return int(os.environ['WORLD_SIZE']) 40 | 41 | -------------------------------------------------------------------------------- /pretrain/PyTorch/benchmark.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datetime import datetime 3 | 4 | 5 | def get_timestamp(text): 6 | datepattern = re.compile("\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}") 7 | matcher = datepattern.search(text) 8 | return datetime.strptime(matcher.group(0), '%m/%d/%Y %H:%M:%S') 9 | 10 | def get_perf_metrics(filename): 11 | with open(filename) as f: 12 | datafile = f.readlines() 13 | throughput = 0 14 | epoch = 1 15 | time_diff=0 16 | num_seq=0 17 | for line in datafile: 18 | if 'Training epoch:' in line: 19 | start_time = get_timestamp(line) 20 | 21 | if epoch == 1: 22 | training_start_time = start_time 23 | epoch += 1 24 | if 'Completed processing' in line: 25 | end_time = get_timestamp(line) 26 | time_diff += int((end_time-start_time).total_seconds()) 27 | num_seq += [int(s) for s in line[int(line.find('Completed processing')):].split() if s.isdigit()][0] 28 | throughput = num_seq/time_diff 29 | #print(throughput) 30 | if 'Validation Loss' in line: 31 | valid_loss = float(line[int(line.find('is:'))+3:]) 32 | avg_throughput = (num_seq/time_diff) 33 | total_training_time = end_time-training_start_time 34 | d = datetime(1,1,1) + total_training_time 35 | 36 | print('Num epochs:', epoch) 37 | print('Total time to train:', d.day-1,'days,', d.hour ,'hours') 38 | print('Average throughput:',avg_throughput, 'sequences/second') 39 | print('Final Validation Loss:', valid_loss) 40 | -------------------------------------------------------------------------------- /pretrain/PyTorch/checkpoint.py: -------------------------------------------------------------------------------- 1 | from logger import Logger 2 | import torch 3 | import os 4 | from operator import itemgetter 5 | 6 | from torch import __init__ 7 | 8 | def checkpoint_model(PATH, model, optimizer, epoch, last_global_step, **kwargs): 9 | """Utility function for checkpointing model + optimizer dictionaries 10 | The main purpose for this is to be able to resume training from that instant again 11 | """ 12 | checkpoint_state_dict = {'epoch': epoch, 13 | 'last_global_step': last_global_step, 14 | 'model_state_dict': model.network.module.state_dict(), 15 | 'optimizer_state_dict': optimizer.state_dict()} 16 | # Add extra kwargs too 17 | checkpoint_state_dict.update(kwargs) 18 | torch.save(checkpoint_state_dict, PATH) 19 | return 20 | 21 | 22 | def load_checkpoint(model, optimizer, PATH): 23 | """Utility function for checkpointing model + optimizer dictionaries 24 | The main purpose for this is to be able to resume training from that instant again 25 | """ 26 | checkpoint_state_dict = torch.load(PATH, map_location=torch.device("cpu")) 27 | #from train import model 28 | model.network.module.load_state_dict( 29 | checkpoint_state_dict['model_state_dict']) 30 | #from train import optimizer 31 | optimizer.load_state_dict(checkpoint_state_dict['optimizer_state_dict']) 32 | epoch = checkpoint_state_dict['epoch'] 33 | last_global_step = checkpoint_state_dict['last_global_step'] 34 | del checkpoint_state_dict 35 | return (epoch + 1, last_global_step) 36 | 37 | 38 | def latest_checkpoint_file(reference_folder: str, no_cuda) -> str: 39 | """Extracts the name of the last checkpoint file 40 | 41 | :param reference_folder: (str) Path to the parent_folder 
42 | :return: (str) Path to the most recent checkpoint tar file 43 | """ 44 | 45 | logger = Logger(cuda=torch.cuda.is_available() and not no_cuda) 46 | 47 | # For each folder inside the parent folder find all files 48 | # ending with .tar and extreact the last checkpoint. 49 | candidate_files = [] 50 | for dir_path, dir_names, filenames in os.walk(reference_folder): 51 | logger.info(f"Searching for checkpoint in {reference_folder}") 52 | relevant_files = [f for f in filenames if f.endswith('.tar')] 53 | if relevant_files: 54 | latest_file = max(relevant_files) # assumes that checkpoint number is of format 000x 55 | candidate_files.append((dir_path, latest_file)) 56 | 57 | checkpoint_file = max(candidate_files, key=itemgetter(1)) 58 | checkpoint_path = os.path.join(checkpoint_file[0], checkpoint_file[1]) 59 | 60 | return checkpoint_path 61 | -------------------------------------------------------------------------------- /pretrain/PyTorch/configuration.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | # TODO better json handling 5 | class BertJobConfiguration: 6 | def __init__(self, config_file_path): 7 | self.config = json.load(open(config_file_path, 'r', encoding='utf-8')) 8 | 9 | # TODO improve this implementation 10 | def replace_path_placeholders(self, files_location): 11 | self.config['data']['datasets'] = {key: value.replace('placeholder/', files_location) 12 | for (key, value) in self.config['data']['datasets'].items()} 13 | self.config['validation']['path'] = self.config['validation']['path'].replace('placeholder/', files_location) 14 | 15 | def get_name(self): 16 | return self.config['name'] 17 | 18 | def get_token_file_type(self): 19 | return self.config["bert_token_file"] 20 | 21 | def get_model_file_type(self): 22 | return self.config["bert_model_file"] 23 | 24 | def get_learning_rate(self): 25 | return self.config["training"]["learning_rate"] 26 | 27 | def get_warmup_proportion(self): 28 | return self.config["training"]["warmup_proportion"] 29 | 30 | def get_total_training_steps(self): 31 | return self.config["training"]["total_training_steps"] 32 | 33 | def get_total_epoch_count(self): 34 | return self.config["training"]["num_epochs"] 35 | 36 | def get_num_workers(self): 37 | return self.config['training']['num_workers'] 38 | 39 | def get_validation_folder_path(self): 40 | return self.config['validation']['path'] 41 | 42 | def get_wiki_pretrain_dataset_path(self): 43 | return self.config["data"]["datasets"]['wiki_pretrain_dataset'] 44 | 45 | def get_decay_rate(self): 46 | return self.config["training"]["decay_rate"] 47 | 48 | def get_decay_step(self): 49 | return self.config["training"]["decay_step"] 50 | 51 | def get_model_config(self): 52 | return self.config["bert_model_config"] 53 | -------------------------------------------------------------------------------- /pretrain/PyTorch/dataprep/create_pretraining.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from multiprocessing import Pool 6 | import multiprocessing 7 | import os 8 | import logging 9 | import argparse 10 | 11 | 12 | import sys 13 | sys.path.append("..") 14 | from pytorch_pretrained_bert.tokenization import BertTokenizer 15 | from dataset import TokenInstance, PretrainingDataCreator, GenericPretrainingDataCreator 16 | 17 | 18 | logging.basicConfig(format='%(asctime)s - %(levelname)s - 
%(name)s - %(message)s', 19 | datefmt='%m/%d/%Y %H:%M:%S', 20 | level=logging.INFO) 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | def parse_data(inp_file, out_file): 25 | if not os.path.exists(out_file): 26 | print(inp_file) 27 | dataset = GenericPretrainingDataCreator(inp_file, tokenizer, dupe_factor=9, max_seq_length=512) 28 | dataset.save(out_file) 29 | print(f"Completed Pickling: {out_file}") 30 | else: 31 | print(f'Already parsed: {out_file}') 32 | 33 | 34 | parser = argparse.ArgumentParser( 35 | description="Give initial arguments for parsing") 36 | 37 | parser.add_argument("--input_dir", type=str, 38 | help="This folder contains .txt files of Wikipedia Data." 39 | " Each .txt file contains the text from the documents." 40 | " It makes an assumption that each line in the file represents" 41 | " a single line in the document too. A blank line represents completion of a document.") 42 | parser.add_argument("--output_dir", type=str, help="Path to Output Directory.") 43 | parser.add_argument("--token_file", default="bert-large-uncased", type=str) 44 | parser.add_argument("--do_lower_case", default=False, action="store_true", 45 | help="This flag indicates the wheter the text should be lowercase or not") 46 | parser.add_argument("--processes", "-p", default=0, type=int, 47 | help="This is to do parallel processing of the txt files. It should be >=0. Default: 0 represents" 48 | " that it will use all the available cores in the CPU.") 49 | 50 | args = parser.parse_args() 51 | tokenizer = BertTokenizer.from_pretrained( 52 | args.token_file, do_lower_case=args.do_lower_case) 53 | 54 | input_files = [] 55 | output_files = [] 56 | num_processes = 1 57 | 58 | if args.processes < 0 or args.processes > multiprocessing.cpu_count(): 59 | raise ValueError( 60 | "The value of --processes should be >=0 and less than the max cores in the CPU.") 61 | elif args.processes == 0: # Use all cores 62 | num_processes = multiprocessing.cpu_count() 63 | else: 64 | num_processes = args.processes 65 | 66 | for filename in os.listdir(args.input_dir): 67 | input_file = os.path.join(args.input_dir, filename) 68 | outfilename = "_".join(filename.split('.')[:-1]) + ".bin" 69 | output_file = os.path.join(args.output_dir, outfilename) 70 | input_files.append(input_file) 71 | output_files.append(output_file) 72 | # parse_data(input_file, output_file) # this line is for single core processing 73 | 74 | # parse data using multiple cores 75 | with Pool(processes=num_processes) as pool: 76 | pool.starmap(parse_data, zip(input_files, output_files)) 77 | -------------------------------------------------------------------------------- /pretrain/PyTorch/dataprep/sentence_segmentation.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import os 3 | from tqdm import tqdm 4 | import sys 5 | 6 | nltk.download('punkt') 7 | 8 | input_file = sys.argv[1] 9 | output_file = sys.argv[2] 10 | 11 | doc_seperator = "\n" 12 | 13 | with open(input_file) as ifile: 14 | with open(output_file, "w") as ofile: 15 | for i, line in tqdm(enumerate(ifile)): 16 | if line != "\n": 17 | sent_list = nltk.tokenize.sent_tokenize(line) 18 | for sent in sent_list: 19 | ofile.write(sent + "\n") 20 | ofile.write(doc_seperator) 21 | -------------------------------------------------------------------------------- /pretrain/PyTorch/dataprep/single_line_doc_file_creation.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | from tqdm import 
tqdm 4 | 5 | output_file = 'wikipedia.txt' 6 | 7 | with open(output_file, "w") as ofile: 8 | for dirname in glob.glob('out2/*/', recursive=False): 9 | for filename in glob.glob(dirname + 'wiki_*', recursive=True): 10 | print(filename) 11 | article_lines = [] 12 | article_open = False 13 | 14 | with open(filename, "r") as file: 15 | for i, line in tqdm(enumerate(file)): 16 | if "" in line: 19 | article_open = False 20 | for oline in article_lines[1:]: 21 | if oline != "\n": 22 | ofile.write(oline.rstrip() + " ") 23 | ofile.write("\n\n") 24 | article_lines = [] 25 | else: 26 | if article_open: 27 | article_lines.append(line) 28 | -------------------------------------------------------------------------------- /pretrain/PyTorch/dataprep/split_data_into_files.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | input_file = "wikipedia.segmented.nltk.txt" 4 | output_file = "./data_shards/wikipedia.segmented.part." 5 | 6 | doc_seperator = "\n" 7 | total_partitions = 100 # Mostly will create 1 extra partition 8 | # shard_size = 396000 # Approximate, will split at next article break 9 | 10 | with open(input_file, encoding="UTF-8") as ifile: 11 | ifile_lines = sum(1 for _ in tqdm(ifile)) 12 | 13 | print("Input file contains", ifile_lines, "lines.") 14 | 15 | shard_size = ifile_lines // total_partitions 16 | 17 | with open(input_file, encoding="UTF-8") as ifile: 18 | shard_line_counter = 0 19 | shard_index = 0 20 | ofile = open(f"{output_file}{shard_index}.txt", "w", encoding="UTF-8") # Open the first file 21 | # Output files should not have doc_separator at the end of the file, but we accept input ending with doc_separator 22 | for iline_counter, line in tqdm(enumerate(ifile, start=1)): 23 | if line != doc_seperator or shard_line_counter < shard_size: 24 | shard_line_counter += 1 25 | ofile.write(line) 26 | # Prevent opening an empty output file or writing a doc_sep 27 | # when the iteration has reached the end of the input file (iline_counter == ifile_lines) 28 | elif iline_counter < ifile_lines: 29 | shard_line_counter = 0 30 | shard_index += 1 31 | ofile.close() 32 | ofile = open(f"{output_file}{shard_index}.txt", "w", encoding="UTF-8") 33 | ofile.close() # Close the lastfile 34 | -------------------------------------------------------------------------------- /pretrain/PyTorch/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from torch.utils.data import DataLoader, Dataset 4 | from enum import IntEnum 5 | from random import choice 6 | import random 7 | import collections 8 | 9 | from text import mask, torch_long, PAD 10 | from sources import PretrainingDataCreator, TokenInstance, GenericPretrainingDataCreator 11 | from sources import WikiPretrainingDataCreator 12 | from pytorch_pretrained_bert.tokenization import BertTokenizer 13 | 14 | 15 | class BatchType(IntEnum): 16 | PRETRAIN_BATCH = 0 17 | 18 | 19 | class PretrainDataType(IntEnum): 20 | WIKIPEDIA = 1 21 | VALIDATION = 2 22 | 23 | MaskedLMInstance = collections.namedtuple( 24 | "MaskedLMInstance", ["index", "label"]) 25 | 26 | PretrainBatch = collections.namedtuple( 27 | 'PreTrainBatch', ['input_ids', 'input_mask', 'sequence_ids', 28 | 'is_next_label', 'masked_lm_output'] 29 | ) 30 | 31 | def get_random_partition(data_directory, index): 32 | partitions = [os.path.join(data_directory, x) 33 | for x in os.listdir(data_directory)] 34 | partitions = sorted(partitions) 35 | i = index % len(partitions) 
36 | return partitions[i] 37 | 38 | 39 | def map_to_torch(encoding): 40 | encoding = torch_long(encoding) 41 | encoding.requires_grad_(False) 42 | return encoding 43 | 44 | 45 | def map_to_torch_float(encoding): 46 | encoding = torch.FloatTensor(encoding) 47 | encoding.requires_grad_(False) 48 | return encoding 49 | 50 | 51 | def map_to_torch_half(encoding): 52 | encoding = torch.HalfTensor(encoding) 53 | encoding.requires_grad_(False) 54 | return encoding 55 | 56 | 57 | def encode_sequence(seqA, seqB, max_seq_len, tokenizer): 58 | seqA = ["[CLS]"] + seqA + ["[SEP]"] 59 | seqB = seqB + ["[SEP]"] 60 | 61 | input_tokens = seqA + seqB 62 | input_ids = tokenizer.convert_tokens_to_ids(input_tokens) 63 | sequence_ids = [0]*len(seqA) + [1]*len(seqB) 64 | input_mask = [1]*len(input_ids) 65 | 66 | while len(input_ids) < max_seq_len: 67 | input_ids.append(PAD) 68 | sequence_ids.append(PAD) 69 | input_mask.append(PAD) 70 | 71 | return (map_to_torch(input_ids), map_to_torch(input_mask), map_to_torch(sequence_ids)) 72 | 73 | 74 | def truncate_input_sequence(tokens_a, tokens_b, max_num_tokens): 75 | while True: 76 | total_length = len(tokens_a) + len(tokens_b) 77 | if total_length <= max_num_tokens: 78 | break 79 | 80 | trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b 81 | assert len(trunc_tokens) >= 1 82 | 83 | # We want to sometimes truncate from the front and sometimes from the 84 | # back to add more randomness and avoid biases. 85 | if random.random() < 0.5: 86 | del trunc_tokens[0] 87 | else: 88 | trunc_tokens.pop() 89 | 90 | class PreTrainingDataset(Dataset): 91 | def __init__(self, tokenizer: BertTokenizer, folder: str, logger, max_seq_length, index, data_type: PretrainDataType = PretrainDataType.WIKIPEDIA, max_predictions_per_seq=20, masked_lm_prob=0.15): 92 | self.tokenizer = tokenizer 93 | self.dir_path = folder 94 | self.max_seq_length = max_seq_length 95 | self.len = 0 96 | self.masked_lm_prob = masked_lm_prob 97 | self.max_predictions_per_seq = max_predictions_per_seq 98 | self.vocab_words = list(tokenizer.vocab.keys()) 99 | 100 | path = get_random_partition(self.dir_path, index) 101 | 102 | logger.info(f"Loading Pretraining Data from {path}") 103 | if data_type == PretrainDataType.WIKIPEDIA: 104 | self.data = GenericPretrainingDataCreator.load(path) 105 | elif data_type == PretrainDataType.VALIDATION: 106 | self.data = WikiPretrainingDataCreator.load(path) 107 | self.len = len(self.data) 108 | logger.info( 109 | f"Data Loading Completed for Pretraining Data from {path} with {self.len} samples.") 110 | 111 | def __len__(self): 112 | return self.len 113 | 114 | def __getitem__(self, index): 115 | i = index % self.len 116 | 117 | instance: TokenInstance = self.data.instances[i] 118 | return self.create_training_instance(instance) 119 | 120 | def create_training_instance(self, instance: TokenInstance): 121 | tokens_a, tokens_b, is_next = instance.get_values() 122 | # print(f'is_next label:{is_next}') 123 | # Create mapper 124 | tokens = [] 125 | segment_ids = [] 126 | tokens.append("[CLS]") 127 | segment_ids.append(0) 128 | for token in tokens_a: 129 | tokens.append(token) 130 | segment_ids.append(0) 131 | 132 | tokens.append("[SEP]") 133 | segment_ids.append(0) 134 | 135 | for token in tokens_b: 136 | tokens.append(token) 137 | segment_ids.append(1) 138 | 139 | tokens.append("[SEP]") 140 | segment_ids.append(1) 141 | 142 | # Get Masked LM predictions 143 | tokens, masked_lm_output = self.create_masked_lm_predictions(tokens) 144 | 145 | # Convert to Ids 146 | input_ids = 
self.tokenizer.convert_tokens_to_ids(tokens) 147 | input_mask = [1] * len(input_ids) 148 | 149 | while len(input_ids) < self.max_seq_length: 150 | input_ids.append(PAD) 151 | segment_ids.append(PAD) 152 | input_mask.append(PAD) 153 | masked_lm_output.append(-1) 154 | return([map_to_torch([BatchType.PRETRAIN_BATCH]), map_to_torch(input_ids), map_to_torch(input_mask), map_to_torch(segment_ids), map_to_torch([is_next]), map_to_torch(masked_lm_output)]) 155 | 156 | def create_masked_lm_predictions(self, tokens): 157 | cand_indexes = [] 158 | for i, token in enumerate(tokens): 159 | if token == "[CLS]" or token == "[SEP]": 160 | continue 161 | cand_indexes.append(i) 162 | 163 | random.shuffle(cand_indexes) 164 | output_tokens = list(tokens) 165 | 166 | num_to_predict = min(self.max_predictions_per_seq, max( 167 | 1, int(round(len(tokens) * self.masked_lm_prob)))) 168 | 169 | masked_lms = [] 170 | covered_indexes = set() 171 | for index in cand_indexes: 172 | if len(masked_lms) >= num_to_predict: 173 | break 174 | if index in covered_indexes: 175 | continue 176 | covered_indexes.add(index) 177 | 178 | masked_token = None 179 | # 80% mask 180 | if random.random() < 0.8: 181 | masked_token = "[MASK]" 182 | else: 183 | # 10% Keep Original 184 | if random.random() < 0.5: 185 | masked_token = tokens[index] 186 | # 10% replace w/ random word 187 | else: 188 | masked_token = self.vocab_words[random.randint( 189 | 0, len(self.vocab_words) - 1)] 190 | 191 | output_tokens[index] = masked_token 192 | masked_lms.append(MaskedLMInstance( 193 | index=index, label=tokens[index])) 194 | 195 | masked_lms = sorted(masked_lms, key=lambda x: x.index) 196 | masked_lm_output = [-1] * len(output_tokens) 197 | for p in masked_lms: 198 | masked_lm_output[p.index] = self.tokenizer.vocab[p.label] 199 | 200 | return (output_tokens, masked_lm_output) 201 | -------------------------------------------------------------------------------- /pretrain/PyTorch/distributed_apex.py: -------------------------------------------------------------------------------- 1 | # TODO: This is a copy of apex code from NVIDIA/APEX. Details to be added on the updates made here. 2 | 3 | import torch 4 | 5 | try: 6 | from apex_C import flatten 7 | from apex_C import unflatten 8 | except ImportError: 9 | try: 10 | _ = warned_flatten 11 | except NameError: 12 | print("Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten.") 13 | warned_flatten = True 14 | from torch._utils import _flatten_dense_tensors as flatten 15 | from torch._utils import _unflatten_dense_tensors as unflatten 16 | import torch.distributed as dist 17 | from torch.nn.modules import Module 18 | from torch.autograd import Variable 19 | from collections import OrderedDict 20 | from itertools import chain 21 | import copy 22 | 23 | # apply_dist_call requires that tensors in 'bucket' are all the same type. 
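# --- Illustrative sketch (not part of the upstream apex copy) ---
# apply_flat_dist_call below coalesces a bucket of same-dtype tensors into one flat buffer,
# runs the collective on that buffer, and copies the synced values back element-wise.
# The helper here shows that flatten -> (collective) -> unflatten/copy_ round trip, with the
# collective replaced by an in-place scale so it runs without an initialized process group.
def _flat_call_demo():
    bucket = [torch.ones(2), torch.full((3,), 2.0)]   # same dtype, as required above
    coalesced = flatten(bucket)                       # one contiguous 1-D buffer (5 elements)
    coalesced.mul_(0.5)                               # stand-in for dist.all_reduce(...) / world_size
    for buf, synced in zip(bucket, unflatten(coalesced, bucket)):
        buf.copy_(synced)                             # write the reduced values back in place
    return bucket                                     # -> [tensor([0.5, 0.5]), tensor([1., 1., 1.])]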
24 | 25 | 26 | def apply_flat_dist_call(bucket, call, extra_args=None): 27 | 28 | coalesced = flatten(bucket) 29 | #print("Rank", dist.get_rank(), "Broadcasting ", coalesced.device, " Size", coalesced.size()) 30 | if extra_args is not None: 31 | call(coalesced, *extra_args) 32 | else: 33 | call(coalesced) 34 | 35 | if call is dist.all_reduce: 36 | coalesced /= dist.get_world_size() 37 | 38 | for buf, synced in zip(bucket, unflatten(coalesced, bucket)): 39 | buf.copy_(synced) 40 | 41 | 42 | def split_half_float_double(tensors): 43 | dtypes = ["torch.cuda.HalfTensor", 44 | "torch.cuda.FloatTensor", "torch.cuda.DoubleTensor"] 45 | buckets = [] 46 | for i, dtype in enumerate(dtypes): 47 | bucket = [t for t in tensors if t.type() == dtype] 48 | if bucket: 49 | buckets.append(bucket) 50 | return buckets 51 | 52 | 53 | def split_by_type(tensors): 54 | buckets = OrderedDict() 55 | for tensor in tensors: 56 | tp = tensor.type() 57 | if tp not in buckets: 58 | buckets[tp] = [] 59 | buckets[tp].append(tensor) 60 | return buckets 61 | 62 | # flat_dist_call organizes 'tensors' by type. 63 | 64 | 65 | def flat_dist_call(tensors, call, extra_args=None): 66 | buckets = split_by_type(tensors) 67 | 68 | for tp in buckets: 69 | bucket = buckets[tp] 70 | apply_flat_dist_call(bucket, call, extra_args) 71 | 72 | 73 | def extract_tensors(maybe_tensor, tensor_list): 74 | if torch.is_tensor(maybe_tensor): 75 | tensor_list.append(maybe_tensor) 76 | else: 77 | try: 78 | for item in maybe_tensor: 79 | extract_tensors(item, tensor_list) 80 | except TypeError: 81 | return 82 | 83 | 84 | class Reducer(object): 85 | """ 86 | :class:`apex.parallel.Reducer` is a simple class that helps allreduce a module's parameters 87 | across processes. :class:`Reducer` is intended to give the user additional control: 88 | Unlike :class:`DistributedDataParallel`, :class:`Reducer` will not automatically allreduce 89 | parameters during ``backward()``. 90 | Instead, :class:`Reducer` waits for the user to call `.reduce()` manually. 91 | This enables, for example, delaying the allreduce to be carried out every 92 | several iterations instead of every single iteration. 93 | 94 | Like :class:`DistributedDataParallel`, :class:`Reducer` averages any tensors it allreduces 95 | over the number of participating processes. 96 | 97 | :class:`Reducer` is designed to work with the upstream launch utility script 98 | ``torch.distributed.launch`` with ``--nproc_per_node <= number of gpus per node``. 99 | When used with this launcher, :class:`Reducer` assumes 1:1 mapping of processes to GPUs. 100 | It also assumes that your script calls ``torch.cuda.set_device(args.rank)`` before creating the model. 101 | 102 | main_reducer.py in https://github.com/NVIDIA/apex/tree/master/examples/imagenet shows example usage. 103 | 104 | Args: 105 | module_or_grads_list: Either a network definition (module) being run in multi-gpu/distributed mode, or an iterable of gradients to be reduced. If a module is passed in, the Reducer constructor will sync the parameters across processes (broadcasting from rank 0) to make sure they're all initialized with the same values. If a list of gradients (that came from some module) is passed in, the user is responsible for manually syncing that module's parameters at the beginning of training. 
106 | """ 107 | 108 | def __init__(self, module_or_grads_list): 109 | if isinstance(module_or_grads_list, Module): 110 | self.module = module_or_grads_list 111 | flat_dist_call( 112 | [param.data for param in self.module.parameters()], dist.broadcast, (0,)) 113 | 114 | else: 115 | self.module = None 116 | self.grads = [] 117 | extract_tensors(module_or_grads_list, self.grads) 118 | 119 | def reduce(self): 120 | if self.module: 121 | grads = [param.grad.data for param in self.module.parameters() 122 | if param.grad is not None] 123 | flat_dist_call(grads, dist.all_reduce) 124 | else: 125 | flat_dist_call(self.grads, dist.all_reduce) 126 | 127 | 128 | class DistributedDataParallel(Module): 129 | """ 130 | :class:`apex.parallel.DistributedDataParallel` is a module wrapper that enables 131 | easy multiprocess distributed data parallel training, similar to ``torch.nn.parallel.DistributedDataParallel``. Parameters are broadcast across participating processes on initialization, and gradients are 132 | allreduced and averaged over processes during ``backward()``. 133 | 134 | :class:`DistributedDataParallel` is optimized for use with NCCL. It achieves high performance by 135 | overlapping communication with computation during ``backward()`` and bucketing smaller gradient 136 | transfers to reduce the total number of transfers required. 137 | 138 | :class:`DistributedDataParallel` is designed to work with the upstream launch utility script 139 | ``torch.distributed.launch`` with ``--nproc_per_node <= number of gpus per node``. 140 | When used with this launcher, :class:`DistributedDataParallel` assumes 1:1 mapping of processes to GPUs. 141 | It also assumes that your script calls ``torch.cuda.set_device(args.rank)`` before creating the model. 142 | 143 | https://github.com/NVIDIA/apex/tree/master/examples/distributed shows detailed usage. 144 | https://github.com/NVIDIA/apex/tree/master/examples/imagenet shows another example 145 | that combines :class:`DistributedDataParallel` with mixed precision training. 146 | 147 | Args: 148 | module: Network definition to be run in multi-gpu/distributed mode. 149 | message_size (int, default=1e7): Minimum number of elements in a communication bucket. 150 | delay_allreduce (bool, default=False): Delay all communication to the end of the backward pass. This disables overlapping communication with computation. 151 | allreduce_trigger_params (list, optional, default=None): If supplied, should contain a list of parameters drawn from the model. Allreduces will be kicked off whenever one of these parameters receives its gradient (as opposed to when a bucket of size message_size is full). At the end of backward(), a cleanup allreduce to catch any remaining gradients will also be performed automatically. If allreduce_trigger_params is supplied, the message_size argument will be ignored. 152 | allreduce_always_fp32 (bool, default=False): Convert any FP16 gradients to FP32 before allreducing. This can improve stability for widely scaled-out runs. 153 | gradient_average (bool, default=True): Option to toggle whether or not DDP averages the allreduced gradients over processes. For proper scaling, the default value of True is recommended. 154 | gradient_predivide_factor (float, default=1.0): Allows perfoming the average of gradients over processes partially before and partially after the allreduce. Before allreduce: ``grads.mul_(1.0/gradient_predivide_factor)``. After allreduce: ``grads.mul_(gradient_predivide_factor/world size)``. 
This can reduce the stress on the dynamic range of FP16 allreduces for widely scaled-out runs. 155 | 156 | .. warning:: 157 | If ``gradient_average=False``, the pre-allreduce division (``grads.mul_(1.0/gradient_predivide_factor)``) will still be applied, but the post-allreduce gradient averaging (``grads.mul_(gradient_predivide_factor/world size)``) will be omitted. 158 | 159 | """ 160 | 161 | def __init__(self, 162 | module, 163 | message_size=10000000, 164 | delay_allreduce=False, 165 | shared_param=None, 166 | allreduce_trigger_params=None, 167 | retain_allreduce_buffers=False, 168 | allreduce_always_fp32=False, 169 | gradient_average=True, 170 | gradient_predivide_factor=1.0, 171 | gradient_average_split_factor=None): 172 | super(DistributedDataParallel, self).__init__() 173 | 174 | # Backward/forward compatibility around 175 | # https://github.com/pytorch/pytorch/commit/540ef9b1fc5506369a48491af8a285a686689b36 and 176 | # https://github.com/pytorch/pytorch/commit/044d00516ccd6572c0d6ab6d54587155b02a3b86 177 | if hasattr(dist, "get_backend"): 178 | self._backend = dist.get_backend() 179 | if hasattr(dist, "DistBackend"): 180 | self.backend_enum_holder = dist.DistBackend 181 | else: 182 | self.backend_enum_holder = dist.Backend 183 | else: 184 | self._backend = dist._backend 185 | self.backend_enum_holder = dist.dist_backend 186 | 187 | self.warn_on_half = True if self._backend == self.backend_enum_holder.GLOO else False 188 | 189 | if shared_param is not None: 190 | raise ValueError("shared_param is no longer supported as an option. It was misleadingly named from the start. It turns out overlapping communication with computation should work fine with shared parameters. If you still wish to delay communication to the end of the backward pass, use delay_allreduce=True|False instead.") 191 | 192 | if gradient_average_split_factor is not None: 193 | print("Warning: gradient_average_split_factor has been renamed to gradient_predivide_factor. For now, gradient_average_split_factor will also work, but please update to gradient_predivide_factor instead.") 194 | self.gradient_predivide_factor = gradient_average_split_factor 195 | 196 | self.world_size = float(dist.get_world_size()) 197 | 198 | self.retain_allreduce_buffers = retain_allreduce_buffers 199 | self.allreduce_always_fp32 = allreduce_always_fp32 200 | self.gradient_average = gradient_average 201 | self.gradient_predivide_factor = gradient_predivide_factor 202 | 203 | self.custom_allreduce_triggers = False 204 | if allreduce_trigger_params is not None: 205 | if delay_allreduce: 206 | raise ValueError( 207 | "Setting allreduce_trigger_params is only valid if delay_allreduce=False.") 208 | self.custom_allreduce_triggers = True 209 | self.allreduce_trigger_params = set( 210 | [id(param) for param in allreduce_trigger_params]) 211 | 212 | self.delay_allreduce = delay_allreduce 213 | self.message_size = message_size 214 | 215 | self.reduction_stream = torch.cuda.Stream() 216 | self.reduction_event = torch.cuda.Event( 217 | enable_timing=False, blocking=False) 218 | 219 | self.module = module 220 | 221 | if self._backend == self.backend_enum_holder.NCCL: 222 | for param in self.module.parameters(): 223 | assert param.is_cuda, "NCCL backend only supports model parameters to be on GPU." 
224 | 225 | self.active_params = [] 226 | 227 | self.param_type_to_tmp_i = {"torch.cuda.HalfTensor": 0, 228 | "torch.cuda.FloatTensor": 1, 229 | "torch.cuda.DoubleTensor": 2} 230 | 231 | # to make sure reduction only happens after gradient accumulation 232 | self.need_reduction = False 233 | 234 | self.create_hooks() 235 | 236 | flat_dist_call( 237 | [param.data for param in self.module.parameters()], dist.broadcast, (0,)) 238 | 239 | def enable_need_reduction(self): 240 | self.need_reduction = True 241 | 242 | def disable_need_reduction(self): 243 | self.need_reduction = False 244 | 245 | def __setstate__(self, state): 246 | super(DistributedDataParallel, self).__setstate__(state) 247 | self.reduction_stream = torch.cuda.Stream() 248 | self.reduction_event = torch.cuda.Event( 249 | enable_timing=False, blocking=False) 250 | 251 | def __getstate__(self): 252 | attrs = copy.copy(self.__dict__) 253 | if self._backend != self.backend_enum_holder.NCCL: 254 | del attrs['self.reduction_stream'] 255 | del attrs['self.reduction_event'] 256 | return attrs 257 | 258 | # Broadcast rank 0's bucket structure across all processes, and have all processes 259 | # regenerate their bucket structures to match. 260 | def sync_bucket_structure(self): 261 | # Append leftover buckets 262 | for tmp_bucket in self.tmp_buckets: 263 | if len(tmp_bucket) > 0: 264 | self.active_i_buckets.append(tmp_bucket) 265 | 266 | self.num_buckets = len(self.active_i_buckets) 267 | self.bucket_sizes = [len(bucket) for bucket in self.active_i_buckets] 268 | 269 | info_tensor = torch.cuda.IntTensor([self.num_buckets] + 270 | self.bucket_sizes + 271 | list(chain(*self.active_i_buckets))) 272 | #print("Sync Bucket Structure Broadcast. Rank", dist.get_rank(), "Tensor Size ", info_tensor.size(), "Device ", info_tensor.device, "Current Device ", torch.cuda.current_device()) 273 | dist.broadcast(info_tensor, 0) 274 | 275 | info = [int(entry) for entry in info_tensor] 276 | 277 | self.num_buckets = info[0] 278 | self.bucket_sizes = info[1:self.num_buckets + 1] 279 | self.buckets = [[None for _ in range(self.bucket_sizes[i])] 280 | for i in range(self.num_buckets)] 281 | # Technically, active_i_buckets' work is done. But the information is still useful to 282 | # keep around. Therefore, refresh active_i_buckets based on rank 0 as well. 283 | self.active_i_buckets = [[None for _ in range(self.bucket_sizes[i])] 284 | for i in range(self.num_buckets)] 285 | 286 | flattened_buckets = info[self.num_buckets + 1:] 287 | flat_i = 0 288 | for bucket_idx in range(self.num_buckets): 289 | for bucket_loc in range(self.bucket_sizes[bucket_idx]): 290 | param_i = flattened_buckets[flat_i] 291 | self.active_i_buckets[bucket_idx][bucket_loc] = param_i 292 | self.param_id_to_bucket[id(self.active_params[param_i])] = ( 293 | bucket_idx, bucket_loc) 294 | flat_i += 1 295 | 296 | def create_hooks(self): 297 | # Fallback hook that's only called at the end of backward. 298 | # Used if you deliberately want to delay allreduces to the end, or to refresh the 299 | # bucket structure that will be used to overlap communication with computation in later 300 | # iterations. 
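# Note on the hook mechanism used below (descriptive only): for every parameter that
# requires grad, param.expand_as(param).grad_fn.next_functions[0][0] retrieves the autograd
# AccumulateGrad node for that parameter, and allreduce_hook is registered on it. The hook
# fires as soon as that parameter's gradient is ready during backward(), which is what lets
# full buckets be allreduced (on reduction_stream) while the rest of backward is still
# running; allreduce_params / overlapping_backward_epilogue are queued once per backward
# pass as end-of-backward callbacks to flush or sanity-check whatever remains.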
301 | def allreduce_params(): 302 | # Bucket record refresh 303 | if not self.delay_allreduce: 304 | if self.needs_refresh: 305 | self.sync_bucket_structure() 306 | 307 | self.needs_refresh = False 308 | 309 | self.allreduce_fallback() 310 | 311 | def overlapping_backward_epilogue(): 312 | self.reduction_stream.record_event(self.reduction_event) 313 | torch.cuda.current_stream().wait_event(self.reduction_event) 314 | 315 | # Sanity checks that all the buckets were kicked off 316 | if self.next_bucket != self.num_buckets: 317 | raise RuntimeError("In epilogue, next_bucket ({}) != num_buckets ({}). ".format( 318 | self.next_bucket, self.num_buckets), 319 | "This probably indicates some buckets were not allreduced.") 320 | 321 | for actual, expected in zip(self.buckets_ready_size, self.bucket_sizes): 322 | if actual != expected: 323 | raise RuntimeError( 324 | "Some param buckets were not allreduced.") 325 | 326 | self.grad_accs = [] 327 | for param in self.module.parameters(): 328 | if param.requires_grad: 329 | def wrapper(param): 330 | param_tmp = param.expand_as(param) 331 | grad_acc = param_tmp.grad_fn.next_functions[0][0] 332 | 333 | def allreduce_hook(*unused): 334 | # user must explicitly specify when to do all reduce 335 | if self.need_reduction == False: 336 | #print("Does not need Reduction") 337 | return 338 | #print("Needs Reduction") 339 | if self.delay_allreduce or self.needs_refresh: 340 | # TODO: How do we want to handle multiple backward passes between 341 | # each forward, e.g., backward passes with retain_graph=True? 342 | # needs_refresh and callback_queued are both vulnerable states. 343 | if not self.delay_allreduce and self.needs_refresh: 344 | # Use the backward pass to build the bucket structure on the fly. 345 | active_i = self.param_id_to_active_i[id(param)] 346 | 347 | # Float, half, and double tensors are grouped into buckets separately. 348 | current_type = self.param_type_to_tmp_i[param.type( 349 | )] 350 | 351 | self.tmp_buckets[current_type].append(active_i) 352 | 353 | ship_tmp_bucket = False 354 | if self.custom_allreduce_triggers: 355 | if id(param) in self.allreduce_trigger_params: 356 | ship_tmp_bucket = True 357 | else: 358 | self.tmp_numels[current_type] += param.numel() 359 | if self.tmp_numels[current_type] >= self.message_size: 360 | ship_tmp_bucket = True 361 | 362 | # To consider: If custom_allreduce_triggers are in use, ship all 363 | # tmp_buckets, not just tmp_buckets[current_type]. 
364 | if ship_tmp_bucket: 365 | self.active_i_buckets.append( 366 | self.tmp_buckets[current_type]) 367 | self.tmp_buckets[current_type] = [] 368 | self.tmp_numels[current_type] = 0 369 | 370 | if not self.callback_queued: 371 | Variable._execution_engine.queue_callback( 372 | allreduce_params) 373 | self.callback_queued = True 374 | else: 375 | if not self.callback_queued: 376 | Variable._execution_engine.queue_callback( 377 | overlapping_backward_epilogue) 378 | self.callback_queued = True 379 | 380 | self.comm_ready_buckets(param) 381 | 382 | grad_acc.register_hook(allreduce_hook) 383 | self.grad_accs.append(grad_acc) 384 | 385 | wrapper(param) 386 | 387 | def allreduce_bucket(self, bucket): 388 | tensor = flatten(bucket) 389 | 390 | tensor_to_allreduce = tensor 391 | 392 | if self.allreduce_always_fp32: 393 | tensor_to_allreduce = tensor.float() 394 | 395 | if self.gradient_predivide_factor != 1.0: 396 | tensor_to_allreduce.mul_(1./self.gradient_predivide_factor) 397 | 398 | dist.all_reduce(tensor_to_allreduce) 399 | 400 | if self.gradient_average: 401 | tensor_to_allreduce.mul_( 402 | self.gradient_predivide_factor/self.world_size) 403 | 404 | if self.allreduce_always_fp32 and tensor is not tensor_to_allreduce: 405 | tensor.copy_(tensor_to_allreduce) 406 | 407 | return tensor 408 | 409 | def allreduce_maybe_retain(self, bucket, bucket_idx=-1): 410 | allreduced = self.allreduce_bucket(bucket) 411 | if self.retain_allreduce_buffers: 412 | if self.allreduce_buffers[bucket_idx] is not None: 413 | raise RuntimeError("The backward pass is attempting to replace an already-filled " 414 | "allreduce buffer. This is almost certainly an error.") 415 | self.allreduce_buffers[bucket_idx] = allreduced 416 | else: 417 | for buf, synced in zip(bucket, unflatten(allreduced, bucket)): 418 | buf.copy_(synced) 419 | 420 | def allreduce_fallback(self): 421 | grads = [param.grad.data for param in self.module.parameters() 422 | if param.grad is not None] 423 | 424 | split_buckets = split_half_float_double(grads) 425 | 426 | # If retain_allreduce_buffers is True and delay_allreduce is False, 427 | # this will only be done during the first backward pass, ignored by the 428 | # training script, and overwritten in the next forward pass. So it's harmless. 429 | if self.retain_allreduce_buffers: 430 | self.allreduce_buffers = [None for _ in range(len(split_buckets))] 431 | 432 | for i, bucket in enumerate(split_buckets): 433 | allreduced = self.allreduce_maybe_retain(bucket, i) 434 | 435 | def comm_ready_buckets(self, param): 436 | # Need to do this in every hook for compatibility with Ruberry's streaming backward PR. 437 | # self.reduction_stream.wait_stream(torch.cuda.current_stream()) 438 | #if dist.get_rank() == 0: 439 | # print("Parameter Name", param.name) 440 | bucket_idx, bucket_loc = self.param_id_to_bucket[id(param)] 441 | 442 | if self.buckets[bucket_idx][bucket_loc] is not None: 443 | raise RuntimeError("The backward pass is attempting to replace an already-filled " 444 | "bucket slot. 
This is almost certainly an error.") 445 | 446 | self.buckets[bucket_idx][bucket_loc] = param.grad.data 447 | self.buckets_ready_size[bucket_idx] += 1 448 | 449 | if self.buckets_ready_size[bucket_idx] == self.bucket_sizes[bucket_idx]: 450 | if bucket_idx == self.next_bucket: 451 | torch.cuda.current_stream().record_event(self.reduction_event) 452 | self.reduction_stream.wait_event(self.reduction_event) 453 | with torch.cuda.stream(self.reduction_stream): 454 | self.allreduce_maybe_retain( 455 | self.buckets[bucket_idx], bucket_idx) 456 | 457 | self.next_bucket += 1 458 | 459 | # Reversing upstream's logic here, because we constructed our buckets based on 460 | # the order things were received during backward. 461 | if len(self.ready_buckets_not_reduced) > 0: 462 | sorted_todo = sorted(self.ready_buckets_not_reduced) 463 | for i in sorted_todo: 464 | # Nothing can be reduced now 465 | if i > self.next_bucket: 466 | break 467 | elif i == self.next_bucket: 468 | self.allreduce_maybe_retain(self.buckets[i], i) 469 | self.ready_buckets_not_reduced.remove(i) 470 | self.next_bucket += 1 471 | else: 472 | raise ValueError( 473 | "i should always be >= next_bucket") 474 | else: 475 | self.ready_buckets_not_reduced.add(bucket_idx) 476 | 477 | 478 | def needs_refresh(self): 479 | self.needs_refresh = True 480 | 481 | def forward(self, *inputs, **kwargs): 482 | result = self.module(*inputs, **kwargs) 483 | 484 | if not self.delay_allreduce: 485 | param_list = [ 486 | param for param in self.module.parameters() if param.requires_grad] 487 | 488 | # Conditions under which to refresh self.record 489 | # Forward has the authority to set needs_refresh to True, but only allreduce_params 490 | # in backward has the authority to set needs_refresh to False. 491 | # Parentheses are not necessary for correct order of operations, but make the intent clearer. 
492 | if ((not self.active_params) or 493 | (len(param_list) != len(self.active_params)) or 494 | any([param1 is not param2 for param1, param2 in zip(param_list, self.active_params)])): 495 | self.needs_refresh = True 496 | #self.needs_refresh = True 497 | if self.needs_refresh: 498 | self.active_i_buckets = [] 499 | self.buckets = [] 500 | # [running half, float, double buckets] 501 | self.tmp_buckets = [[], [], []] 502 | self.tmp_numels = [0, 0, 0] 503 | self.bucket_sizes = [] 504 | self.param_id_to_active_i = { 505 | id(param): i for i, param in enumerate(param_list)} 506 | self.param_id_to_bucket = {} 507 | else: 508 | self.buckets = [[None for _ in range(self.bucket_sizes[i])] 509 | for i in range(self.num_buckets)] 510 | self.buckets_ready_size = [0 for i in range(self.num_buckets)] 511 | if(self.retain_allreduce_buffers): 512 | self.allreduce_buffers = [ 513 | None for _ in range(self.num_buckets)] 514 | self.next_bucket = 0 515 | self.ready_buckets_not_reduced = set() 516 | 517 | self.active_params = param_list 518 | 519 | self.callback_queued = False 520 | 521 | return result 522 | -------------------------------------------------------------------------------- /pretrain/PyTorch/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | 5 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 6 | datefmt='%m/%d/%Y %H:%M:%S', 7 | level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class Logger(): 12 | def __init__(self, cuda=False): 13 | self.logger = logging.getLogger(__name__) 14 | self.cuda = cuda 15 | 16 | def info(self, message, *args, **kwargs): 17 | local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) 18 | if (self.cuda and local_rank == 0) or not self.cuda: 19 | self.logger.info(message, *args, **kwargs) 20 | 21 | def error(self, message, *args, **kwargs): 22 | self.logger.error(message, *args, **kwargs) 23 | -------------------------------------------------------------------------------- /pretrain/PyTorch/models.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import random 4 | import numpy as np 5 | import os 6 | import torch 7 | import json 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import torch.distributed as dist 11 | from torch.nn import CrossEntropyLoss, MSELoss 12 | from logger import Logger 13 | 14 | from dataset import BatchType 15 | from pytorch_pretrained_bert.tokenization import BertTokenizer 16 | from pytorch_pretrained_bert.modeling import BertModel, BertConfig 17 | from pytorch_pretrained_bert.modeling import BertPreTrainingHeads, BertPreTrainedModel, BertPreTrainingHeads, BertLMPredictionHead 18 | from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE 19 | 20 | 21 | class BertPretrainingLoss(BertPreTrainedModel): 22 | def __init__(self, bert_encoder, config): 23 | super(BertPretrainingLoss, self).__init__(config) 24 | self.bert = bert_encoder 25 | self.cls = BertPreTrainingHeads( 26 | config, self.bert.embeddings.word_embeddings.weight) 27 | self.cls.apply(self.init_bert_weights) 28 | 29 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None): 30 | sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, 31 | output_all_encoded_layers=False) 32 | prediction_scores, seq_relationship_score = self.cls( 33 | 
sequence_output, pooled_output) 34 | 35 | if masked_lm_labels is not None and next_sentence_label is not None: 36 | loss_fct = CrossEntropyLoss(ignore_index=-1) 37 | next_sentence_loss = loss_fct( 38 | seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) 39 | masked_lm_loss = loss_fct( 40 | prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) 41 | total_loss = masked_lm_loss + next_sentence_loss 42 | return total_loss 43 | else: 44 | return prediction_scores, seq_relationship_score 45 | 46 | 47 | class MTLRouting(nn.Module): 48 | """This setup is to add MultiTask Training support in BERT Training. 49 | """ 50 | def __init__(self, encoder: BertModel, write_log, summary_writer): 51 | super(MTLRouting, self).__init__() 52 | self.bert_encoder = encoder 53 | self._batch_loss_calculation = nn.ModuleDict() 54 | self._batch_counter = {} 55 | self._batch_module_name = {} 56 | self._batch_name = {} 57 | self.write_log = write_log 58 | self.logger = Logger(cuda=torch.cuda.is_available()) 59 | self.summary_writer = summary_writer 60 | 61 | def register_batch(self, batch_type, module_name, loss_calculation: nn.Module): 62 | assert isinstance(loss_calculation, nn.Module) 63 | self._batch_loss_calculation[str(batch_type.value)] = loss_calculation 64 | self._batch_counter[batch_type] = 0 65 | self._batch_module_name[batch_type] = module_name 66 | 67 | def log_summary_writer(self, batch_type, logs: dict, base='Train'): 68 | if self.write_log: 69 | counter = self._batch_counter[batch_type] 70 | module_name = self._batch_module_name.get( 71 | batch_type, self._get_batch_type_error(batch_type)) 72 | for key, log in logs.items(): 73 | self.summary_writer.add_scalar( 74 | f'{base}/{module_name}/{key}', log, counter) 75 | self._batch_counter[batch_type] = counter + 1 76 | 77 | def _get_batch_type_error(self, batch_type): 78 | def f(*args, **kwargs): 79 | message = f'Misunderstood batch type of {batch_type}' 80 | self.logger.error(message) 81 | raise ValueError(message) 82 | return f 83 | 84 | def forward(self, batch, log=True): 85 | batch_type = batch[0][0].item() 86 | 87 | # Pretrain Batch 88 | if batch_type == BatchType.PRETRAIN_BATCH: 89 | loss_function = self._batch_loss_calculation[str(batch_type)] 90 | 91 | loss = loss_function(input_ids=batch[1], 92 | token_type_ids=batch[3], 93 | attention_mask=batch[2], 94 | masked_lm_labels=batch[5], 95 | next_sentence_label=batch[4]) 96 | if log: 97 | self.log_summary_writer( 98 | batch_type, logs={'pretrain_loss': loss.item()}) 99 | return loss 100 | 101 | 102 | class BertMultiTask: 103 | def __init__(self, job_config, use_pretrain, tokenizer, cache_dir, device, write_log, summary_writer): 104 | self.job_config = job_config 105 | 106 | if not use_pretrain: 107 | model_config = self.job_config.get_model_config() 108 | bert_config = BertConfig(**model_config) 109 | bert_config.vocab_size = len(tokenizer.vocab) 110 | 111 | self.bert_encoder = BertModel(bert_config) 112 | # Use pretrained bert weights 113 | else: 114 | self.bert_encoder = BertModel.from_pretrained(self.job_config.get_model_file_type(), cache_dir=cache_dir) 115 | bert_config = self.bert_encoder.config 116 | 117 | self.network=MTLRouting(self.bert_encoder, write_log = write_log, summary_writer = summary_writer) 118 | 119 | #config_data=self.config['data'] 120 | 121 | # Pretrain Dataset 122 | self.network.register_batch(BatchType.PRETRAIN_BATCH, "pretrain_dataset", loss_calculation=BertPretrainingLoss(self.bert_encoder, bert_config)) 123 | 124 | self.device=device 125 
| # self.network = self.network.float() 126 | # print(f"Bert ID: {id(self.bert_encoder)} from GPU: {dist.get_rank()}") 127 | 128 | def save(self, filename: str): 129 | network=self.network.module 130 | return torch.save(network.state_dict(), filename) 131 | 132 | def load(self, model_state_dict: str): 133 | return self.network.module.load_state_dict(torch.load(model_state_dict, map_location=lambda storage, loc: storage)) 134 | 135 | def move_batch(self, batch, non_blocking=False): 136 | return batch.to(self.device, non_blocking) 137 | 138 | def eval(self): 139 | self.network.eval() 140 | 141 | def train(self): 142 | self.network.train() 143 | 144 | def save_bert(self, filename: str): 145 | return torch.save(self.bert_encoder.state_dict(), filename) 146 | 147 | def to(self, device): 148 | assert isinstance(device, torch.device) 149 | self.network.to(device) 150 | 151 | def half(self): 152 | self.network.half() 153 | -------------------------------------------------------------------------------- /pretrain/PyTorch/notebooks/BERT_Pretrain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pretraining of the BERT model\n", 8 | "\n", 9 | "This notebook contains an end-to-end walkthrough of using Azure Machine Learning service and pretraining [BERT: Bidirectional Encoder Representations from Transformers](https://arxiv.org/abs/1810.04805) models.\n", 10 | "\n", 11 | "Methodology:\n", 12 | "- Intialize an AzureML workspace\n", 13 | "- Register a datastore\n", 14 | "- Create an experiment\n", 15 | "- Provision a compute target\n", 16 | "- Create an Estimator\n", 17 | "- Configure and Run" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Prerequisites\n", 25 | "If you are using an [Azure Machine Learning Notebook VM](https://docs.microsoft.com/en-us/azure/machine-learning/service/quickstart-run-cloud-notebook), you are all set. Otherwise, refer to the [configuration Notebook](https://github.com/Azure/MachineLearningNotebooks/blob/56e0ebc5acb9614fac51d8b98ede5acee8003820/configuration.ipynb) first if you haven't already to establish your connection to the AzureML Workspace. 
Prerequisites are:\n", 26 | "* Azure subscription\n", 27 | "* Azure Machine Learning Workspace\n", 28 | "* Azure Machine Learning SDK" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Library import" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# Regular python libraries\n", 45 | "import os\n", 46 | "import requests\n", 47 | "import sys\n", 48 | "\n", 49 | "# AzureML libraries\n", 50 | "import azureml.core\n", 51 | "from azureml.core import Experiment, Workspace, Datastore, Run\n", 52 | "from azureml.core.compute import ComputeTarget, AmlCompute\n", 53 | "from azureml.core.compute_target import ComputeTargetException\n", 54 | "from azureml.core.conda_dependencies import CondaDependencies\n", 55 | "from azureml.core.container_registry import ContainerRegistry\n", 56 | "from azureml.core.runconfig import MpiConfiguration, RunConfiguration, DEFAULT_GPU_IMAGE\n", 57 | "from azureml.train.dnn import PyTorch\n", 58 | "from azureml.train.estimator import Estimator\n", 59 | "from azureml.widgets import RunDetails\n", 60 | "\n", 61 | "# Check core SDK version number\n", 62 | "print(\"SDK version:\", azureml.core.VERSION)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Workspace setup\n", 70 | "\n", 71 | "Initialize a Workspace object from the existing workspace you created in the Prerequisites step or create a new one." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# Retrieve the workspace\n", 81 | "ws = Workspace.setup()\n", 82 | "\n", 83 | "# Print the workspace attributes\n", 84 | "print('Workspace name: ' + ws.name, \n", 85 | " 'Workspace region: ' + ws.location, \n", 86 | " 'Subscription id: ' + ws.subscription_id, \n", 87 | " 'Resource group: ' + ws.resource_group, sep = '\\n')" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## Datastore registration" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "[BERT paper](https://arxiv.org/pdf/1810.04805) references `Wikipedia` and `BookCorpus` datasets for pretraining. This notebook is configured to use Wikipedia dataset only, but can be used with other datasets as well, including custom datasets. The preprocessed data should be available in a `Datastore` in AzureML `Workspace`. \n", 102 | "\n", 103 | "The Wikipedia corpus used for BERT pretraining is preprocessed following the [data prep instructions](https://github.com/microsoft/AzureML-BERT/blob/master/docs/dataprep.md) and uploaded to https://bertonazuremlwestus2.blob.core.windows.net/public2/bert_data.tar.gz (70 GB). You need to extract the files and copy them to another Azure blob container and register it as a workspace to use it in the pretraining job. Additional details on the tar.gz file and the data transfer are available at [artifacts.md](https://github.com/microsoft/AzureML-BERT/blob/master/docs/artifacts.md).\n", 104 | "\n", 105 | "Alternatively, you can preprocess the raw data from scratch (instructions available at the [data prep notes](https://github.com/microsoft/AzureML-BERT/blob/master/docs/dataprep.md)), upload that to an Azure blob container and use it as the datastore for the job. 
\n", 106 | "\n", 107 | "Note: it is also possible to use datasets other than Wikipedia corpus with this implementation. \n", 108 | "\n", 109 | "The following code assumes that the data is already copied to an Azure blob container with the following directory structure. It is recommended to retain this directory structure to run this notebook without code updates. In case the directory structure is different, the constructor of PyTorch estimator where the datastore is mounted should be modified.\n", 110 | "\n", 111 | " \n", 112 | "```\n", 113 | "bert_data\n", 114 | "│ bert-base.json\n", 115 | "│ bert-large.json\n", 116 | "│ bert-base-single-node.json\n", 117 | "│ bert-large-single-node.json\n", 118 | "│\n", 119 | "└───512\n", 120 | "│ │\n", 121 | "│ └───wiki_pretrain\n", 122 | "│ │ wikipedia_segmented_part_0.bin\n", 123 | "│ │ wikipedia_segmented_part_1.bin\n", 124 | "│ │ ...\n", 125 | "│ │ wikipedia_segmented_part_98.bin\n", 126 | "└───validation_512_only\n", 127 | " │\n", 128 | " └───validation_set.bin\n", 129 | "\n", 130 | "```" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "# Register the datastore with the workspace\n", 140 | "ds = Datastore.register_azure_blob_container(workspace=ws, \n", 141 | " datastore_name='BERT_Preprocessed_Data',\n", 142 | " container_name='data',\n", 143 | " account_name='', \n", 144 | " account_key=''\n", 145 | " )\n", 146 | "\n", 147 | "# Help from: https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-access-data" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "# Print the workspace attributes\n", 157 | "print('Datastore name: ' + ds.name, \n", 158 | " 'Container name: ' + ds.container_name, \n", 159 | " 'Datastore type: ' + ds.datastore_type, \n", 160 | " 'Workspace name: ' + ds.workspace.name, sep = '\\n')" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "## Create an Experiment\n", 168 | "\n", 169 | "Experiment is a logical container in an Azure ML Workspace. It hosts run records which can include run metrics and output artifacts from your experiments." 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "# Create an experiment\n", 179 | "experiment_name = 'BERT-pretraining'\n", 180 | "experiment = Experiment(ws, name=experiment_name)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "## Provision a cluster\n", 188 | "\n", 189 | "### Introduction to AmlCompute\n", 190 | "\n", 191 | "Azure Machine Learning Compute is managed compute infrastructure that allows the user to easily create single to multi-node compute of the appropriate VM Family. It is created within your workspace region and is a resource that can be used by other users in your workspace. 
It autoscales by default to the max_nodes, when a job is submitted, and executes in a containerized environment packaging the dependencies as specified by the user.\n", 192 | "\n", 193 | "Since it is managed compute, job scheduling and cluster management are handled internally by Azure Machine Learning service.\n", 194 | "\n", 195 | "For more information on Azure Machine Learning Compute, please read [this](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute).\n", 196 | "\n", 197 | "Note: As with other Azure services, there are limits on certain resources (for eg. AmlCompute quota) associated with the Azure Machine Learning service. Please read [this](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota." 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "### Create a compute target\n", 205 | "BERT pretraining on Azure Machine Learning Service is supported on 16 x `Standard_NC24s_v3` or 8 x `Standard_ND40_v2` VMs. In the next step, you will create a 16 node (i.e. 64 GPUs) AMLCompute cluster of `Standard_NC24s_v3` GPU VMs, if it doesn't already exist in your workspace. The code to create a cluster with 8 `Standard_ND40_v2` VMs is commented out in the cell below.\n", 206 | "\n", 207 | "* vm_size: VM family of the nodes provisioned by AmlCompute. Simply choose from the supported_vmsizes() above\n", 208 | "* max_nodes: Maximum nodes to autoscale to while running a job on AmlCompute\n", 209 | "* min_nodes: Minimum number of nodes while running a job on AmlCompute" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "# Create the compute cluster\n", 219 | "gpu_cluster_name = \"pretraincluster\" \n", 220 | "\n", 221 | "# Verify that the cluster doesn't exist already\n", 222 | "try:\n", 223 | " gpu_compute_target = ComputeTarget(workspace=ws, name=gpu_cluster_name)\n", 224 | " print('Found existing compute target.')\n", 225 | "except ComputeTargetException:\n", 226 | " print('Creating a new compute target...')\n", 227 | " compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC24s_v3', min_nodes=0, max_nodes=16)\n", 228 | " # compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC40_v2', min_nodes=0, max_nodes=8)\n", 229 | " \n", 230 | " # create the cluster\n", 231 | " gpu_compute_target = ComputeTarget.create(ws, gpu_cluster_name, compute_config)\n", 232 | " gpu_compute_target.wait_for_completion(show_output=True)\n", 233 | "\n", 234 | "# Use the 'status' property to get a detailed status for the current cluster. \n", 235 | "#print(gpu_compute_target.status.serialize())" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "## Estimator definition and run submission\n", 243 | "\n", 244 | "The estimator uses a custom docker image and train.py as the entry script for execution.\n", 245 | "\n", 246 | "For more information on Estimator, refer [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-train-pytorch)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "# Define the project folder\n", 256 | "project_folder = '..' 
# This is to allow the libraries stored under pytorch/ to be loaded\n", 257 | "\n", 258 | "## Using a public image published on Azure.\n", 259 | "image_name = 'mcr.microsoft.com/azureml/bert:pretrain-openmpi3.1.2-cuda10.0-cudnn7-ubuntu16.04'\n", 260 | "\n", 261 | "# Using MPI to execute a distributed run\n", 262 | "mpi = MpiConfiguration()\n", 263 | "# Standard_NC24s_v3 VM has 4 GPUs. !!!! update this appropriately if you use a different VM size !!!!\n", 264 | "mpi.process_count_per_node = 4 \n", 265 | "# !!!! use the following for Standard_NC40_v2 VM !!!!\n", 266 | "# mpi.process_count_per_node = 8\n", 267 | "\n", 268 | "# Define the Pytorch estimator\n", 269 | "estimator = PyTorch(source_directory=project_folder,\n", 270 | " # Compute configuration\n", 271 | " compute_target=gpu_compute_target,\n", 272 | " node_count=16, \n", 273 | " distributed_training=mpi,\n", 274 | " use_gpu=True,\n", 275 | " \n", 276 | " #Docker image\n", 277 | " use_docker=True,\n", 278 | " custom_docker_image=image_name,\n", 279 | " user_managed=True,\n", 280 | " \n", 281 | " # Training script parameters\n", 282 | " script_params = {\n", 283 | " # Required Params\n", 284 | " \"--config_file\": \"bert-large.json\",\n", 285 | " # bert_data is where pre-processed training data are\n", 286 | " '--train_path' : ds.path('bert_data/512/wiki_pretrain/').as_mount(), \n", 287 | " '--validation_path':ds.path('bert_data/validation_512_only/').as_mount(),\n", 288 | " # Optional Params\n", 289 | " \"--max_seq_length\": 512,\n", 290 | " \"--max_predictions_per_seq\": 80,\n", 291 | " \"--masked_lm_prob\": 0.15,\n", 292 | " \"--train_batch_size\": 64,\n", 293 | " '--seed': 42,\n", 294 | " '--accumulate_gradients': \"True\",\n", 295 | " '--gradient_accumulation_steps': 16,\n", 296 | " '--fp16': \"True\",\n", 297 | " '--loss_scale': 0,\n", 298 | " '--epochs' : 2,\n", 299 | " '--config_file_path' :ds.path('bert_data/').as_mount() ,\n", 300 | " '--output_dir':ds.path(f'bert_data/output/{experiment_name}/').as_mount(),\n", 301 | " '--best_cp_dir':ds.path(f'bert_data/best_cp/{experiment_name}/').as_mount(),\n", 302 | " '--latest_cp_dir':ds.path(f'bert_data/latest_cp/{experiment_name}/').as_mount(),\n", 303 | " '--backend':\"nccl\"\n", 304 | " },\n", 305 | " \n", 306 | " entry_script='train.py',\n", 307 | " inputs=[ds.path('bert_data/').as_mount()]\n", 308 | " )\n", 309 | "# path to the Python environment in the custom Docker image\n", 310 | "estimator._estimator_config.environment.python.interpreter_path = '/opt/miniconda/envs/amlbert/bin/python'" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "For single node (1 NC24s_v3 VM), multi-GPU runs for debugging purposes, use the following configuration:\n", 318 | "- node_count=1, '--config_file':`bert-base-single-node.json`, '--gradient_accumulation_steps': `64`,\"--train_batch_size\": `1024` (for bert-base)\n", 319 | "- node_count=1, '--config_file':`bert-large-single-node.json`,'--gradient_accumulation_steps': `256`,\"--train_batch_size\": `1024` (for bert-large)\n", 320 | "\n", 321 | "To resume from the latest checkpoint, use `load_training_checkpoint` parameter to pass the checkpoint directory. 
It will load the latest checkpoint from the directory.\n", 322 | "\n", 323 | "'--load_training_checkpoint':ds.path(f'bert_data/latest_cp/{experiment_name}/').as_mount()," 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "# Submit the run\n", 333 | "run = experiment.submit(estimator)\n", 334 | "RunDetails(run).show()" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "# Python packages required to use Tensorboard with AzureML are azureml-tensorboard, tensorboardX and tensorboard\n", 344 | "# The Tensorboard constructor takes an array of runs, so be sure and pass it in as a single-element array here\n", 345 | "from azureml.tensorboard import Tensorboard\n", 346 | "tb = Tensorboard([run])\n", 347 | "tb.start()" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "#tb.stop()" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "#run.cancel()" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "# Downloading log file to run the perf benchmarking script\n", 375 | "fetched_run = Run(experiment, run.id)\n", 376 | "for f in fetched_run.get_file_names():\n", 377 | " if \"70_driver_log_rank_0.txt\" in f:\n", 378 | " dest = os.path.join('outputs', f.split('/')[-1])\n", 379 | " print('Downloading file {} to {}...'.format(f, dest))\n", 380 | " fetched_run.download_file(f, dest) " 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "# Get average throughput of the training run\n", 390 | "sys.path.append(os.path.abspath(os.path.join('..', '')))\n", 391 | "from benchmark import *\n", 392 | "get_perf_metrics(dest)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [] 401 | } 402 | ], 403 | "metadata": { 404 | "file_extension": ".py", 405 | "kernelspec": { 406 | "display_name": "Python 3", 407 | "language": "python", 408 | "name": "python3" 409 | }, 410 | "language_info": { 411 | "codemirror_mode": { 412 | "name": "ipython", 413 | "version": 3 414 | }, 415 | "file_extension": ".py", 416 | "mimetype": "text/x-python", 417 | "name": "python", 418 | "nbconvert_exporter": "python", 419 | "pygments_lexer": "ipython3", 420 | "version": "3.6.9" 421 | }, 422 | "mimetype": "text/x-python", 423 | "name": "python", 424 | "npconvert_exporter": "python", 425 | "pygments_lexer": "ipython3", 426 | "version": 3 427 | }, 428 | "nbformat": 4, 429 | "nbformat_minor": 2 430 | } -------------------------------------------------------------------------------- /pretrain/PyTorch/optimization.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | def warmup_linear(x, warmup=0.002): 4 | if warmup == 0.0: 5 | return 1.0 6 | elif x < warmup: 7 | return x/warmup 8 | return 1.0 - x 9 | 10 | 11 | def warmup_linear_decay_exp(global_step, decay_rate, decay_steps, total_steps, warmup=0.002): 12 | x = global_step/total_steps 13 | warmup_end = warmup * total_steps 14 | if warmup == 0.0: 15 | return 1.0 16 | elif x < 
warmup: 17 | return x/warmup 18 | return decay_rate**((global_step-warmup_end)/decay_steps) 19 | 20 | class LinearWarmupExponentialSchedule(): 21 | def __init__(self, warmup=0.002, t_total=-1, initial_lr = 2e-5, final_lr=5e-6, decay_rate=0.99): 22 | self.warmup = warmup 23 | self.total_steps = t_total 24 | self.decay_rate = decay_rate 25 | self.warmup_end = self.warmup * t_total 26 | 27 | # Calculate the decay Steps 28 | self.decay_steps = int(math.ceil((math.log(self.decay_rate)/ math.log(final_lr/initial_lr)) * (1.0 - warmup) * t_total)) 29 | 30 | def get_lr(self, global_step): 31 | x = global_step/self.total_steps 32 | if self.warmup == 0.0: 33 | return 1.0 34 | elif x < self.warmup: 35 | return x/self.warmup 36 | return self.decay_rate**((global_step-self.warmup_end)/self.decay_steps) 37 | -------------------------------------------------------------------------------- /pretrain/PyTorch/sources.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from typing import Tuple 3 | from random import shuffle 4 | import pickle 5 | import random 6 | 7 | from pytorch_pretrained_bert.tokenization import BertTokenizer 8 | 9 | 10 | def truncate_input_sequence(tokens_a, tokens_b, max_num_tokens): 11 | while True: 12 | total_length = len(tokens_a) + len(tokens_b) 13 | if total_length <= max_num_tokens: 14 | break 15 | 16 | trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b 17 | assert len(trunc_tokens) >= 1 18 | 19 | # We want to sometimes truncate from the front and sometimes from the 20 | # back to add more randomness and avoid biases. 21 | if random.random() < 0.5: 22 | del trunc_tokens[0] 23 | else: 24 | trunc_tokens.pop() 25 | 26 | 27 | class TokenInstance: 28 | def __init__(self, tokens_a, tokens_b, is_next): 29 | self.tokens_a = tokens_a 30 | self.tokens_b = tokens_b 31 | self.is_next = is_next # 0 is if in continuation, 1 if is random 32 | 33 | def get_values(self): 34 | return (self.tokens_a, self.tokens_b, self.is_next) 35 | 36 | 37 | class PretrainingDataCreator: 38 | def __init__(self, path, tokenizer: BertTokenizer, max_seq_length, readin: int = 2000000, dupe_factor: int = 5, small_seq_prob: float = 0.1): 39 | self.dupe_factor = dupe_factor 40 | self.max_seq_length = max_seq_length 41 | self.small_seq_prob = small_seq_prob 42 | 43 | documents = [] 44 | instances = [] 45 | with open(path, encoding='utf-8') as fd: 46 | for i, line in enumerate(tqdm(fd)): 47 | line = line.replace('\n', '') 48 | # Expected format (Q,T,U,S,D) 49 | # query, title, url, snippet, document = line.split('\t') 50 | # ! 
remove this following line later 51 | document = line 52 | if len(document.split("")) <= 3: 53 | continue 54 | lines = document.split("") 55 | document = [] 56 | for seq in lines: 57 | document.append(tokenizer.tokenize(seq)) 58 | # document = list(map(tokenizer.tokenize, lines)) 59 | documents.append(document) 60 | 61 | documents = [x for x in documents if x] 62 | 63 | self.documents = documents 64 | for _ in range(self.dupe_factor): 65 | for index in range(len(self.documents)): 66 | instances.extend(self.create_training_instance(index)) 67 | 68 | shuffle(instances) 69 | self.instances = instances 70 | self.len = len(self.instances) 71 | self.documents = None 72 | documents = None 73 | 74 | def __len__(self): 75 | return self.len 76 | 77 | def __getstate__(self): 78 | state = self.__dict__.copy() 79 | return state 80 | 81 | def __setstate__(self, state): 82 | self.__dict__.update(state) 83 | 84 | def save(self, filename): 85 | with open(filename, 'wb') as outfile: 86 | pickle.dump(self, outfile) 87 | 88 | @staticmethod 89 | def load(filename): 90 | print("Loading filename {}".format(filename)) 91 | with open(filename, 'rb') as f: 92 | return pickle.load(f) 93 | 94 | def create_training_instance(self, index): 95 | document = self.documents[index] 96 | 97 | # Need to add [CLS] + 2*[SEP] tokens 98 | max_num_tokens = self.max_seq_length - 3 99 | 100 | # We want to maximize the inp sequence but also want inputs similar 101 | # to our generic task inputs which will be compartively smaller 102 | # than the data on which we intend to pre-train. 103 | target_seq_length = max_num_tokens 104 | if random.random() < self.small_seq_prob: 105 | target_seq_length = random.randint(5, max_num_tokens) 106 | 107 | # Need to make the sequences split for NSP task for interesting 108 | # rather than choosing some arbitrary point. If not the NSP 109 | # task might become way too easy. 110 | instances = [] 111 | current_chunk = [] 112 | current_length = 0 113 | i = 0 114 | while i < len(document): 115 | segment = document[i] 116 | current_chunk.append(segment) 117 | current_length += len(segment) 118 | if i == len(document)-1 or current_length >= target_seq_length: 119 | if current_chunk: 120 | # `a_end` is how many segments from `current_chunk` go into the `A` 121 | # (first) sentence. 122 | a_end = 1 123 | if len(current_chunk) >= 2: 124 | a_end = random.randint(1, len(current_chunk) - 1) 125 | 126 | tokens_a = [] 127 | for j in range(a_end): 128 | tokens_a.extend(current_chunk[j]) 129 | 130 | tokens_b = [] 131 | 132 | # Random Next 133 | is_random_next = False 134 | if len(current_chunk) == 1 or random.random() < 0.5: 135 | is_random_next = True 136 | target_b_length = target_seq_length - len(tokens_a) 137 | 138 | # Pick a random document 139 | for _ in range(10): 140 | random_doc_index = random.randint( 141 | 0, len(self.documents) - 1) 142 | if random_doc_index != index: 143 | break 144 | 145 | random_doc = self.documents[random_doc_index] 146 | random_start = random.randint(0, len(random_doc)-1) 147 | for j in range(random_start, len(random_doc)): 148 | tokens_b.extend(random_doc[j]) 149 | if len(tokens_b) >= target_b_length: 150 | break 151 | 152 | # We didn't actually use these segments so we "put them back" so 153 | # they don't go to waste. 
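                    # (Descriptive note: rewinding `i` by the number of unused trailing
                    # segments re-queues them, so they seed the next current_chunk.)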
154 | num_unused_segments = len(current_chunk) - a_end 155 | i -= num_unused_segments 156 | 157 | # Actual Next 158 | else: 159 | is_random_next = False 160 | for j in range(a_end, len(current_chunk)): 161 | tokens_b.extend(current_chunk[j]) 162 | 163 | truncate_input_sequence(tokens_a, tokens_b, max_num_tokens) 164 | 165 | assert len(tokens_a) >= 1 166 | assert len(tokens_b) >= 1 167 | 168 | instances.append(TokenInstance( 169 | tokens_a, tokens_b, int(is_random_next))) 170 | 171 | current_chunk = [] 172 | current_length = 0 173 | i += 1 174 | 175 | return instances 176 | 177 | 178 | class GenericPretrainingDataCreator(PretrainingDataCreator): 179 | def __init__(self, path, tokenizer: BertTokenizer, max_seq_length: int = 512, readin: int = 2000000, dupe_factor: int = 6, small_seq_prob: float = 0.1): 180 | self.dupe_factor = dupe_factor 181 | self.max_seq_length = max_seq_length 182 | self.small_seq_prob = small_seq_prob 183 | 184 | documents = [] 185 | instances = [] 186 | with open(path, encoding='utf-8') as fd: 187 | document = [] 188 | for i, line in enumerate(tqdm(fd)): 189 | line = line.replace('\n', '') 190 | # document = line 191 | # if len(document.split("")) <= 3: 192 | # continue 193 | if len(line) == 0: # This is end of document 194 | documents.append(document) 195 | document = [] 196 | if len(line.split(' ')) > 2: 197 | document.append(tokenizer.tokenize(line)) 198 | if len(document) > 0: 199 | documents.append(document) 200 | 201 | documents = [x for x in documents if x] 202 | print(documents[0]) 203 | print(len(documents)) 204 | self.documents = documents 205 | for _ in range(self.dupe_factor): 206 | for index in range(len(self.documents)): 207 | instances.extend(self.create_training_instance(index)) 208 | 209 | shuffle(instances) 210 | self.instances = instances 211 | self.len = len(self.instances) 212 | self.documents = None 213 | documents = None 214 | 215 | class WikiPretrainingDataCreator(PretrainingDataCreator): 216 | def __init__(self, path, tokenizer: BertTokenizer, max_seq_length: int = 512, readin: int = 2000000, dupe_factor: int = 6, small_seq_prob: float = 0.1): 217 | self.dupe_factor = dupe_factor 218 | self.max_seq_length = max_seq_length 219 | self.small_seq_prob = small_seq_prob 220 | 221 | documents = [] 222 | instances = [] 223 | with open(path, encoding='utf-8') as fd: 224 | document = [] 225 | for i, line in enumerate(tqdm(fd)): 226 | line = line.replace('\n', '') 227 | # document = line 228 | # if len(document.split("")) <= 3: 229 | # continue 230 | if len(line) > 0 and line[:2] == "[[" : # This is end of document 231 | documents.append(document) 232 | document = [] 233 | if len(line.split(' ')) > 2: 234 | document.append(tokenizer.tokenize(line)) 235 | if len(document) > 0: 236 | documents.append(document) 237 | 238 | documents = [x for x in documents if x] 239 | self.documents = documents 240 | for _ in range(self.dupe_factor): 241 | for index in range(len(self.documents)): 242 | instances.extend(self.create_training_instance(index)) 243 | 244 | shuffle(instances) 245 | self.instances = instances 246 | self.len = len(self.instances) 247 | self.documents = None 248 | documents = None -------------------------------------------------------------------------------- /pretrain/PyTorch/text.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | PAD = 0 4 | 5 | def mask(x): 6 | return x != PAD 7 | 8 | def torch_long(x): 9 | return torch.LongTensor(x) 10 | 
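# Usage sketch for the helpers above (illustrative only): PAD is 0, so a padded id sequence
# maps back to a boolean attention mask via mask(), matching the convention
# PreTrainingDataset follows when it pads input_ids with PAD.
if __name__ == "__main__":
    ids = torch_long([5, 6, 7, PAD, PAD])    # three real token ids followed by padding
    print(mask(ids))                         # tensor([ True,  True,  True, False, False])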
-------------------------------------------------------------------------------- /pretrain/PyTorch/train.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import numpy as np 4 | import random 5 | import os 6 | import sys 7 | import json 8 | import shutil 9 | import torch 10 | import torch.nn as nn 11 | import torch.distributed as dist 12 | from torch.utils.data import DataLoader, Dataset 13 | from torch.utils.data.sampler import RandomSampler 14 | from torch.utils.data.distributed import DistributedSampler 15 | 16 | import argparse 17 | from tqdm import tqdm 18 | from checkpoint import checkpoint_model, load_checkpoint, latest_checkpoint_file 19 | from logger import Logger 20 | from utils import get_sample_writer 21 | from models import BertMultiTask 22 | from dataset import PreTrainingDataset 23 | from dataset import PretrainDataType 24 | from pytorch_pretrained_bert.tokenization import BertTokenizer 25 | from pytorch_pretrained_bert.optimization import BertAdam 26 | from optimization import warmup_linear_decay_exp 27 | from azureml_adapter import set_environment_variables_for_nccl_backend, get_local_rank, get_global_size, get_local_size 28 | from sources import PretrainingDataCreator, TokenInstance, GenericPretrainingDataCreator 29 | from sources import WikiPretrainingDataCreator 30 | from configuration import BertJobConfiguration 31 | 32 | from azureml.core.run import Run 33 | 34 | 35 | def get_effective_batch(total): 36 | if use_multigpu_with_single_device_per_process: 37 | return total//dist.get_world_size()//train_batch_size//gradient_accumulation_steps 38 | else: 39 | return total//train_batch_size//gradient_accumulation_steps # Dividing with gradient_accumulation_steps since we multiplied it earlier 40 | 41 | 42 | def get_dataloader(dataset: Dataset, eval_set=False): 43 | if not use_multigpu_with_single_device_per_process: 44 | train_sampler = RandomSampler(dataset) 45 | else: 46 | train_sampler = DistributedSampler(dataset) 47 | return (x for x in DataLoader(dataset, batch_size=train_batch_size // 2 if eval_set else train_batch_size, 48 | sampler=train_sampler, num_workers=job_config.get_num_workers())) 49 | 50 | 51 | def pretrain_validation(index): 52 | model.eval() 53 | dataset = PreTrainingDataset(tokenizer=tokenizer, 54 | folder=args.validation_path, 55 | logger=logger, max_seq_length=max_seq_length, 56 | index=index, data_type=PretrainDataType.VALIDATION, 57 | max_predictions_per_seq=max_predictions_per_seq, 58 | masked_lm_prob=masked_lm_prob) 59 | data_batches = get_dataloader(dataset, eval_set=True) 60 | eval_loss = 0 61 | nb_eval_steps = 0 62 | 63 | for batch in data_batches: 64 | batch = tuple(t.to(device) for t in batch) 65 | tmp_eval_loss = model.network(batch, log=False) 66 | dist.reduce(tmp_eval_loss, 0) 67 | # Reduce to get the loss from all the GPU's 68 | tmp_eval_loss = tmp_eval_loss / dist.get_world_size() 69 | eval_loss += tmp_eval_loss.mean().item() 70 | nb_eval_steps += 1 71 | eval_loss = eval_loss / nb_eval_steps 72 | logger.info(f"Validation Loss for epoch {index + 1} is: {eval_loss}") 73 | if check_write_log(): 74 | summary_writer.add_scalar(f'Validation/Loss', eval_loss, index + 1) 75 | run.log("validation_loss", np.float(eval_loss)) 76 | run.log_row("validation_loss over epochs", epoch = index, val_loss = np.float(eval_loss)) 77 | return eval_loss 78 | 79 | 80 | def train(index): 81 | model.train() 82 | dataloaders = {} 83 | i = 0 84 | global global_step 85 | datalengths = [] 86 | 
batchs_per_dataset = [] 87 | 88 | # Pretraining datasets 89 | wiki_pretrain_dataset = PreTrainingDataset(tokenizer=tokenizer, 90 | folder=args.train_path, 91 | logger=logger, max_seq_length=max_seq_length, 92 | index=index, data_type=PretrainDataType.WIKIPEDIA, 93 | max_predictions_per_seq=max_predictions_per_seq, 94 | masked_lm_prob=masked_lm_prob) 95 | 96 | datalengths.append(len(wiki_pretrain_dataset)) 97 | dataloaders[i] = get_dataloader(wiki_pretrain_dataset) 98 | 99 | num_batches_in_dataset = get_effective_batch(len(wiki_pretrain_dataset)) 100 | logger.info('Wikpedia data file: Number of samples {}, number of batches required to process these samples: {}'.format(len(wiki_pretrain_dataset), num_batches_in_dataset)) 101 | 102 | batchs_per_dataset.append(num_batches_in_dataset) 103 | i += 1 104 | 105 | logger.info("Training on Wikipedia dataset") 106 | 107 | total_length = sum(datalengths) 108 | 109 | dataset_batches = [] 110 | for i, batch_count in enumerate(batchs_per_dataset): 111 | dataset_batches.extend([i] * batch_count) 112 | logger.info('Number of batches to process *all* data samples in this epoch: {}'.format(len(dataset_batches))) 113 | # shuffle 114 | random.shuffle(dataset_batches) 115 | 116 | # We don't want the dataset to be n the form of alternate chunks if we have more than 117 | # one dataset type, instead we want to organize them into contiguous chunks of each 118 | # data type, hence the multiplication with grad_accumulation_steps with dataset_batch_type 119 | dataset_picker = [] 120 | for dataset_batch_type in dataset_batches: 121 | dataset_picker.extend([dataset_batch_type] * gradient_accumulation_steps ) 122 | 123 | logger.info('Number of steps to process all batches in this epoch: {}'.format(len(dataset_picker))) 124 | model.train() 125 | 126 | # Counter of sequences in an "epoch" 127 | sequences_counter = 0 128 | global_step_loss = 0 129 | 130 | for step, dataset_type in enumerate(dataset_picker): 131 | try: 132 | batch = next(dataloaders[dataset_type]) 133 | 134 | sequences_counter += len(batch) 135 | 136 | if n_gpu == 1: 137 | batch = tuple(t.to(device) for t in batch) # Move to GPU 138 | 139 | if step > 1 and step % 1000 == 0: 140 | logger.info("{} Number of sequences processed so far: {} (cumulative in {} steps)".format(datetime.utcnow(), sequences_counter, step)) 141 | # Calculate forward pass 142 | loss = model.network(batch) 143 | 144 | if n_gpu > 1: 145 | # this is to average loss for multi-gpu. 
In DistributedDataParallel 146 | # setting, we get tuple of losses form all proccesses 147 | loss = loss.mean() 148 | 149 | if gradient_accumulation_steps > 1: 150 | loss = loss / gradient_accumulation_steps 151 | 152 | # Enabling optimized Reduction 153 | # reduction only happens in backward if this method is called before 154 | # when using the distributed module 155 | if accumulate_gradients: 156 | if use_multigpu_with_single_device_per_process and (step + 1) % gradient_accumulation_steps == 0: 157 | model.network.enable_need_reduction() 158 | else: 159 | model.network.disable_need_reduction() 160 | if fp16: 161 | optimizer.backward(loss) 162 | else: 163 | loss.backward() 164 | 165 | global_step_loss += loss 166 | if (step + 1) % gradient_accumulation_steps == 0: 167 | if fp16: 168 | # modify learning rate with special warm up BERT uses 169 | # if fp16 is False, BertAdam is used that handles this automatically 170 | lr_this_step = \ 171 | job_config.get_learning_rate() * warmup_linear_decay_exp(global_step, 172 | job_config.get_decay_rate(), 173 | job_config.get_decay_step(), 174 | job_config.get_total_training_steps(), 175 | job_config.get_warmup_proportion()) 176 | for param_group in optimizer.param_groups: 177 | param_group['lr'] = lr_this_step 178 | 179 | # Record the LR against global_step on tensorboard 180 | if check_write_log(): 181 | summary_writer.add_scalar(f'Train/lr', lr_this_step, global_step) 182 | 183 | optimizer.step() 184 | optimizer.zero_grad() 185 | global_step += 1 186 | if check_write_log() and (global_step%args.log_steps == 0): 187 | run.log("training_loss", np.float(global_step_loss)) 188 | run.log("lr_this_step", np.float(lr_this_step)) 189 | run.log_row("loss over steps", global_step = global_step, loss = np.float(global_step_loss)) 190 | run.log_row("lr over steps", global_step = global_step, lr = np.float(lr_this_step)) 191 | global_step_loss = 0 192 | except StopIteration: 193 | continue 194 | 195 | logger.info("Completed {} steps".format(step)) 196 | logger.info("Completed processing {} sequences".format(sequences_counter)) 197 | 198 | # Run Validation Loss 199 | if max_seq_length == 512: 200 | logger.info(f"TRAIN BATCH SIZE: {train_batch_size}") 201 | return pretrain_validation(index) 202 | else: 203 | return None 204 | 205 | 206 | def str2bool(val): 207 | return val.lower() == "true" or val.lower() == "t" or val.lower() == "1" 208 | 209 | def check_write_log(): 210 | return dist.get_rank() == 0 or not use_multigpu_with_single_device_per_process 211 | 212 | if __name__ == '__main__': 213 | print("The arguments are: " + str(sys.argv)) 214 | 215 | parser = argparse.ArgumentParser() 216 | 217 | # Required_parameters 218 | parser.add_argument("--config_file", "--cf", 219 | help="pointer to the configuration file of the experiment", type=str, required=True) 220 | 221 | parser.add_argument("--config_file_path", default=None, type=str, required=True, 222 | help="The blob storage directory where config file is located.") 223 | 224 | parser.add_argument("--train_path", default=None, type=str, required=True, 225 | help="The blob storage directory for train data, cache and output.") 226 | 227 | parser.add_argument("--validation_path", default=None, type=str, required=True, 228 | help="The blob storage directory for validation data, cache and output.") 229 | 230 | parser.add_argument('--tokenizer_path', type=str, default=False, 231 | help="Path to load the tokenizer from") 232 | parser.add_argument("--output_dir", default=None, type=str, required=True, 233 | help="If 
given, model checkpoints will be saved to this directory.") 234 | 235 | # Optional Params 236 | parser.add_argument("--best_cp_dir", default=None, type=str, 237 | help="If given, model best checkpoint will be saved to this directory.") 238 | parser.add_argument("--latest_cp_dir", default=None, type=str, 239 | help="If given, model latest checkpoint will be saved to this directory.") 240 | parser.add_argument("--max_seq_length", default=512, type=int, 241 | help="The maximum total input sequence length after WordPiece tokenization. Sequences " 242 | "longer than this will be truncated, and sequences shorter than this will be padded.") 243 | parser.add_argument("--max_predictions_per_seq", "--max_pred", default=80, type=int, 244 | help="The maximum number of masked tokens in a sequence to be predicted.") 245 | parser.add_argument("--masked_lm_prob", "--mlm_prob", default=0.15, 246 | type=float, help="The masking probability for languge model.") 247 | parser.add_argument("--train_batch_size", default=32, 248 | type=int, help="Total batch size for training.") 249 | parser.add_argument("--no_cuda", 250 | type=str, 251 | default='False', 252 | help="Whether not to use CUDA when available") 253 | parser.add_argument('--seed', 254 | type=int, 255 | default=42, 256 | help="random seed for initialization") 257 | parser.add_argument('--accumulate_gradients', 258 | type=str, 259 | default='True', 260 | help="Enabling gradient accumulation optimization") 261 | parser.add_argument('--gradient_accumulation_steps', 262 | type=int, 263 | default=1, 264 | help="Number of updates steps to accumulate before performing a backward/update pass.") 265 | parser.add_argument('--fp16', 266 | type=str, 267 | default='False', 268 | help="Whether to use 16-bit float precision instead of 32-bit") 269 | parser.add_argument('--use_pretrain', 270 | type=str, 271 | default='False', 272 | help="Whether to use Bert Pretrain Weights or not") 273 | parser.add_argument('--loss_scale', 274 | type=float, 275 | default=0, 276 | help='Loss scaling, positive power of 2 values can improve fp16 convergence.') 277 | parser.add_argument('--load_training_checkpoint', '--load_cp', 278 | type=str, 279 | default='False', 280 | help="This is the path to the TAR file which contains model+opt state_dict() checkpointed.") 281 | parser.add_argument('--use_multigpu_with_single_device_per_process', 282 | type=str, 283 | default='True', 284 | help="Whether only one device is managed per process") 285 | parser.add_argument('--epochs', 286 | type=int, 287 | default=250, 288 | help="total number of epochs") 289 | parser.add_argument('--log_steps', 290 | type=int, 291 | default=50, 292 | help="logging intervals") 293 | parser.add_argument('--backend', 294 | type=str, 295 | default='nccl', 296 | help="reduce backend to use") 297 | 298 | parser.add_argument('--master_port', 299 | type=int, 300 | default=6105, 301 | help="user specified master port for non-mpi job") 302 | 303 | args = parser.parse_args() 304 | 305 | if args.output_dir: 306 | os.makedirs(args.output_dir, exist_ok=True) 307 | if args.best_cp_dir: 308 | os.makedirs(args.best_cp_dir, exist_ok=True) 309 | if args.latest_cp_dir: 310 | os.makedirs(args.latest_cp_dir, exist_ok=True) 311 | 312 | no_cuda = str2bool(args.no_cuda) 313 | fp16 = str2bool(args.fp16) 314 | accumulate_gradients = str2bool(args.accumulate_gradients) 315 | use_pretrain = str2bool(args.use_pretrain) 316 | use_multigpu_with_single_device_per_process = str2bool(args.use_multigpu_with_single_device_per_process) 317 | 318 | 
config_file = args.config_file 319 | gradient_accumulation_steps = args.gradient_accumulation_steps 320 | train_batch_size = args.train_batch_size 321 | seed = args.seed 322 | loss_scale = args.loss_scale 323 | load_training_checkpoint = args.load_training_checkpoint 324 | max_seq_length = args.max_seq_length 325 | max_predictions_per_seq = args.max_predictions_per_seq 326 | masked_lm_prob = args.masked_lm_prob 327 | master_port = args.master_port 328 | 329 | local_rank = -1 330 | 331 | local_rank = get_local_rank() 332 | global_size = get_global_size() 333 | local_size = get_local_size() 334 | # TODO use logger 335 | print('local_rank = {}'.format(local_rank)) 336 | print('global_size = {}'.format(global_size)) 337 | print('local_size = {}'.format(local_size)) 338 | 339 | set_environment_variables_for_nccl_backend(local_size == global_size, master_port) 340 | 341 | # Prepare Logger 342 | logger = Logger(cuda=torch.cuda.is_available()) 343 | 344 | # # Extact config file from blob storage 345 | job_config = BertJobConfiguration(config_file_path=os.path.join(args.config_file_path, config_file)) 346 | 347 | job_name = job_config.get_name() 348 | # Setting the distributed variables 349 | 350 | run = Run.get_context() 351 | 352 | if not use_multigpu_with_single_device_per_process: 353 | device = torch.device("cuda") 354 | n_gpu = torch.cuda.device_count() 355 | else: 356 | device = torch.device("cuda", local_rank) 357 | n_gpu = 1 358 | # Initializes the distributed backend which will take care of synchronizing nodes/GPUs 359 | torch.distributed.init_process_group(backend=args.backend) 360 | if fp16: 361 | logger.info("16-bits distributed training is not officially supported in the version of PyTorch currently used, but it works. Refer to https://github.com/pytorch/pytorch/pull/13496 for supported version.") 362 | fp16 = True # 363 | logger.info("device: {} n_gpu: {}, use_multigpu_with_single_device_per_process: {}, 16-bits training: {}".format( 364 | device, n_gpu, use_multigpu_with_single_device_per_process, fp16)) 365 | 366 | if gradient_accumulation_steps < 1: 367 | raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( 368 | gradient_accumulation_steps)) 369 | 370 | train_batch_size = int(train_batch_size / gradient_accumulation_steps) 371 | 372 | # Setting all the seeds so that the task is random but same accross processes 373 | random.seed(seed) 374 | np.random.seed(seed) 375 | torch.manual_seed(seed) 376 | logger.info 377 | if n_gpu > 0: 378 | torch.cuda.manual_seed_all(seed) 379 | 380 | # Create an outputs/ folder in the blob storage 381 | if args.output_dir is None: 382 | parent_dir = os.path.join(args.output_dir, 'outputs', str(run.experiment.name)) 383 | output_dir = os.path.join(parent_dir, str(run.id)) 384 | os.makedirs(output_dir, exist_ok=True) 385 | saved_model_path = os.path.join(output_dir, "saved_models", job_name) 386 | os.makedirs(saved_model_path, exist_ok=True) 387 | else: 388 | saved_model_path = args.output_dir 389 | 390 | summary_writer = None 391 | # Prepare Summary Writer and saved_models path 392 | if check_write_log(): 393 | #azureml.tensorboard only streams from /logs directory, therefore hardcoded 394 | summary_writer = get_sample_writer( 395 | name=job_name, base='./logs') 396 | 397 | # Loading Tokenizer (vocabulary from blob storage, if exists) 398 | logger.info("Extracting the vocabulary") 399 | if args.tokenizer_path: 400 | logger.info(f'Loading tokenizer from {args.tokenizer_path}') 401 | tokenizer = 
BertTokenizer.from_pretrained( 402 | args.tokenizer_path, cache_dir=args.output_dir) 403 | else: 404 | tokenizer = BertTokenizer.from_pretrained(job_config.get_token_file_type(), cache_dir=args.output_dir) 405 | logger.info("Vocabulary contains {} tokens".format(len(list(tokenizer.vocab.keys())))) 406 | 407 | 408 | # Loading Model 409 | logger.info("Initializing BertMultiTask model") 410 | model = BertMultiTask(job_config = job_config, use_pretrain = use_pretrain, tokenizer = tokenizer, 411 | cache_dir = args.output_dir, device = device, write_log = check_write_log(), 412 | summary_writer = summary_writer) 413 | 414 | logger.info("Converting the input parameters") 415 | if fp16: 416 | model.half() 417 | 418 | model.to(device) 419 | 420 | if use_multigpu_with_single_device_per_process: 421 | try: 422 | if accumulate_gradients: 423 | logger.info("Enabling gradient accumulation by using a forked version of DistributedDataParallel implementation available in the branch bertonazureml/apex at https://www.github.com/microsoft/apex") 424 | from distributed_apex import DistributedDataParallel as DDP 425 | else: 426 | logger.info("Using Default Apex DistributedDataParallel implementation") 427 | from apex.parallel import DistributedDataParallel as DDP 428 | except ImportError: 429 | raise ImportError("To use distributed and fp16 training, please install apex from the branch bertonazureml/apex at https://www.github.com/microsoft/apex.") 430 | torch.cuda.set_device(local_rank) 431 | model.network = DDP(model.network, delay_allreduce=False) 432 | 433 | elif n_gpu > 1: 434 | model.network = nn.DataParallel(model.network) 435 | 436 | # Prepare Optimizer 437 | logger.info("Preparing the optimizer") 438 | param_optimizer = list(model.network.named_parameters()) 439 | param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] 440 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 441 | optimizer_grouped_parameters = [ 442 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, 443 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 444 | ] 445 | 446 | logger.info("Loading Apex and building the FusedAdam optimizer") 447 | 448 | if fp16: 449 | try: 450 | from apex.optimizers import FP16_Optimizer, FusedAdam 451 | except: 452 | raise ImportError("To use distributed and fp16 training, please install apex from the branch bertonazureml/apex at https://www.github.com/microsoft/apex.") 453 | 454 | optimizer = FusedAdam(optimizer_grouped_parameters, 455 | lr=job_config.get_learning_rate(), 456 | bias_correction=False, 457 | max_grad_norm=1.0) 458 | if loss_scale == 0: 459 | optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) 460 | else: 461 | optimizer = FP16_Optimizer( 462 | optimizer, static_loss_scale=loss_scale) 463 | else: 464 | optimizer = BertAdam(optimizer_grouped_parameters, 465 | lr=job_config.get_learning_rate(), 466 | warmup=job_config.get_warmup_proportion(), 467 | t_total=job_config.get_total_training_steps()) 468 | 469 | global_step = 0 470 | start_epoch = 0 471 | 472 | # if args.load_training_checkpoint is not None: 473 | if load_training_checkpoint != 'False': 474 | logger.info(f"Looking for previous training checkpoint.") 475 | latest_checkpoint_path = latest_checkpoint_file(args.load_training_checkpoint, no_cuda) 476 | 477 | logger.info(f"Restoring previous training checkpoint from {latest_checkpoint_path}") 478 | start_epoch, global_step = load_checkpoint(model, 
optimizer, latest_checkpoint_path) 479 | logger.info(f"The model is loaded from last checkpoint at epoch {start_epoch} when the global steps were at {global_step}") 480 | 481 | 482 | logger.info("Training the model") 483 | 484 | best_loss = None 485 | for index in range(start_epoch, args.epochs): 486 | logger.info(f"Training epoch: {index + 1}") 487 | 488 | eval_loss = train(index) 489 | 490 | if check_write_log(): 491 | if best_loss is None or eval_loss is None or eval_loss < best_loss*0.99: 492 | best_loss = eval_loss 493 | epoch_ckp_path = os.path.join(saved_model_path, "bert_encoder_epoch_{0:04d}.pt".format(index + 1)) 494 | checkpoint_model(os.path.join(saved_model_path, "training_state_checkpoint_{0:04d}.tar".format(index + 1)), model, optimizer, index, global_step) 495 | logger.info(f"Saving checkpoint of the model from epoch {index + 1} at {epoch_ckp_path}") 496 | model.save_bert(epoch_ckp_path) 497 | 498 | #save best checkpoint in separate directory 499 | if args.best_cp_dir: 500 | best_ckp_path = os.path.join(args.best_cp_dir, "bert_encoder_epoch_{0:04d}.pt".format(index + 1)) 501 | shutil.rmtree(args.best_cp_dir) 502 | os.makedirs(args.best_cp_dir,exist_ok=True) 503 | model.save_bert(best_ckp_path) 504 | 505 | if args.latest_cp_dir: 506 | shutil.rmtree(args.latest_cp_dir) 507 | os.makedirs(args.latest_cp_dir,exist_ok=True) 508 | checkpoint_model(os.path.join(args.latest_cp_dir, "training_state_checkpoint_{0:04d}.tar".format(index + 1)), model, optimizer, index, global_step) 509 | latest_ckp_path = os.path.join(args.latest_cp_dir, "bert_encoder_epoch_{0:04d}.pt".format(index + 1)) 510 | model.save_bert(latest_ckp_path) 511 | -------------------------------------------------------------------------------- /pretrain/PyTorch/utils.py: -------------------------------------------------------------------------------- 1 | import sys as _sys 2 | 3 | from typing import List 4 | from collections import _iskeyword # type: ignore 5 | from tensorboardX import SummaryWriter 6 | import os 7 | 8 | SUMMARY_WRITER_DIR_NAME = 'runs' 9 | 10 | 11 | def get_sample_writer(name, base=".."): 12 | """Returns a tensorboard summary writer 13 | """ 14 | return SummaryWriter(log_dir=os.path.join(base, SUMMARY_WRITER_DIR_NAME, name)) 15 | -------------------------------------------------------------------------------- /pretrain/README.md: -------------------------------------------------------------------------------- 1 | # Pretrain BERT Model on Azure Machine Learning service 2 | To pretrain BERT language representation models on AzureML, following artifacts are required: 3 | - [Azure Machine Learning Workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/setup-create-workspace) with an AzureML Compute cluster with 64 V100 GPUs (either 16 x `NC24s_v3` or 8 x `ND40_v2` VMs). Note that by default your subscription might not have enough quota and you are likely to submit a support ticket to get enough quota by following the guide [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas). 4 | - Preprocessed data: [BERT paper](https://arxiv.org/pdf/1810.04805) references `Wikipedia` and `BookCorpus` datasets for pretraining. The notebook in this pretrain recipe is configured to use Wikipedia dataset only, but can be used with other datasets as well, including custom datasets. The preprocessed data should be available in a `Datastore` registered to the AzureML `Workspace` that will be used for BERT pretraining. 
Preprocessed Wikipedia corpus is made available for use with the pretraining recipe in this repo. Refer to the [instructions](../docs/artifacts.md) to access preprocessed Wikipedia corpus for pretraining. You can copy the Wikipedia dataset from this location to another Azure blob container and register it as a workspace before using it in the pretraining job. Alternatively, you can preprocess the data from scratch (refer to [instructions](../docs/dataprep.md) on this), upload that to an Azure blob container and use it as the datastore for the pretraining job. Note that it is also possible to use other datasets with little or no modifications in this pretraining recipe. 5 | - Job configuration to define the parameters for the pretraining job. Refer to [configs](./configs/) directory for different configuration settings (`BERT-base` vs. `BERT-large`, like `single-node configurations for debugging` vs. `multi-node configurations for production-ready pretraining`). 6 | - Code to pretrain BERT model in AzureML. The notebook to submit a pretrain job to AzureML is available at [BERT_Pretrain.ipynb](./PyTorch/notebooks/BERT_Pretrain.ipynb). 7 | 8 | ## Submit Pretrain job 9 | [BERT_Pretrain.ipynb](./PyTorch/notebooks/BERT_Pretrain.ipynb) notebook has the recipe to submit bert-large pretraining job to AzureML service and monitor metrics in Tensorboard. 10 | -------------------------------------------------------------------------------- /pretrain/configs/bert-base-single-node.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bing-bert-base-single-node-4-gpu-4096-bs", 3 | "bert_token_file": "bert-base-uncased", 4 | "bert_model_file": "bert-base-uncased", 5 | "bert_model_config": { 6 | "vocab_size_or_config_json_file": 119547, 7 | "hidden_size": 768, 8 | "num_hidden_layers": 12, 9 | "num_attention_heads": 12, 10 | "intermediate_size": 3072, 11 | "hidden_act": "gelu", 12 | "hidden_dropout_prob": 0.1, 13 | "attention_probs_dropout_prob": 0.1, 14 | "max_position_embeddings": 512, 15 | "type_vocab_size": 2, 16 | "initializer_range": 0.02 17 | }, 18 | "data": { 19 | "datasets": { 20 | "wiki_pretrain_dataset": "placeholder/512/wiki_pretrain" 21 | } 22 | }, 23 | "training": { 24 | "num_epochs": 500, 25 | "warmup_proportion": 0.1, 26 | "learning_rate": 4e-4, 27 | "num_workers": 0, 28 | "decay_rate": 0.99, 29 | "decay_step": 520, 30 | "total_training_steps": 125000 31 | }, 32 | "validation": { 33 | "path": "placeholder/validation_512_only" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /pretrain/configs/bert-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bing-bert-base", 3 | "bert_token_file": "bert-base-uncased", 4 | "bert_model_file": "bert-base-uncased", 5 | "bert_model_config": { 6 | "vocab_size_or_config_json_file": 119547, 7 | "hidden_size": 768, 8 | "num_hidden_layers": 12, 9 | "num_attention_heads": 12, 10 | "intermediate_size": 3072, 11 | "hidden_act": "gelu", 12 | "hidden_dropout_prob": 0.1, 13 | "attention_probs_dropout_prob": 0.1, 14 | "max_position_embeddings": 512, 15 | "type_vocab_size": 2, 16 | "initializer_range": 0.02 17 | }, 18 | "data": { 19 | "datasets": { 20 | "wiki_pretrain_dataset": "placeholder/512/wiki_pretrain" 21 | } 22 | }, 23 | "training": { 24 | "num_epochs": 325, 25 | "warmup_proportion": 0.1, 26 | "learning_rate": 4e-4, 27 | "num_workers": 0, 28 | "decay_rate": 0.99, 29 | "decay_step": 520, 30 | 
"total_training_steps": 125000 31 | }, 32 | "validation": { 33 | "path": "placeholder/validation_512_only" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /pretrain/configs/bert-large-single-node.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bing-bert-large-single-node-4-gpu-4096-bs", 3 | "bert_token_file": "bert-large-uncased", 4 | "bert_model_file": "bert-large-uncased", 5 | "bert_model_config": { 6 | "vocab_size_or_config_json_file": 119547, 7 | "hidden_size": 1024, 8 | "num_hidden_layers": 24, 9 | "num_attention_heads": 16, 10 | "intermediate_size": 4096, 11 | "hidden_act": "gelu", 12 | "hidden_dropout_prob": 0.1, 13 | "attention_probs_dropout_prob": 0.1, 14 | "max_position_embeddings": 512, 15 | "type_vocab_size": 2, 16 | "initializer_range": 0.02 17 | }, 18 | "data": { 19 | "datasets": { 20 | "wiki_pretrain_dataset": "placeholder/512/wiki_pretrain" 21 | } 22 | }, 23 | "training": { 24 | "num_epochs": 500, 25 | "warmup_proportion": 0.02, 26 | "learning_rate": 2e-4, 27 | "num_workers": 0, 28 | "decay_rate": 0.99, 29 | "decay_step": 1000, 30 | "total_training_steps": 187000 31 | }, 32 | "validation": { 33 | "path": "placeholder/validation_512_only" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /pretrain/configs/bert-large.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bing-bert-large", 3 | "bert_token_file": "bert-large-uncased", 4 | "bert_model_file": "bert-large-uncased", 5 | "bert_model_config": { 6 | "vocab_size_or_config_json_file": 119547, 7 | "hidden_size": 1024, 8 | "num_hidden_layers": 24, 9 | "num_attention_heads": 16, 10 | "intermediate_size": 4096, 11 | "hidden_act": "gelu", 12 | "hidden_dropout_prob": 0.1, 13 | "attention_probs_dropout_prob": 0.1, 14 | "max_position_embeddings": 512, 15 | "type_vocab_size": 2, 16 | "initializer_range": 0.02 17 | }, 18 | "data": { 19 | "datasets": { 20 | "wiki_pretrain_dataset": "placeholder/512/wiki_pretrain" 21 | } 22 | }, 23 | "training": { 24 | "num_epochs": 250, 25 | "warmup_proportion": 0.02, 26 | "learning_rate": 2e-4, 27 | "num_workers": 0, 28 | "decay_rate": 0.99, 29 | "decay_step": 1000, 30 | "total_training_steps": 187000 31 | }, 32 | "validation": { 33 | "path": "placeholder/validation_512_only" 34 | } 35 | } 36 | --------------------------------------------------------------------------------