├── .gitignore
├── LICENSE
├── README.md
├── SECURITY.md
├── docs
    ├── artifacts.md
    ├── bert-intro.md
    └── dataprep.md
├── finetune
    ├── PyTorch
    │   ├── azureml_bert_util.py
    │   ├── dockerfile
    │   ├── notebooks
    │   │   ├── BERT_Eval_GLUE.ipynb
    │   │   ├── BERT_Eval_SQUAD.ipynb
    │   │   ├── Pretrained-BERT-GLUE.ipynb
    │   │   └── Pretrained-BERT-NER.ipynb
    │   └── run_classifier_azureml.py
    ├── README.md
    ├── TensorFlow
    │   ├── download_model_and_dataset.py
    │   ├── notebooks
    │   │   └── Tensorflow-BERT-AzureML.ipynb
    │   └── run_classifier.py
    ├── evaluate_squad.py
    ├── run_classifier_azureml.py
    └── run_squad_azureml.py
└── pretrain
    ├── PyTorch
        ├── README.md
        ├── azureml_adapter.py
        ├── benchmark.py
        ├── checkpoint.py
        ├── configuration.py
        ├── dataprep
        │   ├── create_pretraining.py
        │   ├── sentence_segmentation.py
        │   ├── single_line_doc_file_creation.py
        │   └── split_data_into_files.py
        ├── dataset.py
        ├── distributed_apex.py
        ├── logger.py
        ├── models.py
        ├── notebooks
        │   └── BERT_Pretrain.ipynb
        ├── optimization.py
        ├── sources.py
        ├── text.py
        ├── train.py
        └── utils.py
    ├── README.md
    └── configs
        ├── bert-base-single-node.json
        ├── bert-base.json
        ├── bert-large-single-node.json
        └── bert-large.json


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # JetBrains Rider
107 | .idea/
108 | *.sln.iml
109 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |     MIT License
 2 | 
 3 |     Copyright (c) Microsoft Corporation. All rights reserved.
 4 | 
 5 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     of this software and associated documentation files (the "Software"), to deal
 7 |     in the Software without restriction, including without limitation the rights
 8 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     copies of the Software, and to permit persons to whom the Software is
10 |     furnished to do so, subject to the following conditions:
11 | 
12 |     The above copyright notice and this permission notice shall be included in all
13 |     copies or substantial portions of the Software.
14 | 
15 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     SOFTWARE
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # BERT on Azure Machine Learning Service
 2 | This repo contains end-to-end recipes to [pretrain](#pretrain) and [finetune](#finetune) the [BERT](https://arxiv.org/abs/1810.04805) (Bidirectional Encoder Representations from Transformers) language representation model using [Azure Machine Learning service](https://azure.microsoft.com/en-us/services/machine-learning-service/). 
 3 | 
 4 | **Update on 7/7/2020**: 🛑 A more recent implementation for BERT pretraining available at https://github.com/microsoft/onnxruntime-training-examples/tree/master/nvidia-bert is significantly faster than the implementation in this repo. That implementation uses [ONNX Runtime](https://github.com/microsoft/onnxruntime) to accelerate training and it can be used in environments with GPU including Azure Machine Learning service. Details on using ONNX Runtime for training and accelerating training of Transformer models like [BERT](https://arxiv.org/abs/1810.04805) and [GPT-2](https://openai.com/blog/better-language-models/) are available in the blog at [ONNX Runtime Training Technical Deep Dive](https://techcommunity.microsoft.com/t5/azure-ai/onnx-runtime-training-technical-deep-dive/ba-p/1398310).
 5 | 
 6 | ## BERT
 7 | BERT is a language representation model that is distinguished by its capacity to effectively capture deep and subtle textual relationships in a corpus. In the original paper, the authors demonstrate that the BERT model could be easily adapted to build state-of-the-art models for a number of NLP tasks, including text classification, named entity recognition and question answering. In this repo, we provide notebooks that allow a developer to pretrain a BERT model from scratch on a corpus, as well as to fine-tune an existing BERT model to solve a specialized task. A brief [introduction to BERT](docs/bert-intro.md) is available in this repo for a quick start on BERT. 
 8 | 
 9 | ### Pretrain
10 | ###### Challenges in BERT Pretraining
11 | Pretraining a BERT language representation model to the desired level of accuracy is quite challenging; as a result, most developers start from a BERT model that was pre-trained on a standard corpus (such as Wikipedia), instead of training it from scratch. This strategy works well if the final model is being trained on a corpus that is similar to the corpus used in the pre-train step; however, if the problem involves a specialized corpus that's quite different from the standard corpus, the results won't be optimal. Additionally, to advance language representation beyond BERT’s accuracy, users will need to change the model architecture, training data, cost function, tasks, and optimization routines. All these changes need to be explored at large parameter and training data sizes. In the case of BERT-large, this could be quite substantial as it has 340 million parameters and trained over a very large document corpus. To support this with GPUs, machine learning engineers will need distributed training support to train these large models. However, due to the complexity and fragility of configuring these distributed environments, even expert tweaking can end up with inferior results from the trained models.
12 | 
13 | To address these issues, this repo is publishing a workflow for pretraining BERT-large models. Developers can now build their own language representation models like BERT using their domain-specific data on GPUs, either with their own hardware or using Azure Machine Learning service. The pretrain recipe in this repo includes the dataset and preprocessing scripts so anyone can experiment with building their own general purpose language representation models beyond BERT. Overall this is a stable, predictable recipe that converges to a good optimum for researchers to try explorations on their own.
14 | 
15 | ###### Implementation 
16 | The pretraining recipe in this repo is based on the [PyTorch Pretrained BERT v0.6.2](https://github.com/huggingface/pytorch-transformers/tree/v0.6.2) package from [Hugging Face](https://huggingface.co/). The implementation in this pretraining recipe includes optimization techniques such as `gradient accumulation` (gradients are accumulated for smaller mini-batches before updating model weights) and [`mixed precision training`](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html). The notebook and python modules for pretraining are available at [pretrain](./pretrain/) directory.
17 | 
18 | ###### Data Preprocessing
19 | Data preparation is one of the important steps in any Machine Learning project. For BERT pretraining, document-level corpus is needed. The quality of the data used for pretraining directly impacts the quality of the trained models. To make the data preprocessing easier and for repeatability of results, data preprocessing code is included in the repo. It may be used to pre-process Wikipedia corpus or other datasets for pretraining. Refer to additional information at [data preparation for pretraining](docs/dataprep.md) for details on that.
20 | 
21 | ### Finetune
22 | The finetuning recipe in this repo shows how to finetune the BERT language representation model using Azure Machine Learning service. The notebooks and python modules for finetuning are available at [finetune](./finetune/) directory. We finetune and evaluate our pretrained checkpoints against the following:
23 | 
24 | ###### GLUE benchmark
25 | The [General Language Understanding Evaluation (GLUE) benchmark](https://gluebenchmark.com/) is a collection of nine sentence- or sentence-pair language understanding tasks for evaluating and analyzing natural language understanding systems. The [BERT_Eval_GLUE.ipynb](./finetune/PyTorch/notebooks/BERT_Eval_GLUE.ipynb) jupyter notebook allows the user to run one of the pretrained checkpoints against these tasks on Azure ML.
26 | 
27 | ## Azure Machine Learning service
28 | [Azure Machine Learning service](https://azure.microsoft.com/en-us/services/machine-learning-service/) provides a cloud-based environment to prep data, train, test, deploy, manage, and track machine learning models. This service fully supports open-source technologies such as PyTorch, TensorFlow, and scikit-learn and can be used for any kind of machine learning, from classical ML to deep learning, supervised and unsupervised learning.
29 | 
30 | #### Notebooks
31 | Jupyter notebooks can be used to use AzureML Python SDK and submit pretrain and finetune jobs. This repo contains the following notebooks for different activities.
32 | 
33 | ###### PyTorch Notebooks
34 | |Activity |Notebook |
35 | |:---|:------|
36 | |Pretrain | [BERT_Pretrain.ipynb](./pretrain/PyTorch/notebooks/BERT_Pretrain.ipynb) |
37 | | [GLUE](https://www.nyu.edu/projects/bowman/glue.pdf) finetune/evaluate | [BERT_Eval_GLUE.ipynb](./finetune/PyTorch/notebooks/BERT_Eval_GLUE.ipynb) |
38 | 
39 | ###### TensorFlow Notebooks
40 | |Activity |Notebook |
41 | |:---|:------|
42 | | [GLUE](https://www.nyu.edu/projects/bowman/glue.pdf) finetune/evaluate | [Tensorflow-BERT-AzureML.ipynb](finetune/TensorFlow/notebooks/Tensorflow-BERT-AzureML.ipynb) |
43 | 
44 | 
45 | ## Code of Conduct
46 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
47 | 
48 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | <!-- BEGIN MICROSOFT SECURITY.MD V0.0.8 BLOCK -->
 2 | 
 3 | ## Security
 4 | 
 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
 6 | 
 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
 8 | 
 9 | ## Reporting Security Issues
10 | 
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 | 
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 | 
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 | 
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 
18 | 
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 | 
21 |   * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 |   * Full paths of source file(s) related to the manifestation of the issue
23 |   * The location of the affected source code (tag/branch/commit or direct URL)
24 |   * Any special configuration required to reproduce the issue
25 |   * Step-by-step instructions to reproduce the issue
26 |   * Proof-of-concept or exploit code (if possible)
27 |   * Impact of the issue, including how an attacker might exploit the issue
28 | 
29 | This information will help us triage your report more quickly.
30 | 
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 | 
33 | ## Preferred Languages
34 | 
35 | We prefer all communications to be in English.
36 | 
37 | ## Policy
38 | 
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 | 
41 | <!-- END MICROSOFT SECURITY.MD BLOCK -->
42 | 


--------------------------------------------------------------------------------
/docs/artifacts.md:
--------------------------------------------------------------------------------
 1 | # Artifacts for pretrain and finetune
 2 | 
 3 | The following artifacts are made available to make pretraining and finetuning of BERT models easier:
 4 | * Preprocessed data
 5 | * Pretrained BERT-base and BERT-large model checkpoints
 6 | 
 7 | ## Preprocessed Data
 8 | The Wikipedia corpus used for BERT pretraining is preprocessed following the [data prep instructions](dataprep.md) and uploaded to https://bertonazuremlwestus2.blob.core.windows.net/public2/bert_data.tar.gz (66 GB). The data files have the sequence length of 512. The directory structure is as follows and this directory hierarchy is assumed in the implementation in [train.py](../pretrain/pytorch/train.py).
 9 | ```
10 | bert_data
11 | │   bert-base.json
12 | │   bert-large.json
13 | │   bert-base-single-node.json
14 | │   bert-large-single-node.json
15 | │
16 | └───512
17 | │   │
18 | │   └───wiki_pretrain
19 | │       │   wikipedia_segmented_part_0.bin
20 | │       │   wikipedia_segmented_part_1.bin
21 | │       │   ...
22 | │       │   wikipedia_segmented_part_98.bin
23 | ```
24 | 
25 | Individual data files from wiki_pretrain directory are available at the following urls:
26 | * [wikipedia_segmented_part_0.bin](https://bertonazuremlwestus2.blob.core.windows.net/public2/data/preprocessed/512/wiki_pretrain/wikipedia_segmented_part_0.bin)
27 | * [wikipedia_segmented_part_1.bin](https://bertonazuremlwestus2.blob.core.windows.net/public2/data/preprocessed/512/wiki_pretrain/wikipedia_segmented_part_1.bin)
28 | * [wikipedia_segmented_part_2.bin](https://bertonazuremlwestus2.blob.core.windows.net/public2/data/preprocessed/512/wiki_pretrain/wikipedia_segmented_part_2.bin)
29 | * ...
30 | * [wikipedia_segmented_part_98.bin](https://bertonazuremlwestus2.blob.core.windows.net/public2/data/preprocessed/512/wiki_pretrain/wikipedia_segmented_part_98.bin)
31 | 
32 | Use below script to transfer data to your private blob `azcopy copy "https://bertonazuremlwestus2.blob.core.windows.net/public2"  "https://<destination-storage-account-name>.blob.core.windows.net/<container-name>?<SAS token>" --recursive`. See more about [Azure Blob Shared Access Signature](https://docs.microsoft.com/en-us/azure/storage/common/storage-dotnet-shared-access-signature-part-1) and [azcopy](https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-blobs).
33 | 
34 | ## Pretrained BERT Model Checkpoints
35 | The models pretrained in AzureML based on the original BERT implementation are available at the following locations:
36 | * [BERT-Large, Uncased (original)](https://bertonazuremlwestus2.blob.core.windows.net/public/models/bert_large_uncased_original/bert_encoder_epoch_200.pt)
37 | * [BERT-Base, Uncased (original)](https://bertonazuremlwestus2.blob.core.windows.net/public/models/bert_base_uncased_original/bert_encoder_epoch_0300.pt)
38 | 


--------------------------------------------------------------------------------
/docs/bert-intro.md:
--------------------------------------------------------------------------------
 1 | ## **Natural Language Processing**
 2 | 
 3 | In the natural language processing (NLP) domain, pre-trained language representations have traditionally been a key topic for a few important use cases, such as [named entity recognition](https://arxiv.org/pdf/cs/0306050.pdf) (Sang and Meulder, 2003), [question answering](https://arxiv.org/pdf/1606.05250.pdf) (Rajpurkar et al., 2016), and [syntactic parsing](https://nlp.stanford.edu/~mcclosky/papers/dmcc-naacl-2010.pdf) (McClosky et al., 2010).
 4 | 
 5 | The intuition for utilizing a pre-trained model is simple: A deep neural network that is trained on large corpus, say the entire Wikipedia dataset, should have enough knowledge about the underlying relationships between different words and sentences. One should then be able to adapt this DNN to be used on a different corpus, such as medical documents or financial documents, resulting in a model with better performance than one could obtain by training purely on the specialized corpus.
 6 | 
 7 | Recently, a paper called &quot;[BERT: Bidirectional Encoder Representations from Transformers](https://arxiv.org/abs/1810.04805)&quot; was published by Devlin et al., which achieves new state-of-the-art results on 11 NLP tasks, using the pre-trained approach mentioned above. In this repo, we want to show how customers can efficiently and easily pretrain and then fine-tune BERT for their custom applications using Azure Machine Learning Services. We open sourced the code on [GitHub](https://github.com/Microsoft/AzureML-BERT).
 8 | 
 9 | ## **Intuition behind BERT**
10 | 
11 | The intuition behind the new language model, BERT, is simple yet powerful. Researchers believe that a large enough deep neural network model, with large enough training corpus, should capture the contextual relations in the corpus. In NLP domain, it is hard to get a large annotated corpus, so researchers used a novel technique to get a lot of training data. Instead of having human beings label the corpus and feed it into neural networks, researchers use the large Internet available corpus such as English Wikipedia with 2,500M words. Two approaches, each for different language tasks, are used to generate the labels for the language model.
12 | 
13 | - **Masked language model:** To understand the relationship between words. The key idea is to mask some of the words in the sentence (around 15 percent) and use those masked words as labels to force the models to learn the relationship between words. For example, the original sentence would be:
14 | 
15 | ```
16 | The man went to the store. He bought a gallon of milk.
17 | ```
18 | 
19 | And the input/label pair to the language model is:
20 | 
21 | ```
22 | Input: The man went to the [MASK1]. He bought a [MASK2] of milk.
23 | 
24 | Labels: [MASK1] = store; [MASK2] = gallon
25 | ```
26 | 
27 | - **Sentence prediction task:**  To understand the relationship between sentences. This task helps the model predict whether sentence B is likely to be the next sentence following a given sentence A. Using the same example above, we can generate training data like:
28 | 
29 | ```
30 | Sentence A: The man went to the store.
31 | 
32 | Sentence B: He bought a gallon of milk.
33 | 
34 | Label: IsNextSentence
35 | ```
36 | 
37 | ## **Applying BERT to customized dataset**
38 | 
39 | After BERT is trained on a large corpus (say all the available English Wikipedia) using the above steps, the assumption is that because the dataset is huge, the model can inherit a lot of knowledge about the English language. The next step is to fine-tune the model on different tasks, hoping the model can adapt to a new domain more quickly. The key idea is to use the large BERT model trained above and add different input/output layers for different types of tasks. For example, you might want to do sentiment analysis for a customer support department. This is a classification problem, so you might need to add an output classification layer (as shown on the left in the figure below) and structure your input. For a different task, say question answering, you might need to use a different input/output layer, where the input is the question and the corresponding paragraph, while the output is the start/end answer span for the question (see the figure on the right). In each case, the way BERT is designed, it can enable data scientists to plug in different layers easily so it can be adapted for different tasks.
40 | 
41 | ![Adapting BERT for different tasks](https://azurecomcdn.azureedge.net/mediahandler/acomblog/media/Default/blog/39717ecf-8274-46c4-862d-21ca377b1957.png)
42 | 
43 | _Figure 1. Adapting BERT for different tasks (_[_Source_](https://arxiv.org/pdf/1810.04805.pdf)_)_
44 | 
45 | The image below shows the results on one of the most popular datasets in NLP field, the [Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer/).
46 | 
47 | ![Reported BERT performance on SQuAD 1.1 dataset](https://azurecomcdn.azureedge.net/mediahandler/acomblog/media/Default/blog/c37ee936-a5d2-4878-b8e2-ffc02a2797f2.png)
48 | 
49 | _Figure 2. Reported BERT performance on SQuAD 1.1 dataset (_[_Source_](https://arxiv.org/pdf/1810.04805.pdf)_)._
50 | 
51 | In the GitHub repository, we demonstrated the GLUE [General Language Understanding Evaluation (GLUE)](https://gluebenchmark.com/) (Wang et al., 2018) task.
52 | 


--------------------------------------------------------------------------------
/docs/dataprep.md:
--------------------------------------------------------------------------------
 1 | # Data Preparation for BERT Pretraining
 2 | The following steps are to prepare Wikipedia corpus for pretraining. However, these steps can be used with little or no modification to preprocess other datasets as well:
 3 | 
 4 | 1. Download wiki dump file from https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2.  
 5 |    This is a zip file and needs to be unzipped.
 6 | 2. Clone [Wikiextractor](https://github.com/attardi/wikiextractor), and run it:
 7 |    ```
 8 |    git clone https://github.com/attardi/wikiextractor
 9 |    python3 wikiextractor/WikiExtractor.py -o out -b 1000M enwiki-latest-pages-articles.xml
10 |    ```
11 |    Running time can be 5-10 minutes/GB.  
12 |    _output:_ `out` directory
13 | 3. Run:
14 |    ```
15 |    ln -s out out2
16 |    python3 AzureML-BERT/pretrain/PyTorch/dataprep/single_line_doc_file_creation.py
17 |    ```
18 |    This script removes html tags and empty lines and outputs to one file where each line is a paragraph.  
19 |    (`pip install tqdm` if needed.)  
20 |     _output:_ `wikipedia.txt`
21 | 4. Run:
22 |    ```
23 |    python3 AzureML-BERT/pretrain/PyTorch/dataprep/sentence_segmentation.py wikipedia.txt wikipedia.segmented.nltk.txt
24 |    ```
25 |    This script converts `wikipedia.txt` to one file where each line is a sentence.  
26 |    (`pip install nltk` if needed.)  
27 |     _output:_ `wikipedia.segmented.nltk.txt`
28 | 5. Split the above output file into ~100 files by line with:
29 |    ```
30 |    mkdir data_shards
31 |    python3 AzureML-BERT/pretrain/PyTorch/dataprep/split_data_into_files.py
32 |    ```
33 |    _output:_ `data_shards` directory
34 | 6. Run:
35 |    ```
36 |    python3 AzureML-BERT/pretrain/PyTorch/dataprep/create_pretraining.py --input_dir=data_shards --output_dir=pickled_pretrain_data --do_lower_case=true
37 |    ```
38 |    This script will convert each file into pickled `.bin` file.  
39 |    _output:_ `pickled_pretrain_data` directory
40 | 
41 | 


--------------------------------------------------------------------------------
/finetune/PyTorch/azureml_bert_util.py:
--------------------------------------------------------------------------------
  1 | from horovod.torch.mpi_ops import allreduce, allreduce_async_, synchronize
  2 | from horovod.torch.compression import Compression
  3 | import horovod.torch as hvd
  4 | import torch
  5 | import time
  6 | 
  7 | from collections import OrderedDict
  8 | try: 
  9 |     from apex_C import flatten
 10 |     from apex_C import unflatten
 11 | except ImportError:
 12 |     try:
 13 |         _ = warned_flatten
 14 |     except NameError:
 15 |         print("Warning:  apex was installed without --cpp_ext.  Falling back to Python flatten and unflatten.")
 16 |         warned_flatten = True
 17 |     from torch._utils import _flatten_dense_tensors as flatten
 18 |     from torch._utils import _unflatten_dense_tensors as unflatten
 19 | 
 20 | 
 21 | def warmup_linear(x, warmup=0.002):
 22 |     if x < warmup:
 23 |         return x/warmup
 24 |     return 1.0 - x
 25 | 
 26 | 
 27 | def adjust_gradient_accumulation_steps(x, initial_steps, target_steps, warmup):
 28 |     return min(max(int(x/warmup*target_steps), initial_steps), target_steps)
 29 | 
 30 | 
 31 | class DistributedCommunicator:
 32 |     def __init__(self, accumulation_step=1):
 33 |         hvd.init()
 34 |         self.local_rank = hvd.local_rank()
 35 |         self.world_size = hvd.size()
 36 |         self.rank = hvd.rank()
 37 |         self.n_gpu = torch.cuda.device_count()
 38 |         self.node_count = self.world_size // self.n_gpu
 39 |         self.accumulation_step = accumulation_step
 40 |         self.count_down = accumulation_step - 1
 41 |         self._multi_node = self.node_count > 1 
 42 |         if not self._multi_node:
 43 |             # use PyTorch build-in NCCL backend for single node training
 44 |             torch.distributed.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:6000',
 45 |                                 world_size=self.n_gpu,  rank=self.local_rank)
 46 | 
 47 | 
 48 |     def register_model(self, model, fp16):
 49 |         #  broadcast model parameters
 50 |         if self.node_count > 1:
 51 |             hvd.broadcast_parameters(model.state_dict(), root_rank=0)
 52 |         else:
 53 |             for param in model.parameters():
 54 |                 torch.distributed.broadcast_multigpu([param], 0)
 55 | 
 56 |         # register hook for reduce when backpropagate
 57 |         self._parameter_names = {v: k for k, v in sorted(model.named_parameters())}
 58 |         self._handles = {}
 59 |         self._requires_update = set()
 60 |         self._grad_accs = []
 61 |         self._grad = []
 62 |         self._compression = hvd.Compression.fp16 if fp16 else hvd.Compression.none
 63 |         for p in model.parameters():
 64 |             if p.requires_grad:
 65 |                 p.grad = p.data.new(p.size()).zero_()
 66 |                 self._requires_update.add(p)
 67 |                 p_tmp = p.expand_as(p)
 68 |                 grad_acc = p_tmp.grad_fn.next_functions[0][0]
 69 |                 grad_acc.register_hook(self._make_hook(p))
 70 |                 self._grad_accs.append(grad_acc)
 71 | 
 72 | 
 73 |     def _allreduce_tensor(self, p):
 74 |         assert p not in self._handles
 75 |         assert not p.grad.requires_grad
 76 |         tensor = p.grad
 77 |         name = self._parameter_names.get(p)
 78 |         if self._multi_node: 
 79 |             tensor_compressed, ctx = self._compression.compress(tensor)
 80 |             handle = allreduce_async_(tensor_compressed, average=True, name=name)
 81 |             self._handles[p] = (handle, ctx)
 82 |         else:
 83 |             self._handles[p] = tensor
 84 | 
 85 | 
 86 |     def _make_hook(self, p):
 87 |         def hook(*ignore):
 88 |             if self.count_down == 0:
 89 |                 self._allreduce_tensor(p)
 90 |         return hook
 91 | 
 92 | 
 93 |     def synchronize(self):
 94 |         synced = False
 95 |         if self.count_down == 0:
 96 |             missing_p = self._requires_update - set(self._handles.keys())
 97 |             for p in missing_p:
 98 |                 self._allreduce_tensor(p)
 99 | 
100 |             if self._multi_node:
101 |                 for p, value in self._handles.items():
102 |                     handle, ctx = value
103 |                     output = synchronize(handle)
104 |                     p.grad.set_(self._compression.decompress(output, ctx) / self.accumulation_step)
105 |             else:
106 |                 buckets = OrderedDict()
107 |                 for tensor in self._handles.values():
108 |                     tp = tensor.type()
109 |                     if tp not in buckets:
110 |                         buckets[tp] = []
111 |                     buckets[tp].append(tensor)
112 |                 for tp in buckets:
113 |                     bucket = buckets[tp]
114 |                     coalesced = flatten(bucket) / self.world_size / self.accumulation_step
115 |                     torch.distributed.all_reduce_multigpu([coalesced])
116 |                     for buf, synced in zip(bucket, unflatten(coalesced, bucket)):
117 |                         buf.copy_(synced)
118 |             self._handles.clear()
119 |             synced = True
120 |             self.count_down = self.accumulation_step
121 | 
122 |         self.count_down -= 1
123 |         return synced
124 | 
125 |     def set_accumulation_step(self, accumulation_step):
126 |         self.accumulation_step = accumulation_step
127 |         self.count_down = self.accumulation_step - 1


--------------------------------------------------------------------------------
/finetune/PyTorch/dockerfile:
--------------------------------------------------------------------------------
 1 | FROM mcr.microsoft.com/azureml/base-gpu:0.2.1
 2 | 
 3 | RUN apt update && apt install git -y && rm -rf /var/lib/apt/lists/*
 4 | 
 5 | RUN pip install numpy torch boto3 tqdm
 6 | 
 7 | RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
 8 | 
 9 | RUN pip install horovod
10 | 
11 | RUN pip install azureml-sdk


--------------------------------------------------------------------------------
/finetune/PyTorch/notebooks/BERT_Eval_GLUE.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Copyright (c) Microsoft Corporation. All rights reserved.\n",
  8 |     "\n",
  9 |     "Licensed under the MIT License."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "# PyTorch Pretrained BERT on AzureML with GLUE Dataset\n",
 17 |     "\n",
 18 |     "In this notebook, you will find the following contents:\n",
 19 |     "- Download GLUE dataset on the remote compute and store them in Azure storage\n",
 20 |     "- Speed-up fine-tuning BERT for GLUE dataset on AzureML GPU clusters"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "markdown",
 25 |    "metadata": {},
 26 |    "source": [
 27 |     "## Prerequisites\n",
 28 |     "Follow instructions in BERT_pretraining.ipynb notebook for setting up AzureML"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": null,
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "# Check core SDK version number\n",
 38 |     "import azureml.core\n",
 39 |     "\n",
 40 |     "print(\"SDK version:\", azureml.core.VERSION)"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "markdown",
 45 |    "metadata": {},
 46 |    "source": [
 47 |     "## Initialize workspace\n",
 48 |     "\n",
 49 |     "To create or access an Azure ML Workspace, you will need to import the AML library and the following information:\n",
 50 |     "* A name for your workspace\n",
 51 |     "* Your subscription id\n",
 52 |     "* The resource group name\n",
 53 |     "\n",
 54 |     "Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step or create a new one. "
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": null,
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "from azureml.core.workspace import Workspace\n",
 64 |     "ws = Workspace.setup()\n",
 65 |     "ws_details = ws.get_details()\n",
 66 |     "print('Name:\\t\\t{}\\nLocation:\\t{}'\n",
 67 |     "      .format(ws_details['name'],\n",
 68 |     "              ws_details['location']))\n"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "markdown",
 73 |    "metadata": {},
 74 |    "source": [
 75 |     "### Create an experiment\n",
 76 |     "Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed PyTorch tutorial. "
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "## Download GLUE dataset on the remote compute\n",
 84 |     "\n",
 85 |     "Before we start to fine-tune the pretained BERT model, we need to download the [GLUE data](https://gluebenchmark.com/tasks) by running the [script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) and unpack it to an Azure Blob container."
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "### Define AzureML datastore to collect training dataset\n",
 93 |     "\n",
 94 |     "To make data accessible for remote training, AML provides a convenient way to do so via a [Datastore](https://docs.microsoft.com/azure/machine-learning/service/how-to-access-data). The datastore provides a mechanism for you to upload/download data to Azure Storage, and interact with it from your remote compute targets.\n",
 95 |     "\n",
 96 |     "Each workspace is associated with a default Azure Blob datastore named `'workspaceblobstore'`. In this work, we use this default datastore to collect the GLUE training dataset ."
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": null,
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "from azureml.core import Datastore\n",
106 |     "ds = ws.get_default_datastore()"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "markdown",
111 |    "metadata": {},
112 |    "source": [
113 |     "### Create a project directory\n",
114 |     "Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on."
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": null,
120 |    "metadata": {},
121 |    "outputs": [],
122 |    "source": [
123 |     "import os\n",
124 |     "import os.path as path\n",
125 |     "project_root = path.abspath(path.join(os.getcwd(),\"../../../\"))"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "markdown",
130 |    "metadata": {},
131 |    "source": [
132 |     "Download GLUE dataset in BingBert/ directory"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": null,
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "ds.upload(src_dir=os.path.join(project_root,'data','glue_data'), target_path='glue_data')"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "metadata": {},
147 |    "source": [
148 |     "Create a folder named \"bert-large-checkpoints\" which contains the .pt bert checkpoint file against which you want to run your eval tasks. The following code will upload the folder to the datastore. The URL for the checkpoint is: https://bertonazuremlwestus2.blob.core.windows.net/public/models/bert_large_uncased_original/bert_encoder_epoch_200.pt"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": null,
154 |    "metadata": {},
155 |    "outputs": [],
156 |    "source": [
157 |     "ds.upload(src_dir=os.path.join(project_root,'data','bert-large-checkpoints') , target_path='bert-large-checkpoints')"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "markdown",
162 |    "metadata": {},
163 |    "source": [
164 |     "Uploading bert-large config file to datastore"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": null,
170 |    "metadata": {
171 |     "scrolled": true
172 |    },
173 |    "outputs": [],
174 |    "source": [
175 |     "ds.upload(src_dir=os.path.join(project_root,'pretrain','configs'), target_path='config')"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "markdown",
180 |    "metadata": {},
181 |    "source": [
182 |     "**Remove /data folder to avoid uploading folder greater than 300MB.**"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "markdown",
187 |    "metadata": {},
188 |    "source": [
189 |     "## Fine-tuning BERT with Distributed Training\n",
190 |     "As our `GLUE` dataset are ready in Azure storage, we can start the fine-tune the model by exploting the power of distributed training. "
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "markdown",
195 |    "metadata": {},
196 |    "source": [
197 |     "### Create a GPU remote compute target\n",
198 |     "\n",
199 |     "We need to create a GPU [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) to perform the fine-tuning. In this example, we create an AmlCompute cluster as our training compute resource.\n",
200 |     "\n",
201 |     "This code creates a cluster for you if it does not already exist in your workspace."
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": null,
207 |    "metadata": {},
208 |    "outputs": [],
209 |    "source": [
210 |     "from azureml.core.compute import ComputeTarget, AmlCompute\n",
211 |     "from azureml.core.compute_target import ComputeTargetException\n",
212 |     "\n",
213 |     "# choose a name for your cluster\n",
214 |     "gpu_cluster_name = \"bertcodetesting\"\n",
215 |     "\n",
216 |     "try:\n",
217 |     "    gpu_compute_target = ComputeTarget(workspace=ws, name=gpu_cluster_name)\n",
218 |     "    print('Found existing compute target.')\n",
219 |     "except ComputeTargetException:\n",
220 |     "    print('Creating a new compute target...')\n",
221 |     "    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC24', max_nodes=4)\n",
222 |     "\n",
223 |     "    # create the cluster\n",
224 |     "    gpu_compute_target = ComputeTarget.create(ws, gpu_cluster_name, compute_config)\n",
225 |     "    gpu_compute_target.wait_for_completion(show_output=True)\n",
226 |     "\n",
227 |     "# Use the 'status' property to get a detailed status for the current cluster. \n",
228 |     "print(gpu_compute_target.status.serialize())"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "markdown",
233 |    "metadata": {},
234 |    "source": [
235 |     "### Create a PyTorch estimator for fine-tuning\n",
236 |     "Let us create a new PyTorch estimator to run the fine-tuning script `run_classifier.py`, that is already provided at [the original repository](https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_classifier.py). Please refer [here](https://github.com/huggingface/pytorch-pretrained-BERT#fine-tuning-with-bert-running-the-examples) for more detail about the script. \n",
237 |     "\n",
238 |     "The original `run_classifier.py` script uses PyTorch distributed launch untility to launch multiple processes across nodes and GPUs. We prepared a modified version [run_classifier_azureml.py](./run_classifier_azureml.py) so that we can launch it based on AzureML build-in MPI backend.\n",
239 |     "\n",
240 |     "To use AML's tracking and metrics capabilities, we need to add a small amount of AzureML code inside the training script.\n",
241 |     "\n",
242 |     "In `run_classifier_azureml.py`, we will log some metrics to our AML run. To do so, we will access the AML run object within the script:\n",
243 |     "```Python\n",
244 |     "from azureml.core.run import Run\n",
245 |     "run = Run.get_context()\n",
246 |     "```\n",
247 |     "Further within `run_classifier_azureml.py`, we log learning rate, training loss and evaluation accuracy the model achieves as:\n",
248 |     "```Python\n",
249 |     "run.log('lr', np.float(args.learning_rate))\n",
250 |     "...\n",
251 |     "\n",
252 |     "for step, batch in enumerate(tqdm(train_dataloader, desc=\"Iteration\")): \n",
253 |     "    ...\n",
254 |     "    run.log('train_loss', np.float(loss))\n",
255 |     "\n",
256 |     "...\n",
257 |     "\n",
258 |     "result = {'eval_loss': eval_loss,\n",
259 |     "          'eval_accuracy': eval_accuracy}\n",
260 |     "for key in sorted(result.keys()):\n",
261 |     "    run.log(key, str(result[key]))\n",
262 |     "```"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "markdown",
267 |    "metadata": {},
268 |    "source": [
269 |     "The following code runs GLUE RTE task against a bert-large checkpoint with the parameters used by Huggingface for finetuning.\n",
270 |     "- num_train_epochs = 3\n",
271 |     "- max_seq_length = 128\n",
272 |     "- train_batch_size = 8\n",
273 |     "- learning_rate = 2e-5\n",
274 |     "- grad_accumulation_step = 2"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": null,
280 |    "metadata": {},
281 |    "outputs": [],
282 |    "source": [
283 |     "from azureml.train.dnn import PyTorch\n",
284 |     "from azureml.core.runconfig import RunConfiguration\n",
285 |     "from azureml.core.container_registry import ContainerRegistry\n",
286 |     "\n",
287 |     "run_user_managed = RunConfiguration()\n",
288 |     "run_user_managed.environment.python.user_managed_dependencies = True\n",
289 |     "\n",
290 |     "# Using a pre-defined public docker image published on AzureML\n",
291 |     "image_name = 'mcr.microsoft.com/azureml/bert:pretrain-openmpi3.1.2-cuda10.0-cudnn7-ubuntu16.04'\n",
292 |     "\n",
293 |     "estimator = PyTorch(source_directory='../../../',\n",
294 |     "                    compute_target=gpu_compute_target,\n",
295 |     "                     #Docker image\n",
296 |     "                    use_docker=True,\n",
297 |     "                    custom_docker_image=image_name,\n",
298 |     "                    user_managed=True,\n",
299 |     "                    \n",
300 |     "                    script_params = {\n",
301 |     "                          '--bert_model':'bert-large-uncased',\n",
302 |     "                          \"--model_file_location\": ds.path('bert-large-checkpoints/').as_mount(),\n",
303 |     "                          '--task_name': 'RTE',\n",
304 |     "                          '--data_dir': ds.path('glue_data/RTE/').as_mount(),\n",
305 |     "                          '--do_train' : '',\n",
306 |     "                          '--do_eval': '',                      \n",
307 |     "                          '--do_lower_case': '',\n",
308 |     "                          '--max_seq_length': 128,\n",
309 |     "                          '--train_batch_size': 8,\n",
310 |     "                          '--gradient_accumulation_steps': 2,\n",
311 |     "                          '--learning_rate': 2e-5,\n",
312 |     "                          '--num_train_epochs': 3.0,\n",
313 |     "                          '--output_dir': ds.path('output/').as_mount(),\n",
314 |     "                          '--model_file': 'bert_encoder_epoch_245.pt',\n",
315 |     "                          '--fp16': \"\"\n",
316 |     "                    },\n",
317 |     "                    entry_script='./finetune/run_classifier_azureml.py',\n",
318 |     "                    node_count=1,\n",
319 |     "                    process_count_per_node=4,\n",
320 |     "                    distributed_backend='mpi',\n",
321 |     "                    use_gpu=True)\n",
322 |     "\n",
323 |     "# path to the Python environment in the custom Docker image\n",
324 |     "estimator._estimator_config.environment.python.interpreter_path = '/opt/miniconda/envs/amlbert/bin/python'"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "markdown",
329 |    "metadata": {},
330 |    "source": [
331 |     "### Submit and Monitor your run"
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "code",
336 |    "execution_count": null,
337 |    "metadata": {},
338 |    "outputs": [],
339 |    "source": [
340 |     "from azureml.core import Experiment\n",
341 |     "\n",
342 |     "experiment_name = 'bert-large-RTE'\n",
343 |     "experiment = Experiment(ws, name=experiment_name)"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "code",
348 |    "execution_count": null,
349 |    "metadata": {},
350 |    "outputs": [],
351 |    "source": [
352 |     "run = experiment.submit(estimator)\n",
353 |     "from azureml.widgets import RunDetails\n",
354 |     "RunDetails(run).show()"
355 |    ]
356 |   },
357 |   {
358 |    "cell_type": "code",
359 |    "execution_count": null,
360 |    "metadata": {},
361 |    "outputs": [],
362 |    "source": [
363 |     "#run.cancel()"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": null,
369 |    "metadata": {},
370 |    "outputs": [],
371 |    "source": []
372 |   }
373 |  ],
374 |  "metadata": {
375 |   "authors": [
376 |    {
377 |     "name": "aagarg"
378 |    }
379 |   ],
380 |   "kernelspec": {
381 |    "display_name": "Python 3",
382 |    "language": "python",
383 |    "name": "python3"
384 |   },
385 |   "language_info": {
386 |    "codemirror_mode": {
387 |     "name": "ipython",
388 |     "version": 3
389 |    },
390 |    "file_extension": ".py",
391 |    "mimetype": "text/x-python",
392 |    "name": "python",
393 |    "nbconvert_exporter": "python",
394 |    "pygments_lexer": "ipython3",
395 |    "version": "3.6.7"
396 |   },
397 |   "msauthor": "aagarg"
398 |  },
399 |  "nbformat": 4,
400 |  "nbformat_minor": 2
401 | }
402 | 


--------------------------------------------------------------------------------
/finetune/PyTorch/notebooks/BERT_Eval_SQUAD.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Copyright (c) Microsoft Corporation. All rights reserved.\n",
  8 |     "\n",
  9 |     "Licensed under the MIT License."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "# PyTorch Pretrained BERT on AzureML with SQuAD Dataset\n",
 17 |     "\n",
 18 |     "In this notebook, you will find the following contents:\n",
 19 |     "- Download SQuAD dataset on the remote compute and store them in Azure storage\n",
 20 |     "- Speed-up fine-tuning BERT for SQuAD dataset on AzureML GPU clusters"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "markdown",
 25 |    "metadata": {},
 26 |    "source": [
 27 |     "## Prerequisites\n",
 28 |     "Follow instructions in BERT_pretraining.ipynb notebook for setting up AzureML"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": null,
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "# Check core SDK version number\n",
 38 |     "import azureml.core\n",
 39 |     "\n",
 40 |     "print(\"SDK version:\", azureml.core.VERSION)"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "markdown",
 45 |    "metadata": {},
 46 |    "source": [
 47 |     "## Initialize workspace\n",
 48 |     "\n",
 49 |     "To create or access an Azure ML Workspace, you will need to import the AML library and the following information:\n",
 50 |     "* A name for your workspace\n",
 51 |     "* Your subscription id\n",
 52 |     "* The resource group name\n",
 53 |     "\n",
 54 |     "Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace or create a new one. "
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": null,
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "from azureml.core.workspace import Workspace\n",
 64 |     "ws = Workspace.setup()\n",
 65 |     "ws_details = ws.get_details()\n",
 66 |     "print('Name:\\t\\t{}\\nLocation:\\t{}'\n",
 67 |     "      .format(ws_details['name'],\n",
 68 |     "              ws_details['location']))"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "markdown",
 73 |    "metadata": {},
 74 |    "source": [
 75 |     "### Create a project directory\n",
 76 |     "Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on."
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": null,
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "import os\n",
 86 |     "import os.path as path\n",
 87 |     "project_root = path.abspath(path.join(os.getcwd(),\"../../../\"))"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "markdown",
 92 |    "metadata": {},
 93 |    "source": [
 94 |     "### Define AzureML datastore to collect training dataset\n",
 95 |     "\n",
 96 |     "To make data accessible for remote training, AML provides a convenient way to do so via a [Datastore](https://docs.microsoft.com/azure/machine-learning/service/how-to-access-data). The datastore provides a mechanism for you to upload/download data to Azure Storage, and interact with it from your remote compute targets.\n",
 97 |     "\n",
 98 |     "Each workspace is associated with a default Azure Blob datastore named `'workspaceblobstore'`. In this work, we use this default datastore to collect the SQuAD training data ."
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": null,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "from azureml.core import Datastore\n",
108 |     "ds = ws.get_default_datastore()"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "markdown",
113 |    "metadata": {},
114 |    "source": [
115 |     "The data for SQuAD can be downloaded with the following links and should be saved in a blob storage.\n",
116 |     "- [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)\n",
117 |     "- [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)\n",
118 |     "- [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "markdown",
123 |    "metadata": {},
124 |    "source": [
125 |     "The following code will upload the training data to the path ./squad on the default datastore."
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "metadata": {},
132 |    "outputs": [],
133 |    "source": [
134 |     "import os\n",
135 |     "ds.upload(src_dir=project_root+'\\data\\squad', target_path='squad')\n",
136 |     "ds.upload(src_dir=os.path.join(project_root,'data','bert-large-checkpoints') , target_path='bert-large-checkpoints')"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "markdown",
141 |    "metadata": {},
142 |    "source": [
143 |     "### Create an experiment\n",
144 |     "Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed PyTorch tutorial. "
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": null,
150 |    "metadata": {},
151 |    "outputs": [],
152 |    "source": [
153 |     "from azureml.core import Experiment\n",
154 |     "\n",
155 |     "experiment_name = 'BERT-SQuAD'\n",
156 |     "experiment = Experiment(ws, name=experiment_name)"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "markdown",
161 |    "metadata": {},
162 |    "source": [
163 |     "## Fine-tuning BERT with Distributed Training\n",
164 |     "As our `SQuAD` dataset are ready in Azure storage, we can start the fine-tune the model by exploting the power of distributed training. "
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "markdown",
169 |    "metadata": {},
170 |    "source": [
171 |     "### Create a GPU remote compute target\n",
172 |     "\n",
173 |     "We need to create a GPU [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) to perform the fine-tuning. In this example, we create an AmlCompute cluster as our training compute resource.\n",
174 |     "\n",
175 |     "This code creates a cluster for you if it does not already exist in your workspace."
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": null,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "from azureml.core.compute import ComputeTarget, AmlCompute\n",
185 |     "from azureml.core.compute_target import ComputeTargetException\n",
186 |     "\n",
187 |     "# choose a name for your cluster\n",
188 |     "gpu_cluster_name = \"bertcodetesting\"\n",
189 |     "\n",
190 |     "try:\n",
191 |     "    gpu_compute_target = ComputeTarget(workspace=ws, name=gpu_cluster_name)\n",
192 |     "    print('Found existing compute target.')\n",
193 |     "except ComputeTargetException:\n",
194 |     "    print('Creating a new compute target...')\n",
195 |     "    \n",
196 |     "    compute_config = AmlCompute.provisioning_configuration(vm_size=\"STANDARD_NC24s_v3\", max_nodes=4)\n",
197 |     "\n",
198 |     "    # create the cluster\n",
199 |     "    gpu_compute_target = AmlCompute.create(ws, gpu_cluster_name, compute_config)\n",
200 |     "    gpu_compute_target.wait_for_completion(show_output=True)\n",
201 |     "\n",
202 |     "    # Use the 'status' property to get a detailed status for the current cluster. \n",
203 |     "    print(gpu_compute_target.status.serialize())"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "markdown",
208 |    "metadata": {},
209 |    "source": [
210 |     "### Create a PyTorch estimator for fine-tuning\n",
211 |     "Let us create a new PyTorch estimator to run the fine-tuning script `run_squad.py`, that is already provided at [the original repository](https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_squad.py). Please refer [here](https://github.com/huggingface/pytorch-pretrained-BERT#fine-tuning-with-bert-running-the-examples) for more detail about the script. \n",
212 |     "\n",
213 |     "The original `run_squad.py` script uses PyTorch distributed launch untility to launch multiple processes across nodes and GPUs. We prepared a modified version [run_squad_azureml.py](./run_squad_azureml.py) so that we can launch it based on AzureML build-in MPI backend.\n",
214 |     "\n",
215 |     "To use AML's tracking and metrics capabilities, we need to add a small amount of AzureML code inside the training script.\n",
216 |     "\n",
217 |     "In `run_squad_azureml.py`, we will log some metrics to our AML run. To do so, we will access the AML run object within the script:\n",
218 |     "```Python\n",
219 |     "from azureml.core.run import Run\n",
220 |     "run = Run.get_context()\n",
221 |     "```\n",
222 |     "Further within `run_squad_azureml.py`, we log learning rate, training loss and prediction scores the model achieves as:\n",
223 |     "```Python\n",
224 |     "run.log('lr', np.float(args.learning_rate))\n",
225 |     "...\n",
226 |     "\n",
227 |     "for step, batch in enumerate(tqdm(train_dataloader, desc=\"Iteration\")): \n",
228 |     "    ...\n",
229 |     "    run.log('train_loss', np.float(loss))\n",
230 |     "\n",
231 |     "..\n",
232 |     "```"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "code",
237 |    "execution_count": null,
238 |    "metadata": {},
239 |    "outputs": [],
240 |    "source": [
241 |     "from azureml.train.dnn import PyTorch\n",
242 |     "from azureml.core.runconfig import RunConfiguration\n",
243 |     "from azureml.core.container_registry import ContainerRegistry\n",
244 |     "\n",
245 |     "run_user_managed = RunConfiguration()\n",
246 |     "run_user_managed.environment.python.user_managed_dependencies = True\n",
247 |     "\n",
248 |     "# Define custom Docker image info\n",
249 |     "image_name = 'mcr.microsoft.com/azureml/bert:pretrain-openmpi3.1.2-cuda10.0-cudnn7-ubuntu16.04'\n",
250 |     "\n",
251 |     "estimator = PyTorch(source_directory='../../../',\n",
252 |     "                    compute_target=gpu_compute_target,\n",
253 |     "                     #Docker image\n",
254 |     "                    use_docker=True,\n",
255 |     "                    custom_docker_image=image_name,\n",
256 |     "                    user_managed=True,\n",
257 |     "                    script_params = {\n",
258 |     "                          '--bert_model':'bert-large-uncased',\n",
259 |     "                          \"--model_file_location\": ds.path('bert-large-checkpoints/').as_mount(),\n",
260 |     "                          '--model_file': 'bert_encoder_epoch_245.pt',\n",
261 |     "                          '--do_train' : '',\n",
262 |     "                          '--do_predict': '',\n",
263 |     "                          '--train_file': ds.path('squad/train-v1.1.json').as_mount(),\n",
264 |     "                          '--predict_file': ds.path('squad/dev-v1.1.json').as_mount(),\n",
265 |     "                          '--max_seq_length': 512,\n",
266 |     "                          '--train_batch_size': 8,\n",
267 |     "                          '--learning_rate': 3e-5,\n",
268 |     "                          '--num_train_epochs': 2.0,\n",
269 |     "                          '--doc_stride': 128,\n",
270 |     "                          '--seed': 32,\n",
271 |     "                          '--gradient_accumulation_steps':4,\n",
272 |     "                          '--warmup_proportion':0.25,\n",
273 |     "                          '--output_dir': './outputs',\n",
274 |     "                          '--fp16':'',\n",
275 |     "                          #'--loss_scale':128,\n",
276 |     "                    },\n",
277 |     "                    entry_script='./finetune/run_squad_azureml.py',\n",
278 |     "                    node_count=1,\n",
279 |     "                    process_count_per_node=4,\n",
280 |     "                    distributed_backend='mpi',\n",
281 |     "                    use_gpu=True)\n",
282 |     "\n",
283 |     "# path to the Python environment in the custom Docker image\n",
284 |     "estimator._estimator_config.environment.python.interpreter_path = '/opt/miniconda/envs/amlbert/bin/python'"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "markdown",
289 |    "metadata": {},
290 |    "source": [
291 |     "### Submit and Monitor your run"
292 |    ]
293 |   },
294 |   {
295 |    "cell_type": "code",
296 |    "execution_count": null,
297 |    "metadata": {
298 |     "scrolled": false
299 |    },
300 |    "outputs": [],
301 |    "source": [
302 |     "run = experiment.submit(estimator)\n",
303 |     "from azureml.widgets import RunDetails\n",
304 |     "RunDetails(run).show()"
305 |    ]
306 |   },
307 |   {
308 |    "cell_type": "markdown",
309 |    "metadata": {},
310 |    "source": [
311 |     "To achieve over **90.5 F1 score** and **83.5 Exact-Match** with `SQuAD v1.1` dataset, it requires **2** epochs when fine-tune with `BERT large` model. Below please find the elapsed time using deferent Azure GPU VMs and configures. \n",
312 |     "\n",
313 |     "The default configuration in this notebook uses 2 `STANDARD_NC24rs_v3` (8 x V100) with `fp16` enabled. The training phase should take **22 mins** to complete 2 epochs. \n",
314 |     "\n",
315 |     "|  GPU counts \t|    1 GPU    \t|         2 GPU \t| 4 GPU      \t| 8 GPU      \t|\n",
316 |     "|------------:\t|:-----------:\t|--------------:\t|------------\t|------------\t|\n",
317 |     "| NCv3-series \t|     340 mins  |    180 mins \t    |    80 mins \t|   48 mins \t|\n",
318 |     "| NCv3 with fp16|     140 mins  |    79 mins \t    |    38 mins \t|   22 mins \t|"
319 |    ]
320 |   }
321 |  ],
322 |  "metadata": {
323 |   "authors": [
324 |    {
325 |     "name": "aagarg"
326 |    }
327 |   ],
328 |   "kernelspec": {
329 |    "display_name": "Python 3",
330 |    "language": "python",
331 |    "name": "python3"
332 |   },
333 |   "language_info": {
334 |    "codemirror_mode": {
335 |     "name": "ipython",
336 |     "version": 3
337 |    },
338 |    "file_extension": ".py",
339 |    "mimetype": "text/x-python",
340 |    "name": "python",
341 |    "nbconvert_exporter": "python",
342 |    "pygments_lexer": "ipython3",
343 |    "version": "3.6.7"
344 |   },
345 |   "msauthor": "aagarg"
346 |  },
347 |  "nbformat": 4,
348 |  "nbformat_minor": 2
349 | }


--------------------------------------------------------------------------------
/finetune/PyTorch/notebooks/Pretrained-BERT-GLUE.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Copyright (c) Microsoft Corporation. All rights reserved.\n",
  8 |     "\n",
  9 |     "Licensed under the MIT License."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "# PyTorch Pretrained BERT on AzureML with GLUE Dataset\n",
 17 |     "This notebook contains an end-to-end walkthrough of using Azure Machine Learning Service to run [PyTorch reimplementation](https://github.com/huggingface/pytorch-pretrained-BERT) of [Google's TensorFlow repository for the BERT model](https://github.com/google-research/bert) developed by Hugging Face.\n",
 18 |     "\n",
 19 |     "You will find the following contents:\n",
 20 |     "- Download GLUE dataset on the remote compute and store them in Azure storage\n",
 21 |     "- Speep-up fine-tuning BERT for GLUE dataset on AzureML GPU clusters\n",
 22 |     "- Further fine-tune BERT wtih AzureML hyperparameter optimizer "
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "markdown",
 27 |    "metadata": {},
 28 |    "source": [
 29 |     "## Prerequisites\n",
 30 |     "- Understand the [architecture and terms](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture) introduced by Azure Machine Learning (AML)\n",
 31 |     "\n",
 32 |     "- Install the Python SDK:  make sure to install notebook, and contrib\n",
 33 |     "```\n",
 34 |     "conda create -n azureml -y Python=3.6\n",
 35 |     "source activate azureml\n",
 36 |     "pip install --upgrade azureml-sdk[notebooks,contrib] \n",
 37 |     "conda install ipywidgets\n",
 38 |     "jupyter nbextension install --py --user azureml.widgets\n",
 39 |     "jupyter nbextension enable azureml.widgets --user --py\n",
 40 |     "```\n",
 41 |     "\n",
 42 |     "You will need to restart jupyter after this\n",
 43 |     "Detailed instructions are here: https://docs.microsoft.com/en-us/azure/machine-learning/service/quickstart-create-workspace-with-python "
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": null,
 49 |    "metadata": {},
 50 |    "outputs": [],
 51 |    "source": [
 52 |     "# Check core SDK version number\n",
 53 |     "import azureml.core\n",
 54 |     "\n",
 55 |     "print(\"SDK version:\", azureml.core.VERSION)"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "markdown",
 60 |    "metadata": {},
 61 |    "source": [
 62 |     "## Initialize workspace\n",
 63 |     "\n",
 64 |     "To create or access an Azure ML Workspace, you will need to import the AML library and the following information:\n",
 65 |     "* A name for your workspace\n",
 66 |     "* Your subscription id\n",
 67 |     "* The resource group name\n",
 68 |     "\n",
 69 |     "Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step or create a new one. "
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "code",
 74 |    "execution_count": null,
 75 |    "metadata": {},
 76 |    "outputs": [],
 77 |    "source": [
 78 |     "from azureml.core.workspace import Workspace\n",
 79 |     "\n",
 80 |     "workspace_name = ''\n",
 81 |     "subscription_id = ''\n",
 82 |     "resource_group_name = ''\n",
 83 |     "location = ''\n",
 84 |     "\n",
 85 |     "ws = Workspace._get_or_create(workspace_name,\n",
 86 |     "                             subscription_id=subscription_id,\n",
 87 |     "                             resource_group=resource_group_name,\n",
 88 |     "                             location=location)"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "markdown",
 93 |    "metadata": {},
 94 |    "source": [
 95 |     "### Create an experiment\n",
 96 |     "Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed PyTorch tutorial. "
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": null,
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "from azureml.core import Experiment\n",
106 |     "\n",
107 |     "experiment_name = 'BERT-GLUE'\n",
108 |     "experiment = Experiment(ws, name=experiment_name)"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "markdown",
113 |    "metadata": {},
114 |    "source": [
115 |     "## Download GLUE dataset on the remote compute\n",
116 |     "\n",
117 |     "Before we start to fine-tune the pretained BERT model, we need to download the [GLUE data](https://gluebenchmark.com/tasks) by running the [script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) and unpack it to an Azure Blob container."
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "markdown",
122 |    "metadata": {},
123 |    "source": [
124 |     "### Define AzureML datastore to collect training dataset\n",
125 |     "\n",
126 |     "To make data accessible for remote training, AML provides a convenient way to do so via a [Datastore](https://docs.microsoft.com/azure/machine-learning/service/how-to-access-data). The datastore provides a mechanism for you to upload/download data to Azure Storage, and interact with it from your remote compute targets.\n",
127 |     "\n",
128 |     "Each workspace is associated with a default Azure Blob datastore named `'workspaceblobstore'`. In this work, we use this default datastore to collect the GLUE training dataset ."
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "metadata": {},
135 |    "outputs": [],
136 |    "source": [
137 |     "from azureml.core import Datastore\n",
138 |     "ds = Datastore(ws, 'workspaceblobstore')"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "markdown",
143 |    "metadata": {},
144 |    "source": [
145 |     "### Create a project directory\n",
146 |     "Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on."
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "metadata": {},
153 |    "outputs": [],
154 |    "source": [
155 |     "import os\n",
156 |     "\n",
157 |     "project_folder = './pytorch-pretrained-BERT'"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "markdown",
162 |    "metadata": {},
163 |    "source": [
164 |     "Make a local clone of the original [PyTorch reimplementation](https://github.com/huggingface/pytorch-pretrained-BERT) repository"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": null,
170 |    "metadata": {},
171 |    "outputs": [],
172 |    "source": [
173 |     "!git clone -b v0.4.0 https://github.com/huggingface/pytorch-pretrained-BERT.git"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "markdown",
178 |    "metadata": {},
179 |    "source": [
180 |     "We need to run the [script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) to download the [GLUE data](https://gluebenchmark.com/tasks) in the mounted Azure Blob container. In our example, we only download `MRPC` dataset"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": null,
186 |    "metadata": {},
187 |    "outputs": [],
188 |    "source": [
189 |     "import urllib, os\n",
190 |     "urllib.request.urlretrieve( 'https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/becd574dd938f045ea5bd3cb77d1d506541b5345/download_glue_data.py', filename='./download_glue_data.py')\n",
191 |     "import download_glue_data\n",
192 |     "download_glue_data.main([\"--tasks\", \"MRPC\", \"--data_dir\",\"./glue\"])"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "metadata": {},
198 |    "source": [
199 |     "Please note that, if you receive `UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 1183: character maps to <undefined>`. Please modify all `'with open(...)'` operations in the downloaded `download_glue_data.py` to `'open(.., encoding=\"utf8\")'`"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "markdown",
204 |    "metadata": {},
205 |    "source": [
206 |     "The following code will upload the training data to the path ./glue on the default datastore."
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": null,
212 |    "metadata": {},
213 |    "outputs": [],
214 |    "source": [
215 |     "ds.upload(src_dir='./glue', target_path='./glue')"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "markdown",
220 |    "metadata": {},
221 |    "source": [
222 |     "## Fine-tuning BERT with Distributed Training\n",
223 |     "As our `GLUE` dataset are ready in Azure storage, we can start the fine-tune the model by exploting the power of distributed training. "
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "markdown",
228 |    "metadata": {},
229 |    "source": [
230 |     "### Create a GPU remote compute target\n",
231 |     "\n",
232 |     "We need to create a GPU [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) to perform the fine-tuning. In this example, we create an AmlCompute cluster as our training compute resource. Please find the information of Azure VM size in below table.\n",
233 |     "\n",
234 |     "\n",
235 |     "|    VM Size    \t| CPU \t|   GPU   \t| Storage (SSD) \t| GPU memory \t| InfiniBand  \t|\n",
236 |     "|:-------------:\t|:---:\t|:-------:\t|:-------------:\t|:----------:\t|:----------:\t|\n",
237 |     "|  Standard_NC6 \t|  6  \t| 1 x K80 \t|    340 GiB    \t|    8 GiB   \t|      No   \t|\n",
238 |     "| Standard_NC12 \t|  12 \t| 2 x K80 \t|    680 GiB    \t|   16 GiB   \t|      No   \t|\n",
239 |     "| Standard_NC24 \t|  24 \t| 4 x K80 \t|    1440 GiB   \t|   32 GiB   \t|      No   \t|\n",
240 |     "| Standard_NC24r \t|  24 \t| 4 x K80 \t|    1440 GiB   \t|   32 GiB   \t|      Yes   \t|\n",
241 |     "| Standard_NC6s_v3 \t|  6  \t| 1 x V100 \t|    736 GiB    \t|   16 GiB   \t|      No   \t|\n",
242 |     "| Standard_NC12s_v3 |  12 \t| 2 x V100 \t|    1474 GiB   \t|   32 GiB   \t|      No   \t|\n",
243 |     "| Standard_NC24s_v3 |  24 \t| 4 x V100 \t|    2948 GiB   \t|   64 GiB   \t|      No   \t|\n",
244 |     "| Standard_NC24rs_v3|  24 \t| 4 x V100 \t|    2948 GiB   \t|   64 GiB   \t|      Yes   \t|\n",
245 |     "\n",
246 |     "\n",
247 |     "***Note that*** you need to request NCv3-serie quota if you would like to use NVIDIA Tesla V100 \n",
248 |     "\n",
249 |     "This code creates a cluster for you if it does not already exist in your workspace."
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": null,
255 |    "metadata": {},
256 |    "outputs": [],
257 |    "source": [
258 |     "from azureml.core.compute import ComputeTarget, AmlCompute\n",
259 |     "from azureml.core.compute_target import ComputeTargetException\n",
260 |     "\n",
261 |     "# choose a name for your cluster\n",
262 |     "gpu_cluster_name = \"nc24Cluster\"\n",
263 |     "\n",
264 |     "try:\n",
265 |     "    gpu_compute_target = ComputeTarget(workspace=ws, name=gpu_cluster_name)\n",
266 |     "    print('Found existing compute target.')\n",
267 |     "except ComputeTargetException:\n",
268 |     "    print('Creating a new compute target...')\n",
269 |     "    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC24', max_nodes=4)\n",
270 |     "\n",
271 |     "    # create the cluster\n",
272 |     "    gpu_compute_target = ComputeTarget.create(ws, gpu_cluster_name, compute_config)\n",
273 |     "    gpu_compute_target.wait_for_completion(show_output=True)\n",
274 |     "\n",
275 |     "# Use the 'status' property to get a detailed status for the current cluster. \n",
276 |     "print(gpu_compute_target.status.serialize())"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "markdown",
281 |    "metadata": {},
282 |    "source": [
283 |     "### Create a PyTorch estimator for fine-tuning\n",
284 |     "Let us create a new PyTorch estimator to run the fine-tuning script `run_classifier.py`, that is already provided at [the original repository](https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_classifier.py). Please refer [here](https://github.com/huggingface/pytorch-pretrained-BERT#fine-tuning-with-bert-running-the-examples) for more detail about the script. \n",
285 |     "\n",
286 |     "The original `run_classifier.py` script uses PyTorch distributed launch untility to launch multiple processes across nodes and GPUs. We prepared a modified version [run_classifier_azureml.py](./run_classifier_azureml.py) so that we can launch it based on AzureML build-in MPI backend.\n",
287 |     "\n",
288 |     "To use AML's tracking and metrics capabilities, we need to add a small amount of AzureML code inside the training script.\n",
289 |     "\n",
290 |     "In `run_classifier_azureml.py`, we will log some metrics to our AML run. To do so, we will access the AML run object within the script:\n",
291 |     "```Python\n",
292 |     "from azureml.core.run import Run\n",
293 |     "run = Run.get_context()\n",
294 |     "```\n",
295 |     "Further within `run_classifier_azureml.py`, we log learning rate, training loss and evaluation accuracy the model achieves as:\n",
296 |     "```Python\n",
297 |     "run.log('lr', np.float(args.learning_rate))\n",
298 |     "...\n",
299 |     "\n",
300 |     "for step, batch in enumerate(tqdm(train_dataloader, desc=\"Iteration\")): \n",
301 |     "    ...\n",
302 |     "    run.log('train_loss', np.float(loss))\n",
303 |     "\n",
304 |     "...\n",
305 |     "\n",
306 |     "result = {'eval_loss': eval_loss,\n",
307 |     "          'eval_accuracy': eval_accuracy}\n",
308 |     "for key in sorted(result.keys()):\n",
309 |     "    run.log(key, str(result[key]))\n",
310 |     "```\n",
311 |     "These run metrics will become particularly important when we begin hyperparameter tuning our model in the \"Tune model hyperparameters\" section.\n",
312 |     "\n",
313 |     "Let's first copy the training script `run_classifier_azureml.py` into our project directory."
314 |    ]
315 |   },
316 |   {
317 |    "cell_type": "code",
318 |    "execution_count": null,
319 |    "metadata": {},
320 |    "outputs": [],
321 |    "source": [
322 |     "import shutil\n",
323 |     "shutil.copy('run_classifier_azureml.py', project_folder)\n",
324 |     "shutil.copy('azureml_bert_util.py', project_folder)"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "markdown",
329 |    "metadata": {},
330 |    "source": [
331 |     "Then, AzureML PyTorch estimator can be defined as below. We use `azuremlsamples/bert:torch-1.0.0-apex-cuda9` as the base docker image with [dockerfile](./dockerfile)"
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "code",
336 |    "execution_count": null,
337 |    "metadata": {},
338 |    "outputs": [],
339 |    "source": [
340 |     "from azureml.train.dnn import PyTorch\n",
341 |     "\n",
342 |     "estimator = PyTorch(source_directory=project_folder,\n",
343 |     "                    compute_target=gpu_compute_target,\n",
344 |     "                    script_params = {\n",
345 |     "                          '--bert_model':'bert-base-cased',\n",
346 |     "                          '--task_name': 'MRPC',\n",
347 |     "                          '--data_dir': ds.path('glue/MRPC/').as_mount(),\n",
348 |     "                          '--do_train' : '',\n",
349 |     "                          '--do_eval': '',\n",
350 |     "                          '--max_seq_length': 128,\n",
351 |     "                          '--train_batch_size': 32,\n",
352 |     "                          '--learning_rate': 2e-5,\n",
353 |     "                          '--num_train_epochs': 3.0,\n",
354 |     "                          '--output_dir': './outputs',\n",
355 |     "                          '--seed':16\n",
356 |     "                    },\n",
357 |     "                    custom_docker_base_image='azuremlsamples/bert:torch-1.0.0-apex-cuda9',\n",
358 |     "                    entry_script='run_classifier_azureml.py',\n",
359 |     "                    node_count=1,\n",
360 |     "                    process_count_per_node=4,\n",
361 |     "                    distributed_backend='mpi',\n",
362 |     "                    use_gpu=True)\n",
363 |     "\n",
364 |     "estimator._estimator_config.environment.python.user_managed_dependencies=True"
365 |    ]
366 |   },
367 |   {
368 |    "cell_type": "markdown",
369 |    "metadata": {},
370 |    "source": [
371 |     "### Submit and Monitor your run"
372 |    ]
373 |   },
374 |   {
375 |    "cell_type": "code",
376 |    "execution_count": null,
377 |    "metadata": {},
378 |    "outputs": [],
379 |    "source": [
380 |     "run = experiment.submit(estimator)\n",
381 |     "from azureml.widgets import RunDetails\n",
382 |     "RunDetails(run).show()"
383 |    ]
384 |   },
385 |   {
386 |    "cell_type": "markdown",
387 |    "metadata": {},
388 |    "source": [
389 |     "To achieve an average of **85% evaluation accuracy** with `MRPC dataset`, it requires **3** epochs when fine-tune with `BERT base` model. Below please find the elapsed time per epoch using deferent Azure GPU VMs with above hyperparameters\n",
390 |     "\n",
391 |     "|  GPU counts \t|    1 GPU    \t|         2 GPU \t| 4 GPU      \t|\n",
392 |     "|------------:\t|:-----------:\t|--------------:\t|------------\t|\n",
393 |     "|   NC-series \t| 191 s/epoch \t| 105 s/epoch   \t| 60 s/epoch \t|\n",
394 |     "| NCv3-series \t|  36 s/epoch \t|    22 s/epoch \t| 13 s/epoch \t|\n",
395 |     "| NCv3 with fp16|  32 s/epoch \t|    18 s/epoch \t| 12 s/epoch \t|"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "markdown",
400 |    "metadata": {},
401 |    "source": [
402 |     "## Fine-Tuning BERT with Hyperparameter Tuning\n",
403 |     "\n",
404 |     "We would also like to optimize our hyperparameter, `learning rate`, using Azure Machine Learning's hyperparameter tuning capabilities."
405 |    ]
406 |   },
407 |   {
408 |    "cell_type": "markdown",
409 |    "metadata": {},
410 |    "source": [
411 |     "### Start a hyperparameter sweep\n",
412 |     "First, we will define the hyperparameter space to sweep over. In this example we will use random sampling to try different configuration sets of hyperparameter to minimize our primary metric, the evaluation accuracy (`eval_accuracy`)."
413 |    ]
414 |   },
415 |   {
416 |    "cell_type": "code",
417 |    "execution_count": null,
418 |    "metadata": {},
419 |    "outputs": [],
420 |    "source": [
421 |     "from azureml.train.hyperdrive import *\n",
422 |     "import math\n",
423 |     "\n",
424 |     "param_sampling = RandomParameterSampling( {\n",
425 |     "        'learning_rate': loguniform(math.log(1e-4), math.log(1e-6)),\n",
426 |     "    }\n",
427 |     ")\n",
428 |     "\n",
429 |     "hyperdrive_run_config = HyperDriveRunConfig(estimator=estimator,\n",
430 |     "                                            hyperparameter_sampling=param_sampling, \n",
431 |     "                                            primary_metric_name='eval_accuracy',\n",
432 |     "                                            primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,\n",
433 |     "                                            max_total_runs=16,\n",
434 |     "                                            max_concurrent_runs=4)"
435 |    ]
436 |   },
437 |   {
438 |    "cell_type": "markdown",
439 |    "metadata": {},
440 |    "source": [
441 |     "Finally, lauch the hyperparameter tuning job."
442 |    ]
443 |   },
444 |   {
445 |    "cell_type": "code",
446 |    "execution_count": null,
447 |    "metadata": {},
448 |    "outputs": [],
449 |    "source": [
450 |     "hyperdrive_run = experiment.submit(hyperdrive_run_config)"
451 |    ]
452 |   },
453 |   {
454 |    "cell_type": "markdown",
455 |    "metadata": {},
456 |    "source": [
457 |     "### Monitor HyperDrive runs\n",
458 |     "We can monitor the progress of the runs with the following Jupyter widget. "
459 |    ]
460 |   },
461 |   {
462 |    "cell_type": "code",
463 |    "execution_count": null,
464 |    "metadata": {
465 |     "scrolled": false
466 |    },
467 |    "outputs": [],
468 |    "source": [
469 |     "from azureml.widgets import RunDetails\n",
470 |     "\n",
471 |     "RunDetails(hyperdrive_run).show()"
472 |    ]
473 |   },
474 |   {
475 |    "cell_type": "markdown",
476 |    "metadata": {},
477 |    "source": [
478 |     "### Find and register the best model\n",
479 |     "Once all the runs complete, we can find the run that produced the model with the highest evaluation accuracy."
480 |    ]
481 |   },
482 |   {
483 |    "cell_type": "code",
484 |    "execution_count": null,
485 |    "metadata": {},
486 |    "outputs": [],
487 |    "source": [
488 |     "best_run = hyperdrive_run.get_best_run_by_primary_metric()\n",
489 |     "best_run_metrics = best_run.get_metrics()\n",
490 |     "print(best_run)\n",
491 |     "print('Best Run is:\\n  accuracy: {0:.5f} \\n  Learning rate: {1:.8f}'.format(\n",
492 |     "        best_run_metrics['eval_accuracy'][-1],\n",
493 |     "        best_run_metrics['lr']\n",
494 |     "     ))"
495 |    ]
496 |   },
497 |   {
498 |    "cell_type": "markdown",
499 |    "metadata": {},
500 |    "source": [
501 |     "You can compare the resulting optimal `learning_rate` with the value suggested by the [original implementation hyper-parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks): 2e-5"
502 |    ]
503 |   }
504 |  ],
505 |  "metadata": {
506 |   "authors": [
507 |    {
508 |     "name": "minxia"
509 |    }
510 |   ],
511 |   "kernelspec": {
512 |    "display_name": "Python [default]",
513 |    "language": "python",
514 |    "name": "python3"
515 |   },
516 |   "language_info": {
517 |    "codemirror_mode": {
518 |     "name": "ipython",
519 |     "version": 3
520 |    },
521 |    "file_extension": ".py",
522 |    "mimetype": "text/x-python",
523 |    "name": "python",
524 |    "nbconvert_exporter": "python",
525 |    "pygments_lexer": "ipython3",
526 |    "version": "3.6.6"
527 |   },
528 |   "msauthor": "minxia"
529 |  },
530 |  "nbformat": 4,
531 |  "nbformat_minor": 2
532 | }
533 | 


--------------------------------------------------------------------------------
/finetune/README.md:
--------------------------------------------------------------------------------
  1 | # Finetune natural language processing models using Azure Machine Learning service
  2 | 
  3 | This part of the repo contains a walkthrough of using [Azure Machine Learning Service](https://docs.microsoft.com/en-us/azure/machine-learning/service/) to finetune [BERT model](https://github.com/google-research/bert). See more details in this blogpost: https://azure.microsoft.com/en-us/blog/fine-tune-natural-language-processing-models-using-azure-machine-learning-service/
  4 | 
  5 | We provide two set of notebooks: one for PyTorch, and another one for TensorFlow. Please follow the notebooks below for more information:
  6 | - [GLUE eval using BERT](PyTorch/notebooks/BERT_Eval_GLUE.ipynb)
  7 | - [Tensorflow-BERT-AzureML](TensorFlow/notebooks/Tensorflow-BERT-AzureML.ipynb)
  8 | - [Named Entity Recognition using BERT](PyTorch/notebooks/Pretrained-BERT-NER.ipynb)  (Updated on 6/17/2019)
  9 | 
 10 | 
 11 | ## **Using the Azure Machine Learning Service**
 12 | 
 13 | We are going to demonstrate different experiments on different datasets. In addition to tuning different hyperparameters for various use cases, Azure Machine Learning service can be used to manage the entire lifecycle of the experiments. Azure Machine Learning service provides an end-to-end cloud-based machine learning environment, so customers can develop, train, test, deploy, manage, and track machine learning models, as shown below. It also has full support for open-source technologies, such as PyTorch and TensorFlow which we will be using later.
 14 | 
 15 | ![Azure Machine Learning Service Overview](https://azurecomcdn.azureedge.net/mediahandler/acomblog/media/Default/blog/07ebbbb6-0fd4-40a6-b4e6-c9d0b11cf159.png)
 16 | _Figure 3. Azure Machine Learning Service Overview_
 17 | 
 18 | ## **What is in the notebook**
 19 | 
 20 | ### **Defining the right model for specific task**
 21 | 
 22 | To fine-tune the BERT model, the first step is to define the right input and output layer. In the GLUE example, it is defined as a classification task, and the code snippet shows how to create a language classification model using BERT pre-trained models:
 23 | ```
 24 | model = modeling.BertModel(
 25 |      config=bert_config,
 26 |      is_training=is_training,
 27 |      input_ids=input_ids,
 28 |      input_mask=input_mask,
 29 |      token_type_ids=segment_ids,
 30 |      use_one_hot_embeddings=use_one_hot_embeddings)
 31 | 
 32 | logits = tf.matmul(output_layer, output_weights, transpose_b=True)
 33 | logits = tf.nn.bias_add(logits, output_bias)
 34 | probabilities = tf.nn.softmax(logits, axis=-1)
 35 | log_probs = tf.nn.log_softmax(logits, axis=-1)
 36 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
 37 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
 38 | loss = tf.reduce_mean(per_example_loss)
 39 | 
 40 | ```
 41 | 
 42 | ### **Set up training environment using Azure Machine Learning service**
 43 | 
 44 | Depending on the size of the dataset, training the model on the actual dataset might be time-consuming. Azure Machine Learning Compute provides access to GPUs either for a single node or multiple nodes to accelerate the training process. Creating a cluster with one or multiple nodes on Azure Machine Learning Compute is very intuitive, as below:
 45 | 
 46 | ```
 47 | compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC24s_v3',
 48 |                                                          min_nodes=0,
 49 |                                                          max_nodes=8)
 50 | # create the cluster
 51 | gpu_compute_target = ComputeTarget.create(ws, gpu_cluster_name, compute_config)
 52 | gpu_compute_target.wait_for_completion(show_output=True)
 53 | estimator = PyTorch(source_directory=project_folder,
 54 |                  compute_target=gpu_compute_target,
 55 |                  script_params = {...},
 56 |                  entry_script='run_squad.azureml.py',
 57 |                  conda_packages=['tensorflow', 'boto3', 'tqdm'],
 58 |                  node_count=node_count,
 59 |                  process_count_per_node=process_count_per_node,
 60 |                  distributed_backend='mpi',
 61 |                  use_gpu=True)
 62 | ```
 63 | Azure Machine Learning is greatly simplifying the work involved in setting up and running a distributed training job. As you can see, scaling the job to multiple workers is done by just changing the number of nodes in the configuration and providing a distributed backend. For distributed backends, Azure Machine Learning supports popular frameworks such as TensorFlow Parameter server as well as MPI with Horovod, and it ties in with the Azure hardware such as InfiniBand to connect the different worker nodes to achieve optimal performance. We will have a follow up blogpost on how to use the distributed training capability on Azure Machine Learning service to fine-tune NLP models.
 64 | 
 65 | For more information on how to create and set up compute targets for model training, please visit our [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets).
 66 | 
 67 | ### **Hyper Parameter Tuning**
 68 | 
 69 | For a given customer&#39;s specific use case, model performance depends heavily on the hyperparameter values selected. Hyperparameters can have a big search space, and exploring each option can be very expensive. Azure Machine Learning Services provide an automated machine learning service, which provides hyperparameter tuning capabilities and can search across various hyperparameter configurations to find a configuration that results in the best performance.
 70 | 
 71 | In the provided example, random sampling is used, in which case hyperparameter values are randomly selected from the defined search space. In the example below, we explored the learning rate space from 1e-4 to 1e-6 in log uniform manner, so the learning rate might be 2 values around 1e-4, 2 values around 1e-5, and 2 values around 1e-6.
 72 | 
 73 | Customers can also select which metric to optimize. Validation loss, accuracy score, and F1 score are some popular metrics that could be selected for optimization.
 74 | 
 75 | ```
 76 | from azureml.train.hyperdrive import *
 77 | import math
 78 | 
 79 | param_sampling = RandomParameterSampling( {
 80 |          'learning_rate': loguniform(math.log(1e-4), math.log(1e-6)),
 81 | })
 82 | 
 83 | hyperdrive_run_config = HyperDriveRunConfig(
 84 |      estimator=estimator,
 85 |      hyperparameter_sampling=param_sampling,
 86 |      primary_metric_name='f1',
 87 |      primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
 88 |      max_total_runs=16,
 89 |      max_concurrent_runs=4)
 90 | ```
 91 | 
 92 | For each experiment, customers can watch the progress for different hyperparameter combinations. For example, the picture below shows the mean loss over time using different hyperparameter combinations. Some of the experiments can be terminated early if the training loss doesn&#39;t meet expectations (like the top red curve).
 93 | 
 94 | ![Mean loss for training data for different runs, as well as early termination](https://azurecomcdn.azureedge.net/mediahandler/acomblog/media/Default/blog/bdbe13c8-0011-49de-a019-4731cd3951cb.png)
 95 | _Figure 4. Mean loss for training data for different runs, as well as early termination_
 96 | 
 97 | For more information on how to use the Azure ML&#39;s automated hyperparameter tuning feature, please visit our documentation on [tuning hyperparameters](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters). And for how to track all the experiments, please visit the documentation on [how to track experiments and metrics](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-track-experiments).
 98 | 
 99 | ## **Visualizing the result**
100 | 
101 | Using the Azure Machine Learning service, customers can achieve 85 percent evaluation accuracy when fine-tuning MRPC in GLUE dataset (it requires 3 epochs for BERT base model), which is close to the state-of-the-art result. Using multiple GPUs can shorten the training time and using more powerful GPUs (say V100) can also improve the training time. For one of the specific experiments, the details are as below:
102 | 
103 | | **GPU#** | **1** | **2** | **4** |
104 | | --- | --- | --- | --- |
105 | | **K80 (NC Family)** | 191 s/epoch | 105 s/epoch | 60 s/epoch |
106 | | **V100 (NCv3 Family)** | 36 s/epoch | 22 s/epoch | 13 s/epoch |
107 | 
108 | _Table 1. Training time per epoch for MRPC in GLUE dataset_
109 | 
110 | After all the experiments are done, the Azure Machine Learning service SDK also provides a summary visualization on the selected metrics and the corresponding hyperparameter(s). Below is an example on how learning rate affects validation loss. Throughout the experiments, the learning rate has been changed from around 7e-6 (the far left) to around 1e-3 (the far right), and the best learning rate with lowest validation loss is around 3.1e-4. This chart can also be leveraged to evaluate other metrics that customers want to optimize.
111 | 
112 | ![Learning rate versus validation loss](https://azurecomcdn.azureedge.net/mediahandler/acomblog/media/Default/blog/189651c7-05e1-4381-81b7-32d871b360b7.png)
113 | _Figure 5. Learning rate versus validation loss_
114 | 
115 | ## **Summary**
116 | 
117 | In this repo, we showed how customers can fine-tune BERT easily using the Azure Machine Learning service, as well as topics such as using distributed settings and tuning hyperparameters for the corresponding dataset. We also showed some preliminary results to demonstrate how to use Azure Machine Learning service to fine tune the NLP models. All the code is [available on the GitHub repository](https://github.com/Microsoft/AzureML-BERT). Please let us know if there are any questions or comments by raising an issue in the GitHub repo.
118 | 
119 | ### **References**
120 | 
121 | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/pdf/1810.04805.pdf) and its [GitHub site](https://github.com/google-research/bert).
122 | 
123 | - Visit the [Azure Machine Learning service](https://azure.microsoft.com/en-us/free/services/machine-learning/) homepage today to get started with your free-trial.
124 | - Learn more about [Azure Machine Learning service](https://azure.microsoft.com/en-us/services/machine-learning-service/).
125 | 


--------------------------------------------------------------------------------
/finetune/TensorFlow/download_model_and_dataset.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import argparse
 3 | import sys
 4 | import os
 5 | import shutil
 6 | import zipfile
 7 | import urllib
 8 | 
 9 | parser = argparse.ArgumentParser()
10 | 
11 | ## Required parameters
12 | parser.add_argument("--bert_model_name",
13 |                     default = None,
14 |                     type = str,
15 |                     required = True,
16 |                     help = "Name of pretrained BERT model. Possible values: "
17 |                            "uncased_L-12_H-768_A-12,uncased_L-24_H-1024_A-16,cased_L-12_H-768_A-12,"
18 |                            "multilingual_L-12_H-768_A-12,chinese_L-12_H-768_A-12")
19 | 
20 | parser.add_argument("--model_dump_path",
21 |                     default = None,
22 |                     type = str,
23 |                     required = True,
24 |                     help = "Path to the output model.")
25 | 
26 | parser.add_argument("--glue_data_path",
27 |                     default = None,
28 |                     type = str,
29 |                     required = True,
30 |                     help = "Path to store downloaded GLUE dataset")
31 | 
32 | args = parser.parse_args()
33 | 
34 | bert_model_url_map = {
35 |     'uncased_L-12_H-768_A-12': 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip',
36 |     'uncased_L-24_H-1024_A-16': 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip',
37 |     'cased_L-12_H-768_A-12': 'https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip',
38 |     'multilingual_L-12_H-768_A-12': 'https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip',
39 |     'chinese_L-12_H-768_A-12': 'https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip'
40 | }
41 | 
42 | if args.bert_model_name not in bert_model_url_map:
43 |     sys.stderr.write('Unknown BERT model name ' + args.bert_model_name)
44 |     sys.exit(1)
45 | 
46 | pretrained_model_url = bert_model_url_map.get(args.bert_model_name)
47 | 
48 | # make local directory for pretrained tensorflow BERT model
49 | tensorflow_model_dir = './tensorflow_model'
50 | if not os.path.exists(tensorflow_model_dir):
51 |     os.makedirs(tensorflow_model_dir)
52 | 
53 | # download and extract pretrained tensorflow BERT model
54 | download_file_name = 'tensorflow_model.zip'
55 | urllib.request.urlretrieve(pretrained_model_url, filename=download_file_name)
56 | print('Extracting pretrained model...')
57 | with zipfile.ZipFile(download_file_name, 'r') as z:
58 |     z.extractall(tensorflow_model_dir)
59 | 
60 | # make destination path
61 | if not os.path.exists(args.model_dump_path):
62 |     os.makedirs(args.model_dump_path)
63 | 
64 | files = ['bert_model.ckpt.meta', 'bert_model.ckpt.index', 'bert_model.ckpt.data-00000-of-00001', 'bert_config.json', 'vocab.txt']
65 | for file in files:
66 |     shutil.copy(os.path.join(tensorflow_model_dir, args.bert_model_name, file), os.path.join(args.model_dump_path, file))
67 | 
68 | print('Start to download GLUE dataset...\n')
69 | urllib.request.urlretrieve(
70 |     'https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py',
71 |     filename='download_glue_data.py')
72 | if os.system('python download_glue_data.py --data_dir {0} --tasks all'.format(args.glue_data_path)) != 0: sys.exit(1)


--------------------------------------------------------------------------------
/finetune/evaluate_squad.py:
--------------------------------------------------------------------------------
 1 | """ Official evaluation script for v1.1 of the SQuAD dataset. """
 2 | from __future__ import print_function
 3 | from collections import Counter
 4 | import string
 5 | import re
 6 | import argparse
 7 | import json
 8 | import sys
 9 | 
10 | 
11 | def normalize_answer(s):
12 |     """Lower text and remove punctuation, articles and extra whitespace."""
13 |     def remove_articles(text):
14 |         return re.sub(r'\b(a|an|the)\b', ' ', text)
15 | 
16 |     def white_space_fix(text):
17 |         return ' '.join(text.split())
18 | 
19 |     def remove_punc(text):
20 |         exclude = set(string.punctuation)
21 |         return ''.join(ch for ch in text if ch not in exclude)
22 | 
23 |     def lower(text):
24 |         return text.lower()
25 | 
26 |     return white_space_fix(remove_articles(remove_punc(lower(s))))
27 | 
28 | 
29 | def f1_score(prediction, ground_truth):
30 |     prediction_tokens = normalize_answer(prediction).split()
31 |     ground_truth_tokens = normalize_answer(ground_truth).split()
32 |     common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
33 |     num_same = sum(common.values())
34 |     if num_same == 0:
35 |         return 0
36 |     precision = 1.0 * num_same / len(prediction_tokens)
37 |     recall = 1.0 * num_same / len(ground_truth_tokens)
38 |     f1 = (2 * precision * recall) / (precision + recall)
39 |     return f1
40 | 
41 | 
42 | def exact_match_score(prediction, ground_truth):
43 |     return (normalize_answer(prediction) == normalize_answer(ground_truth))
44 | 
45 | 
46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
47 |     scores_for_ground_truths = []
48 |     for ground_truth in ground_truths:
49 |         score = metric_fn(prediction, ground_truth)
50 |         scores_for_ground_truths.append(score)
51 |     return max(scores_for_ground_truths)
52 | 
53 | 
54 | def evaluate(dataset, predictions):
55 |     f1 = exact_match = total = 0
56 |     for article in dataset:
57 |         for paragraph in article['paragraphs']:
58 |             for qa in paragraph['qas']:
59 |                 total += 1
60 |                 if qa['id'] not in predictions:
61 |                     message = 'Unanswered question ' + qa['id'] + \
62 |                               ' will receive score 0.'
63 |                     print(message, file=sys.stderr)
64 |                     continue
65 |                 ground_truths = list(map(lambda x: x['text'], qa['answers']))
66 |                 prediction = predictions[qa['id']]
67 |                 exact_match += metric_max_over_ground_truths(
68 |                     exact_match_score, prediction, ground_truths)
69 |                 f1 += metric_max_over_ground_truths(
70 |                     f1_score, prediction, ground_truths)
71 | 
72 |     exact_match = 100.0 * exact_match / total
73 |     f1 = 100.0 * f1 / total
74 | 
75 |     return {'exact_match': exact_match, 'f1': f1}
76 | 
77 | 
78 | if __name__ == '__main__':
79 |     expected_version = '1.1'
80 |     parser = argparse.ArgumentParser(
81 |         description='Evaluation for SQuAD ' + expected_version)
82 |     parser.add_argument('dataset_file', help='Dataset file')
83 |     parser.add_argument('prediction_file', help='Prediction File')
84 |     args = parser.parse_args()
85 |     with open(args.dataset_file) as dataset_file:
86 |         dataset_json = json.load(dataset_file)
87 |         if (dataset_json['version'] != expected_version):
88 |             print('Evaluation expects v-' + expected_version +
89 |                   ', but got dataset with v-' + dataset_json['version'],
90 |                   file=sys.stderr)
91 |         dataset = dataset_json['data']
92 |     with open(args.prediction_file) as prediction_file:
93 |         predictions = json.load(prediction_file)
94 |     print(json.dumps(evaluate(dataset, predictions)))


--------------------------------------------------------------------------------
/pretrain/PyTorch/README.md:
--------------------------------------------------------------------------------
1 | # PyTorch Pretrain BERT Source Code
2 | This folder contains source code and instructions on pretraining BERT model on Azure Machine Learning. 
3 | 
4 | The sub folder structure is as follows:
5 | - [docker](./docker/) folder for docker file and instruction to use Azure Container Registry
6 | - [dataprep](./dataprep/) folder for data preparation instructions
7 | 


--------------------------------------------------------------------------------
/pretrain/PyTorch/azureml_adapter.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | 
 4 | def set_environment_variables_for_nccl_backend(single_node=False, master_port=6105):
 5 |     os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK']
 6 |     os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE']
 7 | 
 8 |     if not single_node: 
 9 |         master_node_params = os.environ['AZ_BATCH_MASTER_NODE'].split(':')
10 |         os.environ['MASTER_ADDR'] = master_node_params[0]
11 | 
12 |         # Do not overwrite master port with that defined in AZ_BATCH_MASTER_NODE
13 |         if 'MASTER_PORT' not in os.environ:
14 |             os.environ['MASTER_PORT'] = str(master_port)
15 |     else:
16 |         os.environ['MASTER_ADDR'] = os.environ['AZ_BATCHAI_MPI_MASTER_NODE']
17 |         os.environ['MASTER_PORT'] = '54965'
18 |     print('NCCL_SOCKET_IFNAME original value = {}'.format(os.environ['NCCL_SOCKET_IFNAME']))
19 |     # TODO make this parameterizable
20 |     os.environ['NCCL_SOCKET_IFNAME'] = '^docker0,lo'
21 | 
22 |     print('RANK = {}'.format(os.environ['RANK']))
23 |     print('WORLD_SIZE = {}'.format(os.environ['WORLD_SIZE']))
24 |     print('MASTER_ADDR = {}'.format(os.environ['MASTER_ADDR']))
25 |     print('MASTER_PORT = {}'.format(os.environ['MASTER_PORT']))
26 |     # print('MASTER_NODE = {}'.format(os.environ['MASTER_NODE']))
27 |     print('NCCL_SOCKET_IFNAME new value = {}'.format(os.environ['NCCL_SOCKET_IFNAME']))
28 | 
29 | def get_local_rank():
30 |     return int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
31 | 
32 | def get_global_size():
33 |     return int(os.environ['OMPI_COMM_WORLD_SIZE'])
34 | 
35 | def get_local_size():
36 |     return int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])	
37 | 
38 | def get_world_size():
39 |     return int(os.environ['WORLD_SIZE'])
40 | 	 
41 | 


--------------------------------------------------------------------------------
/pretrain/PyTorch/benchmark.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from datetime import datetime
 3 | 
 4 | 
 5 | def get_timestamp(text):
 6 | 	datepattern = re.compile("\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}")
 7 | 	matcher = datepattern.search(text)
 8 | 	return datetime.strptime(matcher.group(0), '%m/%d/%Y %H:%M:%S')
 9 | 
10 | def get_perf_metrics(filename):
11 | 	with open(filename) as f:
12 | 		datafile = f.readlines()
13 | 		throughput = 0
14 | 		epoch = 1
15 | 		time_diff=0
16 | 		num_seq=0
17 | 		for line in datafile:
18 | 			if 'Training epoch:' in line:
19 | 				start_time = get_timestamp(line)
20 | 
21 | 				if epoch == 1:
22 | 					training_start_time = start_time
23 | 				epoch += 1
24 | 			if 'Completed processing' in line:
25 | 				end_time = get_timestamp(line)
26 | 				time_diff += int((end_time-start_time).total_seconds())
27 | 				num_seq += [int(s) for s in line[int(line.find('Completed processing')):].split() if s.isdigit()][0]
28 | 				throughput = num_seq/time_diff
29 | 				#print(throughput)
30 | 			if 'Validation Loss' in line:
31 | 				valid_loss = float(line[int(line.find('is:'))+3:])
32 | 		avg_throughput = (num_seq/time_diff)
33 | 		total_training_time = end_time-training_start_time
34 | 		d = datetime(1,1,1) + total_training_time
35 | 
36 | 		print('Num epochs:', epoch)
37 | 		print('Total time to train:', d.day-1,'days,', d.hour ,'hours')
38 | 		print('Average throughput:',avg_throughput, 'sequences/second')
39 | 		print('Final Validation Loss:', valid_loss)
40 | 


--------------------------------------------------------------------------------
/pretrain/PyTorch/checkpoint.py:
--------------------------------------------------------------------------------
 1 | from logger import Logger
 2 | import torch
 3 | import os
 4 | from operator import itemgetter
 5 | 
 6 | from torch import __init__
 7 | 
 8 | def checkpoint_model(PATH, model, optimizer, epoch, last_global_step, **kwargs):
 9 |     """Utility function for checkpointing model + optimizer dictionaries
10 |        The main purpose for this is to be able to resume training from that instant again
11 |     """
12 |     checkpoint_state_dict = {'epoch': epoch,
13 |                              'last_global_step': last_global_step,
14 |                              'model_state_dict': model.network.module.state_dict(),
15 |                              'optimizer_state_dict': optimizer.state_dict()}
16 |     # Add extra kwargs too
17 |     checkpoint_state_dict.update(kwargs)
18 |     torch.save(checkpoint_state_dict, PATH)
19 |     return
20 | 
21 | 
22 | def load_checkpoint(model, optimizer, PATH):
23 |     """Utility function for checkpointing model + optimizer dictionaries
24 |        The main purpose for this is to be able to resume training from that instant again
25 |     """
26 |     checkpoint_state_dict = torch.load(PATH, map_location=torch.device("cpu"))
27 |     #from train import model
28 |     model.network.module.load_state_dict(
29 |         checkpoint_state_dict['model_state_dict'])
30 |     #from train import optimizer
31 |     optimizer.load_state_dict(checkpoint_state_dict['optimizer_state_dict'])
32 |     epoch = checkpoint_state_dict['epoch']
33 |     last_global_step = checkpoint_state_dict['last_global_step']
34 |     del checkpoint_state_dict
35 |     return (epoch + 1, last_global_step)
36 | 
37 | 
38 | def latest_checkpoint_file(reference_folder: str, no_cuda) -> str:
39 |     """Extracts the name of the last checkpoint file
40 | 
41 |     :param reference_folder: (str) Path to the parent_folder
42 |     :return: (str) Path to the most recent checkpoint tar file
43 |     """
44 | 
45 |     logger = Logger(cuda=torch.cuda.is_available() and not no_cuda)
46 |     
47 |     # For each folder inside the parent folder find all files
48 |     # ending with .tar and extreact the last checkpoint.
49 |     candidate_files = []
50 |     for dir_path, dir_names, filenames in os.walk(reference_folder):
51 |         logger.info(f"Searching for checkpoint in {reference_folder}")
52 |         relevant_files = [f for f in filenames if f.endswith('.tar')]
53 |         if relevant_files:
54 |             latest_file = max(relevant_files)  # assumes that checkpoint number is of format 000x
55 |             candidate_files.append((dir_path, latest_file))
56 |     
57 |     checkpoint_file = max(candidate_files, key=itemgetter(1))
58 |     checkpoint_path = os.path.join(checkpoint_file[0], checkpoint_file[1])
59 | 
60 |     return checkpoint_path
61 | 


--------------------------------------------------------------------------------
/pretrain/PyTorch/configuration.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | 
 4 | # TODO better json handling
 5 | class BertJobConfiguration:
 6 |     def __init__(self, config_file_path):
 7 |         self.config = json.load(open(config_file_path, 'r', encoding='utf-8'))
 8 | 
 9 |     # TODO improve this implementation
10 |     def replace_path_placeholders(self, files_location):
11 |         self.config['data']['datasets'] = {key: value.replace('placeholder/', files_location)
12 |                                       for (key, value) in self.config['data']['datasets'].items()}
13 |         self.config['validation']['path'] = self.config['validation']['path'].replace('placeholder/', files_location)
14 | 
15 |     def get_name(self):
16 |         return self.config['name']
17 | 
18 |     def get_token_file_type(self):
19 |         return self.config["bert_token_file"]
20 | 
21 |     def get_model_file_type(self):
22 |         return self.config["bert_model_file"]
23 | 
24 |     def get_learning_rate(self):
25 |         return self.config["training"]["learning_rate"]
26 | 
27 |     def get_warmup_proportion(self):
28 |         return self.config["training"]["warmup_proportion"]
29 | 
30 |     def get_total_training_steps(self):
31 |         return self.config["training"]["total_training_steps"]
32 | 
33 |     def get_total_epoch_count(self):
34 |         return self.config["training"]["num_epochs"]
35 | 
36 |     def get_num_workers(self):
37 |         return self.config['training']['num_workers']
38 | 
39 |     def get_validation_folder_path(self):
40 |         return self.config['validation']['path']
41 | 
42 |     def get_wiki_pretrain_dataset_path(self):
43 |         return self.config["data"]["datasets"]['wiki_pretrain_dataset']
44 | 
45 |     def get_decay_rate(self):
46 |         return self.config["training"]["decay_rate"]
47 | 
48 |     def get_decay_step(self):
49 |         return self.config["training"]["decay_step"]
50 | 
51 |     def get_model_config(self):
52 |         return self.config["bert_model_config"]
53 | 


--------------------------------------------------------------------------------
/pretrain/PyTorch/dataprep/create_pretraining.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | from __future__ import division
 3 | from __future__ import print_function
 4 | 
 5 | from multiprocessing import Pool
 6 | import multiprocessing
 7 | import os
 8 | import logging
 9 | import argparse
10 | 
11 | 
12 | import sys
13 | sys.path.append("..")
14 | from pytorch_pretrained_bert.tokenization import BertTokenizer
15 | from dataset import TokenInstance, PretrainingDataCreator, GenericPretrainingDataCreator
16 | 
17 | 
18 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
19 |                     datefmt='%m/%d/%Y %H:%M:%S',
20 |                     level=logging.INFO)
21 | logger = logging.getLogger(__name__)
22 | 
23 | 
24 | def parse_data(inp_file, out_file):
25 |     if not os.path.exists(out_file):
26 |         print(inp_file)
27 |         dataset = GenericPretrainingDataCreator(inp_file, tokenizer, dupe_factor=9, max_seq_length=512)
28 |         dataset.save(out_file)
29 |         print(f"Completed Pickling: {out_file}")
30 |     else:
31 |         print(f'Already parsed: {out_file}')
32 | 
33 | 
34 | parser = argparse.ArgumentParser(
35 |     description="Give initial arguments for parsing")
36 | 
37 | parser.add_argument("--input_dir", type=str,
38 |                     help="This folder contains .txt files of Wikipedia Data."
39 |                          " Each .txt file contains the text from the documents."
40 |                          " It makes an assumption that each line in the file represents"
41 |                          " a single line in the document too. A blank line represents completion of a document.")
42 | parser.add_argument("--output_dir", type=str, help="Path to Output Directory.")
43 | parser.add_argument("--token_file", default="bert-large-uncased", type=str)
44 | parser.add_argument("--do_lower_case", default=False, action="store_true",
45 |                     help="This flag indicates the wheter the text should be lowercase or not")
46 | parser.add_argument("--processes", "-p", default=0, type=int,
47 |                     help="This is to do parallel processing of the txt files. It should be >=0. Default: 0 represents"
48 |                          " that it will use all the available cores in the CPU.")
49 | 
50 | args = parser.parse_args()
51 | tokenizer = BertTokenizer.from_pretrained(
52 |     args.token_file, do_lower_case=args.do_lower_case)
53 | 
54 | input_files = []
55 | output_files = []
56 | num_processes = 1
57 | 
58 | if args.processes < 0 or args.processes > multiprocessing.cpu_count():
59 |     raise ValueError(
60 |         "The value of --processes should be >=0 and less than the max cores in the CPU.")
61 | elif args.processes == 0:  # Use all cores
62 |     num_processes = multiprocessing.cpu_count()
63 | else:
64 |     num_processes = args.processes
65 | 
66 | for filename in os.listdir(args.input_dir):
67 |     input_file = os.path.join(args.input_dir, filename)
68 |     outfilename = "_".join(filename.split('.')[:-1]) + ".bin"
69 |     output_file = os.path.join(args.output_dir, outfilename)
70 |     input_files.append(input_file)
71 |     output_files.append(output_file)
72 |     # parse_data(input_file, output_file) # this line is for single core processing
73 | 
74 | # parse data using multiple cores
75 | with Pool(processes=num_processes) as pool:
76 |     pool.starmap(parse_data, zip(input_files, output_files))
77 | 


--------------------------------------------------------------------------------
/pretrain/PyTorch/dataprep/sentence_segmentation.py:
--------------------------------------------------------------------------------
 1 | import nltk
 2 | import os
 3 | from tqdm import tqdm
 4 | import sys
 5 | 
 6 | nltk.download('punkt')
 7 | 
 8 | input_file = sys.argv[1]
 9 | output_file = sys.argv[2]
10 | 
11 | doc_seperator = "\n"
12 | 
13 | with open(input_file) as ifile:
14 |     with open(output_file, "w") as ofile:
15 |         for i, line in tqdm(enumerate(ifile)):
16 |             if line != "\n":
17 |                 sent_list = nltk.tokenize.sent_tokenize(line)
18 |                 for sent in sent_list:
19 |                     ofile.write(sent + "\n")
20 |                 ofile.write(doc_seperator)
21 | 


--------------------------------------------------------------------------------
/pretrain/PyTorch/dataprep/single_line_doc_file_creation.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import os
 3 | from tqdm import tqdm
 4 | 
 5 | output_file = 'wikipedia.txt'
 6 | 
 7 | with open(output_file, "w") as ofile:
 8 |   for dirname in glob.glob('out2/*/', recursive=False):
 9 |     for filename in glob.glob(dirname + 'wiki_*', recursive=True):
10 |       print(filename)
11 |       article_lines = []
12 |       article_open = False
13 |       
14 |       with open(filename, "r") as file:
15 |         for i, line in tqdm(enumerate(file)):
16 |           if "<doc id=" in line:
17 |             article_open = True
18 |           elif "</doc>" in line:
19 |             article_open = False
20 |             for oline in article_lines[1:]:
21 |               if oline != "\n":
22 |                 ofile.write(oline.rstrip() + " ")
23 |             ofile.write("\n\n")
24 |             article_lines = []
25 |           else:
26 |             if article_open:
27 |               article_lines.append(line)
28 | 


--------------------------------------------------------------------------------
/pretrain/PyTorch/dataprep/split_data_into_files.py:
--------------------------------------------------------------------------------
 1 | from tqdm import tqdm
 2 | 
 3 | input_file = "wikipedia.segmented.nltk.txt"
 4 | output_file = "./data_shards/wikipedia.segmented.part."
 5 | 
 6 | doc_seperator = "\n"
 7 | total_partitions = 100  # Mostly will create 1 extra partition
 8 | # shard_size = 396000 # Approximate, will split at next article break
 9 | 
10 | with open(input_file, encoding="UTF-8") as ifile:
11 |     ifile_lines = sum(1 for _ in tqdm(ifile))
12 | 
13 | print("Input file contains", ifile_lines, "lines.")
14 | 
15 | shard_size = ifile_lines // total_partitions
16 | 
17 | with open(input_file, encoding="UTF-8") as ifile:
18 |     shard_line_counter = 0
19 |     shard_index = 0
20 |     ofile = open(f"{output_file}{shard_index}.txt", "w", encoding="UTF-8")  # Open the first file
21 |     # Output files should not have doc_separator at the end of the file, but we accept input ending with doc_separator
22 |     for iline_counter, line in tqdm(enumerate(ifile, start=1)):
23 |         if line != doc_seperator or shard_line_counter < shard_size:
24 |             shard_line_counter += 1
25 |             ofile.write(line)
26 |         # Prevent opening an empty output file or writing a doc_sep
27 |         # when the iteration has reached the end of the input file (iline_counter == ifile_lines)
28 |         elif iline_counter < ifile_lines:
29 |             shard_line_counter = 0
30 |             shard_index += 1
31 |             ofile.close()
32 |             ofile = open(f"{output_file}{shard_index}.txt", "w", encoding="UTF-8")
33 |     ofile.close()  # Close the lastfile
34 | 


--------------------------------------------------------------------------------
/pretrain/PyTorch/dataset.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import os
  3 | from torch.utils.data import DataLoader, Dataset
  4 | from enum import IntEnum
  5 | from random import choice
  6 | import random
  7 | import collections
  8 | 
  9 | from text import mask, torch_long, PAD
 10 | from sources import PretrainingDataCreator, TokenInstance, GenericPretrainingDataCreator
 11 | from sources import WikiPretrainingDataCreator
 12 | from pytorch_pretrained_bert.tokenization import BertTokenizer
 13 | 
 14 | 
 15 | class BatchType(IntEnum):
 16 |     PRETRAIN_BATCH = 0
 17 | 
 18 | 
 19 | class PretrainDataType(IntEnum):
 20 |     WIKIPEDIA = 1
 21 |     VALIDATION = 2
 22 | 
 23 | MaskedLMInstance = collections.namedtuple(
 24 |     "MaskedLMInstance", ["index", "label"])
 25 | 
 26 | PretrainBatch = collections.namedtuple(
 27 |     'PreTrainBatch', ['input_ids', 'input_mask', 'sequence_ids',
 28 |                       'is_next_label', 'masked_lm_output']
 29 | )
 30 | 
 31 | def get_random_partition(data_directory, index):
 32 |     partitions = [os.path.join(data_directory, x)
 33 |                   for x in os.listdir(data_directory)]
 34 |     partitions = sorted(partitions)
 35 |     i = index % len(partitions)
 36 |     return partitions[i]
 37 | 
 38 | 
 39 | def map_to_torch(encoding):
 40 |     encoding = torch_long(encoding)
 41 |     encoding.requires_grad_(False)
 42 |     return encoding
 43 | 
 44 | 
 45 | def map_to_torch_float(encoding):
 46 |     encoding = torch.FloatTensor(encoding)
 47 |     encoding.requires_grad_(False)
 48 |     return encoding
 49 | 
 50 | 
 51 | def map_to_torch_half(encoding):
 52 |     encoding = torch.HalfTensor(encoding)
 53 |     encoding.requires_grad_(False)
 54 |     return encoding
 55 | 
 56 | 
 57 | def encode_sequence(seqA, seqB, max_seq_len, tokenizer):
 58 |     seqA = ["[CLS]"] + seqA + ["[SEP]"]
 59 |     seqB = seqB + ["[SEP]"]
 60 | 
 61 |     input_tokens = seqA + seqB
 62 |     input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
 63 |     sequence_ids = [0]*len(seqA) + [1]*len(seqB)
 64 |     input_mask = [1]*len(input_ids)
 65 | 
 66 |     while len(input_ids) < max_seq_len:
 67 |         input_ids.append(PAD)
 68 |         sequence_ids.append(PAD)
 69 |         input_mask.append(PAD)
 70 | 
 71 |     return (map_to_torch(input_ids), map_to_torch(input_mask), map_to_torch(sequence_ids))
 72 | 
 73 | 
 74 | def truncate_input_sequence(tokens_a, tokens_b, max_num_tokens):
 75 |     while True:
 76 |         total_length = len(tokens_a) + len(tokens_b)
 77 |         if total_length <= max_num_tokens:
 78 |             break
 79 | 
 80 |         trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
 81 |         assert len(trunc_tokens) >= 1
 82 | 
 83 |         # We want to sometimes truncate from the front and sometimes from the
 84 |         # back to add more randomness and avoid biases.
 85 |         if random.random() < 0.5:
 86 |             del trunc_tokens[0]
 87 |         else:
 88 |             trunc_tokens.pop()
 89 | 
 90 | class PreTrainingDataset(Dataset):
 91 |     def __init__(self, tokenizer: BertTokenizer, folder: str, logger, max_seq_length, index, data_type: PretrainDataType = PretrainDataType.WIKIPEDIA, max_predictions_per_seq=20, masked_lm_prob=0.15):
 92 |         self.tokenizer = tokenizer
 93 |         self.dir_path = folder
 94 |         self.max_seq_length = max_seq_length
 95 |         self.len = 0
 96 |         self.masked_lm_prob = masked_lm_prob
 97 |         self.max_predictions_per_seq = max_predictions_per_seq
 98 |         self.vocab_words = list(tokenizer.vocab.keys())
 99 | 
100 |         path = get_random_partition(self.dir_path, index)
101 | 
102 |         logger.info(f"Loading Pretraining Data from {path}")
103 |         if data_type == PretrainDataType.WIKIPEDIA:
104 |             self.data = GenericPretrainingDataCreator.load(path)
105 |         elif data_type == PretrainDataType.VALIDATION:
106 |             self.data = WikiPretrainingDataCreator.load(path)
107 |         self.len = len(self.data)
108 |         logger.info(
109 |             f"Data Loading Completed for Pretraining Data from {path} with {self.len} samples.")
110 | 
111 |     def __len__(self):
112 |         return self.len
113 | 
114 |     def __getitem__(self, index):
115 |         i = index % self.len
116 | 
117 |         instance: TokenInstance = self.data.instances[i]
118 |         return self.create_training_instance(instance)
119 | 
120 |     def create_training_instance(self, instance: TokenInstance):
121 |         tokens_a, tokens_b, is_next = instance.get_values()
122 |         # print(f'is_next label:{is_next}')
123 |         # Create mapper
124 |         tokens = []
125 |         segment_ids = []
126 |         tokens.append("[CLS]")
127 |         segment_ids.append(0)
128 |         for token in tokens_a:
129 |             tokens.append(token)
130 |             segment_ids.append(0)
131 | 
132 |         tokens.append("[SEP]")
133 |         segment_ids.append(0)
134 | 
135 |         for token in tokens_b:
136 |             tokens.append(token)
137 |             segment_ids.append(1)
138 | 
139 |         tokens.append("[SEP]")
140 |         segment_ids.append(1)
141 | 
142 |         # Get Masked LM predictions
143 |         tokens, masked_lm_output = self.create_masked_lm_predictions(tokens)
144 | 
145 |         # Convert to Ids
146 |         input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
147 |         input_mask = [1] * len(input_ids)
148 | 
149 |         while len(input_ids) < self.max_seq_length:
150 |             input_ids.append(PAD)
151 |             segment_ids.append(PAD)
152 |             input_mask.append(PAD)
153 |             masked_lm_output.append(-1)
154 |         return([map_to_torch([BatchType.PRETRAIN_BATCH]), map_to_torch(input_ids), map_to_torch(input_mask), map_to_torch(segment_ids), map_to_torch([is_next]), map_to_torch(masked_lm_output)])
155 | 
156 |     def create_masked_lm_predictions(self, tokens):
157 |         cand_indexes = []
158 |         for i, token in enumerate(tokens):
159 |             if token == "[CLS]" or token == "[SEP]":
160 |                 continue
161 |             cand_indexes.append(i)
162 | 
163 |         random.shuffle(cand_indexes)
164 |         output_tokens = list(tokens)
165 | 
166 |         num_to_predict = min(self.max_predictions_per_seq, max(
167 |             1, int(round(len(tokens) * self.masked_lm_prob))))
168 | 
169 |         masked_lms = []
170 |         covered_indexes = set()
171 |         for index in cand_indexes:
172 |             if len(masked_lms) >= num_to_predict:
173 |                 break
174 |             if index in covered_indexes:
175 |                 continue
176 |             covered_indexes.add(index)
177 | 
178 |             masked_token = None
179 |             # 80% mask
180 |             if random.random() < 0.8:
181 |                 masked_token = "[MASK]"
182 |             else:
183 |                 # 10% Keep Original
184 |                 if random.random() < 0.5:
185 |                     masked_token = tokens[index]
186 |                 # 10% replace w/ random word
187 |                 else:
188 |                     masked_token = self.vocab_words[random.randint(
189 |                         0, len(self.vocab_words) - 1)]
190 | 
191 |             output_tokens[index] = masked_token
192 |             masked_lms.append(MaskedLMInstance(
193 |                 index=index, label=tokens[index]))
194 | 
195 |         masked_lms = sorted(masked_lms, key=lambda x: x.index)
196 |         masked_lm_output = [-1] * len(output_tokens)
197 |         for p in masked_lms:
198 |             masked_lm_output[p.index] = self.tokenizer.vocab[p.label]
199 | 
200 |         return (output_tokens, masked_lm_output)
201 | 


--------------------------------------------------------------------------------
/pretrain/PyTorch/distributed_apex.py:
--------------------------------------------------------------------------------
  1 | # TODO: This is a copy of apex code from NVIDIA/APEX. Details to be added on the updates made here.
  2 | 
  3 | import torch
  4 | 
  5 | try:
  6 |     from apex_C import flatten
  7 |     from apex_C import unflatten
  8 | except ImportError:
  9 |     try:
 10 |         _ = warned_flatten
 11 |     except NameError:
 12 |         print("Warning:  apex was installed without --cpp_ext.  Falling back to Python flatten and unflatten.")
 13 |         warned_flatten = True
 14 |     from torch._utils import _flatten_dense_tensors as flatten
 15 |     from torch._utils import _unflatten_dense_tensors as unflatten
 16 | import torch.distributed as dist
 17 | from torch.nn.modules import Module
 18 | from torch.autograd import Variable
 19 | from collections import OrderedDict
 20 | from itertools import chain
 21 | import copy
 22 | 
 23 | # apply_dist_call requires that tensors in 'bucket' are all the same type.
 24 | 
 25 | 
 26 | def apply_flat_dist_call(bucket, call, extra_args=None):
 27 | 
 28 |     coalesced = flatten(bucket)
 29 |     #print("Rank", dist.get_rank(), "Broadcasting ", coalesced.device, " Size", coalesced.size())
 30 |     if extra_args is not None:
 31 |         call(coalesced, *extra_args)
 32 |     else:
 33 |         call(coalesced)
 34 | 
 35 |     if call is dist.all_reduce:
 36 |         coalesced /= dist.get_world_size()
 37 | 
 38 |     for buf, synced in zip(bucket, unflatten(coalesced, bucket)):
 39 |         buf.copy_(synced)
 40 | 
 41 | 
 42 | def split_half_float_double(tensors):
 43 |     dtypes = ["torch.cuda.HalfTensor",
 44 |               "torch.cuda.FloatTensor", "torch.cuda.DoubleTensor"]
 45 |     buckets = []
 46 |     for i, dtype in enumerate(dtypes):
 47 |         bucket = [t for t in tensors if t.type() == dtype]
 48 |         if bucket:
 49 |             buckets.append(bucket)
 50 |     return buckets
 51 | 
 52 | 
 53 | def split_by_type(tensors):
 54 |     buckets = OrderedDict()
 55 |     for tensor in tensors:
 56 |         tp = tensor.type()
 57 |         if tp not in buckets:
 58 |             buckets[tp] = []
 59 |         buckets[tp].append(tensor)
 60 |     return buckets
 61 | 
 62 | # flat_dist_call organizes 'tensors' by type.
 63 | 
 64 | 
 65 | def flat_dist_call(tensors, call, extra_args=None):
 66 |     buckets = split_by_type(tensors)
 67 | 
 68 |     for tp in buckets:
 69 |         bucket = buckets[tp]
 70 |         apply_flat_dist_call(bucket, call, extra_args)
 71 | 
 72 | 
 73 | def extract_tensors(maybe_tensor, tensor_list):
 74 |     if torch.is_tensor(maybe_tensor):
 75 |         tensor_list.append(maybe_tensor)
 76 |     else:
 77 |         try:
 78 |             for item in maybe_tensor:
 79 |                 extract_tensors(item, tensor_list)
 80 |         except TypeError:
 81 |             return
 82 | 
 83 | 
 84 | class Reducer(object):
 85 |     """
 86 |     :class:`apex.parallel.Reducer` is a simple class that helps allreduce a module's parameters
 87 |     across processes.  :class:`Reducer` is intended to give the user additional control:
 88 |     Unlike :class:`DistributedDataParallel`, :class:`Reducer` will not automatically allreduce
 89 |     parameters during ``backward()``.
 90 |     Instead, :class:`Reducer` waits for the user to call `<reducer_instance>.reduce()` manually.
 91 |     This enables, for example, delaying the allreduce to be carried out every 
 92 |     several iterations instead of every single iteration.
 93 | 
 94 |     Like :class:`DistributedDataParallel`, :class:`Reducer` averages any tensors it allreduces 
 95 |     over the number of participating processes.
 96 | 
 97 |     :class:`Reducer` is designed to work with the upstream launch utility script 
 98 |     ``torch.distributed.launch`` with ``--nproc_per_node <= number of gpus per node``.
 99 |     When used with this launcher, :class:`Reducer` assumes 1:1 mapping of processes to GPUs.
100 |     It also assumes that your script calls ``torch.cuda.set_device(args.rank)`` before creating the model.
101 | 
102 |     main_reducer.py in https://github.com/NVIDIA/apex/tree/master/examples/imagenet shows example usage.
103 | 
104 |     Args:
105 |         module_or_grads_list: Either a network definition (module) being run in multi-gpu/distributed mode, or an iterable of gradients to be reduced.  If a module is passed in, the Reducer constructor will sync the parameters across processes (broadcasting from rank 0) to make sure they're all initialized with the same values.  If a list of gradients (that came from some module) is passed in, the user is responsible for manually syncing that module's parameters at the beginning of training.
106 |     """
107 | 
108 |     def __init__(self, module_or_grads_list):
109 |         if isinstance(module_or_grads_list, Module):
110 |             self.module = module_or_grads_list
111 |             flat_dist_call(
112 |                 [param.data for param in self.module.parameters()], dist.broadcast, (0,))
113 | 
114 |         else:
115 |             self.module = None
116 |             self.grads = []
117 |             extract_tensors(module_or_grads_list, self.grads)
118 | 
119 |     def reduce(self):
120 |         if self.module:
121 |             grads = [param.grad.data for param in self.module.parameters()
122 |                      if param.grad is not None]
123 |             flat_dist_call(grads, dist.all_reduce)
124 |         else:
125 |             flat_dist_call(self.grads, dist.all_reduce)
126 | 
127 | 
128 | class DistributedDataParallel(Module):
129 |     """
130 |     :class:`apex.parallel.DistributedDataParallel` is a module wrapper that enables
131 |     easy multiprocess distributed data parallel training, similar to ``torch.nn.parallel.DistributedDataParallel``.  Parameters are broadcast across participating processes on initialization, and gradients are
132 |     allreduced and averaged over processes during ``backward()``.
133 | 
134 |     :class:`DistributedDataParallel` is optimized for use with NCCL.  It achieves high performance by 
135 |     overlapping communication with computation during ``backward()`` and bucketing smaller gradient
136 |     transfers to reduce the total number of transfers required.
137 | 
138 |     :class:`DistributedDataParallel` is designed to work with the upstream launch utility script 
139 |     ``torch.distributed.launch`` with ``--nproc_per_node <= number of gpus per node``.
140 |     When used with this launcher, :class:`DistributedDataParallel` assumes 1:1 mapping of processes to GPUs.
141 |     It also assumes that your script calls ``torch.cuda.set_device(args.rank)`` before creating the model.
142 | 
143 |     https://github.com/NVIDIA/apex/tree/master/examples/distributed shows detailed usage.
144 |     https://github.com/NVIDIA/apex/tree/master/examples/imagenet shows another example
145 |     that combines :class:`DistributedDataParallel` with mixed precision training.
146 | 
147 |     Args:
148 |         module: Network definition to be run in multi-gpu/distributed mode.
149 |         message_size (int, default=1e7): Minimum number of elements in a communication bucket.
150 |         delay_allreduce (bool, default=False):  Delay all communication to the end of the backward pass.  This disables overlapping communication with computation.
151 |         allreduce_trigger_params (list, optional, default=None):  If supplied, should contain a list of parameters drawn from the model.  Allreduces will be kicked off whenever one of these parameters receives its gradient (as opposed to when a bucket of size message_size is full).  At the end of backward(), a cleanup allreduce to catch any remaining gradients will also be performed automatically.  If allreduce_trigger_params is supplied, the message_size argument will be ignored.
152 |         allreduce_always_fp32 (bool, default=False):  Convert any FP16 gradients to FP32 before allreducing.  This can improve stability for widely scaled-out runs.
153 |         gradient_average (bool, default=True):  Option to toggle whether or not DDP averages the allreduced gradients over processes.  For proper scaling, the default value of True is recommended.
154 |         gradient_predivide_factor (float, default=1.0):  Allows perfoming the average of gradients over processes partially before and partially after the allreduce.  Before allreduce:  ``grads.mul_(1.0/gradient_predivide_factor)``.  After allreduce:  ``grads.mul_(gradient_predivide_factor/world size)``.  This can reduce the stress on the dynamic range of FP16 allreduces for widely scaled-out runs.
155 | 
156 |     .. warning::
157 |         If ``gradient_average=False``, the pre-allreduce division (``grads.mul_(1.0/gradient_predivide_factor)``) will still be applied, but the post-allreduce gradient averaging (``grads.mul_(gradient_predivide_factor/world size)``) will be omitted.
158 | 
159 |     """
160 | 
161 |     def __init__(self,
162 |                  module,
163 |                  message_size=10000000,
164 |                  delay_allreduce=False,
165 |                  shared_param=None,
166 |                  allreduce_trigger_params=None,
167 |                  retain_allreduce_buffers=False,
168 |                  allreduce_always_fp32=False,
169 |                  gradient_average=True,
170 |                  gradient_predivide_factor=1.0,
171 |                  gradient_average_split_factor=None):
172 |         super(DistributedDataParallel, self).__init__()
173 | 
174 |         # Backward/forward compatibility around
175 |         # https://github.com/pytorch/pytorch/commit/540ef9b1fc5506369a48491af8a285a686689b36 and
176 |         # https://github.com/pytorch/pytorch/commit/044d00516ccd6572c0d6ab6d54587155b02a3b86
177 |         if hasattr(dist, "get_backend"):
178 |             self._backend = dist.get_backend()
179 |             if hasattr(dist, "DistBackend"):
180 |                 self.backend_enum_holder = dist.DistBackend
181 |             else:
182 |                 self.backend_enum_holder = dist.Backend
183 |         else:
184 |             self._backend = dist._backend
185 |             self.backend_enum_holder = dist.dist_backend
186 | 
187 |         self.warn_on_half = True if self._backend == self.backend_enum_holder.GLOO else False
188 | 
189 |         if shared_param is not None:
190 |             raise ValueError("shared_param is no longer supported as an option.  It was misleadingly named from the start.  It turns out overlapping communication with computation should work fine with shared parameters.  If you still wish to delay communication to the end of the backward pass, use delay_allreduce=True|False instead.")
191 | 
192 |         if gradient_average_split_factor is not None:
193 |             print("Warning:  gradient_average_split_factor has been renamed to gradient_predivide_factor.  For now, gradient_average_split_factor will also work, but please update to gradient_predivide_factor instead.")
194 |             self.gradient_predivide_factor = gradient_average_split_factor
195 | 
196 |         self.world_size = float(dist.get_world_size())
197 | 
198 |         self.retain_allreduce_buffers = retain_allreduce_buffers
199 |         self.allreduce_always_fp32 = allreduce_always_fp32
200 |         self.gradient_average = gradient_average
201 |         self.gradient_predivide_factor = gradient_predivide_factor
202 | 
203 |         self.custom_allreduce_triggers = False
204 |         if allreduce_trigger_params is not None:
205 |             if delay_allreduce:
206 |                 raise ValueError(
207 |                     "Setting allreduce_trigger_params is only valid if delay_allreduce=False.")
208 |             self.custom_allreduce_triggers = True
209 |             self.allreduce_trigger_params = set(
210 |                 [id(param) for param in allreduce_trigger_params])
211 | 
212 |         self.delay_allreduce = delay_allreduce
213 |         self.message_size = message_size
214 | 
215 |         self.reduction_stream = torch.cuda.Stream()
216 |         self.reduction_event = torch.cuda.Event(
217 |             enable_timing=False, blocking=False)
218 | 
219 |         self.module = module
220 | 
221 |         if self._backend == self.backend_enum_holder.NCCL:
222 |             for param in self.module.parameters():
223 |                 assert param.is_cuda, "NCCL backend only supports model parameters to be on GPU."
224 | 
225 |         self.active_params = []
226 | 
227 |         self.param_type_to_tmp_i = {"torch.cuda.HalfTensor": 0,
228 |                                     "torch.cuda.FloatTensor": 1,
229 |                                     "torch.cuda.DoubleTensor": 2}
230 | 
231 |         # to make sure reduction only happens after gradient accumulation
232 |         self.need_reduction = False
233 | 
234 |         self.create_hooks()
235 | 
236 |         flat_dist_call(
237 |             [param.data for param in self.module.parameters()], dist.broadcast, (0,))
238 | 
239 |     def enable_need_reduction(self):
240 |         self.need_reduction = True
241 | 
242 |     def disable_need_reduction(self):
243 |         self.need_reduction = False
244 | 
245 |     def __setstate__(self, state):
246 |         super(DistributedDataParallel, self).__setstate__(state)
247 |         self.reduction_stream = torch.cuda.Stream()
248 |         self.reduction_event = torch.cuda.Event(
249 |             enable_timing=False, blocking=False)
250 | 
251 |     def __getstate__(self):
252 |         attrs = copy.copy(self.__dict__)
253 |         if self._backend != self.backend_enum_holder.NCCL:
254 |             del attrs['self.reduction_stream']
255 |             del attrs['self.reduction_event']
256 |             return attrs
257 | 
258 |     # Broadcast rank 0's bucket structure across all processes, and have all processes
259 |     # regenerate their bucket structures to match.
260 |     def sync_bucket_structure(self):
261 |         # Append leftover buckets
262 |         for tmp_bucket in self.tmp_buckets:
263 |             if len(tmp_bucket) > 0:
264 |                 self.active_i_buckets.append(tmp_bucket)
265 | 
266 |         self.num_buckets = len(self.active_i_buckets)
267 |         self.bucket_sizes = [len(bucket) for bucket in self.active_i_buckets]
268 | 
269 |         info_tensor = torch.cuda.IntTensor([self.num_buckets] +
270 |                                            self.bucket_sizes +
271 |                                            list(chain(*self.active_i_buckets)))
272 |         #print("Sync Bucket Structure Broadcast. Rank", dist.get_rank(), "Tensor Size ", info_tensor.size(), "Device ", info_tensor.device, "Current Device ", torch.cuda.current_device())
273 |         dist.broadcast(info_tensor, 0)
274 | 
275 |         info = [int(entry) for entry in info_tensor]
276 | 
277 |         self.num_buckets = info[0]
278 |         self.bucket_sizes = info[1:self.num_buckets + 1]
279 |         self.buckets = [[None for _ in range(self.bucket_sizes[i])]
280 |                         for i in range(self.num_buckets)]
281 |         # Technically, active_i_buckets' work is done.  But the information is still useful to
282 |         # keep around.  Therefore, refresh active_i_buckets based on rank 0 as well.
283 |         self.active_i_buckets = [[None for _ in range(self.bucket_sizes[i])]
284 |                                  for i in range(self.num_buckets)]
285 | 
286 |         flattened_buckets = info[self.num_buckets + 1:]
287 |         flat_i = 0
288 |         for bucket_idx in range(self.num_buckets):
289 |             for bucket_loc in range(self.bucket_sizes[bucket_idx]):
290 |                 param_i = flattened_buckets[flat_i]
291 |                 self.active_i_buckets[bucket_idx][bucket_loc] = param_i
292 |                 self.param_id_to_bucket[id(self.active_params[param_i])] = (
293 |                     bucket_idx, bucket_loc)
294 |                 flat_i += 1
295 | 
296 |     def create_hooks(self):
297 |         # Fallback hook that's only called at the end of backward.
298 |         # Used if you deliberately want to delay allreduces to the end, or to refresh the
299 |         # bucket structure that will be used to overlap communication with computation in later
300 |         # iterations.
301 |         def allreduce_params():
302 |             # Bucket record refresh
303 |             if not self.delay_allreduce:
304 |                 if self.needs_refresh:
305 |                     self.sync_bucket_structure()
306 | 
307 |                     self.needs_refresh = False
308 | 
309 |             self.allreduce_fallback()
310 | 
311 |         def overlapping_backward_epilogue():
312 |             self.reduction_stream.record_event(self.reduction_event)
313 |             torch.cuda.current_stream().wait_event(self.reduction_event)
314 | 
315 |             # Sanity checks that all the buckets were kicked off
316 |             if self.next_bucket != self.num_buckets:
317 |                 raise RuntimeError("In epilogue, next_bucket ({}) != num_buckets ({}).  ".format(
318 |                                    self.next_bucket, self.num_buckets),
319 |                                    "This probably indicates some buckets were not allreduced.")
320 | 
321 |             for actual, expected in zip(self.buckets_ready_size, self.bucket_sizes):
322 |                 if actual != expected:
323 |                     raise RuntimeError(
324 |                         "Some param buckets were not allreduced.")
325 | 
326 |         self.grad_accs = []
327 |         for param in self.module.parameters():
328 |             if param.requires_grad:
329 |                 def wrapper(param):
330 |                     param_tmp = param.expand_as(param)
331 |                     grad_acc = param_tmp.grad_fn.next_functions[0][0]
332 | 
333 |                     def allreduce_hook(*unused):
334 |                         # user must explicitly specify when to do all reduce
335 |                         if self.need_reduction == False:
336 |                             #print("Does not need Reduction")
337 |                             return
338 |                         #print("Needs Reduction")
339 |                         if self.delay_allreduce or self.needs_refresh:
340 |                             # TODO:  How do we want to handle multiple backward passes between
341 |                             # each forward, e.g., backward passes with retain_graph=True?
342 |                             # needs_refresh and callback_queued are both vulnerable states.
343 |                             if not self.delay_allreduce and self.needs_refresh:
344 |                                 # Use the backward pass to build the bucket structure on the fly.
345 |                                 active_i = self.param_id_to_active_i[id(param)]
346 | 
347 |                                 # Float, half, and double tensors are grouped into buckets separately.
348 |                                 current_type = self.param_type_to_tmp_i[param.type(
349 |                                 )]
350 | 
351 |                                 self.tmp_buckets[current_type].append(active_i)
352 | 
353 |                                 ship_tmp_bucket = False
354 |                                 if self.custom_allreduce_triggers:
355 |                                     if id(param) in self.allreduce_trigger_params:
356 |                                         ship_tmp_bucket = True
357 |                                 else:
358 |                                     self.tmp_numels[current_type] += param.numel()
359 |                                     if self.tmp_numels[current_type] >= self.message_size:
360 |                                         ship_tmp_bucket = True
361 | 
362 |                                 # To consider:  If custom_allreduce_triggers are in use, ship all
363 |                                 # tmp_buckets, not just tmp_buckets[current_type].
364 |                                 if ship_tmp_bucket:
365 |                                     self.active_i_buckets.append(
366 |                                         self.tmp_buckets[current_type])
367 |                                     self.tmp_buckets[current_type] = []
368 |                                     self.tmp_numels[current_type] = 0
369 | 
370 |                             if not self.callback_queued:
371 |                                 Variable._execution_engine.queue_callback(
372 |                                     allreduce_params)
373 |                                 self.callback_queued = True
374 |                         else:
375 |                             if not self.callback_queued:
376 |                                 Variable._execution_engine.queue_callback(
377 |                                     overlapping_backward_epilogue)
378 |                                 self.callback_queued = True
379 | 
380 |                             self.comm_ready_buckets(param)
381 | 
382 |                     grad_acc.register_hook(allreduce_hook)
383 |                     self.grad_accs.append(grad_acc)
384 | 
385 |                 wrapper(param)
386 | 
387 |     def allreduce_bucket(self, bucket):
388 |         tensor = flatten(bucket)
389 | 
390 |         tensor_to_allreduce = tensor
391 | 
392 |         if self.allreduce_always_fp32:
393 |             tensor_to_allreduce = tensor.float()
394 | 
395 |         if self.gradient_predivide_factor != 1.0:
396 |             tensor_to_allreduce.mul_(1./self.gradient_predivide_factor)
397 | 
398 |         dist.all_reduce(tensor_to_allreduce)
399 | 
400 |         if self.gradient_average:
401 |             tensor_to_allreduce.mul_(
402 |                 self.gradient_predivide_factor/self.world_size)
403 | 
404 |         if self.allreduce_always_fp32 and tensor is not tensor_to_allreduce:
405 |             tensor.copy_(tensor_to_allreduce)
406 | 
407 |         return tensor
408 | 
409 |     def allreduce_maybe_retain(self, bucket, bucket_idx=-1):
410 |         allreduced = self.allreduce_bucket(bucket)
411 |         if self.retain_allreduce_buffers:
412 |             if self.allreduce_buffers[bucket_idx] is not None:
413 |                 raise RuntimeError("The backward pass is attempting to replace an already-filled "
414 |                                    "allreduce buffer.  This is almost certainly an error.")
415 |             self.allreduce_buffers[bucket_idx] = allreduced
416 |         else:
417 |             for buf, synced in zip(bucket, unflatten(allreduced, bucket)):
418 |                 buf.copy_(synced)
419 | 
420 |     def allreduce_fallback(self):
421 |         grads = [param.grad.data for param in self.module.parameters()
422 |                  if param.grad is not None]
423 | 
424 |         split_buckets = split_half_float_double(grads)
425 | 
426 |         # If retain_allreduce_buffers is True and delay_allreduce is False,
427 |         # this will only be done during the first backward pass, ignored by the
428 |         # training script, and overwritten in the next forward pass.  So it's harmless.
429 |         if self.retain_allreduce_buffers:
430 |             self.allreduce_buffers = [None for _ in range(len(split_buckets))]
431 | 
432 |         for i, bucket in enumerate(split_buckets):
433 |             allreduced = self.allreduce_maybe_retain(bucket, i)
434 | 
435 |     def comm_ready_buckets(self, param):
436 |         # Need to do this in every hook for compatibility with Ruberry's streaming backward PR.
437 |         # self.reduction_stream.wait_stream(torch.cuda.current_stream())
438 |         #if dist.get_rank() == 0:
439 |         #    print("Parameter Name", param.name)
440 |         bucket_idx, bucket_loc = self.param_id_to_bucket[id(param)]
441 | 
442 |         if self.buckets[bucket_idx][bucket_loc] is not None:
443 |             raise RuntimeError("The backward pass is attempting to replace an already-filled "
444 |                                "bucket slot.  This is almost certainly an error.")
445 | 
446 |         self.buckets[bucket_idx][bucket_loc] = param.grad.data
447 |         self.buckets_ready_size[bucket_idx] += 1
448 | 
449 |         if self.buckets_ready_size[bucket_idx] == self.bucket_sizes[bucket_idx]:
450 |             if bucket_idx == self.next_bucket:
451 |                 torch.cuda.current_stream().record_event(self.reduction_event)
452 |                 self.reduction_stream.wait_event(self.reduction_event)
453 |                 with torch.cuda.stream(self.reduction_stream):
454 |                     self.allreduce_maybe_retain(
455 |                         self.buckets[bucket_idx], bucket_idx)
456 | 
457 |                     self.next_bucket += 1
458 | 
459 |                     # Reversing upstream's logic here, because we constructed our buckets based on
460 |                     # the order things were received during backward.
461 |                     if len(self.ready_buckets_not_reduced) > 0:
462 |                         sorted_todo = sorted(self.ready_buckets_not_reduced)
463 |                         for i in sorted_todo:
464 |                             # Nothing can be reduced now
465 |                             if i > self.next_bucket:
466 |                                 break
467 |                             elif i == self.next_bucket:
468 |                                 self.allreduce_maybe_retain(self.buckets[i], i)
469 |                                 self.ready_buckets_not_reduced.remove(i)
470 |                                 self.next_bucket += 1
471 |                             else:
472 |                                 raise ValueError(
473 |                                     "i should always be >= next_bucket")
474 |             else:
475 |                 self.ready_buckets_not_reduced.add(bucket_idx)
476 | 
477 | 
478 |     def needs_refresh(self):
479 |         self.needs_refresh = True
480 | 
481 |     def forward(self, *inputs, **kwargs):
482 |         result = self.module(*inputs, **kwargs)
483 | 
484 |         if not self.delay_allreduce:
485 |             param_list = [
486 |                 param for param in self.module.parameters() if param.requires_grad]
487 | 
488 |             # Conditions under which to refresh self.record
489 |             # Forward has the authority to set needs_refresh to True, but only allreduce_params
490 |             # in backward has the authority to set needs_refresh to False.
491 |             # Parentheses are not necessary for correct order of operations, but make the intent clearer.
492 |             if ((not self.active_params) or
493 |                 (len(param_list) != len(self.active_params)) or
494 |                     any([param1 is not param2 for param1, param2 in zip(param_list, self.active_params)])):
495 |                 self.needs_refresh = True
496 |             #self.needs_refresh = True
497 |             if self.needs_refresh:
498 |                 self.active_i_buckets = []
499 |                 self.buckets = []
500 |                 # [running half, float, double buckets]
501 |                 self.tmp_buckets = [[], [], []]
502 |                 self.tmp_numels = [0, 0, 0]
503 |                 self.bucket_sizes = []
504 |                 self.param_id_to_active_i = {
505 |                     id(param): i for i, param in enumerate(param_list)}
506 |                 self.param_id_to_bucket = {}
507 |             else:
508 |                 self.buckets = [[None for _ in range(self.bucket_sizes[i])]
509 |                                 for i in range(self.num_buckets)]
510 |                 self.buckets_ready_size = [0 for i in range(self.num_buckets)]
511 |                 if(self.retain_allreduce_buffers):
512 |                     self.allreduce_buffers = [
513 |                         None for _ in range(self.num_buckets)]
514 |                 self.next_bucket = 0
515 |                 self.ready_buckets_not_reduced = set()
516 | 
517 |             self.active_params = param_list
518 | 
519 |         self.callback_queued = False
520 | 
521 |         return result
522 | 


--------------------------------------------------------------------------------
/pretrain/PyTorch/logger.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | 
 4 | 
 5 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
 6 |                     datefmt='%m/%d/%Y %H:%M:%S',
 7 |                     level=logging.INFO)
 8 | logger = logging.getLogger(__name__)
 9 | 
10 | 
11 | class Logger():
12 |     def __init__(self, cuda=False):
13 |         self.logger = logging.getLogger(__name__)
14 |         self.cuda = cuda
15 | 
16 |     def info(self, message, *args, **kwargs):
17 |         local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
18 |         if (self.cuda and local_rank == 0) or not self.cuda:
19 |             self.logger.info(message, *args, **kwargs)
20 | 
21 |     def error(self, message, *args, **kwargs):
22 |         self.logger.error(message, *args, **kwargs)
23 | 


--------------------------------------------------------------------------------
/pretrain/PyTorch/models.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import logging
  3 | import random
  4 | import numpy as np
  5 | import os
  6 | import torch
  7 | import json
  8 | import torch.nn as nn
  9 | import torch.nn.functional as F
 10 | import torch.distributed as dist
 11 | from torch.nn import CrossEntropyLoss, MSELoss
 12 | from logger import Logger
 13 | 
 14 | from dataset import BatchType
 15 | from pytorch_pretrained_bert.tokenization import BertTokenizer
 16 | from pytorch_pretrained_bert.modeling import BertModel, BertConfig
 17 | from pytorch_pretrained_bert.modeling import BertPreTrainingHeads, BertPreTrainedModel, BertPreTrainingHeads, BertLMPredictionHead
 18 | from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
 19 | 
 20 | 
 21 | class BertPretrainingLoss(BertPreTrainedModel):
 22 |     def __init__(self, bert_encoder, config):
 23 |         super(BertPretrainingLoss, self).__init__(config)
 24 |         self.bert = bert_encoder
 25 |         self.cls = BertPreTrainingHeads(
 26 |             config, self.bert.embeddings.word_embeddings.weight)
 27 |         self.cls.apply(self.init_bert_weights)
 28 | 
 29 |     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None):
 30 |         sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
 31 |                                                    output_all_encoded_layers=False)
 32 |         prediction_scores, seq_relationship_score = self.cls(
 33 |             sequence_output, pooled_output)
 34 | 
 35 |         if masked_lm_labels is not None and next_sentence_label is not None:
 36 |             loss_fct = CrossEntropyLoss(ignore_index=-1)
 37 |             next_sentence_loss = loss_fct(
 38 |                 seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
 39 |             masked_lm_loss = loss_fct(
 40 |                 prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
 41 |             total_loss = masked_lm_loss + next_sentence_loss
 42 |             return total_loss
 43 |         else:
 44 |             return prediction_scores, seq_relationship_score
 45 | 
 46 | 
 47 | class MTLRouting(nn.Module):
 48 |     """This setup is to add MultiTask Training support in BERT Training. 
 49 |     """
 50 |     def __init__(self, encoder: BertModel, write_log, summary_writer):
 51 |         super(MTLRouting, self).__init__()
 52 |         self.bert_encoder = encoder
 53 |         self._batch_loss_calculation = nn.ModuleDict()
 54 |         self._batch_counter = {}
 55 |         self._batch_module_name = {}
 56 |         self._batch_name = {}
 57 |         self.write_log = write_log
 58 |         self.logger = Logger(cuda=torch.cuda.is_available())
 59 |         self.summary_writer = summary_writer
 60 | 
 61 |     def register_batch(self, batch_type, module_name, loss_calculation: nn.Module):
 62 |         assert isinstance(loss_calculation, nn.Module)
 63 |         self._batch_loss_calculation[str(batch_type.value)] = loss_calculation
 64 |         self._batch_counter[batch_type] = 0
 65 |         self._batch_module_name[batch_type] = module_name
 66 | 
 67 |     def log_summary_writer(self, batch_type, logs: dict, base='Train'):
 68 |         if self.write_log:
 69 |             counter = self._batch_counter[batch_type]
 70 |             module_name = self._batch_module_name.get(
 71 |                 batch_type, self._get_batch_type_error(batch_type))
 72 |             for key, log in logs.items():
 73 |                 self.summary_writer.add_scalar(
 74 |                     f'{base}/{module_name}/{key}', log, counter)
 75 |             self._batch_counter[batch_type] = counter + 1
 76 | 
 77 |     def _get_batch_type_error(self, batch_type):
 78 |         def f(*args, **kwargs):
 79 |             message = f'Misunderstood batch type of {batch_type}'
 80 |             self.logger.error(message)
 81 |             raise ValueError(message)
 82 |         return f
 83 | 
 84 |     def forward(self, batch, log=True):
 85 |         batch_type = batch[0][0].item()
 86 | 
 87 |         # Pretrain Batch
 88 |         if batch_type == BatchType.PRETRAIN_BATCH:
 89 |             loss_function = self._batch_loss_calculation[str(batch_type)]
 90 | 
 91 |             loss = loss_function(input_ids=batch[1],
 92 |                                  token_type_ids=batch[3],
 93 |                                  attention_mask=batch[2],
 94 |                                  masked_lm_labels=batch[5],
 95 |                                  next_sentence_label=batch[4])
 96 |             if log:
 97 |                 self.log_summary_writer(
 98 |                     batch_type, logs={'pretrain_loss': loss.item()})
 99 |             return loss
100 | 
101 | 
102 | class BertMultiTask:
103 |     def __init__(self, job_config, use_pretrain, tokenizer, cache_dir, device, write_log, summary_writer):
104 |         self.job_config = job_config
105 | 
106 |         if not use_pretrain:
107 |             model_config = self.job_config.get_model_config()
108 |             bert_config = BertConfig(**model_config)
109 |             bert_config.vocab_size = len(tokenizer.vocab)
110 | 
111 |             self.bert_encoder = BertModel(bert_config)
112 |         # Use pretrained bert weights
113 |         else:
114 |             self.bert_encoder = BertModel.from_pretrained(self.job_config.get_model_file_type(), cache_dir=cache_dir)
115 |             bert_config = self.bert_encoder.config
116 | 
117 |         self.network=MTLRouting(self.bert_encoder, write_log = write_log, summary_writer = summary_writer)
118 | 
119 |         #config_data=self.config['data']
120 | 
121 |         # Pretrain Dataset
122 |         self.network.register_batch(BatchType.PRETRAIN_BATCH, "pretrain_dataset", loss_calculation=BertPretrainingLoss(self.bert_encoder, bert_config))
123 | 
124 |         self.device=device
125 |         # self.network = self.network.float()
126 |         # print(f"Bert ID: {id(self.bert_encoder)}  from GPU: {dist.get_rank()}")
127 | 
128 |     def save(self, filename: str):
129 |         network=self.network.module
130 |         return torch.save(network.state_dict(), filename)
131 | 
132 |     def load(self, model_state_dict: str):
133 |         return self.network.module.load_state_dict(torch.load(model_state_dict, map_location=lambda storage, loc: storage))
134 | 
135 |     def move_batch(self, batch, non_blocking=False):
136 |         return batch.to(self.device, non_blocking)
137 | 
138 |     def eval(self):
139 |         self.network.eval()
140 | 
141 |     def train(self):
142 |         self.network.train()
143 | 
144 |     def save_bert(self, filename: str):
145 |         return torch.save(self.bert_encoder.state_dict(), filename)
146 | 
147 |     def to(self, device):
148 |         assert isinstance(device, torch.device)
149 |         self.network.to(device)
150 | 
151 |     def half(self):
152 |         self.network.half()
153 | 


--------------------------------------------------------------------------------
/pretrain/PyTorch/notebooks/BERT_Pretrain.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Pretraining of the BERT model\n",
  8 |     "\n",
  9 |     "This notebook contains an end-to-end walkthrough of using Azure Machine Learning service and pretraining [BERT: Bidirectional Encoder Representations from Transformers](https://arxiv.org/abs/1810.04805) models.\n",
 10 |     "\n",
 11 |     "Methodology:\n",
 12 |     "- Intialize an AzureML workspace\n",
 13 |     "- Register a datastore\n",
 14 |     "- Create an experiment\n",
 15 |     "- Provision a compute target\n",
 16 |     "- Create an Estimator\n",
 17 |     "- Configure and Run"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "## Prerequisites\n",
 25 |     "If you are using an [Azure Machine Learning Notebook VM](https://docs.microsoft.com/en-us/azure/machine-learning/service/quickstart-run-cloud-notebook), you are all set. Otherwise, refer to the [configuration Notebook](https://github.com/Azure/MachineLearningNotebooks/blob/56e0ebc5acb9614fac51d8b98ede5acee8003820/configuration.ipynb) first if you haven't already to establish your connection to the AzureML Workspace. Prerequisites are:\n",
 26 |     "* Azure subscription\n",
 27 |     "* Azure Machine Learning Workspace\n",
 28 |     "* Azure Machine Learning SDK"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "## Library import"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "# Regular python libraries\n",
 45 |     "import os\n",
 46 |     "import requests\n",
 47 |     "import sys\n",
 48 |     "\n",
 49 |     "# AzureML libraries\n",
 50 |     "import azureml.core\n",
 51 |     "from azureml.core import Experiment, Workspace, Datastore, Run\n",
 52 |     "from azureml.core.compute import ComputeTarget, AmlCompute\n",
 53 |     "from azureml.core.compute_target import ComputeTargetException\n",
 54 |     "from azureml.core.conda_dependencies import CondaDependencies\n",
 55 |     "from azureml.core.container_registry import ContainerRegistry\n",
 56 |     "from azureml.core.runconfig import MpiConfiguration, RunConfiguration, DEFAULT_GPU_IMAGE\n",
 57 |     "from azureml.train.dnn import PyTorch\n",
 58 |     "from azureml.train.estimator import Estimator\n",
 59 |     "from azureml.widgets import RunDetails\n",
 60 |     "\n",
 61 |     "# Check core SDK version number\n",
 62 |     "print(\"SDK version:\", azureml.core.VERSION)"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "markdown",
 67 |    "metadata": {},
 68 |    "source": [
 69 |     "## Workspace setup\n",
 70 |     "\n",
 71 |     "Initialize a Workspace object from the existing workspace you created in the Prerequisites step or create a new one."
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": null,
 77 |    "metadata": {},
 78 |    "outputs": [],
 79 |    "source": [
 80 |     "# Retrieve the workspace\n",
 81 |     "ws = Workspace.setup()\n",
 82 |     "\n",
 83 |     "# Print the workspace attributes\n",
 84 |     "print('Workspace name: ' + ws.name, \n",
 85 |     "      'Workspace region: ' + ws.location, \n",
 86 |     "      'Subscription id: ' + ws.subscription_id, \n",
 87 |     "      'Resource group: ' + ws.resource_group, sep = '\\n')"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "markdown",
 92 |    "metadata": {},
 93 |    "source": [
 94 |     "## Datastore registration"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "markdown",
 99 |    "metadata": {},
100 |    "source": [
101 |     "[BERT paper](https://arxiv.org/pdf/1810.04805) references `Wikipedia` and `BookCorpus` datasets for pretraining. This notebook is configured to use Wikipedia dataset only, but can be used with other datasets as well, including custom datasets. The preprocessed data should be available in a `Datastore` in AzureML `Workspace`. \n",
102 |     "\n",
103 |     "The Wikipedia corpus used for BERT pretraining is preprocessed following the [data prep instructions](https://github.com/microsoft/AzureML-BERT/blob/master/docs/dataprep.md) and uploaded to  https://bertonazuremlwestus2.blob.core.windows.net/public2/bert_data.tar.gz (70 GB). You need to extract the files and copy them to another Azure blob container and register it as a workspace to use it in the pretraining job. Additional details on the tar.gz file and the data transfer are available at [artifacts.md](https://github.com/microsoft/AzureML-BERT/blob/master/docs/artifacts.md).\n",
104 |     "\n",
105 |     "Alternatively, you can preprocess the raw data from scratch (instructions available at the [data prep notes](https://github.com/microsoft/AzureML-BERT/blob/master/docs/dataprep.md)), upload that to an Azure blob container and use it as the datastore for the job. \n",
106 |     "\n",
107 |     "Note: it is also possible to use datasets other than Wikipedia corpus with this implementation. \n",
108 |     "\n",
109 |     "The following code assumes that the data is already copied to an Azure blob container with the following directory structure. It is recommended to retain this directory structure to run this notebook without code updates. In case the directory structure is different, the constructor of PyTorch estimator where the datastore is mounted should be modified.\n",
110 |     "\n",
111 |     "       \n",
112 |     "```\n",
113 |     "bert_data\n",
114 |     "│   bert-base.json\n",
115 |     "│   bert-large.json\n",
116 |     "│   bert-base-single-node.json\n",
117 |     "│   bert-large-single-node.json\n",
118 |     "│\n",
119 |     "└───512\n",
120 |     "│   │\n",
121 |     "│   └───wiki_pretrain\n",
122 |     "│       │   wikipedia_segmented_part_0.bin\n",
123 |     "│       │   wikipedia_segmented_part_1.bin\n",
124 |     "│       │   ...\n",
125 |     "│       │   wikipedia_segmented_part_98.bin\n",
126 |     "└───validation_512_only\n",
127 |     "    │\n",
128 |     "    └───validation_set.bin\n",
129 |     "\n",
130 |     "```"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": null,
136 |    "metadata": {},
137 |    "outputs": [],
138 |    "source": [
139 |     "# Register the datastore with the workspace\n",
140 |     "ds = Datastore.register_azure_blob_container(workspace=ws, \n",
141 |     "                                             datastore_name='BERT_Preprocessed_Data',\n",
142 |     "                                             container_name='data',\n",
143 |     "                                             account_name='<name goes here>', \n",
144 |     "                                             account_key='<key goes here>'\n",
145 |     "                                            )\n",
146 |     "\n",
147 |     "# Help from: https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-access-data"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "# Print the workspace attributes\n",
157 |     "print('Datastore name: ' + ds.name, \n",
158 |     "      'Container name: ' + ds.container_name, \n",
159 |     "      'Datastore type: ' + ds.datastore_type, \n",
160 |     "      'Workspace name: ' + ds.workspace.name, sep = '\\n')"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "markdown",
165 |    "metadata": {},
166 |    "source": [
167 |     "## Create an Experiment\n",
168 |     "\n",
169 |     "Experiment is a logical container in an Azure ML Workspace. It hosts run records which can include run metrics and output artifacts from your experiments."
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": null,
175 |    "metadata": {},
176 |    "outputs": [],
177 |    "source": [
178 |     "# Create an experiment\n",
179 |     "experiment_name = 'BERT-pretraining'\n",
180 |     "experiment = Experiment(ws, name=experiment_name)"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "markdown",
185 |    "metadata": {},
186 |    "source": [
187 |     "## Provision a cluster\n",
188 |     "\n",
189 |     "### Introduction to AmlCompute\n",
190 |     "\n",
191 |     "Azure Machine Learning Compute is managed compute infrastructure that allows the user to easily create single to multi-node compute of the appropriate VM Family. It is created within your workspace region and is a resource that can be used by other users in your workspace. It autoscales by default to the max_nodes, when a job is submitted, and executes in a containerized environment packaging the dependencies as specified by the user.\n",
192 |     "\n",
193 |     "Since it is managed compute, job scheduling and cluster management are handled internally by Azure Machine Learning service.\n",
194 |     "\n",
195 |     "For more information on Azure Machine Learning Compute, please read [this](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute).\n",
196 |     "\n",
197 |     "Note: As with other Azure services, there are limits on certain resources (for eg. AmlCompute quota) associated with the Azure Machine Learning service. Please read [this](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "markdown",
202 |    "metadata": {},
203 |    "source": [
204 |     "### Create a compute target\n",
205 |     "BERT pretraining on Azure Machine Learning Service is supported on 16 x `Standard_NC24s_v3` or 8 x `Standard_ND40_v2` VMs. In the next step, you will create a 16 node (i.e. 64 GPUs) AMLCompute cluster of `Standard_NC24s_v3` GPU VMs, if it doesn't already exist in your workspace. The code to create a cluster with 8 `Standard_ND40_v2` VMs is commented out in the cell below.\n",
206 |     "\n",
207 |     "* vm_size: VM family of the nodes provisioned by AmlCompute. Simply choose from the supported_vmsizes() above\n",
208 |     "* max_nodes: Maximum nodes to autoscale to while running a job on AmlCompute\n",
209 |     "* min_nodes: Minimum number of nodes while running a job on AmlCompute"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": null,
215 |    "metadata": {},
216 |    "outputs": [],
217 |    "source": [
218 |     "# Create the compute cluster\n",
219 |     "gpu_cluster_name = \"pretraincluster\" \n",
220 |     "\n",
221 |     "# Verify that the cluster doesn't exist already\n",
222 |     "try:\n",
223 |     "    gpu_compute_target = ComputeTarget(workspace=ws, name=gpu_cluster_name)\n",
224 |     "    print('Found existing compute target.')\n",
225 |     "except ComputeTargetException:\n",
226 |     "    print('Creating a new compute target...')\n",
227 |     "    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC24s_v3', min_nodes=0, max_nodes=16)\n",
228 |     "    # compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_NC40_v2', min_nodes=0, max_nodes=8)\n",
229 |     "    \n",
230 |     "    # create the cluster\n",
231 |     "    gpu_compute_target = ComputeTarget.create(ws, gpu_cluster_name, compute_config)\n",
232 |     "    gpu_compute_target.wait_for_completion(show_output=True)\n",
233 |     "\n",
234 |     "# Use the 'status' property to get a detailed status for the current cluster. \n",
235 |     "#print(gpu_compute_target.status.serialize())"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "markdown",
240 |    "metadata": {},
241 |    "source": [
242 |     "## Estimator definition and run submission\n",
243 |     "\n",
244 |     "The estimator uses a custom docker image and train.py as the entry script for execution.\n",
245 |     "\n",
246 |     "For more information on Estimator, refer [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-train-pytorch)"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": null,
252 |    "metadata": {},
253 |    "outputs": [],
254 |    "source": [
255 |     "# Define the project folder\n",
256 |     "project_folder = '..' # This is to allow the libraries stored under pytorch/ to be loaded\n",
257 |     "\n",
258 |     "## Using a public image published on Azure.\n",
259 |     "image_name = 'mcr.microsoft.com/azureml/bert:pretrain-openmpi3.1.2-cuda10.0-cudnn7-ubuntu16.04'\n",
260 |     "\n",
261 |     "# Using MPI to execute a distributed run\n",
262 |     "mpi = MpiConfiguration()\n",
263 |     "# Standard_NC24s_v3 VM has 4 GPUs. !!!! update this appropriately if you use a different VM size !!!!\n",
264 |     "mpi.process_count_per_node = 4 \n",
265 |     "# !!!! use the following for Standard_NC40_v2 VM !!!!\n",
266 |     "# mpi.process_count_per_node = 8\n",
267 |     "\n",
268 |     "# Define the Pytorch estimator\n",
269 |     "estimator = PyTorch(source_directory=project_folder,\n",
270 |     "                    # Compute configuration\n",
271 |     "                    compute_target=gpu_compute_target,\n",
272 |     "                    node_count=16, \n",
273 |     "                    distributed_training=mpi,\n",
274 |     "                    use_gpu=True,\n",
275 |     "                    \n",
276 |     "                    #Docker image\n",
277 |     "                    use_docker=True,\n",
278 |     "                    custom_docker_image=image_name,\n",
279 |     "                    user_managed=True,\n",
280 |     "                    \n",
281 |     "                    # Training script parameters\n",
282 |     "                    script_params = {\n",
283 |     "                        # Required Params\n",
284 |     "                        \"--config_file\": \"bert-large.json\",\n",
285 |     "                        # bert_data is where pre-processed training data are\n",
286 |     "                        '--train_path' : ds.path('bert_data/512/wiki_pretrain/').as_mount(), \n",
287 |     "                        '--validation_path':ds.path('bert_data/validation_512_only/').as_mount(),\n",
288 |     "                        # Optional Params\n",
289 |     "                        \"--max_seq_length\": 512,\n",
290 |     "                        \"--max_predictions_per_seq\": 80,\n",
291 |     "                        \"--masked_lm_prob\": 0.15,\n",
292 |     "                        \"--train_batch_size\": 64,\n",
293 |     "                        '--seed': 42,\n",
294 |     "                        '--accumulate_gradients': \"True\",\n",
295 |     "                        '--gradient_accumulation_steps': 16,\n",
296 |     "                        '--fp16': \"True\",\n",
297 |     "                        '--loss_scale': 0,\n",
298 |     "                        '--epochs' : 2,\n",
299 |     "                        '--config_file_path' :ds.path('bert_data/').as_mount() ,\n",
300 |     "                        '--output_dir':ds.path(f'bert_data/output/{experiment_name}/').as_mount(),\n",
301 |     "                        '--best_cp_dir':ds.path(f'bert_data/best_cp/{experiment_name}/').as_mount(),\n",
302 |     "                        '--latest_cp_dir':ds.path(f'bert_data/latest_cp/{experiment_name}/').as_mount(),\n",
303 |     "                        '--backend':\"nccl\"\n",
304 |     "                    },\n",
305 |     "                    \n",
306 |     "                    entry_script='train.py',\n",
307 |     "                    inputs=[ds.path('bert_data/').as_mount()]\n",
308 |     "                   )\n",
309 |     "# path to the Python environment in the custom Docker image\n",
310 |     "estimator._estimator_config.environment.python.interpreter_path = '/opt/miniconda/envs/amlbert/bin/python'"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "markdown",
315 |    "metadata": {},
316 |    "source": [
317 |     "For single node (1 NC24s_v3 VM), multi-GPU runs for debugging purposes, use the following configuration:\n",
318 |     "- node_count=1, '--config_file':`bert-base-single-node.json`, '--gradient_accumulation_steps': `64`,\"--train_batch_size\": `1024` (for bert-base)\n",
319 |     "- node_count=1, '--config_file':`bert-large-single-node.json`,'--gradient_accumulation_steps': `256`,\"--train_batch_size\": `1024` (for bert-large)\n",
320 |     "\n",
321 |     "To resume from the latest checkpoint, use `load_training_checkpoint` parameter to pass the checkpoint directory. It will load the latest checkpoint from the directory.\n",
322 |     "\n",
323 |     "'--load_training_checkpoint':ds.path(f'bert_data/latest_cp/{experiment_name}/').as_mount(),"
324 |    ]
325 |   },
326 |   {
327 |    "cell_type": "code",
328 |    "execution_count": null,
329 |    "metadata": {},
330 |    "outputs": [],
331 |    "source": [
332 |     "# Submit the run\n",
333 |     "run = experiment.submit(estimator)\n",
334 |     "RunDetails(run).show()"
335 |    ]
336 |   },
337 |   {
338 |    "cell_type": "code",
339 |    "execution_count": null,
340 |    "metadata": {},
341 |    "outputs": [],
342 |    "source": [
343 |     "# Python packages required to use Tensorboard with AzureML are azureml-tensorboard, tensorboardX and tensorboard\n",
344 |     "# The Tensorboard constructor takes an array of runs, so be sure and pass it in as a single-element array here\n",
345 |     "from azureml.tensorboard import Tensorboard\n",
346 |     "tb = Tensorboard([run])\n",
347 |     "tb.start()"
348 |    ]
349 |   },
350 |   {
351 |    "cell_type": "code",
352 |    "execution_count": null,
353 |    "metadata": {},
354 |    "outputs": [],
355 |    "source": [
356 |     "#tb.stop()"
357 |    ]
358 |   },
359 |   {
360 |    "cell_type": "code",
361 |    "execution_count": null,
362 |    "metadata": {},
363 |    "outputs": [],
364 |    "source": [
365 |     "#run.cancel()"
366 |    ]
367 |   },
368 |   {
369 |    "cell_type": "code",
370 |    "execution_count": null,
371 |    "metadata": {},
372 |    "outputs": [],
373 |    "source": [
374 |     "# Downloading log file to run the perf benchmarking script\n",
375 |     "fetched_run = Run(experiment, run.id)\n",
376 |     "for f in fetched_run.get_file_names():\n",
377 |     "    if \"70_driver_log_rank_0.txt\" in f:\n",
378 |     "        dest = os.path.join('outputs', f.split('/')[-1])\n",
379 |     "        print('Downloading file {} to {}...'.format(f, dest))\n",
380 |     "        fetched_run.download_file(f, dest)   "
381 |    ]
382 |   },
383 |   {
384 |    "cell_type": "code",
385 |    "execution_count": null,
386 |    "metadata": {},
387 |    "outputs": [],
388 |    "source": [
389 |     "# Get average throughput of the training run\n",
390 |     "sys.path.append(os.path.abspath(os.path.join('..', '')))\n",
391 |     "from benchmark import *\n",
392 |     "get_perf_metrics(dest)"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": null,
398 |    "metadata": {},
399 |    "outputs": [],
400 |    "source": []
401 |   }
402 |  ],
403 |  "metadata": {
404 |   "file_extension": ".py",
405 |   "kernelspec": {
406 |    "display_name": "Python 3",
407 |    "language": "python",
408 |    "name": "python3"
409 |   },
410 |   "language_info": {
411 |    "codemirror_mode": {
412 |     "name": "ipython",
413 |     "version": 3
414 |    },
415 |    "file_extension": ".py",
416 |    "mimetype": "text/x-python",
417 |    "name": "python",
418 |    "nbconvert_exporter": "python",
419 |    "pygments_lexer": "ipython3",
420 |    "version": "3.6.9"
421 |   },
422 |   "mimetype": "text/x-python",
423 |   "name": "python",
424 |   "npconvert_exporter": "python",
425 |   "pygments_lexer": "ipython3",
426 |   "version": 3
427 |  },
428 |  "nbformat": 4,
429 |  "nbformat_minor": 2
430 | }


--------------------------------------------------------------------------------
/pretrain/PyTorch/optimization.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | 
 3 | def warmup_linear(x, warmup=0.002):
 4 |     if warmup == 0.0:
 5 |         return 1.0
 6 |     elif x < warmup:
 7 |         return x/warmup
 8 |     return 1.0 - x
 9 | 
10 | 
11 | def warmup_linear_decay_exp(global_step, decay_rate, decay_steps, total_steps, warmup=0.002):
12 |     x = global_step/total_steps
13 |     warmup_end = warmup * total_steps
14 |     if warmup == 0.0:
15 |         return 1.0
16 |     elif x < warmup:
17 |         return x/warmup
18 |     return decay_rate**((global_step-warmup_end)/decay_steps)
19 | 
20 | class LinearWarmupExponentialSchedule():
21 |     def __init__(self, warmup=0.002, t_total=-1, initial_lr = 2e-5, final_lr=5e-6, decay_rate=0.99):
22 |         self.warmup = warmup
23 |         self.total_steps = t_total
24 |         self.decay_rate = decay_rate
25 |         self.warmup_end = self.warmup * t_total
26 | 
27 |         # Calculate the decay Steps
28 |         self.decay_steps = int(math.ceil((math.log(self.decay_rate)/ math.log(final_lr/initial_lr)) * (1.0 - warmup) * t_total))
29 | 
30 |     def get_lr(self, global_step):
31 |         x = global_step/self.total_steps
32 |         if self.warmup == 0.0:
33 |             return 1.0
34 |         elif x < self.warmup:
35 |             return x/self.warmup
36 |         return self.decay_rate**((global_step-self.warmup_end)/self.decay_steps)
37 | 


--------------------------------------------------------------------------------
/pretrain/PyTorch/sources.py:
--------------------------------------------------------------------------------
  1 | from tqdm import tqdm
  2 | from typing import Tuple
  3 | from random import shuffle
  4 | import pickle
  5 | import random
  6 | 
  7 | from pytorch_pretrained_bert.tokenization import BertTokenizer
  8 | 
  9 | 
 10 | def truncate_input_sequence(tokens_a, tokens_b, max_num_tokens):
 11 |     while True:
 12 |         total_length = len(tokens_a) + len(tokens_b)
 13 |         if total_length <= max_num_tokens:
 14 |             break
 15 | 
 16 |         trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
 17 |         assert len(trunc_tokens) >= 1
 18 | 
 19 |         # We want to sometimes truncate from the front and sometimes from the
 20 |         # back to add more randomness and avoid biases.
 21 |         if random.random() < 0.5:
 22 |             del trunc_tokens[0]
 23 |         else:
 24 |             trunc_tokens.pop()
 25 | 
 26 | 
 27 | class TokenInstance:
 28 |     def __init__(self, tokens_a, tokens_b, is_next):
 29 |         self.tokens_a = tokens_a
 30 |         self.tokens_b = tokens_b
 31 |         self.is_next = is_next  # 0 is if in continuation, 1 if is random
 32 | 
 33 |     def get_values(self):
 34 |         return (self.tokens_a, self.tokens_b, self.is_next)
 35 | 
 36 | 
 37 | class PretrainingDataCreator:
 38 |     def __init__(self, path, tokenizer: BertTokenizer,  max_seq_length, readin: int = 2000000, dupe_factor: int = 5, small_seq_prob: float = 0.1):
 39 |         self.dupe_factor = dupe_factor
 40 |         self.max_seq_length = max_seq_length
 41 |         self.small_seq_prob = small_seq_prob
 42 | 
 43 |         documents = []
 44 |         instances = []
 45 |         with open(path, encoding='utf-8') as fd:
 46 |             for i, line in enumerate(tqdm(fd)):
 47 |                 line = line.replace('\n', '')
 48 |                 # Expected format (Q,T,U,S,D)
 49 |                 # query, title, url, snippet, document = line.split('\t')
 50 |                 # ! remove this following line later
 51 |                 document = line
 52 |                 if len(document.split("<sep>")) <= 3:
 53 |                     continue
 54 |                 lines = document.split("<sep>")
 55 |                 document = []
 56 |                 for seq in lines:
 57 |                     document.append(tokenizer.tokenize(seq))
 58 |                 # document = list(map(tokenizer.tokenize, lines))
 59 |                 documents.append(document)
 60 | 
 61 |         documents = [x for x in documents if x]
 62 | 
 63 |         self.documents = documents
 64 |         for _ in range(self.dupe_factor):
 65 |             for index in range(len(self.documents)):
 66 |                 instances.extend(self.create_training_instance(index))
 67 | 
 68 |         shuffle(instances)
 69 |         self.instances = instances
 70 |         self.len = len(self.instances)
 71 |         self.documents = None
 72 |         documents = None
 73 | 
 74 |     def __len__(self):
 75 |         return self.len
 76 | 
 77 |     def __getstate__(self):
 78 |         state = self.__dict__.copy()
 79 |         return state
 80 | 
 81 |     def __setstate__(self, state):
 82 |         self.__dict__.update(state)
 83 | 
 84 |     def save(self, filename):
 85 |         with open(filename, 'wb') as outfile:
 86 |             pickle.dump(self, outfile)
 87 | 
 88 |     @staticmethod
 89 |     def load(filename):
 90 |         print("Loading filename {}".format(filename))
 91 |         with open(filename, 'rb') as f:
 92 |             return pickle.load(f)
 93 | 
 94 |     def create_training_instance(self, index):
 95 |         document = self.documents[index]
 96 | 
 97 |         # Need to add [CLS] + 2*[SEP] tokens
 98 |         max_num_tokens = self.max_seq_length - 3
 99 | 
100 |         # We want to maximize the inp sequence but also want inputs similar
101 |         # to our generic task inputs which will be compartively smaller
102 |         # than the data on which we intend to pre-train.
103 |         target_seq_length = max_num_tokens
104 |         if random.random() < self.small_seq_prob:
105 |             target_seq_length = random.randint(5, max_num_tokens)
106 | 
107 |         # Need to make the sequences split for NSP task for interesting
108 |         # rather than choosing some arbitrary point. If not the NSP
109 |         # task might become way too easy.
110 |         instances = []
111 |         current_chunk = []
112 |         current_length = 0
113 |         i = 0
114 |         while i < len(document):
115 |             segment = document[i]
116 |             current_chunk.append(segment)
117 |             current_length += len(segment)
118 |             if i == len(document)-1 or current_length >= target_seq_length:
119 |                 if current_chunk:
120 |                     # `a_end` is how many segments from `current_chunk` go into the `A`
121 |                     # (first) sentence.
122 |                     a_end = 1
123 |                     if len(current_chunk) >= 2:
124 |                         a_end = random.randint(1, len(current_chunk) - 1)
125 | 
126 |                     tokens_a = []
127 |                     for j in range(a_end):
128 |                         tokens_a.extend(current_chunk[j])
129 | 
130 |                     tokens_b = []
131 | 
132 |                     # Random Next
133 |                     is_random_next = False
134 |                     if len(current_chunk) == 1 or random.random() < 0.5:
135 |                         is_random_next = True
136 |                         target_b_length = target_seq_length - len(tokens_a)
137 | 
138 |                         # Pick a random document
139 |                         for _ in range(10):
140 |                             random_doc_index = random.randint(
141 |                                 0, len(self.documents) - 1)
142 |                             if random_doc_index != index:
143 |                                 break
144 | 
145 |                         random_doc = self.documents[random_doc_index]
146 |                         random_start = random.randint(0, len(random_doc)-1)
147 |                         for j in range(random_start, len(random_doc)):
148 |                             tokens_b.extend(random_doc[j])
149 |                             if len(tokens_b) >= target_b_length:
150 |                                 break
151 | 
152 |                         # We didn't actually use these segments so we "put them back" so
153 |                         # they don't go to waste.
154 |                         num_unused_segments = len(current_chunk) - a_end
155 |                         i -= num_unused_segments
156 | 
157 |                     # Actual Next
158 |                     else:
159 |                         is_random_next = False
160 |                         for j in range(a_end, len(current_chunk)):
161 |                             tokens_b.extend(current_chunk[j])
162 | 
163 |                     truncate_input_sequence(tokens_a, tokens_b, max_num_tokens)
164 | 
165 |                     assert len(tokens_a) >= 1
166 |                     assert len(tokens_b) >= 1
167 | 
168 |                     instances.append(TokenInstance(
169 |                         tokens_a, tokens_b, int(is_random_next)))
170 | 
171 |                 current_chunk = []
172 |                 current_length = 0
173 |             i += 1
174 | 
175 |         return instances
176 | 
177 | 
178 | class GenericPretrainingDataCreator(PretrainingDataCreator):
179 |     def __init__(self, path, tokenizer: BertTokenizer,  max_seq_length: int = 512, readin: int = 2000000, dupe_factor: int = 6, small_seq_prob: float = 0.1):
180 |         self.dupe_factor = dupe_factor
181 |         self.max_seq_length = max_seq_length
182 |         self.small_seq_prob = small_seq_prob
183 | 
184 |         documents = []
185 |         instances = []
186 |         with open(path, encoding='utf-8') as fd:
187 |             document = []
188 |             for i, line in enumerate(tqdm(fd)):
189 |                 line = line.replace('\n', '')
190 |                 # document = line
191 |                 # if len(document.split("<sep>")) <= 3:
192 |                 #     continue
193 |                 if len(line) == 0:  # This is end of document
194 |                     documents.append(document)
195 |                     document = []
196 |                 if len(line.split(' ')) > 2:
197 |                     document.append(tokenizer.tokenize(line))
198 |             if len(document) > 0:
199 |                 documents.append(document)
200 | 
201 |         documents = [x for x in documents if x]
202 |         print(documents[0])
203 |         print(len(documents))
204 |         self.documents = documents
205 |         for _ in range(self.dupe_factor):
206 |             for index in range(len(self.documents)):
207 |                 instances.extend(self.create_training_instance(index))
208 | 
209 |         shuffle(instances)
210 |         self.instances = instances
211 |         self.len = len(self.instances)
212 |         self.documents = None
213 |         documents = None
214 | 
215 | class WikiPretrainingDataCreator(PretrainingDataCreator):
216 |     def __init__(self, path, tokenizer: BertTokenizer,  max_seq_length: int = 512, readin: int = 2000000, dupe_factor: int = 6, small_seq_prob: float = 0.1):
217 |         self.dupe_factor = dupe_factor
218 |         self.max_seq_length = max_seq_length
219 |         self.small_seq_prob = small_seq_prob
220 | 
221 |         documents = []
222 |         instances = []
223 |         with open(path, encoding='utf-8') as fd:
224 |             document = []
225 |             for i, line in enumerate(tqdm(fd)):
226 |                 line = line.replace('\n', '')
227 |                 # document = line
228 |                 # if len(document.split("<sep>")) <= 3:
229 |                 #     continue
230 |                 if len(line) > 0 and line[:2] ==  "[[" : # This is end of document
231 |                     documents.append(document)
232 |                     document = []
233 |                 if len(line.split(' ')) > 2:
234 |                     document.append(tokenizer.tokenize(line))
235 |             if len(document) > 0:
236 |                 documents.append(document)
237 | 
238 |         documents = [x for x in documents if x]
239 |         self.documents = documents
240 |         for _ in range(self.dupe_factor):
241 |             for index in range(len(self.documents)):
242 |                 instances.extend(self.create_training_instance(index))
243 | 
244 |         shuffle(instances)
245 |         self.instances = instances
246 |         self.len = len(self.instances)
247 |         self.documents = None
248 |         documents = None


--------------------------------------------------------------------------------
/pretrain/PyTorch/text.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | PAD = 0
 4 | 
 5 | def mask(x):
 6 |     return x != PAD
 7 | 
 8 | def torch_long(x):
 9 |     return torch.LongTensor(x)
10 | 


--------------------------------------------------------------------------------
/pretrain/PyTorch/train.py:
--------------------------------------------------------------------------------
  1 | from datetime import datetime
  2 | 
  3 | import numpy as np
  4 | import random
  5 | import os
  6 | import sys
  7 | import json
  8 | import shutil
  9 | import torch
 10 | import torch.nn as nn
 11 | import torch.distributed as dist
 12 | from torch.utils.data import DataLoader, Dataset
 13 | from torch.utils.data.sampler import RandomSampler
 14 | from torch.utils.data.distributed import DistributedSampler
 15 | 
 16 | import argparse
 17 | from tqdm import tqdm
 18 | from checkpoint import checkpoint_model, load_checkpoint, latest_checkpoint_file
 19 | from logger import Logger
 20 | from utils import get_sample_writer
 21 | from models import BertMultiTask
 22 | from dataset import PreTrainingDataset
 23 | from dataset import PretrainDataType
 24 | from pytorch_pretrained_bert.tokenization import BertTokenizer
 25 | from pytorch_pretrained_bert.optimization import BertAdam
 26 | from optimization import warmup_linear_decay_exp
 27 | from azureml_adapter import set_environment_variables_for_nccl_backend, get_local_rank, get_global_size, get_local_size
 28 | from sources import PretrainingDataCreator, TokenInstance, GenericPretrainingDataCreator
 29 | from sources import WikiPretrainingDataCreator
 30 | from configuration import BertJobConfiguration
 31 | 
 32 | from azureml.core.run import Run
 33 | 
 34 | 
 35 | def get_effective_batch(total):
 36 |     if use_multigpu_with_single_device_per_process:
 37 |         return total//dist.get_world_size()//train_batch_size//gradient_accumulation_steps
 38 |     else:
 39 |         return total//train_batch_size//gradient_accumulation_steps # Dividing with gradient_accumulation_steps since we multiplied it earlier
 40 | 
 41 | 
 42 | def get_dataloader(dataset: Dataset, eval_set=False):
 43 |     if not use_multigpu_with_single_device_per_process:
 44 |         train_sampler = RandomSampler(dataset)
 45 |     else:
 46 |         train_sampler = DistributedSampler(dataset)
 47 |     return (x for x in DataLoader(dataset, batch_size=train_batch_size // 2 if eval_set else train_batch_size,
 48 |                                   sampler=train_sampler, num_workers=job_config.get_num_workers()))
 49 | 
 50 | 
 51 | def pretrain_validation(index):
 52 |     model.eval()
 53 |     dataset = PreTrainingDataset(tokenizer=tokenizer,
 54 |                                  folder=args.validation_path,
 55 |                                  logger=logger, max_seq_length=max_seq_length,
 56 |                                  index=index, data_type=PretrainDataType.VALIDATION,
 57 |                                  max_predictions_per_seq=max_predictions_per_seq,
 58 |                                  masked_lm_prob=masked_lm_prob)
 59 |     data_batches = get_dataloader(dataset, eval_set=True)
 60 |     eval_loss = 0
 61 |     nb_eval_steps = 0
 62 | 
 63 |     for batch in data_batches:
 64 |         batch = tuple(t.to(device) for t in batch)
 65 |         tmp_eval_loss = model.network(batch, log=False)
 66 |         dist.reduce(tmp_eval_loss, 0)
 67 |         # Reduce to get the loss from all the GPU's
 68 |         tmp_eval_loss = tmp_eval_loss / dist.get_world_size()
 69 |         eval_loss += tmp_eval_loss.mean().item()
 70 |         nb_eval_steps += 1
 71 |     eval_loss = eval_loss / nb_eval_steps
 72 |     logger.info(f"Validation Loss for epoch {index + 1} is: {eval_loss}")
 73 |     if check_write_log():
 74 |         summary_writer.add_scalar(f'Validation/Loss', eval_loss, index + 1)
 75 |         run.log("validation_loss", np.float(eval_loss))
 76 |         run.log_row("validation_loss over epochs", epoch = index, val_loss = np.float(eval_loss))
 77 |     return eval_loss
 78 | 
 79 | 
 80 | def train(index):
 81 |     model.train()
 82 |     dataloaders = {}
 83 |     i = 0
 84 |     global global_step
 85 |     datalengths = []
 86 |     batchs_per_dataset = []
 87 | 
 88 |     # Pretraining datasets
 89 |     wiki_pretrain_dataset = PreTrainingDataset(tokenizer=tokenizer,
 90 |                                                folder=args.train_path,
 91 |                                                logger=logger, max_seq_length=max_seq_length,
 92 |                                                index=index, data_type=PretrainDataType.WIKIPEDIA,
 93 |                                                max_predictions_per_seq=max_predictions_per_seq,
 94 |                                                masked_lm_prob=masked_lm_prob)
 95 | 
 96 |     datalengths.append(len(wiki_pretrain_dataset))
 97 |     dataloaders[i] = get_dataloader(wiki_pretrain_dataset)
 98 | 
 99 |     num_batches_in_dataset = get_effective_batch(len(wiki_pretrain_dataset))
100 |     logger.info('Wikpedia data file: Number of samples {}, number of batches required to process these samples: {}'.format(len(wiki_pretrain_dataset), num_batches_in_dataset))
101 |     
102 |     batchs_per_dataset.append(num_batches_in_dataset)
103 |     i += 1
104 | 
105 |     logger.info("Training on Wikipedia dataset")
106 | 
107 |     total_length = sum(datalengths)
108 | 
109 |     dataset_batches = []
110 |     for i, batch_count in enumerate(batchs_per_dataset):
111 |         dataset_batches.extend([i] * batch_count)
112 |     logger.info('Number of batches to process *all* data samples in this epoch: {}'.format(len(dataset_batches)))
113 |     # shuffle
114 |     random.shuffle(dataset_batches)
115 | 
116 |     # We don't want the dataset to be n the form of alternate chunks if we have more than
117 |     # one dataset type, instead we want to organize them into contiguous chunks of each
118 |     # data type, hence the multiplication with grad_accumulation_steps with dataset_batch_type
119 |     dataset_picker = []
120 |     for dataset_batch_type in dataset_batches:
121 |         dataset_picker.extend([dataset_batch_type] * gradient_accumulation_steps )
122 | 
123 |     logger.info('Number of steps to process all batches in this epoch: {}'.format(len(dataset_picker)))
124 |     model.train()
125 | 
126 |     # Counter of sequences in an "epoch"
127 |     sequences_counter = 0
128 |     global_step_loss = 0
129 | 
130 |     for step, dataset_type in enumerate(dataset_picker):
131 |         try:
132 |             batch = next(dataloaders[dataset_type])
133 | 
134 |             sequences_counter += len(batch)
135 | 
136 |             if n_gpu == 1:
137 |                 batch = tuple(t.to(device) for t in batch)  # Move to GPU
138 | 
139 |             if step > 1 and step % 1000 == 0:
140 |                 logger.info("{} Number of sequences processed so far: {} (cumulative in {} steps)".format(datetime.utcnow(), sequences_counter, step))
141 |             # Calculate forward pass
142 |             loss = model.network(batch)
143 | 
144 |             if n_gpu > 1:
145 |                 # this is to average loss for multi-gpu. In DistributedDataParallel
146 |                 # setting, we get tuple of losses form all proccesses
147 |                 loss = loss.mean()
148 | 
149 |             if gradient_accumulation_steps > 1:
150 |                 loss = loss / gradient_accumulation_steps
151 | 
152 |             # Enabling  optimized Reduction
153 |             # reduction only happens in backward if this method is called before
154 |             # when using the distributed module
155 |             if accumulate_gradients:
156 |                 if use_multigpu_with_single_device_per_process and (step + 1) % gradient_accumulation_steps == 0:
157 |                     model.network.enable_need_reduction()
158 |                 else:
159 |                     model.network.disable_need_reduction()
160 |             if fp16:
161 |                 optimizer.backward(loss)
162 |             else:
163 |                 loss.backward()
164 | 
165 |             global_step_loss += loss
166 |             if (step + 1) % gradient_accumulation_steps == 0:
167 |                 if fp16:
168 |                     # modify learning rate with special warm up BERT uses
169 |                     # if fp16 is False, BertAdam is used that handles this automatically
170 |                     lr_this_step = \
171 |                         job_config.get_learning_rate() * warmup_linear_decay_exp(global_step,
172 |                                                                                  job_config.get_decay_rate(),
173 |                                                                                  job_config.get_decay_step(),
174 |                                                                                  job_config.get_total_training_steps(),
175 |                                                                                  job_config.get_warmup_proportion())
176 |                     for param_group in optimizer.param_groups:
177 |                         param_group['lr'] = lr_this_step
178 | 
179 |                     # Record the LR against global_step on tensorboard
180 |                     if check_write_log():
181 |                         summary_writer.add_scalar(f'Train/lr', lr_this_step, global_step)
182 |                     
183 |                 optimizer.step()
184 |                 optimizer.zero_grad()
185 |                 global_step += 1
186 |                 if check_write_log() and (global_step%args.log_steps == 0):
187 |                     run.log("training_loss", np.float(global_step_loss))
188 |                     run.log("lr_this_step", np.float(lr_this_step))
189 |                     run.log_row("loss over steps", global_step = global_step, loss =  np.float(global_step_loss))
190 |                     run.log_row("lr over steps", global_step = global_step, lr  = np.float(lr_this_step))
191 |                 global_step_loss = 0
192 |         except StopIteration:
193 |             continue
194 |         
195 |     logger.info("Completed {} steps".format(step))
196 |     logger.info("Completed processing {} sequences".format(sequences_counter))
197 | 
198 |     # Run Validation Loss
199 |     if max_seq_length == 512:
200 |         logger.info(f"TRAIN BATCH SIZE: {train_batch_size}")
201 |         return pretrain_validation(index)
202 |     else:
203 |         return None
204 | 
205 | 
206 | def str2bool(val):
207 |     return val.lower() == "true" or val.lower() == "t" or val.lower() == "1"
208 | 
209 | def check_write_log():
210 |     return dist.get_rank() == 0 or not use_multigpu_with_single_device_per_process
211 | 
212 | if __name__ == '__main__':
213 |     print("The arguments are: " + str(sys.argv))
214 | 
215 |     parser = argparse.ArgumentParser()
216 | 
217 |     # Required_parameters
218 |     parser.add_argument("--config_file", "--cf",
219 |                         help="pointer to the configuration file of the experiment", type=str, required=True)
220 | 
221 |     parser.add_argument("--config_file_path", default=None, type=str, required=True,
222 |                         help="The blob storage directory where config file is located.")
223 | 
224 |     parser.add_argument("--train_path", default=None, type=str, required=True,
225 |                         help="The blob storage directory for train data, cache and output.")
226 | 
227 |     parser.add_argument("--validation_path", default=None, type=str, required=True,
228 |                         help="The blob storage directory for validation data, cache and output.")
229 | 
230 |     parser.add_argument('--tokenizer_path', type=str, default=False,
231 |                     help="Path to load the tokenizer from")
232 |     parser.add_argument("--output_dir", default=None, type=str, required=True,
233 |                         help="If given, model checkpoints will be saved to this directory.")
234 |     
235 |     # Optional Params
236 |     parser.add_argument("--best_cp_dir", default=None, type=str,
237 |                         help="If given, model best checkpoint will be saved to this directory.")
238 |     parser.add_argument("--latest_cp_dir", default=None, type=str,
239 |                         help="If given, model latest checkpoint will be saved to this directory.")
240 |     parser.add_argument("--max_seq_length", default=512, type=int,
241 |                         help="The maximum total input sequence length after WordPiece tokenization. Sequences "
242 |                              "longer than this will be truncated, and sequences shorter than this will be padded.")
243 |     parser.add_argument("--max_predictions_per_seq", "--max_pred", default=80, type=int,
244 |                         help="The maximum number of masked tokens in a sequence to be predicted.")
245 |     parser.add_argument("--masked_lm_prob", "--mlm_prob", default=0.15,
246 |                         type=float, help="The masking probability for languge model.")
247 |     parser.add_argument("--train_batch_size", default=32,
248 |                         type=int, help="Total batch size for training.")
249 |     parser.add_argument("--no_cuda",
250 |                         type=str,
251 |                         default='False',
252 |                         help="Whether not to use CUDA when available")
253 |     parser.add_argument('--seed',
254 |                         type=int,
255 |                         default=42,
256 |                         help="random seed for initialization")
257 |     parser.add_argument('--accumulate_gradients',
258 |                         type=str,
259 |                         default='True',
260 |                         help="Enabling gradient accumulation optimization")
261 |     parser.add_argument('--gradient_accumulation_steps',
262 |                         type=int,
263 |                         default=1,
264 |                         help="Number of updates steps to accumulate before performing a backward/update pass.")
265 |     parser.add_argument('--fp16',
266 |                         type=str,
267 |                         default='False',
268 |                         help="Whether to use 16-bit float precision instead of 32-bit")
269 |     parser.add_argument('--use_pretrain',
270 |                         type=str,
271 |                         default='False',
272 |                         help="Whether to use Bert Pretrain Weights or not")
273 |     parser.add_argument('--loss_scale',
274 |                         type=float,
275 |                         default=0,
276 |                         help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
277 |     parser.add_argument('--load_training_checkpoint', '--load_cp',
278 |                         type=str,
279 |                         default='False',
280 |                         help="This is the path to the TAR file which contains model+opt state_dict() checkpointed.")
281 |     parser.add_argument('--use_multigpu_with_single_device_per_process',
282 |                         type=str,
283 |                         default='True',
284 |                         help="Whether only one device is managed per process")	    
285 |     parser.add_argument('--epochs',		
286 |                         type=int,		
287 |                         default=250,		
288 |                         help="total number of epochs")
289 |     parser.add_argument('--log_steps',		
290 |                         type=int,		
291 |                         default=50,		
292 |                         help="logging intervals")
293 |     parser.add_argument('--backend',		
294 |                         type=str,		
295 |                         default='nccl',		
296 |                         help="reduce backend to use")
297 | 
298 |     parser.add_argument('--master_port',		
299 |                         type=int,		
300 |                         default=6105,		
301 |                         help="user specified master port for non-mpi job")
302 |     
303 |     args = parser.parse_args()
304 | 
305 |     if args.output_dir:
306 |         os.makedirs(args.output_dir, exist_ok=True)
307 |     if args.best_cp_dir:
308 |         os.makedirs(args.best_cp_dir, exist_ok=True)
309 |     if args.latest_cp_dir:
310 |         os.makedirs(args.latest_cp_dir, exist_ok=True)
311 | 
312 |     no_cuda = str2bool(args.no_cuda)
313 |     fp16 = str2bool(args.fp16)
314 |     accumulate_gradients = str2bool(args.accumulate_gradients)
315 |     use_pretrain = str2bool(args.use_pretrain)
316 |     use_multigpu_with_single_device_per_process = str2bool(args.use_multigpu_with_single_device_per_process)
317 | 
318 |     config_file = args.config_file
319 |     gradient_accumulation_steps = args.gradient_accumulation_steps
320 |     train_batch_size = args.train_batch_size
321 |     seed = args.seed
322 |     loss_scale = args.loss_scale
323 |     load_training_checkpoint = args.load_training_checkpoint
324 |     max_seq_length = args.max_seq_length
325 |     max_predictions_per_seq = args.max_predictions_per_seq
326 |     masked_lm_prob = args.masked_lm_prob
327 |     master_port = args.master_port
328 | 
329 |     local_rank = -1
330 | 
331 |     local_rank = get_local_rank()
332 |     global_size = get_global_size()
333 |     local_size = get_local_size()	
334 |     # TODO use logger	
335 |     print('local_rank = {}'.format(local_rank))
336 |     print('global_size = {}'.format(global_size))
337 |     print('local_size = {}'.format(local_size))
338 | 
339 |     set_environment_variables_for_nccl_backend(local_size == global_size, master_port)
340 | 
341 |     # Prepare Logger
342 |     logger = Logger(cuda=torch.cuda.is_available())
343 | 
344 |     # # Extact config file from blob storage
345 |     job_config = BertJobConfiguration(config_file_path=os.path.join(args.config_file_path, config_file))
346 | 
347 |     job_name = job_config.get_name()
348 |     # Setting the distributed variables
349 | 
350 |     run = Run.get_context()
351 | 
352 |     if not use_multigpu_with_single_device_per_process:
353 |         device = torch.device("cuda")
354 |         n_gpu = torch.cuda.device_count()
355 |     else:
356 |         device = torch.device("cuda", local_rank)
357 |         n_gpu = 1
358 |         # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
359 |         torch.distributed.init_process_group(backend=args.backend)
360 |         if fp16:
361 |             logger.info("16-bits distributed training is not officially supported in the version of PyTorch currently used, but it works. Refer to https://github.com/pytorch/pytorch/pull/13496 for supported version.")
362 |             fp16 = True  #
363 |     logger.info("device: {} n_gpu: {}, use_multigpu_with_single_device_per_process: {}, 16-bits training: {}".format(
364 |         device, n_gpu, use_multigpu_with_single_device_per_process, fp16))
365 | 
366 |     if gradient_accumulation_steps < 1:
367 |         raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
368 |             gradient_accumulation_steps))
369 | 
370 |     train_batch_size = int(train_batch_size / gradient_accumulation_steps)
371 | 
372 |     # Setting all the seeds so that the task is random but same accross processes
373 |     random.seed(seed)
374 |     np.random.seed(seed)
375 |     torch.manual_seed(seed)
376 |     logger.info
377 |     if n_gpu > 0:
378 |         torch.cuda.manual_seed_all(seed)
379 | 
380 |     # Create an outputs/ folder in the blob storage
381 |     if args.output_dir is None:
382 |         parent_dir = os.path.join(args.output_dir, 'outputs', str(run.experiment.name))
383 |         output_dir = os.path.join(parent_dir, str(run.id))
384 |         os.makedirs(output_dir, exist_ok=True)
385 |         saved_model_path = os.path.join(output_dir, "saved_models", job_name)
386 |         os.makedirs(saved_model_path, exist_ok=True)
387 |     else:
388 |         saved_model_path = args.output_dir
389 | 
390 |     summary_writer = None
391 |     # Prepare Summary Writer and saved_models path
392 |     if check_write_log():
393 |         #azureml.tensorboard only streams from /logs directory, therefore hardcoded
394 |         summary_writer = get_sample_writer(
395 |             name=job_name, base='./logs')
396 | 
397 |     # Loading Tokenizer (vocabulary from blob storage, if exists)
398 |     logger.info("Extracting the vocabulary")
399 |     if args.tokenizer_path:
400 |         logger.info(f'Loading tokenizer from {args.tokenizer_path}')
401 |         tokenizer = BertTokenizer.from_pretrained(
402 |             args.tokenizer_path, cache_dir=args.output_dir)
403 |     else:
404 |         tokenizer = BertTokenizer.from_pretrained(job_config.get_token_file_type(), cache_dir=args.output_dir)
405 |     logger.info("Vocabulary contains {} tokens".format(len(list(tokenizer.vocab.keys()))))
406 | 
407 | 
408 |     # Loading Model
409 |     logger.info("Initializing BertMultiTask model")
410 |     model = BertMultiTask(job_config = job_config, use_pretrain = use_pretrain, tokenizer = tokenizer, 
411 |                           cache_dir = args.output_dir, device = device, write_log = check_write_log(), 
412 |                           summary_writer = summary_writer)
413 | 
414 |     logger.info("Converting the input parameters")
415 |     if fp16:
416 |         model.half()
417 |         
418 |     model.to(device)
419 | 
420 |     if use_multigpu_with_single_device_per_process:
421 |         try:
422 |             if accumulate_gradients:
423 |                 logger.info("Enabling gradient accumulation by using a forked version of DistributedDataParallel implementation available in the branch bertonazureml/apex at https://www.github.com/microsoft/apex")
424 |                 from distributed_apex import DistributedDataParallel as DDP
425 |             else:
426 |                 logger.info("Using Default Apex DistributedDataParallel implementation")
427 |                 from apex.parallel import DistributedDataParallel as DDP
428 |         except ImportError:
429 |             raise ImportError("To use distributed and fp16 training, please install apex from the branch bertonazureml/apex at https://www.github.com/microsoft/apex.")
430 |         torch.cuda.set_device(local_rank)
431 |         model.network = DDP(model.network, delay_allreduce=False)
432 | 
433 |     elif n_gpu > 1:
434 |         model.network = nn.DataParallel(model.network)
435 | 
436 |     # Prepare Optimizer
437 |     logger.info("Preparing the optimizer")
438 |     param_optimizer = list(model.network.named_parameters())
439 |     param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
440 |     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
441 |     optimizer_grouped_parameters = [
442 |         {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
443 |         {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
444 |     ]
445 | 
446 |     logger.info("Loading Apex and building the FusedAdam optimizer")
447 | 
448 |     if fp16:
449 |         try:
450 |             from apex.optimizers import FP16_Optimizer, FusedAdam
451 |         except:
452 |             raise ImportError("To use distributed and fp16 training, please install apex from the branch bertonazureml/apex at https://www.github.com/microsoft/apex.")
453 | 
454 |         optimizer = FusedAdam(optimizer_grouped_parameters,
455 |                               lr=job_config.get_learning_rate(),
456 |                               bias_correction=False,
457 |                               max_grad_norm=1.0)
458 |         if loss_scale == 0:
459 |             optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
460 |         else:
461 |             optimizer = FP16_Optimizer(
462 |                 optimizer, static_loss_scale=loss_scale)
463 |     else:
464 |         optimizer = BertAdam(optimizer_grouped_parameters,
465 |                              lr=job_config.get_learning_rate(),
466 |                              warmup=job_config.get_warmup_proportion(),
467 |                              t_total=job_config.get_total_training_steps())
468 | 
469 |     global_step = 0
470 |     start_epoch = 0
471 |     
472 |     # if args.load_training_checkpoint is not None:
473 |     if load_training_checkpoint != 'False':
474 |         logger.info(f"Looking for previous training checkpoint.")
475 |         latest_checkpoint_path = latest_checkpoint_file(args.load_training_checkpoint, no_cuda)
476 | 
477 |         logger.info(f"Restoring previous training checkpoint from {latest_checkpoint_path}")
478 |         start_epoch, global_step = load_checkpoint(model, optimizer, latest_checkpoint_path)
479 |         logger.info(f"The model is loaded from last checkpoint at epoch {start_epoch} when the global steps were at {global_step}")
480 | 
481 | 
482 |     logger.info("Training the model")
483 | 
484 |     best_loss = None
485 |     for index in range(start_epoch, args.epochs):
486 |         logger.info(f"Training epoch: {index + 1}")
487 |         
488 |         eval_loss = train(index)
489 | 
490 |         if check_write_log():
491 |             if best_loss is None or eval_loss is None or eval_loss < best_loss*0.99:
492 |                 best_loss = eval_loss
493 |                 epoch_ckp_path = os.path.join(saved_model_path, "bert_encoder_epoch_{0:04d}.pt".format(index + 1))
494 |                 checkpoint_model(os.path.join(saved_model_path, "training_state_checkpoint_{0:04d}.tar".format(index + 1)), model, optimizer, index, global_step)
495 |                 logger.info(f"Saving checkpoint of the model from epoch {index + 1} at {epoch_ckp_path}")
496 |                 model.save_bert(epoch_ckp_path)
497 | 
498 |                 #save best checkpoint in separate directory
499 |                 if args.best_cp_dir:
500 |                     best_ckp_path = os.path.join(args.best_cp_dir, "bert_encoder_epoch_{0:04d}.pt".format(index + 1))
501 |                     shutil.rmtree(args.best_cp_dir)
502 |                     os.makedirs(args.best_cp_dir,exist_ok=True)
503 |                     model.save_bert(best_ckp_path)
504 |                 
505 |             if args.latest_cp_dir:
506 |                 shutil.rmtree(args.latest_cp_dir)
507 |                 os.makedirs(args.latest_cp_dir,exist_ok=True)
508 |                 checkpoint_model(os.path.join(args.latest_cp_dir, "training_state_checkpoint_{0:04d}.tar".format(index + 1)), model, optimizer, index, global_step)
509 |                 latest_ckp_path = os.path.join(args.latest_cp_dir, "bert_encoder_epoch_{0:04d}.pt".format(index + 1))
510 |                 model.save_bert(latest_ckp_path)
511 | 


--------------------------------------------------------------------------------
/pretrain/PyTorch/utils.py:
--------------------------------------------------------------------------------
 1 | import sys as _sys
 2 | 
 3 | from typing import List
 4 | from collections import _iskeyword  # type: ignore
 5 | from tensorboardX import SummaryWriter
 6 | import os
 7 | 
 8 | SUMMARY_WRITER_DIR_NAME = 'runs'
 9 | 
10 | 
11 | def get_sample_writer(name, base=".."):
12 |     """Returns a tensorboard summary writer
13 |     """
14 |     return SummaryWriter(log_dir=os.path.join(base, SUMMARY_WRITER_DIR_NAME, name))
15 | 


--------------------------------------------------------------------------------
/pretrain/README.md:
--------------------------------------------------------------------------------
 1 | # Pretrain BERT Model on Azure Machine Learning service
 2 | To pretrain BERT language representation models on AzureML, following artifacts are required:
 3 | - [Azure Machine Learning Workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/setup-create-workspace) with an AzureML Compute cluster with 64 V100 GPUs (either 16 x `NC24s_v3` or 8 x `ND40_v2` VMs). Note that by default your subscription might not have enough quota and you are likely to submit a support ticket to get enough quota by following the guide [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas).
 4 | - Preprocessed data: [BERT paper](https://arxiv.org/pdf/1810.04805) references `Wikipedia` and `BookCorpus` datasets for pretraining. The notebook in this pretrain recipe is configured to use Wikipedia dataset only, but can be used with other datasets as well, including custom datasets.  The preprocessed data should be available in a `Datastore` registered to the AzureML `Workspace` that will be used for BERT pretraining. Preprocessed Wikipedia corpus is made available for use with the pretraining recipe in this repo. Refer to the [instructions](../docs/artifacts.md) to access preprocessed Wikipedia corpus for pretraining. You can copy the Wikipedia dataset from this location to another Azure blob container and register it as a workspace before using it in the pretraining job. Alternatively, you can preprocess the data from scratch (refer to [instructions](../docs/dataprep.md) on this), upload that to an Azure blob container and use it as the datastore for the pretraining job. Note that it is also possible to use other datasets with little or no modifications in this pretraining recipe. 
 5 | - Job configuration to define the parameters for the pretraining job. Refer to [configs](./configs/) directory for different configuration settings (`BERT-base` vs. `BERT-large`, like `single-node configurations for debugging` vs. `multi-node configurations for production-ready pretraining`).
 6 | - Code to pretrain BERT model in AzureML. The notebook to submit a pretrain job to AzureML is available at [BERT_Pretrain.ipynb](./PyTorch/notebooks/BERT_Pretrain.ipynb). 
 7 | 
 8 | ## Submit Pretrain job
 9 | [BERT_Pretrain.ipynb](./PyTorch/notebooks/BERT_Pretrain.ipynb) notebook has the recipe to submit bert-large pretraining job to AzureML service and monitor metrics in Tensorboard.
10 | 


--------------------------------------------------------------------------------
/pretrain/configs/bert-base-single-node.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "bing-bert-base-single-node-4-gpu-4096-bs",
 3 |     "bert_token_file": "bert-base-uncased",
 4 |     "bert_model_file": "bert-base-uncased",
 5 |     "bert_model_config": {
 6 |         "vocab_size_or_config_json_file": 119547,
 7 |         "hidden_size": 768,
 8 |         "num_hidden_layers": 12,
 9 |         "num_attention_heads": 12,
10 |         "intermediate_size": 3072,
11 |         "hidden_act": "gelu",
12 |         "hidden_dropout_prob": 0.1,
13 |         "attention_probs_dropout_prob": 0.1,
14 |         "max_position_embeddings": 512,
15 |         "type_vocab_size": 2,
16 |         "initializer_range": 0.02
17 |     },
18 |     "data": {
19 |         "datasets": {
20 |             "wiki_pretrain_dataset": "placeholder/512/wiki_pretrain"
21 |         }
22 |     },
23 |     "training": {
24 |         "num_epochs": 500,
25 |         "warmup_proportion": 0.1,
26 |         "learning_rate": 4e-4,
27 |         "num_workers": 0,
28 |         "decay_rate": 0.99,
29 |         "decay_step": 520,
30 |         "total_training_steps": 125000
31 |     },
32 |     "validation": {
33 |         "path": "placeholder/validation_512_only"
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/pretrain/configs/bert-base.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "bing-bert-base",
 3 |     "bert_token_file": "bert-base-uncased",
 4 |     "bert_model_file": "bert-base-uncased",
 5 |     "bert_model_config": {
 6 |         "vocab_size_or_config_json_file": 119547,
 7 |         "hidden_size": 768,
 8 |         "num_hidden_layers": 12,
 9 |         "num_attention_heads": 12,
10 |         "intermediate_size": 3072,
11 |         "hidden_act": "gelu",
12 |         "hidden_dropout_prob": 0.1,
13 |         "attention_probs_dropout_prob": 0.1,
14 |         "max_position_embeddings": 512,
15 |         "type_vocab_size": 2,
16 |         "initializer_range": 0.02
17 |     },
18 |     "data": {
19 |         "datasets": {
20 |             "wiki_pretrain_dataset": "placeholder/512/wiki_pretrain"
21 |         }
22 |     },
23 |     "training": {
24 |         "num_epochs": 325,
25 |         "warmup_proportion": 0.1,
26 |         "learning_rate": 4e-4,
27 |         "num_workers": 0,
28 |         "decay_rate": 0.99,
29 |         "decay_step": 520,
30 |         "total_training_steps": 125000
31 |     },
32 |     "validation": {
33 |         "path": "placeholder/validation_512_only"
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/pretrain/configs/bert-large-single-node.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "bing-bert-large-single-node-4-gpu-4096-bs",
 3 |     "bert_token_file": "bert-large-uncased",
 4 |     "bert_model_file": "bert-large-uncased",
 5 |     "bert_model_config": {
 6 |         "vocab_size_or_config_json_file": 119547,
 7 |         "hidden_size": 1024,
 8 |         "num_hidden_layers": 24,
 9 |         "num_attention_heads": 16,
10 |         "intermediate_size": 4096,
11 |         "hidden_act": "gelu",
12 |         "hidden_dropout_prob": 0.1,
13 |         "attention_probs_dropout_prob": 0.1,
14 |         "max_position_embeddings": 512,
15 |         "type_vocab_size": 2,
16 |         "initializer_range": 0.02
17 |     },
18 |     "data": {
19 |         "datasets": {
20 |             "wiki_pretrain_dataset": "placeholder/512/wiki_pretrain"
21 |         }
22 |     },
23 |     "training": {
24 |         "num_epochs": 500,
25 |         "warmup_proportion": 0.02,
26 |         "learning_rate": 2e-4,
27 |         "num_workers": 0,
28 |         "decay_rate": 0.99,
29 |         "decay_step": 1000,
30 |         "total_training_steps": 187000
31 |     },
32 |     "validation": {
33 |         "path": "placeholder/validation_512_only"
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/pretrain/configs/bert-large.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "bing-bert-large",
 3 |     "bert_token_file": "bert-large-uncased",
 4 |     "bert_model_file": "bert-large-uncased",
 5 |     "bert_model_config": {
 6 |         "vocab_size_or_config_json_file": 119547,
 7 |         "hidden_size": 1024,
 8 |         "num_hidden_layers": 24,
 9 |         "num_attention_heads": 16,
10 |         "intermediate_size": 4096,
11 |         "hidden_act": "gelu",
12 |         "hidden_dropout_prob": 0.1,
13 |         "attention_probs_dropout_prob": 0.1,
14 |         "max_position_embeddings": 512,
15 |         "type_vocab_size": 2,
16 |         "initializer_range": 0.02
17 |     },
18 |     "data": {
19 |         "datasets": {
20 |             "wiki_pretrain_dataset": "placeholder/512/wiki_pretrain"
21 |         }
22 |     },
23 |     "training": {
24 |         "num_epochs": 250,
25 |         "warmup_proportion": 0.02,
26 |         "learning_rate": 2e-4,
27 |         "num_workers": 0,
28 |         "decay_rate": 0.99,
29 |         "decay_step": 1000,
30 |         "total_training_steps": 187000
31 |     },
32 |     "validation": {
33 |         "path": "placeholder/validation_512_only"
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------