├── .gitignore
├── 00_DataProcessing.ipynb
├── 01_CreateResources.ipynb
├── Docker
    ├── dockerfile
    ├── environment.yml
    └── jupyter_notebook_config.py
├── HorovodKeras
    ├── 00_CreateImageAndTest.ipynb
    ├── 01_TrainKerasModel.ipynb
    ├── Docker
    │   └── Dockerfile
    └── src
    │   ├── data_generator.py
    │   └── imagenet_keras_horovod.py
├── HorovodPytorch
    ├── 00_CreateImageAndTest.ipynb
    ├── 01_TrainPyTorchModel.ipynb
    ├── Docker
    │   └── Dockerfile
    ├── cluster_config
    │   ├── cluster.json
    │   ├── docker.service
    │   └── nodeprep.sh
    └── src
    │   └── imagenet_pytorch_horovod.py
├── HorovodTF
    ├── 00_CreateImageAndTest.ipynb
    ├── 01_TrainTensorflowModel.ipynb
    ├── Docker
    │   └── Dockerfile
    └── src
    │   ├── imagenet_estimator_tf_horovod.py
    │   └── resnet_model.py
├── LICENSE
├── Makefile
├── README.md
├── common
    ├── timer.py
    └── utils.py
├── images
    └── dist_training_diag2.png
├── include
    └── build.mk
└── valprep.sh


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/00_DataProcessing.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "cells": [
  3 |         {
  4 |             "cell_type": "markdown",
  5 |             "metadata": {},
  6 |             "source": "# Data Processing\nIn this notebook we convert the ImageNet data to the appropriate format so that we can use it for training.\n\nThe dataset has many versions, the one commonly used for image classification is ILSVRC 2012. Go to the [download page](http://www.image-net.org/download-images) (you may need to register an account), and find the page for ILSVRC2012. You will need to download two files ILSVRC2012_img_train.tar and ILSVRC2012_img_val.tar"
  7 |         },
  8 |         {
  9 |             "cell_type": "code",
 10 |             "execution_count": null,
 11 |             "metadata": {},
 12 |             "outputs": [],
 13 |             "source": "from pathlib import Path"
 14 |         },
 15 |         {
 16 |             "cell_type": "code",
 17 |             "execution_count": null,
 18 |             "metadata": {},
 19 |             "outputs": [],
 20 |             "source": "DATA=Path(\"/data\")"
 21 |         },
 22 |         {
 23 |             "cell_type": "code",
 24 |             "execution_count": null,
 25 |             "metadata": {},
 26 |             "outputs": [],
 27 |             "source": "!mkdir -p {DATA/\"train\"}\n!tar -C {DATA/\"train\"} -xf {DATA/\"ILSVRC2012_img_train.tar\"}"
 28 |         },
 29 |         {
 30 |             "cell_type": "code",
 31 |             "execution_count": null,
 32 |             "metadata": {},
 33 |             "outputs": [],
 34 |             "source": "import tarfile\nfrom tqdm import tqdm_notebook\nimport os"
 35 |         },
 36 |         {
 37 |             "cell_type": "code",
 38 |             "execution_count": null,
 39 |             "metadata": {},
 40 |             "outputs": [],
 41 |             "source": "filenames = list((DATA/\"train\").glob(\"*.tar\"))\npbar = tqdm_notebook(total=len(filenames))\nfor class_tar in filenames:\n    pbar.set_description('Extracting '+class_tar.name+ '...')\n    class_dir = os.path.splitext(class_tar)[0]\n    os.mkdir(class_dir)\n    with tarfile.open(class_tar) as f:\n        f.extractall(class_dir)\n    os.remove(class_tar)\n    pbar.update(1)"
 42 |         },
 43 |         {
 44 |             "cell_type": "code",
 45 |             "execution_count": null,
 46 |             "metadata": {},
 47 |             "outputs": [],
 48 |             "source": "!rm -r {DATA/\"validation\"}"
 49 |         },
 50 |         {
 51 |             "cell_type": "code",
 52 |             "execution_count": null,
 53 |             "metadata": {},
 54 |             "outputs": [],
 55 |             "source": "!mkdir -p {DATA/\"validation\"}\n!tar -C {DATA/\"validation\"} -xf {DATA/\"ILSVRC2012_img_val.tar\"}"
 56 |         },
 57 |         {
 58 |             "cell_type": "markdown",
 59 |             "metadata": {},
 60 |             "source": "The validation data comes without labels so wee ned to run a script to asign the images to the appropriate classes."
 61 |         },
 62 |         {
 63 |             "cell_type": "code",
 64 |             "execution_count": null,
 65 |             "metadata": {},
 66 |             "outputs": [],
 67 |             "source": "validation_path = DATA/\"validation\"\nvalidation_preparation_script = Path(os.getcwd())/\"valprep.sh\""
 68 |         },
 69 |         {
 70 |             "cell_type": "code",
 71 |             "execution_count": null,
 72 |             "metadata": {},
 73 |             "outputs": [],
 74 |             "source": "!bash -c \"cd {validation_path} && {validation_preparation_script}\""
 75 |         },
 76 |         {
 77 |             "cell_type": "markdown",
 78 |             "metadata": {},
 79 |             "source": "Finally we package the processed directories so that we can upload them quicker."
 80 |         },
 81 |         {
 82 |             "cell_type": "code",
 83 |             "execution_count": null,
 84 |             "metadata": {},
 85 |             "outputs": [],
 86 |             "source": "!cd {DATA} && tar -czvf train.tar.gz train"
 87 |         },
 88 |         {
 89 |             "cell_type": "code",
 90 |             "execution_count": null,
 91 |             "metadata": {},
 92 |             "outputs": [],
 93 |             "source": "!cd {DATA} && tar -czvf validation.tar.gz validation"
 94 |         }
 95 |     ],
 96 |     "metadata": {
 97 |         "jupytext": {
 98 |             "text_representation": {
 99 |                 "extension": ".py",
100 |                 "format_name": "light",
101 |                 "format_version": "1.3",
102 |                 "jupytext_version": "0.8.6"
103 |             }
104 |         },
105 |         "kernelspec": {
106 |             "display_name": "Python 3",
107 |             "language": "python",
108 |             "name": "python3"
109 |         }
110 |     },
111 |     "nbformat": 4,
112 |     "nbformat_minor": 2
113 | }
114 | 


--------------------------------------------------------------------------------
/01_CreateResources.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "cells": [
  3 |         {
  4 |             "cell_type": "markdown",
  5 |             "metadata": {},
  6 |             "source": "# Create Azure and Batch AI Resources\nIn this notebook we will create the necessary resources to train a ResNet50 model([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the ImageNet dataset. If you plan on using fake data then the sections marked optional can be skipped. This notebook will take you through the following steps:\n * [Create Azure Resources](#azure_resources)\n * [Create Fileserver(NFS)](#create_fileshare)\n * [Upload Data to Blob (Optional)](#upload_data)\n * [Configure Batch AI Cluster](#configure_cluster)"
  7 |         },
  8 |         {
  9 |             "cell_type": "code",
 10 |             "execution_count": null,
 11 |             "metadata": {},
 12 |             "outputs": [],
 13 |             "source": "import sys\nsys.path.append(\"common\") \n\nfrom dotenv import set_key\nimport os\nimport json\nfrom utils import get_password, dotenv_for\nfrom pathlib import Path"
 14 |         },
 15 |         {
 16 |             "cell_type": "markdown",
 17 |             "metadata": {},
 18 |             "source": "Below are the variables that describe our experiment. By default we are using the NC24rs_v3 (Standard_NC24rs_v3) VMs which have V100 GPUs and Infiniband. By default we are using 2 nodes with each node having 4 GPUs, this equates to 8 GPUs. Feel free to increase the number of nodes but be aware what limitations your subscription may have.\n\nSet the USE_FAKE to True if you want to use fake data rather than the Imagenet dataset. This is often a good way to debug your models as well as checking what IO overhead is."
 19 |         },
 20 |         {
 21 |             "cell_type": "code",
 22 |             "execution_count": null,
 23 |             "metadata": {
 24 |                 "tags": [
 25 |                     "parameters"
 26 |                 ]
 27 |             },
 28 |             "outputs": [],
 29 |             "source": "# Variables for Batch AI - change as necessary\nID                     = \"dtdemo\"\nGROUP_NAME             = f\"batch{ID}rg\"\nSTORAGE_ACCOUNT_NAME   = f\"batch{ID}st\"\nFILE_SHARE_NAME        = f\"batch{ID}share\"\nSELECTED_SUBSCRIPTION  = \"<YOUR_SUBSCRIPTION>\"\nWORKSPACE              = \"workspace\"\nNUM_NODES              = 2\nCLUSTER_NAME           = \"msv100\"\nVM_SIZE                = \"Standard_NC24rs_v3\"\nGPU_TYPE               = \"V100\"\nPROCESSES_PER_NODE     = 4\nLOCATION               = \"eastus\"\nNFS_NAME               = f\"batch{ID}nfs\"\nUSERNAME               = \"batchai_user\"\nUSE_FAKE               = False\nDOCKERHUB              = os.getenv('DOCKER_REPOSITORY', \"masalvar\")\nDATA                   = Path(\"/data\")\nCONTAINER_NAME         = f\"batch{ID}container\"\nDOCKER_PWD             = \"<YOUR_DOCKER_PWD>\"\n\ndotenv_path = dotenv_for()\nset_key(dotenv_path, 'DOCKER_PWD', DOCKER_PWD)\nset_key(dotenv_path, 'GROUP_NAME', GROUP_NAME)\nset_key(dotenv_path, 'FILE_SHARE_NAME', FILE_SHARE_NAME)\nset_key(dotenv_path, 'WORKSPACE', WORKSPACE)\nset_key(dotenv_path, 'NUM_NODES', str(NUM_NODES))\nset_key(dotenv_path, 'CLUSTER_NAME', CLUSTER_NAME)\nset_key(dotenv_path, 'GPU_TYPE', GPU_TYPE)\nset_key(dotenv_path, 'PROCESSES_PER_NODE', str(PROCESSES_PER_NODE))\nset_key(dotenv_path, 'STORAGE_ACCOUNT_NAME', STORAGE_ACCOUNT_NAME)"
 30 |         },
 31 |         {
 32 |             "cell_type": "markdown",
 33 |             "metadata": {},
 34 |             "source": "<a id='azure_resources'></a>\n## Create Azure Resources\nFirst we need to log in to our Azure account. "
 35 |         },
 36 |         {
 37 |             "cell_type": "code",
 38 |             "execution_count": null,
 39 |             "metadata": {
 40 |                 "tags": [
 41 |                     "stripout"
 42 |                 ]
 43 |             },
 44 |             "outputs": [],
 45 |             "source": "!az login -o table"
 46 |         },
 47 |         {
 48 |             "cell_type": "markdown",
 49 |             "metadata": {},
 50 |             "source": "If you have more than one Azure account you will need to select it with the command below. If you only have one account you can skip this step."
 51 |         },
 52 |         {
 53 |             "cell_type": "code",
 54 |             "execution_count": null,
 55 |             "metadata": {},
 56 |             "outputs": [],
 57 |             "source": "!az account set --subscription \"$SELECTED_SUBSCRIPTION\""
 58 |         },
 59 |         {
 60 |             "cell_type": "code",
 61 |             "execution_count": null,
 62 |             "metadata": {
 63 |                 "tags": [
 64 |                     "stripout"
 65 |                 ]
 66 |             },
 67 |             "outputs": [],
 68 |             "source": "!az account list -o table"
 69 |         },
 70 |         {
 71 |             "cell_type": "markdown",
 72 |             "metadata": {},
 73 |             "source": "Next we create the group that will hold all our Azure resources."
 74 |         },
 75 |         {
 76 |             "cell_type": "code",
 77 |             "execution_count": null,
 78 |             "metadata": {},
 79 |             "outputs": [],
 80 |             "source": "!az group create -n $GROUP_NAME -l $LOCATION -o table"
 81 |         },
 82 |         {
 83 |             "cell_type": "markdown",
 84 |             "metadata": {},
 85 |             "source": "We will create the storage account that will store our fileshare where all the outputs from the jobs will be stored."
 86 |         },
 87 |         {
 88 |             "cell_type": "code",
 89 |             "execution_count": null,
 90 |             "metadata": {},
 91 |             "outputs": [],
 92 |             "source": "json_data = !az storage account create -l $LOCATION -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME --sku Standard_LRS\nprint('Storage account {} provisioning state: {}'.format(STORAGE_ACCOUNT_NAME, \n                                                         json.loads(''.join(json_data))['provisioningState']))"
 93 |         },
 94 |         {
 95 |             "cell_type": "code",
 96 |             "execution_count": null,
 97 |             "metadata": {},
 98 |             "outputs": [],
 99 |             "source": "json_data = !az storage account keys list -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME\nstorage_account_key = json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['value']"
100 |         },
101 |         {
102 |             "cell_type": "code",
103 |             "execution_count": null,
104 |             "metadata": {},
105 |             "outputs": [],
106 |             "source": "!az storage share create --account-name $STORAGE_ACCOUNT_NAME \\\n--account-key $storage_account_key --name $FILE_SHARE_NAME"
107 |         },
108 |         {
109 |             "cell_type": "code",
110 |             "execution_count": null,
111 |             "metadata": {},
112 |             "outputs": [],
113 |             "source": "!az storage directory create --share-name $FILE_SHARE_NAME  --name scripts \\\n--account-name $STORAGE_ACCOUNT_NAME --account-key $storage_account_key"
114 |         },
115 |         {
116 |             "cell_type": "markdown",
117 |             "metadata": {},
118 |             "source": "Here we are setting some defaults so we don't have to keep adding them to every command"
119 |         },
120 |         {
121 |             "cell_type": "code",
122 |             "execution_count": null,
123 |             "metadata": {},
124 |             "outputs": [],
125 |             "source": "!az configure --defaults location=$LOCATION\n!az configure --defaults group=$GROUP_NAME"
126 |         },
127 |         {
128 |             "cell_type": "code",
129 |             "execution_count": null,
130 |             "metadata": {
131 |                 "tags": [
132 |                     "stripout"
133 |                 ]
134 |             },
135 |             "outputs": [],
136 |             "source": "%env AZURE_STORAGE_ACCOUNT $STORAGE_ACCOUNT_NAME\n%env AZURE_STORAGE_KEY=$storage_account_key"
137 |         },
138 |         {
139 |             "cell_type": "markdown",
140 |             "metadata": {},
141 |             "source": "#### Create Workspace\nBatch AI has the concept of workspaces and experiments. Below we will create the workspace for our work."
142 |         },
143 |         {
144 |             "cell_type": "code",
145 |             "execution_count": null,
146 |             "metadata": {
147 |                 "tags": [
148 |                     "stripout"
149 |                 ]
150 |             },
151 |             "outputs": [],
152 |             "source": "!az batchai workspace create -n $WORKSPACE -g $GROUP_NAME"
153 |         },
154 |         {
155 |             "cell_type": "markdown",
156 |             "metadata": {},
157 |             "source": "<a id='upload_data'></a>\n## Upload Data to Blob (Optional)\nIn this section we will create a blob container and upload the imagenet data we prepared locally in the previous notebook.\n\n**You only need to run this section if you want to use real data. If USE_FAKE is set to False the commands below won't be executed.**\n"
158 |         },
159 |         {
160 |             "cell_type": "code",
161 |             "execution_count": null,
162 |             "metadata": {},
163 |             "outputs": [],
164 |             "source": "if USE_FAKE is False:\n    !az storage container create --account-name {STORAGE_ACCOUNT_NAME} \\\n                                 --account-key {storage_account_key} \\\n                                 --name {CONTAINER_NAME}"
165 |         },
166 |         {
167 |             "cell_type": "code",
168 |             "execution_count": null,
169 |             "metadata": {
170 |                 "tags": [
171 |                     "stripout"
172 |                 ]
173 |             },
174 |             "outputs": [],
175 |             "source": "if USE_FAKE is False:\n    # Should take about 20 minutes\n    !azcopy --source {DATA/\"train.tar.gz\"} \\\n    --destination https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/train.tar.gz \\\n    --dest-key {storage_account_key} --quiet"
176 |         },
177 |         {
178 |             "cell_type": "code",
179 |             "execution_count": null,
180 |             "metadata": {
181 |                 "tags": [
182 |                     "stripout"
183 |                 ]
184 |             },
185 |             "outputs": [],
186 |             "source": "if USE_FAKE is False:\n    !azcopy --source {DATA/\"validation.tar.gz\"} \\\n    --destination https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/validation.tar.gz \\\n    --dest-key {storage_account_key} --quiet"
187 |         },
188 |         {
189 |             "cell_type": "markdown",
190 |             "metadata": {},
191 |             "source": "<a id='create_fileshare'></a>\n## Create Fileserver\nIn this example we will store the data on an NFS fileshare. It is possible to use many storage solutions with Batch AI. NFS offers the best tradeoff between performance and ease of use. The best performance is achieved by loading the data locally but this can be cumbersome since it requires that the data is download by the all the nodes which with the ImageNet dataset can take hours. If you are using fake data we won't be using the fileserver but we will create one so that if you want to run the real ImageNet data later the server is ready."
192 |         },
193 |         {
194 |             "cell_type": "code",
195 |             "execution_count": null,
196 |             "metadata": {
197 |                 "tags": [
198 |                     "stripout"
199 |                 ]
200 |             },
201 |             "outputs": [],
202 |             "source": "!az batchai file-server create -n $NFS_NAME --disk-count 4 --disk-size 250 -w $WORKSPACE \\\n-s Standard_DS4_v2 -u $USERNAME -p {get_password(dotenv_for())} -g $GROUP_NAME --storage-sku Premium_LRS"
203 |         },
204 |         {
205 |             "cell_type": "code",
206 |             "execution_count": null,
207 |             "metadata": {},
208 |             "outputs": [],
209 |             "source": "!az batchai file-server list -o table -w $WORKSPACE -g $GROUP_NAME"
210 |         },
211 |         {
212 |             "cell_type": "code",
213 |             "execution_count": null,
214 |             "metadata": {},
215 |             "outputs": [],
216 |             "source": "json_data = !az batchai file-server list -w $WORKSPACE -g $GROUP_NAME\nnfs_ip=json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['mountSettings']['fileServerPublicIp']"
217 |         },
218 |         {
219 |             "cell_type": "markdown",
220 |             "metadata": {},
221 |             "source": "After we have created the NFS share we need to copy the data to it. To do this we write the script below which will be executed on the fileserver. It installs a tool called azcopy and then downloads and extracts the data to the appropriate directory."
222 |         },
223 |         {
224 |             "cell_type": "code",
225 |             "execution_count": null,
226 |             "metadata": {},
227 |             "outputs": [],
228 |             "source": "nodeprep_script = f\"\"\"\n#!/usr/bin/env bash\nwget https://gist.githubusercontent.com/msalvaris/073c28a9993d58498957294d20d74202/raw/87a78275879f7c9bb8d6fb9de8a2d2996bb66c24/install_azcopy\nchmod 777 install_azcopy\nsudo ./install_azcopy\n\nmkdir -p /data/imagenet\n\nazcopy --source https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/validation.tar.gz \\\n        --destination  /data/imagenet/validation.tar.gz\\\n        --source-key {storage_account_key}\\\n        --quiet\n\n\nazcopy --source https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/train.tar.gz \\\n        --destination  /data/imagenet/train.tar.gz\\\n        --source-key {storage_account_key}\\\n        --quiet\n\ncd /data/imagenet\ntar -xzf train.tar.gz\ntar -xzf validation.tar.gz\n\"\"\""
229 |         },
230 |         {
231 |             "cell_type": "code",
232 |             "execution_count": null,
233 |             "metadata": {},
234 |             "outputs": [],
235 |             "source": "with open('nodeprep.sh', 'w') as f:\n    f.write(nodeprep_script)"
236 |         },
237 |         {
238 |             "cell_type": "markdown",
239 |             "metadata": {
240 |                 "lines_to_next_cell": 2
241 |             },
242 |             "source": "Next we will copy the file over and run it on the NFS VM. This will install azcopy and download and prepare the data"
243 |         },
244 |         {
245 |             "cell_type": "code",
246 |             "execution_count": null,
247 |             "metadata": {},
248 |             "outputs": [],
249 |             "source": "if USE_FAKE:\n    raise Warning(\"You should not be running this section if you simply want to use fake data\")"
250 |         },
251 |         {
252 |             "cell_type": "code",
253 |             "execution_count": null,
254 |             "metadata": {
255 |                 "tags": [
256 |                     "stripout"
257 |                 ]
258 |             },
259 |             "outputs": [],
260 |             "source": "if USE_FAKE is False:\n    !sshpass -p {get_password(dotenv_for())} scp -o \"StrictHostKeyChecking=no\" nodeprep.sh $USERNAME@{nfs_ip}:~/"
261 |         },
262 |         {
263 |             "cell_type": "code",
264 |             "execution_count": null,
265 |             "metadata": {
266 |                 "tags": [
267 |                     "stripout"
268 |                 ]
269 |             },
270 |             "outputs": [],
271 |             "source": "if USE_FAKE is False:\n    !sshpass -p {get_password(dotenv_for())} ssh -o \"StrictHostKeyChecking=no\" $USERNAME@{nfs_ip} \"sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh\""
272 |         },
273 |         {
274 |             "cell_type": "markdown",
275 |             "metadata": {},
276 |             "source": "<a id='configure_cluster'></a>\n## Configure Batch AI Cluster\nWe then upload the scripts we wish to execute onto the fileshare. The fileshare will later be mounted by Batch AI. An alternative to uploading the scripts would be to embedd them inside the Docker image."
277 |         },
278 |         {
279 |             "cell_type": "code",
280 |             "execution_count": null,
281 |             "metadata": {},
282 |             "outputs": [],
283 |             "source": "!az storage file upload --share-name $FILE_SHARE_NAME --source HorovodPytorch/cluster_config/docker.service --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source HorovodPytorch/cluster_config/nodeprep.sh --path scripts"
284 |         },
285 |         {
286 |             "cell_type": "markdown",
287 |             "metadata": {},
288 |             "source": "Below it the command to create the cluster. "
289 |         },
290 |         {
291 |             "cell_type": "code",
292 |             "execution_count": null,
293 |             "metadata": {
294 |                 "tags": [
295 |                     "stripout"
296 |                 ]
297 |             },
298 |             "outputs": [],
299 |             "source": "!az batchai cluster create \\\n    -w $WORKSPACE \\\n    --name $CLUSTER_NAME \\\n    --image UbuntuLTS \\\n    --vm-size $VM_SIZE \\\n    --min $NUM_NODES --max $NUM_NODES \\\n    --afs-name $FILE_SHARE_NAME \\\n    --afs-mount-path extfs \\\n    --user-name $USERNAME \\\n    --password {get_password(dotenv_for())} \\\n    --storage-account-name $STORAGE_ACCOUNT_NAME \\\n    --storage-account-key $storage_account_key \\\n    --nfs $NFS_NAME \\\n    --nfs-mount-path nfs \\\n    --config-file HorovodPytorch/cluster_config/cluster.json"
300 |         },
301 |         {
302 |             "cell_type": "markdown",
303 |             "metadata": {},
304 |             "source": "Let's check that the cluster was created succesfully."
305 |         },
306 |         {
307 |             "cell_type": "code",
308 |             "execution_count": null,
309 |             "metadata": {
310 |                 "tags": [
311 |                     "stripout"
312 |                 ]
313 |             },
314 |             "outputs": [],
315 |             "source": "!az batchai cluster show -n $CLUSTER_NAME -w $WORKSPACE"
316 |         },
317 |         {
318 |             "cell_type": "code",
319 |             "execution_count": null,
320 |             "metadata": {},
321 |             "outputs": [],
322 |             "source": "!az batchai cluster list -w $WORKSPACE -o table"
323 |         },
324 |         {
325 |             "cell_type": "code",
326 |             "execution_count": null,
327 |             "metadata": {},
328 |             "outputs": [],
329 |             "source": "!az batchai cluster node list -c $CLUSTER_NAME -w $WORKSPACE -o table"
330 |         }
331 |     ],
332 |     "metadata": {
333 |         "jupytext": {
334 |             "text_representation": {
335 |                 "extension": ".py",
336 |                 "format_name": "light",
337 |                 "format_version": "1.3",
338 |                 "jupytext_version": "0.8.6"
339 |             }
340 |         },
341 |         "kernelspec": {
342 |             "display_name": "Python 3",
343 |             "language": "python",
344 |             "name": "python3"
345 |         }
346 |     },
347 |     "nbformat": 4,
348 |     "nbformat_minor": 2
349 | }
350 | 


--------------------------------------------------------------------------------
/Docker/dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:16.04
 2 | 
 3 | COPY environment.yml .
 4 | 
 5 | RUN apt-get update && apt-get install -y --no-install-recommends \
 6 |         build-essential \
 7 |         ca-certificates \
 8 |         cmake \
 9 |         wget \
10 |         curl \
11 |         gfortran \
12 |         apt-transport-https \
13 |         jq \
14 |         locales \
15 |         git \
16 |         sshpass \
17 |         openssh-client \
18 |         software-properties-common && \
19 |      	rm -rf /var/lib/apt/lists/*
20 | 
21 | RUN locale-gen en_US.UTF-8
22 | ENV LANG en_US.UTF-8
23 | ENV LANGUAGE en_US:en
24 | ENV LC_ALL en_US.UTF-8
25 | 
26 | # Install Docker
27 | RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - && \
28 | 	apt-key fingerprint 0EBFCD88 && \
29 | 	add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
30 |    						$(lsb_release -cs) \
31 |    						stable" &&\
32 |    	apt-get update && apt-get install -y --no-install-recommends docker-ce
33 | 
34 | ENV ENV_NAME=py3.6
35 | RUN curl -o ~/miniconda.sh -O  https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh  && \
36 |      chmod +x ~/miniconda.sh && \
37 |      ~/miniconda.sh -b -p /opt/conda && \
38 |      rm ~/miniconda.sh && \
39 |      /opt/conda/bin/conda env create -q --name $ENV_NAME -f environment.yml && \
40 |      /opt/conda/bin/conda clean -ya && \
41 |      ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
42 |      echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
43 |      echo "conda activate $ENV_NAME" >> ~/.bashrc
44 | ENV PATH /opt/conda/envs/$ENV_NAME/bin:/opt/conda/bin:$PATH
45 | 
46 | COPY jupyter_notebook_config.py /root/.jupyter/
47 | 
48 | # Install Azure CLI
49 | RUN echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ xenial main" | \
50 |     tee /etc/apt/sources.list.d/azure-cli.list && \
51 |     curl -L https://packages.microsoft.com/keys/microsoft.asc | apt-key add - && \
52 |     apt-get update && \
53 |     apt-get install -y --no-install-recommends \
54 |     azure-cli
55 | 
56 | # Install AzCopy
57 | RUN echo "deb [arch=amd64] https://packages.microsoft.com/repos/microsoft-ubuntu-xenial-prod/ xenial main" > azure.list &&\
58 | 	cp ./azure.list /etc/apt/sources.list.d/ &&\
59 | 	apt-key adv --keyserver packages.microsoft.com --recv-keys B02C46DF417A0893 &&\
60 | 	apt-get update &&\
61 | 	apt-get install -y --no-install-recommends azcopy
62 | 
63 | WORKDIR /workspace
64 | CMD /bin/bash


--------------------------------------------------------------------------------
/Docker/environment.yml:
--------------------------------------------------------------------------------
 1 | channels:
 2 |  - conda-forge
 3 | dependencies:
 4 |   - python=3.6
 5 |   - numpy
 6 |   - pyyaml
 7 |   - scipy
 8 |   - ipython
 9 |   - pandas
10 |   - jupyter
11 |   - ipykernel
12 |   - scikit-learn
13 |   - pillow
14 |   - bokeh=0.13.0
15 |   - pip:
16 |     - https://github.com/theskumar/python-dotenv/archive/master.zip
17 |     - docker
18 | 


--------------------------------------------------------------------------------
/Docker/jupyter_notebook_config.py:
--------------------------------------------------------------------------------
1 | # Configuration file for jupyter-notebook.
2 | 
3 | c.NotebookApp.ip = "0.0.0.0"
4 | c.NotebookApp.port = 9999
5 | c.NotebookApp.open_browser = False
6 | c.NotebookApp.allow_root = True
7 | 


--------------------------------------------------------------------------------
/HorovodKeras/00_CreateImageAndTest.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "cells": [
  3 |         {
  4 |             "cell_type": "markdown",
  5 |             "metadata": {},
  6 |             "source": "# Create Docker Image for Keras\nIn this notebook we will create the Docker image for our Keras script to run in. We will go through the process of creating the image and testing it locally to make sure it runs before submitting it to the cluster. It is often recommended you do this rather than debugging on the cluster since debugging on a cluster can be much more difficult and time consuming.\n \n**You will need to be running everything on a GPU enabled VM to run this notebook.** "
  7 |         },
  8 |         {
  9 |             "cell_type": "code",
 10 |             "execution_count": null,
 11 |             "metadata": {},
 12 |             "outputs": [],
 13 |             "source": "import sys\nsys.path.append(\"../common\") \n\nfrom dotenv import get_key\nimport os\nfrom utils import dotenv_for\nimport docker"
 14 |         },
 15 |         {
 16 |             "cell_type": "markdown",
 17 |             "metadata": {},
 18 |             "source": "We will use fake data here since we don't want to have to download the data etc. Using fake data is often a good way to debug your models as well as checking what IO overhead is. Here we are setting the number of processes (NUM_PROCESSES) to 2 since the VM we are testing on has 2 GPUs. If you are running on a machine with 1 GPU set NUM_PROCESSES to 1."
 19 |         },
 20 |         {
 21 |             "cell_type": "code",
 22 |             "execution_count": null,
 23 |             "metadata": {
 24 |                 "tags": [
 25 |                     "parameters"
 26 |                 ]
 27 |             },
 28 |             "outputs": [],
 29 |             "source": "dotenv_path = dotenv_for()\nUSE_FAKE               = True\nDOCKERHUB              = os.getenv('DOCKER_REPOSITORY', \"masalvar\")\nNUM_PROCESSES          = 2\nDOCKER_PWD             = get_key(dotenv_path, 'DOCKER_PWD')"
 30 |         },
 31 |         {
 32 |             "cell_type": "code",
 33 |             "execution_count": null,
 34 |             "metadata": {},
 35 |             "outputs": [],
 36 |             "source": "dc = docker.from_env()"
 37 |         },
 38 |         {
 39 |             "cell_type": "code",
 40 |             "execution_count": null,
 41 |             "metadata": {},
 42 |             "outputs": [],
 43 |             "source": "image, log_iter = dc.images.build(path='Docker', \n                          tag='{}/caia-horovod-keras'.format(DOCKERHUB))"
 44 |         },
 45 |         {
 46 |             "cell_type": "code",
 47 |             "execution_count": null,
 48 |             "metadata": {},
 49 |             "outputs": [],
 50 |             "source": "container_labels = {'containerName': 'kerasgpu'}\nenvironment ={\n    \"DISTRIBUTED\":True,\n    \"PYTHONPATH\":'/workspace/common/',\n}\n\nvolumes = {\n    os.getenv('EXT_PWD'): {\n                                'bind': '/workspace', \n                                'mode': 'rw'\n                               }\n}\n\nif USE_FAKE:\n    environment['FAKE'] = True\nelse:\n    environment['FAKE'] = False\n    volumes[os.getenv('EXT_DATA')]={'bind': '/mnt/input', 'mode': 'rw'}\n    environment['AZ_BATCHAI_INPUT_TRAIN'] = '/mnt/input/train'\n    environment['AZ_BATCHAI_INPUT_TEST'] = '/mnt/input/validation'"
 51 |         },
 52 |         {
 53 |             "cell_type": "code",
 54 |             "execution_count": null,
 55 |             "metadata": {},
 56 |             "outputs": [],
 57 |             "source": "cmd=f'mpirun -np {NUM_PROCESSES} -H localhost:{NUM_PROCESSES} '\\\n     'python -u /workspace/HorovodTF/src/imagenet_estimator_tf_horovod.py'\ncontainer = dc.containers.run(image.tags[0], \n                              command=cmd,\n                              detach=True, \n                              labels=container_labels,\n                              runtime='nvidia',\n                              volumes=volumes,\n                              environment=environment,\n                              shm_size='8G',\n                              privileged=True)"
 58 |         },
 59 |         {
 60 |             "cell_type": "markdown",
 61 |             "metadata": {},
 62 |             "source": "With the code below we are simply monitoring what is happening in the container. Feel free to stop the notebook when you are happy that everything is working."
 63 |         },
 64 |         {
 65 |             "cell_type": "code",
 66 |             "execution_count": null,
 67 |             "metadata": {
 68 |                 "tags": [
 69 |                     "stripout"
 70 |                 ]
 71 |             },
 72 |             "outputs": [],
 73 |             "source": "for line in container.logs(stderr=True, stream=True):\n    print(line.decode(\"utf-8\"),end =\"\")"
 74 |         },
 75 |         {
 76 |             "cell_type": "code",
 77 |             "execution_count": null,
 78 |             "metadata": {},
 79 |             "outputs": [],
 80 |             "source": "container.reload() # Refresh state\nif container.status is 'running':\n    container.kill()"
 81 |         },
 82 |         {
 83 |             "cell_type": "code",
 84 |             "execution_count": null,
 85 |             "metadata": {
 86 |                 "tags": [
 87 |                     "stripout"
 88 |                 ]
 89 |             },
 90 |             "outputs": [],
 91 |             "source": "for line in dc.images.push(image.tags[0], \n                           stream=True,\n                           auth_config={\"username\": DOCKERHUB,\n                                        \"password\": DOCKER_PWD}):\n    print(line)"
 92 |         }
 93 |     ],
 94 |     "metadata": {
 95 |         "jupytext": {
 96 |             "text_representation": {
 97 |                 "extension": ".py",
 98 |                 "format_name": "light",
 99 |                 "format_version": "1.3",
100 |                 "jupytext_version": "0.8.6"
101 |             }
102 |         },
103 |         "kernelspec": {
104 |             "display_name": "Python 3",
105 |             "language": "python",
106 |             "name": "python3"
107 |         }
108 |     },
109 |     "nbformat": 4,
110 |     "nbformat_minor": 2
111 | }
112 | 


--------------------------------------------------------------------------------
/HorovodKeras/01_TrainKerasModel.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "cells": [
  3 |         {
  4 |             "cell_type": "markdown",
  5 |             "metadata": {},
  6 |             "source": "# Train Keras Model Distributed on Batch AI\nIn this notebook we will train a Keras model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n * [Create Experiment](#experiment)\n * [Upload Training Scripts](#training_scripts)\n * [Submit and Monitor Job](#job)\n * [Clean Up Resources](#clean_up)"
  7 |         },
  8 |         {
  9 |             "cell_type": "code",
 10 |             "execution_count": null,
 11 |             "metadata": {},
 12 |             "outputs": [],
 13 |             "source": "import sys\nsys.path.append(\"../common\") \n\nimport json\nfrom dotenv import get_key\nimport os\nfrom utils import write_json_to_file, dotenv_for"
 14 |         },
 15 |         {
 16 |             "cell_type": "markdown",
 17 |             "metadata": {},
 18 |             "source": "Set the USE_FAKE to True if you want to use fake data rather than the ImageNet dataset. This is often a good way to debug your models as well as checking what IO overhead is."
 19 |         },
 20 |         {
 21 |             "cell_type": "code",
 22 |             "execution_count": null,
 23 |             "metadata": {
 24 |                 "tags": [
 25 |                     "parameters"
 26 |                 ]
 27 |             },
 28 |             "outputs": [],
 29 |             "source": "# Variables for Batch AI - change as necessary\ndotenv_path = dotenv_for()\nGROUP_NAME             = get_key(dotenv_path, 'GROUP_NAME')\nFILE_SHARE_NAME        = get_key(dotenv_path, 'FILE_SHARE_NAME')\nWORKSPACE              = get_key(dotenv_path, 'WORKSPACE')\nNUM_NODES              = int(get_key(dotenv_path, 'NUM_NODES'))\nCLUSTER_NAME           = get_key(dotenv_path, 'CLUSTER_NAME')\nGPU_TYPE               = get_key(dotenv_path, 'GPU_TYPE')\nPROCESSES_PER_NODE     = int(get_key(dotenv_path, 'PROCESSES_PER_NODE'))\nSTORAGE_ACCOUNT_NAME   = get_key(dotenv_path, 'STORAGE_ACCOUNT_NAME')\n\nEXPERIMENT             = f\"distributed_keras_{GPU_TYPE}\"\nUSE_FAKE               = False\nDOCKERHUB              = os.getenv('DOCKER_REPOSITORY', \"masalvar\")"
 30 |         },
 31 |         {
 32 |             "cell_type": "code",
 33 |             "execution_count": null,
 34 |             "metadata": {},
 35 |             "outputs": [],
 36 |             "source": "FAKE='-x FAKE=True' if USE_FAKE else ''\nTOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES"
 37 |         },
 38 |         {
 39 |             "cell_type": "markdown",
 40 |             "metadata": {},
 41 |             "source": "<a id='experiment'></a>\n# Create Experiment\nNext we create our experiment."
 42 |         },
 43 |         {
 44 |             "cell_type": "code",
 45 |             "execution_count": null,
 46 |             "metadata": {},
 47 |             "outputs": [],
 48 |             "source": "!az batchai experiment create -n $EXPERIMENT -g $GROUP_NAME -w $WORKSPACE"
 49 |         },
 50 |         {
 51 |             "cell_type": "markdown",
 52 |             "metadata": {},
 53 |             "source": "<a id='training_scripts'></a>\n# Upload Training Scripts\nWe need to upload our training scripts and associated files"
 54 |         },
 55 |         {
 56 |             "cell_type": "code",
 57 |             "execution_count": null,
 58 |             "metadata": {},
 59 |             "outputs": [],
 60 |             "source": "json_data = !az storage account keys list -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME\nstorage_account_key = json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['value']"
 61 |         },
 62 |         {
 63 |             "cell_type": "code",
 64 |             "execution_count": null,
 65 |             "metadata": {
 66 |                 "tags": [
 67 |                     "stripout"
 68 |                 ]
 69 |             },
 70 |             "outputs": [],
 71 |             "source": "%env AZURE_STORAGE_ACCOUNT $STORAGE_ACCOUNT_NAME\n%env AZURE_STORAGE_KEY=$storage_account_key"
 72 |         },
 73 |         {
 74 |             "cell_type": "markdown",
 75 |             "metadata": {},
 76 |             "source": "Upload our training scripts"
 77 |         },
 78 |         {
 79 |             "cell_type": "code",
 80 |             "execution_count": null,
 81 |             "metadata": {},
 82 |             "outputs": [],
 83 |             "source": "!az storage file upload --share-name $FILE_SHARE_NAME --source src/imagenet_keras_horovod.py --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source src/data_generator.py --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts"
 84 |         },
 85 |         {
 86 |             "cell_type": "markdown",
 87 |             "metadata": {},
 88 |             "source": "Let's check our cluster we created earlier"
 89 |         },
 90 |         {
 91 |             "cell_type": "code",
 92 |             "execution_count": null,
 93 |             "metadata": {},
 94 |             "outputs": [],
 95 |             "source": "!az batchai cluster list -w $WORKSPACE -o table"
 96 |         },
 97 |         {
 98 |             "cell_type": "markdown",
 99 |             "metadata": {},
100 |             "source": "<a id='job'></a>\n## Submit and Monitor Job\nBelow we specify the job we wish to execute.  "
101 |         },
102 |         {
103 |             "cell_type": "code",
104 |             "execution_count": null,
105 |             "metadata": {},
106 |             "outputs": [],
107 |             "source": "jobs_dict = {\n  \"$schema\": \"https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2017-09-01-preview/job.json\",\n  \"properties\": {\n    \"nodeCount\": NUM_NODES,\n    \"customToolkitSettings\": {\n      \"commandLine\": f\"echo $AZ_BATCH_HOST_LIST; \\\n    cat $AZ_BATCHAI_MPI_HOST_FILE; \\\n    mpirun -np {TOTAL_PROCESSES} --hostfile $AZ_BATCHAI_MPI_HOST_FILE \\\n    -bind-to none -map-by slot \\\n    -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \\\n    -mca btl_tcp_if_include eth0 \\\n    -x NCCL_SOCKET_IFNAME=eth0 \\\n    -mca btl ^openib \\\n    -x NCCL_IB_DISABLE=1 \\\n    -x DISTRIBUTED=True \\\n    -x AZ_BATCHAI_INPUT_TRAIN \\\n    -x AZ_BATCHAI_INPUT_TEST \\\n    --allow-run-as-root \\\n      {FAKE} \\\n      python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_leras_horovod.py\"\n    },\n    \"stdOutErrPathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n    \"inputDirectories\": [{\n        \"id\": \"SCRIPTS\",\n        \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts\"\n      },\n      {\n        \"id\": \"TRAIN\",\n        \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n      },\n      {\n        \"id\": \"TEST\",\n        \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n      },\n    ],\n    \"outputDirectories\": [{\n        \"id\": \"MODEL\",\n        \"pathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n        \"pathSuffix\": \"Models\"\n    }],\n    \"containerSettings\": {\n      \"imageSourceRegistry\": {\n        \"image\": f\"{DOCKERHUB}/caia-horovod-keras\"\n      }\n    }\n  }\n}"
108 |         },
109 |         {
110 |             "cell_type": "code",
111 |             "execution_count": null,
112 |             "metadata": {},
113 |             "outputs": [],
114 |             "source": "write_json_to_file(jobs_dict, 'job.json')"
115 |         },
116 |         {
117 |             "cell_type": "code",
118 |             "execution_count": null,
119 |             "metadata": {},
120 |             "outputs": [],
121 |             "source": "JOB_NAME='keras-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)"
122 |         },
123 |         {
124 |             "cell_type": "markdown",
125 |             "metadata": {},
126 |             "source": "We now submit the job to Batch AI"
127 |         },
128 |         {
129 |             "cell_type": "code",
130 |             "execution_count": null,
131 |             "metadata": {
132 |                 "tags": [
133 |                     "stripout"
134 |                 ]
135 |             },
136 |             "outputs": [],
137 |             "source": "!az batchai job create -n $JOB_NAME --cluster $CLUSTER_NAME -w $WORKSPACE -e $EXPERIMENT -f job.json"
138 |         },
139 |         {
140 |             "cell_type": "markdown",
141 |             "metadata": {},
142 |             "source": "With the command below we can check the status of the job"
143 |         },
144 |         {
145 |             "cell_type": "code",
146 |             "execution_count": null,
147 |             "metadata": {},
148 |             "outputs": [],
149 |             "source": "!az batchai job list -w $WORKSPACE -e $EXPERIMENT -o table"
150 |         },
151 |         {
152 |             "cell_type": "markdown",
153 |             "metadata": {},
154 |             "source": "To view the files that the job has generated use the command below"
155 |         },
156 |         {
157 |             "cell_type": "code",
158 |             "execution_count": null,
159 |             "metadata": {
160 |                 "tags": [
161 |                     "stripout"
162 |                 ]
163 |             },
164 |             "outputs": [],
165 |             "source": "!az batchai job file list -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr"
166 |         },
167 |         {
168 |             "cell_type": "markdown",
169 |             "metadata": {},
170 |             "source": "We are also able to stream the stdout and stderr that our job produces. This is great to check the progress of our job as well as debug issues."
171 |         },
172 |         {
173 |             "cell_type": "code",
174 |             "execution_count": null,
175 |             "metadata": {
176 |                 "tags": [
177 |                     "stripout"
178 |                 ]
179 |             },
180 |             "outputs": [],
181 |             "source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stdout.txt"
182 |         },
183 |         {
184 |             "cell_type": "code",
185 |             "execution_count": null,
186 |             "metadata": {
187 |                 "tags": [
188 |                     "stripout"
189 |                 ]
190 |             },
191 |             "outputs": [],
192 |             "source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stderr.txt"
193 |         },
194 |         {
195 |             "cell_type": "markdown",
196 |             "metadata": {},
197 |             "source": "We can either wait for the job to complete or delete it with the command below."
198 |         },
199 |         {
200 |             "cell_type": "code",
201 |             "execution_count": null,
202 |             "metadata": {},
203 |             "outputs": [],
204 |             "source": "!az batchai job delete -w $WORKSPACE -e $EXPERIMENT --name $JOB_NAME -y"
205 |         },
206 |         {
207 |             "cell_type": "markdown",
208 |             "metadata": {},
209 |             "source": "<a id='clean_up'></a>\n## Clean Up Resources\nNext we wish to tidy up the resource we created.  \nFirst we reset the default values we set earlier."
210 |         },
211 |         {
212 |             "cell_type": "code",
213 |             "execution_count": null,
214 |             "metadata": {},
215 |             "outputs": [],
216 |             "source": "!az configure --defaults group=''\n!az configure --defaults location=''"
217 |         },
218 |         {
219 |             "cell_type": "markdown",
220 |             "metadata": {},
221 |             "source": " Next we delete the cluster"
222 |         },
223 |         {
224 |             "cell_type": "code",
225 |             "execution_count": null,
226 |             "metadata": {},
227 |             "outputs": [],
228 |             "source": "!az batchai cluster delete -w $WORKSPACE --name $CLUSTER_NAME -g $GROUP_NAME -y"
229 |         },
230 |         {
231 |             "cell_type": "markdown",
232 |             "metadata": {},
233 |             "source": "Once the cluster is deleted you will not incur any cost for the computation but you can still retain your experiments and workspace. If you wish to delete those as well execute the commands below."
234 |         },
235 |         {
236 |             "cell_type": "code",
237 |             "execution_count": null,
238 |             "metadata": {},
239 |             "outputs": [],
240 |             "source": "!az batchai experiment delete -w $WORKSPACE --name $EXPERIMENT -g $GROUP_NAME -y"
241 |         },
242 |         {
243 |             "cell_type": "code",
244 |             "execution_count": null,
245 |             "metadata": {},
246 |             "outputs": [],
247 |             "source": "!az batchai workspace delete -n $WORKSPACE -g $GROUP_NAME -y"
248 |         },
249 |         {
250 |             "cell_type": "markdown",
251 |             "metadata": {},
252 |             "source": "Finally we can delete the group and we will have deleted everything created for this tutorial."
253 |         },
254 |         {
255 |             "cell_type": "code",
256 |             "execution_count": null,
257 |             "metadata": {},
258 |             "outputs": [],
259 |             "source": "!az group delete --name $GROUP_NAME -y"
260 |         }
261 |     ],
262 |     "metadata": {
263 |         "jupytext": {
264 |             "text_representation": {
265 |                 "extension": ".py",
266 |                 "format_name": "light",
267 |                 "format_version": "1.3",
268 |                 "jupytext_version": "0.8.6"
269 |             }
270 |         },
271 |         "kernelspec": {
272 |             "display_name": "Python 3",
273 |             "language": "python",
274 |             "name": "python3"
275 |         }
276 |     },
277 |     "nbformat": 4,
278 |     "nbformat_minor": 2
279 | }
280 | 


--------------------------------------------------------------------------------
/HorovodKeras/Docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvidia/cuda:9.0-devel-ubuntu16.04
 2 | 
 3 | # TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
 4 | ENV PYTHON_VERSION=3.5
 5 | ENV TENSORFLOW_VERSION=1.9.0
 6 | ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
 7 | ENV NCCL_VERSION=2.2.13-1+cuda9.0
 8 | 
 9 | RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
10 | 
11 | RUN apt-get update && apt-get install -y --no-install-recommends  --allow-downgrades --allow-change-held-packages \
12 |         build-essential \
13 |         cmake \
14 |         cpio \
15 |         git \
16 |         curl \
17 |         wget \
18 |         ca-certificates \
19 |         libdapl2 \
20 |         libcudnn7=${CUDNN_VERSION} \
21 |         libnccl2=${NCCL_VERSION} \
22 |         libnccl-dev=${NCCL_VERSION} \
23 |         libjpeg-dev \
24 |         libpng-dev \
25 |         libmlx4-1 \
26 |         libsm6 \ 
27 |         libxext6 \
28 |         python$PYTHON_VERSION \
29 |         python$PYTHON_VERSION-dev
30 | 
31 | 
32 | # install intel MPI
33 | RUN cd /tmp && \
34 |     wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz' && \
35 |     tar zxvf l_mpi_2017.3.196.tgz && \
36 |     sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
37 |     sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' \
38 |     			/tmp/l_mpi_2017.3.196/silent.cfg && \
39 |     sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
40 |     cd /tmp/l_mpi_2017.3.196 && \
41 |     ./install.sh -s silent.cfg && \
42 |     cd .. && \
43 |     rm -rf l_mpi_2017.3.196* && \
44 |     echo "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" >> ~/.bashrc
45 | 
46 | ENV PATH $PATH:/opt/intel/compilers_and_libraries/linux/mpi/bin64
47 |         
48 | RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python
49 | 
50 | RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
51 |     python get-pip.py && \
52 |     rm get-pip.py
53 | 
54 | 
55 | # Install TensorFlow and Keras
56 | RUN pip install --no-cache-dir tensorflow-gpu==$TENSORFLOW_VERSION h5py scipy jupyter ipykernel numpy toolz pandas \
57 |  	scikit-learn keras pillow
58 | 
59 | # Install Horovod, temporarily using CUDA stubs
60 | RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
61 |     /bin/bash -c "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" && \
62 |     HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.13.2 && \
63 |     ldconfig


--------------------------------------------------------------------------------
/HorovodKeras/src/data_generator.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import keras
 3 | import logging
 4 | 
 5 | 
 6 | def _get_logger():
 7 |     return logging.getLogger(__name__)
 8 | 
 9 | def _create_data(batch_size, num_batches, dim, channels, seed=42):
10 |     np.random.seed(42)
11 |     return np.random.rand(batch_size * num_batches,
12 |                           dim[0],
13 |                           dim[1],
14 |                           channels).astype(np.float32)
15 | 
16 | 
17 | def _create_labels(batch_size, num_batches, n_classes):
18 |     return np.random.choice(n_classes, batch_size * num_batches)
19 | 
20 | 
21 | 
22 | class FakeDataGenerator(keras.preprocessing.image.Iterator):
23 | 
24 |     def __init__(self,
25 |                  batch_size=32,
26 |                  num_batches=20,
27 |                  dim=(224, 224),
28 |                  n_channels=3,
29 |                  n_classes=10,
30 |                  length=1000,
31 |                  shuffle=True,
32 |                  seed=42):
33 | 
34 |         'Initialization'
35 |         super(FakeDataGenerator, self).__init__(length,
36 |                                                 batch_size,
37 |                                                 shuffle,
38 |                                                 seed)
39 |         self.dim = dim
40 |         self.n_channels = n_channels
41 |         self.n_classes = n_classes
42 |         self.num_batches = num_batches
43 |         self._data = _create_data(self.batch_size, self.num_batches, self.dim, self.n_channels)
44 |         self._labels = _create_labels(self.batch_size, self.num_batches, self.n_classes)
45 |         self.translation_index = np.random.choice(len(self._labels), length)
46 | 
47 | 
48 |     def _get_batches_of_transformed_samples(self, index_array):
49 |         logger = _get_logger()
50 |         logger.debug('Retrieving samples')
51 |         logger.debug(str(index_array))
52 |         tr_index_array = self.translation_index[index_array]
53 |         return self._data[tr_index_array], keras.utils.to_categorical(self._labels[tr_index_array], num_classes=self.n_classes)


--------------------------------------------------------------------------------
/HorovodKeras/src/imagenet_keras_horovod.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Trains ResNet50 in Keras using Horovod.
  3 | 
  4 | It requires the following env variables
  5 | AZ_BATCHAI_INPUT_TRAIN
  6 | AZ_BATCHAI_INPUT_TEST
  7 | AZ_BATCHAI_OUTPUT_MODEL
  8 | AZ_BATCHAI_JOB_TEMP_DIR
  9 | """
 10 | import logging
 11 | import os
 12 | import sys
 13 | from functools import lru_cache
 14 | from timer import Timer
 15 | 
 16 | import keras
 17 | import tensorflow as tf
 18 | from data_generator import FakeDataGenerator
 19 | from keras import backend as K
 20 | from keras.preprocessing import image
 21 | 
 22 | 
 23 | def _str_to_bool(in_str):
 24 |     if "t" in in_str.lower():
 25 |         return True
 26 |     else:
 27 |         return False
 28 | 
 29 | 
 30 | _WIDTH = 224
 31 | _HEIGHT = 224
 32 | _CHANNELS = 3
 33 | _LR = 0.001
 34 | _EPOCHS = os.getenv("EPOCHS", 1)
 35 | _BATCHSIZE = 64
 36 | _R_MEAN = 123.68
 37 | _G_MEAN = 116.78
 38 | _B_MEAN = 103.94
 39 | 
 40 | # Settings from https://arxiv.org/abs/1706.02677.
 41 | _WARMUP_EPOCHS = 5
 42 | _WEIGHT_DECAY = 0.00005
 43 | 
 44 | _NUM_WORKERS = int(os.getenv("NUM_WORKERS", 10))
 45 | _MAX_QUEUE_SIZE = int(os.getenv("MAX_QUEUE_SIZE", 10))
 46 | _MULTIPROCESSING = _str_to_bool(os.getenv("MULTIPROCESSING", "False"))
 47 | _DISTRIBUTED = _str_to_bool(os.getenv("DISTRIBUTED", "False"))
 48 | _FAKE = _str_to_bool(os.getenv("FAKE", "False"))
 49 | _DATA_LENGTH = int(
 50 |     os.getenv("FAKE_DATA_LENGTH", 1281167)
 51 | )  # How much fake data to simulate, default to size of imagenet dataset
 52 | _VALIDATION = _str_to_bool(os.getenv("VALIDATION", "False"))
 53 | 
 54 | 
 55 | if _DISTRIBUTED:
 56 |     import horovod.keras as hvd
 57 | 
 58 | 
 59 | def _get_rank():
 60 |     if _DISTRIBUTED:
 61 |         try:
 62 |             return hvd.rank()
 63 |         except:
 64 |             return 0
 65 |     else:
 66 |         return 0
 67 | 
 68 | 
 69 | class HorovodAdapter(logging.LoggerAdapter):
 70 |     def __init__(self, logger):
 71 |         self._str_epoch = ""
 72 |         self._gpu_rank = 0
 73 |         super(HorovodAdapter, self).__init__(logger, {})
 74 | 
 75 |     def set_epoch(self, epoch):
 76 |         self._str_epoch = "[Epoch {}]".format(epoch)
 77 | 
 78 |     def process(self, msg, kwargs):
 79 |         kwargs["extra"] = {"gpurank": _get_rank(), "epoch": self._str_epoch}
 80 |         return msg, kwargs
 81 | 
 82 | 
 83 | @lru_cache()
 84 | def _get_logger():
 85 |     logger = logging.getLogger(__name__)
 86 |     logger.setLevel(logging.INFO)
 87 |     ch = logging.StreamHandler(stream=sys.stdout)
 88 |     formatter = logging.Formatter(
 89 |         "%(levelname)s:%(name)s:%(gpurank)d: %(epoch)s %(message)s"
 90 |     )
 91 |     ch.setFormatter(formatter)
 92 |     logger.addHandler(ch)
 93 |     adapter = HorovodAdapter(logger)
 94 |     return adapter
 95 | 
 96 | 
 97 | def _create_model():
 98 |     logger = _get_logger()
 99 |     logger.info("Creating model")
100 |     # Set up standard ResNet-50 model.
101 |     model = keras.applications.resnet50.ResNet50(weights=None)
102 |     # ResNet-50 model that is included with Keras is optimized for inference.
103 |     # Add L2 weight decay & adjust BN settings.
104 |     model_config = model.get_config()
105 |     for layer, layer_config in zip(model.layers, model_config["layers"]):
106 |         if hasattr(layer, "kernel_regularizer"):
107 |             regularizer = keras.regularizers.l2(_WEIGHT_DECAY)
108 |             layer_config["config"]["kernel_regularizer"] = {
109 |                 "class_name": regularizer.__class__.__name__,
110 |                 "config": regularizer.get_config(),
111 |             }
112 |         if type(layer) == keras.layers.BatchNormalization:
113 |             layer_config["config"]["momentum"] = 0.9
114 |             layer_config["config"]["epsilon"] = 1e-5
115 |     model = keras.models.Model.from_config(model_config)
116 |     return model
117 | 
118 | 
119 | def _validation_data_iterator_from():
120 |     # Validation data iterator.
121 | 
122 |     test_gen = image.ImageDataGenerator(
123 |         zoom_range=(0.875, 0.875),
124 |         preprocessing_function=keras.applications.resnet50.preprocess_input,
125 |     )
126 |     test_iter = test_gen.flow_from_directory(
127 |         os.getenv("AZ_BATCHAI_INPUT_TEST"),
128 |         batch_size=_BATCHSIZE,
129 |         target_size=(224, 224),
130 |     )
131 |     return test_iter
132 | 
133 | 
134 | def _training_data_iterator_from():
135 |     # Training data iterator.
136 |     train_gen = image.ImageDataGenerator(
137 |         width_shift_range=0.33,
138 |         height_shift_range=0.33,
139 |         zoom_range=0.5,
140 |         horizontal_flip=True,
141 |         preprocessing_function=keras.applications.resnet50.preprocess_input,
142 |     )
143 |     train_iter = train_gen.flow_from_directory(
144 |         os.getenv("AZ_BATCHAI_INPUT_TRAIN"),
145 |         batch_size=_BATCHSIZE,
146 |         target_size=(224, 224),
147 |     )
148 |     return train_iter
149 | 
150 | 
151 | def _fake_data_iterator_from(length=_DATA_LENGTH):
152 |     return FakeDataGenerator(batch_size=_BATCHSIZE, n_classes=1000, length=length)
153 | 
154 | 
155 | def _get_optimizer(params, is_distributed=_DISTRIBUTED):
156 |     if is_distributed:
157 |         # Horovod: adjust learning rate based on number of GPUs.
158 |         opt = keras.optimizers.SGD(
159 |             lr=params["learning_rate"] * hvd.size(), momentum=params["momentum"]
160 |         )
161 |         # Horovod: add Horovod Distributed Optimizer.
162 |         return hvd.DistributedOptimizer(opt)
163 |     else:
164 |         return keras.optimizers.SGD(
165 |             lr=params["learning_rate"], momentum=params["momentum"]
166 |         )
167 | 
168 | 
169 | def _get_runconfig(is_distributed=_DISTRIBUTED):
170 |     if is_distributed:
171 |         # Horovod: pin GPU to be used to process local rank (one GPU per process)
172 |         config = tf.ConfigProto()
173 |         config.gpu_options.allow_growth = True
174 |         config.gpu_options.visible_device_list = str(hvd.local_rank())
175 |     else:
176 |         config = tf.ConfigProto()
177 |         config.gpu_options.allow_growth = True
178 |     return config
179 | 
180 | 
181 | def _get_model_dir(is_distributed=_DISTRIBUTED):
182 |     if is_distributed:
183 |         # Horovod: save checkpoints only on worker 0 to prevent other workers from
184 |         # corrupting them.
185 |         return (
186 |             os.getenv("AZ_BATCHAI_OUTPUT_MODEL")
187 |             if hvd.rank() == 0
188 |             else os.getenv("AZ_BATCHAI_JOB_TEMP_DIR")
189 |         )
190 |     else:
191 |         return os.getenv("AZ_BATCHAI_OUTPUT_MODEL")
192 | 
193 | 
194 | def _get_hooks(is_distributed=_DISTRIBUTED, verbose=1):
195 |     logger = _get_logger()
196 |     if is_distributed:
197 |         logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(), hvd.size()))
198 |         return [
199 |             # Horovod: broadcast initial variable states from rank 0 to all other processes.
200 |             # This is necessary to ensure consistent initialization of all workers when
201 |             # training is started with random weights or restored from a checkpoint.
202 |             hvd.callbacks.BroadcastGlobalVariablesCallback(0),
203 |             # Horovod: average metrics among workers at the end of every epoch.
204 |             #
205 |             # Note: This callback must be in the list before the ReduceLROnPlateau,
206 |             # TensorBoard, or other metrics-based callbacks.
207 |             hvd.callbacks.MetricAverageCallback(),
208 |             # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
209 |             # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
210 |             # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
211 |             hvd.callbacks.LearningRateWarmupCallback(
212 |                 warmup_epochs=_WARMUP_EPOCHS, verbose=verbose
213 |             ),
214 |             # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs.
215 |             hvd.callbacks.LearningRateScheduleCallback(
216 |                 start_epoch=_WARMUP_EPOCHS, end_epoch=30, multiplier=1.0
217 |             ),
218 |             hvd.callbacks.LearningRateScheduleCallback(
219 |                 start_epoch=30, end_epoch=60, multiplier=1e-1
220 |             ),
221 |             hvd.callbacks.LearningRateScheduleCallback(
222 |                 start_epoch=60, end_epoch=80, multiplier=1e-2
223 |             ),
224 |             hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3),
225 |         ]
226 |     else:
227 |         return []
228 | 
229 | 
230 | class LoggerCallback(keras.callbacks.Callback):
231 |     def __init__(self, logger, data_length):
232 |         self._timer = Timer(
233 |             output=logger.info, prefix="Epoch duration: ", fmt="{:.3f} seconds"
234 |         )
235 |         self._data_length = data_length
236 | 
237 |     def on_epoch_begin(self, epoch, logs):
238 |         logger = _get_logger()
239 |         logger.set_epoch(epoch)
240 |         self._timer.start()
241 | 
242 |     def on_epoch_end(self, epoch, logs):
243 |         duration = self._timer.elapsed
244 |         _log_summary(self._data_length, duration)
245 | 
246 | 
247 | def _is_master(is_distributed=_DISTRIBUTED):
248 |     if is_distributed:
249 |         if hvd.rank() == 0:
250 |             return True
251 |         else:
252 |             return False
253 |     else:
254 |         return True
255 | 
256 | 
257 | def _log_summary(data_length, duration):
258 |     logger = _get_logger()
259 |     images_per_second = data_length / duration
260 |     logger.info("Data length:      {}".format(data_length))
261 |     logger.info("Total duration:   {:.3f}".format(duration))
262 |     logger.info("Total images/sec: {:.3f}".format(images_per_second))
263 |     logger.info(
264 |         "Batch size:       (Per GPU {}: Total {})".format(
265 |             _BATCHSIZE, hvd.size() * _BATCHSIZE if _DISTRIBUTED else _BATCHSIZE
266 |         )
267 |     )
268 |     logger.info("Distributed:      {}".format("True" if _DISTRIBUTED else "False"))
269 |     logger.info("Num GPUs:         {:.3f}".format(hvd.size() if _DISTRIBUTED else 1))
270 |     logger.info("Dataset:          {}".format("Synthetic" if _FAKE else "Imagenet"))
271 | 
272 | 
273 | def main():
274 |     verbose = 1
275 |     logger = _get_logger()
276 |     if _DISTRIBUTED:
277 |         # Horovod: initialize Horovod.
278 |         hvd.init()
279 |         logger.info("Runnin Distributed")
280 |         verbose = 1 if hvd.rank() == 0 else 0
281 | 
282 |     logger.info("Tensorflow version {}".format(tf.__version__))
283 |     K.set_session(tf.Session(config=_get_runconfig()))
284 | 
285 |     # Horovod: broadcast resume_from_epoch from rank 0 (which will have
286 |     # checkpoints) to other ranks.
287 |     resume_from_epoch = 0
288 |     if _DISTRIBUTED:
289 |         resume_from_epoch = hvd.broadcast(
290 |             resume_from_epoch, 0, name="resume_from_epoch"
291 |         )
292 | 
293 |     if _FAKE:
294 |         train_iter = _fake_data_iterator_from()
295 |     else:
296 |         train_iter = _training_data_iterator_from()
297 |         test_iter = _validation_data_iterator_from() if _VALIDATION else None
298 | 
299 |     model = _create_model()
300 | 
301 |     params = {"learning_rate": _LR, "momentum": 0.9}
302 | 
303 |     opt = _get_optimizer(params)
304 |     model.compile(
305 |         loss=keras.losses.categorical_crossentropy,
306 |         optimizer=opt,
307 |         metrics=["accuracy", "top_k_categorical_accuracy"],
308 |     )
309 | 
310 |     model_dir = _get_model_dir()
311 |     checkpoint_format = os.path.join(model_dir, "checkpoint-{epoch}.h5")
312 | 
313 |     callbacks = _get_hooks()
314 |     callbacks.append(LoggerCallback(logger, len(train_iter) * _BATCHSIZE))
315 | 
316 |     # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
317 |     if _is_master():
318 |         callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_format))
319 |         # callbacks.append(keras.callbacks.TensorBoard(log_dir))
320 | 
321 |     # Restore from a previous checkpoint, if initial_epoch is specified.
322 |     # Horovod: restore on the first worker which will broadcast weights to other workers.
323 |     if resume_from_epoch > 0 and _is_master():
324 |         model.load_weights(checkpoint_format.format(epoch=resume_from_epoch))
325 | 
326 |     logger.info("Training...")
327 |     # Train the model. The training will randomly sample 1 / N batches of training data and
328 |     # 3 / N batches of validation data on every worker, where N is the number of workers.
329 |     # Over-sampling of validation data helps to increase probability that every validation
330 |     # example will be evaluated.
331 |     num_workers = hvd.size() if _DISTRIBUTED else 1
332 |     model.fit_generator(
333 |         train_iter,
334 |         steps_per_epoch=len(train_iter) // num_workers,
335 |         callbacks=callbacks,
336 |         epochs=_EPOCHS,
337 |         verbose=verbose,
338 |         workers=_NUM_WORKERS,
339 |         max_queue_size=_MAX_QUEUE_SIZE,
340 |         use_multiprocessing=_MULTIPROCESSING,
341 |         initial_epoch=resume_from_epoch,
342 |     )
343 | 
344 |     if _FAKE is False and _VALIDATION:
345 |         # Evaluate the model on the full data set.
346 |         with Timer(output=logger.info, prefix="Testing"):
347 |             logger.info("Testing...")
348 |             score = hvd.allreduce(
349 |                 model.evaluate_generator(test_iter, len(test_iter), workers=10)
350 |             )
351 |             if verbose:
352 |                 print("Test loss:", score[0])
353 |             print("Test accuracy:", score[1])
354 | 
355 | 
356 | if __name__ == "__main__":
357 |     main()
358 | 


--------------------------------------------------------------------------------
/HorovodPytorch/00_CreateImageAndTest.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "cells": [
  3 |         {
  4 |             "cell_type": "markdown",
  5 |             "metadata": {},
  6 |             "source": "# Create Docker Image for PyTorch\nIn this notebook we will create the Docker image for our PyTorch script to run in. We will go through the process of creating the image and testing it locally to make sure it runs before submitting it to the cluster. It is often recommended you do this rather than debugging on the cluster since debugging on a cluster can be much more difficult and time consuming.\n \n**You will need to be running everything on a GPU enabled VM to run this notebook.** "
  7 |         },
  8 |         {
  9 |             "cell_type": "code",
 10 |             "execution_count": null,
 11 |             "metadata": {},
 12 |             "outputs": [],
 13 |             "source": "import sys\nsys.path.append(\"../common\") \n\nfrom dotenv import get_key\nimport os\nfrom utils import dotenv_for\nimport docker"
 14 |         },
 15 |         {
 16 |             "cell_type": "markdown",
 17 |             "metadata": {},
 18 |             "source": "We will use fake data here since we don't want to have to download the data etc. Using fake data is often a good way to debug your models as well as checking what IO overhead is. Here we are setting the number of processes (NUM_PROCESSES) to 2 since the VM we are testing on has 2 GPUs. If you are running on a machine with 1 GPU set NUM_PROCESSES to 1."
 19 |         },
 20 |         {
 21 |             "cell_type": "code",
 22 |             "execution_count": null,
 23 |             "metadata": {
 24 |                 "tags": [
 25 |                     "parameters"
 26 |                 ]
 27 |             },
 28 |             "outputs": [],
 29 |             "source": "dotenv_path = dotenv_for()\nUSE_FAKE               = True\nDOCKERHUB              = os.getenv('DOCKER_REPOSITORY', \"masalvar\")\nNUM_PROCESSES          = 2\nDOCKER_PWD             = get_key(dotenv_path, 'DOCKER_PWD')"
 30 |         },
 31 |         {
 32 |             "cell_type": "code",
 33 |             "execution_count": null,
 34 |             "metadata": {},
 35 |             "outputs": [],
 36 |             "source": "dc = docker.from_env()"
 37 |         },
 38 |         {
 39 |             "cell_type": "code",
 40 |             "execution_count": null,
 41 |             "metadata": {},
 42 |             "outputs": [],
 43 |             "source": "image, log_iter = dc.images.build(path='Docker', \n                          tag='{}/caia-horovod-pytorch'.format(DOCKERHUB))"
 44 |         },
 45 |         {
 46 |             "cell_type": "code",
 47 |             "execution_count": null,
 48 |             "metadata": {},
 49 |             "outputs": [],
 50 |             "source": "container_labels = {'containerName': 'pytorchgpu'}\nenvironment ={\n    \"DISTRIBUTED\":True,\n    \"PYTHONPATH\":'/workspace/common/',\n}\n\nvolumes = {\n    os.getenv('EXT_PWD'): {\n                                'bind': '/workspace', \n                                'mode': 'rw'\n                               }\n}\n\nif USE_FAKE:\n    environment['FAKE'] = True\nelse:\n    environment['FAKE'] = False\n    volumes[os.getenv('EXT_DATA')]={'bind': '/mnt/input', 'mode': 'rw'}\n    environment['AZ_BATCHAI_INPUT_TRAIN'] = '/mnt/input/train'\n    environment['AZ_BATCHAI_INPUT_TEST'] = '/mnt/input/validation'"
 51 |         },
 52 |         {
 53 |             "cell_type": "code",
 54 |             "execution_count": null,
 55 |             "metadata": {},
 56 |             "outputs": [],
 57 |             "source": "cmd=f'mpirun -np {NUM_PROCESSES} -H localhost:{NUM_PROCESSES} '\\\n     'python -u /workspace/HorovodPytorch/src/imagenet_pytorch_horovod.py'\ncontainer = dc.containers.run(image.tags[0], \n                              command=cmd,\n                              detach=True, \n                              labels=container_labels,\n                              runtime='nvidia',\n                              volumes=volumes,\n                              environment=environment,\n                              shm_size='8G',\n                              privileged=True)"
 58 |         },
 59 |         {
 60 |             "cell_type": "markdown",
 61 |             "metadata": {},
 62 |             "source": "With the code below we are simply monitoring what is happening in the container. Feel free to stop the notebook when you are happy that everything is working."
 63 |         },
 64 |         {
 65 |             "cell_type": "code",
 66 |             "execution_count": null,
 67 |             "metadata": {
 68 |                 "tags": [
 69 |                     "stripout"
 70 |                 ]
 71 |             },
 72 |             "outputs": [],
 73 |             "source": "for line in container.logs(stderr=True, stream=True):\n    print(line.decode(\"utf-8\"),end =\"\")"
 74 |         },
 75 |         {
 76 |             "cell_type": "code",
 77 |             "execution_count": null,
 78 |             "metadata": {},
 79 |             "outputs": [],
 80 |             "source": "container.reload() # Refresh state\nif container.status is 'running':\n    container.kill()"
 81 |         },
 82 |         {
 83 |             "cell_type": "code",
 84 |             "execution_count": null,
 85 |             "metadata": {
 86 |                 "tags": [
 87 |                     "stripout"
 88 |                 ]
 89 |             },
 90 |             "outputs": [],
 91 |             "source": "for line in dc.images.push(image.tags[0], \n                           stream=True,\n                           auth_config={\"username\": DOCKERHUB,\n                                        \"password\": DOCKER_PWD}):\n    print(line)"
 92 |         }
 93 |     ],
 94 |     "metadata": {
 95 |         "jupytext": {
 96 |             "text_representation": {
 97 |                 "extension": ".py",
 98 |                 "format_name": "light",
 99 |                 "format_version": "1.3",
100 |                 "jupytext_version": "0.8.6"
101 |             }
102 |         },
103 |         "kernelspec": {
104 |             "display_name": "Python 3",
105 |             "language": "python",
106 |             "name": "python3"
107 |         }
108 |     },
109 |     "nbformat": 4,
110 |     "nbformat_minor": 2
111 | }
112 | 


--------------------------------------------------------------------------------
/HorovodPytorch/01_TrainPyTorchModel.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "cells": [
  3 |         {
  4 |             "cell_type": "markdown",
  5 |             "metadata": {},
  6 |             "source": "# Train PyTorch Model Distributed on Batch AI\nIn this notebook we will train a PyTorch model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n * [Create Experiment](#experiment)\n * [Upload Training Scripts](#training_scripts)\n * [Submit and Monitor Job](#job)\n * [Clean Up Resources](#clean_up)"
  7 |         },
  8 |         {
  9 |             "cell_type": "code",
 10 |             "execution_count": null,
 11 |             "metadata": {},
 12 |             "outputs": [],
 13 |             "source": "import sys\nsys.path.append(\"../common\") \n\nimport json\nfrom dotenv import get_key\nimport os\nfrom utils import write_json_to_file, dotenv_for"
 14 |         },
 15 |         {
 16 |             "cell_type": "markdown",
 17 |             "metadata": {},
 18 |             "source": "Set the USE_FAKE to True if you want to use fake data rather than the Imagenet dataset. This is often a good way to debug your models as well as checking what IO overhead is."
 19 |         },
 20 |         {
 21 |             "cell_type": "code",
 22 |             "execution_count": null,
 23 |             "metadata": {
 24 |                 "tags": [
 25 |                     "parameters"
 26 |                 ]
 27 |             },
 28 |             "outputs": [],
 29 |             "source": "# Variables for Batch AI - change as necessary\ndotenv_path = dotenv_for()\nGROUP_NAME             = get_key(dotenv_path, 'GROUP_NAME')\nFILE_SHARE_NAME        = get_key(dotenv_path, 'FILE_SHARE_NAME')\nWORKSPACE              = get_key(dotenv_path, 'WORKSPACE')\nNUM_NODES              = int(get_key(dotenv_path, 'NUM_NODES'))\nCLUSTER_NAME           = get_key(dotenv_path, 'CLUSTER_NAME')\nGPU_TYPE               = get_key(dotenv_path, 'GPU_TYPE')\nPROCESSES_PER_NODE     = int(get_key(dotenv_path, 'PROCESSES_PER_NODE'))\nSTORAGE_ACCOUNT_NAME   = get_key(dotenv_path, 'STORAGE_ACCOUNT_NAME')\n\nEXPERIMENT             = f\"distributed_pytorch_{GPU_TYPE}\"\nUSE_FAKE               = False\nDOCKERHUB              = os.getenv('DOCKER_REPOSITORY', \"masalvar\")  #\"<YOUR DOCKERHUB>\""
 30 |         },
 31 |         {
 32 |             "cell_type": "code",
 33 |             "execution_count": null,
 34 |             "metadata": {},
 35 |             "outputs": [],
 36 |             "source": "FAKE='-x FAKE=True' if USE_FAKE else ''\nTOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES"
 37 |         },
 38 |         {
 39 |             "cell_type": "markdown",
 40 |             "metadata": {},
 41 |             "source": "<a id='experiment'></a>\n# Create Experiment\nNext we create our experiment."
 42 |         },
 43 |         {
 44 |             "cell_type": "code",
 45 |             "execution_count": null,
 46 |             "metadata": {},
 47 |             "outputs": [],
 48 |             "source": "!az batchai experiment create -n $EXPERIMENT -g $GROUP_NAME -w $WORKSPACE"
 49 |         },
 50 |         {
 51 |             "cell_type": "markdown",
 52 |             "metadata": {},
 53 |             "source": "<a id='training_scripts'></a>\n# Upload Training Scripts\nWe need to upload our training scripts and associated files"
 54 |         },
 55 |         {
 56 |             "cell_type": "code",
 57 |             "execution_count": null,
 58 |             "metadata": {},
 59 |             "outputs": [],
 60 |             "source": "json_data = !az storage account keys list -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME\nstorage_account_key = json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['value']"
 61 |         },
 62 |         {
 63 |             "cell_type": "code",
 64 |             "execution_count": null,
 65 |             "metadata": {
 66 |                 "tags": [
 67 |                     "stripout"
 68 |                 ]
 69 |             },
 70 |             "outputs": [],
 71 |             "source": "%env AZURE_STORAGE_ACCOUNT $STORAGE_ACCOUNT_NAME\n%env AZURE_STORAGE_KEY=$storage_account_key"
 72 |         },
 73 |         {
 74 |             "cell_type": "markdown",
 75 |             "metadata": {},
 76 |             "source": "Upload our training scripts"
 77 |         },
 78 |         {
 79 |             "cell_type": "code",
 80 |             "execution_count": null,
 81 |             "metadata": {},
 82 |             "outputs": [],
 83 |             "source": "!az storage file upload --share-name $FILE_SHARE_NAME --source src/imagenet_pytorch_horovod.py --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts"
 84 |         },
 85 |         {
 86 |             "cell_type": "markdown",
 87 |             "metadata": {},
 88 |             "source": "Let's check our cluster we created earlier"
 89 |         },
 90 |         {
 91 |             "cell_type": "code",
 92 |             "execution_count": null,
 93 |             "metadata": {},
 94 |             "outputs": [],
 95 |             "source": "!az batchai cluster list -w $WORKSPACE -o table"
 96 |         },
 97 |         {
 98 |             "cell_type": "markdown",
 99 |             "metadata": {},
100 |             "source": "<a id='job'></a>\n## Submit and Monitor Job\nBelow we specify the job we wish to execute.  "
101 |         },
102 |         {
103 |             "cell_type": "code",
104 |             "execution_count": null,
105 |             "metadata": {},
106 |             "outputs": [],
107 |             "source": "jobs_dict = {\n  \"$schema\": \"https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2017-09-01-preview/job.json\",\n  \"properties\": {\n    \"nodeCount\": NUM_NODES,\n    \"customToolkitSettings\": {\n      \"commandLine\": f\"echo $AZ_BATCH_HOST_LIST; \\\n    cat $AZ_BATCHAI_MPI_HOST_FILE; \\\n    mpirun -np {TOTAL_PROCESSES} --hostfile $AZ_BATCHAI_MPI_HOST_FILE \\\n    -bind-to none -map-by slot \\\n    -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \\\n    -mca btl_tcp_if_include eth0 \\\n    -x NCCL_SOCKET_IFNAME=eth0 \\\n    -mca btl ^openib \\\n    -x NCCL_IB_DISABLE=1 \\\n    -x DISTRIBUTED=True \\\n    -x AZ_BATCHAI_INPUT_TRAIN \\\n    -x AZ_BATCHAI_INPUT_TEST \\\n    --allow-run-as-root \\\n      {FAKE} \\\n      python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py\"\n    },\n    \"stdOutErrPathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n    \"inputDirectories\": [{\n        \"id\": \"SCRIPTS\",\n        \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts\"\n      },\n      {\n        \"id\": \"TRAIN\",\n        \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n      },\n      {\n        \"id\": \"TEST\",\n        \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n      },\n    ],\n    \"outputDirectories\": [{\n        \"id\": \"MODEL\",\n        \"pathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n        \"pathSuffix\": \"Models\"\n    }],\n    \"containerSettings\": {\n      \"imageSourceRegistry\": {\n        \"image\": f\"{DOCKERHUB}/caia-horovod-pytorch\"\n      }\n    }\n  }\n}"
108 |         },
109 |         {
110 |             "cell_type": "code",
111 |             "execution_count": null,
112 |             "metadata": {},
113 |             "outputs": [],
114 |             "source": "write_json_to_file(jobs_dict, 'job.json')"
115 |         },
116 |         {
117 |             "cell_type": "code",
118 |             "execution_count": null,
119 |             "metadata": {},
120 |             "outputs": [],
121 |             "source": "JOB_NAME='pytorch-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)"
122 |         },
123 |         {
124 |             "cell_type": "markdown",
125 |             "metadata": {},
126 |             "source": "We now submit the job to Batch AI"
127 |         },
128 |         {
129 |             "cell_type": "code",
130 |             "execution_count": null,
131 |             "metadata": {
132 |                 "tags": [
133 |                     "stripout"
134 |                 ]
135 |             },
136 |             "outputs": [],
137 |             "source": "!az batchai job create -n $JOB_NAME --cluster $CLUSTER_NAME -w $WORKSPACE -e $EXPERIMENT -f job.json"
138 |         },
139 |         {
140 |             "cell_type": "markdown",
141 |             "metadata": {},
142 |             "source": "With the command below we can check the status of the job"
143 |         },
144 |         {
145 |             "cell_type": "code",
146 |             "execution_count": null,
147 |             "metadata": {},
148 |             "outputs": [],
149 |             "source": "!az batchai job list -w $WORKSPACE -e $EXPERIMENT -o table"
150 |         },
151 |         {
152 |             "cell_type": "markdown",
153 |             "metadata": {},
154 |             "source": "To view the files that the job has generated use the command below"
155 |         },
156 |         {
157 |             "cell_type": "code",
158 |             "execution_count": null,
159 |             "metadata": {
160 |                 "tags": [
161 |                     "stripout"
162 |                 ]
163 |             },
164 |             "outputs": [],
165 |             "source": "!az batchai job file list -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr"
166 |         },
167 |         {
168 |             "cell_type": "markdown",
169 |             "metadata": {},
170 |             "source": "We are also able to stream the stdout and stderr that our job produces. This is great to check the progress of our job as well as debug issues."
171 |         },
172 |         {
173 |             "cell_type": "code",
174 |             "execution_count": null,
175 |             "metadata": {
176 |                 "tags": [
177 |                     "stripout"
178 |                 ]
179 |             },
180 |             "outputs": [],
181 |             "source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stdout.txt"
182 |         },
183 |         {
184 |             "cell_type": "code",
185 |             "execution_count": null,
186 |             "metadata": {
187 |                 "tags": [
188 |                     "stripout"
189 |                 ]
190 |             },
191 |             "outputs": [],
192 |             "source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stderr.txt"
193 |         },
194 |         {
195 |             "cell_type": "markdown",
196 |             "metadata": {},
197 |             "source": "We can either wait for the job to complete or delete it with the command below."
198 |         },
199 |         {
200 |             "cell_type": "code",
201 |             "execution_count": null,
202 |             "metadata": {},
203 |             "outputs": [],
204 |             "source": "!az batchai job delete -w $WORKSPACE -e $EXPERIMENT --name $JOB_NAME -y"
205 |         },
206 |         {
207 |             "cell_type": "markdown",
208 |             "metadata": {},
209 |             "source": "<a id='clean_up'></a>\n## Clean Up Resources\nNext we wish to tidy up the resource we created.  \nFirst we reset the default values we set earlier."
210 |         },
211 |         {
212 |             "cell_type": "code",
213 |             "execution_count": null,
214 |             "metadata": {},
215 |             "outputs": [],
216 |             "source": "!az configure --defaults group=''\n!az configure --defaults location=''"
217 |         },
218 |         {
219 |             "cell_type": "markdown",
220 |             "metadata": {},
221 |             "source": " Next we delete the cluster"
222 |         },
223 |         {
224 |             "cell_type": "code",
225 |             "execution_count": null,
226 |             "metadata": {},
227 |             "outputs": [],
228 |             "source": "!az batchai cluster delete -w $WORKSPACE --name $CLUSTER_NAME -g $GROUP_NAME -y"
229 |         },
230 |         {
231 |             "cell_type": "markdown",
232 |             "metadata": {},
233 |             "source": "Once the cluster is deleted you will not incur any cost for the computation but you can still retain your experiments and workspace. If you wish to delete those as well execute the commands below."
234 |         },
235 |         {
236 |             "cell_type": "code",
237 |             "execution_count": null,
238 |             "metadata": {},
239 |             "outputs": [],
240 |             "source": "!az batchai experiment delete -w $WORKSPACE --name $EXPERIMENT -g $GROUP_NAME -y"
241 |         },
242 |         {
243 |             "cell_type": "code",
244 |             "execution_count": null,
245 |             "metadata": {},
246 |             "outputs": [],
247 |             "source": "!az batchai workspace delete -n $WORKSPACE -g $GROUP_NAME -y"
248 |         },
249 |         {
250 |             "cell_type": "markdown",
251 |             "metadata": {},
252 |             "source": "Finally we can delete the group and we will have deleted everything created for this tutorial."
253 |         },
254 |         {
255 |             "cell_type": "code",
256 |             "execution_count": null,
257 |             "metadata": {},
258 |             "outputs": [],
259 |             "source": "!az group delete --name $GROUP_NAME -y"
260 |         }
261 |     ],
262 |     "metadata": {
263 |         "jupytext": {
264 |             "text_representation": {
265 |                 "extension": ".py",
266 |                 "format_name": "light",
267 |                 "format_version": "1.3",
268 |                 "jupytext_version": "0.8.6"
269 |             }
270 |         },
271 |         "kernelspec": {
272 |             "display_name": "Python 3",
273 |             "language": "python",
274 |             "name": "python3"
275 |         }
276 |     },
277 |     "nbformat": 4,
278 |     "nbformat_minor": 2
279 | }
280 | 


--------------------------------------------------------------------------------
/HorovodPytorch/Docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvidia/cuda:9.0-devel-ubuntu16.04
 2 | 
 3 | # TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
 4 | ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
 5 | ENV NCCL_VERSION=2.2.13-1+cuda9.0
 6 | ENV PYTORCH_VERSION=0.4.0
 7 | ENV PYTHON_VERSION=3.5
 8 | 
 9 | RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
10 | 
11 | RUN apt-get update && apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
12 |         build-essential \
13 |         cmake \
14 |         git \
15 |         curl \
16 |         vim \
17 |         wget \
18 |         ca-certificates \
19 |         libcudnn7=${CUDNN_VERSION} \
20 |         libnccl2=${NCCL_VERSION} \
21 |         libnccl-dev=${NCCL_VERSION} \
22 |         libjpeg-dev \
23 |         libpng-dev \
24 |         python${PYTHON_VERSION} \
25 |         python${PYTHON_VERSION}-dev
26 | 
27 | RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
28 | 
29 | RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
30 |     python get-pip.py && \
31 |     rm get-pip.py
32 | 
33 | 
34 | # Install PyTorch
35 | RUN pip install http://download.pytorch.org/whl/cu90/torch-${PYTORCH_VERSION}-cp35-cp35m-linux_x86_64.whl && \
36 | 	pip install --no-cache-dir torchvision h5py scipy jupyter ipykernel numpy toolz pandas scikit-learn pillow
37 | 
38 | # Install Open MPI
39 | RUN mkdir /tmp/openmpi && \
40 |     cd /tmp/openmpi && \
41 |     wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
42 |     tar zxf openmpi-3.0.0.tar.gz && \
43 |     cd openmpi-3.0.0 && \
44 |     ./configure --enable-orterun-prefix-by-default && \
45 |     make -j $(nproc) all && \
46 |     make install && \
47 |     ldconfig && \
48 |     rm -rf /tmp/openmpi
49 | 
50 | # Install Horovod, temporarily using CUDA stubs
51 | RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
52 |     HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod==0.13.2 && \
53 |     ldconfig
54 | 
55 | # Create a wrapper for OpenMPI to allow running as root by default
56 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
57 |     echo '#!/bin/bash' > /usr/local/bin/mpirun && \
58 |     echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
59 |     chmod a+x /usr/local/bin/mpirun
60 | 
61 | # Configure OpenMPI to run good defaults:
62 | #   --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
63 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \
64 |     echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
65 | 
66 | # Set default NCCL parameters
67 | RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf && \
68 |     echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf
69 | 
70 | # Install OpenSSH for MPI to communicate between containers
71 | RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
72 |     mkdir -p /var/run/sshd
73 | 
74 | # Allow OpenSSH to talk to containers without asking for confirmation
75 | RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
76 |     echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
77 |     mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
78 | 
79 | WORKDIR "/examples"


--------------------------------------------------------------------------------
/HorovodPytorch/cluster_config/cluster.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "properties": {
 3 |         "nodeSetup": {
 4 |             "setupTask": {
 5 |                 "commandLine": "$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts/nodeprep.sh",
 6 |                 "runElevated": "True",
 7 |                 "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs"
 8 |             }
 9 |         }
10 |     }
11 | }


--------------------------------------------------------------------------------
/HorovodPytorch/cluster_config/docker.service:
--------------------------------------------------------------------------------
 1 | [Unit]
 2 | Description=Docker Application Container Engine
 3 | Documentation=https://docs.docker.com
 4 | After=network-online.target docker.socket firewalld.service
 5 | Wants=network-online.target
 6 | Requires=docker.socket
 7 | 
 8 | [Service]
 9 | EnvironmentFile=/etc/default/docker
10 | Type=notify
11 | # the default is not to use systemd for cgroups because the delegate issues still
12 | # exists and systemd currently does not support the cgroup feature set required
13 | # for containers run by docker
14 | ExecStart=/usr/bin/dockerd --default-shm-size 8G -g /mnt/docker/ -H fd://
15 | ExecReload=/bin/kill -s HUP $MAINPID
16 | LimitNOFILE=1048576
17 | # Having non-zero Limit*s causes performance problems due to accounting overhead
18 | # in the kernel. We recommend using cgroups to do container-local accounting.
19 | LimitNPROC=infinity
20 | LimitCORE=infinity
21 | # Uncomment TasksMax if your systemd version supports it.
22 | # Only systemd 226 and above support this version.
23 | TasksMax=infinity
24 | TimeoutStartSec=0
25 | # set delegate yes so that systemd does not reset the cgroups of docker containers
26 | Delegate=yes
27 | # kill only the docker process, not all processes in the cgroup
28 | KillMode=process
29 | # restart the docker process if it exits prematurely
30 | Restart=on-failure
31 | StartLimitBurst=3
32 | StartLimitInterval=60s
33 | 
34 | [Install]
35 | WantedBy=multi-user.target
36 | 


--------------------------------------------------------------------------------
/HorovodPytorch/cluster_config/nodeprep.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | sudo cp $AZ_BATCHAI_MOUNT_ROOT/extfs/scripts/docker.service /lib/systemd/system
3 | sudo systemctl daemon-reload
4 | sudo systemctl restart docker
5 | 


--------------------------------------------------------------------------------
/HorovodPytorch/src/imagenet_pytorch_horovod.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Trains ResNet50 in Keras using Horovod.
  3 | 
  4 | It requires the following env variables
  5 | AZ_BATCHAI_INPUT_TRAIN
  6 | AZ_BATCHAI_INPUT_TEST
  7 | AZ_BATCHAI_OUTPUT_MODEL
  8 | AZ_BATCHAI_JOB_TEMP_DIR
  9 | """
 10 | import logging
 11 | import os
 12 | import sys
 13 | from functools import lru_cache
 14 | from os import path
 15 | from timer import Timer
 16 | 
 17 | import numpy as np
 18 | import pandas as pd
 19 | import torch.backends.cudnn as cudnn
 20 | import torch.nn.functional as F
 21 | import torch.optim as optim
 22 | import torch.utils.data.distributed
 23 | import torchvision.models as models
 24 | from torch.utils.data import Dataset
 25 | from torchvision import transforms, datasets
 26 | 
 27 | 
 28 | def _str_to_bool(in_str):
 29 |     if "t" in in_str.lower():
 30 |         return True
 31 |     else:
 32 |         return False
 33 | 
 34 | 
 35 | _WIDTH = 224
 36 | _HEIGHT = 224
 37 | _CHANNELS = 3
 38 | _LR = 0.001
 39 | _EPOCHS = os.getenv("EPOCHS", 1)
 40 | _BATCHSIZE = 64
 41 | _RGB_MEAN = [0.485, 0.456, 0.406]
 42 | _RGB_SD = [0.229, 0.224, 0.225]
 43 | _SEED = 42
 44 | 
 45 | # Settings from https://arxiv.org/abs/1706.02677.
 46 | _WARMUP_EPOCHS = 5
 47 | _WEIGHT_DECAY = 0.00005
 48 | 
 49 | _FAKE = _str_to_bool(os.getenv("FAKE", "False"))
 50 | _DATA_LENGTH = int(
 51 |     os.getenv("FAKE_DATA_LENGTH", 1281167)
 52 | )  # How much fake data to simulate, default to size of imagenet dataset
 53 | _DISTRIBUTED = _str_to_bool(os.getenv("DISTRIBUTED", "False"))
 54 | 
 55 | if _DISTRIBUTED:
 56 |     import horovod.torch as hvd
 57 | 
 58 | 
 59 | def _get_rank():
 60 |     if _DISTRIBUTED:
 61 |         try:
 62 |             return hvd.rank()
 63 |         except:
 64 |             return 0
 65 |     else:
 66 |         return 0
 67 | 
 68 | 
 69 | 
 70 | class HorovodAdapter(logging.LoggerAdapter):
 71 |     def __init__(self, logger):
 72 |         self._str_epoch = ""
 73 |         self._gpu_rank = 0
 74 |         super(HorovodAdapter, self).__init__(logger, {})
 75 | 
 76 |     def set_epoch(self, epoch):
 77 |         self._str_epoch = "[Epoch {}]".format(epoch)
 78 | 
 79 |     def process(self, msg, kwargs):
 80 |         kwargs["extra"] = {"gpurank": _get_rank(), "epoch": self._str_epoch}
 81 |         return msg, kwargs
 82 | 
 83 | 
 84 | @lru_cache()
 85 | def _get_logger():
 86 |     logger = logging.getLogger(__name__)
 87 |     logger.setLevel(logging.INFO)
 88 |     ch = logging.StreamHandler(stream=sys.stdout)
 89 |     formatter = logging.Formatter(
 90 |         "%(levelname)s:%(name)s:%(gpurank)d: %(epoch)s %(message)s"
 91 |     )
 92 |     ch.setFormatter(formatter)
 93 |     logger.addHandler(ch)
 94 |     adapter = HorovodAdapter(logger)
 95 |     return adapter
 96 | 
 97 | 
 98 | def _append_path_to(data_path, data_series):
 99 |     return data_series.apply(lambda x: path.join(data_path, x))
100 | 
101 | 
102 | def _load_training(data_dir):
103 |     logger = _get_logger()
104 |     logger.info("Reading training data from {}".format(data_dir))
105 |     train_df = pd.read_csv(path.join(data_dir, "train.csv"))
106 |     return train_df.assign(
107 |         filenames=_append_path_to(path.join(data_dir, "train"), train_df.filenames)
108 |     )
109 | 
110 | 
111 | def _load_validation(data_dir):
112 |     logger = _get_logger()
113 |     logger.info("Reading validation data from {}".format(data_dir))
114 |     train_df = pd.read_csv(path.join(data_dir, "validation.csv"))
115 |     return train_df.assign(
116 |         filenames=_append_path_to(path.join(data_dir, "validation"), train_df.filenames)
117 |     )
118 | 
119 | 
120 | def _create_data_fn(train_path, test_path):
121 |     train_df = _load_training(train_path)
122 |     validation_df = _load_validation(test_path)
123 |     # File-path
124 |     train_X = train_df["filenames"].values
125 |     validation_X = validation_df["filenames"].values
126 |     # One-hot encoded labels for torch
127 |     train_labels = train_df[["num_id"]].values.ravel()
128 |     validation_labels = validation_df[["num_id"]].values.ravel()
129 |     # Index starts from 0
130 |     train_labels -= 1
131 |     validation_labels -= 1
132 |     return train_X, train_labels, validation_X, validation_labels
133 | 
134 | 
135 | def _create_data(batch_size, num_batches, dim, channels, seed=42):
136 |     np.random.seed(seed)
137 |     return np.random.rand(batch_size * num_batches, channels, dim[0], dim[1]).astype(
138 |         np.float32
139 |     )
140 | 
141 | 
142 | def _create_labels(batch_size, num_batches, n_classes):
143 |     return np.random.choice(n_classes, batch_size * num_batches)
144 | 
145 | 
146 | class FakeData(Dataset):
147 |     def __init__(
148 |         self,
149 |         batch_size=32,
150 |         num_batches=20,
151 |         dim=(224, 224),
152 |         n_channels=3,
153 |         n_classes=10,
154 |         length=_DATA_LENGTH,
155 |         seed=42,
156 |         data_transform=None,
157 |     ):
158 |         self.dim = dim
159 |         self.n_channels = n_channels
160 |         self.n_classes = n_classes
161 |         self.num_batches = num_batches
162 |         self._data = _create_data(
163 |             batch_size, self.num_batches, self.dim, self.n_channels
164 |         )
165 |         self._labels = _create_labels(batch_size, self.num_batches, self.n_classes)
166 |         self.translation_index = np.random.choice(len(self._labels), length)
167 |         self._length = length
168 | 
169 |         self._data_transform = data_transform
170 |         logger = _get_logger()
171 |         logger.info(
172 |             "Creating fake data {} labels and {} images".format(
173 |                 n_classes, len(self._data)
174 |             )
175 |         )
176 | 
177 |     def __getitem__(self, idx):
178 |         logger = _get_logger()
179 |         logger.debug("Retrieving samples")
180 |         logger.debug(str(idx))
181 |         tr_index_array = self.translation_index[idx]
182 | 
183 |         if self._data_transform is not None:
184 |             data = self._data_transform(self._data[tr_index_array])
185 |         else:
186 |             data = self._data[tr_index_array]
187 | 
188 |         return data, self._labels[tr_index_array]
189 | 
190 |     def __len__(self):
191 |         return self._length
192 | 
193 | 
194 | def _is_master(is_distributed=_DISTRIBUTED):
195 |     if is_distributed:
196 |         if hvd.rank() == 0:
197 |             return True
198 |         else:
199 |             return False
200 |     else:
201 |         return True
202 | 
203 | 
204 | def train(train_loader, model, criterion, optimizer, epoch):
205 |     logger = _get_logger()
206 |     msg = " duration({})  loss:{} total-samples: {}"
207 |     t = Timer()
208 |     t.start()
209 |     logger.set_epoch(epoch)
210 |     for i, (data, target) in enumerate(train_loader):
211 |         data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
212 |         optimizer.zero_grad()
213 |         # compute output
214 |         output = model(data)
215 |         loss = criterion(output, target)
216 |         # compute gradient and do SGD step
217 |         loss.backward()
218 |         optimizer.step()
219 |         if i % 100 == 0:
220 |             logger.info(msg.format(t.elapsed, loss.item(), i * len(data)))
221 |             t.start()
222 | 
223 | 
224 | def validate(train_loader, model, criterion):
225 |     logger = _get_logger()
226 |     msg = "validation duration({})  loss:{} total-samples: {}"
227 |     t = Timer()
228 |     t.start()
229 |     model.eval()
230 |     with torch.no_grad():
231 |         for i, (data, target) in enumerate(train_loader):
232 |             data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
233 |             # compute output
234 |             output = model(data)
235 |             loss = criterion(output, target)
236 |             # compute gradient and do SGD step
237 |             if i % 100 == 0:
238 |                 logger.info(msg.format(t.elapsed, loss.item(), i * len(data)))
239 |                 t.start()
240 | 
241 | 
242 | def _log_summary(data_length, duration):
243 |     logger = _get_logger()
244 |     images_per_second = data_length / duration
245 |     logger.info("Data length:      {}".format(data_length))
246 |     logger.info("Total duration:   {:.3f}".format(duration))
247 |     logger.info("Total images/sec: {:.3f}".format(images_per_second))
248 |     logger.info(
249 |         "Batch size:       (Per GPU {}: Total {})".format(
250 |             _BATCHSIZE, hvd.size() * _BATCHSIZE if _DISTRIBUTED else _BATCHSIZE
251 |         )
252 |     )
253 |     logger.info("Distributed:      {}".format("True" if _DISTRIBUTED else "False"))
254 |     logger.info("Num GPUs:         {:.3f}".format(hvd.size() if _DISTRIBUTED else 1))
255 |     logger.info("Dataset:          {}".format("Synthetic" if _FAKE else "Imagenet"))
256 | 
257 | 
258 | def _get_sampler(dataset, is_distributed=_DISTRIBUTED):
259 |     if is_distributed:
260 |         return torch.utils.data.distributed.DistributedSampler(
261 |             dataset, num_replicas=hvd.size(), rank=hvd.rank()
262 |         )
263 |     else:
264 |         return torch.utils.data.sampler.RandomSampler(dataset)
265 | 
266 | 
267 | def main():
268 |     logger = _get_logger()
269 |     if _DISTRIBUTED:
270 |         # Horovod: initialize Horovod.
271 | 
272 |         hvd.init()
273 |         logger.info("Runnin Distributed")
274 |         torch.manual_seed(_SEED)
275 |         # Horovod: pin GPU to local rank.
276 |         torch.cuda.set_device(hvd.local_rank())
277 |         torch.cuda.manual_seed(_SEED)
278 | 
279 |     logger.info("PyTorch version {}".format(torch.__version__))
280 | 
281 |     if _FAKE:
282 |         logger.info("Setting up fake loaders")
283 |         train_dataset = FakeData(n_classes=1000, data_transform=torch.FloatTensor)
284 |     else:
285 |         normalize = transforms.Normalize(_RGB_MEAN, _RGB_SD)
286 |         logger.info("Setting up loaders")
287 |         train_dataset = datasets.ImageFolder(
288 |             os.getenv("AZ_BATCHAI_INPUT_TRAIN"),
289 |             transforms.Compose(
290 |                 [
291 |                     transforms.RandomResizedCrop(_WIDTH),
292 |                     transforms.RandomHorizontalFlip(),
293 |                     transforms.ToTensor(),
294 |                     normalize,
295 |                 ]
296 |             ),
297 |         )
298 | 
299 |         validation_dataset = datasets.ImageFolder(
300 |             os.getenv("AZ_BATCHAI_INPUT_TRAIN"),
301 |             transforms.Compose(
302 |                 [
303 |                     transforms.Resize(256),
304 |                     transforms.CenterCrop(224),
305 |                     transforms.ToTensor(),
306 |                     normalize,
307 |                 ]
308 |             ),
309 |         )
310 | 
311 |     train_sampler = _get_sampler(train_dataset)
312 | 
313 |     kwargs = {"num_workers": 5, "pin_memory": True}
314 |     train_loader = torch.utils.data.DataLoader(
315 |         train_dataset, batch_size=_BATCHSIZE, sampler=train_sampler, **kwargs
316 |     )
317 | 
318 |     # Autotune
319 |     cudnn.benchmark = True
320 | 
321 |     logger.info("Loading model")
322 |     # Load symbol
323 |     model = models.__dict__["resnet50"](pretrained=False)
324 | 
325 |     model.cuda()
326 | 
327 |     if _DISTRIBUTED:
328 |         # Horovod: broadcast parameters.
329 |         hvd.broadcast_parameters(model.state_dict(), root_rank=0)
330 | 
331 |     num_gpus = hvd.size() if _DISTRIBUTED else 1
332 |     # Horovod: scale learning rate by the number of GPUs.
333 |     optimizer = optim.SGD(model.parameters(), lr=_LR * num_gpus, momentum=0.9)
334 |     if _DISTRIBUTED:
335 |         # Horovod: wrap optimizer with DistributedOptimizer.
336 |         optimizer = hvd.DistributedOptimizer(
337 |             optimizer, named_parameters=model.named_parameters()
338 |         )
339 | 
340 |     criterion = F.cross_entropy
341 | 
342 |     if not _FAKE:
343 |         val_sampler = _get_sampler(validation_dataset)
344 |         val_loader = torch.utils.data.DataLoader(
345 |             validation_dataset, batch_size=_BATCHSIZE, sampler=val_sampler, **kwargs
346 |         )
347 | 
348 |     # Main training-loop
349 |     logger.info("Training ...")
350 |     for epoch in range(_EPOCHS):
351 |         with Timer(output=logger.info, prefix="Training") as t:
352 |             model.train()
353 |             if _DISTRIBUTED:
354 |                 train_sampler.set_epoch(epoch)
355 |             train(train_loader, model, criterion, optimizer, epoch)
356 |         _log_summary(len(train_dataset), t.elapsed)
357 | 
358 |     if not _FAKE:
359 |         validate(val_loader, model, criterion)
360 | 
361 | 
362 | if __name__ == "__main__":
363 |     main()
364 | 


--------------------------------------------------------------------------------
/HorovodTF/00_CreateImageAndTest.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "cells": [
  3 |         {
  4 |             "cell_type": "markdown",
  5 |             "metadata": {},
  6 |             "source": "# Create Docker Image for TensorFlow\nIn this notebook we will create the Docker image for our TensorFlow script to run in. We will go through the process of creating the image and testing it locally to make sure it runs before submitting it to the cluster. It is often recommended you do this rather than debugging on the cluster since debugging on a cluster can be much more difficult and time consuming.\n \n**You will need to be running everything on a GPU enabled VM to run this notebook.** "
  7 |         },
  8 |         {
  9 |             "cell_type": "code",
 10 |             "execution_count": null,
 11 |             "metadata": {},
 12 |             "outputs": [],
 13 |             "source": "import sys\nsys.path.append(\"../common\") \n\nfrom dotenv import get_key\nimport os\nfrom utils import dotenv_for\nimport docker"
 14 |         },
 15 |         {
 16 |             "cell_type": "markdown",
 17 |             "metadata": {},
 18 |             "source": "We will use fake data here since we don't want to have to download the data etc. Using fake data is often a good way to debug your models as well as checking what IO overhead is. Here we are setting the number of processes (NUM_PROCESSES) to 2 since the VM we are testing on has 2 GPUs. If you are running on a machine with 1 GPU set NUM_PROCESSES to 1."
 19 |         },
 20 |         {
 21 |             "cell_type": "code",
 22 |             "execution_count": null,
 23 |             "metadata": {
 24 |                 "tags": [
 25 |                     "parameters"
 26 |                 ]
 27 |             },
 28 |             "outputs": [],
 29 |             "source": "dotenv_path = dotenv_for()\nUSE_FAKE               = True\nDOCKERHUB              = os.getenv('DOCKER_REPOSITORY', \"masalvar\")\nNUM_PROCESSES          = 2\nDOCKER_PWD             = get_key(dotenv_path, 'DOCKER_PWD')"
 30 |         },
 31 |         {
 32 |             "cell_type": "code",
 33 |             "execution_count": null,
 34 |             "metadata": {},
 35 |             "outputs": [],
 36 |             "source": "dc = docker.from_env()"
 37 |         },
 38 |         {
 39 |             "cell_type": "code",
 40 |             "execution_count": null,
 41 |             "metadata": {},
 42 |             "outputs": [],
 43 |             "source": "image, log_iter = dc.images.build(path='Docker', \n                          tag='{}/caia-horovod-tensorflow'.format(DOCKERHUB))"
 44 |         },
 45 |         {
 46 |             "cell_type": "code",
 47 |             "execution_count": null,
 48 |             "metadata": {},
 49 |             "outputs": [],
 50 |             "source": "container_labels = {'containerName': 'tensorflowgpu'}\nenvironment ={\n    \"DISTRIBUTED\":True,\n    \"PYTHONPATH\":'/workspace/common/',\n}\n\nvolumes = {\n    os.getenv('EXT_PWD'): {\n                                'bind': '/workspace', \n                                'mode': 'rw'\n                               }\n}\n\nif USE_FAKE:\n    environment['FAKE'] = True\nelse:\n    environment['FAKE'] = False\n    volumes[os.getenv('EXT_DATA')]={'bind': '/mnt/input', 'mode': 'rw'}\n    environment['AZ_BATCHAI_INPUT_TRAIN'] = '/mnt/input/train'\n    environment['AZ_BATCHAI_INPUT_TEST'] = '/mnt/input/validation'"
 51 |         },
 52 |         {
 53 |             "cell_type": "code",
 54 |             "execution_count": null,
 55 |             "metadata": {},
 56 |             "outputs": [],
 57 |             "source": "cmd=f'mpirun -np {NUM_PROCESSES} -H localhost:{NUM_PROCESSES} '\\\n     'python -u /workspace/HorovodTF/src/imagenet_estimator_tf_horovod.py'\ncontainer = dc.containers.run(image.tags[0], \n                              command=cmd,\n                              detach=True, \n                              labels=container_labels,\n                              runtime='nvidia',\n                              volumes=volumes,\n                              environment=environment,\n                              shm_size='8G',\n                              privileged=True)"
 58 |         },
 59 |         {
 60 |             "cell_type": "markdown",
 61 |             "metadata": {},
 62 |             "source": "With the code below we are simply monitoring what is happening in the container. Feel free to stop the notebook when you are happy that everything is working."
 63 |         },
 64 |         {
 65 |             "cell_type": "code",
 66 |             "execution_count": null,
 67 |             "metadata": {
 68 |                 "tags": [
 69 |                     "stripout"
 70 |                 ]
 71 |             },
 72 |             "outputs": [],
 73 |             "source": "for line in container.logs(stderr=True, stream=True):\n    print(line.decode(\"utf-8\"),end =\"\")"
 74 |         },
 75 |         {
 76 |             "cell_type": "code",
 77 |             "execution_count": null,
 78 |             "metadata": {},
 79 |             "outputs": [],
 80 |             "source": "container.reload() # Refresh state\nif container.status is 'running':\n    container.kill()"
 81 |         },
 82 |         {
 83 |             "cell_type": "code",
 84 |             "execution_count": null,
 85 |             "metadata": {
 86 |                 "tags": [
 87 |                     "stripout"
 88 |                 ]
 89 |             },
 90 |             "outputs": [],
 91 |             "source": "for line in dc.images.push(image.tags[0], \n                           stream=True,\n                           auth_config={\"username\": DOCKERHUB,\n                                        \"password\": DOCKER_PWD}):\n    print(line)"
 92 |         }
 93 |     ],
 94 |     "metadata": {
 95 |         "jupytext": {
 96 |             "text_representation": {
 97 |                 "extension": ".py",
 98 |                 "format_name": "light",
 99 |                 "format_version": "1.3",
100 |                 "jupytext_version": "0.8.6"
101 |             }
102 |         },
103 |         "kernelspec": {
104 |             "display_name": "Python 3",
105 |             "language": "python",
106 |             "name": "python3"
107 |         }
108 |     },
109 |     "nbformat": 4,
110 |     "nbformat_minor": 2
111 | }
112 | 


--------------------------------------------------------------------------------
/HorovodTF/01_TrainTensorflowModel.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "cells": [
  3 |         {
  4 |             "cell_type": "markdown",
  5 |             "metadata": {},
  6 |             "source": "# Train TensorFlow Model Distributed on Batch AI\nIn this notebook we will train a TensorFlow model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n * [Create Experiment](#experiment)\n * [Upload Training Scripts](#training_scripts)\n * [Submit and Monitor Job](#job)\n * [Clean Up Resources](#clean_up)"
  7 |         },
  8 |         {
  9 |             "cell_type": "code",
 10 |             "execution_count": null,
 11 |             "metadata": {},
 12 |             "outputs": [],
 13 |             "source": "import sys\nsys.path.append(\"../common\") \n\nimport json\nfrom dotenv import get_key\nimport os\nfrom utils import write_json_to_file, dotenv_for"
 14 |         },
 15 |         {
 16 |             "cell_type": "markdown",
 17 |             "metadata": {},
 18 |             "source": "Set the USE_FAKE to True if you want to use fake data rather than the ImageNet dataset. This is often a good way to debug your models as well as checking what IO overhead is."
 19 |         },
 20 |         {
 21 |             "cell_type": "code",
 22 |             "execution_count": null,
 23 |             "metadata": {
 24 |                 "tags": [
 25 |                     "parameters"
 26 |                 ]
 27 |             },
 28 |             "outputs": [],
 29 |             "source": "# Variables for Batch AI - change as necessary\ndotenv_path = dotenv_for()\nGROUP_NAME             = get_key(dotenv_path, 'GROUP_NAME')\nFILE_SHARE_NAME        = get_key(dotenv_path, 'FILE_SHARE_NAME')\nWORKSPACE              = get_key(dotenv_path, 'WORKSPACE')\nNUM_NODES              = int(get_key(dotenv_path, 'NUM_NODES'))\nCLUSTER_NAME           = get_key(dotenv_path, 'CLUSTER_NAME')\nGPU_TYPE               = get_key(dotenv_path, 'GPU_TYPE')\nPROCESSES_PER_NODE     = int(get_key(dotenv_path, 'PROCESSES_PER_NODE'))\nSTORAGE_ACCOUNT_NAME   = get_key(dotenv_path, 'STORAGE_ACCOUNT_NAME')\n\nEXPERIMENT             = f\"distributed_tensorflow_{GPU_TYPE}\"\nUSE_FAKE               = False\nDOCKERHUB              = os.getenv('DOCKER_REPOSITORY', \"masalvar\")"
 30 |         },
 31 |         {
 32 |             "cell_type": "code",
 33 |             "execution_count": null,
 34 |             "metadata": {},
 35 |             "outputs": [],
 36 |             "source": "FAKE='-x FAKE=True' if USE_FAKE else ''\nTOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES"
 37 |         },
 38 |         {
 39 |             "cell_type": "markdown",
 40 |             "metadata": {},
 41 |             "source": "<a id='experiment'></a>\n# Create Experiment\nNext we create our experiment."
 42 |         },
 43 |         {
 44 |             "cell_type": "code",
 45 |             "execution_count": null,
 46 |             "metadata": {},
 47 |             "outputs": [],
 48 |             "source": "!az batchai experiment create -n $EXPERIMENT -g $GROUP_NAME -w $WORKSPACE"
 49 |         },
 50 |         {
 51 |             "cell_type": "markdown",
 52 |             "metadata": {},
 53 |             "source": "<a id='training_scripts'></a>\n# Upload Training Scripts\nWe need to upload our training scripts and associated files"
 54 |         },
 55 |         {
 56 |             "cell_type": "code",
 57 |             "execution_count": null,
 58 |             "metadata": {},
 59 |             "outputs": [],
 60 |             "source": "json_data = !az storage account keys list -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME\nstorage_account_key = json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['value']"
 61 |         },
 62 |         {
 63 |             "cell_type": "code",
 64 |             "execution_count": null,
 65 |             "metadata": {
 66 |                 "tags": [
 67 |                     "stripout"
 68 |                 ]
 69 |             },
 70 |             "outputs": [],
 71 |             "source": "%env AZURE_STORAGE_ACCOUNT $STORAGE_ACCOUNT_NAME\n%env AZURE_STORAGE_KEY=$storage_account_key"
 72 |         },
 73 |         {
 74 |             "cell_type": "markdown",
 75 |             "metadata": {},
 76 |             "source": "Upload our training scripts"
 77 |         },
 78 |         {
 79 |             "cell_type": "code",
 80 |             "execution_count": null,
 81 |             "metadata": {},
 82 |             "outputs": [],
 83 |             "source": "!az storage file upload --share-name $FILE_SHARE_NAME --source src/imagenet_estimator_tf_horovod.py --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source src/resnet_model.py --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts"
 84 |         },
 85 |         {
 86 |             "cell_type": "markdown",
 87 |             "metadata": {},
 88 |             "source": "Let's check our cluster we created earlier"
 89 |         },
 90 |         {
 91 |             "cell_type": "code",
 92 |             "execution_count": null,
 93 |             "metadata": {},
 94 |             "outputs": [],
 95 |             "source": "!az batchai cluster list -w $WORKSPACE -o table"
 96 |         },
 97 |         {
 98 |             "cell_type": "markdown",
 99 |             "metadata": {},
100 |             "source": "<a id='job'></a>\n## Submit and Monitor Job\nBelow we specify the job we wish to execute.  "
101 |         },
102 |         {
103 |             "cell_type": "code",
104 |             "execution_count": null,
105 |             "metadata": {},
106 |             "outputs": [],
107 |             "source": "jobs_dict = {\n  \"$schema\": \"https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2017-09-01-preview/job.json\",\n  \"properties\": {\n    \"nodeCount\": NUM_NODES,\n    \"customToolkitSettings\": {\n      \"commandLine\": f\"echo $AZ_BATCH_HOST_LIST; \\\n    cat $AZ_BATCHAI_MPI_HOST_FILE; \\\n    mpirun -np {TOTAL_PROCESSES} --hostfile $AZ_BATCHAI_MPI_HOST_FILE \\\n    -bind-to none -map-by slot \\\n    -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \\\n    -mca btl_tcp_if_include eth0 \\\n    -x NCCL_SOCKET_IFNAME=eth0 \\\n    -mca btl ^openib \\\n    -x NCCL_IB_DISABLE=1 \\\n    -x DISTRIBUTED=True \\\n    -x AZ_BATCHAI_INPUT_TRAIN \\\n    -x AZ_BATCHAI_INPUT_TEST \\\n    --allow-run-as-root \\\n      {FAKE} \\\n      python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py\"\n    },\n    \"stdOutErrPathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n    \"inputDirectories\": [{\n        \"id\": \"SCRIPTS\",\n        \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts\"\n      },\n      {\n        \"id\": \"TRAIN\",\n        \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n      },\n      {\n        \"id\": \"TEST\",\n        \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n      },\n    ],\n    \"outputDirectories\": [{\n        \"id\": \"MODEL\",\n        \"pathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n        \"pathSuffix\": \"Models\"\n    }],\n    \"containerSettings\": {\n      \"imageSourceRegistry\": {\n        \"image\": f\"{DOCKERHUB}/caia-horovod-tensorflow\"\n      }\n    }\n  }\n}"
108 |         },
109 |         {
110 |             "cell_type": "code",
111 |             "execution_count": null,
112 |             "metadata": {},
113 |             "outputs": [],
114 |             "source": "write_json_to_file(jobs_dict, 'job.json')"
115 |         },
116 |         {
117 |             "cell_type": "code",
118 |             "execution_count": null,
119 |             "metadata": {},
120 |             "outputs": [],
121 |             "source": "JOB_NAME='tensorflow-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)"
122 |         },
123 |         {
124 |             "cell_type": "markdown",
125 |             "metadata": {},
126 |             "source": "We now submit the job to Batch AI"
127 |         },
128 |         {
129 |             "cell_type": "code",
130 |             "execution_count": null,
131 |             "metadata": {
132 |                 "tags": [
133 |                     "stripout"
134 |                 ]
135 |             },
136 |             "outputs": [],
137 |             "source": "!az batchai job create -n $JOB_NAME --cluster $CLUSTER_NAME -w $WORKSPACE -e $EXPERIMENT -f job.json"
138 |         },
139 |         {
140 |             "cell_type": "markdown",
141 |             "metadata": {},
142 |             "source": "With the command below we can check the status of the job"
143 |         },
144 |         {
145 |             "cell_type": "code",
146 |             "execution_count": null,
147 |             "metadata": {},
148 |             "outputs": [],
149 |             "source": "!az batchai job list -w $WORKSPACE -e $EXPERIMENT -o table"
150 |         },
151 |         {
152 |             "cell_type": "markdown",
153 |             "metadata": {},
154 |             "source": "To view the files that the job has generated use the command below"
155 |         },
156 |         {
157 |             "cell_type": "code",
158 |             "execution_count": null,
159 |             "metadata": {
160 |                 "tags": [
161 |                     "stripout"
162 |                 ]
163 |             },
164 |             "outputs": [],
165 |             "source": "!az batchai job file list -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr"
166 |         },
167 |         {
168 |             "cell_type": "markdown",
169 |             "metadata": {},
170 |             "source": "We are also able to stream the stdout and stderr that our job produces. This is great to check the progress of our job as well as debug issues."
171 |         },
172 |         {
173 |             "cell_type": "code",
174 |             "execution_count": null,
175 |             "metadata": {
176 |                 "tags": [
177 |                     "stripout"
178 |                 ]
179 |             },
180 |             "outputs": [],
181 |             "source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stdout.txt"
182 |         },
183 |         {
184 |             "cell_type": "code",
185 |             "execution_count": null,
186 |             "metadata": {
187 |                 "tags": [
188 |                     "stripout"
189 |                 ]
190 |             },
191 |             "outputs": [],
192 |             "source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stderr.txt"
193 |         },
194 |         {
195 |             "cell_type": "markdown",
196 |             "metadata": {},
197 |             "source": "We can either wait for the job to complete or delete it with the command below."
198 |         },
199 |         {
200 |             "cell_type": "code",
201 |             "execution_count": null,
202 |             "metadata": {},
203 |             "outputs": [],
204 |             "source": "!az batchai job delete -w $WORKSPACE -e $EXPERIMENT --name $JOB_NAME -y"
205 |         },
206 |         {
207 |             "cell_type": "markdown",
208 |             "metadata": {},
209 |             "source": "<a id='clean_up'></a>\n## Clean Up Resources\nNext we wish to tidy up the resource we created.  \nFirst we reset the default values we set earlier."
210 |         },
211 |         {
212 |             "cell_type": "code",
213 |             "execution_count": null,
214 |             "metadata": {},
215 |             "outputs": [],
216 |             "source": "!az configure --defaults group=''\n!az configure --defaults location=''"
217 |         },
218 |         {
219 |             "cell_type": "markdown",
220 |             "metadata": {},
221 |             "source": " Next we delete the cluster"
222 |         },
223 |         {
224 |             "cell_type": "code",
225 |             "execution_count": null,
226 |             "metadata": {},
227 |             "outputs": [],
228 |             "source": "!az batchai cluster delete -w $WORKSPACE --name $CLUSTER_NAME -g $GROUP_NAME -y"
229 |         },
230 |         {
231 |             "cell_type": "markdown",
232 |             "metadata": {},
233 |             "source": "Once the cluster is deleted you will not incur any cost for the computation but you can still retain your experiments and workspace. If you wish to delete those as well execute the commands below."
234 |         },
235 |         {
236 |             "cell_type": "code",
237 |             "execution_count": null,
238 |             "metadata": {},
239 |             "outputs": [],
240 |             "source": "!az batchai experiment delete -w $WORKSPACE --name $EXPERIMENT -g $GROUP_NAME -y"
241 |         },
242 |         {
243 |             "cell_type": "code",
244 |             "execution_count": null,
245 |             "metadata": {},
246 |             "outputs": [],
247 |             "source": "!az batchai workspace delete -n $WORKSPACE -g $GROUP_NAME -y"
248 |         },
249 |         {
250 |             "cell_type": "markdown",
251 |             "metadata": {},
252 |             "source": "Finally we can delete the group and we will have deleted everything created for this tutorial."
253 |         },
254 |         {
255 |             "cell_type": "code",
256 |             "execution_count": null,
257 |             "metadata": {},
258 |             "outputs": [],
259 |             "source": "!az group delete --name $GROUP_NAME -y"
260 |         }
261 |     ],
262 |     "metadata": {
263 |         "jupytext": {
264 |             "text_representation": {
265 |                 "extension": ".py",
266 |                 "format_name": "light",
267 |                 "format_version": "1.3",
268 |                 "jupytext_version": "0.8.6"
269 |             }
270 |         },
271 |         "kernelspec": {
272 |             "display_name": "Python 3",
273 |             "language": "python",
274 |             "name": "python3"
275 |         }
276 |     },
277 |     "nbformat": 4,
278 |     "nbformat_minor": 2
279 | }
280 | 


--------------------------------------------------------------------------------
/HorovodTF/Docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvidia/cuda:9.0-devel-ubuntu16.04
 2 | 
 3 | # TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
 4 | ENV PYTHON_VERSION=3.5
 5 | ENV TENSORFLOW_VERSION=1.9.0
 6 | ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
 7 | 
 8 | RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
 9 | 
10 | RUN apt-get update && apt-get install -y --no-install-recommends  --allow-downgrades --allow-change-held-packages  \
11 |         build-essential \
12 |         cmake \
13 |         cpio \
14 |         git \
15 |         curl \
16 |         wget \
17 |         ca-certificates \
18 |         libdapl2 \
19 |         libcudnn7=$CUDNN_VERSION \
20 |         libjpeg-dev \
21 |         libpng-dev \
22 |         libmlx4-1 \
23 |         libsm6 \
24 |         libxext6 \
25 |         python$PYTHON_VERSION \
26 |         python$PYTHON_VERSION-dev
27 | 
28 | 
29 | # install intel MPI
30 | RUN cd /tmp && \
31 |     wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz' && \
32 |     tar zxvf l_mpi_2017.3.196.tgz && \
33 |     sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
34 |     sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' \
35 |     			/tmp/l_mpi_2017.3.196/silent.cfg && \
36 |     sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
37 |     cd /tmp/l_mpi_2017.3.196 && \
38 |     ./install.sh -s silent.cfg && \
39 |     cd .. && \
40 |     rm -rf l_mpi_2017.3.196* && \
41 |     echo "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" >> ~/.bashrc
42 | 
43 | ENV PATH $PATH:/opt/intel/compilers_and_libraries/linux/mpi/bin64
44 | 
45 | RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python
46 | 
47 | RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
48 |     python get-pip.py && \
49 |     rm get-pip.py
50 | 
51 | # Install TensorFlow
52 | RUN pip install --no-cache-dir tensorflow-gpu==$TENSORFLOW_VERSION h5py scipy jupyter ipykernel numpy toolz pandas \
53 |  	scikit-learn
54 | 
55 | # Install Horovod, temporarily using CUDA stubs
56 | RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
57 |     /bin/bash -c "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" && \
58 |     HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.13.2 && \
59 |     ldconfig


--------------------------------------------------------------------------------
/HorovodTF/src/imagenet_estimator_tf_horovod.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Trains ResNet50 using Horovod.
  3 | 
  4 | It requires the following env variables
  5 | AZ_BATCHAI_INPUT_TRAIN
  6 | AZ_BATCHAI_INPUT_TEST
  7 | AZ_BATCHAI_OUTPUT_MODEL
  8 | AZ_BATCHAI_JOB_TEMP_DIR
  9 | """
 10 | import glob
 11 | import logging
 12 | import os
 13 | import sys
 14 | from functools import lru_cache
 15 | from os import path
 16 | from pathlib import Path
 17 | from timer import Timer
 18 | 
 19 | import numpy as np
 20 | import tensorflow as tf
 21 | from resnet_model import resnet_v1
 22 | from toolz import pipe
 23 | 
 24 | _WIDTH = 224
 25 | _HEIGHT = 224
 26 | _CHANNELS = 3
 27 | _LR = 0.001
 28 | _EPOCHS = os.getenv("EPOCHS", 1)
 29 | _BATCHSIZE = 64
 30 | _R_MEAN = 123.68
 31 | _G_MEAN = 116.78
 32 | _B_MEAN = 103.94
 33 | _BUFFER = 256
 34 | 
 35 | 
 36 | def _str_to_bool(in_str):
 37 |     if "t" in in_str.lower():
 38 |         return True
 39 |     else:
 40 |         return False
 41 | 
 42 | 
 43 | _DISTRIBUTED = _str_to_bool(os.getenv("DISTRIBUTED", "False"))
 44 | _FAKE = _str_to_bool(os.getenv("FAKE", "False"))
 45 | _DATA_LENGTH = int(
 46 |     os.getenv("FAKE_DATA_LENGTH", 1281167)
 47 | )  # How much fake data to simulate, default to size of imagenet dataset
 48 | _VALIDATION = _str_to_bool(os.getenv("VALIDATION", "False"))
 49 | 
 50 | if _DISTRIBUTED:
 51 |     import horovod.tensorflow as hvd
 52 | 
 53 | 
 54 | tf_logger = logging.getLogger("tensorflow")
 55 | tf_logger.setLevel(logging.INFO)
 56 | stout = logging.StreamHandler(stream=sys.stdout)
 57 | tf_logger.addHandler(stout)
 58 | 
 59 | 
 60 | def _get_rank():
 61 |     if _DISTRIBUTED:
 62 |         try:
 63 |             return hvd.rank()
 64 |         except:
 65 |             return 0
 66 |     else:
 67 |         return 0
 68 | 
 69 | 
 70 | class HorovodAdapter(logging.LoggerAdapter):
 71 |     def __init__(self, logger):
 72 |         self._str_epoch = ""
 73 |         self._gpu_rank = 0
 74 |         super(HorovodAdapter, self).__init__(logger, {})
 75 | 
 76 |     def set_epoch(self, epoch):
 77 |         self._str_epoch = "[Epoch {}]".format(epoch)
 78 | 
 79 |     def process(self, msg, kwargs):
 80 |         kwargs["extra"] = {"gpurank": _get_rank(), "epoch": self._str_epoch}
 81 |         return msg, kwargs
 82 | 
 83 | 
 84 | @lru_cache()
 85 | def _get_logger():
 86 |     logger = logging.getLogger(__name__)
 87 |     logger.setLevel(logging.INFO)
 88 |     ch = logging.StreamHandler(stream=sys.stdout)
 89 |     formatter = logging.Formatter(
 90 |         "%(levelname)s:%(name)s:%(gpurank)d: %(epoch)s %(message)s"
 91 |     )
 92 |     ch.setFormatter(formatter)
 93 |     logger.addHandler(ch)
 94 |     adapter = HorovodAdapter(logger)
 95 |     return adapter
 96 | 
 97 | 
 98 | def _load_image(filename, channels=_CHANNELS):
 99 |     return tf.to_float(tf.image.decode_png(tf.read_file(filename), channels=channels))
100 | 
101 | 
102 | def _resize(img, width=_WIDTH, height=_HEIGHT):
103 |     return tf.image.resize_images(img, [height, width])
104 | 
105 | 
106 | def _centre(img, mean_subtraction=(_R_MEAN, _G_MEAN, _B_MEAN)):
107 |     return tf.subtract(img, list(mean_subtraction))
108 | 
109 | 
110 | def _random_crop(img, width=_WIDTH, height=_HEIGHT, channels=_CHANNELS):
111 |     return tf.random_crop(img, [height, width, channels])
112 | 
113 | 
114 | def _random_horizontal_flip(img):
115 |     return tf.image.random_flip_left_right(img)
116 | 
117 | 
118 | def _preprocess_images(filename):
119 |     return pipe(filename, _load_image, _resize, _centre)
120 | 
121 | 
122 | def _preprocess_labels(label):
123 |     return tf.cast(label, dtype=tf.int32)
124 | 
125 | 
126 | def _transform_to_NCHW(img):
127 |     return tf.transpose(img, [2, 0, 1])  # Transform from NHWC to NCHW
128 | 
129 | 
130 | def _parse_function_train(tensor, label):
131 |     img_rgb = pipe(tensor, _random_crop, _random_horizontal_flip, _transform_to_NCHW)
132 | 
133 |     return img_rgb, label
134 | 
135 | 
136 | def _prep(filename, label):
137 |     return tf.data.Dataset.from_tensor_slices(
138 |         ([_preprocess_images(filename)], [_preprocess_labels(label)])
139 |     )
140 | 
141 | 
142 | def _parse_function_eval(filename, label):
143 |     return (
144 |         pipe(filename, _preprocess_images, _transform_to_NCHW),
145 |         _preprocess_labels(label),
146 |     )
147 | 
148 | 
149 | def _get_optimizer(params, is_distributed=_DISTRIBUTED):
150 |     if is_distributed:
151 |         # Horovod: add Horovod Distributed Optimizer.
152 |         return hvd.DistributedOptimizer(
153 |             tf.train.MomentumOptimizer(
154 |                 learning_rate=params["learning_rate"] * hvd.size(), momentum=0.9
155 |             )
156 |         )
157 |     else:
158 |         return tf.train.MomentumOptimizer(
159 |             learning_rate=params["learning_rate"], momentum=0.9
160 |         )
161 | 
162 | 
163 | def build_network(features, mode, params):
164 |     network = resnet_v1(
165 |         resnet_depth=50, num_classes=params["classes"], data_format="channels_first"
166 |     )
167 |     return network(inputs=features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))
168 | 
169 | 
170 | def model_fn(features, labels, mode, params):
171 |     """
172 |     features: This is the x-arg from the input_fn.
173 |     labels:   This is the y-arg from the input_fn,
174 |               see e.g. train_input_fn for these two.
175 |     mode:     Either TRAIN, EVAL, or PREDICT
176 |     params:   User-defined hyper-parameters, e.g. learning-rate.
177 |     """
178 |     logger = _get_logger()
179 |     logger.info("Creating model in {} mode".format(mode))
180 | 
181 |     logits = build_network(features, mode, params)
182 | 
183 |     if mode == tf.estimator.ModeKeys.PREDICT:
184 |         # Softmax output of the neural network.
185 |         y_pred = tf.nn.softmax(logits=logits)
186 | 
187 |         # Classification output of the neural network.
188 |         y_pred_cls = tf.argmax(y_pred, axis=1)
189 | 
190 |         predictions = {
191 |             "class_ids": y_pred_cls,
192 |             "probabilities": y_pred,
193 |             "logits": logits,
194 |         }
195 |         return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
196 | 
197 |     cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
198 |         logits=logits, labels=labels
199 |     )
200 |     loss = tf.reduce_mean(cross_entropy)
201 | 
202 |     if mode == tf.estimator.ModeKeys.EVAL:
203 |         # Softmax output of the neural network.
204 |         y_pred = tf.nn.softmax(logits=logits)
205 | 
206 |         # Classification output of the neural network.
207 |         y_pred_cls = tf.argmax(y_pred, axis=1)
208 | 
209 |         accuracy = tf.metrics.accuracy(
210 |             labels=tf.argmax(labels, axis=1), predictions=y_pred_cls, name="acc_op"
211 |         )
212 |         metrics = {"accuracy": accuracy}
213 |         tf.summary.scalar("accuracy", accuracy[1])
214 |         return tf.estimator.EstimatorSpec(mode=mode, eval_metric_ops=metrics, loss=loss)
215 | 
216 |     optimizer = _get_optimizer(params)
217 | 
218 |     train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
219 | 
220 |     return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
221 | 
222 | 
223 | def _append_path_to(data_path, data_series):
224 |     return data_series.apply(lambda x: path.join(data_path, x))
225 | 
226 | 
227 | def _load_training(data_dir):
228 |     return list(glob.glob(Path(data_dir) / "**" / "*.jpg"))
229 | 
230 | 
231 | def _load_validation(data_dir):
232 |     return list(glob.glob(Path(data_dir) / "**" / "*.jpg"))
233 | 
234 | 
235 | def _create_data_fn(train_path, test_path):
236 |     logger = _get_logger()
237 |     logger.info("Reading training data info")
238 |     train_df = _load_training(train_path)
239 | 
240 |     logger.info("Reading validation data info")
241 |     validation_df = _load_validation(test_path)
242 | 
243 |     train_labels = train_df[["num_id"]].values.ravel() - 1
244 |     validation_labels = validation_df[["num_id"]].values.ravel() - 1
245 | 
246 |     train_data = tf.data.Dataset.from_tensor_slices(
247 |         (train_df["filenames"].values, train_labels)
248 |     )
249 |     train_data_transform = tf.contrib.data.map_and_batch(
250 |         _parse_function_train, _BATCHSIZE, num_parallel_batches=5
251 |     )
252 |     train_data = train_data.apply(
253 |         tf.contrib.data.parallel_interleave(
254 |             _prep, cycle_length=5, buffer_output_elements=1024
255 |         )
256 |     )
257 | 
258 |     train_data = (
259 |         train_data.shuffle(1024).repeat().apply(train_data_transform).prefetch(_BUFFER)
260 |     )
261 | 
262 |     validation_data = tf.data.Dataset.from_tensor_slices(
263 |         (validation_df["filenames"].values, validation_labels)
264 |     )
265 |     validation_data_transform = tf.contrib.data.map_and_batch(
266 |         _parse_function_eval, _BATCHSIZE, num_parallel_batches=4
267 |     )
268 |     validation_data = validation_data.apply(validation_data_transform).prefetch(_BUFFER)
269 | 
270 |     def _train_input_fn():
271 |         return train_data.make_one_shot_iterator().get_next()
272 | 
273 |     def _validation_input_fn():
274 |         return validation_data.make_one_shot_iterator().get_next()
275 | 
276 |     _train_input_fn.length = len(train_df)
277 |     _validation_input_fn.length = len(validation_df)
278 |     _train_input_fn.classes = 1000
279 |     _validation_input_fn.classes = 1000
280 | 
281 |     return _train_input_fn, _validation_input_fn
282 | 
283 | 
284 | def _create_data(batch_size, num_batches, dim, channels, seed=42):
285 |     np.random.seed(seed)
286 |     return np.random.rand(batch_size * num_batches, channels, dim[0], dim[1]).astype(
287 |         np.float32
288 |     )
289 | 
290 | 
291 | def _create_labels(batch_size, num_batches, n_classes):
292 |     return np.random.choice(n_classes, batch_size * num_batches)
293 | 
294 | 
295 | def _create_fake_data_fn(train_length=_DATA_LENGTH, valid_length=50000, num_batches=40):
296 |     """ Creates fake dataset
297 | 
298 |     Data is returned in NCHW since this tends to be faster on GPUs
299 |     """
300 |     logger = _get_logger()
301 |     logger.info("Creating fake data")
302 | 
303 |     data_array = _create_data(_BATCHSIZE, num_batches, (_HEIGHT, _WIDTH), _CHANNELS)
304 |     labels_array = _create_labels(_BATCHSIZE, num_batches, 1000)
305 | 
306 |     def fake_data_generator():
307 |         for i in range(num_batches):
308 |             yield data_array[i * _BATCHSIZE : (i + 1) * _BATCHSIZE], labels_array[
309 |                 i * _BATCHSIZE : (i + 1) * _BATCHSIZE
310 |             ]
311 | 
312 |     train_data = tf.data.Dataset().from_generator(
313 |         fake_data_generator,
314 |         output_types=(tf.float32, tf.int32),
315 |         output_shapes=(
316 |             tf.TensorShape([None, _CHANNELS, _HEIGHT, _WIDTH]),
317 |             tf.TensorShape([None]),
318 |         ),
319 |     )
320 | 
321 |     train_data = train_data.shuffle(40 * _BATCHSIZE).repeat().prefetch(_BUFFER)
322 | 
323 |     validation_data = tf.data.Dataset().from_generator(
324 |         fake_data_generator,
325 |         output_types=(tf.float32, tf.int32),
326 |         output_shapes=(
327 |             tf.TensorShape([None, _CHANNELS, _HEIGHT, _WIDTH]),
328 |             tf.TensorShape([None]),
329 |         ),
330 |     )
331 | 
332 |     validation_data = validation_data.prefetch(_BUFFER)
333 | 
334 |     def _train_input_fn():
335 |         return train_data.make_one_shot_iterator().get_next()
336 | 
337 |     def _validation_input_fn():
338 |         return validation_data.make_one_shot_iterator().get_next()
339 | 
340 |     _train_input_fn.length = train_length
341 |     _validation_input_fn.length = valid_length
342 |     _train_input_fn.classes = 1000
343 |     _validation_input_fn.classes = 1000
344 | 
345 |     return _train_input_fn, _validation_input_fn
346 | 
347 | 
348 | def _get_runconfig(is_distributed=_DISTRIBUTED):
349 |     if is_distributed:
350 |         # Horovod: pin GPU to be used to process local rank (one GPU per process)
351 |         config = tf.ConfigProto()
352 |         config.gpu_options.allow_growth = True
353 |         config.gpu_options.visible_device_list = str(hvd.local_rank())
354 | 
355 |         return tf.estimator.RunConfig(
356 |             save_checkpoints_steps=None,
357 |             save_checkpoints_secs=None,
358 |             session_config=config,
359 |         )
360 |     else:
361 |         return tf.estimator.RunConfig(save_checkpoints_steps=None)
362 | 
363 | 
364 | def _get_model_dir(is_distributed=_DISTRIBUTED):
365 |     if is_distributed:
366 |         # Horovod: save checkpoints only on worker 0 to prevent other workers from
367 |         # corrupting them.
368 |         return (
369 |             os.getenv("AZ_BATCHAI_OUTPUT_MODEL")
370 |             if hvd.rank() == 0
371 |             else os.getenv("AZ_BATCHAI_JOB_TEMP_DIR")
372 |         )
373 |     else:
374 |         return os.getenv("AZ_BATCHAI_OUTPUT_MODEL")
375 | 
376 | 
377 | def _get_hooks(is_distributed=_DISTRIBUTED):
378 |     logger = _get_logger()
379 |     if is_distributed:
380 |         bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
381 |         logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(), hvd.size()))
382 |         return [bcast_hook]
383 |     else:
384 |         return []
385 | 
386 | 
387 | def _is_master(is_distributed=_DISTRIBUTED):
388 |     if is_distributed:
389 |         if hvd.rank() == 0:
390 |             return True
391 |         else:
392 |             return False
393 |     else:
394 |         return True
395 | 
396 | 
397 | def _log_summary(data_length, duration):
398 |     logger = _get_logger()
399 |     images_per_second = data_length / duration
400 |     logger.info("Data length:      {}".format(data_length))
401 |     logger.info("Total duration:   {:.3f}".format(duration))
402 |     logger.info("Total images/sec: {:.3f}".format(images_per_second))
403 |     logger.info(
404 |         "Batch size:       (Per GPU {}: Total {})".format(
405 |             _BATCHSIZE, hvd.size() * _BATCHSIZE if _DISTRIBUTED else _BATCHSIZE
406 |         )
407 |     )
408 |     logger.info("Distributed:      {}".format("True" if _DISTRIBUTED else "False"))
409 |     logger.info("Num GPUs:         {:.3f}".format(hvd.size() if _DISTRIBUTED else 1))
410 |     logger.info("Dataset:          {}".format("Synthetic" if _FAKE else "Imagenet"))
411 | 
412 | 
413 | def main():
414 | 
415 |     if _DISTRIBUTED:
416 |         # Horovod: initialize Horovod.
417 |         hvd.init()
418 |         logger = _get_logger()
419 |         logger.info("Runnin Distributed")
420 |     else:
421 |         logger = _get_logger()
422 | 
423 |     logger.info("Tensorflow version {}".format(tf.__version__))
424 |     if _FAKE:
425 |         train_input_fn, validation_input_fn = _create_fake_data_fn()
426 |     else:
427 |         train_input_fn, validation_input_fn = _create_data_fn(
428 |             os.getenv("AZ_BATCHAI_INPUT_TRAIN"), os.getenv("AZ_BATCHAI_INPUT_TEST")
429 |         )
430 | 
431 |     run_config = _get_runconfig()
432 |     model_dir = _get_model_dir()
433 | 
434 |     params = {"learning_rate": _LR, "classes": train_input_fn.classes}
435 |     logger.info("Creating estimator with params: {}".format(params))
436 |     model = tf.estimator.Estimator(
437 |         model_fn=model_fn, params=params, model_dir=model_dir, config=run_config
438 |     )
439 | 
440 |     hooks = _get_hooks()
441 |     num_gpus = hvd.size() if _DISTRIBUTED else 1
442 |     with Timer(output=logger.info, prefix="Training") as t:
443 |         logger.info("Training...")
444 |         model.train(
445 |             input_fn=train_input_fn,
446 |             steps=_EPOCHS * train_input_fn.length // (_BATCHSIZE * num_gpus),
447 |             hooks=hooks,
448 |         )
449 | 
450 |     _log_summary(_EPOCHS * train_input_fn.length, t.elapsed)
451 | 
452 |     if _is_master() and _FAKE is False and _VALIDATION:
453 |         with Timer(output=logger.info, prefix="Testing"):
454 |             logger.info("Testing...")
455 |             model.evaluate(input_fn=validation_input_fn)
456 | 
457 | 
458 | if __name__ == "__main__":
459 |     main()
460 | 


--------------------------------------------------------------------------------
/HorovodTF/src/resnet_model.py:
--------------------------------------------------------------------------------
  1 | """ Taken from official Tensorflow TPU spec https://github.com/tensorflow/tpu
  2 | """
  3 | 
  4 | from __future__ import absolute_import
  5 | from __future__ import division
  6 | from __future__ import print_function
  7 | 
  8 | import tensorflow as tf
  9 | 
 10 | BATCH_NORM_DECAY = 0.9
 11 | BATCH_NORM_EPSILON = 1e-5
 12 | 
 13 | 
 14 | def batch_norm_relu(inputs, is_training, relu=True, init_zero=False,
 15 |                     data_format='channels_first'):
 16 |     """Performs a batch normalization followed by a ReLU.
 17 | 
 18 |     Args:
 19 |       inputs: `Tensor` of shape `[batch, channels, ...]`.
 20 |       is_training: `bool` for whether the model is training.
 21 |       relu: `bool` if False, omits the ReLU operation.
 22 |       init_zero: `bool` if True, initializes scale parameter of batch
 23 |           normalization with 0 instead of 1 (default).
 24 |       data_format: `str` either "channels_first" for `[batch, channels, height,
 25 |           width]` or "channels_last for `[batch, height, width, channels]`.
 26 | 
 27 |     Returns:
 28 |       A normalized `Tensor` with the same `data_format`.
 29 |     """
 30 |     if init_zero:
 31 |         gamma_initializer = tf.zeros_initializer()
 32 |     else:
 33 |         gamma_initializer = tf.ones_initializer()
 34 | 
 35 |     if data_format == 'channels_first':
 36 |         axis = 1
 37 |     else:
 38 |         axis = 3
 39 | 
 40 |     inputs = tf.layers.batch_normalization(
 41 |         inputs=inputs,
 42 |         axis=axis,
 43 |         momentum=BATCH_NORM_DECAY,
 44 |         epsilon=BATCH_NORM_EPSILON,
 45 |         center=True,
 46 |         scale=True,
 47 |         training=is_training,
 48 |         fused=True,
 49 |         gamma_initializer=gamma_initializer)
 50 | 
 51 |     if relu:
 52 |         inputs = tf.nn.relu(inputs)
 53 |     return inputs
 54 | 
 55 | 
 56 | def fixed_padding(inputs, kernel_size, data_format='channels_first'):
 57 |     """Pads the input along the spatial dimensions independently of input size.
 58 | 
 59 |     Args:
 60 |       inputs: `Tensor` of size `[batch, channels, height, width]` or
 61 |           `[batch, height, width, channels]` depending on `data_format`.
 62 |       kernel_size: `int` kernel size to be used for `conv2d` or max_pool2d`
 63 |           operations. Should be a positive integer.
 64 |       data_format: `str` either "channels_first" for `[batch, channels, height,
 65 |           width]` or "channels_last for `[batch, height, width, channels]`.
 66 | 
 67 |     Returns:
 68 |       A padded `Tensor` of the same `data_format` with size either intact
 69 |       (if `kernel_size == 1`) or padded (if `kernel_size > 1`).
 70 |     """
 71 |     pad_total = kernel_size - 1
 72 |     pad_beg = pad_total // 2
 73 |     pad_end = pad_total - pad_beg
 74 |     if data_format == 'channels_first':
 75 |         padded_inputs = tf.pad(inputs, [[0, 0], [0, 0],
 76 |                                         [pad_beg, pad_end], [pad_beg, pad_end]])
 77 |     else:
 78 |         padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
 79 |                                         [pad_beg, pad_end], [0, 0]])
 80 | 
 81 |     return padded_inputs
 82 | 
 83 | 
 84 | def conv2d_fixed_padding(inputs, filters, kernel_size, strides,
 85 |                          data_format='channels_first'):
 86 |     """Strided 2-D convolution with explicit padding.
 87 | 
 88 |     The padding is consistent and is based only on `kernel_size`, not on the
 89 |     dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
 90 | 
 91 |     Args:
 92 |       inputs: `Tensor` of size `[batch, channels, height_in, width_in]`.
 93 |       filters: `int` number of filters in the convolution.
 94 |       kernel_size: `int` size of the kernel to be used in the convolution.
 95 |       strides: `int` strides of the convolution.
 96 |       data_format: `str` either "channels_first" for `[batch, channels, height,
 97 |           width]` or "channels_last for `[batch, height, width, channels]`.
 98 | 
 99 |     Returns:
100 |       A `Tensor` of shape `[batch, filters, height_out, width_out]`.
101 |     """
102 |     if strides > 1:
103 |         inputs = fixed_padding(inputs, kernel_size, data_format=data_format)
104 | 
105 |     return tf.layers.conv2d(
106 |         inputs=inputs, filters=filters, kernel_size=kernel_size, strides=strides,
107 |         padding=('SAME' if strides == 1 else 'VALID'), use_bias=False,
108 |         kernel_initializer=tf.variance_scaling_initializer(),
109 |         data_format=data_format)
110 | 
111 | 
112 | def residual_block(inputs, filters, is_training, strides,
113 |                    use_projection=False, data_format='channels_first'):
114 |     """Standard building block for residual networks with BN after convolutions.
115 | 
116 |     Args:
117 |       inputs: `Tensor` of size `[batch, channels, height, width]`.
118 |       filters: `int` number of filters for the first two convolutions. Note that
119 |           the third and final convolution will use 4 times as many filters.
120 |       is_training: `bool` for whether the model is in training.
121 |       strides: `int` block stride. If greater than 1, this block will ultimately
122 |           downsample the input.
123 |       use_projection: `bool` for whether this block should use a projection
124 |           shortcut (versus the default identity shortcut). This is usually `True`
125 |           for the first block of a block group, which may change the number of
126 |           filters and the resolution.
127 |       data_format: `str` either "channels_first" for `[batch, channels, height,
128 |           width]` or "channels_last for `[batch, height, width, channels]`.
129 | 
130 |     Returns:
131 |       The output `Tensor` of the block.
132 |     """
133 |     shortcut = inputs
134 |     if use_projection:
135 |         # Projection shortcut in first layer to match filters and strides
136 |         shortcut = conv2d_fixed_padding(
137 |             inputs=inputs, filters=filters, kernel_size=1, strides=strides,
138 |             data_format=data_format)
139 |         shortcut = batch_norm_relu(shortcut, is_training, relu=False,
140 |                                    data_format=data_format)
141 | 
142 |     inputs = conv2d_fixed_padding(
143 |         inputs=inputs, filters=filters, kernel_size=3, strides=strides,
144 |         data_format=data_format)
145 |     inputs = batch_norm_relu(inputs, is_training, data_format=data_format)
146 | 
147 |     inputs = conv2d_fixed_padding(
148 |         inputs=inputs, filters=filters, kernel_size=3, strides=1,
149 |         data_format=data_format)
150 |     inputs = batch_norm_relu(inputs, is_training, relu=False, init_zero=True,
151 |                              data_format=data_format)
152 | 
153 |     return tf.nn.relu(inputs + shortcut)
154 | 
155 | 
156 | def bottleneck_block(inputs, filters, is_training, strides,
157 |                      use_projection=False, data_format='channels_first'):
158 |     """Bottleneck block variant for residual networks with BN after convolutions.
159 | 
160 |     Args:
161 |       inputs: `Tensor` of size `[batch, channels, height, width]`.
162 |       filters: `int` number of filters for the first two convolutions. Note that
163 |           the third and final convolution will use 4 times as many filters.
164 |       is_training: `bool` for whether the model is in training.
165 |       strides: `int` block stride. If greater than 1, this block will ultimately
166 |           downsample the input.
167 |       use_projection: `bool` for whether this block should use a projection
168 |           shortcut (versus the default identity shortcut). This is usually `True`
169 |           for the first block of a block group, which may change the number of
170 |           filters and the resolution.
171 |       data_format: `str` either "channels_first" for `[batch, channels, height,
172 |           width]` or "channels_last for `[batch, height, width, channels]`.
173 | 
174 |     Returns:
175 |       The output `Tensor` of the block.
176 |     """
177 |     shortcut = inputs
178 |     if use_projection:
179 |         # Projection shortcut only in first block within a group. Bottleneck blocks
180 |         # end with 4 times the number of filters.
181 |         filters_out = 4 * filters
182 |         shortcut = conv2d_fixed_padding(
183 |             inputs=inputs, filters=filters_out, kernel_size=1, strides=strides,
184 |             data_format=data_format)
185 |         shortcut = batch_norm_relu(shortcut, is_training, relu=False,
186 |                                    data_format=data_format)
187 | 
188 |     inputs = conv2d_fixed_padding(
189 |         inputs=inputs, filters=filters, kernel_size=1, strides=1,
190 |         data_format=data_format)
191 |     inputs = batch_norm_relu(inputs, is_training, data_format=data_format)
192 | 
193 |     inputs = conv2d_fixed_padding(
194 |         inputs=inputs, filters=filters, kernel_size=3, strides=strides,
195 |         data_format=data_format)
196 |     inputs = batch_norm_relu(inputs, is_training, data_format=data_format)
197 | 
198 |     inputs = conv2d_fixed_padding(
199 |         inputs=inputs, filters=4 * filters, kernel_size=1, strides=1,
200 |         data_format=data_format)
201 |     inputs = batch_norm_relu(inputs, is_training, relu=False, init_zero=True,
202 |                              data_format=data_format)
203 | 
204 |     return tf.nn.relu(inputs + shortcut)
205 | 
206 | 
207 | def block_group(inputs, filters, block_fn, blocks, strides, is_training, name,
208 |                 data_format='channels_first'):
209 |     """Creates one group of blocks for the ResNet model.
210 | 
211 |     Args:
212 |       inputs: `Tensor` of size `[batch, channels, height, width]`.
213 |       filters: `int` number of filters for the first convolution of the layer.
214 |       block_fn: `function` for the block to use within the model
215 |       blocks: `int` number of blocks contained in the layer.
216 |       strides: `int` stride to use for the first convolution of the layer. If
217 |           greater than 1, this layer will downsample the input.
218 |       is_training: `bool` for whether the model is training.
219 |       name: `str`name for the Tensor output of the block layer.
220 |       data_format: `str` either "channels_first" for `[batch, channels, height,
221 |           width]` or "channels_last for `[batch, height, width, channels]`.
222 | 
223 |     Returns:
224 |       The output `Tensor` of the block layer.
225 |     """
226 |     # Only the first block per block_group uses projection shortcut and strides.
227 |     inputs = block_fn(inputs, filters, is_training, strides,
228 |                       use_projection=True, data_format=data_format)
229 | 
230 |     for _ in range(1, blocks):
231 |         inputs = block_fn(inputs, filters, is_training, 1,
232 |                           data_format=data_format)
233 | 
234 |     return tf.identity(inputs, name)
235 | 
236 | 
237 | def resnet_v1_generator(block_fn, layers, num_classes,
238 |                         data_format='channels_first'):
239 |     """Generator for ResNet v1 models.
240 | 
241 |     Args:
242 |       block_fn: `function` for the block to use within the model. Either
243 |           `residual_block` or `bottleneck_block`.
244 |       layers: list of 4 `int`s denoting the number of blocks to include in each
245 |         of the 4 block groups. Each group consists of blocks that take inputs of
246 |         the same resolution.
247 |       num_classes: `int` number of possible classes for image classification.
248 |       data_format: `str` either "channels_first" for `[batch, channels, height,
249 |           width]` or "channels_last for `[batch, height, width, channels]`.
250 | 
251 |     Returns:
252 |       Model `function` that takes in `inputs` and `is_training` and returns the
253 |       output `Tensor` of the ResNet model.
254 |     """
255 | 
256 |     def model(inputs, is_training):
257 |         """Creation of the model graph."""
258 |         inputs = conv2d_fixed_padding(
259 |             inputs=inputs, filters=64, kernel_size=7, strides=2,
260 |             data_format=data_format)
261 |         inputs = tf.identity(inputs, 'initial_conv')
262 |         inputs = batch_norm_relu(inputs, is_training, data_format=data_format)
263 | 
264 |         inputs = tf.layers.max_pooling2d(
265 |             inputs=inputs, pool_size=3, strides=2, padding='SAME',
266 |             data_format=data_format)
267 |         inputs = tf.identity(inputs, 'initial_max_pool')
268 | 
269 |         inputs = block_group(
270 |             inputs=inputs, filters=64, block_fn=block_fn, blocks=layers[0],
271 |             strides=1, is_training=is_training, name='block_group1',
272 |             data_format=data_format)
273 |         inputs = block_group(
274 |             inputs=inputs, filters=128, block_fn=block_fn, blocks=layers[1],
275 |             strides=2, is_training=is_training, name='block_group2',
276 |             data_format=data_format)
277 |         inputs = block_group(
278 |             inputs=inputs, filters=256, block_fn=block_fn, blocks=layers[2],
279 |             strides=2, is_training=is_training, name='block_group3',
280 |             data_format=data_format)
281 |         inputs = block_group(
282 |             inputs=inputs, filters=512, block_fn=block_fn, blocks=layers[3],
283 |             strides=2, is_training=is_training, name='block_group4',
284 |             data_format=data_format)
285 | 
286 |         # The activation is 7x7 so this is a global average pool.
287 |         inputs = tf.layers.average_pooling2d(
288 |             inputs=inputs, pool_size=7, strides=1, padding='VALID',
289 |             data_format=data_format)
290 |         inputs = tf.identity(inputs, 'final_avg_pool')
291 |         inputs = tf.reshape(
292 |             inputs, [-1, 2048 if block_fn is bottleneck_block else 512])
293 |         inputs = tf.layers.dense(
294 |             inputs=inputs,
295 |             units=num_classes,
296 |             kernel_initializer=tf.random_normal_initializer(stddev=.01))
297 |         inputs = tf.identity(inputs, 'final_dense')
298 |         return inputs
299 | 
300 |     model.default_image_size = 224
301 |     return model
302 | 
303 | 
304 | def resnet_v1(resnet_depth, num_classes, data_format='channels_first'):
305 |     """Returns the ResNet model for a given size and number of output classes."""
306 |     model_params = {
307 |         18: {'block': residual_block, 'layers': [2, 2, 2, 2]},
308 |         34: {'block': residual_block, 'layers': [3, 4, 6, 3]},
309 |         50: {'block': bottleneck_block, 'layers': [3, 4, 6, 3]},
310 |         101: {'block': bottleneck_block, 'layers': [3, 4, 23, 3]},
311 |         152: {'block': bottleneck_block, 'layers': [3, 8, 36, 3]},
312 |         200: {'block': bottleneck_block, 'layers': [3, 24, 36, 3]}
313 |     }
314 | 
315 |     if resnet_depth not in model_params:
316 |         raise ValueError('Not a valid resnet_depth:', resnet_depth)
317 | 
318 |     params = model_params[resnet_depth]
319 |     return resnet_v1_generator(
320 |         params['block'], params['layers'], num_classes, data_format)
321 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |     MIT License
 2 | 
 3 |     Copyright (c) Microsoft Corporation. All rights reserved.
 4 | 
 5 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     of this software and associated documentation files (the "Software"), to deal
 7 |     in the Software without restriction, including without limitation the rights
 8 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     copies of the Software, and to permit persons to whom the Software is
10 |     furnished to do so, subject to the following conditions:
11 | 
12 |     The above copyright notice and this permission notice shall be included in all
13 |     copies or substantial portions of the Software.
14 | 
15 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     SOFTWARE
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | define PROJECT_HELP_MSG
 2 | Usage:
 3 |     make help                  show this message
 4 |     make build                 build docker image
 5 |     make push                  push container
 6 |     make run                   run benchmarking container
 7 |     make jupyter               run jupyter notebook inside container
 8 | endef
 9 | export PROJECT_HELP_MSG
10 | PWD:=$(shell pwd)
11 | dockerhub:=
12 | data:=
13 | image_name:=$(dockerhub)/distributed-training-control
14 | 
15 | help:
16 | 	echo "$$PROJECT_HELP_MSG" | less
17 | 
18 | build:
19 | 	docker build -t $(image_name) Docker
20 | 
21 | jupyter:
22 | 	docker run -p 9999:9999 \
23 | 	           -e EXT_PWD=$(PWD) \
24 | 	           -e EXT_DATA=$(data) \
25 | 	           -e DOCKER_REPOSITORY=$(dockerhub) \
26 | 	           -v $(PWD):/workspace \
27 | 	           -v $(data):/data \
28 | 	           -v /var/run/docker.sock:/var/run/docker.sock \
29 | 	           -it $(image_name) bash -c "jupyter notebook"
30 | 
31 | run:
32 | 	docker run -p 9999:9999 \
33 | 	           -e EXT_PWD=$(PWD) \
34 | 	           -e EXT_DATA=$(data) \
35 | 	           -e DOCKER_REPOSITORY=$(dockerhub) \
36 | 	           -v $(PWD):/workspace \
37 | 	           -v $(data):/data \
38 | 	           -v /var/run/docker.sock:/var/run/docker.sock \
39 | 	           -it $(image_name)
40 | 
41 | push:
42 | 	docker push $(image_name)
43 | 
44 | 
45 | 
46 | .PHONY: help build push
47 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Training Distributed Training on Batch AI
 2 | 
 3 | This repo is a tutorial on how to train a CNN model in a distributed fashion using Batch AI. 
 4 | The scenario covered is image classification, but the solution can be generalized for other deep learning scenarios such as segmentation and object detection. 
 5 | 
 6 | ![Distributed training diagram](images/dist_training_diag2.png "Distributed training diagram")
 7 | 
 8 | Image classification is a common task in computer vision applications and is often tackled by training a convolutional neural network (CNN). 
 9 | For particularly large models with large datasets, the training process can take weeks or months on a single GPU. 
10 | In some situations, the models are so large that it isn’t possible to fit reasonable batch sizes onto the GPU. 
11 | Using distributed training in these situations helps shorten the training time. 
12 | In this specific scenario, a ResNet50 CNN model is trained using Horovod on the ImageNet dataset as well as on synthetic data. 
13 | The tutorial demonstrates how to accomplish this using three of the most popular deep learning frameworks: TensorFlow, Keras, and PyTorch.
14 | There are number of ways to train a deep learning model in a distributed fashion, including data parallel and model parallel approaches based on synchronous and asynchronous updates. 
15 | Currently the most common scenario is data parallel with synchronous updates—it’s the easiest to implement and sufficient for the majority of use cases. 
16 | In data parallel distributed training with synchronous updates the model is replicated across N hardware devices and a 
17 | mini-batch of training samples is divided into N micro-batches (see Figure 2). 
18 | Each device performs the forward and backward pass for a micro-batch and when it finishes the process it shares the 
19 | updates with the other devices. These are then used to calculate the updated weights of the entire mini-batch and then the 
20 | weights are synchronized across the models. This is the scenario that is covered in the GitHub repository. The same architecture though can 
21 | be used for model parallel and asynchronous updates.
22 | 
23 | 
24 | ## Prerequisites
25 | * Computer with Nvidia GPU (The path was tested on an [Azure NC12 Ubuntu DSVM](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu))
26 | * Linux 
27 | * [Docker](https://docs.docker.com/install/linux/docker-ce/ubuntu/) installed
28 | * [Nvidia Docker runtime](https://github.com/NVIDIA/nvidia-container-runtime) installed
29 | * [Dockerhub](https://hub.docker.com/) account
30 | * Port 9999 open on the VM or computer
31 | * ImageNet dataset (look at [this](00_DataProcessing.ipynb) notebook for details)
32 | 
33 | ## Setup 
34 | Before you begin make sure you are logged into your dockerhub account by running on your machine:
35 | 
36 | ```bash
37 | docker login 
38 | ```
39 | 
40 | 
41 | 
42 | ### Setup Execution Environment
43 | Before being able to run anything you will need to set up the environment in which you will be executing the Batch AI commands etc. 
44 | There are a number of dependencies therefore we offer a dockerfile that will take care of these dependencies for you. 
45 | If you don't want to use Docker simply look inside the Docker directory at the dockerfile and environment.yml file for the dependencies. 
46 | To build the container run(replace all instances of <dockerhub account> with your own dockerhub account name):
47 | 
48 | ```bash
49 | make build dockerhub=<dockerhub account>
50 | ```
51 | 
52 | The you run the command to start the environment (replace <data_location> with a location on your file system. Make sure it has at least 300GB of free space for the ImageNet dataset)
53 | ```bash
54 | make jupyter dockerhub=<dockerhub account> data=<data_location>
55 | ```
56 | 
57 | This will start the Jupyter notebook on port 9999. Simply point your browser to the IP or DNS of your machine. 
58 | From there you can navigate to [00_DataProcessing.ipynb](00_DataProcessing.ipynb) to process the ImageNet Data.
59 | 
60 | Once you have covered the two prerequisite notebooks folders [00_DataProcessing.ipynb](00_DataProcessing.ipynb) and [01_CreateResources.ipynb](01_CreateResources.ipynb)  you can 
61 | navigate to the tutorials for each of the frameworks [HorovodTF](HorovodTF), [HorovodPytorch](HorovodPytorch) and [HorovodKeras](HorovodKeras).
62 | 
63 | 
64 | 
65 | # Contributing
66 | 
67 | This project welcomes contributions and suggestions.  Most contributions require you to agree to a
68 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
69 | the rights to use your contribution. For details, visit https://cla.microsoft.com.
70 | 
71 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
72 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
73 | provided by the bot. You will only need to do this once across all repos using our CLA.
74 | 
75 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
76 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
77 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
78 | 


--------------------------------------------------------------------------------
/common/timer.py:
--------------------------------------------------------------------------------
  1 | import collections
  2 | import functools
  3 | import logging
  4 | from timeit import default_timer
  5 | 
  6 | 
  7 | class Timer(object):
  8 | 
  9 |     """
 10 | 
 11 |     Keyword arguments:
 12 |         output:   if True, print output after exiting context.
 13 |                   if callable, pass output to callable.
 14 |         format:   str.format string to be used for output; default "took {} seconds"
 15 |         prefix:   string to prepend (plus a space) to output
 16 |                   For convenience, if you only specify this, output defaults to True.
 17 |     """
 18 | 
 19 |     def __init__(self,
 20 |                  timer=default_timer,
 21 |                  factor=1,
 22 |                  output=None,
 23 |                  fmt="took {:.3f} seconds",
 24 |                  prefix=""):
 25 |         self._timer = timer
 26 |         self._factor = factor
 27 |         self._output = output
 28 |         self._fmt = fmt
 29 |         self._prefix = prefix
 30 |         self._end = None
 31 |         self._start = None
 32 | 
 33 |     def start(self):
 34 |         self._start = self()
 35 | 
 36 |     def stop(self):
 37 |         self._end = self()
 38 | 
 39 |     def __call__(self):
 40 |         """ Return the current time """
 41 |         return self._timer()
 42 | 
 43 |     def __enter__(self):
 44 |         """ Set the start time """
 45 |         self.start()
 46 |         return self
 47 | 
 48 |     def __exit__(self, exc_type, exc_value, exc_traceback):
 49 |         """ Set the end time """
 50 |         self.stop()
 51 | 
 52 |         if self._output is True or (self._output is None and self._prefix):
 53 |             self._output = print
 54 | 
 55 |         if callable(self._output):
 56 |             output = " ".join([self._prefix, self._fmt.format(self.elapsed)])
 57 |             self._output(output)
 58 | 
 59 |     def __str__(self):
 60 |         return '%.3f' % (self.elapsed)
 61 | 
 62 |     @property
 63 |     def elapsed(self):
 64 |         """ Return the elapsed time
 65 |         """
 66 |         if self._end is None:
 67 |             # if elapsed is called in the context manager scope
 68 |             return (self() - self._start) * self._factor
 69 |         else:
 70 |             # if elapsed is called out of the context manager scope
 71 |             return (self._end - self._start) * self._factor
 72 | 
 73 | 
 74 | def timer(logger=None,
 75 |           level=logging.INFO,
 76 |           fmt="function %(function_name)s execution time: %(execution_time).3f",
 77 |           *func_or_func_args,
 78 |           **timer_kwargs):
 79 |     """ Function decorator displaying the function execution time
 80 |     """
 81 |     def wrapped_f(f):
 82 |         @functools.wraps(f)
 83 |         def wrapped(*args, **kwargs):
 84 |             with Timer(**timer_kwargs) as t:
 85 |                 out = f(*args, **kwargs)
 86 |             context = {
 87 |                 'function_name': f.__name__,
 88 |                 'execution_time': t.elapsed,
 89 |             }
 90 |             if logger:
 91 |                 logger.log(
 92 |                     level,
 93 |                     fmt % context,
 94 |                     extra=context)
 95 |             else:
 96 |                 print(fmt % context)
 97 |             return out
 98 | 
 99 |         return wrapped
100 | 
101 |     if (len(func_or_func_args) == 1
102 |             and isinstance(func_or_func_args[0], collections.Callable)):
103 |         return wrapped_f(func_or_func_args[0])
104 |     else:
105 |         return wrapped_f
106 | 


--------------------------------------------------------------------------------
/common/utils.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | from dotenv import dotenv_values, set_key, find_dotenv, get_key
 4 | from getpass import getpass
 5 | 
 6 | 
 7 | def _create_env(dotenv_path):
 8 |     with open(dotenv_path, 'a'):
 9 |         os.utime(dotenv_path)
10 | 
11 | 
12 | def dotenv_for():
13 |     dotenv_path = find_dotenv()
14 |     if dotenv_path == '':
15 |         dotenv_path = '.env'
16 |         _create_env(dotenv_path)
17 |     return dotenv_path
18 | 
19 | 
20 | def get_password(dotenv_path):
21 |     if 'PASSWORD' not in dotenv_values(dotenv_path=dotenv_path):
22 |         print('Password not set')
23 |         password = getpass('Please enter password to use for the cluster')
24 |         _ = set_key(dotenv_path, 'PASSWORD', password)
25 |     return get_key(dotenv_path, 'PASSWORD')
26 | 
27 | 
28 | def write_json_to_file(json_dict, filename, mode='w'):
29 |     with open(filename, mode) as outfile:
30 |         json.dump(json_dict, outfile, indent=4, sort_keys=True)
31 |         outfile.write('\n\n')
32 | 


--------------------------------------------------------------------------------
/images/dist_training_diag2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/DistributedDeepLearning/d037c568bbd4394fbf2f668937d32122ae5a1a37/images/dist_training_diag2.png


--------------------------------------------------------------------------------
/include/build.mk:
--------------------------------------------------------------------------------
 1 | define PROJECT_HELP_MSG
 2 | Usage:
 3 |     make help                  show this message
 4 |     make build                 make image
 5 |     make push					push image
 6 | endef
 7 | export PROJECT_HELP_MSG
 8 | 
 9 | help:
10 | 	echo "$$PROJECT_HELP_MSG" | less
11 | 
12 | build:
13 | 	docker build -t $(image) $(dockerpath)
14 | 
15 | push:
16 | 	docker push $(image)
17 | 
18 | 
19 | .PHONY: help build push
20 | 


--------------------------------------------------------------------------------