├── .gitignore ├── 00_DataProcessing.ipynb ├── 01_CreateResources.ipynb ├── Docker ├── dockerfile ├── environment.yml └── jupyter_notebook_config.py ├── HorovodKeras ├── 00_CreateImageAndTest.ipynb ├── 01_TrainKerasModel.ipynb ├── Docker │ └── Dockerfile └── src │ ├── data_generator.py │ └── imagenet_keras_horovod.py ├── HorovodPytorch ├── 00_CreateImageAndTest.ipynb ├── 01_TrainPyTorchModel.ipynb ├── Docker │ └── Dockerfile ├── cluster_config │ ├── cluster.json │ ├── docker.service │ └── nodeprep.sh └── src │ └── imagenet_pytorch_horovod.py ├── HorovodTF ├── 00_CreateImageAndTest.ipynb ├── 01_TrainTensorflowModel.ipynb ├── Docker │ └── Dockerfile └── src │ ├── imagenet_estimator_tf_horovod.py │ └── resnet_model.py ├── LICENSE ├── Makefile ├── README.md ├── common ├── timer.py └── utils.py ├── images └── dist_training_diag2.png ├── include └── build.mk └── valprep.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /00_DataProcessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": "# Data Processing\nIn this notebook we convert the ImageNet data to the appropriate format so that we can use it for training.\n\nThe dataset has many versions, the one commonly used for image classification is ILSVRC 2012. Go to the [download page](http://www.image-net.org/download-images) (you may need to register an account), and find the page for ILSVRC2012. 
You will need to download two files ILSVRC2012_img_train.tar and ILSVRC2012_img_val.tar" 7 | }, 8 | { 9 | "cell_type": "code", 10 | "execution_count": null, 11 | "metadata": {}, 12 | "outputs": [], 13 | "source": "from pathlib import Path" 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": "DATA=Path(\"/data\")" 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": "!mkdir -p {DATA/\"train\"}\n!tar -C {DATA/\"train\"} -xf {DATA/\"ILSVRC2012_img_train.tar\"}" 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": "import tarfile\nfrom tqdm import tqdm_notebook\nimport os" 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": "filenames = list((DATA/\"train\").glob(\"*.tar\"))\npbar = tqdm_notebook(total=len(filenames))\nfor class_tar in filenames:\n pbar.set_description('Extracting '+class_tar.name+ '...')\n class_dir = os.path.splitext(class_tar)[0]\n os.mkdir(class_dir)\n with tarfile.open(class_tar) as f:\n f.extractall(class_dir)\n os.remove(class_tar)\n pbar.update(1)" 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": "!rm -r {DATA/\"validation\"}" 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": "!mkdir -p {DATA/\"validation\"}\n!tar -C {DATA/\"validation\"} -xf {DATA/\"ILSVRC2012_img_val.tar\"}" 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": "The validation data comes without labels so we need to run a script to assign the images to the appropriate classes." 
61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": "validation_path = DATA/\"validation\"\nvalidation_preparation_script = Path(os.getcwd())/\"valprep.sh\"" 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": "!bash -c \"cd {validation_path} && {validation_preparation_script}\"" 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": "Finally we package the processed directories so that we can upload them quicker." 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": "!cd {DATA} && tar -czvf train.tar.gz train" 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": "!cd {DATA} && tar -czvf validation.tar.gz validation" 94 | } 95 | ], 96 | "metadata": { 97 | "jupytext": { 98 | "text_representation": { 99 | "extension": ".py", 100 | "format_name": "light", 101 | "format_version": "1.3", 102 | "jupytext_version": "0.8.6" 103 | } 104 | }, 105 | "kernelspec": { 106 | "display_name": "Python 3", 107 | "language": "python", 108 | "name": "python3" 109 | } 110 | }, 111 | "nbformat": 4, 112 | "nbformat_minor": 2 113 | } 114 | -------------------------------------------------------------------------------- /01_CreateResources.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": "# Create Azure and Batch AI Resources\nIn this notebook we will create the necessary resources to train a ResNet50 model([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the ImageNet dataset. If you plan on using fake data then the sections marked optional can be skipped. 
This notebook will take you through the following steps:\n * [Create Azure Resources](#azure_resources)\n * [Create Fileserver(NFS)](#create_fileshare)\n * [Upload Data to Blob (Optional)](#upload_data)\n * [Configure Batch AI Cluster](#configure_cluster)" 7 | }, 8 | { 9 | "cell_type": "code", 10 | "execution_count": null, 11 | "metadata": {}, 12 | "outputs": [], 13 | "source": "import sys\nsys.path.append(\"common\") \n\nfrom dotenv import set_key\nimport os\nimport json\nfrom utils import get_password, dotenv_for\nfrom pathlib import Path" 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": "Below are the variables that describe our experiment. By default we are using the NC24rs_v3 (Standard_NC24rs_v3) VMs which have V100 GPUs and Infiniband. By default we are using 2 nodes with each node having 4 GPUs, this equates to 8 GPUs. Feel free to increase the number of nodes but be aware what limitations your subscription may have.\n\nSet the USE_FAKE to True if you want to use fake data rather than the Imagenet dataset. This is often a good way to debug your models as well as checking what IO overhead is." 
19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "tags": [ 25 | "parameters" 26 | ] 27 | }, 28 | "outputs": [], 29 | "source": "# Variables for Batch AI - change as necessary\nID = \"dtdemo\"\nGROUP_NAME = f\"batch{ID}rg\"\nSTORAGE_ACCOUNT_NAME = f\"batch{ID}st\"\nFILE_SHARE_NAME = f\"batch{ID}share\"\nSELECTED_SUBSCRIPTION = \"\"\nWORKSPACE = \"workspace\"\nNUM_NODES = 2\nCLUSTER_NAME = \"msv100\"\nVM_SIZE = \"Standard_NC24rs_v3\"\nGPU_TYPE = \"V100\"\nPROCESSES_PER_NODE = 4\nLOCATION = \"eastus\"\nNFS_NAME = f\"batch{ID}nfs\"\nUSERNAME = \"batchai_user\"\nUSE_FAKE = False\nDOCKERHUB = os.getenv('DOCKER_REPOSITORY', \"masalvar\")\nDATA = Path(\"/data\")\nCONTAINER_NAME = f\"batch{ID}container\"\nDOCKER_PWD = \"\"\n\ndotenv_path = dotenv_for()\nset_key(dotenv_path, 'DOCKER_PWD', DOCKER_PWD)\nset_key(dotenv_path, 'GROUP_NAME', GROUP_NAME)\nset_key(dotenv_path, 'FILE_SHARE_NAME', FILE_SHARE_NAME)\nset_key(dotenv_path, 'WORKSPACE', WORKSPACE)\nset_key(dotenv_path, 'NUM_NODES', str(NUM_NODES))\nset_key(dotenv_path, 'CLUSTER_NAME', CLUSTER_NAME)\nset_key(dotenv_path, 'GPU_TYPE', GPU_TYPE)\nset_key(dotenv_path, 'PROCESSES_PER_NODE', str(PROCESSES_PER_NODE))\nset_key(dotenv_path, 'STORAGE_ACCOUNT_NAME', STORAGE_ACCOUNT_NAME)" 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": "\n## Create Azure Resources\nFirst we need to log in to our Azure account. " 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "tags": [ 41 | "stripout" 42 | ] 43 | }, 44 | "outputs": [], 45 | "source": "!az login -o table" 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": "If you have more than one Azure account you will need to select it with the command below. If you only have one account you can skip this step." 
51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": "!az account set --subscription \"$SELECTED_SUBSCRIPTION\"" 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "tags": [ 64 | "stripout" 65 | ] 66 | }, 67 | "outputs": [], 68 | "source": "!az account list -o table" 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": "Next we create the group that will hold all our Azure resources." 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": "!az group create -n $GROUP_NAME -l $LOCATION -o table" 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": "We will create the storage account that will store our fileshare where all the outputs from the jobs will be stored." 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": "json_data = !az storage account create -l $LOCATION -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME --sku Standard_LRS\nprint('Storage account {} provisioning state: {}'.format(STORAGE_ACCOUNT_NAME, \n json.loads(''.join(json_data))['provisioningState']))" 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": "json_data = !az storage account keys list -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME\nstorage_account_key = json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['value']" 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": "!az storage share create --account-name $STORAGE_ACCOUNT_NAME \\\n--account-key $storage_account_key --name $FILE_SHARE_NAME" 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | 
"source": "!az storage directory create --share-name $FILE_SHARE_NAME --name scripts \\\n--account-name $STORAGE_ACCOUNT_NAME --account-key $storage_account_key" 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": "Here we are setting some defaults so we don't have to keep adding them to every command" 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": "!az configure --defaults location=$LOCATION\n!az configure --defaults group=$GROUP_NAME" 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": { 131 | "tags": [ 132 | "stripout" 133 | ] 134 | }, 135 | "outputs": [], 136 | "source": "%env AZURE_STORAGE_ACCOUNT $STORAGE_ACCOUNT_NAME\n%env AZURE_STORAGE_KEY=$storage_account_key" 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": "#### Create Workspace\nBatch AI has the concept of workspaces and experiments. Below we will create the workspace for our work." 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "tags": [ 148 | "stripout" 149 | ] 150 | }, 151 | "outputs": [], 152 | "source": "!az batchai workspace create -n $WORKSPACE -g $GROUP_NAME" 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": "\n## Upload Data to Blob (Optional)\nIn this section we will create a blob container and upload the imagenet data we prepared locally in the previous notebook.\n\n**You only need to run this section if you want to use real data. 
If USE_FAKE is set to False the commands below won't be executed.**\n" 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": "if USE_FAKE is False:\n !az storage container create --account-name {STORAGE_ACCOUNT_NAME} \\\n --account-key {storage_account_key} \\\n --name {CONTAINER_NAME}" 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "tags": [ 171 | "stripout" 172 | ] 173 | }, 174 | "outputs": [], 175 | "source": "if USE_FAKE is False:\n # Should take about 20 minutes\n !azcopy --source {DATA/\"train.tar.gz\"} \\\n --destination https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/train.tar.gz \\\n --dest-key {storage_account_key} --quiet" 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "tags": [ 182 | "stripout" 183 | ] 184 | }, 185 | "outputs": [], 186 | "source": "if USE_FAKE is False:\n !azcopy --source {DATA/\"validation.tar.gz\"} \\\n --destination https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/validation.tar.gz \\\n --dest-key {storage_account_key} --quiet" 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": "\n## Create Fileserver\nIn this example we will store the data on an NFS fileshare. It is possible to use many storage solutions with Batch AI. NFS offers the best tradeoff between performance and ease of use. The best performance is achieved by loading the data locally but this can be cumbersome since it requires that the data is download by the all the nodes which with the ImageNet dataset can take hours. If you are using fake data we won't be using the fileserver but we will create one so that if you want to run the real ImageNet data later the server is ready." 
192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "tags": [ 198 | "stripout" 199 | ] 200 | }, 201 | "outputs": [], 202 | "source": "!az batchai file-server create -n $NFS_NAME --disk-count 4 --disk-size 250 -w $WORKSPACE \\\n-s Standard_DS4_v2 -u $USERNAME -p {get_password(dotenv_for())} -g $GROUP_NAME --storage-sku Premium_LRS" 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": "!az batchai file-server list -o table -w $WORKSPACE -g $GROUP_NAME" 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": "json_data = !az batchai file-server list -w $WORKSPACE -g $GROUP_NAME\nnfs_ip=json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['mountSettings']['fileServerPublicIp']" 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": "After we have created the NFS share we need to copy the data to it. To do this we write the script below which will be executed on the fileserver. It installs a tool called azcopy and then downloads and extracts the data to the appropriate directory." 
222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": "nodeprep_script = f\"\"\"\n#!/usr/bin/env bash\nwget https://gist.githubusercontent.com/msalvaris/073c28a9993d58498957294d20d74202/raw/87a78275879f7c9bb8d6fb9de8a2d2996bb66c24/install_azcopy\nchmod 777 install_azcopy\nsudo ./install_azcopy\n\nmkdir -p /data/imagenet\n\nazcopy --source https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/validation.tar.gz \\\n --destination /data/imagenet/validation.tar.gz\\\n --source-key {storage_account_key}\\\n --quiet\n\n\nazcopy --source https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/train.tar.gz \\\n --destination /data/imagenet/train.tar.gz\\\n --source-key {storage_account_key}\\\n --quiet\n\ncd /data/imagenet\ntar -xzf train.tar.gz\ntar -xzf validation.tar.gz\n\"\"\"" 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": "with open('nodeprep.sh', 'w') as f:\n f.write(nodeprep_script)" 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": { 240 | "lines_to_next_cell": 2 241 | }, 242 | "source": "Next we will copy the file over and run it on the NFS VM. 
This will install azcopy and download and prepare the data" 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": "if USE_FAKE:\n raise Warning(\"You should not be running this section if you simply want to use fake data\")" 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": { 255 | "tags": [ 256 | "stripout" 257 | ] 258 | }, 259 | "outputs": [], 260 | "source": "if USE_FAKE is False:\n !sshpass -p {get_password(dotenv_for())} scp -o \"StrictHostKeyChecking=no\" nodeprep.sh $USERNAME@{nfs_ip}:~/" 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": { 266 | "tags": [ 267 | "stripout" 268 | ] 269 | }, 270 | "outputs": [], 271 | "source": "if USE_FAKE is False:\n !sshpass -p {get_password(dotenv_for())} ssh -o \"StrictHostKeyChecking=no\" $USERNAME@{nfs_ip} \"sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh\"" 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": "\n## Configure Batch AI Cluster\nWe then upload the scripts we wish to execute onto the fileshare. The fileshare will later be mounted by Batch AI. An alternative to uploading the scripts would be to embed them inside the Docker image." 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": "!az storage file upload --share-name $FILE_SHARE_NAME --source HorovodPytorch/cluster_config/docker.service --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source HorovodPytorch/cluster_config/nodeprep.sh --path scripts" 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": "Below is the command to create the cluster. 
" 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": { 294 | "tags": [ 295 | "stripout" 296 | ] 297 | }, 298 | "outputs": [], 299 | "source": "!az batchai cluster create \\\n -w $WORKSPACE \\\n --name $CLUSTER_NAME \\\n --image UbuntuLTS \\\n --vm-size $VM_SIZE \\\n --min $NUM_NODES --max $NUM_NODES \\\n --afs-name $FILE_SHARE_NAME \\\n --afs-mount-path extfs \\\n --user-name $USERNAME \\\n --password {get_password(dotenv_for())} \\\n --storage-account-name $STORAGE_ACCOUNT_NAME \\\n --storage-account-key $storage_account_key \\\n --nfs $NFS_NAME \\\n --nfs-mount-path nfs \\\n --config-file HorovodPytorch/cluster_config/cluster.json" 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": "Let's check that the cluster was created successfully." 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": { 310 | "tags": [ 311 | "stripout" 312 | ] 313 | }, 314 | "outputs": [], 315 | "source": "!az batchai cluster show -n $CLUSTER_NAME -w $WORKSPACE" 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": "!az batchai cluster list -w $WORKSPACE -o table" 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": "!az batchai cluster node list -c $CLUSTER_NAME -w $WORKSPACE -o table" 330 | } 331 | ], 332 | "metadata": { 333 | "jupytext": { 334 | "text_representation": { 335 | "extension": ".py", 336 | "format_name": "light", 337 | "format_version": "1.3", 338 | "jupytext_version": "0.8.6" 339 | } 340 | }, 341 | "kernelspec": { 342 | "display_name": "Python 3", 343 | "language": "python", 344 | "name": "python3" 345 | } 346 | }, 347 | "nbformat": 4, 348 | "nbformat_minor": 2 349 | } 350 | -------------------------------------------------------------------------------- /Docker/dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | COPY environment.yml . 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | build-essential \ 7 | ca-certificates \ 8 | cmake \ 9 | wget \ 10 | curl \ 11 | gfortran \ 12 | apt-transport-https \ 13 | jq \ 14 | locales \ 15 | git \ 16 | sshpass \ 17 | openssh-client \ 18 | software-properties-common && \ 19 | rm -rf /var/lib/apt/lists/* 20 | 21 | RUN locale-gen en_US.UTF-8 22 | ENV LANG en_US.UTF-8 23 | ENV LANGUAGE en_US:en 24 | ENV LC_ALL en_US.UTF-8 25 | 26 | # Install Docker 27 | RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - && \ 28 | apt-key fingerprint 0EBFCD88 && \ 29 | add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ 30 | $(lsb_release -cs) \ 31 | stable" &&\ 32 | apt-get update && apt-get install -y --no-install-recommends docker-ce 33 | 34 | ENV ENV_NAME=py3.6 35 | RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 36 | chmod +x ~/miniconda.sh && \ 37 | ~/miniconda.sh -b -p /opt/conda && \ 38 | rm ~/miniconda.sh && \ 39 | /opt/conda/bin/conda env create -q --name $ENV_NAME -f environment.yml && \ 40 | /opt/conda/bin/conda clean -ya && \ 41 | ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ 42 | echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ 43 | echo "conda activate $ENV_NAME" >> ~/.bashrc 44 | ENV PATH /opt/conda/envs/$ENV_NAME/bin:/opt/conda/bin:$PATH 45 | 46 | COPY jupyter_notebook_config.py /root/.jupyter/ 47 | 48 | # Install Azure CLI 49 | RUN echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ xenial main" | \ 50 | tee /etc/apt/sources.list.d/azure-cli.list && \ 51 | curl -L https://packages.microsoft.com/keys/microsoft.asc | apt-key add - && \ 52 | apt-get update && \ 53 | apt-get install -y --no-install-recommends \ 54 | azure-cli 55 | 56 | # Install AzCopy 57 | RUN echo "deb [arch=amd64] https://packages.microsoft.com/repos/microsoft-ubuntu-xenial-prod/ xenial main" > azure.list &&\ 58 | cp ./azure.list /etc/apt/sources.list.d/ &&\ 59 | apt-key adv --keyserver packages.microsoft.com --recv-keys B02C46DF417A0893 &&\ 60 | apt-get update &&\ 61 | apt-get install -y --no-install-recommends azcopy 62 | 63 | WORKDIR /workspace 64 | CMD /bin/bash -------------------------------------------------------------------------------- /Docker/environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - python=3.6 5 | - numpy 6 | - pyyaml 7 | - scipy 8 | - ipython 9 | - pandas 10 | - jupyter 11 | - ipykernel 12 | - scikit-learn 13 | - pillow 14 | - bokeh=0.13.0 15 | - pip: 16 | - https://github.com/theskumar/python-dotenv/archive/master.zip 17 | - docker 18 | -------------------------------------------------------------------------------- /Docker/jupyter_notebook_config.py: -------------------------------------------------------------------------------- 1 | # Configuration file for jupyter-notebook. 
2 | 3 | c.NotebookApp.ip = "0.0.0.0" 4 | c.NotebookApp.port = 9999 5 | c.NotebookApp.open_browser = False 6 | c.NotebookApp.allow_root = True 7 | -------------------------------------------------------------------------------- /HorovodKeras/00_CreateImageAndTest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": "# Create Docker Image for Keras\nIn this notebook we will create the Docker image for our Keras script to run in. We will go through the process of creating the image and testing it locally to make sure it runs before submitting it to the cluster. It is often recommended you do this rather than debugging on the cluster since debugging on a cluster can be much more difficult and time consuming.\n \n**You will need to be running everything on a GPU enabled VM to run this notebook.** " 7 | }, 8 | { 9 | "cell_type": "code", 10 | "execution_count": null, 11 | "metadata": {}, 12 | "outputs": [], 13 | "source": "import sys\nsys.path.append(\"../common\") \n\nfrom dotenv import get_key\nimport os\nfrom utils import dotenv_for\nimport docker" 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": "We will use fake data here since we don't want to have to download the data etc. Using fake data is often a good way to debug your models as well as checking what IO overhead is. Here we are setting the number of processes (NUM_PROCESSES) to 2 since the VM we are testing on has 2 GPUs. If you are running on a machine with 1 GPU set NUM_PROCESSES to 1." 
19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "tags": [ 25 | "parameters" 26 | ] 27 | }, 28 | "outputs": [], 29 | "source": "dotenv_path = dotenv_for()\nUSE_FAKE = True\nDOCKERHUB = os.getenv('DOCKER_REPOSITORY', \"masalvar\")\nNUM_PROCESSES = 2\nDOCKER_PWD = get_key(dotenv_path, 'DOCKER_PWD')" 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": "dc = docker.from_env()" 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": "image, log_iter = dc.images.build(path='Docker', \n tag='{}/caia-horovod-keras'.format(DOCKERHUB))" 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": "container_labels = {'containerName': 'kerasgpu'}\nenvironment ={\n \"DISTRIBUTED\":True,\n \"PYTHONPATH\":'/workspace/common/',\n}\n\nvolumes = {\n os.getenv('EXT_PWD'): {\n 'bind': '/workspace', \n 'mode': 'rw'\n }\n}\n\nif USE_FAKE:\n environment['FAKE'] = True\nelse:\n environment['FAKE'] = False\n volumes[os.getenv('EXT_DATA')]={'bind': '/mnt/input', 'mode': 'rw'}\n environment['AZ_BATCHAI_INPUT_TRAIN'] = '/mnt/input/train'\n environment['AZ_BATCHAI_INPUT_TEST'] = '/mnt/input/validation'" 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": "cmd=f'mpirun -np {NUM_PROCESSES} -H localhost:{NUM_PROCESSES} '\\\n 'python -u /workspace/HorovodKeras/src/imagenet_keras_horovod.py'\ncontainer = dc.containers.run(image.tags[0], \n command=cmd,\n detach=True, \n labels=container_labels,\n runtime='nvidia',\n volumes=volumes,\n environment=environment,\n shm_size='8G',\n privileged=True)" 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": "With the code below we are simply monitoring what is happening in the container. 
Feel free to stop the notebook when you are happy that everything is working." 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "tags": [ 69 | "stripout" 70 | ] 71 | }, 72 | "outputs": [], 73 | "source": "for line in container.logs(stderr=True, stream=True):\n print(line.decode(\"utf-8\"),end =\"\")" 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": "container.reload() # Refresh state\nif container.status == 'running':\n container.kill()" 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "tags": [ 87 | "stripout" 88 | ] 89 | }, 90 | "outputs": [], 91 | "source": "for line in dc.images.push(image.tags[0], \n stream=True,\n auth_config={\"username\": DOCKERHUB,\n \"password\": DOCKER_PWD}):\n print(line)" 92 | } 93 | ], 94 | "metadata": { 95 | "jupytext": { 96 | "text_representation": { 97 | "extension": ".py", 98 | "format_name": "light", 99 | "format_version": "1.3", 100 | "jupytext_version": "0.8.6" 101 | } 102 | }, 103 | "kernelspec": { 104 | "display_name": "Python 3", 105 | "language": "python", 106 | "name": "python3" 107 | } 108 | }, 109 | "nbformat": 4, 110 | "nbformat_minor": 2 111 | } 112 | -------------------------------------------------------------------------------- /HorovodKeras/01_TrainKerasModel.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": "# Train Keras Model Distributed on Batch AI\nIn this notebook we will train a Keras model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. 
This tutorial will take you through the following steps:\n * [Create Experiment](#experiment)\n * [Upload Training Scripts](#training_scripts)\n * [Submit and Monitor Job](#job)\n * [Clean Up Resources](#clean_up)" 7 | }, 8 | { 9 | "cell_type": "code", 10 | "execution_count": null, 11 | "metadata": {}, 12 | "outputs": [], 13 | "source": "import sys\nsys.path.append(\"../common\") \n\nimport json\nfrom dotenv import get_key\nimport os\nfrom utils import write_json_to_file, dotenv_for" 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": "Set the USE_FAKE to True if you want to use fake data rather than the ImageNet dataset. This is often a good way to debug your models as well as checking what IO overhead is." 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "tags": [ 25 | "parameters" 26 | ] 27 | }, 28 | "outputs": [], 29 | "source": "# Variables for Batch AI - change as necessary\ndotenv_path = dotenv_for()\nGROUP_NAME = get_key(dotenv_path, 'GROUP_NAME')\nFILE_SHARE_NAME = get_key(dotenv_path, 'FILE_SHARE_NAME')\nWORKSPACE = get_key(dotenv_path, 'WORKSPACE')\nNUM_NODES = int(get_key(dotenv_path, 'NUM_NODES'))\nCLUSTER_NAME = get_key(dotenv_path, 'CLUSTER_NAME')\nGPU_TYPE = get_key(dotenv_path, 'GPU_TYPE')\nPROCESSES_PER_NODE = int(get_key(dotenv_path, 'PROCESSES_PER_NODE'))\nSTORAGE_ACCOUNT_NAME = get_key(dotenv_path, 'STORAGE_ACCOUNT_NAME')\n\nEXPERIMENT = f\"distributed_keras_{GPU_TYPE}\"\nUSE_FAKE = False\nDOCKERHUB = os.getenv('DOCKER_REPOSITORY', \"masalvar\")" 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": "FAKE='-x FAKE=True' if USE_FAKE else ''\nTOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES" 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": "\n# Create Experiment\nNext we create our experiment." 
42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": "!az batchai experiment create -n $EXPERIMENT -g $GROUP_NAME -w $WORKSPACE" 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": "\n# Upload Training Scripts\nWe need to upload our training scripts and associated files" 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": "json_data = !az storage account keys list -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME\nstorage_account_key = json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['value']" 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "tags": [ 67 | "stripout" 68 | ] 69 | }, 70 | "outputs": [], 71 | "source": "%env AZURE_STORAGE_ACCOUNT $STORAGE_ACCOUNT_NAME\n%env AZURE_STORAGE_KEY=$storage_account_key" 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": "Upload our training scripts" 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": "!az storage file upload --share-name $FILE_SHARE_NAME --source src/imagenet_keras_horovod.py --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source src/data_generator.py --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts" 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": "Let's check our cluster we created earlier" 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": "!az batchai cluster list -w $WORKSPACE -o table" 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": "\n## Submit and Monitor Job\nBelow we specify the job we wish to execute. 
" 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": "jobs_dict = {\n \"$schema\": \"https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2017-09-01-preview/job.json\",\n \"properties\": {\n \"nodeCount\": NUM_NODES,\n \"customToolkitSettings\": {\n \"commandLine\": f\"echo $AZ_BATCH_HOST_LIST; \\\n cat $AZ_BATCHAI_MPI_HOST_FILE; \\\n mpirun -np {TOTAL_PROCESSES} --hostfile $AZ_BATCHAI_MPI_HOST_FILE \\\n -bind-to none -map-by slot \\\n -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \\\n -mca btl_tcp_if_include eth0 \\\n -x NCCL_SOCKET_IFNAME=eth0 \\\n -mca btl ^openib \\\n -x NCCL_IB_DISABLE=1 \\\n -x DISTRIBUTED=True \\\n -x AZ_BATCHAI_INPUT_TRAIN \\\n -x AZ_BATCHAI_INPUT_TEST \\\n --allow-run-as-root \\\n {FAKE} \\\n python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py\"\n },\n \"stdOutErrPathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n \"inputDirectories\": [{\n \"id\": \"SCRIPTS\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts\"\n },\n {\n \"id\": \"TRAIN\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n },\n {\n \"id\": \"TEST\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n },\n ],\n \"outputDirectories\": [{\n \"id\": \"MODEL\",\n \"pathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n \"pathSuffix\": \"Models\"\n }],\n \"containerSettings\": {\n \"imageSourceRegistry\": {\n \"image\": f\"{DOCKERHUB}/caia-horovod-keras\"\n }\n }\n }\n}" 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": "write_json_to_file(jobs_dict, 'job.json')" 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": "JOB_NAME='keras-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)" 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": "We now submit the job to Batch AI" 127 | }, 128 | { 129 
| "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "tags": [ 133 | "stripout" 134 | ] 135 | }, 136 | "outputs": [], 137 | "source": "!az batchai job create -n $JOB_NAME --cluster $CLUSTER_NAME -w $WORKSPACE -e $EXPERIMENT -f job.json" 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": "With the command below we can check the status of the job" 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": "!az batchai job list -w $WORKSPACE -e $EXPERIMENT -o table" 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": "To view the files that the job has generated use the command below" 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "tags": [ 161 | "stripout" 162 | ] 163 | }, 164 | "outputs": [], 165 | "source": "!az batchai job file list -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr" 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": "We are also able to stream the stdout and stderr that our job produces. This is great to check the progress of our job as well as debug issues." 
171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "tags": [ 177 | "stripout" 178 | ] 179 | }, 180 | "outputs": [], 181 | "source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stdout.txt" 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": { 187 | "tags": [ 188 | "stripout" 189 | ] 190 | }, 191 | "outputs": [], 192 | "source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stderr.txt" 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": "We can either wait for the job to complete or delete it with the command below." 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": "!az batchai job delete -w $WORKSPACE -e $EXPERIMENT --name $JOB_NAME -y" 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": "\n## Clean Up Resources\nNext we wish to tidy up the resource we created. \nFirst we reset the default values we set earlier." 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": "!az configure --defaults group=''\n!az configure --defaults location=''" 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": " Next we delete the cluster" 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": "!az batchai cluster delete -w $WORKSPACE --name $CLUSTER_NAME -g $GROUP_NAME -y" 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": "Once the cluster is deleted you will not incur any cost for the computation but you can still retain your experiments and workspace. 
If you wish to delete those as well execute the commands below." 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": "!az batchai experiment delete -w $WORKSPACE --name $EXPERIMENT -g $GROUP_NAME -y" 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": "!az batchai workspace delete -n $WORKSPACE -g $GROUP_NAME -y" 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": "Finally we can delete the group and we will have deleted everything created for this tutorial." 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": "!az group delete --name $GROUP_NAME -y" 260 | } 261 | ], 262 | "metadata": { 263 | "jupytext": { 264 | "text_representation": { 265 | "extension": ".py", 266 | "format_name": "light", 267 | "format_version": "1.3", 268 | "jupytext_version": "0.8.6" 269 | } 270 | }, 271 | "kernelspec": { 272 | "display_name": "Python 3", 273 | "language": "python", 274 | "name": "python3" 275 | } 276 | }, 277 | "nbformat": 4, 278 | "nbformat_minor": 2 279 | } 280 | -------------------------------------------------------------------------------- /HorovodKeras/Docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-devel-ubuntu16.04 2 | 3 | # TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully 4 | ENV PYTHON_VERSION=3.5 5 | ENV TENSORFLOW_VERSION=1.9.0 6 | ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0 7 | ENV NCCL_VERSION=2.2.13-1+cuda9.0 8 | 9 | RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list 10 | 11 | RUN apt-get update && apt-get install -y --no-install-recommends --allow-downgrades 
--allow-change-held-packages \ 12 | build-essential \ 13 | cmake \ 14 | cpio \ 15 | git \ 16 | curl \ 17 | wget \ 18 | ca-certificates \ 19 | libdapl2 \ 20 | libcudnn7=${CUDNN_VERSION} \ 21 | libnccl2=${NCCL_VERSION} \ 22 | libnccl-dev=${NCCL_VERSION} \ 23 | libjpeg-dev \ 24 | libpng-dev \ 25 | libmlx4-1 \ 26 | libsm6 \ 27 | libxext6 \ 28 | python$PYTHON_VERSION \ 29 | python$PYTHON_VERSION-dev 30 | 31 | 32 | # install intel MPI 33 | RUN cd /tmp && \ 34 | wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz' && \ 35 | tar zxvf l_mpi_2017.3.196.tgz && \ 36 | sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg && \ 37 | sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' \ 38 | /tmp/l_mpi_2017.3.196/silent.cfg && \ 39 | sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg && \ 40 | cd /tmp/l_mpi_2017.3.196 && \ 41 | ./install.sh -s silent.cfg && \ 42 | cd .. 
&& \ 43 | rm -rf l_mpi_2017.3.196* && \ 44 | echo "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" >> ~/.bashrc 45 | 46 | ENV PATH $PATH:/opt/intel/compilers_and_libraries/linux/mpi/bin64 47 | 48 | RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python 49 | 50 | RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ 51 | python get-pip.py && \ 52 | rm get-pip.py 53 | 54 | 55 | # Install TensorFlow and Keras 56 | RUN pip install --no-cache-dir tensorflow-gpu==$TENSORFLOW_VERSION h5py scipy jupyter ipykernel numpy toolz pandas \ 57 | scikit-learn keras pillow 58 | 59 | # Install Horovod, temporarily using CUDA stubs 60 | RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \ 61 | /bin/bash -c "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" && \ 62 | HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.13.2 && \ 63 | ldconfig -------------------------------------------------------------------------------- /HorovodKeras/src/data_generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import keras 3 | import logging 4 | 5 | 6 | def _get_logger(): 7 | return logging.getLogger(__name__) 8 | 9 | def _create_data(batch_size, num_batches, dim, channels, seed=42): 10 | np.random.seed(seed) 11 | return np.random.rand(batch_size * num_batches, 12 | dim[0], 13 | dim[1], 14 | channels).astype(np.float32) 15 | 16 | 17 | def _create_labels(batch_size, num_batches, n_classes): 18 | return np.random.choice(n_classes, batch_size * num_batches) 19 | 20 | 21 | 22 | class FakeDataGenerator(keras.preprocessing.image.Iterator): 23 | 24 | def __init__(self, 25 | batch_size=32, 26 | num_batches=20, 27 | dim=(224, 224), 28 | n_channels=3, 29 | n_classes=10, 30 | length=1000, 31 | shuffle=True, 32 | seed=42): 33 | 34 | 'Initialization' 35 | super(FakeDataGenerator, self).__init__(length, 36 | batch_size, 37 | shuffle, 38 | 
seed) 39 | self.dim = dim 40 | self.n_channels = n_channels 41 | self.n_classes = n_classes 42 | self.num_batches = num_batches 43 | self._data = _create_data(self.batch_size, self.num_batches, self.dim, self.n_channels) 44 | self._labels = _create_labels(self.batch_size, self.num_batches, self.n_classes) 45 | self.translation_index = np.random.choice(len(self._labels), length) 46 | 47 | 48 | def _get_batches_of_transformed_samples(self, index_array): 49 | logger = _get_logger() 50 | logger.debug('Retrieving samples') 51 | logger.debug(str(index_array)) 52 | tr_index_array = self.translation_index[index_array] 53 | return self._data[tr_index_array], keras.utils.to_categorical(self._labels[tr_index_array], num_classes=self.n_classes) -------------------------------------------------------------------------------- /HorovodKeras/src/imagenet_keras_horovod.py: -------------------------------------------------------------------------------- 1 | """ 2 | Trains ResNet50 in Keras using Horovod. 3 | 4 | It requires the following env variables 5 | AZ_BATCHAI_INPUT_TRAIN 6 | AZ_BATCHAI_INPUT_TEST 7 | AZ_BATCHAI_OUTPUT_MODEL 8 | AZ_BATCHAI_JOB_TEMP_DIR 9 | """ 10 | import logging 11 | import os 12 | import sys 13 | from functools import lru_cache 14 | from timer import Timer 15 | 16 | import keras 17 | import tensorflow as tf 18 | from data_generator import FakeDataGenerator 19 | from keras import backend as K 20 | from keras.preprocessing import image 21 | 22 | 23 | def _str_to_bool(in_str): 24 | if "t" in in_str.lower(): 25 | return True 26 | else: 27 | return False 28 | 29 | 30 | _WIDTH = 224 31 | _HEIGHT = 224 32 | _CHANNELS = 3 33 | _LR = 0.001 34 | _EPOCHS = int(os.getenv("EPOCHS", 1)) 35 | _BATCHSIZE = 64 36 | _R_MEAN = 123.68 37 | _G_MEAN = 116.78 38 | _B_MEAN = 103.94 39 | 40 | # Settings from https://arxiv.org/abs/1706.02677. 
41 | _WARMUP_EPOCHS = 5 42 | _WEIGHT_DECAY = 0.00005 43 | 44 | _NUM_WORKERS = int(os.getenv("NUM_WORKERS", 10)) 45 | _MAX_QUEUE_SIZE = int(os.getenv("MAX_QUEUE_SIZE", 10)) 46 | _MULTIPROCESSING = _str_to_bool(os.getenv("MULTIPROCESSING", "False")) 47 | _DISTRIBUTED = _str_to_bool(os.getenv("DISTRIBUTED", "False")) 48 | _FAKE = _str_to_bool(os.getenv("FAKE", "False")) 49 | _DATA_LENGTH = int( 50 | os.getenv("FAKE_DATA_LENGTH", 1281167) 51 | ) # How much fake data to simulate, default to size of imagenet dataset 52 | _VALIDATION = _str_to_bool(os.getenv("VALIDATION", "False")) 53 | 54 | 55 | if _DISTRIBUTED: 56 | import horovod.keras as hvd 57 | 58 | 59 | def _get_rank(): 60 | if _DISTRIBUTED: 61 | try: 62 | return hvd.rank() 63 | except: 64 | return 0 65 | else: 66 | return 0 67 | 68 | 69 | class HorovodAdapter(logging.LoggerAdapter): 70 | def __init__(self, logger): 71 | self._str_epoch = "" 72 | self._gpu_rank = 0 73 | super(HorovodAdapter, self).__init__(logger, {}) 74 | 75 | def set_epoch(self, epoch): 76 | self._str_epoch = "[Epoch {}]".format(epoch) 77 | 78 | def process(self, msg, kwargs): 79 | kwargs["extra"] = {"gpurank": _get_rank(), "epoch": self._str_epoch} 80 | return msg, kwargs 81 | 82 | 83 | @lru_cache() 84 | def _get_logger(): 85 | logger = logging.getLogger(__name__) 86 | logger.setLevel(logging.INFO) 87 | ch = logging.StreamHandler(stream=sys.stdout) 88 | formatter = logging.Formatter( 89 | "%(levelname)s:%(name)s:%(gpurank)d: %(epoch)s %(message)s" 90 | ) 91 | ch.setFormatter(formatter) 92 | logger.addHandler(ch) 93 | adapter = HorovodAdapter(logger) 94 | return adapter 95 | 96 | 97 | def _create_model(): 98 | logger = _get_logger() 99 | logger.info("Creating model") 100 | # Set up standard ResNet-50 model. 101 | model = keras.applications.resnet50.ResNet50(weights=None) 102 | # ResNet-50 model that is included with Keras is optimized for inference. 103 | # Add L2 weight decay & adjust BN settings. 
104 | model_config = model.get_config() 105 | for layer, layer_config in zip(model.layers, model_config["layers"]): 106 | if hasattr(layer, "kernel_regularizer"): 107 | regularizer = keras.regularizers.l2(_WEIGHT_DECAY) 108 | layer_config["config"]["kernel_regularizer"] = { 109 | "class_name": regularizer.__class__.__name__, 110 | "config": regularizer.get_config(), 111 | } 112 | if type(layer) == keras.layers.BatchNormalization: 113 | layer_config["config"]["momentum"] = 0.9 114 | layer_config["config"]["epsilon"] = 1e-5 115 | model = keras.models.Model.from_config(model_config) 116 | return model 117 | 118 | 119 | def _validation_data_iterator_from(): 120 | # Validation data iterator. 121 | 122 | test_gen = image.ImageDataGenerator( 123 | zoom_range=(0.875, 0.875), 124 | preprocessing_function=keras.applications.resnet50.preprocess_input, 125 | ) 126 | test_iter = test_gen.flow_from_directory( 127 | os.getenv("AZ_BATCHAI_INPUT_TEST"), 128 | batch_size=_BATCHSIZE, 129 | target_size=(224, 224), 130 | ) 131 | return test_iter 132 | 133 | 134 | def _training_data_iterator_from(): 135 | # Training data iterator. 136 | train_gen = image.ImageDataGenerator( 137 | width_shift_range=0.33, 138 | height_shift_range=0.33, 139 | zoom_range=0.5, 140 | horizontal_flip=True, 141 | preprocessing_function=keras.applications.resnet50.preprocess_input, 142 | ) 143 | train_iter = train_gen.flow_from_directory( 144 | os.getenv("AZ_BATCHAI_INPUT_TRAIN"), 145 | batch_size=_BATCHSIZE, 146 | target_size=(224, 224), 147 | ) 148 | return train_iter 149 | 150 | 151 | def _fake_data_iterator_from(length=_DATA_LENGTH): 152 | return FakeDataGenerator(batch_size=_BATCHSIZE, n_classes=1000, length=length) 153 | 154 | 155 | def _get_optimizer(params, is_distributed=_DISTRIBUTED): 156 | if is_distributed: 157 | # Horovod: adjust learning rate based on number of GPUs. 
158 | opt = keras.optimizers.SGD( 159 | lr=params["learning_rate"] * hvd.size(), momentum=params["momentum"] 160 | ) 161 | # Horovod: add Horovod Distributed Optimizer. 162 | return hvd.DistributedOptimizer(opt) 163 | else: 164 | return keras.optimizers.SGD( 165 | lr=params["learning_rate"], momentum=params["momentum"] 166 | ) 167 | 168 | 169 | def _get_runconfig(is_distributed=_DISTRIBUTED): 170 | if is_distributed: 171 | # Horovod: pin GPU to be used to process local rank (one GPU per process) 172 | config = tf.ConfigProto() 173 | config.gpu_options.allow_growth = True 174 | config.gpu_options.visible_device_list = str(hvd.local_rank()) 175 | else: 176 | config = tf.ConfigProto() 177 | config.gpu_options.allow_growth = True 178 | return config 179 | 180 | 181 | def _get_model_dir(is_distributed=_DISTRIBUTED): 182 | if is_distributed: 183 | # Horovod: save checkpoints only on worker 0 to prevent other workers from 184 | # corrupting them. 185 | return ( 186 | os.getenv("AZ_BATCHAI_OUTPUT_MODEL") 187 | if hvd.rank() == 0 188 | else os.getenv("AZ_BATCHAI_JOB_TEMP_DIR") 189 | ) 190 | else: 191 | return os.getenv("AZ_BATCHAI_OUTPUT_MODEL") 192 | 193 | 194 | def _get_hooks(is_distributed=_DISTRIBUTED, verbose=1): 195 | logger = _get_logger() 196 | if is_distributed: 197 | logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(), hvd.size())) 198 | return [ 199 | # Horovod: broadcast initial variable states from rank 0 to all other processes. 200 | # This is necessary to ensure consistent initialization of all workers when 201 | # training is started with random weights or restored from a checkpoint. 202 | hvd.callbacks.BroadcastGlobalVariablesCallback(0), 203 | # Horovod: average metrics among workers at the end of every epoch. 204 | # 205 | # Note: This callback must be in the list before the ReduceLROnPlateau, 206 | # TensorBoard, or other metrics-based callbacks. 
207 | hvd.callbacks.MetricAverageCallback(), 208 | # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final 209 | # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during 210 | # the first five epochs. See https://arxiv.org/abs/1706.02677 for details. 211 | hvd.callbacks.LearningRateWarmupCallback( 212 | warmup_epochs=_WARMUP_EPOCHS, verbose=verbose 213 | ), 214 | # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs. 215 | hvd.callbacks.LearningRateScheduleCallback( 216 | start_epoch=_WARMUP_EPOCHS, end_epoch=30, multiplier=1.0 217 | ), 218 | hvd.callbacks.LearningRateScheduleCallback( 219 | start_epoch=30, end_epoch=60, multiplier=1e-1 220 | ), 221 | hvd.callbacks.LearningRateScheduleCallback( 222 | start_epoch=60, end_epoch=80, multiplier=1e-2 223 | ), 224 | hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3), 225 | ] 226 | else: 227 | return [] 228 | 229 | 230 | class LoggerCallback(keras.callbacks.Callback): 231 | def __init__(self, logger, data_length): 232 | self._timer = Timer( 233 | output=logger.info, prefix="Epoch duration: ", fmt="{:.3f} seconds" 234 | ) 235 | self._data_length = data_length 236 | 237 | def on_epoch_begin(self, epoch, logs): 238 | logger = _get_logger() 239 | logger.set_epoch(epoch) 240 | self._timer.start() 241 | 242 | def on_epoch_end(self, epoch, logs): 243 | duration = self._timer.elapsed 244 | _log_summary(self._data_length, duration) 245 | 246 | 247 | def _is_master(is_distributed=_DISTRIBUTED): 248 | if is_distributed: 249 | if hvd.rank() == 0: 250 | return True 251 | else: 252 | return False 253 | else: 254 | return True 255 | 256 | 257 | def _log_summary(data_length, duration): 258 | logger = _get_logger() 259 | images_per_second = data_length / duration 260 | logger.info("Data length: {}".format(data_length)) 261 | logger.info("Total duration: {:.3f}".format(duration)) 262 | logger.info("Total images/sec: 
{:.3f}".format(images_per_second)) 263 | logger.info( 264 | "Batch size: (Per GPU {}: Total {})".format( 265 | _BATCHSIZE, hvd.size() * _BATCHSIZE if _DISTRIBUTED else _BATCHSIZE 266 | ) 267 | ) 268 | logger.info("Distributed: {}".format("True" if _DISTRIBUTED else "False")) 269 | logger.info("Num GPUs: {:.3f}".format(hvd.size() if _DISTRIBUTED else 1)) 270 | logger.info("Dataset: {}".format("Synthetic" if _FAKE else "Imagenet")) 271 | 272 | 273 | def main(): 274 | verbose = 1 275 | logger = _get_logger() 276 | if _DISTRIBUTED: 277 | # Horovod: initialize Horovod. 278 | hvd.init() 279 | logger.info("Runnin Distributed") 280 | verbose = 1 if hvd.rank() == 0 else 0 281 | 282 | logger.info("Tensorflow version {}".format(tf.__version__)) 283 | K.set_session(tf.Session(config=_get_runconfig())) 284 | 285 | # Horovod: broadcast resume_from_epoch from rank 0 (which will have 286 | # checkpoints) to other ranks. 287 | resume_from_epoch = 0 288 | if _DISTRIBUTED: 289 | resume_from_epoch = hvd.broadcast( 290 | resume_from_epoch, 0, name="resume_from_epoch" 291 | ) 292 | 293 | if _FAKE: 294 | train_iter = _fake_data_iterator_from() 295 | else: 296 | train_iter = _training_data_iterator_from() 297 | test_iter = _validation_data_iterator_from() if _VALIDATION else None 298 | 299 | model = _create_model() 300 | 301 | params = {"learning_rate": _LR, "momentum": 0.9} 302 | 303 | opt = _get_optimizer(params) 304 | model.compile( 305 | loss=keras.losses.categorical_crossentropy, 306 | optimizer=opt, 307 | metrics=["accuracy", "top_k_categorical_accuracy"], 308 | ) 309 | 310 | model_dir = _get_model_dir() 311 | checkpoint_format = os.path.join(model_dir, "checkpoint-{epoch}.h5") 312 | 313 | callbacks = _get_hooks() 314 | callbacks.append(LoggerCallback(logger, len(train_iter) * _BATCHSIZE)) 315 | 316 | # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them. 
317 | if _is_master(): 318 | callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_format)) 319 | # callbacks.append(keras.callbacks.TensorBoard(log_dir)) 320 | 321 | # Restore from a previous checkpoint, if initial_epoch is specified. 322 | # Horovod: restore on the first worker which will broadcast weights to other workers. 323 | if resume_from_epoch > 0 and _is_master(): 324 | model.load_weights(checkpoint_format.format(epoch=resume_from_epoch)) 325 | 326 | logger.info("Training...") 327 | # Train the model. The training will randomly sample 1 / N batches of training data and 328 | # 3 / N batches of validation data on every worker, where N is the number of workers. 329 | # Over-sampling of validation data helps to increase probability that every validation 330 | # example will be evaluated. 331 | num_workers = hvd.size() if _DISTRIBUTED else 1 332 | model.fit_generator( 333 | train_iter, 334 | steps_per_epoch=len(train_iter) // num_workers, 335 | callbacks=callbacks, 336 | epochs=_EPOCHS, 337 | verbose=verbose, 338 | workers=_NUM_WORKERS, 339 | max_queue_size=_MAX_QUEUE_SIZE, 340 | use_multiprocessing=_MULTIPROCESSING, 341 | initial_epoch=resume_from_epoch, 342 | ) 343 | 344 | if _FAKE is False and _VALIDATION: 345 | # Evaluate the model on the full data set. 
346 | with Timer(output=logger.info, prefix="Testing"): 347 | logger.info("Testing...") 348 | score = hvd.allreduce( 349 | model.evaluate_generator(test_iter, len(test_iter), workers=10) 350 | ) 351 | if verbose: 352 | print("Test loss:", score[0]) 353 | print("Test accuracy:", score[1]) 354 | 355 | 356 | if __name__ == "__main__": 357 | main() 358 | -------------------------------------------------------------------------------- /HorovodPytorch/00_CreateImageAndTest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": "# Create Docker Image for PyTorch\nIn this notebook we will create the Docker image for our PyTorch script to run in. We will go through the process of creating the image and testing it locally to make sure it runs before submitting it to the cluster. It is often recommended you do this rather than debugging on the cluster since debugging on a cluster can be much more difficult and time consuming.\n \n**You will need to be running everything on a GPU enabled VM to run this notebook.** " 7 | }, 8 | { 9 | "cell_type": "code", 10 | "execution_count": null, 11 | "metadata": {}, 12 | "outputs": [], 13 | "source": "import sys\nsys.path.append(\"../common\") \n\nfrom dotenv import get_key\nimport os\nfrom utils import dotenv_for\nimport docker" 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": "We will use fake data here since we don't want to have to download the data etc. Using fake data is often a good way to debug your models as well as checking what IO overhead is. Here we are setting the number of processes (NUM_PROCESSES) to 2 since the VM we are testing on has 2 GPUs. If you are running on a machine with 1 GPU set NUM_PROCESSES to 1." 
19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "tags": [ 25 | "parameters" 26 | ] 27 | }, 28 | "outputs": [], 29 | "source": "dotenv_path = dotenv_for()\nUSE_FAKE = True\nDOCKERHUB = os.getenv('DOCKER_REPOSITORY', \"masalvar\")\nNUM_PROCESSES = 2\nDOCKER_PWD = get_key(dotenv_path, 'DOCKER_PWD')" 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": "dc = docker.from_env()" 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": "image, log_iter = dc.images.build(path='Docker', \n tag='{}/caia-horovod-pytorch'.format(DOCKERHUB))" 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": "container_labels = {'containerName': 'pytorchgpu'}\nenvironment ={\n \"DISTRIBUTED\":True,\n \"PYTHONPATH\":'/workspace/common/',\n}\n\nvolumes = {\n os.getenv('EXT_PWD'): {\n 'bind': '/workspace', \n 'mode': 'rw'\n }\n}\n\nif USE_FAKE:\n environment['FAKE'] = True\nelse:\n environment['FAKE'] = False\n volumes[os.getenv('EXT_DATA')]={'bind': '/mnt/input', 'mode': 'rw'}\n environment['AZ_BATCHAI_INPUT_TRAIN'] = '/mnt/input/train'\n environment['AZ_BATCHAI_INPUT_TEST'] = '/mnt/input/validation'" 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": "cmd=f'mpirun -np {NUM_PROCESSES} -H localhost:{NUM_PROCESSES} '\\\n 'python -u /workspace/HorovodPytorch/src/imagenet_pytorch_horovod.py'\ncontainer = dc.containers.run(image.tags[0], \n command=cmd,\n detach=True, \n labels=container_labels,\n runtime='nvidia',\n volumes=volumes,\n environment=environment,\n shm_size='8G',\n privileged=True)" 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": "With the code below we are simply monitoring what is happening in the container. 
Feel free to stop the notebook when you are happy that everything is working." 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "tags": [ 69 | "stripout" 70 | ] 71 | }, 72 | "outputs": [], 73 | "source": "for line in container.logs(stderr=True, stream=True):\n print(line.decode(\"utf-8\"),end =\"\")" 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": "container.reload() # Refresh state\nif container.status == 'running':\n container.kill()" 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "tags": [ 87 | "stripout" 88 | ] 89 | }, 90 | "outputs": [], 91 | "source": "for line in dc.images.push(image.tags[0], \n stream=True,\n auth_config={\"username\": DOCKERHUB,\n \"password\": DOCKER_PWD}):\n print(line)" 92 | } 93 | ], 94 | "metadata": { 95 | "jupytext": { 96 | "text_representation": { 97 | "extension": ".py", 98 | "format_name": "light", 99 | "format_version": "1.3", 100 | "jupytext_version": "0.8.6" 101 | } 102 | }, 103 | "kernelspec": { 104 | "display_name": "Python 3", 105 | "language": "python", 106 | "name": "python3" 107 | } 108 | }, 109 | "nbformat": 4, 110 | "nbformat_minor": 2 111 | } 112 | -------------------------------------------------------------------------------- /HorovodPytorch/01_TrainPyTorchModel.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": "# Train PyTorch Model Distributed on Batch AI\nIn this notebook we will train a PyTorch model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. 
This tutorial will take you through the following steps:\n * [Create Experiment](#experiment)\n * [Upload Training Scripts](#training_scripts)\n * [Submit and Monitor Job](#job)\n * [Clean Up Resources](#clean_up)" 7 | }, 8 | { 9 | "cell_type": "code", 10 | "execution_count": null, 11 | "metadata": {}, 12 | "outputs": [], 13 | "source": "import sys\nsys.path.append(\"../common\") \n\nimport json\nfrom dotenv import get_key\nimport os\nfrom utils import write_json_to_file, dotenv_for" 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": "Set the USE_FAKE to True if you want to use fake data rather than the Imagenet dataset. This is often a good way to debug your models as well as checking what IO overhead is." 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "tags": [ 25 | "parameters" 26 | ] 27 | }, 28 | "outputs": [], 29 | "source": "# Variables for Batch AI - change as necessary\ndotenv_path = dotenv_for()\nGROUP_NAME = get_key(dotenv_path, 'GROUP_NAME')\nFILE_SHARE_NAME = get_key(dotenv_path, 'FILE_SHARE_NAME')\nWORKSPACE = get_key(dotenv_path, 'WORKSPACE')\nNUM_NODES = int(get_key(dotenv_path, 'NUM_NODES'))\nCLUSTER_NAME = get_key(dotenv_path, 'CLUSTER_NAME')\nGPU_TYPE = get_key(dotenv_path, 'GPU_TYPE')\nPROCESSES_PER_NODE = int(get_key(dotenv_path, 'PROCESSES_PER_NODE'))\nSTORAGE_ACCOUNT_NAME = get_key(dotenv_path, 'STORAGE_ACCOUNT_NAME')\n\nEXPERIMENT = f\"distributed_pytorch_{GPU_TYPE}\"\nUSE_FAKE = False\nDOCKERHUB = os.getenv('DOCKER_REPOSITORY', \"masalvar\") #\"\"" 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": "FAKE='-x FAKE=True' if USE_FAKE else ''\nTOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES" 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": "\n# Create Experiment\nNext we create our experiment." 
42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": "!az batchai experiment create -n $EXPERIMENT -g $GROUP_NAME -w $WORKSPACE" 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": "\n# Upload Training Scripts\nWe need to upload our training scripts and associated files" 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": "json_data = !az storage account keys list -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME\nstorage_account_key = json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['value']" 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "tags": [ 67 | "stripout" 68 | ] 69 | }, 70 | "outputs": [], 71 | "source": "%env AZURE_STORAGE_ACCOUNT $STORAGE_ACCOUNT_NAME\n%env AZURE_STORAGE_KEY=$storage_account_key" 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": "Upload our training scripts" 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": "!az storage file upload --share-name $FILE_SHARE_NAME --source src/imagenet_pytorch_horovod.py --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts" 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": "Let's check our cluster we created earlier" 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": "!az batchai cluster list -w $WORKSPACE -o table" 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": "\n## Submit and Monitor Job\nBelow we specify the job we wish to execute. 
" 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": "jobs_dict = {\n \"$schema\": \"https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2017-09-01-preview/job.json\",\n \"properties\": {\n \"nodeCount\": NUM_NODES,\n \"customToolkitSettings\": {\n \"commandLine\": f\"echo $AZ_BATCH_HOST_LIST; \\\n cat $AZ_BATCHAI_MPI_HOST_FILE; \\\n mpirun -np {TOTAL_PROCESSES} --hostfile $AZ_BATCHAI_MPI_HOST_FILE \\\n -bind-to none -map-by slot \\\n -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \\\n -mca btl_tcp_if_include eth0 \\\n -x NCCL_SOCKET_IFNAME=eth0 \\\n -mca btl ^openib \\\n -x NCCL_IB_DISABLE=1 \\\n -x DISTRIBUTED=True \\\n -x AZ_BATCHAI_INPUT_TRAIN \\\n -x AZ_BATCHAI_INPUT_TEST \\\n --allow-run-as-root \\\n {FAKE} \\\n python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py\"\n },\n \"stdOutErrPathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n \"inputDirectories\": [{\n \"id\": \"SCRIPTS\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts\"\n },\n {\n \"id\": \"TRAIN\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n },\n {\n \"id\": \"TEST\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n },\n ],\n \"outputDirectories\": [{\n \"id\": \"MODEL\",\n \"pathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n \"pathSuffix\": \"Models\"\n }],\n \"containerSettings\": {\n \"imageSourceRegistry\": {\n \"image\": f\"{DOCKERHUB}/caia-horovod-pytorch\"\n }\n }\n }\n}" 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": "write_json_to_file(jobs_dict, 'job.json')" 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": "JOB_NAME='pytorch-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)" 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": "We now submit the job to Batch AI" 127 | }, 128 | 
{ 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "tags": [ 133 | "stripout" 134 | ] 135 | }, 136 | "outputs": [], 137 | "source": "!az batchai job create -n $JOB_NAME --cluster $CLUSTER_NAME -w $WORKSPACE -e $EXPERIMENT -f job.json" 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": "With the command below we can check the status of the job" 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": "!az batchai job list -w $WORKSPACE -e $EXPERIMENT -o table" 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": "To view the files that the job has generated use the command below" 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "tags": [ 161 | "stripout" 162 | ] 163 | }, 164 | "outputs": [], 165 | "source": "!az batchai job file list -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr" 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": "We are also able to stream the stdout and stderr that our job produces. This is great to check the progress of our job as well as debug issues." 
171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "tags": [ 177 | "stripout" 178 | ] 179 | }, 180 | "outputs": [], 181 | "source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stdout.txt" 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": { 187 | "tags": [ 188 | "stripout" 189 | ] 190 | }, 191 | "outputs": [], 192 | "source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stderr.txt" 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": "We can either wait for the job to complete or delete it with the command below." 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": "!az batchai job delete -w $WORKSPACE -e $EXPERIMENT --name $JOB_NAME -y" 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": "\n## Clean Up Resources\nNext we wish to tidy up the resource we created. \nFirst we reset the default values we set earlier." 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": "!az configure --defaults group=''\n!az configure --defaults location=''" 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": " Next we delete the cluster" 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": "!az batchai cluster delete -w $WORKSPACE --name $CLUSTER_NAME -g $GROUP_NAME -y" 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": "Once the cluster is deleted you will not incur any cost for the computation but you can still retain your experiments and workspace. 
If you wish to delete those as well execute the commands below." 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": "!az batchai experiment delete -w $WORKSPACE --name $EXPERIMENT -g $GROUP_NAME -y" 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": "!az batchai workspace delete -n $WORKSPACE -g $GROUP_NAME -y" 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": "Finally we can delete the group and we will have deleted everything created for this tutorial." 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": "!az group delete --name $GROUP_NAME -y" 260 | } 261 | ], 262 | "metadata": { 263 | "jupytext": { 264 | "text_representation": { 265 | "extension": ".py", 266 | "format_name": "light", 267 | "format_version": "1.3", 268 | "jupytext_version": "0.8.6" 269 | } 270 | }, 271 | "kernelspec": { 272 | "display_name": "Python 3", 273 | "language": "python", 274 | "name": "python3" 275 | } 276 | }, 277 | "nbformat": 4, 278 | "nbformat_minor": 2 279 | } 280 | -------------------------------------------------------------------------------- /HorovodPytorch/Docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-devel-ubuntu16.04 2 | 3 | # TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully 4 | ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0 5 | ENV NCCL_VERSION=2.2.13-1+cuda9.0 6 | ENV PYTORCH_VERSION=0.4.0 7 | ENV PYTHON_VERSION=3.5 8 | 9 | RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list 10 | 11 | RUN apt-get update && apt-get install -y --no-install-recommends --allow-downgrades 
--allow-change-held-packages \ 12 | build-essential \ 13 | cmake \ 14 | git \ 15 | curl \ 16 | vim \ 17 | wget \ 18 | ca-certificates \ 19 | libcudnn7=${CUDNN_VERSION} \ 20 | libnccl2=${NCCL_VERSION} \ 21 | libnccl-dev=${NCCL_VERSION} \ 22 | libjpeg-dev \ 23 | libpng-dev \ 24 | python${PYTHON_VERSION} \ 25 | python${PYTHON_VERSION}-dev 26 | 27 | RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python 28 | 29 | RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ 30 | python get-pip.py && \ 31 | rm get-pip.py 32 | 33 | 34 | # Install PyTorch 35 | RUN pip install http://download.pytorch.org/whl/cu90/torch-${PYTORCH_VERSION}-cp35-cp35m-linux_x86_64.whl && \ 36 | pip install --no-cache-dir torchvision h5py scipy jupyter ipykernel numpy toolz pandas scikit-learn pillow 37 | 38 | # Install Open MPI 39 | RUN mkdir /tmp/openmpi && \ 40 | cd /tmp/openmpi && \ 41 | wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \ 42 | tar zxf openmpi-3.0.0.tar.gz && \ 43 | cd openmpi-3.0.0 && \ 44 | ./configure --enable-orterun-prefix-by-default && \ 45 | make -j $(nproc) all && \ 46 | make install && \ 47 | ldconfig && \ 48 | rm -rf /tmp/openmpi 49 | 50 | # Install Horovod, temporarily using CUDA stubs 51 | RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \ 52 | HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod==0.13.2 && \ 53 | ldconfig 54 | 55 | # Create a wrapper for OpenMPI to allow running as root by default 56 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 57 | echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 58 | echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 59 | chmod a+x /usr/local/bin/mpirun 60 | 61 | # Configure OpenMPI to run good defaults: 62 | # --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 63 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 64 | echo "rmaps_base_mapping_policy = 
slot" >> /usr/local/etc/openmpi-mca-params.conf 65 | 66 | # Set default NCCL parameters 67 | RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf && \ 68 | echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf 69 | 70 | # Install OpenSSH for MPI to communicate between containers 71 | RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \ 72 | mkdir -p /var/run/sshd 73 | 74 | # Allow OpenSSH to talk to containers without asking for confirmation 75 | RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ 76 | echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ 77 | mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 78 | 79 | WORKDIR "/examples" -------------------------------------------------------------------------------- /HorovodPytorch/cluster_config/cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "properties": { 3 | "nodeSetup": { 4 | "setupTask": { 5 | "commandLine": "$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts/nodeprep.sh", 6 | "runElevated": "True", 7 | "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs" 8 | } 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /HorovodPytorch/cluster_config/docker.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Docker Application Container Engine 3 | Documentation=https://docs.docker.com 4 | After=network-online.target docker.socket firewalld.service 5 | Wants=network-online.target 6 | Requires=docker.socket 7 | 8 | [Service] 9 | EnvironmentFile=/etc/default/docker 10 | Type=notify 11 | # the default is not to use systemd for cgroups because the delegate issues still 12 | # exists and systemd currently does not support the cgroup feature set required 13 | # for containers run by docker 14 | ExecStart=/usr/bin/dockerd --default-shm-size 8G -g /mnt/docker/ -H fd:// 15 | ExecReload=/bin/kill -s HUP 
$MAINPID 16 | LimitNOFILE=1048576 17 | # Having non-zero Limit*s causes performance problems due to accounting overhead 18 | # in the kernel. We recommend using cgroups to do container-local accounting. 19 | LimitNPROC=infinity 20 | LimitCORE=infinity 21 | # Uncomment TasksMax if your systemd version supports it. 22 | # Only systemd 226 and above support this version. 23 | TasksMax=infinity 24 | TimeoutStartSec=0 25 | # set delegate yes so that systemd does not reset the cgroups of docker containers 26 | Delegate=yes 27 | # kill only the docker process, not all processes in the cgroup 28 | KillMode=process 29 | # restart the docker process if it exits prematurely 30 | Restart=on-failure 31 | StartLimitBurst=3 32 | StartLimitInterval=60s 33 | 34 | [Install] 35 | WantedBy=multi-user.target 36 | -------------------------------------------------------------------------------- /HorovodPytorch/cluster_config/nodeprep.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | sudo cp $AZ_BATCHAI_MOUNT_ROOT/extfs/scripts/docker.service /lib/systemd/system 3 | sudo systemctl daemon-reload 4 | sudo systemctl restart docker 5 | -------------------------------------------------------------------------------- /HorovodPytorch/src/imagenet_pytorch_horovod.py: -------------------------------------------------------------------------------- 1 | """ 2 | Trains ResNet50 in PyTorch using Horovod.
3 | 4 | It requires the following env variables 5 | AZ_BATCHAI_INPUT_TRAIN 6 | AZ_BATCHAI_INPUT_TEST 7 | AZ_BATCHAI_OUTPUT_MODEL 8 | AZ_BATCHAI_JOB_TEMP_DIR 9 | """ 10 | import logging 11 | import os 12 | import sys 13 | from functools import lru_cache 14 | from os import path 15 | from timer import Timer 16 | 17 | import numpy as np 18 | import pandas as pd 19 | import torch.backends.cudnn as cudnn 20 | import torch.nn.functional as F 21 | import torch.optim as optim 22 | import torch.utils.data.distributed 23 | import torchvision.models as models 24 | from torch.utils.data import Dataset 25 | from torchvision import transforms, datasets 26 | 27 | 28 | def _str_to_bool(in_str): 29 | if "t" in in_str.lower(): 30 | return True 31 | else: 32 | return False 33 | 34 | 35 | _WIDTH = 224 36 | _HEIGHT = 224 37 | _CHANNELS = 3 38 | _LR = 0.001 39 | _EPOCHS = int(os.getenv("EPOCHS", 1)) 40 | _BATCHSIZE = 64 41 | _RGB_MEAN = [0.485, 0.456, 0.406] 42 | _RGB_SD = [0.229, 0.224, 0.225] 43 | _SEED = 42 44 | 45 | # Settings from https://arxiv.org/abs/1706.02677.
46 | _WARMUP_EPOCHS = 5 47 | _WEIGHT_DECAY = 0.00005 48 | 49 | _FAKE = _str_to_bool(os.getenv("FAKE", "False")) 50 | _DATA_LENGTH = int( 51 | os.getenv("FAKE_DATA_LENGTH", 1281167) 52 | ) # How much fake data to simulate, default to size of imagenet dataset 53 | _DISTRIBUTED = _str_to_bool(os.getenv("DISTRIBUTED", "False")) 54 | 55 | if _DISTRIBUTED: 56 | import horovod.torch as hvd 57 | 58 | 59 | def _get_rank(): 60 | if _DISTRIBUTED: 61 | try: 62 | return hvd.rank() 63 | except: 64 | return 0 65 | else: 66 | return 0 67 | 68 | 69 | 70 | class HorovodAdapter(logging.LoggerAdapter): 71 | def __init__(self, logger): 72 | self._str_epoch = "" 73 | self._gpu_rank = 0 74 | super(HorovodAdapter, self).__init__(logger, {}) 75 | 76 | def set_epoch(self, epoch): 77 | self._str_epoch = "[Epoch {}]".format(epoch) 78 | 79 | def process(self, msg, kwargs): 80 | kwargs["extra"] = {"gpurank": _get_rank(), "epoch": self._str_epoch} 81 | return msg, kwargs 82 | 83 | 84 | @lru_cache() 85 | def _get_logger(): 86 | logger = logging.getLogger(__name__) 87 | logger.setLevel(logging.INFO) 88 | ch = logging.StreamHandler(stream=sys.stdout) 89 | formatter = logging.Formatter( 90 | "%(levelname)s:%(name)s:%(gpurank)d: %(epoch)s %(message)s" 91 | ) 92 | ch.setFormatter(formatter) 93 | logger.addHandler(ch) 94 | adapter = HorovodAdapter(logger) 95 | return adapter 96 | 97 | 98 | def _append_path_to(data_path, data_series): 99 | return data_series.apply(lambda x: path.join(data_path, x)) 100 | 101 | 102 | def _load_training(data_dir): 103 | logger = _get_logger() 104 | logger.info("Reading training data from {}".format(data_dir)) 105 | train_df = pd.read_csv(path.join(data_dir, "train.csv")) 106 | return train_df.assign( 107 | filenames=_append_path_to(path.join(data_dir, "train"), train_df.filenames) 108 | ) 109 | 110 | 111 | def _load_validation(data_dir): 112 | logger = _get_logger() 113 | logger.info("Reading validation data from {}".format(data_dir)) 114 | train_df = 
pd.read_csv(path.join(data_dir, "validation.csv")) 115 | return train_df.assign( 116 | filenames=_append_path_to(path.join(data_dir, "validation"), train_df.filenames) 117 | ) 118 | 119 | 120 | def _create_data_fn(train_path, test_path): 121 | train_df = _load_training(train_path) 122 | validation_df = _load_validation(test_path) 123 | # File-path 124 | train_X = train_df["filenames"].values 125 | validation_X = validation_df["filenames"].values 126 | # One-hot encoded labels for torch 127 | train_labels = train_df[["num_id"]].values.ravel() 128 | validation_labels = validation_df[["num_id"]].values.ravel() 129 | # Index starts from 0 130 | train_labels -= 1 131 | validation_labels -= 1 132 | return train_X, train_labels, validation_X, validation_labels 133 | 134 | 135 | def _create_data(batch_size, num_batches, dim, channels, seed=42): 136 | np.random.seed(seed) 137 | return np.random.rand(batch_size * num_batches, channels, dim[0], dim[1]).astype( 138 | np.float32 139 | ) 140 | 141 | 142 | def _create_labels(batch_size, num_batches, n_classes): 143 | return np.random.choice(n_classes, batch_size * num_batches) 144 | 145 | 146 | class FakeData(Dataset): 147 | def __init__( 148 | self, 149 | batch_size=32, 150 | num_batches=20, 151 | dim=(224, 224), 152 | n_channels=3, 153 | n_classes=10, 154 | length=_DATA_LENGTH, 155 | seed=42, 156 | data_transform=None, 157 | ): 158 | self.dim = dim 159 | self.n_channels = n_channels 160 | self.n_classes = n_classes 161 | self.num_batches = num_batches 162 | self._data = _create_data( 163 | batch_size, self.num_batches, self.dim, self.n_channels 164 | ) 165 | self._labels = _create_labels(batch_size, self.num_batches, self.n_classes) 166 | self.translation_index = np.random.choice(len(self._labels), length) 167 | self._length = length 168 | 169 | self._data_transform = data_transform 170 | logger = _get_logger() 171 | logger.info( 172 | "Creating fake data {} labels and {} images".format( 173 | n_classes, len(self._data) 174 | ) 
175 | ) 176 | 177 | def __getitem__(self, idx): 178 | logger = _get_logger() 179 | logger.debug("Retrieving samples") 180 | logger.debug(str(idx)) 181 | tr_index_array = self.translation_index[idx] 182 | 183 | if self._data_transform is not None: 184 | data = self._data_transform(self._data[tr_index_array]) 185 | else: 186 | data = self._data[tr_index_array] 187 | 188 | return data, self._labels[tr_index_array] 189 | 190 | def __len__(self): 191 | return self._length 192 | 193 | 194 | def _is_master(is_distributed=_DISTRIBUTED): 195 | if is_distributed: 196 | if hvd.rank() == 0: 197 | return True 198 | else: 199 | return False 200 | else: 201 | return True 202 | 203 | 204 | def train(train_loader, model, criterion, optimizer, epoch): 205 | logger = _get_logger() 206 | msg = " duration({}) loss:{} total-samples: {}" 207 | t = Timer() 208 | t.start() 209 | logger.set_epoch(epoch) 210 | for i, (data, target) in enumerate(train_loader): 211 | data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True) 212 | optimizer.zero_grad() 213 | # compute output 214 | output = model(data) 215 | loss = criterion(output, target) 216 | # compute gradient and do SGD step 217 | loss.backward() 218 | optimizer.step() 219 | if i % 100 == 0: 220 | logger.info(msg.format(t.elapsed, loss.item(), i * len(data))) 221 | t.start() 222 | 223 | 224 | def validate(train_loader, model, criterion): 225 | logger = _get_logger() 226 | msg = "validation duration({}) loss:{} total-samples: {}" 227 | t = Timer() 228 | t.start() 229 | model.eval() 230 | with torch.no_grad(): 231 | for i, (data, target) in enumerate(train_loader): 232 | data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True) 233 | # compute output 234 | output = model(data) 235 | loss = criterion(output, target) 236 | # compute gradient and do SGD step 237 | if i % 100 == 0: 238 | logger.info(msg.format(t.elapsed, loss.item(), i * len(data))) 239 | t.start() 240 | 241 | 242 | def 
_log_summary(data_length, duration): 243 | logger = _get_logger() 244 | images_per_second = data_length / duration 245 | logger.info("Data length: {}".format(data_length)) 246 | logger.info("Total duration: {:.3f}".format(duration)) 247 | logger.info("Total images/sec: {:.3f}".format(images_per_second)) 248 | logger.info( 249 | "Batch size: (Per GPU {}: Total {})".format( 250 | _BATCHSIZE, hvd.size() * _BATCHSIZE if _DISTRIBUTED else _BATCHSIZE 251 | ) 252 | ) 253 | logger.info("Distributed: {}".format("True" if _DISTRIBUTED else "False")) 254 | logger.info("Num GPUs: {:.3f}".format(hvd.size() if _DISTRIBUTED else 1)) 255 | logger.info("Dataset: {}".format("Synthetic" if _FAKE else "Imagenet")) 256 | 257 | 258 | def _get_sampler(dataset, is_distributed=_DISTRIBUTED): 259 | if is_distributed: 260 | return torch.utils.data.distributed.DistributedSampler( 261 | dataset, num_replicas=hvd.size(), rank=hvd.rank() 262 | ) 263 | else: 264 | return torch.utils.data.sampler.RandomSampler(dataset) 265 | 266 | 267 | def main(): 268 | logger = _get_logger() 269 | if _DISTRIBUTED: 270 | # Horovod: initialize Horovod. 271 | 272 | hvd.init() 273 | logger.info("Running Distributed") 274 | torch.manual_seed(_SEED) 275 | # Horovod: pin GPU to local rank.
276 | torch.cuda.set_device(hvd.local_rank()) 277 | torch.cuda.manual_seed(_SEED) 278 | 279 | logger.info("PyTorch version {}".format(torch.__version__)) 280 | 281 | if _FAKE: 282 | logger.info("Setting up fake loaders") 283 | train_dataset = FakeData(n_classes=1000, data_transform=torch.FloatTensor) 284 | else: 285 | normalize = transforms.Normalize(_RGB_MEAN, _RGB_SD) 286 | logger.info("Setting up loaders") 287 | train_dataset = datasets.ImageFolder( 288 | os.getenv("AZ_BATCHAI_INPUT_TRAIN"), 289 | transforms.Compose( 290 | [ 291 | transforms.RandomResizedCrop(_WIDTH), 292 | transforms.RandomHorizontalFlip(), 293 | transforms.ToTensor(), 294 | normalize, 295 | ] 296 | ), 297 | ) 298 | 299 | validation_dataset = datasets.ImageFolder( 300 | os.getenv("AZ_BATCHAI_INPUT_TEST"), 301 | transforms.Compose( 302 | [ 303 | transforms.Resize(256), 304 | transforms.CenterCrop(224), 305 | transforms.ToTensor(), 306 | normalize, 307 | ] 308 | ), 309 | ) 310 | 311 | train_sampler = _get_sampler(train_dataset) 312 | 313 | kwargs = {"num_workers": 5, "pin_memory": True} 314 | train_loader = torch.utils.data.DataLoader( 315 | train_dataset, batch_size=_BATCHSIZE, sampler=train_sampler, **kwargs 316 | ) 317 | 318 | # Autotune 319 | cudnn.benchmark = True 320 | 321 | logger.info("Loading model") 322 | # Load symbol 323 | model = models.__dict__["resnet50"](pretrained=False) 324 | 325 | model.cuda() 326 | 327 | if _DISTRIBUTED: 328 | # Horovod: broadcast parameters. 329 | hvd.broadcast_parameters(model.state_dict(), root_rank=0) 330 | 331 | num_gpus = hvd.size() if _DISTRIBUTED else 1 332 | # Horovod: scale learning rate by the number of GPUs. 333 | optimizer = optim.SGD(model.parameters(), lr=_LR * num_gpus, momentum=0.9) 334 | if _DISTRIBUTED: 335 | # Horovod: wrap optimizer with DistributedOptimizer.
336 | optimizer = hvd.DistributedOptimizer( 337 | optimizer, named_parameters=model.named_parameters() 338 | ) 339 | 340 | criterion = F.cross_entropy 341 | 342 | if not _FAKE: 343 | val_sampler = _get_sampler(validation_dataset) 344 | val_loader = torch.utils.data.DataLoader( 345 | validation_dataset, batch_size=_BATCHSIZE, sampler=val_sampler, **kwargs 346 | ) 347 | 348 | # Main training-loop 349 | logger.info("Training ...") 350 | for epoch in range(_EPOCHS): 351 | with Timer(output=logger.info, prefix="Training") as t: 352 | model.train() 353 | if _DISTRIBUTED: 354 | train_sampler.set_epoch(epoch) 355 | train(train_loader, model, criterion, optimizer, epoch) 356 | _log_summary(len(train_dataset), t.elapsed) 357 | 358 | if not _FAKE: 359 | validate(val_loader, model, criterion) 360 | 361 | 362 | if __name__ == "__main__": 363 | main() 364 | -------------------------------------------------------------------------------- /HorovodTF/00_CreateImageAndTest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": "# Create Docker Image for TensorFlow\nIn this notebook we will create the Docker image for our TensorFlow script to run in. We will go through the process of creating the image and testing it locally to make sure it runs before submitting it to the cluster. 
It is often recommended you do this rather than debugging on the cluster since debugging on a cluster can be much more difficult and time consuming.\n \n**You will need to be running everything on a GPU enabled VM to run this notebook.** " 7 | }, 8 | { 9 | "cell_type": "code", 10 | "execution_count": null, 11 | "metadata": {}, 12 | "outputs": [], 13 | "source": "import sys\nsys.path.append(\"../common\") \n\nfrom dotenv import get_key\nimport os\nfrom utils import dotenv_for\nimport docker" 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": "We will use fake data here since we don't want to have to download the data etc. Using fake data is often a good way to debug your models as well as checking what IO overhead is. Here we are setting the number of processes (NUM_PROCESSES) to 2 since the VM we are testing on has 2 GPUs. If you are running on a machine with 1 GPU set NUM_PROCESSES to 1." 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "tags": [ 25 | "parameters" 26 | ] 27 | }, 28 | "outputs": [], 29 | "source": "dotenv_path = dotenv_for()\nUSE_FAKE = True\nDOCKERHUB = os.getenv('DOCKER_REPOSITORY', \"masalvar\")\nNUM_PROCESSES = 2\nDOCKER_PWD = get_key(dotenv_path, 'DOCKER_PWD')" 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": "dc = docker.from_env()" 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": "image, log_iter = dc.images.build(path='Docker', \n tag='{}/caia-horovod-tensorflow'.format(DOCKERHUB))" 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": "container_labels = {'containerName': 'tensorflowgpu'}\nenvironment ={\n \"DISTRIBUTED\":True,\n \"PYTHONPATH\":'/workspace/common/',\n}\n\nvolumes = {\n os.getenv('EXT_PWD'): {\n 'bind': '/workspace', \n 
'mode': 'rw'\n }\n}\n\nif USE_FAKE:\n environment['FAKE'] = True\nelse:\n environment['FAKE'] = False\n volumes[os.getenv('EXT_DATA')]={'bind': '/mnt/input', 'mode': 'rw'}\n environment['AZ_BATCHAI_INPUT_TRAIN'] = '/mnt/input/train'\n environment['AZ_BATCHAI_INPUT_TEST'] = '/mnt/input/validation'" 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": "cmd=f'mpirun -np {NUM_PROCESSES} -H localhost:{NUM_PROCESSES} '\\\n 'python -u /workspace/HorovodTF/src/imagenet_estimator_tf_horovod.py'\ncontainer = dc.containers.run(image.tags[0], \n command=cmd,\n detach=True, \n labels=container_labels,\n runtime='nvidia',\n volumes=volumes,\n environment=environment,\n shm_size='8G',\n privileged=True)" 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": "With the code below we are simply monitoring what is happening in the container. Feel free to stop the notebook when you are happy that everything is working." 
63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "tags": [ 69 | "stripout" 70 | ] 71 | }, 72 | "outputs": [], 73 | "source": "for line in container.logs(stderr=True, stream=True):\n print(line.decode(\"utf-8\"),end =\"\")" 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": "container.reload() # Refresh state\nif container.status == 'running':\n container.kill()" 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "tags": [ 87 | "stripout" 88 | ] 89 | }, 90 | "outputs": [], 91 | "source": "for line in dc.images.push(image.tags[0], \n stream=True,\n auth_config={\"username\": DOCKERHUB,\n \"password\": DOCKER_PWD}):\n print(line)" 92 | } 93 | ], 94 | "metadata": { 95 | "jupytext": { 96 | "text_representation": { 97 | "extension": ".py", 98 | "format_name": "light", 99 | "format_version": "1.3", 100 | "jupytext_version": "0.8.6" 101 | } 102 | }, 103 | "kernelspec": { 104 | "display_name": "Python 3", 105 | "language": "python", 106 | "name": "python3" 107 | } 108 | }, 109 | "nbformat": 4, 110 | "nbformat_minor": 2 111 | } 112 | -------------------------------------------------------------------------------- /HorovodTF/01_TrainTensorflowModel.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": "# Train TensorFlow Model Distributed on Batch AI\nIn this notebook we will train a TensorFlow model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset.
This tutorial will take you through the following steps:\n * [Create Experiment](#experiment)\n * [Upload Training Scripts](#training_scripts)\n * [Submit and Monitor Job](#job)\n * [Clean Up Resources](#clean_up)" 7 | }, 8 | { 9 | "cell_type": "code", 10 | "execution_count": null, 11 | "metadata": {}, 12 | "outputs": [], 13 | "source": "import sys\nsys.path.append(\"../common\") \n\nimport json\nfrom dotenv import get_key\nimport os\nfrom utils import write_json_to_file, dotenv_for" 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": "Set the USE_FAKE to True if you want to use fake data rather than the ImageNet dataset. This is often a good way to debug your models as well as checking what IO overhead is." 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "tags": [ 25 | "parameters" 26 | ] 27 | }, 28 | "outputs": [], 29 | "source": "# Variables for Batch AI - change as necessary\ndotenv_path = dotenv_for()\nGROUP_NAME = get_key(dotenv_path, 'GROUP_NAME')\nFILE_SHARE_NAME = get_key(dotenv_path, 'FILE_SHARE_NAME')\nWORKSPACE = get_key(dotenv_path, 'WORKSPACE')\nNUM_NODES = int(get_key(dotenv_path, 'NUM_NODES'))\nCLUSTER_NAME = get_key(dotenv_path, 'CLUSTER_NAME')\nGPU_TYPE = get_key(dotenv_path, 'GPU_TYPE')\nPROCESSES_PER_NODE = int(get_key(dotenv_path, 'PROCESSES_PER_NODE'))\nSTORAGE_ACCOUNT_NAME = get_key(dotenv_path, 'STORAGE_ACCOUNT_NAME')\n\nEXPERIMENT = f\"distributed_tensorflow_{GPU_TYPE}\"\nUSE_FAKE = False\nDOCKERHUB = os.getenv('DOCKER_REPOSITORY', \"masalvar\")" 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": "FAKE='-x FAKE=True' if USE_FAKE else ''\nTOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES" 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": "\n# Create Experiment\nNext we create our experiment." 
42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": "!az batchai experiment create -n $EXPERIMENT -g $GROUP_NAME -w $WORKSPACE" 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": "\n# Upload Training Scripts\nWe need to upload our training scripts and associated files" 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": "json_data = !az storage account keys list -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME\nstorage_account_key = json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['value']" 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "tags": [ 67 | "stripout" 68 | ] 69 | }, 70 | "outputs": [], 71 | "source": "%env AZURE_STORAGE_ACCOUNT $STORAGE_ACCOUNT_NAME\n%env AZURE_STORAGE_KEY=$storage_account_key" 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": "Upload our training scripts" 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": "!az storage file upload --share-name $FILE_SHARE_NAME --source src/imagenet_estimator_tf_horovod.py --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source src/resnet_model.py --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts" 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": "Let's check our cluster we created earlier" 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": "!az batchai cluster list -w $WORKSPACE -o table" 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": "\n## Submit and Monitor Job\nBelow we specify the job we wish to execute. 
" 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": "jobs_dict = {\n \"$schema\": \"https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2017-09-01-preview/job.json\",\n \"properties\": {\n \"nodeCount\": NUM_NODES,\n \"customToolkitSettings\": {\n \"commandLine\": f\"echo $AZ_BATCH_HOST_LIST; \\\n cat $AZ_BATCHAI_MPI_HOST_FILE; \\\n mpirun -np {TOTAL_PROCESSES} --hostfile $AZ_BATCHAI_MPI_HOST_FILE \\\n -bind-to none -map-by slot \\\n -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \\\n -mca btl_tcp_if_include eth0 \\\n -x NCCL_SOCKET_IFNAME=eth0 \\\n -mca btl ^openib \\\n -x NCCL_IB_DISABLE=1 \\\n -x DISTRIBUTED=True \\\n -x AZ_BATCHAI_INPUT_TRAIN \\\n -x AZ_BATCHAI_INPUT_TEST \\\n --allow-run-as-root \\\n {FAKE} \\\n python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py\"\n },\n \"stdOutErrPathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n \"inputDirectories\": [{\n \"id\": \"SCRIPTS\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts\"\n },\n {\n \"id\": \"TRAIN\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n },\n {\n \"id\": \"TEST\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n },\n ],\n \"outputDirectories\": [{\n \"id\": \"MODEL\",\n \"pathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n \"pathSuffix\": \"Models\"\n }],\n \"containerSettings\": {\n \"imageSourceRegistry\": {\n \"image\": f\"{DOCKERHUB}/caia-horovod-tensorflow\"\n }\n }\n }\n}" 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": "write_json_to_file(jobs_dict, 'job.json')" 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": "JOB_NAME='tensorflow-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)" 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": "We now submit the job to Batch AI" 127 
| }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "tags": [ 133 | "stripout" 134 | ] 135 | }, 136 | "outputs": [], 137 | "source": "!az batchai job create -n $JOB_NAME --cluster $CLUSTER_NAME -w $WORKSPACE -e $EXPERIMENT -f job.json" 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": "With the command below we can check the status of the job" 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": "!az batchai job list -w $WORKSPACE -e $EXPERIMENT -o table" 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": "To view the files that the job has generated use the command below" 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "tags": [ 161 | "stripout" 162 | ] 163 | }, 164 | "outputs": [], 165 | "source": "!az batchai job file list -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr" 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": "We are also able to stream the stdout and stderr that our job produces. This is great to check the progress of our job as well as debug issues." 
171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "tags": [ 177 | "stripout" 178 | ] 179 | }, 180 | "outputs": [], 181 | "source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stdout.txt" 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": { 187 | "tags": [ 188 | "stripout" 189 | ] 190 | }, 191 | "outputs": [], 192 | "source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stderr.txt" 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": "We can either wait for the job to complete or delete it with the command below." 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": "!az batchai job delete -w $WORKSPACE -e $EXPERIMENT --name $JOB_NAME -y" 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": "\n## Clean Up Resources\nNext we wish to tidy up the resource we created. \nFirst we reset the default values we set earlier." 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": "!az configure --defaults group=''\n!az configure --defaults location=''" 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": " Next we delete the cluster" 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": "!az batchai cluster delete -w $WORKSPACE --name $CLUSTER_NAME -g $GROUP_NAME -y" 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": "Once the cluster is deleted you will not incur any cost for the computation but you can still retain your experiments and workspace. 
If you wish to delete those as well execute the commands below." 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": "!az batchai experiment delete -w $WORKSPACE --name $EXPERIMENT -g $GROUP_NAME -y" 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": "!az batchai workspace delete -n $WORKSPACE -g $GROUP_NAME -y" 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": "Finally we can delete the group and we will have deleted everything created for this tutorial." 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": "!az group delete --name $GROUP_NAME -y" 260 | } 261 | ], 262 | "metadata": { 263 | "jupytext": { 264 | "text_representation": { 265 | "extension": ".py", 266 | "format_name": "light", 267 | "format_version": "1.3", 268 | "jupytext_version": "0.8.6" 269 | } 270 | }, 271 | "kernelspec": { 272 | "display_name": "Python 3", 273 | "language": "python", 274 | "name": "python3" 275 | } 276 | }, 277 | "nbformat": 4, 278 | "nbformat_minor": 2 279 | } 280 | -------------------------------------------------------------------------------- /HorovodTF/Docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-devel-ubuntu16.04 2 | 3 | # TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully 4 | ENV PYTHON_VERSION=3.5 5 | ENV TENSORFLOW_VERSION=1.9.0 6 | ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0 7 | 8 | RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list 9 | 10 | RUN apt-get update && apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \ 11 | build-essential \ 12 | cmake \ 
13 | cpio \ 14 | git \ 15 | curl \ 16 | wget \ 17 | ca-certificates \ 18 | libdapl2 \ 19 | libcudnn7=$CUDNN_VERSION \ 20 | libjpeg-dev \ 21 | libpng-dev \ 22 | libmlx4-1 \ 23 | libsm6 \ 24 | libxext6 \ 25 | python$PYTHON_VERSION \ 26 | python$PYTHON_VERSION-dev 27 | 28 | 29 | # install intel MPI 30 | RUN cd /tmp && \ 31 | wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz' && \ 32 | tar zxvf l_mpi_2017.3.196.tgz && \ 33 | sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg && \ 34 | sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' \ 35 | /tmp/l_mpi_2017.3.196/silent.cfg && \ 36 | sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg && \ 37 | cd /tmp/l_mpi_2017.3.196 && \ 38 | ./install.sh -s silent.cfg && \ 39 | cd .. && \ 40 | rm -rf l_mpi_2017.3.196* && \ 41 | echo "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" >> ~/.bashrc 42 | 43 | ENV PATH $PATH:/opt/intel/compilers_and_libraries/linux/mpi/bin64 44 | 45 | RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python 46 | 47 | RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ 48 | python get-pip.py && \ 49 | rm get-pip.py 50 | 51 | # Install TensorFlow 52 | RUN pip install --no-cache-dir tensorflow-gpu==$TENSORFLOW_VERSION h5py scipy jupyter ipykernel numpy toolz pandas \ 53 | scikit-learn 54 | 55 | # Install Horovod, temporarily using CUDA stubs 56 | RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \ 57 | /bin/bash -c "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" && \ 58 | HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.13.2 && \ 59 | ldconfig -------------------------------------------------------------------------------- /HorovodTF/src/imagenet_estimator_tf_horovod.py: 
"""
Trains ResNet50 using Horovod.

It requires the following env variables
AZ_BATCHAI_INPUT_TRAIN
AZ_BATCHAI_INPUT_TEST
AZ_BATCHAI_OUTPUT_MODEL
AZ_BATCHAI_JOB_TEMP_DIR
"""
import glob
import logging
import os
import sys
from functools import lru_cache
from os import path
from pathlib import Path
from timer import Timer

import numpy as np
import tensorflow as tf
from resnet_model import resnet_v1
from toolz import pipe

_WIDTH = 224
_HEIGHT = 224
_CHANNELS = 3
_LR = 0.001
# BUG FIX: os.getenv returns a *str* when the variable is set; without the
# int() cast, `_EPOCHS * train_input_fn.length` in main() would perform
# string repetition instead of arithmetic.
_EPOCHS = int(os.getenv("EPOCHS", 1))
_BATCHSIZE = 64
# Per-channel ImageNet means used for centring (RGB order).
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
_BUFFER = 256  # Prefetch depth for the tf.data pipelines.


def _str_to_bool(in_str):
    """Loosely parse a string as a boolean: any string containing 't'/'T'
    (e.g. 'True', 'true', 't') is True, everything else is False."""
    if "t" in in_str.lower():
        return True
    else:
        return False


_DISTRIBUTED = _str_to_bool(os.getenv("DISTRIBUTED", "False"))
_FAKE = _str_to_bool(os.getenv("FAKE", "False"))
_DATA_LENGTH = int(
    os.getenv("FAKE_DATA_LENGTH", 1281167)
)  # How much fake data to simulate, default to size of imagenet dataset
_VALIDATION = _str_to_bool(os.getenv("VALIDATION", "False"))

if _DISTRIBUTED:
    import horovod.tensorflow as hvd


tf_logger = logging.getLogger("tensorflow")
tf_logger.setLevel(logging.INFO)
stout = logging.StreamHandler(stream=sys.stdout)
tf_logger.addHandler(stout)


def _get_rank():
    """Return the Horovod rank of this process, or 0 when not distributed
    (or when Horovod has not been initialised yet)."""
    if _DISTRIBUTED:
        try:
            return hvd.rank()
        # hvd.rank() raises if hvd.init() has not been called yet; fall back
        # to rank 0 rather than crashing logging. (Was a bare `except:`.)
        except Exception:
            return 0
    else:
        return 0


class HorovodAdapter(logging.LoggerAdapter):
    """LoggerAdapter that stamps every record with the GPU (Horovod) rank
    and the current epoch tag, for use with the formatter in _get_logger."""

    def __init__(self, logger):
        self._str_epoch = ""
        self._gpu_rank = 0
        super(HorovodAdapter, self).__init__(logger, {})

    def set_epoch(self, epoch):
        """Set the epoch tag included in subsequent log lines."""
        self._str_epoch = "[Epoch {}]".format(epoch)

    def process(self, msg, kwargs):
        kwargs["extra"] = {"gpurank": _get_rank(), "epoch": self._str_epoch}
        return msg, kwargs


@lru_cache()
def _get_logger():
    """Create (once) and return the rank-aware logger adapter."""
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    ch = logging.StreamHandler(stream=sys.stdout)
    formatter = logging.Formatter(
        "%(levelname)s:%(name)s:%(gpurank)d: %(epoch)s %(message)s"
    )
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    adapter = HorovodAdapter(logger)
    return adapter


def _load_image(filename, channels=_CHANNELS):
    """Read and decode an image file into a float32 HWC tensor."""
    return tf.to_float(tf.image.decode_png(tf.read_file(filename), channels=channels))


def _resize(img, width=_WIDTH, height=_HEIGHT):
    return tf.image.resize_images(img, [height, width])


def _centre(img, mean_subtraction=(_R_MEAN, _G_MEAN, _B_MEAN)):
    """Subtract the per-channel dataset mean."""
    return tf.subtract(img, list(mean_subtraction))


def _random_crop(img, width=_WIDTH, height=_HEIGHT, channels=_CHANNELS):
    return tf.random_crop(img, [height, width, channels])


def _random_horizontal_flip(img):
    return tf.image.random_flip_left_right(img)


def _preprocess_images(filename):
    """filename -> decoded, resized, mean-centred HWC image tensor."""
    return pipe(filename, _load_image, _resize, _centre)


def _preprocess_labels(label):
    return tf.cast(label, dtype=tf.int32)


def _transform_to_NCHW(img):
    return tf.transpose(img, [2, 0, 1])  # Transform from NHWC to NCHW


def _parse_function_train(tensor, label):
    """Training-time augmentation: random crop + flip, then NCHW layout."""
    img_rgb = pipe(tensor, _random_crop, _random_horizontal_flip, _transform_to_NCHW)

    return img_rgb, label


def _prep(filename, label):
    """Wrap one (filename, label) pair as a single-element Dataset; used with
    parallel_interleave so decoding happens concurrently."""
    return tf.data.Dataset.from_tensor_slices(
        ([_preprocess_images(filename)], [_preprocess_labels(label)])
    )


def _parse_function_eval(filename, label):
    """Evaluation preprocessing: deterministic decode/resize/centre, NCHW."""
    return (
        pipe(filename, _preprocess_images, _transform_to_NCHW),
        _preprocess_labels(label),
    )


def _get_optimizer(params, is_distributed=_DISTRIBUTED):
    """Momentum optimizer; wrapped in Horovod's DistributedOptimizer (with the
    learning rate scaled by the number of workers) when distributed."""
    if is_distributed:
        # Horovod: add Horovod Distributed Optimizer.
        return hvd.DistributedOptimizer(
            tf.train.MomentumOptimizer(
                learning_rate=params["learning_rate"] * hvd.size(), momentum=0.9
            )
        )
    else:
        return tf.train.MomentumOptimizer(
            learning_rate=params["learning_rate"], momentum=0.9
        )


def build_network(features, mode, params):
    """Build a ResNet50 (channels_first) and run it on `features`."""
    network = resnet_v1(
        resnet_depth=50, num_classes=params["classes"], data_format="channels_first"
    )
    return network(inputs=features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))


def model_fn(features, labels, mode, params):
    """
    features: This is the x-arg from the input_fn.
    labels: This is the y-arg from the input_fn (sparse int class ids),
            see e.g. train_input_fn for these two.
    mode: Either TRAIN, EVAL, or PREDICT
    params: User-defined hyper-parameters, e.g. learning-rate.
    """
    logger = _get_logger()
    logger.info("Creating model in {} mode".format(mode))

    logits = build_network(features, mode, params)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Softmax output of the neural network.
        y_pred = tf.nn.softmax(logits=logits)

        # Classification output of the neural network.
        y_pred_cls = tf.argmax(y_pred, axis=1)

        predictions = {
            "class_ids": y_pred_cls,
            "probabilities": y_pred,
            "logits": logits,
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels
    )
    loss = tf.reduce_mean(cross_entropy)

    if mode == tf.estimator.ModeKeys.EVAL:
        # Softmax output of the neural network.
        y_pred = tf.nn.softmax(logits=logits)

        # Classification output of the neural network.
        y_pred_cls = tf.argmax(y_pred, axis=1)

        # BUG FIX: labels are already sparse class ids (see the sparse
        # cross-entropy above); the previous tf.argmax(labels, axis=1) treated
        # them as one-hot and failed on rank-1 labels.
        accuracy = tf.metrics.accuracy(
            labels=labels, predictions=y_pred_cls, name="acc_op"
        )
        metrics = {"accuracy": accuracy}
        tf.summary.scalar("accuracy", accuracy[1])
        return tf.estimator.EstimatorSpec(mode=mode, eval_metric_ops=metrics, loss=loss)

    optimizer = _get_optimizer(params)

    train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)


def _append_path_to(data_path, data_series):
    # NOTE(review): leftover helper from a pandas-based pipeline; kept for
    # backward compatibility but unused by this module.
    return data_series.apply(lambda x: path.join(data_path, x))


def _load_training(data_dir):
    """Recursively list training images (expects <data_dir>/<class>/<img>.jpg).

    BUG FIX: ``**`` only recurses when ``recursive=True`` is passed, and
    glob.glob expects a string path.
    """
    return sorted(glob.glob(str(Path(data_dir) / "**" / "*.jpg"), recursive=True))


def _load_validation(data_dir):
    """Recursively list validation images; same layout as _load_training."""
    return sorted(glob.glob(str(Path(data_dir) / "**" / "*.jpg"), recursive=True))


def _class_lookup(filenames):
    """Map class-directory names (e.g. ImageNet synset ids) to dense 0-based
    integer labels, in sorted order for determinism across workers."""
    return {
        name: idx
        for idx, name in enumerate(sorted({Path(f).parent.name for f in filenames}))
    }


def _labels_for(filenames, lookup):
    """Label array for `filenames` using the class-name -> id `lookup`."""
    return np.array(
        [lookup[Path(f).parent.name] for f in filenames], dtype=np.int32
    )


def _create_data_fn(train_path, test_path):
    """Build train/validation input functions over the on-disk ImageNet data.

    Returns (train_input_fn, validation_input_fn); each carries `.length`
    (number of examples) and `.classes` attributes used by main().
    """
    logger = _get_logger()
    logger.info("Reading training data info")
    train_files = _load_training(train_path)

    logger.info("Reading validation data info")
    validation_files = _load_validation(test_path)

    # BUG FIX: the previous code indexed these plain lists like a pandas
    # DataFrame (train_df[["num_id"]], train_df["filenames"]), which raises
    # TypeError at runtime. Labels are now derived from the class directory
    # name; validation reuses the training mapping so the ids agree.
    # NOTE(review): assumes both splits use the same class directories —
    # confirm against the data-processing notebook.
    lookup = _class_lookup(train_files)
    train_labels = _labels_for(train_files, lookup)
    validation_labels = _labels_for(validation_files, lookup)

    train_data = tf.data.Dataset.from_tensor_slices(
        (np.array(train_files), train_labels)
    )
    train_data_transform = tf.contrib.data.map_and_batch(
        _parse_function_train, _BATCHSIZE, num_parallel_batches=5
    )
    train_data = train_data.apply(
        tf.contrib.data.parallel_interleave(
            _prep, cycle_length=5, buffer_output_elements=1024
        )
    )

    train_data = (
        train_data.shuffle(1024).repeat().apply(train_data_transform).prefetch(_BUFFER)
    )

    validation_data = tf.data.Dataset.from_tensor_slices(
        (np.array(validation_files), validation_labels)
    )
    validation_data_transform = tf.contrib.data.map_and_batch(
        _parse_function_eval, _BATCHSIZE, num_parallel_batches=4
    )
    validation_data = validation_data.apply(validation_data_transform).prefetch(_BUFFER)

    def _train_input_fn():
        return train_data.make_one_shot_iterator().get_next()

    def _validation_input_fn():
        return validation_data.make_one_shot_iterator().get_next()

    _train_input_fn.length = len(train_files)
    _validation_input_fn.length = len(validation_files)
    _train_input_fn.classes = 1000
    _validation_input_fn.classes = 1000

    return _train_input_fn, _validation_input_fn


def _create_data(batch_size, num_batches, dim, channels, seed=42):
    """Random NCHW float32 image data for the synthetic benchmark."""
    np.random.seed(seed)
    return np.random.rand(batch_size * num_batches, channels, dim[0], dim[1]).astype(
        np.float32
    )


def _create_labels(batch_size, num_batches, n_classes):
    """Random integer labels for the synthetic benchmark."""
    return np.random.choice(n_classes, batch_size * num_batches)


def _create_fake_data_fn(train_length=_DATA_LENGTH, valid_length=50000, num_batches=40):
    """ Creates fake dataset

    Data is returned in NCHW since this tends to be faster on GPUs
    """
    logger = _get_logger()
    logger.info("Creating fake data")

    data_array = _create_data(_BATCHSIZE, num_batches, (_HEIGHT, _WIDTH), _CHANNELS)
    labels_array = _create_labels(_BATCHSIZE, num_batches, 1000)

    def fake_data_generator():
        for i in range(num_batches):
            yield data_array[i * _BATCHSIZE : (i + 1) * _BATCHSIZE], labels_array[
                i * _BATCHSIZE : (i + 1) * _BATCHSIZE
            ]

    # BUG FIX: from_generator is a classmethod of the abstract Dataset class;
    # the previous tf.data.Dataset().from_generator instantiated Dataset.
    train_data = tf.data.Dataset.from_generator(
        fake_data_generator,
        output_types=(tf.float32, tf.int32),
        output_shapes=(
            tf.TensorShape([None, _CHANNELS, _HEIGHT, _WIDTH]),
            tf.TensorShape([None]),
        ),
    )

    train_data = train_data.shuffle(40 * _BATCHSIZE).repeat().prefetch(_BUFFER)

    validation_data = tf.data.Dataset.from_generator(
        fake_data_generator,
        output_types=(tf.float32, tf.int32),
        output_shapes=(
            tf.TensorShape([None, _CHANNELS, _HEIGHT, _WIDTH]),
            tf.TensorShape([None]),
        ),
    )

    validation_data = validation_data.prefetch(_BUFFER)

    def _train_input_fn():
        return train_data.make_one_shot_iterator().get_next()

    def _validation_input_fn():
        return validation_data.make_one_shot_iterator().get_next()

    _train_input_fn.length = train_length
    _validation_input_fn.length = valid_length
    _train_input_fn.classes = 1000
    _validation_input_fn.classes = 1000

    return _train_input_fn, _validation_input_fn


def _get_runconfig(is_distributed=_DISTRIBUTED):
    """RunConfig for the estimator; pins each process to one GPU under Horovod."""
    if is_distributed:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())

        return tf.estimator.RunConfig(
            save_checkpoints_steps=None,
            save_checkpoints_secs=None,
            session_config=config,
        )
    else:
        return tf.estimator.RunConfig(save_checkpoints_steps=None)


def _get_model_dir(is_distributed=_DISTRIBUTED):
    """Checkpoint directory: the shared output dir on rank 0, a scratch dir
    elsewhere."""
    if is_distributed:
        # Horovod: save checkpoints only on worker 0 to prevent other workers from
        # corrupting them.
        return (
            os.getenv("AZ_BATCHAI_OUTPUT_MODEL")
            if hvd.rank() == 0
            else os.getenv("AZ_BATCHAI_JOB_TEMP_DIR")
        )
    else:
        return os.getenv("AZ_BATCHAI_OUTPUT_MODEL")


def _get_hooks(is_distributed=_DISTRIBUTED):
    """Training hooks; under Horovod, broadcast initial variables from rank 0."""
    logger = _get_logger()
    if is_distributed:
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
        logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(), hvd.size()))
        return [bcast_hook]
    else:
        return []


def _is_master(is_distributed=_DISTRIBUTED):
    """True on the process that should run evaluation / own the checkpoints."""
    if is_distributed:
        if hvd.rank() == 0:
            return True
        else:
            return False
    else:
        return True


def _log_summary(data_length, duration):
    """Log throughput statistics for the completed training run."""
    logger = _get_logger()
    images_per_second = data_length / duration
    logger.info("Data length: {}".format(data_length))
    logger.info("Total duration: {:.3f}".format(duration))
    logger.info("Total images/sec: {:.3f}".format(images_per_second))
    logger.info(
        "Batch size: (Per GPU {}: Total {})".format(
            _BATCHSIZE, hvd.size() * _BATCHSIZE if _DISTRIBUTED else _BATCHSIZE
        )
    )
    logger.info("Distributed: {}".format("True" if _DISTRIBUTED else "False"))
    logger.info("Num GPUs: {:.3f}".format(hvd.size() if _DISTRIBUTED else 1))
    logger.info("Dataset: {}".format("Synthetic" if _FAKE else "Imagenet"))


def main():
    """Train (and optionally evaluate) ResNet50, distributed via Horovod."""
    if _DISTRIBUTED:
        # Horovod: initialize Horovod.
        hvd.init()
        logger = _get_logger()
        logger.info("Running Distributed")  # BUG FIX: was "Runnin Distributed"
    else:
        logger = _get_logger()

    logger.info("Tensorflow version {}".format(tf.__version__))
    if _FAKE:
        train_input_fn, validation_input_fn = _create_fake_data_fn()
    else:
        train_input_fn, validation_input_fn = _create_data_fn(
            os.getenv("AZ_BATCHAI_INPUT_TRAIN"), os.getenv("AZ_BATCHAI_INPUT_TEST")
        )

    run_config = _get_runconfig()
    model_dir = _get_model_dir()

    params = {"learning_rate": _LR, "classes": train_input_fn.classes}
    logger.info("Creating estimator with params: {}".format(params))
    model = tf.estimator.Estimator(
        model_fn=model_fn, params=params, model_dir=model_dir, config=run_config
    )

    hooks = _get_hooks()
    num_gpus = hvd.size() if _DISTRIBUTED else 1
    with Timer(output=logger.info, prefix="Training") as t:
        logger.info("Training...")
        model.train(
            input_fn=train_input_fn,
            steps=_EPOCHS * train_input_fn.length // (_BATCHSIZE * num_gpus),
            hooks=hooks,
        )

    _log_summary(_EPOCHS * train_input_fn.length, t.elapsed)

    if _is_master() and _FAKE is False and _VALIDATION:
        with Timer(output=logger.info, prefix="Testing"):
            logger.info("Testing...")
            model.evaluate(input_fn=validation_input_fn)


if __name__ == "__main__":
    main()
def batch_norm_relu(inputs, is_training, relu=True, init_zero=False,
                    data_format='channels_first'):
    """Apply fused batch normalization, optionally followed by a ReLU.

    Args:
      inputs: `Tensor` of shape `[batch, channels, ...]`.
      is_training: `bool`, True while the model is being trained.
      relu: `bool`, apply the trailing ReLU when True.
      init_zero: `bool`, initialise the BN scale (gamma) to 0 instead of 1.
      data_format: 'channels_first' for `[batch, channels, height, width]`
        or 'channels_last' for `[batch, height, width, channels]`.

    Returns:
      A normalized `Tensor` in the same `data_format`.
    """
    gamma_init = (tf.zeros_initializer() if init_zero
                  else tf.ones_initializer())
    norm_axis = 1 if data_format == 'channels_first' else 3

    normalized = tf.layers.batch_normalization(
        inputs=inputs,
        axis=norm_axis,
        momentum=BATCH_NORM_DECAY,
        epsilon=BATCH_NORM_EPSILON,
        center=True,
        scale=True,
        training=is_training,
        fused=True,
        gamma_initializer=gamma_init)

    return tf.nn.relu(normalized) if relu else normalized


def fixed_padding(inputs, kernel_size, data_format='channels_first'):
    """Pad the spatial dimensions by an amount derived from `kernel_size` only.

    Args:
      inputs: `Tensor` in NCHW or NHWC layout depending on `data_format`.
      kernel_size: positive `int` kernel size of the following conv/pool op.
      data_format: 'channels_first' or 'channels_last'.

    Returns:
      The padded `Tensor` (unchanged when `kernel_size == 1`).
    """
    total_pad = kernel_size - 1
    pad_lo = total_pad // 2
    pad_hi = total_pad - pad_lo
    spatial = [[pad_lo, pad_hi], [pad_lo, pad_hi]]
    if data_format == 'channels_first':
        paddings = [[0, 0], [0, 0]] + spatial
    else:
        paddings = [[0, 0]] + spatial + [[0, 0]]
    return tf.pad(inputs, paddings)


def conv2d_fixed_padding(inputs, filters, kernel_size, strides,
                         data_format='channels_first'):
    """Strided 2-D convolution whose padding depends only on `kernel_size`,
    not on the input dimensions (unlike plain `tf.layers.conv2d`).

    Args:
      inputs: `Tensor` of size `[batch, channels, height_in, width_in]`.
      filters: `int` number of convolution filters.
      kernel_size: `int` convolution kernel size.
      strides: `int` convolution stride.
      data_format: 'channels_first' or 'channels_last'.

    Returns:
      A `Tensor` of shape `[batch, filters, height_out, width_out]`.
    """
    padded = (fixed_padding(inputs, kernel_size, data_format=data_format)
              if strides > 1 else inputs)
    return tf.layers.conv2d(
        inputs=padded,
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding='SAME' if strides == 1 else 'VALID',
        use_bias=False,
        kernel_initializer=tf.variance_scaling_initializer(),
        data_format=data_format)


def residual_block(inputs, filters, is_training, strides,
                   use_projection=False, data_format='channels_first'):
    """Standard two-conv residual block with batch norm after each conv.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]`.
      filters: `int` number of filters for both convolutions.
      is_training: `bool`, True while the model is being trained.
      strides: `int` block stride; > 1 downsamples the input.
      use_projection: `bool`, use a 1x1 projection shortcut instead of the
        identity shortcut (first block of a group).
      data_format: 'channels_first' or 'channels_last'.

    Returns:
      The output `Tensor` of the block.
    """
    shortcut = inputs
    if use_projection:
        # 1x1 projection so the shortcut matches filters and stride.
        shortcut = conv2d_fixed_padding(
            inputs=inputs, filters=filters, kernel_size=1, strides=strides,
            data_format=data_format)
        shortcut = batch_norm_relu(shortcut, is_training, relu=False,
                                   data_format=data_format)

    out = conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=3, strides=strides,
        data_format=data_format)
    out = batch_norm_relu(out, is_training, data_format=data_format)

    out = conv2d_fixed_padding(
        inputs=out, filters=filters, kernel_size=3, strides=1,
        data_format=data_format)
    # Zero-init gamma so each block starts as (approximately) the identity.
    out = batch_norm_relu(out, is_training, relu=False, init_zero=True,
                          data_format=data_format)

    return tf.nn.relu(out + shortcut)


def bottleneck_block(inputs, filters, is_training, strides,
                     use_projection=False, data_format='channels_first'):
    """1x1 -> 3x3 -> 1x1 bottleneck residual block (output has 4x filters).

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]`.
      filters: `int` filters for the first two convolutions; the final 1x1
        conv uses 4 times as many.
      is_training: `bool`, True while the model is being trained.
      strides: `int` block stride; > 1 downsamples the input.
      use_projection: `bool`, use a 1x1 projection shortcut instead of the
        identity shortcut (first block of a group).
      data_format: 'channels_first' or 'channels_last'.

    Returns:
      The output `Tensor` of the block.
    """
    shortcut = inputs
    if use_projection:
        # Bottleneck blocks end with 4x the filters, so the projection
        # shortcut must widen to match.
        shortcut = conv2d_fixed_padding(
            inputs=inputs, filters=4 * filters, kernel_size=1, strides=strides,
            data_format=data_format)
        shortcut = batch_norm_relu(shortcut, is_training, relu=False,
                                   data_format=data_format)

    out = conv2d_fixed_padding(
        inputs=inputs, filters=filters, kernel_size=1, strides=1,
        data_format=data_format)
    out = batch_norm_relu(out, is_training, data_format=data_format)

    out = conv2d_fixed_padding(
        inputs=out, filters=filters, kernel_size=3, strides=strides,
        data_format=data_format)
    out = batch_norm_relu(out, is_training, data_format=data_format)

    out = conv2d_fixed_padding(
        inputs=out, filters=4 * filters, kernel_size=1, strides=1,
        data_format=data_format)
    # Zero-init gamma so each block starts as (approximately) the identity.
    out = batch_norm_relu(out, is_training, relu=False, init_zero=True,
                          data_format=data_format)

    return tf.nn.relu(out + shortcut)


def block_group(inputs, filters, block_fn, blocks, strides, is_training, name,
                data_format='channels_first'):
    """Stack `blocks` residual blocks into one ResNet block group.

    Args:
      inputs: `Tensor` of size `[batch, channels, height, width]`.
      filters: `int` filters for the first convolution of the layer.
      block_fn: block builder, `residual_block` or `bottleneck_block`.
      blocks: `int` number of blocks in the group.
      strides: `int` stride for the first block; > 1 downsamples the input.
      is_training: `bool`, True while the model is being trained.
      name: `str` name for the group's output tensor.
      data_format: 'channels_first' or 'channels_last'.

    Returns:
      The output `Tensor` of the block group.
    """
    # Only the first block applies the stride and the projection shortcut.
    out = block_fn(inputs, filters, is_training, strides,
                   use_projection=True, data_format=data_format)

    for _ in range(blocks - 1):
        out = block_fn(out, filters, is_training, 1, data_format=data_format)

    return tf.identity(out, name)
250 | 251 | Returns: 252 | Model `function` that takes in `inputs` and `is_training` and returns the 253 | output `Tensor` of the ResNet model. 254 | """ 255 | 256 | def model(inputs, is_training): 257 | """Creation of the model graph.""" 258 | inputs = conv2d_fixed_padding( 259 | inputs=inputs, filters=64, kernel_size=7, strides=2, 260 | data_format=data_format) 261 | inputs = tf.identity(inputs, 'initial_conv') 262 | inputs = batch_norm_relu(inputs, is_training, data_format=data_format) 263 | 264 | inputs = tf.layers.max_pooling2d( 265 | inputs=inputs, pool_size=3, strides=2, padding='SAME', 266 | data_format=data_format) 267 | inputs = tf.identity(inputs, 'initial_max_pool') 268 | 269 | inputs = block_group( 270 | inputs=inputs, filters=64, block_fn=block_fn, blocks=layers[0], 271 | strides=1, is_training=is_training, name='block_group1', 272 | data_format=data_format) 273 | inputs = block_group( 274 | inputs=inputs, filters=128, block_fn=block_fn, blocks=layers[1], 275 | strides=2, is_training=is_training, name='block_group2', 276 | data_format=data_format) 277 | inputs = block_group( 278 | inputs=inputs, filters=256, block_fn=block_fn, blocks=layers[2], 279 | strides=2, is_training=is_training, name='block_group3', 280 | data_format=data_format) 281 | inputs = block_group( 282 | inputs=inputs, filters=512, block_fn=block_fn, blocks=layers[3], 283 | strides=2, is_training=is_training, name='block_group4', 284 | data_format=data_format) 285 | 286 | # The activation is 7x7 so this is a global average pool. 
287 | inputs = tf.layers.average_pooling2d( 288 | inputs=inputs, pool_size=7, strides=1, padding='VALID', 289 | data_format=data_format) 290 | inputs = tf.identity(inputs, 'final_avg_pool') 291 | inputs = tf.reshape( 292 | inputs, [-1, 2048 if block_fn is bottleneck_block else 512]) 293 | inputs = tf.layers.dense( 294 | inputs=inputs, 295 | units=num_classes, 296 | kernel_initializer=tf.random_normal_initializer(stddev=.01)) 297 | inputs = tf.identity(inputs, 'final_dense') 298 | return inputs 299 | 300 | model.default_image_size = 224 301 | return model 302 | 303 | 304 | def resnet_v1(resnet_depth, num_classes, data_format='channels_first'): 305 | """Returns the ResNet model for a given size and number of output classes.""" 306 | model_params = { 307 | 18: {'block': residual_block, 'layers': [2, 2, 2, 2]}, 308 | 34: {'block': residual_block, 'layers': [3, 4, 6, 3]}, 309 | 50: {'block': bottleneck_block, 'layers': [3, 4, 6, 3]}, 310 | 101: {'block': bottleneck_block, 'layers': [3, 4, 23, 3]}, 311 | 152: {'block': bottleneck_block, 'layers': [3, 8, 36, 3]}, 312 | 200: {'block': bottleneck_block, 'layers': [3, 24, 36, 3]} 313 | } 314 | 315 | if resnet_depth not in model_params: 316 | raise ValueError('Not a valid resnet_depth:', resnet_depth) 317 | 318 | params = model_params[resnet_depth] 319 | return resnet_v1_generator( 320 | params['block'], params['layers'], num_classes, data_format) 321 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | define PROJECT_HELP_MSG 2 | Usage: 3 | make help show this message 4 | make build build docker image 5 | make push push container 6 | make run run benchmarking container 7 | make jupyter run jupyter notebook inside container 8 | endef 9 | export PROJECT_HELP_MSG 10 | PWD:=$(shell pwd) 11 | dockerhub:= 12 | data:= 13 | image_name:=$(dockerhub)/distributed-training-control 14 | 15 | help: 16 | echo "$$PROJECT_HELP_MSG" | less 17 | 18 | build: 19 | docker build -t $(image_name) Docker 20 | 21 | jupyter: 22 | docker run -p 9999:9999 \ 23 | -e EXT_PWD=$(PWD) \ 24 | -e EXT_DATA=$(data) \ 25 | -e DOCKER_REPOSITORY=$(dockerhub) \ 26 | -v $(PWD):/workspace \ 27 | -v $(data):/data \ 28 | -v 
/var/run/docker.sock:/var/run/docker.sock \ 29 | -it $(image_name) bash -c "jupyter notebook" 30 | 31 | run: 32 | docker run -p 9999:9999 \ 33 | -e EXT_PWD=$(PWD) \ 34 | -e EXT_DATA=$(data) \ 35 | -e DOCKER_REPOSITORY=$(dockerhub) \ 36 | -v $(PWD):/workspace \ 37 | -v $(data):/data \ 38 | -v /var/run/docker.sock:/var/run/docker.sock \ 39 | -it $(image_name) 40 | 41 | push: 42 | docker push $(image_name) 43 | 44 | 45 | 46 | .PHONY: help build push 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributed Training on Batch AI 2 | 3 | This repo is a tutorial on how to train a CNN model in a distributed fashion using Batch AI. 4 | The scenario covered is image classification, but the solution can be generalized for other deep learning scenarios such as segmentation and object detection. 5 | 6 | ![Distributed training diagram](images/dist_training_diag2.png "Distributed training diagram") 7 | 8 | Image classification is a common task in computer vision applications and is often tackled by training a convolutional neural network (CNN). 9 | For particularly large models with large datasets, the training process can take weeks or months on a single GPU. 10 | In some situations, the models are so large that it isn’t possible to fit reasonable batch sizes onto the GPU. 11 | Using distributed training in these situations helps shorten the training time. 12 | In this specific scenario, a ResNet50 CNN model is trained using Horovod on the ImageNet dataset as well as on synthetic data. 13 | The tutorial demonstrates how to accomplish this using three of the most popular deep learning frameworks: TensorFlow, Keras, and PyTorch. 14 | There are a number of ways to train a deep learning model in a distributed fashion, including data parallel and model parallel approaches based on synchronous and asynchronous updates. 
15 | Currently the most common scenario is data parallel with synchronous updates—it’s the easiest to implement and sufficient for the majority of use cases. 16 | In data parallel distributed training with synchronous updates the model is replicated across N hardware devices and a 17 | mini-batch of training samples is divided into N micro-batches (see Figure 2). 18 | Each device performs the forward and backward pass for a micro-batch and when it finishes the process it shares the 19 | updates with the other devices. These are then used to calculate the updated weights of the entire mini-batch and then the 20 | weights are synchronized across the models. This is the scenario that is covered in the GitHub repository. The same architecture though can 21 | be used for model parallel and asynchronous updates. 22 | 23 | 24 | ## Prerequisites 25 | * Computer with Nvidia GPU (The path was tested on an [Azure NC12 Ubuntu DSVM](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu)) 26 | * Linux 27 | * [Docker](https://docs.docker.com/install/linux/docker-ce/ubuntu/) installed 28 | * [Nvidia Docker runtime](https://github.com/NVIDIA/nvidia-container-runtime) installed 29 | * [Dockerhub](https://hub.docker.com/) account 30 | * Port 9999 open on the VM or computer 31 | * ImageNet dataset (look at [this](00_DataProcessing.ipynb) notebook for details) 32 | 33 | ## Setup 34 | Before you begin make sure you are logged into your dockerhub account by running on your machine: 35 | 36 | ```bash 37 | docker login 38 | ``` 39 | 40 | 41 | 42 | ### Setup Execution Environment 43 | Before being able to run anything you will need to set up the environment in which you will be executing the Batch AI commands etc. 44 | There are a number of dependencies therefore we offer a dockerfile that will take care of these dependencies for you. 
45 | If you don't want to use Docker simply look inside the Docker directory at the dockerfile and environment.yml file for the dependencies. 46 | To build the container run (replace all instances of `<dockerhub>` with your own dockerhub account name): 47 | 48 | ```bash 49 | make build dockerhub=<dockerhub> 50 | ``` 51 | 52 | Then you run the command to start the environment (replace `<data>` with a location on your file system. Make sure it has at least 300GB of free space for the ImageNet dataset) 53 | ```bash 54 | make jupyter dockerhub=<dockerhub> data=<data> 55 | ``` 56 | 57 | This will start the Jupyter notebook on port 9999. Simply point your browser to the IP or DNS of your machine. 58 | From there you can navigate to [00_DataProcessing.ipynb](00_DataProcessing.ipynb) to process the ImageNet Data. 59 | 60 | Once you have covered the two prerequisite notebooks folders [00_DataProcessing.ipynb](00_DataProcessing.ipynb) and [01_CreateResources.ipynb](01_CreateResources.ipynb) you can 61 | navigate to the tutorials for each of the frameworks [HorovodTF](HorovodTF), [HorovodPytorch](HorovodPytorch) and [HorovodKeras](HorovodKeras). 62 | 63 | 64 | 65 | # Contributing 66 | 67 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 68 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 69 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 70 | 71 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 72 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 73 | provided by the bot. You will only need to do this once across all repos using our CLA. 74 | 75 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
76 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 77 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 78 | -------------------------------------------------------------------------------- /common/timer.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import functools 3 | import logging 4 | from timeit import default_timer 5 | 6 | 7 | class Timer(object): 8 | 9 | """ 10 | 11 | Keyword arguments: 12 | output: if True, print output after exiting context. 13 | if callable, pass output to callable. 14 | format: str.format string to be used for output; default "took {} seconds" 15 | prefix: string to prepend (plus a space) to output 16 | For convenience, if you only specify this, output defaults to True. 17 | """ 18 | 19 | def __init__(self, 20 | timer=default_timer, 21 | factor=1, 22 | output=None, 23 | fmt="took {:.3f} seconds", 24 | prefix=""): 25 | self._timer = timer 26 | self._factor = factor 27 | self._output = output 28 | self._fmt = fmt 29 | self._prefix = prefix 30 | self._end = None 31 | self._start = None 32 | 33 | def start(self): 34 | self._start = self() 35 | 36 | def stop(self): 37 | self._end = self() 38 | 39 | def __call__(self): 40 | """ Return the current time """ 41 | return self._timer() 42 | 43 | def __enter__(self): 44 | """ Set the start time """ 45 | self.start() 46 | return self 47 | 48 | def __exit__(self, exc_type, exc_value, exc_traceback): 49 | """ Set the end time """ 50 | self.stop() 51 | 52 | if self._output is True or (self._output is None and self._prefix): 53 | self._output = print 54 | 55 | if callable(self._output): 56 | output = " ".join([self._prefix, self._fmt.format(self.elapsed)]) 57 | self._output(output) 58 | 59 | def __str__(self): 60 | return '%.3f' % (self.elapsed) 61 | 62 | @property 63 | def elapsed(self): 64 | """ Return the elapsed time 
65 | """ 66 | if self._end is None: 67 | # if elapsed is called in the context manager scope 68 | return (self() - self._start) * self._factor 69 | else: 70 | # if elapsed is called out of the context manager scope 71 | return (self._end - self._start) * self._factor 72 | 73 | 74 | def timer(logger=None, 75 | level=logging.INFO, 76 | fmt="function %(function_name)s execution time: %(execution_time).3f", 77 | *func_or_func_args, 78 | **timer_kwargs): 79 | """ Function decorator displaying the function execution time 80 | """ 81 | def wrapped_f(f): 82 | @functools.wraps(f) 83 | def wrapped(*args, **kwargs): 84 | with Timer(**timer_kwargs) as t: 85 | out = f(*args, **kwargs) 86 | context = { 87 | 'function_name': f.__name__, 88 | 'execution_time': t.elapsed, 89 | } 90 | if logger: 91 | logger.log( 92 | level, 93 | fmt % context, 94 | extra=context) 95 | else: 96 | print(fmt % context) 97 | return out 98 | 99 | return wrapped 100 | 101 | if (len(func_or_func_args) == 1 102 | and isinstance(func_or_func_args[0], collections.Callable)): 103 | return wrapped_f(func_or_func_args[0]) 104 | else: 105 | return wrapped_f 106 | -------------------------------------------------------------------------------- /common/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from dotenv import dotenv_values, set_key, find_dotenv, get_key 4 | from getpass import getpass 5 | 6 | 7 | def _create_env(dotenv_path): 8 | with open(dotenv_path, 'a'): 9 | os.utime(dotenv_path) 10 | 11 | 12 | def dotenv_for(): 13 | dotenv_path = find_dotenv() 14 | if dotenv_path == '': 15 | dotenv_path = '.env' 16 | _create_env(dotenv_path) 17 | return dotenv_path 18 | 19 | 20 | def get_password(dotenv_path): 21 | if 'PASSWORD' not in dotenv_values(dotenv_path=dotenv_path): 22 | print('Password not set') 23 | password = getpass('Please enter password to use for the cluster') 24 | _ = set_key(dotenv_path, 'PASSWORD', password) 25 | return 
get_key(dotenv_path, 'PASSWORD') 26 | 27 | 28 | def write_json_to_file(json_dict, filename, mode='w'): 29 | with open(filename, mode) as outfile: 30 | json.dump(json_dict, outfile, indent=4, sort_keys=True) 31 | outfile.write('\n\n') 32 | -------------------------------------------------------------------------------- /images/dist_training_diag2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/DistributedDeepLearning/d037c568bbd4394fbf2f668937d32122ae5a1a37/images/dist_training_diag2.png -------------------------------------------------------------------------------- /include/build.mk: -------------------------------------------------------------------------------- 1 | define PROJECT_HELP_MSG 2 | Usage: 3 | make help show this message 4 | make build make image 5 | make push push image 6 | endef 7 | export PROJECT_HELP_MSG 8 | 9 | help: 10 | echo "$$PROJECT_HELP_MSG" | less 11 | 12 | build: 13 | docker build -t $(image) $(dockerpath) 14 | 15 | push: 16 | docker push $(image) 17 | 18 | 19 | .PHONY: help build push 20 | --------------------------------------------------------------------------------