├── recipes
│   ├── Caffe
│   │   ├── Caffe-GPU
│   │   │   ├── preparation_script.sh
│   │   │   ├── lenet_solver.prototxt
│   │   │   ├── Readme.md
│   │   │   └── lenet_train_test.prototxt
│   │   └── README.md
│   ├── configuration.json.template
│   ├── Keras
│   │   ├── Keras-DSVM
│   │   │   ├── job.json
│   │   │   ├── Readme.md
│   │   │   ├── mnist_cnn.py
│   │   │   └── cli-instructions.md
│   │   └── README.md
│   ├── TensorFlow
│   │   ├── TensorFlow-GPU
│   │   │   ├── job.json
│   │   │   ├── Readme.md
│   │   │   ├── cli-instructions.md
│   │   │   ├── TensorFlow-GPU.ipynb
│   │   │   └── convolutional.py
│   │   ├── Readme.md
│   │   └── TensorFlow-GPU-Distributed
│   │       ├── job.json
│   │       ├── Readme.md
│   │       ├── cli-instructions.md
│   │       └── mnist_replica.py
│   ├── Chainer
│   │   ├── Chainer-GPU-Distributed
│   │   │   ├── job.json
│   │   │   ├── Readme.md
│   │   │   ├── docker
│   │   │   │   └── dockerfile
│   │   │   ├── cli-instructions.md
│   │   │   └── train_mnist.py
│   │   └── README.md
│   ├── CNTK
│   │   ├── CNTK-GPU-Python-Distributed
│   │   │   ├── CIFA-10_data_prepare.sh
│   │   │   ├── job.json
│   │   │   ├── Readme.md
│   │   │   ├── cli-instructions.md
│   │   │   ├── ConvNet_CIFAR10_DataAug.py
│   │   │   └── ConvNet_CIFAR10_DataAug_Distributed.py
│   │   ├── CNTK-GPU-Python
│   │   │   ├── job.json
│   │   │   ├── Readme.md
│   │   │   ├── cli-instructions.md
│   │   │   └── ConvNet_MNIST.py
│   │   ├── CNTK-GPU-BrainScript
│   │   │   ├── job.json
│   │   │   ├── Readme.md
│   │   │   ├── ConvNet_MNIST.cntk
│   │   │   └── cli-instructions.md
│   │   ├── CNTK-GPU-BrainScript-Distributed
│   │   │   ├── job.json
│   │   │   ├── Readme.md
│   │   │   ├── ConvNet_MNIST.cntk
│   │   │   └── cli-instructions.md
│   │   ├── CNTK-GPU-Python-Distrbuted-Infiniband
│   │   │   ├── job.json
│   │   │   ├── jobprep_cntk_distributed_ib.sh
│   │   │   ├── Readme.md
│   │   │   ├── cli-instructions.md
│   │   │   └── dockerfile
│   │   └── Readme.md
│   ├── Caffe2
│   │   ├── README.md
│   │   └── Caffe2-GPU-Distributed
│   │       └── Readme.md
│   ├── Horovod
│   │   ├── Horovod-Infiniband-Benchmark
│   │   │   ├── job.json
│   │   │   ├── jobprep_benchmark.sh
│   │   │   ├── Readme.md
│   │   │   └── cli-instructions.md
│   │   ├── Horovod
│   │   │   ├── job.json
│   │   │   ├── Readme.md
│   │   │   └── cli-instructions.md
│   │   └── Readme.md
│   ├── utilities.py
│   └── Readme.md
├── LICENSE
├── README.md
└── .gitignore

--------------------------------------------------------------------------------
/recipes/Caffe/Caffe-GPU/preparation_script.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/bash
2 | 
3 | sed 's,$AZ_BATCHAI_OUTPUT_MODEL,'$AZ_BATCHAI_OUTPUT_MODEL',g; s,$AZ_BATCHAI_INPUT_SAMPLE,'$AZ_BATCHAI_INPUT_SAMPLE',g' $AZ_BATCHAI_INPUT_SAMPLE/lenet_solver.prototxt.template > $AZ_BATCHAI_INPUT_SAMPLE/lenet_solver.prototxt
4 | sed 's,$AZ_BATCHAI_INPUT_SAMPLE,'$AZ_BATCHAI_INPUT_SAMPLE',g' $AZ_BATCHAI_INPUT_SAMPLE/lenet_train_test.prototxt.template > $AZ_BATCHAI_INPUT_SAMPLE/lenet_train_test.prototxt
5 | 
--------------------------------------------------------------------------------
/recipes/configuration.json.template:
--------------------------------------------------------------------------------
1 | {
2 |     "subscription_id": "",
3 |     "aad_client_id": "",
4 |     "aad_secret": "",
5 |     "aad_tenant": "",
6 |     "location": "eastus",
7 |     "base_url": "",
8 |     "resource_group": "",
9 |     "storage_account" : {
10 |         "name": "",
11 |         "key": ""
12 |     },
13 |     "admin_user" : {
14 |         "name": "",
15 |         "password": "",
16 |         "ssh_public_key": ""
17 |     }
18 | }
19 | 
--------------------------------------------------------------------------------
/recipes/Keras/Keras-DSVM/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 1,
4 |         "customToolkitSettings": {
5 |             "commandLine": "KERAS_BACKEND=cntk python $AZ_BATCHAI_INPUT_SCRIPT/mnist_cnn.py"
6 |         },
7 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
8 |         "inputDirectories": [{
9 |             "id": "SCRIPT",
10 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/keras_samples"
"$AZ_BATCHAI_MOUNT_ROOT/external/keras_samples" 11 | }] 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /recipes/TensorFlow/TensorFlow-GPU/job.json: -------------------------------------------------------------------------------- 1 | { 2 | "properties": { 3 | "nodeCount": 1, 4 | "tensorFlowSettings": { 5 | "pythonScriptFilePath": "$AZ_BATCHAI_INPUT_SCRIPT/convolutional.py", 6 | "masterCommandLineArgs": "-p" 7 | }, 8 | "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external", 9 | "inputDirectories": [{ 10 | "id": "SCRIPT", 11 | "path": "$AZ_BATCHAI_MOUNT_ROOT/external/tensorflow_samples" 12 | }], 13 | "containerSettings": { 14 | "imageSourceRegistry": { 15 | "image": "tensorflow/tensorflow:1.1.0-gpu" 16 | } 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /recipes/Caffe/README.md: -------------------------------------------------------------------------------- 1 | # Caffe 2 | 3 | Caffe is a deep learning framework made with expression, speed, and modularity in mind. It is developed by Berkeley AI Research (BAIR)/The Berkeley Vision and Learning Center (BVLC) and community contributors. 4 | 5 | See official Caffe GitHub page (https://github.com/BVLC/caffe). 6 | 7 | #### [Caffe-GPU](./Caffe-GPU) 8 | This Caffe-GPU recipe contains information on how to run Caffe training job on a GPU node with BatchAI. 9 | 10 | ## Help or Feedback 11 | -------------------- 12 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub. 13 | 14 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI. 15 | -------------------------------------------------------------------------------- /recipes/Chainer/Chainer-GPU-Distributed/job.json: -------------------------------------------------------------------------------- 1 | { 2 | "properties": { 3 | "nodeCount": 2, 4 | "chainerSettings": { 5 | "pythonScriptFilePath": "$AZ_BATCHAI_INPUT_SCRIPT/train_mnist.py", 6 | "masterCommandLineArgs": "-g -o $AZ_BATCHAI_OUTPUT_MODEL" 7 | }, 8 | "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external", 9 | "inputDirectories": [{ 10 | "id": "SCRIPT", 11 | "path": "$AZ_BATCHAI_MOUNT_ROOT/external/chainer_samples" 12 | }], 13 | "outputDirectories": [{ 14 | "id": "MODEL", 15 | "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external", 16 | "pathSuffix": "Models" 17 | }], 18 | "containerSettings": { 19 | "imageSourceRegistry": { 20 | "image": "batchaitraining/chainermn:openMPI" 21 | } 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-Python-Distributed/CIFA-10_data_prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | if [ ! -z $AZ_BATCHAI_JOB_TEMP ];then 3 | cd $AZ_BATCHAI_JOB_TEMP 4 | wget 'https://batchaisamples.blob.core.windows.net/samples/CIFAR-10_dataset.tar?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=b&sig=nFXsAp0Eq%2BoS5%2BKAEPnfyEGlCkBcKIadDvCPA%2BcX6lU%3D' -k -O 'CIFAR-10_dataset.tar' 5 | echo "untar CIFAR-10 dataset........." 
6 |     tar -xf CIFAR-10_dataset.tar
7 |     echo "done"
8 |     ROOT_DIR=`pwd`
9 |     files=( "train_map.txt" "test_map.txt" )
10 |     for file in "${files[@]}"
11 |     do
12 |         output=$ROOT_DIR"/"$file
13 |         if [ -f $output ];then
14 |             rm $output
15 |         fi
16 |         touch $output
17 |         while read -r line
18 |         do
19 |             name="$line"
20 |             echo "$ROOT_DIR$name" >> $output
21 |         done < $file".template"
22 |     done
23 | fi
24 | 
--------------------------------------------------------------------------------
/recipes/Caffe/Caffe-GPU/lenet_solver.prototxt:
--------------------------------------------------------------------------------
1 | # The train/test net protocol buffer definition
2 | net: "$AZ_BATCHAI_INPUT_SAMPLE/lenet_train_test.prototxt"
3 | # test_iter specifies how many forward passes the test should carry out.
4 | # In the case of MNIST, we have test batch size 100 and 100 test iterations,
5 | # covering the full 10,000 testing images.
6 | test_iter: 100
7 | # Carry out testing every 500 training iterations.
8 | test_interval: 500
9 | # The base learning rate, momentum and the weight decay of the network.
10 | base_lr: 0.01
11 | momentum: 0.9
12 | weight_decay: 0.0005
13 | # The learning rate policy
14 | lr_policy: "inv"
15 | gamma: 0.0001
16 | power: 0.75
17 | # Display every 100 iterations
18 | display: 100
19 | # The maximum number of iterations
20 | max_iter: 10000
21 | # snapshot intermediate results
22 | snapshot: 5000
23 | snapshot_prefix: "$AZ_BATCHAI_OUTPUT_MODEL/lenet"
24 | # solver mode: CPU or GPU
25 | solver_mode: GPU
26 | 
--------------------------------------------------------------------------------
/recipes/Caffe2/README.md:
--------------------------------------------------------------------------------
1 | # Caffe2
2 | 
3 | Caffe2 is a Python-based, lightweight, modular, and scalable deep learning framework. Building on the original Caffe, Caffe2 is designed with expression, speed, and modularity in mind.
4 | 
5 | See official Caffe2 GitHub page (https://github.com/caffe2/caffe2).
6 | 
7 | #### [Caffe2-GPU-Distributed](./Caffe2-GPU-Distributed)
8 | This Caffe2-GPU-Distributed recipe contains information on how to run a distributed Caffe2 training job across multiple GPU nodes with BatchAI, by setting up a single-node NFS file server.
9 | 
10 | ## Help or Feedback
11 | --------------------
12 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
13 | 
14 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
15 | 
--------------------------------------------------------------------------------
/recipes/Keras/README.md:
--------------------------------------------------------------------------------
1 | # Keras
2 | 
3 | Keras is a high-level neural networks API, written in Python and capable of running on top of TensorFlow, CNTK, or Theano. It was developed with a focus on enabling fast experimentation. Being able to go from idea to result with the least possible delay is key to doing good research.
4 | 
5 | See official Keras GitHub page (https://github.com/fchollet/keras).
6 | 
7 | #### [Keras-DSVM](./Keras-DSVM)
8 | This Keras-DSVM recipe contains information on how to run a Keras training job on a GPU data science node with BatchAI.
9 | 
10 | ## Help or Feedback
11 | --------------------
12 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
13 | 
14 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
15 | 
--------------------------------------------------------------------------------
/recipes/Horovod/Horovod-Infiniband-Benchmark/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 2,
4 |         "jobPreparation": {
5 |             "commandLine": "bash $AZ_BATCHAI_INPUT_SCRIPTS/jobprep_benchmark.sh"
6 |         },
7 |         "customToolkitSettings": {
8 |             "commandLine": "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh; cd $AZ_BATCHAI_JOB_TEMP/benchmarks/; mpirun -n 8 -ppn 4 -hosts $AZ_BATCH_HOST_LIST -env I_MPI_FABRICS=dapl -env I_MPI_DAPL_PROVIDER=ofa-v2-ib0 -env I_MPI_DYNAMIC_CONNECTION=0 python scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --model resnet101 --batch_size 64 --variable_update horovod"
9 |         },
10 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
11 |         "inputDirectories": [{
12 |             "id": "SCRIPTS",
13 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/horovod_samples"
14 |         }
15 |         ],
16 |         "containerSettings": {
17 |             "imageSourceRegistry": {
18 |                 "image": "tensorflow/tensorflow:1.4.0-gpu"
19 |             }
20 |         }
21 |     }
22 | }
23 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-Python/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 1,
4 |         "cntkSettings": {
5 |             "pythonScriptFilePath": "$AZ_BATCHAI_INPUT_SCRIPT/ConvNet_MNIST.py",
6 |             "commandLineArgs": "$AZ_BATCHAI_INPUT_DATASET $AZ_BATCHAI_OUTPUT_MODEL"
7 |         },
8 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
9 |         "inputDirectories": [{
10 |             "id": "DATASET",
11 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/mnist_database"
12 |         }, {
13 |             "id": "SCRIPT",
14 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/cntk_samples"
15 |         }],
16 |         "outputDirectories": [{
17 |             "id": "MODEL",
18 |             "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
19 |             "pathSuffix": "Models"
20 |         }],
21 |         "containerSettings": {
22 |             "imageSourceRegistry": {
23 |                 "image": "microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0"
24 |             }
25 |         }
26 |     }
27 | }
28 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-BrainScript/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 1,
4 |         "cntkSettings": {
5 |             "configFilePath": "$AZ_BATCHAI_INPUT_CONFIG/ConvNet_MNIST.cntk",
6 |             "commandLineArgs": "rootDir=. dataDir=$AZ_BATCHAI_INPUT_DATASET outputDir=$AZ_BATCHAI_OUTPUT_MODEL"
7 |         },
8 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
9 |         "inputDirectories": [{
10 |             "id": "DATASET",
11 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/mnist_database"
12 |         }, {
13 |             "id": "CONFIG",
14 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/cntk_samples"
15 |         }],
16 |         "outputDirectories": [{
17 |             "id": "MODEL",
18 |             "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
19 |             "pathSuffix": "Models"
20 |         }],
21 |         "containerSettings": {
22 |             "imageSourceRegistry": {
23 |                 "image": "microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0"
24 |             }
25 |         }
26 |     }
27 | }
28 | 
--------------------------------------------------------------------------------
/recipes/Horovod/Horovod/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 2,
4 |         "jobPreparation": {
5 |             "commandLine": "apt update; apt install mpi-default-dev mpi-default-bin -y; pip install horovod"
6 |         },
7 |         "customToolkitSettings": {
8 |             "commandLine": "mpirun -mca btl_tcp_if_exclude docker0,lo --allow-run-as-root --hostfile $AZ_BATCHAI_MPI_HOST_FILE python $AZ_BATCHAI_INPUT_SCRIPTS/tensorflow_mnist.py"
9 |         },
10 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
11 |         "outputDirectories": [
12 |             {
13 |                 "createNew": true,
14 |                 "id": "MODEL",
15 |                 "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
16 |                 "pathSuffix": "Models",
17 |                 "type": "custom"
18 |             }
19 |         ],
20 |         "inputDirectories": [{
21 |             "id": "SCRIPTS",
22 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/horovod_samples"
23 |         }
24 |         ],
25 |         "containerSettings": {
26 |             "imageSourceRegistry": {
27 |                 "image": "tensorflow/tensorflow:1.1.0-gpu"
28 |             }
29 |         }
30 |     }
31 | }
32 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-BrainScript-Distributed/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 2,
4 |         "cntkSettings": {
5 |             "configFilePath": "$AZ_BATCHAI_INPUT_CONFIG/DistributedConvNet_MNIST.cntk",
6 |             "commandLineArgs": "rootDir=. dataDir=$AZ_BATCHAI_INPUT_DATASET outputDir=$AZ_BATCHAI_OUTPUT_MODEL parallelTrain=true"
7 |         },
8 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
9 |         "inputDirectories": [{
10 |             "id": "DATASET",
11 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/mnist_database"
12 |         }, {
13 |             "id": "CONFIG",
14 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/cntk_samples"
15 |         }],
16 |         "outputDirectories": [{
17 |             "id": "MODEL",
18 |             "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
19 |             "pathSuffix": "Models"
20 |         }],
21 |         "containerSettings": {
22 |             "imageSourceRegistry": {
23 |                 "image": "microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0"
24 |             }
25 |         }
26 |     }
27 | }
28 | 
--------------------------------------------------------------------------------
/recipes/Horovod/Readme.md:
--------------------------------------------------------------------------------
1 | # Horovod
2 | 
3 | Horovod is a distributed training framework for TensorFlow. The goal of Horovod is to make distributed Deep Learning
4 | fast and easy to use.
5 | 
6 | See official [Horovod GitHub page](https://github.com/uber/horovod).
7 | 
8 | #### [Horovod](./Horovod)
9 | 
10 | This Horovod recipe contains information on how to run a Horovod training job on a GPU cluster with Batch AI.
11 | 
12 | #### [Horovod-Infiniband-Benchmark](./Horovod-Infiniband-Benchmark)
13 | 
14 | This Horovod-Infiniband-Benchmark recipe contains information on how to reproduce [Horovod distributed training benchmarks](https://github.com/uber/horovod/blob/master/docs/benchmarks.md) with Infiniband support using Batch AI.
15 | 
16 | 
17 | ## Help or Feedback
18 | --------------------
19 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
20 | 
21 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
22 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-Python-Distributed/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 2,
4 |         "cntkSettings": {
5 |             "pythonScriptFilePath": "$AZ_BATCHAI_INPUT_SCRIPT/ConvNet_CIFAR10_DataAug_Distributed.py",
6 |             "commandLineArgs": "--datadir $AZ_BATCHAI_JOB_TEMP -outputdir $AZ_BATCHAI_OUTPUT_MODEL -n 5",
7 |             "processCount": 2
8 |         },
9 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
10 |         "inputDirectories": [{
11 |             "id": "SCRIPT",
12 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/cntk_samples"
13 |         }],
14 |         "outputDirectories": [{
15 |             "id": "MODEL",
16 |             "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
17 |             "pathSuffix": "Models"
18 |         }],
19 |         "containerSettings": {
20 |             "imageSourceRegistry": {
21 |                 "image": "microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0"
22 |             }
23 |         },
24 |         "jobPreparation": {
25 |             "commandLine": "bash $AZ_BATCHAI_INPUT_SCRIPT/CIFA-10_data_prepare.sh"
26 |         }
27 |     }
28 | }
29 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-Python-Distrbuted-Infiniband/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 2,
4 |         "cntkSettings": {
5 |             "pythonScriptFilePath": "$AZ_BATCHAI_INPUT_SCRIPT/TrainResNet_CIFAR10_Distributed.py",
6 |             "commandLineArgs": "--datadir $AZ_BATCHAI_JOB_TEMP -outputdir $AZ_BATCHAI_OUTPUT_MODEL -n resnet110 -e 5",
7 |             "processCount": 8
8 |         },
9 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
10 |         "inputDirectories": [{
11 |             "id": "SCRIPT",
12 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/cntk_samples"
13 |         }],
14 |         "outputDirectories": [{
15 |             "id": "MODEL",
16 |             "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
17 |             "pathSuffix": ""
18 |         }],
19 |         "containerSettings": {
20 |             "imageSourceRegistry": {
21 |                 "image": "batchaitraining/cntk:2.3-gpu-1bitsgd-py36-cuda8-cudnn6-intelmpi"
22 |             }
23 |         },
24 |         "jobPreparation": {
25 |             "commandLine": "bash $AZ_BATCHAI_INPUT_SCRIPT/jobprep_cntk_distributed_ib.sh"
26 |         }
27 |     }
28 | }
29 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) Microsoft Corporation. All rights reserved.
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE
22 | 
--------------------------------------------------------------------------------
/recipes/TensorFlow/TensorFlow-GPU/Readme.md:
--------------------------------------------------------------------------------
1 | # TensorFlow GPU
2 | 
3 | This example demonstrates how to run the standard TensorFlow sample (https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py) on an Azure Batch AI cluster of one node.
4 | 
5 | ## Details
6 | 
7 | - For demonstration purposes, the official convolutional.py will be deployed at Azure File Share;
8 | - Standard output of the job will be stored on Azure File Share;
9 | 
10 | 
11 | ## Instructions to Run Recipe
12 | 
13 | ### Jupyter Notebook
14 | 
15 | You can find Jupyter Notebook for this recipe in [TensorFlow-GPU.ipynb](./TensorFlow-GPU.ipynb).
16 | 
17 | ### Azure CLI 2.0
18 | 
19 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
20 | 
21 | ## License Notice
22 | 
23 | Under construction...
24 | 
25 | ## Help or Feedback
26 | --------------------
27 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
28 | 
29 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
30 | 
--------------------------------------------------------------------------------
/recipes/Horovod/Horovod-Infiniband-Benchmark/jobprep_benchmark.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/bash
2 | apt-get update -y
3 | apt-get install -y -q -o Dpkg::Options::="--force-confold" --no-install-recommends cpio libdapl2 libmlx4-1 libsm6 libxext6 wget git
4 | 
5 | # download the benchmark scripts
6 | cd $AZ_BATCHAI_JOB_TEMP
7 | git clone https://github.com/alsrgv/benchmarks
8 | cd benchmarks
9 | git checkout horovod_v2
10 | 
11 | # install intel MPI
12 | cd /tmp
13 | wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz'
14 | tar zxvf l_mpi_2017.3.196.tgz
15 | sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg
16 | sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' /tmp/l_mpi_2017.3.196/silent.cfg
17 | sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg
18 | cd /tmp/l_mpi_2017.3.196
19 | ./install.sh -s silent.cfg
20 | cd ..
21 | rm -rf l_mpi_2017.3.196*
22 | echo "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" >> ~/.bashrc
23 | 
24 | # install horovod
25 | source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh
26 | pip install horovod
--------------------------------------------------------------------------------
/recipes/TensorFlow/Readme.md:
--------------------------------------------------------------------------------
1 | # TensorFlow
2 | 
3 | TensorFlow is a Python-based open source software library for numerical computation using data flow graphs. The graph nodes represent mathematical operations, while the graph edges represent the multidimensional data arrays (tensors) that flow between them. This flexible architecture lets you deploy computation to one or more CPUs or GPUs in a desktop, server, or mobile device without rewriting code. TensorFlow also includes TensorBoard, a data visualization toolkit.
4 | 
5 | See official TensorFlow GitHub page (https://github.com/tensorflow/tensorflow).
6 | 
7 | #### [TensorFlow-GPU-Distributed](./TensorFlow-GPU-Distributed)
8 | This TensorFlow-GPU-Distributed recipe contains information on how to run a distributed TensorFlow job across multiple GPU nodes with BatchAI.
9 | 
10 | #### [TensorFlow-GPU](./TensorFlow-GPU)
11 | This TensorFlow-GPU recipe contains information on how to run a TensorFlow job on a GPU node with BatchAI.
12 | 
13 | 
14 | ## Help or Feedback
15 | --------------------
16 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
17 | 
18 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
19 | 
--------------------------------------------------------------------------------
/recipes/Keras/Keras-DSVM/Readme.md:
--------------------------------------------------------------------------------
1 | # Keras DSVM
2 | 
3 | This recipe shows how to run Keras using Batch AI on DSVM. DSVM supports the tensorflow, cntk, and theano backends for running Keras. Currently only the tensorflow and cntk backends support running on GPU.
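
As a quick illustration of how that backend switch works: Keras reads the `KERAS_BACKEND` environment variable when it is first imported, which is why the recipe's job.json simply prefixes the command line with `KERAS_BACKEND=cntk`. A minimal sketch (illustrative, not one of the recipe's files):

```python
# Minimal sketch: pick the Keras backend before the first keras import.
# Setting the variable any later has no effect, because the backend is
# chosen at import time -- the recipe's job.json achieves the same thing
# by prefixing the command line with KERAS_BACKEND=cntk.
import os
os.environ['KERAS_BACKEND'] = 'cntk'  # or 'tensorflow'

from keras import backend as K
print('Keras is running on:', K.backend())  # -> 'cntk'
```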
4 | 
5 | ## Details
6 | 
7 | - DSVM has the Keras framework preinstalled;
8 | - The standard Keras sample script [mnist_cnn.py](https://raw.githubusercontent.com/fchollet/keras/master/examples/mnist_cnn.py) is used;
9 | - The script downloads the standard MNIST Database on its own;
10 | - Standard output of the job will be stored on Azure File Share.
11 | 
12 | ## Instructions to Run Recipe
13 | 
14 | ### Python Jupyter Notebook
15 | 
16 | You can find Jupyter Notebook for this recipe in [Keras-DSVM.ipynb](./Keras-DSVM.ipynb).
17 | 
18 | ### Azure CLI 2.0
19 | 
20 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
21 | 
22 | ## License Notice
23 | 
24 | Under construction...
25 | 
26 | ## Help or Feedback
27 | --------------------
28 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
29 | 
30 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
31 | 
--------------------------------------------------------------------------------
/recipes/Chainer/README.md:
--------------------------------------------------------------------------------
1 | # Chainer
2 | 
3 | Chainer is a Python-based deep learning framework aiming at flexibility. It provides automatic differentiation APIs based on the define-by-run approach (a.k.a. dynamic computational graphs) as well as object-oriented high-level APIs to build and train neural networks. It also supports CUDA/cuDNN using CuPy for high performance training and inference. For more details of Chainer, see the official documentation and join the community in Forum, Slack, and Twitter.
4 | 
5 | ChainerMN is an additional package for Chainer, a flexible deep learning framework. ChainerMN enables multi-node distributed deep learning.
6 | 
7 | See official GitHub pages for Chainer (https://github.com/chainer/chainer) and ChainerMN (https://github.com/chainer/chainermn)
8 | 
9 | #### [Chainer-GPU-Distributed](./Chainer-GPU-Distributed)
10 | This Chainer-GPU-Distributed recipe contains information on how to run a distributed Chainer training job across multiple GPU nodes with BatchAI.
11 | 
12 | ## Help or Feedback
13 | --------------------
14 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
15 | 
16 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
17 | 
--------------------------------------------------------------------------------
/recipes/Caffe2/Caffe2-GPU-Distributed/Readme.md:
--------------------------------------------------------------------------------
1 | # Caffe2 GPU Distributed
2 | 
3 | This example demonstrates how to run the standard Caffe2 resnet50_trainer.py example using Batch AI. You can run it on a single node or on multiple compute nodes.
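
The distributed shards have no parameter server to find each other through; instead they rendezvous via temp files on a shared NFS mount (see Details below). Conceptually, the coordination step looks like the following standard-library sketch (illustrative only, not Caffe2 API):

```python
# Conceptual illustration of file-based rendezvous on a shared NFS directory:
# every shard drops a marker file, then waits until all shards have checked in.
import os
import time

def rendezvous(nfs_dir, shard_id, num_shards):
    open(os.path.join(nfs_dir, 'shard_%d.ready' % shard_id), 'w').close()
    while len([f for f in os.listdir(nfs_dir) if f.endswith('.ready')]) < num_shards:
        time.sleep(1)  # poll until every shard has registered
```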
4 | 
5 | ## Details
6 | 
7 | - The standard Caffe2 sample script [resnet50_trainer.py](https://github.com/caffe2/caffe2/blob/master/caffe2/python/examples/resnet50_trainer.py) is used;
8 | - The MNIST Dataset has been translated into an lmdb database, and can be obtained at http://download.caffe2.ai/databases/mnist-lmdb.zip;
9 | - NFS will be used for rendezvous temp files to coordinate between each shard/node;
10 | - Standard output of the job will be stored on Azure File Share.
11 | 
12 | 
13 | ## Instructions to Run Recipe
14 | 
15 | ### Python Jupyter Notebook
16 | 
17 | You can find Jupyter Notebook for this sample in [Caffe2-GPU-Distributed.ipynb](./Caffe2-GPU-Distributed.ipynb).
18 | 
19 | ### Azure CLI 2.0
20 | 
21 | Under Construction...
22 | 
23 | ## License Notice
24 | 
25 | Under construction...
26 | 
27 | ## Help or Feedback
28 | --------------------
29 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
30 | 
31 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
32 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-BrainScript/Readme.md:
--------------------------------------------------------------------------------
1 | # BrainScript CNTK GPU
2 | 
3 | This example uses the MNIST dataset to demonstrate how to train a convolutional neural network (CNN) on an Azure Batch AI cluster of one node.
4 | 
5 | ## Details
6 | 
7 | - For demonstration purposes, the MNIST dataset and ConvNet_MNIST.cntk will be deployed at Azure File Share;
8 | - Standard output of the job and the model will be stored on Azure File Share;
9 | - The MNIST dataset (http://yann.lecun.com/exdb/mnist/) has been preprocessed using install_mnist.py, available [here](https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D).
10 | - The ConvNet_MNIST.cntk config file is available [here](./ConvNet_MNIST.cntk).
11 | 
12 | ## Instructions to Run Recipe
13 | 
14 | ### Jupyter Notebook
15 | 
16 | You can find Jupyter Notebook for this recipe in [CNTK-GPU-BrainScript.ipynb](./CNTK-GPU-BrainScript.ipynb).
17 | 
18 | ### Azure CLI 2.0
19 | 
20 | You can find Azure CLI instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
21 | 
22 | ## License Notice
23 | 
24 | Under construction...
25 | 
26 | ## Help or Feedback
27 | --------------------
28 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
29 | 
30 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
31 | 
--------------------------------------------------------------------------------
/recipes/Chainer/Chainer-GPU-Distributed/Readme.md:
--------------------------------------------------------------------------------
1 | # Chainer GPU Distributed
2 | 
3 | This example demonstrates how to run the standard Chainer [train_mnist.py](https://github.com/chainer/chainer/blob/master/examples/mnist/train_mnist.py) example using Batch AI.
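
Distributed execution is what ChainerMN adds on top of plain Chainer. A minimal sketch of the pattern such a training script follows (illustrative; `make_model()` is a placeholder, not a function from the recipe):

```python
# Minimal ChainerMN sketch: one MPI process per GPU, gradients averaged
# across all processes by the multi-node optimizer.
import chainer
import chainermn

comm = chainermn.create_communicator()
device = comm.intra_rank  # local GPU index for this process

model = make_model()  # placeholder for the network definition
optimizer = chainermn.create_multi_node_optimizer(chainer.optimizers.Adam(), comm)
optimizer.setup(model)

# Rank 0 loads MNIST once; scatter_dataset splits it across all workers.
if comm.rank == 0:
    train, test = chainer.datasets.get_mnist()
else:
    train, test = None, None
train = chainermn.scatter_dataset(train, comm)
```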
4 | 
5 | ## Details
6 | 
7 | - The batchaitraining/chainermn:openMPI docker image is used (see job.json above);
8 | - The standard chainer sample script [train_mnist.py](https://github.com/chainer/chainer/blob/master/examples/mnist/train_mnist.py) is used;
9 | - Chainer downloads the standard MNIST Database on its own;
10 | - Standard output of the job and the model will be stored on Azure File Share;
11 | 
12 | ## Instructions to Run Recipe
13 | 
14 | ### Python Jupyter Notebook
15 | 
16 | You can find Jupyter Notebook for this recipe in [Chainer-GPU-Distributed.ipynb](./Chainer-GPU-Distributed.ipynb).
17 | 
18 | ### Azure CLI 2.0
19 | 
20 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
21 | 
22 | ## Dockerfile
23 | 
24 | The `Dockerfile` for the Docker image used in this recipe can be found [here](./docker/dockerfile). The dockerfile is a modified version of the ChainerMN example at https://github.com/chainer/chainermn/pull/71
25 | 
26 | ## License Notice
27 | 
28 | Under construction...
29 | 
30 | ## Help or Feedback
31 | --------------------
32 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
33 | 
34 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
35 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-Python/Readme.md:
--------------------------------------------------------------------------------
1 | # Python CNTK GPU
2 | 
3 | This example uses the MNIST dataset to demonstrate how to train a convolutional neural network (CNN) on an Azure Batch AI cluster of one node.
4 | 
5 | ## Details
6 | 
7 | - For demonstration purposes, the MNIST dataset and ConvNet_MNIST.py will be deployed at Azure File Share;
8 | - Standard output of the job and the model will be stored on Azure File Share;
9 | - The MNIST dataset (http://yann.lecun.com/exdb/mnist/) has been preprocessed using install_mnist.py, available [here](https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D).
10 | - The original CNTK example (https://github.com/Microsoft/CNTK/blob/master/Examples/Image/Classification/ConvNet/Python/ConvNet_MNIST.py) has been modified to accept the CNTK dataset and model locations via command line arguments, and is available [here](./ConvNet_MNIST.py).
11 | 
12 | ## Instructions to Run Recipe
13 | 
14 | ### Jupyter Notebook
15 | 
16 | You can find Jupyter Notebook for this recipe in [CNTK-GPU-Python.ipynb](./CNTK-GPU-Python.ipynb).
17 | 
18 | ### Azure CLI 2.0
19 | 
20 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
21 | 
22 | ## License Notice
23 | 
24 | Under construction...
25 | 
26 | ## Help or Feedback
27 | --------------------
28 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
29 | 
30 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
31 | 
--------------------------------------------------------------------------------
/recipes/TensorFlow/TensorFlow-GPU-Distributed/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 2,
4 |         "tensorFlowSettings": {
5 |             "parameterServerCount": 1,
6 |             "workerCount": 2,
7 |             "pythonScriptFilePath": "$AZ_BATCHAI_INPUT_SCRIPT/mnist_replica.py",
8 |             "masterCommandLineArgs": "--job_name=worker --num_gpus=1 --ps_hosts=$AZ_BATCHAI_PS_HOSTS --worker_hosts=$AZ_BATCHAI_WORKER_HOSTS --task_index=$AZ_BATCHAI_TASK_INDEX --data_dir=$AZ_BATCHAI_INPUT_DATASET --output_dir=$AZ_BATCHAI_OUTPUT_MODEL",
9 |             "workerCommandLineArgs": "--job_name=worker --num_gpus=1 --ps_hosts=$AZ_BATCHAI_PS_HOSTS --worker_hosts=$AZ_BATCHAI_WORKER_HOSTS --task_index=$AZ_BATCHAI_TASK_INDEX --data_dir=$AZ_BATCHAI_INPUT_DATASET --output_dir=$AZ_BATCHAI_OUTPUT_MODEL",
10 |             "parameterServerCommandLineArgs": "--job_name=ps --num_gpus=0 --ps_hosts=$AZ_BATCHAI_PS_HOSTS --worker_hosts=$AZ_BATCHAI_WORKER_HOSTS --task_index=$AZ_BATCHAI_TASK_INDEX --data_dir=$AZ_BATCHAI_INPUT_DATASET --output_dir=$AZ_BATCHAI_OUTPUT_MODEL"
11 |         },
12 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
13 |         "inputDirectories": [{
14 |             "id": "DATASET",
15 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/mnist_dataset"
16 |         }, {
17 |             "id": "SCRIPT",
18 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/tensorflow_samples"
19 |         }],
20 |         "outputDirectories": [{
21 |             "id": "MODEL",
22 |             "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
23 |             "pathSuffix": "Models"
24 |         }],
25 |         "containerSettings": {
26 |             "imageSourceRegistry": {
27 |                 "image": "tensorflow/tensorflow:1.1.0-gpu"
28 |             }
29 |         }
30 |     }
31 | }
32 | 
--------------------------------------------------------------------------------
/recipes/Horovod/Horovod/Readme.md:
--------------------------------------------------------------------------------
1 | # Horovod
2 | 
3 | This recipe shows how to run the [Horovod](https://github.com/uber/horovod) distributed training framework using Batch AI.
4 | 
5 | Currently Batch AI has no native support for the Horovod framework, but it's easy to run it using the Batch AI custom toolkit.
6 | 
7 | 
8 | ## Details
9 | 
10 | - The standard Horovod [tensorflow_mnist.py](https://github.com/uber/horovod/blob/v0.9.10/examples/tensorflow_mnist.py) example will be used (a sketch of its Horovod setup follows this list);
11 | - tensorflow_mnist.py downloads training data on its own during execution;
12 | - The job will be run on the standard tensorflow container ```tensorflow/tensorflow:1.1.0-gpu```. You can run the same job directly on GPU nodes by choosing Ubuntu DSVM as an image and removing
13 | container settings from the job definition.
14 | - The Horovod framework will be installed in the container using the job preparation command line. Note that you can build your own docker image containing tensorflow and horovod instead.
15 | - Standard output of the job will be stored on Azure File Share.
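
For reference, the changes Horovod requires in a TensorFlow script are small. A minimal sketch of the pattern tensorflow_mnist.py follows (assumed from the standard Horovod examples, not a verbatim excerpt):

```python
# Sketch of the standard Horovod/TensorFlow integration pattern.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()  # one process per GPU, launched by mpirun

# Pin each process to a single GPU based on its local rank.
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())

# Scale the learning rate by the number of workers and wrap the optimizer
# so gradients are averaged across all processes with MPI allreduce.
opt = tf.train.AdamOptimizer(0.001 * hvd.size())
opt = hvd.DistributedOptimizer(opt)

# Broadcast initial variable states from rank 0 to all other processes.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]
```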
16 | 
17 | ## Instructions to Run Recipe
18 | 
19 | ### Python Jupyter Notebook
20 | 
21 | You can find Jupyter Notebook for this recipe in [Horovod.ipynb](./Horovod.ipynb).
22 | 
23 | ### Azure CLI 2.0
24 | 
25 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
26 | 
27 | ## License Notice
28 | 
29 | Under construction...
30 | 
31 | ## Help or Feedback
32 | --------------------
33 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
34 | 
35 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
36 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-Python-Distrbuted-Infiniband/jobprep_cntk_distributed_ib.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/bash
2 | 
3 | # Download the CIFAR-10 dataset from Azure Blob
4 | if [ ! -z $AZ_BATCHAI_JOB_TEMP ];then
5 |     cd $AZ_BATCHAI_JOB_TEMP
6 |     wget 'https://batchaisamples.blob.core.windows.net/samples/CIFAR-10_dataset.tar?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=b&sig=nFXsAp0Eq%2BoS5%2BKAEPnfyEGlCkBcKIadDvCPA%2BcX6lU%3D' -k -O 'CIFAR-10_dataset.tar'
7 |     echo "untar CIFAR-10 dataset........."
8 |     tar -xf CIFAR-10_dataset.tar
9 |     echo "done"
10 |     ROOT_DIR=`pwd`
11 |     files=( "train_map.txt" "test_map.txt" )
12 |     for file in "${files[@]}"
13 |     do
14 |         output=$ROOT_DIR"/"$file
15 |         if [ -f $output ];then
16 |             rm $output
17 |         fi
18 |         touch $output
19 |         while read -r line
20 |         do
21 |             name="$line"
22 |             echo "$ROOT_DIR$name" >> $output
23 |         done < $file".template"
24 |     done
25 | fi
26 | 
27 | # install intel MPI if Infiniband is used
28 | if [ -d /dev/infiniband ];then
29 |     cd /tmp
30 |     wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz'
31 |     tar zxvf l_mpi_2017.3.196.tgz
32 |     sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg
33 |     sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' /tmp/l_mpi_2017.3.196/silent.cfg
34 |     sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg
35 |     cd /tmp/l_mpi_2017.3.196
36 |     ./install.sh -s silent.cfg
37 |     cd ..
38 |     rm -rf l_mpi_2017.3.196*
39 |     echo "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" >> ~/.bashrc
40 | fi
41 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-Python-Distributed/Readme.md:
--------------------------------------------------------------------------------
1 | # Python CNTK GPU Distributed
2 | 
3 | This example uses the CIFAR-10 dataset to demonstrate how to train a convolutional neural network (CNN) on a multi-node multi-GPU cluster. You can run this recipe on a single node or on multiple nodes.
4 | 
5 | ## Details
6 | 
7 | - For demonstration purposes, the CIFAR-10 data preparation script and ConvNet_CIFAR10_DataAug_Distributed.py with its dependencies will be deployed at Azure File Share;
8 | - Standard output of the job and the model will be stored on Azure File Share;
9 | - The CIFAR-10 dataset (http://www.cs.toronto.edu/~kriz/cifar.html) has been preprocessed and is available [here](https://batchaisamples.blob.core.windows.net/samples/CIFAR-10_dataset.tar?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=b&sig=nFXsAp0Eq%2BoS5%2BKAEPnfyEGlCkBcKIadDvCPA%2BcX6lU%3D).
10 | - The official CNTK example ConvNet_CIFAR10_DataAug_Distributed.py (https://github.com/Microsoft/CNTK/blob/master/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py) is used.
11 | 
12 | 
13 | ## Instructions to Run Recipe
14 | 
15 | ### Python Jupyter Notebook
16 | 
17 | You can find Jupyter Notebook for this recipe in [CNTK-GPU-Python-Distrbuted.ipynb](./CNTK-GPU-Python-Distrbuted.ipynb).
18 | 
19 | ### Azure CLI 2.0
20 | 
21 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
22 | 
23 | ## License Notice
24 | 
25 | Under construction...
26 | 
27 | ## Help or Feedback
28 | --------------------
29 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
30 | 
31 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
32 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-BrainScript-Distributed/Readme.md:
--------------------------------------------------------------------------------
1 | # BrainScript CNTK GPU Distributed
2 | 
3 | This example uses the MNIST dataset to demonstrate how to train a convolutional neural network (CNN) on a GPU cluster. You can run this recipe on a single node or on multiple nodes.
4 | 
5 | ## Details
6 | 
7 | - For demonstration purposes, the MNIST dataset and ConvNet_MNIST.cntk will be deployed at Azure File Share;
8 | - Standard output of the job and the model will be stored on Azure File Share;
9 | - The MNIST dataset (http://yann.lecun.com/exdb/mnist/) has been preprocessed using install_mnist.py, available [here](https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D).
10 | - The ConvNet_MNIST.cntk config file has been modified from the official cntk sample (https://raw.githubusercontent.com/Microsoft/CNTK/master/Examples/Image/Classification/ConvNet/BrainScript/ConvNet_MNIST.cntk) for distributed training, and is available [here](./ConvNet_MNIST.cntk).
11 | 
12 | ## Instructions to Run Recipe
13 | 
14 | ### Jupyter Notebook
15 | 
16 | You can find Jupyter Notebook for this recipe in [CNTK-GPU-BrainScript-Distributed.ipynb](./CNTK-GPU-BrainScript-Distributed.ipynb).
17 | 
18 | ### Azure CLI 2.0
19 | 
20 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
21 | 
22 | ## License Notice
23 | 
24 | Under construction...
25 | 
26 | ## Help or Feedback
27 | --------------------
28 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
29 | 
30 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
31 | 
--------------------------------------------------------------------------------
/recipes/Caffe/Caffe-GPU/Readme.md:
--------------------------------------------------------------------------------
1 | # Caffe GPU
2 | 
3 | This example demonstrates how to run the standard Caffe lenet_solver.prototxt example using Batch AI. This recipe runs on a single GPU node.
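
The one subtlety of this recipe is that prototxt files cannot read environment variables, so a job preparation script expands them into the templates up front (see Details below). The sketch shows the idea in Python with `os.path.expandvars`; the mount path is a made-up example value, and the recipe itself does the same substitution with `sed`:

```python
# Illustrative only: what preparation_script.sh achieves with sed.
import os

# Hypothetical value; Batch AI sets this variable on the compute node.
os.environ['AZ_BATCHAI_INPUT_SAMPLE'] = '/mnt/batchai/inputs/sample'

template = 'net: "$AZ_BATCHAI_INPUT_SAMPLE/lenet_train_test.prototxt"'
print(os.path.expandvars(template))
# -> net: "/mnt/batchai/inputs/sample/lenet_train_test.prototxt"
```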
4 | 
5 | ## Details
6 | 
7 | - For demonstration purposes, the MNIST dataset and caffe configuration file will be deployed at Azure File Share;
8 | - Standard output of the job and the model will be stored on Azure File Share;
9 | - The MNIST dataset has been preprocessed according to http://caffe.berkeleyvision.org/gathered/examples/mnist.html and is available [here](https://batchaisamples.blob.core.windows.net/samples/mnist_lmdb.zip?st=2017-10-06T00%3A15%3A00Z&se=2100-01-01T00%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=b&sig=jKlQA8x190lLGDXloeHrSe6jpOtUEYLD1DRoyWuiAdQ%3D).
10 | - The original Caffe solver and net prototxt files have been modified to take the environment variables AZ_BATCHAI_INPUT_SAMPLE and AZ_BATCHAI_OUTPUT_MODEL, and are available here: lenet_solver.prototxt and lenet_train_test.prototxt.
11 | - Since prototxt files support neither command-line overloading nor environment variables, we use the job preparation task preparation_script.sh to expand the environment variables specified in the files, providing more flexibility in the job setup.
12 | 
13 | 
14 | ## Instructions to Run Recipe
15 | 
16 | ### Python Jupyter Notebook
17 | 
18 | You can find Jupyter Notebook for this sample in [Caffe-GPU.ipynb](./Caffe-GPU.ipynb).
19 | 
20 | ### Azure CLI 2.0
21 | 
22 | Under Construction...
23 | 
24 | ## License Notice
25 | 
26 | Under construction...
27 | 
28 | ## Help or Feedback
29 | --------------------
30 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
31 | 
32 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
33 | 
--------------------------------------------------------------------------------
/recipes/TensorFlow/TensorFlow-GPU-Distributed/Readme.md:
--------------------------------------------------------------------------------
1 | # TensorFlow Distributed GPU
2 | 
3 | ## Introduction
4 | 
5 | This example demonstrates how to run the standard TensorFlow sample (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dist_test/python/mnist_replica.py) on an Azure Batch AI cluster of 2 nodes.
6 | 
7 | ## Details
8 | 
9 | - For demonstration purposes, the MNIST dataset and `mnist_replica.py` will be deployed at Azure File Share;
10 | - Standard output of the job will be stored on Azure File Share;
11 | - The MNIST dataset (http://yann.lecun.com/exdb/mnist/) is archived and uploaded into the [blob](https://batchaisamples.blob.core.windows.net/samples/mnist_dataset_original.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=b&sig=Qc1RA3zsXIP4oeioXutkL1PXIrHJO0pHJlppS2rID3I%3D).
12 | - The recipe uses the official `mnist_replica.py` (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dist_test/python/mnist_replica.py); a sketch of its cluster setup follows this list.
13 | - Please refer to the [official tutorial](https://www.tensorflow.org/deploy/distributed) on distributed tensorflow training.
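
Batch AI generates `$AZ_BATCHAI_PS_HOSTS`, `$AZ_BATCHAI_WORKER_HOSTS`, and `$AZ_BATCHAI_TASK_INDEX` for each task, and the job.json above passes them to the script as `--ps_hosts`, `--worker_hosts`, and `--task_index`. A minimal sketch of how mnist_replica.py consumes them (standard TF 1.x between-graph replication; the host addresses are made-up example values):

```python
# Sketch of the standard distributed TensorFlow setup in TF 1.x.
import tensorflow as tf

ps_hosts = '10.0.0.4:2222'.split(',')                     # from --ps_hosts
worker_hosts = '10.0.0.5:2222,10.0.0.6:2222'.split(',')   # from --worker_hosts

cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
server = tf.train.Server(cluster, job_name='worker', task_index=0)

# Parameter-server tasks call server.join(); worker tasks build the model
# under tf.train.replica_device_setter(cluster=cluster) and run training.
```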
14 | 
15 | ## Instructions to Run Recipe
16 | 
17 | ### Python Jupyter Notebook
18 | 
19 | You can find Jupyter Notebook for this recipe in [TensorFlow-GPU-Distributed.ipynb](./TensorFlow-GPU-Distributed.ipynb).
20 | 
21 | ### Azure CLI 2.0
22 | 
23 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
24 | 
25 | ## License Notice
26 | 
27 | Under construction...
28 | 
29 | ## Help or Feedback
30 | --------------------
31 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
32 | 
33 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
34 | 
--------------------------------------------------------------------------------
/recipes/Horovod/Horovod-Infiniband-Benchmark/Readme.md:
--------------------------------------------------------------------------------
1 | # Horovod-Infiniband-Benchmark
2 | 
3 | This recipe shows how to reproduce [Horovod distributed training benchmarks](https://github.com/uber/horovod/blob/master/docs/benchmarks.md) using Azure Batch AI.
4 | 
5 | Currently Batch AI has no native support for the Horovod framework, but it's easy to run it using the Batch AI custom toolkit.
6 | 
7 | 
8 | ## Details
9 | 
10 | - The official Horovod Benchmark [scripts](https://github.com/alsrgv/benchmarks/tree/master/scripts/tf_cnn_benchmarks) will be used;
11 | - The job will be run on the standard tensorflow container ```tensorflow/tensorflow:1.4.0-gpu```;
12 | - The Horovod framework and Intel MPI will be installed in the container using the job preparation command line. Note that you can build your own docker image containing tensorflow and horovod instead.
13 | - Benchmark scripts will be downloaded to the GPU nodes by the job preparation command line as well, and stored in `$AZ_BATCHAI_JOB_TEMP` on each node;
14 | - This sample needs to use at least two `STANDARD_NC24r` nodes; please be sure you have enough quota;
15 | - Standard output of the job will be stored on Azure File Share.
16 | - This recipe ONLY reproduces the training results with synthetic data on NVIDIA K80 GPUs.
17 | 
18 | 
19 | ## Instructions to Run Recipe
20 | 
21 | ### Python Jupyter Notebook
22 | 
23 | You can find Jupyter Notebook for this recipe in [Horovod-Infiniband-Benchmark.ipynb](./Horovod-Infiniband-benchmark.ipynb).
24 | 
25 | ### Azure CLI 2.0
26 | 
27 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
28 | 
29 | ## License Notice
30 | 
31 | Under construction...
32 | 
33 | ## Help or Feedback
34 | --------------------
35 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
36 | 
37 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
38 | 
--------------------------------------------------------------------------------
/recipes/CNTK/Readme.md:
--------------------------------------------------------------------------------
1 | # CNTK
2 | 
3 | The Microsoft Cognitive Toolkit (https://cntk.ai) is a unified deep-learning toolkit that describes neural networks as a series of computational steps via a directed graph. CNTK can be included as a library in your Python, C#, or C++ programs, or used as a standalone machine learning tool through its own model description language (BrainScript).
4 | 
5 | See official CNTK GitHub page (https://github.com/Microsoft/CNTK/).
6 | 
7 | #### [CNTK-GPU-Python](./CNTK-GPU-Python)
8 | This CNTK-GPU-Python recipe contains information on how to run a Python CNTK learning job on a GPU VM.
9 | 
10 | #### [CNTK-GPU-Python-Distributed](./CNTK-GPU-Python-Distributed)
11 | This CNTK-GPU-Python-Distributed recipe contains information on how to run a Python CNTK learning job on GPU VMs, including execution across multiple compute nodes and multiple GPUs.
12 | 
13 | #### [CNTK-GPU-Python-Distrbuted-Infiniband](./CNTK-GPU-Python-Distrbuted-Infiniband)
14 | This CNTK-GPU-Python-Distrbuted-Infiniband recipe contains information on how to run a Python CNTK learning job on GPU VMs, including execution across multiple compute nodes and multiple GPUs connected by Infiniband networks.
15 | 
16 | #### [CNTK-GPU-BrainScript](./CNTK-GPU-BrainScript)
17 | This CNTK-GPU-BrainScript recipe contains information on how to run a CNTK learning job on a GPU VM with a BrainScript configuration file.
18 | 
19 | #### [CNTK-GPU-BrainScript-Distributed](./CNTK-GPU-BrainScript-Distributed)
20 | This CNTK-GPU-BrainScript-Distributed recipe contains information on how to run a BrainScript CNTK learning job on GPU VMs, including execution across multiple compute nodes and multiple GPUs.
21 | 
22 | ## Help or Feedback
23 | --------------------
24 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
25 | 
26 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
27 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-Python-Distrbuted-Infiniband/Readme.md:
--------------------------------------------------------------------------------
1 | # Distributed CNTK with GPU and Infiniband
2 | 
3 | This example uses the CIFAR-10 dataset to demonstrate how to train a Residual network (ResNet) on a multi-node multi-GPU cluster with Infiniband.
4 | 
5 | ## Details
6 | 
7 | - The official CNTK ResNet for CIFAR10 [example](https://github.com/Microsoft/CNTK/tree/master/Examples/Image/Classification/ResNet/Python) is used.
8 | - The CIFAR-10 dataset (http://www.cs.toronto.edu/~kriz/cifar.html) has been preprocessed and is available in [Azure storage](https://batchaisamples.blob.core.windows.net/samples/CIFAR-10_dataset.tar?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=b&sig=nFXsAp0Eq%2BoS5%2BKAEPnfyEGlCkBcKIadDvCPA%2BcX6lU%3D); it will be downloaded to the GPU node's local SSD.
9 | - The job will be run on a prebuilt CNTK container ```batchaitraining/cntk:2.3-gpu-1bitsgd-py36-cuda8-cudnn6-intelmpi``` based on this [dockerfile](./dockerfile). The Intel MPI package will be installed in the container using the job preparation command line.
10 | - For demonstration purposes, the CIFAR-10 data preparation script and CNTK job scripts will be deployed at Azure File Share.
11 | - Standard output of the job and the model will be stored on Azure File Share.
12 | - This sample needs to use at least two STANDARD_NC24r nodes; please be sure you have enough quota.
13 | - If you would like to conduct a performance comparison with a TCP network, you can create the cluster with VM size `STANDARD_NC24`, which does not support Infiniband.
14 | 
15 | ## Instructions to Run Recipe
16 | 
17 | ### Python Jupyter Notebook
18 | 
19 | You can find Jupyter Notebook for this recipe in [CNTK-GPU-Python-Distrbuted-Infiniband.ipynb](./CNTK-GPU-Python-Distrbuted-Infiniband.ipynb).
20 | 
21 | ### Azure CLI 2.0
22 | 
23 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
24 | 
25 | ## License Notice
26 | 
27 | Under construction...
28 | 
29 | ## Help or Feedback
30 | --------------------
31 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
32 | 
33 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
34 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Azure Batch AI
2 | 
3 | Welcome to our documentation page at https://docs.microsoft.com/azure/batch-ai
4 | 
5 | ## Updates
6 | 
7 | 11.15.2017 Java SDK is [available](https://mvnrepository.com/artifact/com.microsoft.azure/azure-mgmt-batchai)
8 | 
9 | 11.08.2017 Node.js SDK is [available](https://www.npmjs.com/package/azure-arm-batchai)
10 | 
11 | 10.11.2017 C# nuget package Microsoft.Azure.Management.BatchAI is available on nuget.org.
12 | 
13 | 10.09.2017 Azure BatchAI starts public preview on October 9th, 2017!
14 | 
15 | ## Batch AI Recipes
16 | 
17 | We have created [recipes](/recipes/) for popular AI frameworks to help you get started with Batch AI and submit jobs without being an expert on Azure compute, storage, and networking.
18 | 
19 | [Microsoft Cognitive Toolkit](/recipes/CNTK/)
20 | 
21 | [TensorFlow](/recipes/TensorFlow/)
22 | 
23 | [Chainer/ChainerMN](/recipes/Chainer/)
24 | 
25 | [Caffe](/recipes/Caffe/)
26 | 
27 | [Caffe2](/recipes/Caffe2/)
28 | 
29 | [Horovod](/recipes/Horovod)
30 | 
31 | [Custom Toolkit -- a Keras example](/recipes/Keras/)
32 | 
33 | ## Contributing
34 | 
35 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
36 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
37 | the rights to use your contribution. For details, visit https://cla.microsoft.com.
38 | 
39 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
40 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
41 | provided by the bot. You will only need to do this once across all repos using our CLA.
42 | 
43 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
44 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
45 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
46 | 
47 | ## Help or Feedback
48 | --------------------
49 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
50 | 
51 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
52 | 
--------------------------------------------------------------------------------
/recipes/Keras/Keras-DSVM/mnist_cnn.py:
--------------------------------------------------------------------------------
1 | '''Trains a simple convnet on the MNIST dataset.
2 | 3 | Gets to 99.25% test accuracy after 12 epochs 4 | (there is still a lot of margin for parameter tuning). 5 | 16 seconds per epoch on a GRID K520 GPU. 6 | ''' 7 | 8 | from __future__ import print_function 9 | import keras 10 | from keras.datasets import mnist 11 | from keras.models import Sequential 12 | from keras.layers import Dense, Dropout, Flatten 13 | from keras.layers import Conv2D, MaxPooling2D 14 | from keras import backend as K 15 | 16 | batch_size = 128 17 | num_classes = 10 18 | epochs = 12 19 | 20 | # input image dimensions 21 | img_rows, img_cols = 28, 28 22 | 23 | # the data, shuffled and split between train and test sets 24 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 25 | 26 | if K.image_data_format() == 'channels_first': 27 | x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) 28 | x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) 29 | input_shape = (1, img_rows, img_cols) 30 | else: 31 | x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) 32 | x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) 33 | input_shape = (img_rows, img_cols, 1) 34 | 35 | x_train = x_train.astype('float32') 36 | x_test = x_test.astype('float32') 37 | x_train /= 255 38 | x_test /= 255 39 | print('x_train shape:', x_train.shape) 40 | print(x_train.shape[0], 'train samples') 41 | print(x_test.shape[0], 'test samples') 42 | 43 | # convert class vectors to binary class matrices 44 | y_train = keras.utils.to_categorical(y_train, num_classes) 45 | y_test = keras.utils.to_categorical(y_test, num_classes) 46 | 47 | model = Sequential() 48 | model.add(Conv2D(32, kernel_size=(3, 3), 49 | activation='relu', 50 | input_shape=input_shape)) 51 | model.add(Conv2D(64, (3, 3), activation='relu')) 52 | model.add(MaxPooling2D(pool_size=(2, 2))) 53 | model.add(Dropout(0.25)) 54 | model.add(Flatten()) 55 | model.add(Dense(128, activation='relu')) 56 | model.add(Dropout(0.5)) 57 | model.add(Dense(num_classes, activation='softmax')) 58 | 59 | model.compile(loss=keras.losses.categorical_crossentropy, 60 | optimizer=keras.optimizers.Adadelta(), 61 | metrics=['accuracy']) 62 | 63 | model.fit(x_train, y_train, 64 | batch_size=batch_size, 65 | epochs=epochs, 66 | verbose=1, 67 | validation_data=(x_test, y_test)) 68 | score = model.evaluate(x_test, y_test, verbose=0) 69 | print('Test loss:', score[0]) 70 | print('Test accuracy:', score[1]) 71 | -------------------------------------------------------------------------------- /recipes/Chainer/Chainer-GPU-Distributed/docker/dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04 2 | RUN apt-get update 3 | 4 | 5 | # disable interactive functions 6 | ENV DEBIAN_FRONTEND noninteractive 7 | 8 | 9 | #################Install MiniConda and other dependencies########## 10 | ENV CONDA_DIR /opt/conda 11 | ENV PATH $CONDA_DIR/bin:$PATH 12 | ENV OPENBLAS_NUM_THREADS $(nproc) 13 | 14 | RUN mkdir -p $CONDA_DIR && \ 15 | echo export PATH=$CONDA_DIR/bin:'$PATH' > /etc/profile.d/conda.sh && \ 16 | 17 | apt-get update -y && \ 18 | apt-get install -y \ 19 | 20 | wget \ 21 | vim \ 22 | git \ 23 | g++ \ 24 | graphviz \ 25 | 26 | software-properties-common \ 27 | python-software-properties \ 28 | python3-dev \ 29 | 30 | libhdf5-dev \ 31 | libopenblas-dev \ 32 | liblapack-dev \ 33 | libblas-dev \ 34 | gfortran && \ 35 | 36 | rm -rf /var/lib/apt/lists/* && \ 37 | 38 | 39 | wget --quiet 
https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 40 | /bin/bash /Miniconda3-latest-Linux-x86_64.sh -f -b -p $CONDA_DIR && \ 41 | rm Miniconda3-latest-Linux-x86_64.sh 42 | 43 | 44 | #########################MPI########################### 45 | RUN cd /tmp && \ 46 | wget "https://www.open-mpi.org/software/ompi/v2.1/downloads/openmpi-2.1.1.tar.gz" && \ 47 | tar xzf openmpi-2.1.1.tar.gz && \ 48 | cd openmpi-2.1.1 && \ 49 | ./configure --with-cuda && make -j"$(nproc)" install # && ldconfig 50 | 51 | 52 | 53 | #######################NCCL########################### 54 | ENV CPATH /usr/local/cuda/include:/usr/local/include:$CPATH 55 | RUN cd /usr/local && git clone https://github.com/NVIDIA/nccl.git && cd nccl && \ 56 | 57 | ######### Compile for devices with cuda compute compatibility 3 (e.g. GRID K520 on aws) 58 | # UNCOMMENT line below to compile for GPUs with cuda compute compatibility 3.0 59 | # sed -i '/NVCC_GENCODE ?=/a \ -gencode=arch=compute_30,code=sm_30 \\' Makefile && \ 60 | ########## 61 | 62 | make CUDA_HOME=/usr/local/cuda -j"$(nproc)" && \ 63 | make install && ldconfig 64 | 65 | 66 | ####################Python 3######################### 67 | ARG python_version=3.5.2 68 | RUN conda install -y python=${python_version} && \ 69 | pip install -U pip && \ 70 | 71 | conda install Pillow scikit-learn notebook pandas matplotlib mkl nose pyyaml six h5py && \ 72 | 73 | 74 | pip install mpi4py && \ 75 | pip install cython && \ 76 | 77 | pip install chainer && \ 78 | pip install chainercv && \ 79 | pip install chainermn && \ 80 | 81 | conda clean -yt 82 | 83 | ENV PYTHONPATH $CONDA_DIR/lib/python3.5/site-packages/:$PYTHONPATH 84 | 85 | ###################################################### 86 | 87 | ENV PYTHONPATH /src/:$PYTHONPATH -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-BrainScript/ConvNet_MNIST.cntk: -------------------------------------------------------------------------------- 1 | # ConvNet on MNIST dataset. 2 | 3 | command = trainNetwork:testNetwork 4 | 5 | precision = "float"; traceLevel = 1 ; deviceId = "auto" 6 | 7 | rootDir = "../../.." 
; dataDir = "$rootDir$/DataSets/MNIST" ; 8 | outputDir = "./Output" ; 9 | 10 | modelPath = "$outputDir$/Models/ConvNet_MNIST" 11 | #stderr = "$outputDir$/ConvNet_MNIST_bs_out" 12 | 13 | # TRAINING CONFIG 14 | trainNetwork = { 15 | action = "train" 16 | 17 | BrainScriptNetworkBuilder = { 18 | imageShape = 28:28:1 # image dimensions, 1 channel only 19 | labelDim = 10 # number of distinct labels 20 | featScale = 1/256 21 | Scale{f} = x => Constant(f) .* x 22 | 23 | model = Sequential ( 24 | Scale {featScale} : 25 | ConvolutionalLayer {32, (5:5), pad = true} : ReLU : 26 | MaxPoolingLayer {(3:3), stride=(2:2)} : 27 | ConvolutionalLayer {48, (3:3), pad = false} : ReLU : 28 | MaxPoolingLayer {(3:3), stride=(2:2)} : 29 | ConvolutionalLayer {64, (3:3), pad = false} : ReLU : 30 | DenseLayer {96} : Dropout : ReLU : 31 | LinearLayer {labelDim} 32 | ) 33 | 34 | # inputs 35 | features = Input {imageShape} 36 | labels = Input {labelDim} 37 | 38 | # apply model to features 39 | ol = model (features) 40 | 41 | # loss and error computation 42 | ce = CrossEntropyWithSoftmax (labels, ol) 43 | errs = ClassificationError (labels, ol) 44 | 45 | # declare special nodes 46 | featureNodes = (features) 47 | labelNodes = (labels) 48 | criterionNodes = (ce) 49 | evaluationNodes = (errs) 50 | outputNodes = (ol) 51 | } 52 | 53 | SGD = { 54 | epochSize = 60000 55 | minibatchSize = 64 56 | maxEpochs = 40 57 | learningRatesPerSample = 0.001*10:0.0005*10:0.0001 58 | dropoutRate = 0.5 59 | momentumAsTimeConstant = 0*5:1024 60 | 61 | numMBsToShowResult = 500 62 | } 63 | 64 | reader = { 65 | readerType = "CNTKTextFormatReader" 66 | # See ../README.md for details on getting the data (Train-28x28_cntk_text.txt). 67 | file = "$DataDir$/Train-28x28_cntk_text.txt" 68 | randomize = true 69 | keepDataInMemory = true 70 | input = { 71 | features = { dim = 784 ; format = "dense" } 72 | labels = { dim = 10 ; format = "dense" } 73 | } 74 | } 75 | } 76 | 77 | # TEST CONFIG 78 | testNetwork = { 79 | action = test 80 | minibatchSize = 1024 # reduce this if you run out of memory 81 | 82 | reader = { 83 | readerType = "CNTKTextFormatReader" 84 | file = "$DataDir$/Test-28x28_cntk_text.txt" 85 | input = { 86 | features = { dim = 784 ; format = "dense" } 87 | labels = { dim = 10 ; format = "dense" } 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /recipes/Caffe/Caffe-GPU/lenet_train_test.prototxt: -------------------------------------------------------------------------------- 1 | name: "LeNet" 2 | layer { 3 | name: "mnist" 4 | type: "Data" 5 | top: "data" 6 | top: "label" 7 | include { 8 | phase: TRAIN 9 | } 10 | transform_param { 11 | scale: 0.00390625 12 | } 13 | data_param { 14 | source: "$AZ_BATCHAI_INPUT_SAMPLE/mnist_train_lmdb" 15 | batch_size: 64 16 | backend: LMDB 17 | } 18 | } 19 | layer { 20 | name: "mnist" 21 | type: "Data" 22 | top: "data" 23 | top: "label" 24 | include { 25 | phase: TEST 26 | } 27 | transform_param { 28 | scale: 0.00390625 29 | } 30 | data_param { 31 | source: "$AZ_BATCHAI_INPUT_SAMPLE/mnist_test_lmdb" 32 | batch_size: 100 33 | backend: LMDB 34 | } 35 | } 36 | layer { 37 | name: "conv1" 38 | type: "Convolution" 39 | bottom: "data" 40 | top: "conv1" 41 | param { 42 | lr_mult: 1 43 | } 44 | param { 45 | lr_mult: 2 46 | } 47 | convolution_param { 48 | num_output: 20 49 | kernel_size: 5 50 | stride: 1 51 | weight_filler { 52 | type: "xavier" 53 | } 54 | bias_filler { 55 | type: "constant" 56 | } 57 | } 58 | } 59 | layer { 60 | name: "pool1" 61 | type: "Pooling" 
62 | bottom: "conv1" 63 | top: "pool1" 64 | pooling_param { 65 | pool: MAX 66 | kernel_size: 2 67 | stride: 2 68 | } 69 | } 70 | layer { 71 | name: "conv2" 72 | type: "Convolution" 73 | bottom: "pool1" 74 | top: "conv2" 75 | param { 76 | lr_mult: 1 77 | } 78 | param { 79 | lr_mult: 2 80 | } 81 | convolution_param { 82 | num_output: 50 83 | kernel_size: 5 84 | stride: 1 85 | weight_filler { 86 | type: "xavier" 87 | } 88 | bias_filler { 89 | type: "constant" 90 | } 91 | } 92 | } 93 | layer { 94 | name: "pool2" 95 | type: "Pooling" 96 | bottom: "conv2" 97 | top: "pool2" 98 | pooling_param { 99 | pool: MAX 100 | kernel_size: 2 101 | stride: 2 102 | } 103 | } 104 | layer { 105 | name: "ip1" 106 | type: "InnerProduct" 107 | bottom: "pool2" 108 | top: "ip1" 109 | param { 110 | lr_mult: 1 111 | } 112 | param { 113 | lr_mult: 2 114 | } 115 | inner_product_param { 116 | num_output: 500 117 | weight_filler { 118 | type: "xavier" 119 | } 120 | bias_filler { 121 | type: "constant" 122 | } 123 | } 124 | } 125 | layer { 126 | name: "relu1" 127 | type: "ReLU" 128 | bottom: "ip1" 129 | top: "ip1" 130 | } 131 | layer { 132 | name: "ip2" 133 | type: "InnerProduct" 134 | bottom: "ip1" 135 | top: "ip2" 136 | param { 137 | lr_mult: 1 138 | } 139 | param { 140 | lr_mult: 2 141 | } 142 | inner_product_param { 143 | num_output: 10 144 | weight_filler { 145 | type: "xavier" 146 | } 147 | bias_filler { 148 | type: "constant" 149 | } 150 | } 151 | } 152 | layer { 153 | name: "accuracy" 154 | type: "Accuracy" 155 | bottom: "ip2" 156 | bottom: "label" 157 | top: "accuracy" 158 | include { 159 | phase: TEST 160 | } 161 | } 162 | layer { 163 | name: "loss" 164 | type: "SoftmaxWithLoss" 165 | bottom: "ip2" 166 | bottom: "label" 167 | top: "loss" 168 | } 169 | -------------------------------------------------------------------------------- /recipes/Horovod/Horovod-Infiniband-Benchmark/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/recipes/Readme.md) to install Azure CLI 2.0, configure the default location, and create and configure the default resource group and storage account. 2 | 3 | 4 | ### Script Deployment 5 | 6 | - Create an Azure File Share with `horovod_samples` folder: 7 | ```sh 8 | az storage share create --name batchaisample 9 | az storage directory create --share-name batchaisample --name horovod_samples 10 | ``` 11 | Upload the job preparation script, which does the following tasks: 12 | - Install essential packages for InfiniBand support 13 | - Download benchmark scripts from https://github.com/alsrgv/benchmarks 14 | - Install the Intel MPI binary 15 | - Install the Horovod framework 16 | ```sh 17 | az storage file upload --share-name batchaisample --source jobprep_benchmark.sh --path horovod_samples 18 | ``` 19 | 20 | ### Cluster 21 | 22 | By default, for this recipe we will use a GPU cluster with two nodes (`min node = max node = 2`) of `Standard_NC24r` size (four GPUs with InfiniBand) 23 | with the latest Ubuntu 16.04-LTS image. 24 | 25 | Azure File share `batchaisample` is mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`.
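Since each `Standard_NC24r` node consumes 24 cores, it is worth confirming that your subscription has enough regional vCPU quota before creating the cluster. A quick check (a sketch using the generic `az vm list-usage` command; replace `eastus` with your configured default location):

```sh
# Show current core usage and limits for the region
az vm list-usage -l eastus -o table
```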
26 | 27 | #### Cluster Creation Command 28 | 29 | For GNU/Linux users: 30 | 31 | ```sh 32 | az batchai cluster create -n nc24r -s Standard_NC24r --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 33 | ``` 34 | 35 | For Windows users: 36 | 37 | ```sh 38 | az batchai cluster create -n nc24r -s Standard_NC24r --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u -p 39 | ``` 40 | 41 | ### Job 42 | 43 | The job creation parameters are in [job.json](./job.json): 44 | 45 | - An input directory with ID `SCRIPTS` to allow the job to find the job preparation script via environment variable `$AZ_BATCHAI_INPUT_SCRIPTS`; 46 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 47 | - nodeCount defines how many nodes will be used for the job execution; 48 | - The ```tensorflow/tensorflow:1.4.0-gpu``` standard tensorflow container will be used; 49 | - The ```Horovod``` framework, Intel MPI and the Horovod benchmark scripts will be downloaded/installed by the job preparation script; 50 | You can build and publish your own docker image containing tensorflow and Horovod instead; 51 | - The benchmark script (```tf_cnn_benchmarks.py```) will be executed with the custom toolkit; 52 | - If you are interested in using TCP instead, please replace ```-env I_MPI_FABRICS=dapl -env I_MPI_DAPL_PROVIDER=ofa-v2-ib0 -env I_MPI_DYNAMIC_CONNECTION=0``` with ```-env I_MPI_FABRICS=tcp``` in the command line. 53 | 54 | #### Job Creation Command 55 | 56 | ```sh 57 | az batchai job create -n horovod_benchmark --cluster-name nc24r -c job.json 58 | ``` 59 | 60 | Note, the job will start running when the cluster has finished allocation and initialization of the nodes. 61 | 62 | ### Get Help 63 | 64 | The Azure CLI has built-in help documentation, which you can run from the command line: 65 | 66 | ```sh 67 | az [command-group [command]] -h 68 | ``` 69 | 70 | For example, to get information about all Azure Batch AI categories, use: 71 | 72 | ```sh 73 | az batchai -h 74 | ``` 75 | 76 | To get help with the command to create a cluster, use: 77 | 78 | ```sh 79 | az batchai cluster create -h 80 | ``` 81 | 82 | You can use [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) as an end-to-end example of CLI usage. 83 | -------------------------------------------------------------------------------- /recipes/TensorFlow/TensorFlow-GPU/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI.
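If the CLI is already installed, configuring it typically amounts to signing in and selecting the subscription that Batch AI resources should be created under (a sketch; the subscription name below is a placeholder):

```sh
# Sign in interactively
az login

# Select the subscription to use (replace with your subscription name or id)
az account set -s "<my subscription>"
```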
2 | 3 | ### Create a Resource Group 4 | 5 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 6 | 7 | ```sh 8 | az group create -n batchaitests -l eastus 9 | ``` 10 | 11 | ### Create a Storage Account 12 | 13 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 14 | 15 | ```sh 16 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 17 | ``` 18 | 19 | 20 | ### Data Deployment 21 | 22 | - Download the convolutional.py sample script into the current folder: 23 | 24 | For GNU/Linux users: 25 | 26 | ```sh 27 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/TensorFlow/TensorFlow-GPU/convolutional.py?token=AcZzrZcCveHaaevWYBtN9wYREYDOJvY-ks5Z4c4QwA%3D%3D" -O convolutional.py 28 | ``` 29 | 30 | - Create an Azure File Share with `tensorflow_samples` folder and upload convolutional.py into it: 31 | 32 | ```sh 33 | az storage share create --name batchaisample --account-name 34 | az storage directory create --share-name batchaisample --name tensorflow_samples 35 | az storage file upload --share-name batchaisample --source convolutional.py --path tensorflow_samples 36 | ``` 37 | 38 | ### Cluster 39 | 40 | For this recipe we need a one-node GPU cluster (`min node = max node = 1`) of `Standard_NC6` size (one GPU) with standard Ubuntu LTS (`UbuntuLTS`) or Ubuntu DSVM (```UbuntuDSVM```) image and Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`. 41 | 42 | #### Cluster Creation Command 43 | 44 | For GNU/Linux users: 45 | 46 | ```sh 47 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 1 --max 1 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 48 | ``` 49 | 50 | For Windows users: 51 | 52 | ```sh 53 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 1 --max 1 --afs-name batchaisample --afs-mount-path external -u -p 54 | ``` 55 | 56 | ### Job 57 | 58 | The job creation parameters are in [job.json](./job.json): 59 | 60 | - An input directory with ID `SCRIPT` to allow the job to find the sample script via environment variable `$AZ_BATCHAI_INPUT_SCRIPT`; 61 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 62 | - nodeCount defines how many nodes will be used for the job execution; 63 | - path and parameters for running convolutional.py; 64 | - ```tensorflow/tensorflow:1.1.0-gpu``` docker image will be used for job execution. 65 | 66 | Note, you can delete the docker image information to run the job directly on DSVM. 67 | 68 | #### Job Creation Command 69 | 70 | ```sh 71 | az batchai job create -l eastus -g batchaitests -n tensorflow -r nc6 -c job.json 72 | ``` 73 | 74 | Note, the job will start running when the cluster has finished allocation and initialization of the node. 75 | 76 | ### Next Steps 77 | 78 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 79 | how to manage your clusters and jobs. 80 | 81 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 82 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results.
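Once the job is submitted, you can check its state from the CLI. A sketch, assuming the standard `az batchai job` listing and show commands in your CLI version (check `az batchai job -h` if the options differ):

```sh
# List Batch AI jobs in the resource group together with their execution state
az batchai job list -g batchaitests -o table

# Show details of this job
az batchai job show -g batchaitests -n tensorflow
```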
-------------------------------------------------------------------------------- /recipes/Chainer/Chainer-GPU-Distributed/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI. 2 | 3 | 4 | ### Create a Resource Group 5 | 6 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 7 | 8 | ```sh 9 | az group create -n batchaitests -l eastus 10 | ``` 11 | 12 | ### Create a Storage Account 13 | 14 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 15 | 16 | ```sh 17 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 18 | ``` 19 | 20 | ### Data Deployment 21 | 22 | - Download the train_mnist.py sample script into the current folder: 23 | 24 | For GNU/Linux users: 25 | 26 | ```sh 27 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/Chainer/Chainer-GPU-Distributed/train_mnist.py?token=AcZzrV-OFepRwpRSB1kyABIX-PLh2ZHqks5Z4eukwA%3D%3D" -O train_mnist.py 28 | ``` 29 | 30 | - Create an Azure File Share with `chainer_samples` folder and upload train_mnist.py into it: 31 | 32 | ```sh 33 | az storage share create --name batchaisample --account-name 34 | az storage directory create --share-name batchaisample --name chainer_samples 35 | az storage file upload --share-name batchaisample --source train_mnist.py --path chainer_samples 36 | ``` 37 | 38 | ### Cluster 39 | 40 | For this recipe we need a two-node GPU cluster (`min node = max node = 2`) of `Standard_NC6` size (one GPU) with standard Ubuntu LTS (`UbuntuLTS`) or Ubuntu DSVM (```UbuntuDSVM```) image and Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`. 41 | 42 | #### Cluster Creation Command 43 | 44 | For GNU/Linux users: 45 | 46 | ```sh 47 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 48 | ``` 49 | 50 | For Windows users: 51 | 52 | ```sh 53 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u -p 54 | ``` 55 | 56 | ### Job 57 | 58 | The job creation parameters are in [job.json](./job.json): 59 | 60 | - An input directory with ID `SCRIPT` to allow the job to find the sample script via environment variable `$AZ_BATCHAI_INPUT_SCRIPT`; 61 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 62 | - An output directory with ID `MODEL` to allow the job to find the output directory via environment variable `$AZ_BATCHAI_OUTPUT_MODEL`; 63 | - nodeCount defines how many nodes will be used for the job execution; 64 | - path and parameters for running train_mnist.py; 65 | - ```batchaitraining/chainermn:openMPI``` docker image will be used for job execution. 66 | 67 | #### Job Creation Command 68 | 69 | ```sh 70 | az batchai job create -l eastus -g batchaitests -n distributed_chainer -r nc6 -c job.json 71 | ``` 72 | 73 | Note, the job will start running when the cluster has finished allocation and initialization of the nodes.
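Because both the stdout/stderr streams and the `MODEL` output directory live on the `batchaisample` file share, you can inspect results with the regular storage commands. A sketch (the folder layout under the share is generated by Batch AI, so list the share first and drill down; `<storage account name>` is a placeholder):

```sh
# Browse the share to locate the job's output folders
az storage file list --share-name batchaisample -o table --account-name <storage account name>
```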
74 | 75 | ### Next Steps 76 | 77 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 78 | how to manage your clusters and jobs. 79 | 80 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 81 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results. 82 | -------------------------------------------------------------------------------- /recipes/Keras/Keras-DSVM/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI. 2 | 3 | ### Create a Resource Group 4 | 5 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 6 | 7 | ```sh 8 | az group create -n batchaitests -l eastus 9 | ``` 10 | 11 | ### Create a Storage Account 12 | 13 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 14 | 15 | ```sh 16 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 17 | ``` 18 | 19 | 20 | ### Data Deployment 21 | 22 | - Download the mnist_cnn.py sample script into the current folder: 23 | 24 | For GNU/Linux users: 25 | 26 | ```sh 27 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/Keras/Keras-DSVM/mnist_cnn.py?token=AcZzrU1mri0vNMxtUKL6GW6hSezGK7qBks5Z4ewWwA%3D%3D" -O mnist_cnn.py 28 | ``` 29 | 30 | - Create an Azure File Share with `keras_samples` folder and upload mnist_cnn.py 31 | into it: 32 | 33 | ```sh 34 | az storage share create --name batchaisample --account-name 35 | az storage directory create --share-name batchaisample --name keras_samples 36 | az storage file upload --share-name batchaisample --source mnist_cnn.py --path keras_samples 37 | ``` 38 | 39 | ### Cluster 40 | 41 | For this recipe we need a one-node GPU cluster (`min node = max node = 1`) of `Standard_NC6` size (one GPU) with Ubuntu DSVM (```UbuntuDSVM```) image and Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`. 42 | 43 | #### Cluster Creation Command 44 | 45 | For GNU/Linux users: 46 | 47 | ```sh 48 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 1 --max 1 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 49 | ``` 50 | 51 | For Windows users: 52 | 53 | ```sh 54 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 1 --max 1 --afs-name batchaisample --afs-mount-path external -u -p 55 | ``` 56 | 57 | ### Job 58 | 59 | The job creation parameters are in [job.json](./job.json): 60 | 61 | - An input directory with ID `SCRIPT` to allow the job to find the sample script via environment variable `$AZ_BATCHAI_INPUT_SCRIPT`; 62 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 63 | - nodeCount defines how many nodes will be used for the job execution; 64 | - Batch AI has no native support for Keras, but it can run it as a custom toolkit; 65 | - Keras in this recipe uses the cntk backend; DSVM supports cntk, tensorflow and theano backends for Keras; just change KERAS_BACKEND to "tensorflow" or "theano" to use the corresponding backend. Note, the theano backend will run on CPU.
66 | - The job will run on the DSVM directly, so no docker image is configured for it. 67 | 68 | 69 | #### Job Creation Command 70 | 71 | ```sh 72 | az batchai job create -l eastus -g batchaitests -n keras -r nc6 -c job.json 73 | ``` 74 | 75 | Note, the job will start running when the cluster has finished allocation and initialization of the node. 76 | 77 | ### Next Steps 78 | 79 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 80 | how to manage your clusters and jobs. 81 | 82 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 83 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results. -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-BrainScript-Distributed/ConvNet_MNIST.cntk: -------------------------------------------------------------------------------- 1 | # ConvNet on MNIST dataset. 2 | 3 | command = trainNetwork:testNetwork 4 | 5 | precision = "float"; traceLevel = 1 ; deviceId = "auto" 6 | 7 | rootDir = "../../.." ; dataDir = "$rootDir$/DataSets/MNIST" ; 8 | outputDir = "./Output" ; 9 | 10 | modelPath = "$outputDir$/Models/ConvNet_MNIST" 11 | #stderr = "$outputDir$/ConvNet_MNIST_bs_out" 12 | 13 | maxEpochs = 40 14 | minibatchSize = 64 15 | autoAdjustMinibatch = false 16 | 17 | # TRAINING CONFIG 18 | trainNetwork = { 19 | action = "train" 20 | 21 | BrainScriptNetworkBuilder = { 22 | imageShape = 28:28:1 # image dimensions, 1 channel only 23 | labelDim = 10 # number of distinct labels 24 | featScale = 1/256 25 | Scale{f} = x => Constant(f) .* x 26 | 27 | model = Sequential ( 28 | Scale {featScale} : 29 | ConvolutionalLayer {32, (5:5), pad = true} : ReLU : 30 | MaxPoolingLayer {(3:3), stride=(2:2)} : 31 | ConvolutionalLayer {48, (3:3), pad = false} : ReLU : 32 | MaxPoolingLayer {(3:3), stride=(2:2)} : 33 | ConvolutionalLayer {64, (3:3), pad = false} : ReLU : 34 | DenseLayer {96} : Dropout : ReLU : 35 | LinearLayer {labelDim} 36 | ) 37 | 38 | # inputs 39 | features = Input {imageShape} 40 | labels = Input {labelDim} 41 | 42 | # apply model to features 43 | ol = model (features) 44 | 45 | # loss and error computation 46 | ce = CrossEntropyWithSoftmax (labels, ol) 47 | errs = ClassificationError (labels, ol) 48 | 49 | # declare special nodes 50 | featureNodes = (features) 51 | labelNodes = (labels) 52 | criterionNodes = (ce) 53 | evaluationNodes = (errs) 54 | outputNodes = (ol) 55 | } 56 | 57 | SGD = { 58 | ParallelTrain = { 59 | parallelizationMethod = DataParallelASGD 60 | distributedMBReading = true 61 | syncPerfStats = 20 62 | DataParallelASGD = [ 63 | syncPeriodPerWorker=256 64 | usePipeline = true 65 | AdjustLearningRateAtBeginning = [ 66 | adjustCoefficient = 0.2 67 | adjustNBMiniBatch = 1024 68 | ] 69 | ] 70 | 71 | } 72 | AutoAdjust = { 73 | autoAdjustMinibatch = $autoAdjustMinibatch$ 74 | minibatchSizeTuningFrequency = 3 75 | } 76 | epochSize = 60000 77 | minibatchSize = $minibatchSize$ 78 | maxEpochs = $maxEpochs$ 79 | learningRatesPerSample = 0.001*10:0.0005*10:0.0001 80 | dropoutRate = 0.5 81 | momentumAsTimeConstant = 0*5:1024 82 | 83 | numMBsToShowResult = 500 84 | } 85 | 86 | reader = { 87 | readerType = "CNTKTextFormatReader" 88 | # See ../README.md for details on getting the data (Train-28x28_cntk_text.txt).
89 | file = "$DataDir$/Train-28x28_cntk_text.txt" 90 | randomize = true 91 | keepDataInMemory = true 92 | input = { 93 | features = { dim = 784 ; format = "dense" } 94 | labels = { dim = 10 ; format = "dense" } 95 | } 96 | } 97 | } 98 | 99 | # TEST CONFIG 100 | testNetwork = { 101 | action = test 102 | minibatchSize = 1024 # reduce this if you run out of memory 103 | 104 | reader = { 105 | readerType = "CNTKTextFormatReader" 106 | file = "$DataDir$/Test-28x28_cntk_text.txt" 107 | input = { 108 | features = { dim = 784 ; format = "dense" } 109 | labels = { dim = 10 ; format = "dense" } 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /recipes/Horovod/Horovod/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI. 2 | 3 | ### Create a Resource Group 4 | 5 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 6 | 7 | ```sh 8 | az group create -n batchaitests -l eastus 9 | ``` 10 | 11 | ### Create a Storage Account 12 | 13 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 14 | 15 | ```sh 16 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 17 | ``` 18 | 19 | ### Data Deployment 20 | 21 | - Download the tensorflow_mnist.py sample script into the current folder: 22 | 23 | For GNU/Linux users: 24 | 25 | ```sh 26 | wget https://raw.githubusercontent.com/uber/horovod/v0.9.10/examples/tensorflow_mnist.py 27 | ``` 28 | 29 | - Create an Azure File Share with `horovod_samples` folder and upload tensorflow_mnist.py into it: 30 | 31 | ```sh 32 | az storage share create --name batchaisample --account-name 33 | az storage directory create --share-name batchaisample --name horovod_samples 34 | az storage file upload --share-name batchaisample --source tensorflow_mnist.py --path horovod_samples 35 | ``` 36 | 37 | ### Cluster 38 | 39 | For this recipe we will use a GPU cluster with two nodes (`min node = max node = 2`) of `Standard_NC6` size (one GPU) 40 | with Ubuntu DSVM (```UbuntuDSVM```) image and Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`. 41 | 42 | #### Cluster Creation Command 43 | 44 | For GNU/Linux users: 45 | 46 | ```sh 47 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 48 | ``` 49 | 50 | For Windows users: 51 | 52 | ```sh 53 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u -p 54 | ``` 55 | 56 | ### Job 57 | 58 | The job creation parameters are in [job.json](./job.json): 59 | 60 | - An input directory with ID `SCRIPTS` to allow the job to find the sample script via environment variable `$AZ_BATCHAI_INPUT_SCRIPTS`; 61 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 62 | - nodeCount defines how many nodes will be used for the job execution; 63 | - ```tensorflow/tensorflow:1.1.0-gpu``` standard tensorflow container will be used and ```Horovod``` will be installed by the job preparation command line.
64 | You can build and publish your own docker image containing tensorflow and Horovod instead; 65 | - The ```tensorflow_mnist.py``` example will be executed with the custom toolkit. 66 | - To run the MPI task we will use the hostfile generated by Batch AI, available via the ```$AZ_BATCHAI_MPI_HOST_FILE``` environment variable. 67 | 68 | Note, you can delete ```containerSettings``` from the job definition to run the same job directly on the host DSVM. 69 | 70 | #### Job Creation Command 71 | 72 | ```sh 73 | az batchai job create -l eastus -g batchaitests --storage-account-name -n horovod -r nc6 -c job.json 74 | ``` 75 | 76 | Note, the job will start running when the cluster has finished allocation and initialization of the nodes. 77 | 78 | ### Next Steps 79 | 80 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 81 | how to manage your clusters and jobs. 82 | 83 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 84 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results. -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-Python-Distributed/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI. 2 | 3 | ### Create a Resource Group 4 | 5 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 6 | 7 | ```sh 8 | az group create -n batchaitests -l eastus 9 | ``` 10 | 11 | ### Create a Storage Account 12 | 13 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 14 | 15 | ```sh 16 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 17 | ``` 18 | 19 | ### Data Deployment 20 | 21 | - Download ConvNet_CIFAR10_DataAug_Distributed.py, ConvNet_CIFAR10_DataAug.py and CIFA-10_data_prepare.sh into the current folder: 22 | 23 | For GNU/Linux users: 24 | 25 | ```sh 26 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/CNTK/CNTK-GPU-Python-Distributed/ConvNet_CIFAR10_DataAug_Distributed.py?token=AcZzrbN1I34RrKn8MPnn5_dfy86I-XEIks5Z4cfswA%3D%3D" -O ConvNet_CIFAR10_DataAug_Distributed.py 27 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/CNTK/CNTK-GPU-Python-Distributed/ConvNet_CIFAR10_DataAug.py?token=AcZzrWAAVqoQXUtPR0JxBF7m4pXbUACzks5Z4cguwA%3D%3D" -O ConvNet_CIFAR10_DataAug.py 28 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/CNTK/CNTK-GPU-Python-Distributed/CIFA-10_data_prepare.sh?token=AcZzrdr1tTQK_Gr7EdVXvg-sUarpWMqnks5Z4chYwA%3D%3D" -O CIFA-10_data_prepare.sh 29 | ``` 30 | 31 | - Create an Azure File Share with `cntk_samples` folder and upload the scripts into it: 32 | 33 | ```sh 34 | az storage share create --name batchaisample --account-name 35 | az storage directory create --share-name batchaisample --name cntk_samples 36 | az storage file upload --share-name batchaisample --source ConvNet_CIFAR10_DataAug_Distributed.py --path cntk_samples 37 | az storage file upload --share-name batchaisample --source ConvNet_CIFAR10_DataAug.py --path cntk_samples 38 | az storage file upload --share-name batchaisample --source CIFA-10_data_prepare.sh --path cntk_samples 39 | ``` 40 | 41 | ### Cluster 42 | 43 |
For this recipe we need a two-node GPU cluster (`min node = max node = 2`) of `Standard_NC6` size (one GPU) with standard Ubuntu LTS (`UbuntuLTS`) or Ubuntu DSVM (```UbuntuDSVM```) image and Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`. 44 | 45 | #### Cluster Creation Command 46 | 47 | For GNU/Linux users: 48 | 49 | ```sh 50 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 51 | ``` 52 | 53 | For Windows users: 54 | 55 | ```sh 56 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u -p 57 | ``` 58 | 59 | ### Job 60 | 61 | The job creation parameters are in [job.json](./job.json): 62 | 63 | - An input directory with ID `SCRIPT` to allow the job to find the sample scripts via environment variable `$AZ_BATCHAI_INPUT_SCRIPT`; 64 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 65 | - An output directory with ID `MODEL` to allow the job to find the output directory for the model via `$AZ_BATCHAI_OUTPUT_MODEL` environment variable; 66 | - node_count defines how many nodes will be used for the job execution; 67 | - The job preparation task will execute the CIFA-10_data_prepare.sh script to download and preprocess the CIFAR-10 dataset on the local SSD (at $AZ_BATCHAI_JOB_TEMP); 68 | - path and parameters for running ConvNet_CIFAR10_DataAug_Distributed.py; 69 | - ```microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0``` docker image will be used for job execution. 70 | 71 | Note, you can delete the docker image information to run the job directly on DSVM. 72 | 73 | #### Job Creation Command 74 | 75 | ```sh 76 | az batchai job create -l eastus -g batchaitests -n distributed_cntk_python -r nc6 -c job.json 77 | ``` 78 | 79 | Note, the job will start running when the cluster has finished allocation and initialization of the nodes. 80 | 81 | ### Next Steps 82 | 83 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 84 | how to manage your clusters and jobs. 85 | 86 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 87 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results. -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-BrainScript/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI.
2 | 3 | ### Create a Resource Group 4 | 5 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 6 | 7 | ```sh 8 | az group create -n batchaitests -l eastus 9 | ``` 10 | 11 | ### Create a Storage Account 12 | 13 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 14 | 15 | ```sh 16 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 17 | ``` 18 | 19 | ### Data Deployment 20 | 21 | - Download and extract preprocessed MNIST Database from this [location](https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D) into the current folder. 22 | 23 | For GNU/Linux users: 24 | 25 | ```sh 26 | wget "https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D" -O mnist_dataset.zip 27 | unzip mnist_dataset.zip 28 | ``` 29 | 30 | - Download the ConvNet_MNIST.cntk config file into the current folder: 31 | 32 | For GNU/Linux users: 33 | 34 | ```sh 35 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/CNTK/CNTK-GPU-BrainScript/ConvNet_MNIST.cntk?token=AcZzrfNpH_TV0LwzeHO_iGt4Kuh04on8ks5Z4bFrwA%3D%3D" -O ConvNet_MNIST.cntk 36 | ``` 37 | 38 | - Create an Azure File Share with `mnist_database` and `cntk_samples` folders and upload the MNIST database and the BrainScript ConvNet_MNIST.cntk config file: 39 | 40 | ```sh 41 | az storage share create --name batchaisample --account-name 42 | az storage directory create --share-name batchaisample --name mnist_database 43 | az storage file upload --share-name batchaisample --source Train-28x28_cntk_text.txt --path mnist_database 44 | az storage file upload --share-name batchaisample --source Test-28x28_cntk_text.txt --path mnist_database 45 | az storage directory create --share-name batchaisample --name cntk_samples 46 | az storage file upload --share-name batchaisample --source ConvNet_MNIST.cntk --path cntk_samples 47 | ``` 48 | 49 | ### Cluster 50 | 51 | For this recipe we need a one-node GPU cluster (`min node = max node = 1`) of `Standard_NC6` size (one GPU) with standard Ubuntu LTS (`UbuntuLTS`) or Ubuntu DSVM (```UbuntuDSVM```) image and Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`.
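The GNU/Linux creation command below assumes an SSH public key at `~/.ssh/id_rsa.pub`. If you do not have one yet, you can generate a key pair first (a sketch using the standard OpenSSH tool):

```sh
# Generate an RSA key pair at the default location without a passphrase
ssh-keygen -t rsa -b 2048 -f ~/.ssh/id_rsa -N ""
```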
52 | 53 | #### Cluster Creation Command 54 | 55 | For GNU/Linux users: 56 | 57 | ```sh 58 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 1 --max 1 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 59 | ``` 60 | 61 | For Windows users: 62 | 63 | ```sh 64 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 1 --max 1 --afs-name batchaisample --afs-mount-path external -u -p 65 | ``` 66 | 67 | ### Job 68 | 69 | The job creation parameters are in [job.json](./job.json): 70 | 71 | - Two input directories with IDs `CONFIG` and `DATASET` to allow the job to find the sample config and MNIST Database via environment variables `$AZ_BATCHAI_INPUT_CONFIG` and `$AZ_BATCHAI_INPUT_DATASET`; 72 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 73 | - An output directory with ID `MODEL` to allow the job to find the output directory for the model via `$AZ_BATCHAI_OUTPUT_MODEL` environment variable; 74 | - node_count defines how many nodes will be used for the job execution; 75 | - ```microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0``` docker image will be used for job execution. 76 | 77 | Note, you can remove the docker image information to run the job directly on DSVM. 78 | 79 | #### Job Creation Command 80 | 81 | ```sh 82 | az batchai job create -l eastus -g batchaitests -n cntk -r nc6 -c job.json 83 | ``` 84 | 85 | Note, the job will start running when the cluster has finished allocation and initialization of the node. 86 | 87 | ### Next Steps 88 | 89 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 90 | how to manage your clusters and jobs. 91 | 92 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 93 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results. -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-Python/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI. 2 | 3 | ### Create a Resource Group 4 | 5 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 6 | 7 | ```sh 8 | az group create -n batchaitests -l eastus 9 | ``` 10 | 11 | ### Create a Storage Account 12 | 13 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 14 | 15 | ```sh 16 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 17 | ``` 18 | 19 | ### Data Deployment 20 | 21 | - Download and extract preprocessed MNIST Database from this [location](https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D) into the current folder.
22 | 23 | For GNU/Linux users: 24 | 25 | ```sh 26 | wget "https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D" -O mnist_dataset.zip 27 | unzip mnist_dataset.zip 28 | ``` 29 | 30 | - Download the ConvNet_MNIST.py example script into the current folder: 31 | 32 | For GNU/Linux users: 33 | 34 | ```sh 35 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/CNTK/CNTK-GPU-Python/ConvNet_MNIST.py?token=AcZzrejaokHC2Nj5ehsoMFe4t3LqFcThks5Z4bmEwA%3D%3D" -O ConvNet_MNIST.py 36 | ``` 37 | 38 | - Create an Azure File Share with `mnist_database` and `cntk_samples` folders and upload the MNIST database and the ConvNet_MNIST.py script: 39 | 40 | ```sh 41 | az storage share create --name batchaisample --account-name 42 | az storage directory create --share-name batchaisample --name mnist_database 43 | az storage file upload --share-name batchaisample --source Train-28x28_cntk_text.txt --path mnist_database 44 | az storage file upload --share-name batchaisample --source Test-28x28_cntk_text.txt --path mnist_database 45 | az storage directory create --share-name batchaisample --name cntk_samples 46 | az storage file upload --share-name batchaisample --source ConvNet_MNIST.py --path cntk_samples 47 | ``` 48 | 49 | ### Cluster 50 | 51 | For this recipe we need a one-node GPU cluster (`min node = max node = 1`) of `Standard_NC6` size (one GPU) with standard Ubuntu LTS (`UbuntuLTS`) or Ubuntu DSVM (```UbuntuDSVM```) image and Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`. 52 | 53 | #### Cluster Creation Command 54 | 55 | For GNU/Linux users: 56 | 57 | ```sh 58 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 1 --max 1 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 59 | ``` 60 | 61 | For Windows users: 62 | 63 | ```sh 64 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 1 --max 1 --afs-name batchaisample --afs-mount-path external -u -p 65 | ``` 66 | 67 | ### Job 68 | 69 | The job creation parameters are in [job.json](./job.json): 70 | 71 | - Two input directories with IDs `SCRIPT` and `DATASET` to allow the job to find the sample script and MNIST Database via environment variables `$AZ_BATCHAI_INPUT_SCRIPT` and `$AZ_BATCHAI_INPUT_DATASET`; 72 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 73 | - An output directory with ID `MODEL` to allow the job to find the output directory for the model via `$AZ_BATCHAI_OUTPUT_MODEL` environment variable; 74 | - node_count defines how many nodes will be used for the job execution; 75 | - path and parameters for running ConvNet_MNIST.py; 76 | - ```microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0``` docker image will be used for job execution. 77 | 78 | Note, you can remove the docker image information to run the job directly on DSVM. 79 | 80 | #### Job Creation Command 81 | 82 | ```sh 83 | az batchai job create -l eastus -g batchaitests -n cntk_python -r nc6 -c job.json 84 | ``` 85 | 86 | Note, the job will start running when the cluster has finished allocation and initialization of the node.
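The trained model is written to the `MODEL` output directory, which maps to a folder on the `batchaisample` share. A sketch for retrieving results once the job completes (the path under the share is generated by Batch AI, so list first to find it; `<storage account name>` and `<path to file>` are placeholders):

```sh
# Locate the generated output folders, then download a model file
az storage file list --share-name batchaisample -o table --account-name <storage account name>
az storage file download --share-name batchaisample --path "<path to file>" --account-name <storage account name>
```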
87 | 88 | ### Next Steps 89 | 90 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 91 | how to manage your clusters and jobs. 92 | 93 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 94 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results. -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-Python-Distrbuted-Infiniband/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/recipes/Readme.md) to install Azure CLI 2.0, configure the default location, and create and configure the default resource group and storage account. 2 | 3 | 4 | ### Data Deployment 5 | 6 | - Download resnet_models.py, TrainResNet_CIFAR10_Distributed.py and TrainResNet_CIFAR10.py into the current folder: 7 | 8 | For GNU/Linux users: 9 | 10 | ```sh 11 | wget "https://raw.githubusercontent.com/Microsoft/CNTK/v2.3/Examples/Image/Classification/ResNet/Python/resnet_models.py" -O resnet_models.py 12 | wget "https://raw.githubusercontent.com/Microsoft/CNTK/v2.3/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10_Distributed.py" -O TrainResNet_CIFAR10_Distributed.py 13 | wget "https://raw.githubusercontent.com/Microsoft/CNTK/v2.3/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10.py" -O TrainResNet_CIFAR10.py 14 | ``` 15 | 16 | Create an Azure File Share with `cntk_samples` folder and upload the scripts into it: 17 | 18 | ```sh 19 | az storage share create --name batchaisample 20 | az storage directory create --share-name batchaisample --name cntk_samples 21 | az storage file upload --share-name batchaisample --source TrainResNet_CIFAR10_Distributed.py --path cntk_samples 22 | az storage file upload --share-name batchaisample --source TrainResNet_CIFAR10.py --path cntk_samples 23 | az storage file upload --share-name batchaisample --source resnet_models.py --path cntk_samples 24 | ``` 25 | 26 | Upload the job preparation script, which does the following tasks: 27 | - Download the CIFAR-10 data set on all GPU nodes (under the ```$AZ_BATCHAI_JOB_TEMP``` directory) 28 | - Install the Intel MPI binary 29 | 30 | ```sh 31 | az storage file upload --share-name batchaisample --source jobprep_cntk_distributed_ib.sh --path cntk_samples 32 | ``` 33 | 34 | 35 | ### Cluster 36 | 37 | By default, for this recipe we will use a GPU cluster with two nodes (`min node = max node = 2`) of `Standard_NC24r` size (four GPUs with InfiniBand) 38 | with the latest Ubuntu 16.04-LTS image. 39 | 40 | Azure File share `batchaisample` is mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`.
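`Standard_NC24r` is only offered in a subset of regions, so you can confirm the size is available in your configured default location before creating the cluster (a sketch using the generic `az vm list-sizes` command; replace `eastus` with your location):

```sh
# List VM sizes available in the region and filter for NC24r
az vm list-sizes -l eastus -o table | grep NC24r
```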
41 | 42 | #### Cluster Creation Command 43 | 44 | For GNU/Linux users: 45 | 46 | ```sh 47 | az batchai cluster create -n nc24r -s Standard_NC24r --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 48 | ``` 49 | 50 | For Windows users: 51 | 52 | ```sh 53 | az batchai cluster create -n nc24r -s Standard_NC24r --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u -p 54 | ``` 55 | 56 | ### Job 57 | 58 | The job creation parameters are in [job.json](./job.json): 59 | 60 | - The job will use the `batchaitraining/cntk:2.3-gpu-1bitsgd-py36-cuda8-cudnn6-intelmpi` container that is built based on [dockerfile](./dockerfile); 61 | - Will use a job preparation task to execute the job preparation script (jobprep_cntk_distributed_ib.sh). The CIFAR-10 dataset will be downloaded and processed locally on the compute nodes (under the ```$AZ_BATCHAI_JOB_TEMP``` directory); 62 | - Will use the previously configured input and output directories; 63 | - Will run TrainResNet_CIFAR10_Distributed.py providing the CIFAR-10 dataset path as the first parameter and the desired model output path as the second. 64 | - Will set ```processCount``` to 8, so that all 8 GPUs from 2 NC24r nodes will be used; 65 | - An input directory with ID `SCRIPT` to allow the job to find the sample scripts via environment variable `$AZ_BATCHAI_INPUT_SCRIPT`; 66 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 67 | - An output directory with ID `MODEL` to allow the job to find the output directory for the model via `$AZ_BATCHAI_OUTPUT_MODEL` environment variable; 68 | - For illustration purposes, we will train a ResNet-110 and only run 5 epochs 69 | 70 | 71 | #### Job Creation Command 72 | 73 | ```sh 74 | az batchai job create -n distributed_cntk_ib --cluster-name nc24r -c job.json 75 | ``` 76 | 77 | Note, the job will start running when the cluster has finished allocation and initialization of the nodes. 78 | 79 | ### Get Help 80 | 81 | The Azure CLI has built-in help documentation, which you can run from the command line: 82 | 83 | ```sh 84 | az [command-group [command]] -h 85 | ``` 86 | 87 | For example, to get information about all Azure Batch AI categories, use: 88 | 89 | ```sh 90 | az batchai -h 91 | ``` 92 | 93 | To get help with the command to create a cluster, use: 94 | 95 | ```sh 96 | az batchai cluster create -h 97 | ``` 98 | 99 | You can use [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) as an end-to-end example of CLI usage. 100 | -------------------------------------------------------------------------------- /recipes/TensorFlow/TensorFlow-GPU-Distributed/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI.
2 | 3 | ### Create a Resource Group 4 | 5 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 6 | 7 | ```sh 8 | az group create -n batchaitests -l eastus 9 | ``` 10 | 11 | ### Create a Storage Account 12 | 13 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 14 | 15 | ```sh 16 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 17 | ``` 18 | 19 | ### Data Deployment 20 | 21 | - Download and extract preprocessed MNIST database: 22 | 23 | For GNU/Linux users: 24 | 25 | ```sh 26 | wget "https://batchaisamples.blob.core.windows.net/samples/mnist_dataset_original.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=b&sig=Qc1RA3zsXIP4oeioXutkL1PXIrHJO0pHJlppS2rID3I%3D" -O mnist_dataset_original.zip 27 | unzip mnist_dataset_original.zip 28 | ``` 29 | 30 | - Download the mnist_replica.py sample script into the current folder: 31 | 32 | For GNU/Linux users: 33 | 34 | ```sh 35 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/TensorFlow/TensorFlow-GPU-Distributed/mnist_replica.py?token=AcZzrcpJGDHCUzsCyjlWiKVNfBuDdkqwks5Z4dPrwA%3D%3D" -O mnist_replica.py 36 | ``` 37 | 38 | - Create an Azure File Share with `mnist_dataset` and `tensorflow_samples` folders and upload the MNIST database and mnist_replica.py into them: 39 | 40 | ```sh 41 | az storage share create --name batchaisample --account-name 42 | az storage directory create --share-name batchaisample --name mnist_dataset 43 | az storage file upload --share-name batchaisample --source t10k-images-idx3-ubyte.gz --path mnist_dataset 44 | az storage file upload --share-name batchaisample --source t10k-labels-idx1-ubyte.gz --path mnist_dataset 45 | az storage file upload --share-name batchaisample --source train-images-idx3-ubyte.gz --path mnist_dataset 46 | az storage file upload --share-name batchaisample --source train-labels-idx1-ubyte.gz --path mnist_dataset 47 | az storage directory create --share-name batchaisample --name tensorflow_samples 48 | az storage file upload --share-name batchaisample --source mnist_replica.py --path tensorflow_samples 49 | ``` 50 | 51 | ### Cluster 52 | 53 | For this recipe we need a two-node GPU cluster (`min node = max node = 2`) of `Standard_NC6` size (one GPU) with standard Ubuntu LTS (`UbuntuLTS`) or Ubuntu DSVM (```UbuntuDSVM```) image and Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`.
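Before creating the cluster, you can verify that the dataset and script uploads above succeeded (a sketch; add `--account-name` as in the commands above if you have not configured a default storage account):

```sh
# Confirm the MNIST files and mnist_replica.py landed in the right folders
az storage file list --share-name batchaisample --path mnist_dataset -o table
az storage file list --share-name batchaisample --path tensorflow_samples -o table
```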
54 | 55 | #### Cluster Creation Command 56 | 57 | For GNU/Linux users: 58 | 59 | ```sh 60 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 61 | ``` 62 | 63 | For Windows users: 64 | 65 | ```sh 66 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u -p 67 | ``` 68 | 69 | ### Job 70 | 71 | The job creation parameters are in [job.json](./job.json): 72 | 73 | - Two input directories with IDs `SCRIPT` and `DATASET` to allow the job to find the sample script and MNIST Database via environment variables `$AZ_BATCHAI_INPUT_SCRIPT` and `$AZ_BATCHAI_INPUT_DATASET`; 74 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 75 | - An output directory with ID `MODEL` to allow the job to find the output directory for the model via `$AZ_BATCHAI_OUTPUT_MODEL` environment variable; 76 | - nodeCount defines how many nodes will be used for the job execution; 77 | - path to mnist_replica.py and parameters for master, workers and parameter server; 78 | - ```tensorflow/tensorflow:1.1.0-gpu``` docker image will be used for job execution. 79 | 80 | Note, you can delete the docker image information to run the job directly on DSVM. 81 | 82 | #### Job Creation Command 83 | 84 | ```sh 85 | az batchai job create -l eastus -g batchaitests -n distributed_tensorflow -r nc6 -c job.json 86 | ``` 87 | 88 | Note, the job will start running when the cluster has finished allocation and initialization of the nodes. 89 | 90 | ### Next Steps 91 | 92 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 93 | how to manage your clusters and jobs. 94 | 95 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 96 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results. -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-BrainScript-Distributed/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI. 2 | 3 | ### Create a Resource Group 4 | 5 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 6 | 7 | ```sh 8 | az group create -n batchaitests -l eastus 9 | ``` 10 | 11 | ### Create a Storage Account 12 | 13 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 14 | 15 | ```sh 16 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 17 | ``` 18 | 19 | ### Data Deployment 20 | 21 | - Download and extract preprocessed MNIST Database from this [location](https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D) into the current folder.
22 | 
23 | For GNU/Linux users:
24 | 
25 | ```sh
26 | wget "https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D" -O mnist_dataset.zip
27 | unzip mnist_dataset.zip
28 | ```
29 | 
30 | - Download the ConvNet_MNIST.cntk config file into the current folder as DistributedConvNet_MNIST.cntk:
31 | 
32 | For GNU/Linux users:
33 | 
34 | ```sh
35 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/CNTK/CNTK-GPU-BrainScript-Distributed/ConvNet_MNIST.cntk?token=AcZzrWPVqDDfb6ig-y98_6af-Fj3R9piks5Z4b7rwA%3D%3D" -O DistributedConvNet_MNIST.cntk
36 | ```
37 | 
38 | - Create an Azure File Share with `mnist_database` and `cntk_samples` folders and upload the MNIST database and the BrainScript DistributedConvNet_MNIST.cntk config file:
39 | 
40 | ```sh
41 | az storage share create --name batchaisample --account-name <storage account name>
42 | az storage directory create --share-name batchaisample --name mnist_database
43 | az storage file upload --share-name batchaisample --source Train-28x28_cntk_text.txt --path mnist_database
44 | az storage file upload --share-name batchaisample --source Test-28x28_cntk_text.txt --path mnist_database
45 | az storage directory create --share-name batchaisample --name cntk_samples
46 | az storage file upload --share-name batchaisample --source DistributedConvNet_MNIST.cntk --path cntk_samples
47 | ```
48 | 
49 | ### Cluster
50 | 
51 | For this recipe we need a two-node GPU cluster (`min node = max node = 2`) of `Standard_NC6` size (one GPU per node) with a standard Ubuntu LTS (`UbuntuLTS`) or Ubuntu DSVM (`UbuntuDSVM`) image and the Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`.
52 | 
53 | #### Cluster Creation Command
54 | 
55 | For GNU/Linux users:
56 | 
57 | ```sh
58 | az batchai cluster create -l eastus -g batchaitests --storage-account-name <storage account name> -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub
59 | ```
60 | 
61 | For Windows users:
62 | 
63 | ```sh
64 | az batchai cluster create -l eastus -g batchaitests --storage-account-name <storage account name> -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u <admin user name> -p <admin user password>
65 | ```
66 | 
67 | ### Job
68 | 
69 | The job creation parameters are in [job.json](./job.json):
70 | 
71 | - Two input directories with IDs `CONFIG` and `DATASET` to allow the job to find the sample config and MNIST database via the environment variables `$AZ_BATCHAI_INPUT_CONFIG` and `$AZ_BATCHAI_INPUT_DATASET`;
72 | - `stdOutErrPathPrefix` specifies that the job should use the file share for standard output and error streams;
73 | - An output directory with ID `MODEL` to allow the job to find the output directory for the model via the `$AZ_BATCHAI_OUTPUT_MODEL` environment variable;
74 | - `nodeCount` defining how many nodes will be used for the job execution;
75 | - the path and parameters for running DistributedConvNet_MNIST.cntk;
76 | - the `microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0` docker image will be used for job execution.
77 | 
78 | Note, you can remove the docker image information to run the job directly on the DSVM.
79 | 
80 | #### Job Creation Command
81 | 
82 | ```sh
83 | az batchai job create -l eastus -g batchaitests -n distributed_cntk -r nc6 -c job.json
84 | ```
85 | 
86 | Note, the job will start running once the cluster has finished allocation and initialization of the nodes.
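If you would like to follow the job from Python instead of the CLI, the repo's [utilities.py](/recipes/utilities.py) exposes `wait_for_job_completion`, which polls the cluster and job and tails an output file. A minimal sketch, assuming a filled-in configuration.json, the names used in this recipe, and that the output directory id Batch AI assigns to `stdOutErrPathPrefix` output is `stdouterr` (verify this against your Batch AI version):

```python
# Polls the 'distributed_cntk' job on the 'nc6' cluster and tails stdout.txt.
# 'stdouterr' as the output directory id for stdOutErrPathPrefix output is an
# assumption here; adjust if your Batch AI version names it differently.
from utilities import Configuration, create_batchai_client, wait_for_job_completion

cfg = Configuration('configuration.json')
client = create_batchai_client(cfg)
wait_for_job_completion(client, 'batchaitests', 'distributed_cntk', 'nc6',
                        output_directory_id='stdouterr', file_name='stdout.txt')
```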
87 | 88 | ### Next Steps 89 | 90 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 91 | how to manage your clusters and jobs. 92 | 93 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 94 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results. -------------------------------------------------------------------------------- /recipes/Chainer/Chainer-GPU-Distributed/train_mnist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | 4 | import argparse 5 | 6 | import chainer 7 | import chainer.functions as F 8 | import chainer.links as L 9 | from chainer import training 10 | from chainer.training import extensions 11 | from mpi4py import MPI 12 | 13 | import chainermn 14 | 15 | 16 | class MLP(chainer.Chain): 17 | 18 | def __init__(self, n_units, n_out): 19 | super(MLP, self).__init__( 20 | # the size of the inputs to each layer will be inferred 21 | l1=L.Linear(784, n_units), # n_in -> n_units 22 | l2=L.Linear(n_units, n_units), # n_units -> n_units 23 | l3=L.Linear(n_units, n_out), # n_units -> n_out 24 | ) 25 | 26 | def __call__(self, x): 27 | h1 = F.relu(self.l1(x)) 28 | h2 = F.relu(self.l2(h1)) 29 | return self.l3(h2) 30 | 31 | 32 | def main(): 33 | parser = argparse.ArgumentParser(description='ChainerMN example: MNIST') 34 | parser.add_argument('--batchsize', '-b', type=int, default=100, 35 | help='Number of images in each mini-batch') 36 | parser.add_argument('--communicator', type=str, 37 | default='hierarchical', help='Type of communicator') 38 | parser.add_argument('--epoch', '-e', type=int, default=20, 39 | help='Number of sweeps over the dataset to train') 40 | parser.add_argument('--gpu', '-g', action='store_true', 41 | help='Use GPU') 42 | parser.add_argument('--out', '-o', default='result', 43 | help='Directory to output the result') 44 | parser.add_argument('--resume', '-r', default='', 45 | help='Resume the training from snapshot') 46 | parser.add_argument('--unit', '-u', type=int, default=1000, 47 | help='Number of units') 48 | args = parser.parse_args() 49 | 50 | # Prepare ChainerMN communicator. 51 | 52 | if args.gpu: 53 | if args.communicator == 'naive': 54 | print("Error: 'naive' communicator does not support GPU.\n") 55 | exit(-1) 56 | comm = chainermn.create_communicator(args.communicator) 57 | device = comm.intra_rank 58 | else: 59 | if args.communicator != 'naive': 60 | print('Warning: using naive communicator ' 61 | 'because only naive supports CPU-only execution') 62 | comm = chainermn.create_communicator('naive') 63 | device = -1 64 | 65 | if comm.mpi_comm.rank == 0: 66 | print('==========================================') 67 | print('Num process (COMM_WORLD): {}'.format(MPI.COMM_WORLD.Get_size())) 68 | if args.gpu: 69 | print('Using GPUs') 70 | print('Using {} communicator'.format(args.communicator)) 71 | print('Num unit: {}'.format(args.unit)) 72 | print('Num Minibatch-size: {}'.format(args.batchsize)) 73 | print('Num epoch: {}'.format(args.epoch)) 74 | print('==========================================') 75 | 76 | model = L.Classifier(MLP(args.unit, 10)) 77 | if device >= 0: 78 | chainer.cuda.get_device(device).use() 79 | model.to_gpu() 80 | 81 | # Create a multi node optimizer from a standard Chainer optimizer. 
82 | optimizer = chainermn.create_multi_node_optimizer( 83 | chainer.optimizers.Adam(), comm) 84 | optimizer.setup(model) 85 | 86 | # Split and distribute the dataset. Only worker 0 loads the whole dataset. 87 | # Datasets of worker 0 are evenly split and distributed to all workers. 88 | if comm.rank == 0: 89 | train, test = chainer.datasets.get_mnist() 90 | else: 91 | train, test = None, None 92 | train = chainermn.scatter_dataset(train, comm, shuffle=True) 93 | test = chainermn.scatter_dataset(test, comm, shuffle=True) 94 | 95 | train_iter = chainer.iterators.SerialIterator(train, args.batchsize) 96 | test_iter = chainer.iterators.SerialIterator(test, args.batchsize, 97 | repeat=False, shuffle=False) 98 | 99 | updater = training.StandardUpdater(train_iter, optimizer, device=device) 100 | trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out) 101 | 102 | # Create a multi node evaluator from a standard Chainer evaluator. 103 | evaluator = extensions.Evaluator(test_iter, model, device=device) 104 | evaluator = chainermn.create_multi_node_evaluator(evaluator, comm) 105 | trainer.extend(evaluator) 106 | 107 | # Some display and output extensions are necessary only for one worker. 108 | # (Otherwise, there would just be repeated outputs.) 109 | if comm.rank == 0: 110 | trainer.extend(extensions.dump_graph('main/loss')) 111 | trainer.extend(extensions.LogReport()) 112 | trainer.extend(extensions.PrintReport( 113 | ['epoch', 'main/loss', 'validation/main/loss', 114 | 'main/accuracy', 'validation/main/accuracy', 'elapsed_time'])) 115 | trainer.extend(extensions.ProgressBar()) 116 | 117 | if args.resume: 118 | chainer.serializers.load_npz(args.resume, trainer) 119 | 120 | trainer.run() 121 | 122 | 123 | if __name__ == '__main__': 124 | main() 125 | -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-Python/ConvNet_MNIST.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | 3 | # Licensed under the MIT license. See LICENSE.md file in the project root 4 | # for full license information. 5 | # ============================================================================== 6 | 7 | from __future__ import print_function 8 | import numpy as np 9 | import sys 10 | import os 11 | import cntk 12 | 13 | data_path = sys.argv[1] 14 | model_path = sys.argv[2] 15 | 16 | # Merge stdout and stderr 17 | sys.stdout = sys.stderr 18 | 19 | 20 | # Define the reader for both training and evaluation action. 
21 | def create_reader(path, is_training, input_dim, label_dim): 22 | return cntk.io.MinibatchSource(cntk.io.CTFDeserializer(path, cntk.io.StreamDefs( 23 | features = cntk.io.StreamDef(field='features', shape=input_dim), 24 | labels = cntk.io.StreamDef(field='labels', shape=label_dim) 25 | )), randomize=is_training, max_sweeps = cntk.io.INFINITELY_REPEAT if is_training else 1) 26 | 27 | 28 | # Creates and trains a feedforward classification model for MNIST images 29 | def convnet_mnist(debug_output=False): 30 | image_height = 28 31 | image_width = 28 32 | num_channels = 1 33 | input_dim = image_height * image_width * num_channels 34 | num_output_classes = 10 35 | 36 | # Input variables denoting the features and label data 37 | input_var = cntk.ops.input((num_channels, image_height, image_width), np.float32) 38 | label_var = cntk.ops.input(num_output_classes, np.float32) 39 | 40 | # Instantiate the feedforward classification model 41 | scaled_input = cntk.ops.element_times(cntk.ops.constant(0.00390625), input_var) 42 | 43 | with cntk.layers.default_options(activation=cntk.ops.relu, pad=False): 44 | conv1 = cntk.layers.Convolution2D((5,5), 32, pad=True)(scaled_input) 45 | pool1 = cntk.layers.MaxPooling((3,3), (2,2))(conv1) 46 | conv2 = cntk.layers.Convolution2D((3,3), 48)(pool1) 47 | pool2 = cntk.layers.MaxPooling((3,3), (2,2))(conv2) 48 | conv3 = cntk.layers.Convolution2D((3,3), 64)(pool2) 49 | f4 = cntk.layers.Dense(96)(conv3) 50 | drop4 = cntk.layers.Dropout(0.5)(f4) 51 | z = cntk.layers.Dense(num_output_classes, activation=None)(drop4) 52 | 53 | ce = cntk.losses.cross_entropy_with_softmax(z, label_var) 54 | pe = cntk.metrics.classification_error(z, label_var) 55 | 56 | reader_train = create_reader(os.path.join(data_path, 'Train-28x28_cntk_text.txt'), True, input_dim, num_output_classes) 57 | 58 | # training config 59 | epoch_size = 60000 # for now we manually specify epoch size 60 | minibatch_size = 64 61 | max_epochs = 40 62 | 63 | # Set learning parameters 64 | lr_per_sample = [0.001]*10 + [0.0005]*10 + [0.0001] 65 | lr_schedule = cntk.learning_rate_schedule(lr_per_sample, cntk.learners.UnitType.sample, epoch_size) 66 | mm_time_constant = [0]*5 + [1024] 67 | mm_schedule = cntk.learners.momentum_as_time_constant_schedule(mm_time_constant, epoch_size) 68 | 69 | # Instantiate the trainer object to drive the model training 70 | learner = cntk.learners.momentum_sgd(z.parameters, lr_schedule, mm_schedule) 71 | progress_printer = cntk.logging.ProgressPrinter(tag='Training', 72 | num_epochs=max_epochs) 73 | trainer = cntk.Trainer(z, (ce, pe), learner, progress_printer) 74 | 75 | # define mapping from reader streams to network inputs 76 | input_map = { 77 | input_var : reader_train.streams.features, 78 | label_var : reader_train.streams.labels 79 | } 80 | 81 | cntk.logging.log_number_of_parameters(z) ; print() 82 | 83 | # Get minibatches of images to train with and perform model training 84 | for epoch in range(max_epochs): # loop over epochs 85 | sample_count = 0 86 | while sample_count < epoch_size: # loop over minibatches in the epoch 87 | data = reader_train.next_minibatch(min(minibatch_size, epoch_size - sample_count), input_map=input_map) # fetch minibatch. 
88 |             trainer.train_minibatch(data)                                   # update model with it
89 |             sample_count += data[label_var].num_samples                     # count samples processed so far
90 | 
91 |         trainer.summarize_training_progress()
92 |         z.save(os.path.join(model_path, "ConvNet_MNIST_{}.dnn".format(epoch)))
93 | 
94 |     # Load test data
95 |     reader_test = create_reader(os.path.join(data_path, 'Test-28x28_cntk_text.txt'), False, input_dim, num_output_classes)
96 | 
97 |     input_map = {
98 |         input_var : reader_test.streams.features,
99 |         label_var : reader_test.streams.labels
100 |     }
101 | 
102 |     # Test data for trained model
103 |     epoch_size = 10000
104 |     minibatch_size = 1024
105 | 
106 |     # process minibatches and evaluate the model
107 |     metric_numer = 0
108 |     metric_denom = 0
109 |     sample_count = 0
110 |     minibatch_index = 0
111 | 
112 |     while sample_count < epoch_size:
113 |         current_minibatch = min(minibatch_size, epoch_size - sample_count)
114 | 
115 |         # Fetch the next test minibatch.
116 |         data = reader_test.next_minibatch(current_minibatch, input_map=input_map)
117 | 
118 |         # Evaluate the model on the minibatch (no parameter updates happen here).
119 |         metric_numer += trainer.test_minibatch(data) * current_minibatch
120 |         metric_denom += current_minibatch
121 | 
122 |         # Keep track of the number of samples processed so far.
123 |         sample_count += data[label_var].num_samples
124 |         minibatch_index += 1
125 | 
126 |     print("")
127 |     print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index+1, (metric_numer*100.0)/metric_denom, metric_denom))
128 |     print("")
129 | 
130 |     return metric_numer/metric_denom
131 | 
132 | if __name__=='__main__':
133 |     convnet_mnist()
134 | 
135 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | # MSTest test Results 33 | [Tt]est[Rr]esult*/ 34 | [Bb]uild[Ll]og.* 35 | 36 | # NUNIT 37 | *.VisualState.xml 38 | TestResult.xml 39 | 40 | # Build Results of an ATL Project 41 | [Dd]ebugPS/ 42 | [Rr]eleasePS/ 43 | dlldata.c 44 | 45 | # .NET Core 46 | project.lock.json 47 | project.fragment.lock.json 48 | artifacts/ 49 | **/Properties/launchSettings.json 50 | 51 | *_i.c 52 | *_p.c 53 | *_i.h 54 | *.ilk 55 | *.meta 56 | *.obj 57 | *.pch 58 | *.pdb 59 | *.pgc 60 | *.pgd 61 | *.rsp 62 | *.sbr 63 | *.tlb 64 | *.tli 65 | *.tlh 66 | *.tmp 67 | *.tmp_proj 68 | *.log 69 | *.vspscc 70 | *.vssscc 71 | .builds 72 | *.pidb 73 | *.svclog 74 | *.scc 75 | 76 | # Chutzpah Test files 77 | _Chutzpah* 78 | 79 | # Visual C++ cache files 80 | ipch/ 81 | *.aps 82 | *.ncb 83 | *.opendb 84 | *.opensdf 85 | *.sdf 86 | *.cachefile 87 | *.VC.db 88 | *.VC.VC.opendb 89 | 90 | # Visual Studio profiler 91 | *.psess 92 | *.vsp 93 | *.vspx 94 | *.sap 95 | 96 | # TFS 2012 Local Workspace 97 | $tf/ 98 | 99 | # Guidance Automation Toolkit 100 | *.gpState 101 | 102 | # ReSharper is a .NET coding add-in 103 | _ReSharper*/ 104 | *.[Rr]e[Ss]harper 105 | *.DotSettings.user 106 | 107 | # JustCode is a .NET coding add-in 108 | .JustCode 109 | 110 | # TeamCity is a build add-in 111 | _TeamCity* 112 | 113 | # DotCover is a Code Coverage Tool 114 | *.dotCover 115 | 116 | # Visual Studio code coverage results 117 | *.coverage 118 | *.coveragexml 119 | 120 | # NCrunch 121 | _NCrunch_* 122 | .*crunch*.local.xml 123 | nCrunchTemp_* 124 | 125 | # MightyMoose 126 | *.mm.* 127 | AutoTest.Net/ 128 | 129 | # Web workbench (sass) 130 | .sass-cache/ 131 | 132 | # Installshield output folder 133 | [Ee]xpress/ 134 | 135 | # DocProject is a documentation generator add-in 136 | DocProject/buildhelp/ 137 | DocProject/Help/*.HxT 138 | DocProject/Help/*.HxC 139 | DocProject/Help/*.hhc 140 | DocProject/Help/*.hhk 141 | DocProject/Help/*.hhp 142 | DocProject/Help/Html2 143 | DocProject/Help/html 144 | 145 | # Click-Once directory 146 | publish/ 147 | 148 | # Publish Web Output 149 | *.[Pp]ublish.xml 150 | *.azurePubxml 151 | # TODO: Comment the next line if you want to checkin your web deploy settings 152 | # but database connection strings (with potential passwords) will be unencrypted 153 | *.pubxml 154 | *.publishproj 155 | 156 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 157 | # checkin your Azure Web App publish settings, but sensitive information contained 158 | # in these scripts will be unencrypted 159 | PublishScripts/ 160 | 161 | # NuGet Packages 162 | *.nupkg 163 | # The packages folder can be ignored because of Package Restore 164 | **/packages/* 165 | # except build/, which is used as an MSBuild target. 
166 | !**/packages/build/ 167 | # Uncomment if necessary however generally it will be regenerated when needed 168 | #!**/packages/repositories.config 169 | # NuGet v3's project.json files produces more ignorable files 170 | *.nuget.props 171 | *.nuget.targets 172 | 173 | # Microsoft Azure Build Output 174 | csx/ 175 | *.build.csdef 176 | 177 | # Microsoft Azure Emulator 178 | ecf/ 179 | rcf/ 180 | 181 | # Windows Store app package directories and files 182 | AppPackages/ 183 | BundleArtifacts/ 184 | Package.StoreAssociation.xml 185 | _pkginfo.txt 186 | 187 | # Visual Studio cache files 188 | # files ending in .cache can be ignored 189 | *.[Cc]ache 190 | # but keep track of directories ending in .cache 191 | !*.[Cc]ache/ 192 | 193 | # Others 194 | ClientBin/ 195 | ~$* 196 | *~ 197 | *.dbmdl 198 | *.dbproj.schemaview 199 | *.jfm 200 | *.pfx 201 | *.publishsettings 202 | orleans.codegen.cs 203 | 204 | # Since there are multiple workflows, uncomment next line to ignore bower_components 205 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 206 | #bower_components/ 207 | 208 | # RIA/Silverlight projects 209 | Generated_Code/ 210 | 211 | # Backup & report files from converting an old project file 212 | # to a newer Visual Studio version. Backup files are not needed, 213 | # because we have git ;-) 214 | _UpgradeReport_Files/ 215 | Backup*/ 216 | UpgradeLog*.XML 217 | UpgradeLog*.htm 218 | 219 | # SQL Server files 220 | *.mdf 221 | *.ldf 222 | *.ndf 223 | 224 | # Business Intelligence projects 225 | *.rdl.data 226 | *.bim.layout 227 | *.bim_*.settings 228 | 229 | # Microsoft Fakes 230 | FakesAssemblies/ 231 | 232 | # GhostDoc plugin setting file 233 | *.GhostDoc.xml 234 | 235 | # Node.js Tools for Visual Studio 236 | .ntvs_analysis.dat 237 | node_modules/ 238 | 239 | # Typescript v1 declaration files 240 | typings/ 241 | 242 | # Visual Studio 6 build log 243 | *.plg 244 | 245 | # Visual Studio 6 workspace options file 246 | *.opt 247 | 248 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
249 | *.vbw 250 | 251 | # Visual Studio LightSwitch build output 252 | **/*.HTMLClient/GeneratedArtifacts 253 | **/*.DesktopClient/GeneratedArtifacts 254 | **/*.DesktopClient/ModelManifest.xml 255 | **/*.Server/GeneratedArtifacts 256 | **/*.Server/ModelManifest.xml 257 | _Pvt_Extensions 258 | 259 | # Paket dependency manager 260 | .paket/paket.exe 261 | paket-files/ 262 | 263 | # FAKE - F# Make 264 | .fake/ 265 | 266 | # JetBrains Rider 267 | .idea/ 268 | *.sln.iml 269 | 270 | # CodeRush 271 | .cr/ 272 | 273 | # Python Tools for Visual Studio (PTVS) 274 | __pycache__/ 275 | *.pyc 276 | 277 | # Cake - Uncomment if you are using it 278 | # tools/** 279 | # !tools/packages.config 280 | 281 | # Telerik's JustMock configuration file 282 | *.jmconfig 283 | 284 | # BizTalk build output 285 | *.btp.cs 286 | *.btm.cs 287 | *.odx.cs 288 | *.xsd.cs 289 | -------------------------------------------------------------------------------- /recipes/utilities.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import json 4 | import os 5 | import time 6 | 7 | import azure.mgmt.batchai as training 8 | import azure.mgmt.batchai.models as models 9 | import requests 10 | from azure.common.credentials import ServicePrincipalCredentials 11 | from azure.mgmt.resource import ResourceManagementClient 12 | 13 | POLLING_INTERVAL_SEC = 5 14 | 15 | 16 | def encode(value): 17 | if isinstance(value, type('str')): 18 | return value 19 | return value.encode('utf-8') 20 | 21 | 22 | class Configuration: 23 | """Configuration for recipes and notebooks""" 24 | 25 | def __init__(self, file_name): 26 | if not os.path.exists(file_name): 27 | raise ValueError('Cannot find configuration file "{0}"'. 28 | format(file_name)) 29 | 30 | with open(file_name, 'r') as f: 31 | conf = json.load(f) 32 | 33 | try: 34 | self.subscription_id = encode(conf['subscription_id']) 35 | self.aad_client_id = encode(conf['aad_client_id']) 36 | self.aad_secret_key = encode(conf['aad_secret']) 37 | self.aad_token_uri = 'https://login.microsoftonline.com/{0}/oauth2/token'.format(encode(conf['aad_tenant'])) 38 | self.location = encode(conf['location']) 39 | self.url = encode(conf['base_url']) 40 | self.resource_group = encode(conf['resource_group']) 41 | self.storage_account_name = encode(conf['storage_account']['name']) 42 | self.storage_account_key = encode(conf['storage_account']['key']) 43 | self.admin = encode(conf['admin_user']['name']) 44 | self.admin_password = conf['admin_user'].get('password', None) 45 | if self.admin_password: 46 | self.admin_password = encode(self.admin_password) 47 | self.admin_ssh_key = conf['admin_user'].get('ssh_public_key', None) 48 | if self.admin_ssh_key: 49 | self.admin_ssh_key = encode(self.admin_ssh_key) 50 | if not self.admin_password and not self.admin_ssh_key: 51 | raise AttributeError( 52 | 'Please provide admin user password or public ssh key') 53 | except KeyError as err: 54 | raise AttributeError( 55 | 'Please provide a value for "{0}" configuration key'.format( 56 | err.args[0])) 57 | 58 | 59 | class OutputStreamer: 60 | """Helper class to stream (tail -f) job's output files.""" 61 | 62 | def __init__(self, client, resource_group, job_name, output_directory_id, 63 | file_name): 64 | self.client = client 65 | self.resource_group = resource_group 66 | self.job_name = job_name 67 | self.output_directory_id = output_directory_id 68 | self.file_name = file_name 69 | self.url = None 70 | self.downloaded = 0 71 | # if no 
output_directory_id or file_name is specified, the tail call is
72 |         # a no-op
73 |         if self.output_directory_id is None or self.file_name is None:
74 |             self.tail = lambda: None
75 | 
76 |     def tail(self):
77 |         if not self.url:
78 |             files = self.client.jobs.list_output_files(
79 |                 self.resource_group, self.job_name,
80 |                 models.JobsListOutputFilesOptions(
81 |                     self.output_directory_id))
82 |             if not files:
83 |                 return
84 |             else:
85 |                 for f in list(files):
86 |                     if f.name == self.file_name:
87 |                         self.url = f.download_url
88 |         if self.url:
89 |             r = requests.get(self.url, headers={
90 |                 'Range': 'bytes={0}-'.format(self.downloaded)})
91 |             if int(r.status_code / 100) == 2:
92 |                 self.downloaded += len(r.content)
93 |                 print(r.content.decode(), end='')
94 | 
95 | 
96 | def create_batchai_client(configuration):
97 |     client = training.BatchAIManagementClient(
98 |         credentials = ServicePrincipalCredentials(client_id=configuration.aad_client_id, secret=configuration.aad_secret_key, token_uri=configuration.aad_token_uri),
99 |         subscription_id = configuration.subscription_id,
100 |         base_url = configuration.url)
101 |     return client
102 | 
103 | 
104 | def create_resource_group(configuration):
105 |     client = ResourceManagementClient(
106 |         credentials = ServicePrincipalCredentials(client_id=configuration.aad_client_id, secret=configuration.aad_secret_key, token_uri=configuration.aad_token_uri),
107 |         subscription_id = configuration.subscription_id, base_url = configuration.url)
108 |     resource = client.resource_groups.create_or_update(configuration.resource_group, {'location': configuration.location})
109 | 
110 | 
111 | def download_file(sas, destination):
112 |     dir_name = os.path.dirname(destination)
113 |     if dir_name:
114 |         os.makedirs(dir_name, exist_ok=True)
115 |     print('Downloading {0} ...'.format(sas), end='')
116 |     r = requests.get(sas, stream=True)
117 |     with open(destination, 'wb') as f:
118 |         for chunk in r.iter_content(chunk_size=512 * 1024):
119 |             if chunk:  # filter out keep-alive new chunks
120 |                 f.write(chunk)
121 |     f.close()
122 |     print('Done')
123 | 
124 | 
125 | def print_job_status(job):
126 |     failure_message = None
127 |     exit_code = 'None'
128 |     if job.execution_info is not None:
129 |         exit_code = job.execution_info.exit_code
130 |     if job.execution_state == models.ExecutionState.failed:
131 |         for error in job.execution_info.errors:
132 |             failure_message = \
133 |                 '\nErrorCode:{0}\nErrorMessage:{1}\n'. \
134 |                 format(error.code,
135 |                        error.message)
136 |             if error.details is not None:
137 |                 failure_message += 'Details:\n'
138 |                 for detail in error.details:
139 |                     failure_message += '{0}:{1}\n'.format(detail.name,
140 |                                                           detail.value)
141 |     print('Job state: {0} ExitCode: {1}'.format(job.execution_state.name,
142 |                                                 exit_code))
143 |     if failure_message:
144 |         print('FailureDetails: {0}'.format(failure_message))
145 | 
146 | 
147 | def print_cluster_status(cluster):
148 |     print(
149 |         'Cluster state: {0} Target: {1}; Allocated: {2}; Idle: {3}; '
150 |         'Unusable: {4}; Running: {5}; Preparing: {6}; Leaving: {7}'.format(
151 |             cluster.allocation_state,
152 |             cluster.scale_settings.manual.target_node_count,
153 |             cluster.current_node_count,
154 |             cluster.node_state_counts.idle_node_count,
155 |             cluster.node_state_counts.unusable_node_count,
156 |             cluster.node_state_counts.running_node_count,
157 |             cluster.node_state_counts.preparing_node_count,
158 |             cluster.node_state_counts.leaving_node_count))
159 |     if not cluster.errors:
160 |         return
161 |     for error in cluster.errors:
162 |         print('Cluster error: {0}: {1}'.format(error.code, error.message))
163 |         if error.details:
164 |             print('Details:')
165 |             for detail in error.details:
166 |                 print('{0}: {1}'.format(detail.name, detail.value))
167 | 
168 | 
169 | def wait_for_job_completion(client, resource_group, job_name, cluster_name,
170 |                             output_directory_id=None, file_name=None):
171 |     """
172 |     Waits for job completion and tails a file specified by output_directory_id
173 |     and file_name.
174 |     """
175 |     # Wait for job to start running
176 |     while True:
177 |         cluster = client.clusters.get(resource_group, cluster_name)
178 |         print_cluster_status(cluster)
179 |         job = client.jobs.get(resource_group, job_name)
180 |         print_job_status(job)
181 |         if job.execution_state != models.ExecutionState.queued:
182 |             break
183 |         time.sleep(POLLING_INTERVAL_SEC)
184 | 
185 |     print('Waiting for job output to become available...')
186 | 
187 |     # Tail the output file and wait for job to complete
188 |     streamer = OutputStreamer(client, resource_group, job_name,
189 |                               output_directory_id, file_name)
190 |     while True:
191 |         streamer.tail()
192 |         job = client.jobs.get(resource_group, job_name)
193 |         if job.execution_state == models.ExecutionState.succeeded or job.execution_state == models.ExecutionState.failed:
194 |             break
195 |         time.sleep(1)
196 |     streamer.tail()
197 |     print_job_status(job)
--------------------------------------------------------------------------------
/recipes/Readme.md:
--------------------------------------------------------------------------------
1 | # Getting Started with the Recipes
2 | 
3 | ## Prerequisites
4 | 
5 | 1. An Azure subscription. This can be a free trial subscription, MSDN, or the one you use for other work.
6 | 2. The Azure Python SDK and azure-mgmt-batchai, if you would like to run recipes using the Python Jupyter notebooks. See how to install the [Azure SDK](https://docs.microsoft.com/en-us/python/azure/python-sdk-azure-install?view=azure-python).
7 | 3. Azure CLI 2.0, if you would like to run recipes using the Azure CLI. See [Install Azure CLI 2.0](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest#install-on-windows) for instructions.
8 | 4. An Azure Storage Account in East US (required for all recipes). See [How to create Azure storage accounts](https://docs.microsoft.com/en-us/azure/storage/common/storage-create-storage-account?toc=%2fazure%2fstorage%2ffiles%2ftoc.json)
9 | 
10 | ## Make a Local Copy of Repo
11 | 
12 | To start, please clone or download this [repo](https://github.com/Azure/BatchAI).
13 | 
14 | ## Recipe Instructions
15 | 
16 | Use the following links for quick navigation:
17 | 
18 | 1. [Run Recipes Using Python Jupyter notebook](#jupyternotebook)
19 | 2. [Run Recipes Using Azure CLI 2.0](#azurecli)
20 | 
21 | ## Run Recipes Using Python Jupyter notebook
22 | 
23 | ### Create Credentials for Service Principal Authentication
24 | Jupyter notebook recipes require you to use service principal authentication rather than providing your account credentials.
25 | There are several ways to create a Service Principal, as described in the following sections:
26 | 
27 | #### Using Azure CLI 2.0
28 | 1. Log in to Azure CLI 2.0
29 | 2. Execute the following command
30 | ```sh
31 | $ az ad sp create-for-rbac
32 | ```
33 | Example output:
34 | ```
35 | {
36 |   "appId": "...",
37 |   "displayName": "azure-cli-2017-10-27-18-45-51",
38 |   "name": "http://azure-cli-2017-10-27-18-45-51",
39 |   "password": "...",
40 |   "tenant": "..."
41 | }
42 | ```
43 | Use the appId value as aad_client_id, password as aad_secret and tenant as aad_tenant when creating the configuration file later.
44 | 
45 | #### Using Portal
46 | 1. Log in to your Azure Account through the [Azure portal](https://portal.azure.com/).
47 | 2. Select *Azure Active Directory*.
48 | 3. To get the AAD tenant ID, select *Properties* and copy the *Directory ID*. This value is your **AAD tenant ID**.
49 | 4. Go back to *Azure Active Directory* and select *App registrations*.
50 | 5. Select *New application registration*.
51 | 6. Provide a name and URL for the application. After setting the values, select *Create*.
52 | 7. From *App registrations* in *Azure Active Directory*, select your application.
53 | 8. Copy the *Application ID*; this is your **AAD Client ID**.
54 | 9. To generate an authentication key, select *Keys*.
55 | 10. Provide a description and a duration for the key. When done, select *Save*. After saving the key, the value of the key is displayed. Copy this value because you will not be able to retrieve the key later. This is your **AAD Secret**.
56 | 11. To assign the newly created application, select the subscription you are going to use for Azure Batch AI. (You can find it from *More Services* -> *Subscriptions*)
57 | 12. Select *Access control (IAM)*
58 | 13. Select *Add*
59 | 14. Select *Contributor* as the *role*
60 | 15. Search for your application and select it.
61 | 16. Select *Save* to finish assigning the role. You will see your application in the list of users assigned to a role for that scope.
62 | 
63 | For a more detailed walk-through, please see [this link](https://docs.microsoft.com/en-us/azure/azure-resource-manager/resource-group-create-service-principal-portal).
64 | 
65 | ### Register BatchAI Resource Providers
66 | 1. Log in to your Azure Account through the [Azure portal](https://portal.azure.com/).
67 | 2. Select the subscription you are going to use for Azure Batch AI. (You can find it from *More Services* -> *Subscriptions*)
68 | 3. Select *Resource providers*
69 | 4. Register the **Microsoft.BatchAI** and **Microsoft.Batch** providers.
70 | 
71 | Note, a provider registration can take up to 15 minutes.
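If you prefer to script this step, the same azure-mgmt-resource client that utilities.py uses can register the providers. A minimal sketch; the angle-bracket values are placeholders for your own configuration values:

```python
# Registers the Batch AI resource providers programmatically; a sketch that
# assumes the AAD values below come from your configuration.json.
import time
from azure.common.credentials import ServicePrincipalCredentials
from azure.mgmt.resource import ResourceManagementClient

credentials = ServicePrincipalCredentials(
    client_id='<aad_client_id>', secret='<aad_secret>',
    token_uri='https://login.microsoftonline.com/<aad_tenant>/oauth2/token')
client = ResourceManagementClient(credentials, '<subscription_id>')

for namespace in ('Microsoft.BatchAI', 'Microsoft.Batch'):
    client.providers.register(namespace)
    # Poll until the provider reports 'Registered' (can take up to 15 minutes).
    while client.providers.get(namespace).registration_state != 'Registered':
        time.sleep(15)
```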
72 | 
73 | ### Grant Batch AI Network Contributor Role on Your Subscription
74 | You can use two different approaches:
75 | 
76 | #### Using Azure CLI 2.0
77 | ```sh
78 | az role assignment create --scope /subscriptions/<your subscription id> --role "Network Contributor" --assignee 9fcb3732-5f52-4135-8c08-9d4bbaf203ea
79 | ```
80 | Here `9fcb3732-5f52-4135-8c08-9d4bbaf203ea` is the service principal of Microsoft Azure BatchAI.
81 | 
82 | #### Using Portal
83 | 1. Select the subscription you are going to use for Azure Batch AI. (You can find it from *More Services* -> *Subscriptions*)
84 | 2. Select *Access control (IAM)*
85 | 3. Select *Add*
86 | 4. Select *Network Contributor* as the *role*
87 | 5. Search for the 'Microsoft Azure BatchAI' application and select it.
88 | 6. Select *Save* to finish assigning the role.
89 | 
90 | ### Create Configuration File for All Recipes
91 | 
92 | - Rename [configuration.json.template](/recipes/configuration.json.template) to configuration.json.
93 | - Fill in your subscription id and the AAD application information obtained in the step above.
94 | - Leave the "base_url" field empty.
95 | - Specify the name of your resource group; the recipes will automatically create the resource group if it does not exist.
96 | - Specify your Azure Storage account name and key; please see [this page](https://docs.microsoft.com/en-us/azure/storage/common/storage-create-storage-account?toc=%2fazure%2fstorage%2ffiles%2ftoc.json).
97 | - Batch AI creates an administrator user account on every compute node and enables ssh. You need to specify a user name and at least a password or an ssh public key for this account.
98 | 
99 | ### Helper functions in utilities.py
100 | 
101 | For your convenience, we provide a collection of helper functions in [utilities.py](./utilities.py) used by each recipe to:
102 | 
103 | - Read parameters from the configuration file
104 | - Create a Python client object (BatchAIManagementClient) to access the Azure Batch AI service
105 | - Create/Update a resource group
106 | - Download a file with a given shared access signature (SAS)
107 | - Print Job/Cluster status
108 | - Stream files
109 | 
110 | ### Install Azure Batch AI Management Client
111 | 
112 | Install the Batch AI management client using the following command:
113 | 
114 | ```sh
115 | pip install azure-mgmt-batchai
116 | ```
117 | 
118 | ### Install Azure Python SDK
119 | 
120 | Since all recipes utilize APIs from other Azure products (e.g., Azure storage, credentials), it is also required to install the full Azure Python SDK package:
121 | ```sh
122 | pip install azure
123 | ```
124 | 
125 | ### Install Jupyter Notebook
126 | 
127 | Please install Jupyter Notebook from https://jupyter.org/ or run
128 | 
129 | ```sh
130 | python -m pip install jupyter
131 | ```
132 | 
133 | ### Start to Run Recipes
134 | 
135 | - Navigate to the root of your cloned recipes directory
136 | ```sh
137 | cd /BatchAI/recipes
138 | ```
139 | 
140 | - Launch the Jupyter Notebook by
141 | ```sh
142 | jupyter notebook
143 | ```
144 | 
145 | - In the opened browser, navigate into the recipe of interest and start the *.ipynb file.
146 | 
147 | 
148 | ## Run Recipes Using Azure CLI 2.0
149 | 
150 | ### Install and Configure Azure CLI 2.0
151 | 
152 | Please follow the Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) to install and
153 | configure Azure CLI 2.0 for use with Batch AI.
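Whether you drive the recipes from the CLI or from notebooks, the helper functions described above bootstrap everything from the configuration file. A minimal sketch, assuming configuration.json has been filled in and the script runs from the /recipes folder where utilities.py lives:

```python
# Minimal bootstrap used by the notebook recipes; assumes configuration.json
# is filled in and utilities.py is importable from the current folder.
from utilities import Configuration, create_batchai_client, create_resource_group

cfg = Configuration('configuration.json')  # read subscription/AAD/storage settings
create_resource_group(cfg)                 # create the resource group if it does not exist
client = create_batchai_client(cfg)        # BatchAIManagementClient for clusters and jobs
```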
154 | 
155 | ### Generate Authentication Key for SSH (for Cloud Shell and GNU/Linux Users)
156 | 
157 | During cluster and file server creation you will need to specify a name and authentication method for the administrator account which will be created on each compute node (you can use this account to ssh to the node).
158 | 
159 | You can provide a password and/or an ssh public key as the authentication method via the --password (-p) and --ssh-public-key (-k) parameters.
160 | 
161 | GNU/Linux users (including Cloud Shell users) can generate an authentication key for ssh using the `ssh-keygen` command.
162 | 
163 | Note, the GNU/Linux part of the recipes expects you to have a public ssh key at ~/.ssh/id_rsa.pub; if you prefer to use a different ssh key, please update the -k parameter value.
164 | 
165 | ### Install unzip package (for GNU/Linux Users)
166 | 
167 | Training data used in the recipes is compressed in `zip` archives and requires the `unzip` utility to be installed on the host; please install it using your distribution's package manager.
168 | 
169 | Cloud Shell has `unzip` already installed.
170 | 
171 | ### Run Recipes
172 | 
173 | Each recipe contains a `cli-instructions.md` file which describes the input data, cluster and job configuration, and provides instructions for cluster and job creation.
174 | 
175 | ## Help or Feedback
176 | --------------------
177 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
178 | 
179 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
180 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-Python-Distributed/ConvNet_CIFAR10_DataAug.py:
--------------------------------------------------------------------------------
1 | # ==============================================================================
2 | # Copyright (c) Microsoft. All rights reserved.
3 | # Licensed under the MIT license. See LICENSE.md file in the project root
4 | # for full license information.
5 | # ============================================================================== 6 | 7 | from __future__ import print_function 8 | import os 9 | import math 10 | import numpy as np 11 | import cntk 12 | import _cntk_py 13 | import cntk.io.transforms as xforms 14 | 15 | from cntk.layers import Convolution2D, MaxPooling, AveragePooling, Dropout, BatchNormalization, Dense, default_options, identity, Sequential, For 16 | from cntk.layers.typing import * 17 | from cntk.io import MinibatchSource, ImageDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT 18 | from cntk import Trainer, use_default_device 19 | from cntk.learners import momentum_sgd, learning_rate_schedule, UnitType, momentum_as_time_constant_schedule 20 | from cntk import cross_entropy_with_softmax, classification_error, relu 21 | from cntk.ops import Function 22 | from cntk.debugging import set_computation_network_trace_level 23 | from cntk.logging import * 24 | 25 | ######################## 26 | # variables and paths # 27 | ######################## 28 | 29 | # paths (are relative to current python file) 30 | abs_path = os.path.dirname(os.path.abspath(__file__)) 31 | data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "CIFAR-10") 32 | model_path = os.path.join(abs_path, "Models") 33 | 34 | # model dimensions 35 | image_height = 32 36 | image_width = 32 37 | num_channels = 3 # RGB 38 | num_classes = 10 39 | 40 | ######################## 41 | # define the reader # 42 | ######################## 43 | 44 | def create_reader(map_file, mean_file, is_training): 45 | if not os.path.exists(map_file) or not os.path.exists(mean_file): 46 | raise RuntimeError("File '%s' or '%s' does not exist. Please run install_cifar10.py from DataSets/CIFAR-10 to fetch them" % 47 | (map_file, mean_file)) 48 | 49 | # transformation pipeline for the features has jitter/crop only when training 50 | transforms = [] 51 | if is_training: 52 | transforms += [ 53 | xforms.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter 54 | ] 55 | transforms += [ 56 | xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'), 57 | xforms.mean(mean_file) 58 | ] 59 | # deserializer 60 | return MinibatchSource(ImageDeserializer(map_file, StreamDefs( 61 | features=StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image' 62 | labels=StreamDef(field='label', shape=num_classes))), # and second as 'label' 63 | randomize=is_training, max_sweeps = INFINITELY_REPEAT if is_training else 1) 64 | 65 | ######################## 66 | # define the model # 67 | ######################## 68 | 69 | def create_convnet_cifar10_model(num_classes): 70 | with default_options(activation=relu, pad=True): 71 | return Sequential([ 72 | For(range(2), lambda : [ 73 | Convolution2D((3,3), 64), 74 | Convolution2D((3,3), 64), 75 | MaxPooling((3,3), strides=2) 76 | ]), 77 | For(range(2), lambda i: [ 78 | Dense([256,128][i]), 79 | Dropout(0.5) 80 | ]), 81 | Dense(num_classes, activation=None) 82 | ]) 83 | 84 | ######################## 85 | # define the criteria # 86 | ######################## 87 | 88 | # compose model function and criterion primitives into a criterion function 89 | # takes: Function: features -> prediction 90 | # returns: Function: (features, labels) -> (loss, metric) 91 | def create_criterion_function(model, normalize=identity): 92 | #@Function # Python 3 93 | #def criterion(x: Tensor[(num_channels, image_height, image_width)], y: Tensor[num_classes]): 94 | 
@Function 95 | @Signature(x = Tensor[(num_channels, image_height, image_width)], y = Tensor[num_classes]) 96 | def criterion(x, y): 97 | z = model(normalize(x)) 98 | ce = cross_entropy_with_softmax(z, y) 99 | errs = classification_error (z, y) 100 | return (ce, errs) 101 | return criterion 102 | 103 | ######################## 104 | # train action # 105 | ######################## 106 | 107 | def train_model(reader, model, criterion, epoch_size=50000, max_epochs=80): 108 | minibatch_size = 64 109 | 110 | # learning parameters 111 | learner = momentum_sgd(model.parameters, 112 | lr = learning_rate_schedule([0.0015625]*20+[0.00046875]*20+[0.00015625]*20+[0.000046875]*10+[0.000015625], minibatch_size=1, epoch_size=epoch_size), 113 | momentum = momentum_as_time_constant_schedule([0]*20+[600]*20+[1200], epoch_size=epoch_size), 114 | l2_regularization_weight = 0.002) 115 | 116 | # trainer object 117 | trainer = Trainer(None, criterion, learner) 118 | 119 | # perform model training 120 | log_number_of_parameters(model) ; print() 121 | progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs) 122 | 123 | for epoch in range(max_epochs): # loop over epochs 124 | sample_count = 0 125 | while sample_count < epoch_size: # loop over minibatches in the epoch 126 | mb = reader.next_minibatch(min(minibatch_size, epoch_size - sample_count)) # fetch minibatch. 127 | #trainer.train_minibatch(mb[reader.streams.features], mb[reader.streams.labels]) 128 | trainer.train_minibatch({criterion.arguments[0]: mb[reader.streams.features], criterion.arguments[1]: mb[reader.streams.labels]}) 129 | sample_count += mb[reader.streams.labels].num_samples # count samples processed so far 130 | progress_printer.update_with_trainer(trainer, with_metric=True) # log progress 131 | 132 | loss, metric, actual_samples = progress_printer.epoch_summary(with_metric=True) 133 | model.save(os.path.join(model_path, "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch))) 134 | 135 | # return evaluation error. 
136 |     return loss, metric  # return values from last epoch
137 | 
138 | ########################
139 | # eval action          #
140 | ########################
141 | 
142 | # helper function to create a dummy Trainer that one can call test_minibatch() on
143 | # TODO: replace by a proper such class once available
144 | def Evaluator(criterion):
145 |     loss, metric = Trainer._get_loss_metric(criterion)
146 |     parameters = set(loss.parameters)
147 |     if metric:
148 |         parameters |= set(metric.parameters)
149 |     dummy_learner = momentum_sgd(tuple(parameters),
150 |                                  lr = learning_rate_schedule(1, UnitType.minibatch),
151 |                                  momentum = momentum_as_time_constant_schedule(0))
152 |     return Trainer(None, (loss, metric), dummy_learner)
153 | 
154 | def evaluate(reader, criterion, device=None, minibatch_size=16, max_samples=None):
155 | 
156 |     # process minibatches and perform evaluation
157 |     if not device:
158 |         device = use_default_device()
159 | 
160 |     evaluator = Evaluator(criterion)
161 |     progress_printer = ProgressPrinter(tag='Evaluation', num_epochs=1)
162 | 
163 |     samples_evaluated = 0
164 |     while True:
165 |         if (max_samples and samples_evaluated >= max_samples):
166 |             break
167 | 
168 |         # Fetch minibatches until we hit the end
169 |         mb = reader.next_minibatch(minibatch_size)
170 |         if not mb:
171 |             break
172 | 
173 |         metric = evaluator.test_minibatch({criterion.arguments[0]: mb[reader.streams.features], criterion.arguments[1]: mb[reader.streams.labels]}, device=device)
174 |         samples_evaluated += minibatch_size
175 |         progress_printer.update(0, mb[reader.streams.labels].num_samples, metric)  # log progress
176 | 
177 |     loss, metric, actual_samples = progress_printer.epoch_summary(with_metric=True)
178 |     return loss, metric
179 | 
180 | #############################
181 | # main function boilerplate #
182 | #############################
183 | 
184 | if __name__=='__main__':
185 |     # create model
186 |     model = create_convnet_cifar10_model(num_classes=10)
187 |     # declare the model's input dimension
188 |     # Training does not require this, but it is needed for deployment.
189 |     model.update_signature((num_channels, image_height, image_width))
190 | 
191 |     # criterion function. This is what is being trained.
192 |     # Model gets "sandwiched" between normalization (not part of model proper) and criterion.
193 | criterion = create_criterion_function(model, normalize=lambda x: x / 256) 194 | 195 | # train 196 | reader = create_reader(os.path.join(data_path, 'train_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), True) 197 | train_model(reader, model, criterion, max_epochs=80) 198 | 199 | # save and load (as an illustration) 200 | path = data_path + "/model.cmf" 201 | model.save(path) 202 | 203 | # test 204 | model = Function.load(path) 205 | reader = create_reader(os.path.join(data_path, 'test_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), False) 206 | criterion = create_criterion_function(model, normalize=lambda x: x / 256) 207 | evaluate(reader, criterion) 208 | -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-Python-Distrbuted-Infiniband/dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04 2 | 3 | # install base system 4 | RUN apt-get update && apt-get install -y --no-install-recommends \ 5 | autotools-dev \ 6 | build-essential \ 7 | cmake \ 8 | git \ 9 | gfortran-multilib \ 10 | libavcodec-dev \ 11 | libavformat-dev \ 12 | libjasper-dev \ 13 | libjpeg-dev \ 14 | libpng-dev \ 15 | liblapacke-dev \ 16 | libswscale-dev \ 17 | libtiff-dev \ 18 | pkg-config \ 19 | wget \ 20 | zlib1g-dev \ 21 | # Protobuf 22 | ca-certificates \ 23 | curl \ 24 | unzip \ 25 | # For Kaldi 26 | python-dev \ 27 | automake \ 28 | libtool \ 29 | autoconf \ 30 | subversion \ 31 | # For Kaldi's dependencies 32 | libapr1 libaprutil1 libltdl-dev libltdl7 libserf-1-1 libsigsegv2 libsvn1 m4 \ 33 | # For Java Bindings 34 | openjdk-9-jdk-headless \ 35 | # For SWIG 36 | libpcre3-dev \ 37 | libpcre++-dev && \ 38 | apt-get install -y --no-install-recommends \ 39 | # Infiniband/RDMA 40 | cpio \ 41 | libmlx4-1 \ 42 | libmlx5-1 \ 43 | librdmacm1 \ 44 | libibverbs1 \ 45 | libmthca1 \ 46 | libdapl2 \ 47 | dapl2-utils 48 | 49 | # build and install libzip, cub, boost, openblas, opencv, protobuf 50 | RUN LIBZIP_VERSION=1.1.3 && \ 51 | wget -q -O - http://nih.at/libzip/libzip-${LIBZIP_VERSION}.tar.gz | tar -xzf - && \ 52 | cd libzip-${LIBZIP_VERSION} && \ 53 | ./configure --prefix=/usr/local && \ 54 | make -j"$(nproc)" install && \ 55 | ldconfig /usr/local/lib && \ 56 | cd .. && \ 57 | rm -rf /libzip-${LIBZIP_VERSION} && \ 58 | # boost 59 | BOOST_VERSION=1_60_0 && \ 60 | BOOST_DOTTED_VERSION=$(echo $BOOST_VERSION | tr _ .) && \ 61 | wget -q -O - https://sourceforge.net/projects/boost/files/boost/${BOOST_DOTTED_VERSION}/boost_${BOOST_VERSION}.tar.gz/download | tar -xzf - && \ 62 | cd boost_${BOOST_VERSION} && \ 63 | ./bootstrap.sh --prefix=/usr/local --with-libraries=filesystem,system,test && \ 64 | ./b2 -d0 -j"$(nproc)" install && \ 65 | ldconfig /usr/local/lib && \ 66 | cd .. && \ 67 | rm -rf /boost_${BOOST_VERSION} && \ 68 | # cub 69 | wget -q -O - https://github.com/NVlabs/cub/archive/1.4.1.tar.gz | tar -C /usr/local -xzf - && \ 70 | # openblas 71 | OPENBLAS_VERSION=0.2.19 && \ 72 | wget -q -O - https://github.com/xianyi/OpenBLAS/archive/v${OPENBLAS_VERSION}.tar.gz | tar -xzf - && \ 73 | cd OpenBLAS-${OPENBLAS_VERSION} && \ 74 | make -j"$(nproc)" USE_OPENMP=1 | tee make.log && \ 75 | grep -qF 'OpenBLAS build complete. (BLAS CBLAS LAPACK LAPACKE)' make.log && \ 76 | grep -qF 'Use OpenMP in the multithreading.' make.log && \ 77 | make PREFIX=/usr/local/openblas install && \ 78 | ldconfig /usr/local/openblas && \ 79 | cd .. 
&& \ 80 | rm -rf /OpenBLAS-${OPENBLAS_VERSION} && \ 81 | # opencv 82 | OPENCV_VERSION=3.1.0 && \ 83 | wget -q -O - https://github.com/opencv/opencv/archive/${OPENCV_VERSION}.tar.gz | tar -xzf - && \ 84 | cd opencv-${OPENCV_VERSION} && \ 85 | cmake -DWITH_CUDA=OFF -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_INSTALL_PREFIX=/usr/local/opencv-${OPENCV_VERSION} . && \ 86 | make -j"$(nproc)" install && \ 87 | ldconfig /usr/local/lib && \ 88 | cd .. && \ 89 | rm -rf /opencv-${OPENCV_VERSION} && \ 90 | # protocol buffers 91 | PROTOBUF_VERSION=3.1.0 \ 92 | PROTOBUF_STRING=protobuf-$PROTOBUF_VERSION && \ 93 | wget -O - --no-verbose https://github.com/google/protobuf/archive/v${PROTOBUF_VERSION}.tar.gz | tar -xzf - && \ 94 | cd $PROTOBUF_STRING && \ 95 | ./autogen.sh && \ 96 | ./configure CFLAGS=-fPIC CXXFLAGS=-fPIC --disable-shared --prefix=/usr/local/$PROTOBUF_STRING && \ 97 | make -j $(nproc) install && \ 98 | cd .. && \ 99 | rm -rf $PROTOBUF_STRING 100 | 101 | # set env vars 102 | ENV KALDI_VERSION=c024e8aa 103 | ENV MKLML_VERSION=mklml_lnx_2018.0.1.20171007 104 | ENV PATH=/root/anaconda3/envs/cntk-py36/bin:/usr/local/bin:/cntk/build-mkl/gpu/release/bin:${PATH} \ 105 | KALDI_PATH=/usr/local/kaldi-$KALDI_VERSION \ 106 | BLAS=/usr/local/openblas/lib/libopenblas.so \ 107 | LAPACK=/usr/local/openblas/lib/libopenblas.so \ 108 | MKL_PATH=/usr/local/CNTKCustomMKL \ 109 | PYTHONPATH=/cntk/bindings/python:$PYTHONPATH \ 110 | LD_LIBRARY_PATH=/usr/local/openblas/lib:/cntk/bindings/python/cntk/libs:$LD_LIBRARY_PATH 111 | 112 | # install cntk custom mkl, kaldi, swig and anaconda 113 | RUN mkdir ${MKL_PATH} && \ 114 | wget --no-verbose -O - https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VERSION}.tgz | \ 115 | tar -xzf - -C ${MKL_PATH} && \ 116 | # kaldi 117 | mkdir $KALDI_PATH && \ 118 | wget --no-verbose -O - https://github.com/kaldi-asr/kaldi/archive/$KALDI_VERSION.tar.gz | tar -xzf - --strip-components=1 -C $KALDI_PATH && \ 119 | cd $KALDI_PATH/tools && \ 120 | perl -pi -e 's/^# (OPENFST_VERSION = 1.4.1)$/\1/' Makefile && \ 121 | #/bin/bash extras/check_dependencies.sh && \ 122 | #make -j $(nproc) all && \ 123 | make -j $(nproc) sph2pipe atlas sclite openfst && \ 124 | cd ../src && \ 125 | ./configure --openblas-root=/usr/local/openblas --shared && \ 126 | make -j $(nproc) depend && \ 127 | make -j $(nproc) all && \ 128 | find $KALDI_PATH -name '*.o' -print0 | xargs -0 rm && \ 129 | for dir in $KALDI_PATH/src/*bin; do make -C $dir clean; done && \ 130 | # SWIG 131 | SWIG_VERSION=3.0.10 && \ 132 | cd /root && \ 133 | wget -q http://prdownloads.sourceforge.net/swig/swig-${SWIG_VERSION}.tar.gz -O - | tar xvfz - && \ 134 | cd swig-${SWIG_VERSION} && \ 135 | ./configure --without-alllang && \ 136 | make -j$(nproc) && \ 137 | make install && \ 138 | cd .. 
&& \ 139 | rm -rf swig-${SWIG_VERSION} && \ 140 | # Anaconda 141 | wget -q https://repo.continuum.io/archive/Anaconda3-4.4.0-Linux-x86_64.sh && \ 142 | bash Anaconda3-4.4.0-Linux-x86_64.sh -b && \ 143 | rm -f Anaconda3-4.4.0-Linux-x86_64.sh && \ 144 | # set paths for CNTK 145 | mkdir -p /usr/local/cudnn/cuda/include && \ 146 | ln -s /usr/include/cudnn.h /usr/local/cudnn/cuda/include/cudnn.h && \ 147 | mkdir -p /usr/local/cudnn/cuda/lib64 && \ 148 | ln -s /etc/alternatives/libcudnn_so /usr/local/cudnn/cuda/lib64/libcudnn.so && \ 149 | ln -s /usr/local/cuda/lib64/stubs/libnvidia-ml.so /usr/local/cuda/lib64/stubs/libnvidia-ml.so.1 && \ 150 | # update ldconfig 151 | ldconfig /usr/local/lib 152 | 153 | # set cntk dir 154 | WORKDIR /cntk 155 | 156 | # add intel mpi library and build cntk 157 | ENV MANPATH=/usr/share/man:/usr/local/man \ 158 | COMPILERVARS_ARCHITECTURE=intel64 \ 159 | COMPILERVARS_PLATFORM=linux \ 160 | INTEL_MPI_PATH=/opt/intel/compilers_and_libraries/linux/mpi 161 | RUN cd /tmp && \ 162 | wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz' && \ 163 | tar zxvf l_mpi_2017.3.196.tgz && \ 164 | sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg && \ 165 | sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' /tmp/l_mpi_2017.3.196/silent.cfg && \ 166 | sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg && \ 167 | cd /tmp/l_mpi_2017.3.196 && \ 168 | ./install.sh -s silent.cfg && \ 169 | cd .. 170 | 171 | # cntk makefiles use non-standard mpic++, symlink to mpicxx 172 | RUN ln -s ${INTEL_MPI_PATH}/${COMPILERVARS_ARCHITECTURE}/bin/mpicxx ${INTEL_MPI_PATH}/${COMPILERVARS_ARCHITECTURE}/bin/mpic++ && \ 173 | # build cntk 174 | CNTK_VERSION=v2.3 && \ 175 | cd /cntk && \ 176 | git clone --depth=1 --recursive -b ${CNTK_VERSION} --single-branch https://github.com/Microsoft/CNTK.git . 177 | 178 | # add cast in /cntk/Source/CNTKv2LibraryDll/Trainer.cpp to prevent build issue 179 | RUN sed -i 's|, unit)|, (int)unit)|g' /cntk/Source/CNTKv2LibraryDll/Trainer.cpp 180 | 181 | # set Anaconda environment 182 | RUN /root/anaconda3/bin/conda env create -p /root/anaconda3/envs/cntk-py36/ \ 183 | --file /cntk/Scripts/install/linux/conda-linux-cntk-py36-environment.yml && \ 184 | # source intel mpi vars 185 | . /opt/intel/bin/compilervars.sh && \ 186 | . 
/opt/intel/compilers_and_libraries/linux/mpi/bin64/mpivars.sh && \ 187 | # build gpu-mkl only 188 | CONFIGURE_OPTS="\ 189 | --1bitsgd=yes \ 190 | --with-mpi=${INTEL_MPI_PATH}/${COMPILERVARS_ARCHITECTURE} \ 191 | --with-cuda=/usr/local/cuda \ 192 | --with-gdk-include=/usr/local/cuda/include \ 193 | --with-gdk-nvml-lib=/usr/local/cuda/lib64/stubs \ 194 | --with-kaldi=${KALDI_PATH} \ 195 | --with-py36-path=/root/anaconda3/envs/cntk-py36 \ 196 | --with-cudnn=/usr/local/cudnn" && \ 197 | mkdir -p build-mkl/gpu/release && \ 198 | cd build-mkl/gpu/release && \ 199 | ../../../configure $CONFIGURE_OPTS --with-mkl=${MKL_PATH}/${MKLML_VERSION} && \ 200 | make -j"$(nproc)" 201 | 202 | # clean up 203 | RUN rm -rf /cntk/build-mkl/gpu/release/.build && \ 204 | rm -rf /cntk/.git && \ 205 | /root/anaconda3/bin/conda clean --all --yes && \ 206 | # create activate script 207 | echo "source /root/anaconda3/bin/activate /root/anaconda3/envs/cntk-py36" > /cntk/activate-cntk && \ 208 | # add cntk activate to root bashrc 209 | echo "source /cntk/activate-cntk" >> /root/.bashrc && \ 210 | # add LD_LIBRARY_PATH to root bashrc 211 | echo LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:'$LD_LIBRARY_PATH' >> /root/.bashrc && \ 212 | # remove intel components 213 | rm -rf /opt/intel 214 | -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-Python-Distributed/ConvNet_CIFAR10_DataAug_Distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | 3 | # Licensed under the MIT license. See LICENSE.md file in the project root 4 | # for full license information. 5 | # ============================================================================== 6 | 7 | from __future__ import print_function 8 | import os 9 | import math 10 | import argparse 11 | import numpy as np 12 | import cntk as C 13 | import _cntk_py 14 | import cntk.io.transforms as xforms 15 | from cntk.train.training_session import * 16 | from cntk.logging import * 17 | from cntk.debugging import * 18 | 19 | # default Paths relative to current python file. 20 | abs_path = os.path.dirname(os.path.abspath(__file__)) 21 | sys.path.append(abs_path) 22 | model_path = os.path.join(abs_path, "Models") 23 | 24 | from ConvNet_CIFAR10_DataAug import create_convnet_cifar10_model 25 | 26 | # model dimensions 27 | image_height = 32 28 | image_width = 32 29 | num_channels = 3 # RGB 30 | num_classes = 10 31 | 32 | # Create a minibatch source. 33 | def create_image_mb_source(map_file, mean_file, train, total_number_of_samples): 34 | if not os.path.exists(map_file) or not os.path.exists(mean_file): 35 | raise RuntimeError("File '%s' or '%s' does not exist. 
Please run install_cifar10.py from DataSets/CIFAR-10 to fetch them" % 36 | (map_file, mean_file)) 37 | 38 | # transformation pipeline for the features has jitter/crop only when training 39 | transforms = [] 40 | if train: 41 | transforms += [ 42 | xforms.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter 43 | ] 44 | 45 | transforms += [ 46 | xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'), 47 | xforms.mean(mean_file) 48 | ] 49 | 50 | # deserializer 51 | return C.io.MinibatchSource( 52 | C.io.ImageDeserializer( 53 | map_file, 54 | C.io.StreamDefs(features=C.io.StreamDef(field='image', transforms=transforms), # 1st col in mapfile referred to as 'image' 55 | labels=C.io.StreamDef(field='label', shape=num_classes))), # and second as 'label' 56 | randomize=train, 57 | max_samples=total_number_of_samples, 58 | multithreaded_deserializer=True) 59 | 60 | # Create the network. 61 | def create_conv_network(): 62 | # Input variables denoting the features and label data 63 | feature_var = C.input_variable((num_channels, image_height, image_width)) 64 | label_var = C.input_variable((num_classes)) 65 | 66 | # apply model to input 67 | scaled_input = C.element_times(C.constant(0.00390625), feature_var) 68 | 69 | z = create_convnet_cifar10_model(num_classes)(scaled_input) 70 | 71 | # loss and metric 72 | ce = C.cross_entropy_with_softmax(z, label_var) 73 | pe = C.classification_error(z, label_var) 74 | 75 | C.logging.log_number_of_parameters(z) ; print() 76 | 77 | return { 78 | 'feature': feature_var, 79 | 'label': label_var, 80 | 'ce' : ce, 81 | 'pe' : pe, 82 | 'output': z 83 | } 84 | 85 | # Create trainer 86 | def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up, progress_writers): 87 | # Set learning parameters 88 | lr_per_sample = [0.0015625]*20 + [0.00046875]*20 + [0.00015625]*20 + [0.000046875]*10 + [0.000015625] 89 | lr_schedule = C.learning_rate_schedule(lr_per_sample, unit=C.learners.UnitType.sample, epoch_size=epoch_size) 90 | mm_time_constant = [0]*20 + [600]*20 + [1200] 91 | mm_schedule = C.learners.momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size) 92 | l2_reg_weight = 0.002 93 | 94 | # Create learner 95 | if block_size is not None and num_quantization_bits != 32: 96 | raise RuntimeError("Block momentum cannot be used with quantization, please remove the quantized_bits option.") 97 | 98 | local_learner = C.learners.momentum_sgd(network['output'].parameters, 99 | lr_schedule, mm_schedule, 100 | l2_regularization_weight=l2_reg_weight) 101 | 102 | if block_size is not None: 103 | parameter_learner = C.train.distributed.block_momentum_distributed_learner(local_learner, block_size=block_size) 104 | else: 105 | parameter_learner = C.train.distributed.data_parallel_distributed_learner(local_learner, 106 | num_quantization_bits=num_quantization_bits, 107 | distributed_after=warm_up) 108 | 109 | # Create trainer 110 | return C.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner, progress_writers) 111 | 112 | # Train and test 113 | def train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore, profiling=False): 114 | 115 | # define mapping from input streams to network inputs 116 | input_map = { 117 | network['feature']: train_source.streams.features, 118 | network['label']: train_source.streams.labels 119 | } 120 | 121 | # Train all minibatches 122 | if profiling: 123 | start_profiler(sync_gpu=True) 124 |
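# training_session below drives the whole training loop: it pulls minibatches of mb_size samples
# from train_source, reports progress once per epoch, checkpoints every epoch (resuming from the
# checkpoint when restore=True), and finishes with an evaluation pass over test_source.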
125 | training_session( 126 | trainer=trainer, mb_source = train_source, 127 | model_inputs_to_streams = input_map, 128 | mb_size = minibatch_size, 129 | progress_frequency=epoch_size, 130 | checkpoint_config = CheckpointConfig(frequency = epoch_size, 131 | filename = os.path.join(model_path, "ConvNet_CIFAR10_DataAug"), 132 | restore = restore), 133 | test_config = TestConfig(test_source, minibatch_size=minibatch_size) 134 | ).train() 135 | 136 | if profiling: 137 | stop_profiler() 138 | 139 | # Train and evaluate the network. 140 | def convnet_cifar10_dataaug(train_data, test_data, mean_data, minibatch_size=64, epoch_size=50000, num_quantization_bits=32, 141 | block_size=3200, warm_up=0, max_epochs=2, restore=False, log_to_file=None, 142 | num_mbs_per_log=None, gen_heartbeat=False, profiling=False, tensorboard_logdir=None): 143 | _cntk_py.set_computation_network_trace_level(0) 144 | 145 | network = create_conv_network() 146 | 147 | distributed_sync_report_freq = None 148 | if block_size is not None: 149 | distributed_sync_report_freq = 1 150 | 151 | progress_writers = [C.logging.ProgressPrinter( 152 | freq=num_mbs_per_log, 153 | tag='Training', 154 | log_to_file=log_to_file, 155 | rank=C.train.distributed.Communicator.rank(), 156 | gen_heartbeat=gen_heartbeat, 157 | num_epochs=max_epochs, 158 | distributed_freq=distributed_sync_report_freq)] 159 | 160 | if tensorboard_logdir is not None: 161 | progress_writers.append(C.logging.TensorBoardProgressWriter( 162 | freq=num_mbs_per_log, 163 | log_dir=tensorboard_logdir, 164 | rank=C.train.distributed.Communicator.rank(), 165 | model=network['output'])) 166 | 167 | trainer = create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up, progress_writers) 168 | train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size) 169 | test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=C.io.FULL_DATA_SWEEP) 170 | train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore, profiling) 171 | 172 | 173 | if __name__=='__main__': 174 | parser = argparse.ArgumentParser() 175 | data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "CIFAR-10") 176 | 177 | parser.add_argument('-datadir', '--datadir', help='Data directory where the CIFAR dataset is located', 178 | required=False, default=data_path) 179 | parser.add_argument('-outputdir', '--outputdir', help='Output directory for checkpoints and models', required=False, default=None) 180 | parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None) 181 | parser.add_argument('-tensorboard_logdir', '--tensorboard_logdir', help='Directory where TensorBoard logs should be created', 182 | required=False, default=None) 183 | parser.add_argument('-n', '--num_epochs', help='Total number of epochs to train', type=int, required=False, default='160') 184 | parser.add_argument('-m', '--minibatch_size', help='Minibatch size', type=int, required=False, default='64') 185 | parser.add_argument('-e', '--epoch_size', help='Epoch size', type=int, required=False, default='50000') 186 | parser.add_argument('-q', '--quantized_bits', help='Number of quantized bits used for gradient aggregation', type=int, 187 | required=False, default='32') 188 | parser.add_argument('-a', '--distributed_after', help='Number of samples to train with before running distributed', type=int, 189 | required=False, default='0') 190 | parser.add_argument('-b', 
'--block_samples', type=int, help="Number of samples per block for the block momentum (BM) distributed learner (if 0, the BM learner is not used)", 191 | required=False, default=None) 192 | parser.add_argument('-r', '--restart', help='Restart training from scratch (instead of restoring from a checkpoint file, which is the default)', 193 | action='store_true') 194 | parser.add_argument('-device', '--device', type=int, help="Force the script to run on the specified device", 195 | required=False, default=None) 196 | parser.add_argument('-profile', '--profile', help="Turn on profiling", action='store_true', default=False) 197 | 198 | args = vars(parser.parse_args()) 199 | 200 | if args['outputdir'] is not None: 201 | model_path = args['outputdir'] + "/models" 202 | if args['logdir'] is not None: 203 | log_dir = args['logdir'] 204 | if args['device'] is not None: 205 | C.device.try_set_default_device(C.device.gpu(args['device'])) 206 | 207 | data_path = args['datadir'] 208 | 209 | if not os.path.isdir(data_path): 210 | raise RuntimeError("Directory %s does not exist" % data_path) 211 | 212 | mean_data = os.path.join(data_path, 'CIFAR-10_mean.xml') 213 | train_data = os.path.join(data_path, 'train_map.txt') 214 | test_data = os.path.join(data_path, 'test_map.txt') 215 | 216 | convnet_cifar10_dataaug(train_data, test_data, mean_data, 217 | minibatch_size=args['minibatch_size'], 218 | epoch_size=args['epoch_size'], 219 | num_quantization_bits=args['quantized_bits'], 220 | block_size=args['block_samples'], 221 | warm_up=args['distributed_after'], 222 | max_epochs=args['num_epochs'], 223 | restore=not args['restart'], 224 | log_to_file=args['logdir'], 225 | num_mbs_per_log=100, 226 | gen_heartbeat=True, 227 | profiling=args['profile'], 228 | tensorboard_logdir=args['tensorboard_logdir']) 229 | # Must call MPI finalize when the process exits without exceptions 230 | C.train.distributed.Communicator.finalize() 231 | 232 | -------------------------------------------------------------------------------- /recipes/TensorFlow/TensorFlow-GPU-Distributed/mnist_replica.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Distributed MNIST training and validation, with model replicas. 17 | 18 | A simple softmax model with one hidden layer is defined. The parameters 19 | (weights and biases) are located on one parameter server (ps), while the ops 20 | are executed on two worker nodes by default. The TF sessions also run on the 21 | worker node. 22 | Multiple invocations of this script can be done in parallel, with different 23 | values for --task_index. There should be exactly one invocation with 24 | --task_index, which will create a master session that carries out variable 25 | initialization.
The other, non-master, sessions will wait for the master 26 | session to finish the initialization before proceeding to the training stage. 27 | 28 | The coordination between the multiple worker invocations occurs due to 29 | the definition of the parameters on the same ps devices. The parameter updates 30 | from one worker are visible to all other workers. As such, the workers can 31 | perform forward computation and gradient calculation in parallel, which 32 | should lead to increased training speed for the simple model. 33 | """ 34 | 35 | 36 | from __future__ import absolute_import 37 | from __future__ import division 38 | from __future__ import print_function 39 | 40 | import math 41 | import sys 42 | import tempfile 43 | import time 44 | 45 | import tensorflow as tf 46 | from tensorflow.examples.tutorials.mnist import input_data 47 | 48 | 49 | flags = tf.app.flags 50 | flags.DEFINE_string("data_dir", "/tmp/mnist-data", 51 | "Directory for storing mnist data") 52 | flags.DEFINE_boolean("download_only", False, 53 | "Only perform downloading of data; do not proceed to " 54 | "session preparation, model definition or training") 55 | flags.DEFINE_integer("task_index", None, 56 | "Worker task index, should be >= 0. task_index=0 is " 57 | "the master worker task that performs the variable " 58 | "initialization ") 59 | flags.DEFINE_integer("num_gpus", 1, 60 | "Total number of gpus for each machine. " 61 | "If you don't use GPU, please set it to '0'") 62 | flags.DEFINE_integer("replicas_to_aggregate", None, 63 | "Number of replicas to aggregate before a parameter update " 64 | "is applied (For sync_replicas mode only; default: " 65 | "num_workers)") 66 | flags.DEFINE_integer("hidden_units", 100, 67 | "Number of units in the hidden layer of the NN") 68 | flags.DEFINE_integer("train_steps", 200, 69 | "Number of (global) training steps to perform") 70 | flags.DEFINE_integer("batch_size", 100, "Training batch size") 71 | flags.DEFINE_float("learning_rate", 0.01, "Learning rate") 72 | flags.DEFINE_boolean("sync_replicas", False, 73 | "Use the sync_replicas (synchronized replicas) mode, " 74 | "wherein the parameter updates from workers are aggregated " 75 | "before being applied to avoid stale gradients") 76 | flags.DEFINE_boolean( 77 | "existing_servers", False, "Whether servers already exist. If True, " 78 | "will use the worker hosts via their GRPC URLs (one client process " 79 | "per worker host). Otherwise, will create an in-process TensorFlow " 80 | "server.") 81 | flags.DEFINE_string("ps_hosts", "localhost:2222", 82 | "Comma-separated list of hostname:port pairs") 83 | flags.DEFINE_string("worker_hosts", "localhost:2223,localhost:2224", 84 | "Comma-separated list of hostname:port pairs") 85 | flags.DEFINE_string("job_name", None, "job name: worker or ps") 86 | 87 | FLAGS = flags.FLAGS 88 | 89 | 90 | IMAGE_PIXELS = 28 91 | 92 | 93 | def main(unused_argv): 94 | mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True) 95 | if FLAGS.download_only: 96 | sys.exit(0) 97 | 98 | if FLAGS.job_name is None or FLAGS.job_name == "": 99 | raise ValueError("Must specify an explicit `job_name`") 100 | if FLAGS.task_index is None or FLAGS.task_index == "": 101 | raise ValueError("Must specify an explicit `task_index`") 102 | 103 | print("job name = %s" % FLAGS.job_name) 104 | print("task index = %d" % FLAGS.task_index) 105 | 106 | # Construct the cluster and start the server 107 | ps_spec = FLAGS.ps_hosts.split(",") 108 | worker_spec = FLAGS.worker_hosts.split(",") 109 | 110 | # Get the number of workers.
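# (In sync_replicas mode this count is also the default number of replicas
# to aggregate before each parameter update is applied.)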
111 | num_workers = len(worker_spec) 112 | 113 | cluster = tf.train.ClusterSpec({ 114 | "ps": ps_spec, 115 | "worker": worker_spec}) 116 | 117 | if not FLAGS.existing_servers: 118 | # Not using existing servers. Create an in-process server. 119 | server = tf.train.Server( 120 | cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index) 121 | if FLAGS.job_name == "ps": 122 | server.join() 123 | 124 | is_chief = (FLAGS.task_index == 0) 125 | if FLAGS.num_gpus > 0: 126 | # Avoid gpu allocation conflict: now allocate task_num -> #gpu 127 | # for each worker in the corresponding machine 128 | gpu = (FLAGS.task_index % FLAGS.num_gpus) 129 | worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu) 130 | elif FLAGS.num_gpus == 0: 131 | # Just allocate the CPU to worker server 132 | cpu = 0 133 | worker_device = "/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, cpu) 134 | # The device setter will automatically place Variables ops on separate 135 | # parameter servers (ps). The non-Variable ops will be placed on the workers. 136 | # The ps use CPU and workers use corresponding GPU 137 | with tf.device( 138 | tf.train.replica_device_setter( 139 | worker_device=worker_device, 140 | ps_device="/job:ps/cpu:0", 141 | cluster=cluster)): 142 | global_step = tf.Variable(0, name="global_step", trainable=False) 143 | 144 | # Variables of the hidden layer 145 | hid_w = tf.Variable( 146 | tf.truncated_normal( 147 | [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units], 148 | stddev=1.0 / IMAGE_PIXELS), 149 | name="hid_w") 150 | hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b") 151 | 152 | # Variables of the softmax layer 153 | sm_w = tf.Variable( 154 | tf.truncated_normal( 155 | [FLAGS.hidden_units, 10], 156 | stddev=1.0 / math.sqrt(FLAGS.hidden_units)), 157 | name="sm_w") 158 | sm_b = tf.Variable(tf.zeros([10]), name="sm_b") 159 | 160 | # Ops: located on the worker specified with FLAGS.task_index 161 | x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS]) 162 | y_ = tf.placeholder(tf.float32, [None, 10]) 163 | 164 | hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) 165 | hid = tf.nn.relu(hid_lin) 166 | 167 | y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b)) 168 | cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0))) 169 | 170 | opt = tf.train.AdamOptimizer(FLAGS.learning_rate) 171 | 172 | if FLAGS.sync_replicas: 173 | if FLAGS.replicas_to_aggregate is None: 174 | replicas_to_aggregate = num_workers 175 | else: 176 | replicas_to_aggregate = FLAGS.replicas_to_aggregate 177 | 178 | opt = tf.train.SyncReplicasOptimizer( 179 | opt, 180 | replicas_to_aggregate=replicas_to_aggregate, 181 | total_num_replicas=num_workers, 182 | name="mnist_sync_replicas") 183 | 184 | train_step = opt.minimize(cross_entropy, global_step=global_step) 185 | 186 | if FLAGS.sync_replicas: 187 | local_init_op = opt.local_step_init_op 188 | if is_chief: 189 | local_init_op = opt.chief_init_op 190 | 191 | ready_for_local_init_op = opt.ready_for_local_init_op 192 | 193 | # Initial token and chief queue runners required by the sync_replicas mode 194 | chief_queue_runner = opt.get_chief_queue_runner() 195 | sync_init_op = opt.get_init_tokens_op() 196 | 197 | init_op = tf.global_variables_initializer() 198 | train_dir = tempfile.mkdtemp() 199 | 200 | if FLAGS.sync_replicas: 201 | sv = tf.train.Supervisor( 202 | is_chief=is_chief, 203 | logdir=train_dir, 204 | init_op=init_op, 205 | local_init_op=local_init_op, 206 | ready_for_local_init_op=ready_for_local_init_op, 207 | 
recovery_wait_secs=1, 208 | global_step=global_step) 209 | else: 210 | sv = tf.train.Supervisor( 211 | is_chief=is_chief, 212 | logdir=train_dir, 213 | init_op=init_op, 214 | recovery_wait_secs=1, 215 | global_step=global_step) 216 | 217 | sess_config = tf.ConfigProto( 218 | allow_soft_placement=True, 219 | log_device_placement=False, 220 | device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index]) 221 | 222 | # The chief worker (task_index==0) will prepare the session, 223 | # while the remaining workers will wait for the preparation to complete. 224 | if is_chief: 225 | print("Worker %d: Initializing session..." % FLAGS.task_index) 226 | else: 227 | print("Worker %d: Waiting for session to be initialized..." % 228 | FLAGS.task_index) 229 | 230 | if FLAGS.existing_servers: 231 | server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index] 232 | print("Using existing server at: %s" % server_grpc_url) 233 | 234 | sess = sv.prepare_or_wait_for_session(server_grpc_url, 235 | config=sess_config) 236 | else: 237 | sess = sv.prepare_or_wait_for_session(server.target, config=sess_config) 238 | 239 | print("Worker %d: Session initialization complete." % FLAGS.task_index) 240 | 241 | if FLAGS.sync_replicas and is_chief: 242 | # Chief worker will start the chief queue runner and call the init op. 243 | sess.run(sync_init_op) 244 | sv.start_queue_runners(sess, [chief_queue_runner]) 245 | 246 | # Perform training 247 | time_begin = time.time() 248 | print("Training begins @ %f" % time_begin) 249 | 250 | local_step = 0 251 | while True: 252 | # Training feed 253 | batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size) 254 | train_feed = {x: batch_xs, y_: batch_ys} 255 | 256 | _, step = sess.run([train_step, global_step], feed_dict=train_feed) 257 | local_step += 1 258 | 259 | now = time.time() 260 | print("%f: Worker %d: training step %d done (global step: %d)" % 261 | (now, FLAGS.task_index, local_step, step)) 262 | 263 | if step >= FLAGS.train_steps: 264 | break 265 | 266 | time_end = time.time() 267 | print("Training ends @ %f" % time_end) 268 | training_time = time_end - time_begin 269 | print("Training elapsed time: %f s" % training_time) 270 | 271 | # Validation feed 272 | val_feed = {x: mnist.validation.images, y_: mnist.validation.labels} 273 | val_xent = sess.run(cross_entropy, feed_dict=val_feed) 274 | print("After %d training step(s), validation cross entropy = %g" % 275 | (FLAGS.train_steps, val_xent)) 276 | 277 | 278 | if __name__ == "__main__": 279 | tf.app.run() -------------------------------------------------------------------------------- /recipes/TensorFlow/TensorFlow-GPU/TensorFlow-GPU.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# TensorFlow GPU\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Introduction\n", 15 | "\n", 16 | "This example demonstrates how to run the standard TensorFlow sample (https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py) on a one-node Azure Batch AI cluster.\n", 17 | "\n", 18 | "## Details\n", 19 | "\n", 20 | "- For demonstration purposes, the official convolutional.py will be deployed to an Azure File Share;\n", 21 | "- Standard output of the job will be stored on the Azure File Share;" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Instructions\n", 29 | "\n", 30 | "### 
Install Dependencies and Create Configuration file.\n", 31 | "Follow [instructions](/recipes) to install all dependencies and create the configuration file." 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "### Read Configuration and Create Batch AI client" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "from __future__ import print_function\n", 50 | "\n", 51 | "import time\n", 52 | "from datetime import datetime\n", 53 | "import os\n", 54 | "import sys\n", 55 | "import zipfile\n", 56 | "\n", 57 | "from azure.storage.file import FileService, FilePermissions\n", 58 | "import azure.mgmt.batchai.models as models\n", 59 | "\n", 60 | "# utilities.py contains helper functions used by different notebooks\n", 61 | "sys.path.append('..\\..')\n", 62 | "import utilities\n", 63 | "\n", 64 | "cfg = utilities.Configuration('..\\..\\configuration.json')\n", 65 | "client = utilities.create_batchai_client(cfg)\n", 66 | "utilities.create_resource_group(cfg)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "### Create File Share\n", 74 | "\n", 75 | "For this example we will create a new File Share named `batchaisample` under your storage account.\n", 76 | "\n", 77 | "**Note** You don't need to create a new file share for every cluster; we do it in this sample to simplify resource management for you." 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "azure_file_share_name = 'batchaisample'\n", 89 | "service = FileService(cfg.storage_account_name, cfg.storage_account_key)\n", 90 | "service.create_share(azure_file_share_name, fail_on_exist=False)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "### Configure Compute Cluster\n", 98 | "\n", 99 | "- For this example we will use a GPU cluster of one `STANDARD_NC6` node. You can increase the number of nodes by changing the `nodes_count` variable;\n", 100 | "- We will mount the file share at a folder named `external`. 
The full path of this folder on a compute node will be `$AZ_BATCHAI_MOUNT_ROOT/external`;\n", 101 | "- We will call the cluster `nc6`;\n", 102 | "\n", 103 | "So, the cluster will have the following parameters:" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "azure_file_share = 'external'\n", 115 | "nodes_count = 1\n", 116 | "cluster_name = 'nc6'\n", 117 | "\n", 118 | "volumes = models.MountVolumes(\n", 119 | " azure_file_shares=[\n", 120 | " models.AzureFileShareReference(\n", 121 | " account_name=cfg.storage_account_name,\n", 122 | " credentials=models.AzureStorageCredentialsInfo(\n", 123 | " account_key=cfg.storage_account_key),\n", 124 | " azure_file_url = 'https://{0}.file.core.windows.net/{1}'.format(\n", 125 | " cfg.storage_account_name, azure_file_share_name),\n", 126 | " relative_mount_path=azure_file_share)\n", 127 | " ]\n", 128 | ")\n", 129 | "\n", 130 | "parameters = models.ClusterCreateParameters(\n", 131 | " location=cfg.location,\n", 132 | " vm_size=\"STANDARD_NC6\",\n", 133 | " scale_settings=models.ScaleSettings(\n", 134 | " manual=models.ManualScaleSettings(target_node_count=nodes_count)\n", 135 | " ),\n", 136 | " node_setup=models.NodeSetup(\n", 137 | " mount_volumes=volumes\n", 138 | " ),\n", 139 | " user_account_settings=models.UserAccountSettings(\n", 140 | " admin_user_name=cfg.admin,\n", 141 | " admin_user_password=cfg.admin_password,\n", 142 | " admin_user_ssh_public_key=cfg.admin_ssh_key\n", 143 | " )\n", 144 | ")" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "### Create Compute Cluster" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "_ = client.clusters.create(cfg.resource_group, cluster_name, parameters)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "### Monitor Cluster Creation\n", 170 | "\n", 171 | "Monitor the newly created cluster. utilities.py contains a helper function that prints the number of nodes in each state in the cluster." 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "cluster = client.clusters.get(cfg.resource_group, cluster_name)\n", 183 | "utilities.print_cluster_status(cluster)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "### Deploy Sample Script and Configure the Input Directories\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "- For each job we will create a folder containing a copy of the sample script. This allows running the same job with different scripts." 
198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "collapsed": true 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "mnist_script_directory = 'tensorflow_samples'\n", 209 | "service = FileService(cfg.storage_account_name, cfg.storage_account_key)\n", 210 | "service.create_directory(\n", 211 | " azure_file_share_name, mnist_script_directory, fail_on_exist=False)\n", 212 | "service.create_file_from_path(\n", 213 | " azure_file_share_name, mnist_script_directory, 'convolutional.py', 'convolutional.py')" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "The job needs to know where to find convolutional.py; the script downloads the input MNIST dataset by itself. We will create an input directory for the script:" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "input_directories = [\n", 232 | " models.InputDirectory(\n", 233 | " id='SCRIPT',\n", 234 | " path='$AZ_BATCHAI_MOUNT_ROOT/{0}/{1}'.format(azure_file_share, mnist_script_directory))]" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "The job will be able to reference that directory using the ```$AZ_BATCHAI_INPUT_SCRIPT``` environment variable." 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "### Configure Output Directories\n", 249 | "We will store the standard output and standard error of the job in the File Share:" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "collapsed": true 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "std_output_path_prefix = \"$AZ_BATCHAI_MOUNT_ROOT/{0}\".format(azure_file_share)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "### Configure Job\n", 268 | "\n", 269 | "- The job will use the `tensorflow/tensorflow:1.1.0-gpu` container.\n", 270 | "- It will use the previously configured input and output directories.\n", 271 | "- By removing container_settings, the job will run on the host VMs if you are using DSVM.\n", 272 | "\n", 273 | "**Note** You must agree to the following licenses before using this container:\n", 274 | "- [TensorFlow License](https://github.com/tensorflow/tensorflow/blob/master/LICENSE)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": { 281 | "collapsed": false 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "job_name = datetime.utcnow().strftime(\"tf_%m_%d_%Y_%H%M%S\")\n", 286 | "parameters = models.job_create_parameters.JobCreateParameters(\n", 287 | " location=cfg.location,\n", 288 | " cluster=models.ResourceId(cluster.id),\n", 289 | " node_count=nodes_count,\n", 290 | " input_directories=input_directories,\n", 291 | " std_out_err_path_prefix=std_output_path_prefix,\n", 292 | " container_settings=models.ContainerSettings(\n", 293 | " models.ImageSourceRegistry(image='tensorflow/tensorflow:1.1.0-gpu')),\n", 294 | " tensor_flow_settings=models.TensorFlowSettings(\n", 295 | " python_script_file_path='$AZ_BATCHAI_INPUT_SCRIPT/convolutional.py',\n", 296 | " master_command_line_args=\"-p\",\n", 297 | " )\n", 298 | ")" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "### Create a training Job and wait for Job completion\n" 306 | ] 307 | }, 308 | { 309 | "cell_type": 
"code", 310 | "execution_count": null, 311 | "metadata": { 312 | "collapsed": false 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "_ = client.jobs.create(cfg.resource_group, job_name, parameters) \n", 317 | "print('Created Job: {}'.format(job_name))" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "### Wait for Job to Finish\n", 325 | "The job will start running when the cluster will have enought idle nodes. The following code waits for job to start running printing the cluster state. During job run, the code prints current content of stdeout-0.txt (the output of the worker running on the first node)." 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "utilities.wait_for_job_completion(client, cfg.resource_group, job_name, cluster_name, 'stdOuterr', 'stdout-wk-0.txt')" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": { 342 | "collapsed": true 343 | }, 344 | "source": [ 345 | "### Download stdout.txt and stderr.txt files for the Job" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "collapsed": false 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "files = client.jobs.list_output_files(cfg.resource_group, job_name, models.JobsListOutputFilesOptions(\"stdOuterr\")) \n", 357 | "for file in list(files):\n", 358 | " utilities.download_file(file.download_url, file.name)\n", 359 | "print(\"All files Downloaded\")" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "### Delete the Job" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "collapsed": false 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "client.jobs.delete(cfg.resource_group, job_name)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "### Delete the Cluster\n", 385 | "When you are finished with the sample and don't want to submit any more jobs you can delete the cluster using the following code." 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": { 392 | "collapsed": false 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "client.clusters.delete(cfg.resource_group, cluster_name)" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": { 402 | "collapsed": true 403 | }, 404 | "source": [ 405 | "### Delete File Share\n", 406 | "When you are finished with the sample and don't want to submit any more jobs you can delete the file share completely with all files using the following code." 
407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": { 413 | "collapsed": true 414 | }, 415 | "outputs": [], 416 | "source": [ 417 | "service = FileService(cfg.storage_account_name, cfg.storage_account_key)\n", 418 | "service.delete_share(azure_file_share_name)" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": { 425 | "collapsed": true 426 | }, 427 | "outputs": [], 428 | "source": [] 429 | } 430 | ], 431 | "metadata": { 432 | "anaconda-cloud": {}, 433 | "kernelspec": { 434 | "display_name": "Python [Root]", 435 | "language": "python", 436 | "name": "Python [Root]" 437 | }, 438 | "language_info": { 439 | "codemirror_mode": { 440 | "name": "ipython", 441 | "version": 3 442 | }, 443 | "file_extension": ".py", 444 | "mimetype": "text/x-python", 445 | "name": "python", 446 | "nbconvert_exporter": "python", 447 | "pygments_lexer": "ipython3", 448 | "version": "3.5.2" 449 | } 450 | }, 451 | "nbformat": 4, 452 | "nbformat_minor": 1 453 | } 454 | -------------------------------------------------------------------------------- /recipes/TensorFlow/TensorFlow-GPU/convolutional.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Simple, end-to-end, LeNet-5-like convolutional MNIST model example. 17 | 18 | This should achieve a test error of 0.7%. Please keep this model as simple and 19 | linear as possible, it is meant as a tutorial for simple convolutional models. 20 | Run with --self_test on the command line to execute a short self-test. 21 | """ 22 | from __future__ import absolute_import 23 | from __future__ import division 24 | from __future__ import print_function 25 | 26 | import argparse 27 | import gzip 28 | import os 29 | import sys 30 | import time 31 | 32 | import numpy 33 | from six.moves import urllib 34 | from six.moves import xrange # pylint: disable=redefined-builtin 35 | import tensorflow as tf 36 | 37 | SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/' 38 | WORK_DIRECTORY = 'data' 39 | IMAGE_SIZE = 28 40 | NUM_CHANNELS = 1 41 | PIXEL_DEPTH = 255 42 | NUM_LABELS = 10 43 | VALIDATION_SIZE = 5000 # Size of the validation set. 44 | SEED = 66478 # Set to None for random seed. 45 | BATCH_SIZE = 64 46 | NUM_EPOCHS = 10 47 | EVAL_BATCH_SIZE = 64 48 | EVAL_FREQUENCY = 100 # Number of steps between evaluations. 
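# Note: extract_data() below rescales pixel values from [0, 255] to [-0.5, 0.5] using PIXEL_DEPTH,
# and the model's two 2x2 max-pooling layers reduce each 28x28 image to a 7x7 feature map, which is
# why the first fully connected layer sizes its input as IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64.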
49 | 50 | 51 | FLAGS = None 52 | 53 | 54 | def data_type(): 55 | """Return the type of the activations, weights, and placeholder variables.""" 56 | if FLAGS.use_fp16: 57 | return tf.float16 58 | else: 59 | return tf.float32 60 | 61 | 62 | def maybe_download(filename): 63 | """Download the data from Yann's website, unless it's already here.""" 64 | if not tf.gfile.Exists(WORK_DIRECTORY): 65 | tf.gfile.MakeDirs(WORK_DIRECTORY) 66 | filepath = os.path.join(WORK_DIRECTORY, filename) 67 | if not tf.gfile.Exists(filepath): 68 | filepath, _ = urllib.request.urlretrieve(SOURCE_URL + filename, filepath) 69 | with tf.gfile.GFile(filepath) as f: 70 | size = f.size() 71 | print('Successfully downloaded', filename, size, 'bytes.') 72 | return filepath 73 | 74 | 75 | def extract_data(filename, num_images): 76 | """Extract the images into a 4D tensor [image index, y, x, channels]. 77 | 78 | Values are rescaled from [0, 255] down to [-0.5, 0.5]. 79 | """ 80 | print('Extracting', filename) 81 | with gzip.open(filename) as bytestream: 82 | bytestream.read(16) 83 | buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images * NUM_CHANNELS) 84 | data = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.float32) 85 | data = (data - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH 86 | data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS) 87 | return data 88 | 89 | 90 | def extract_labels(filename, num_images): 91 | """Extract the labels into a vector of int64 label IDs.""" 92 | print('Extracting', filename) 93 | with gzip.open(filename) as bytestream: 94 | bytestream.read(8) 95 | buf = bytestream.read(1 * num_images) 96 | labels = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.int64) 97 | return labels 98 | 99 | 100 | def fake_data(num_images): 101 | """Generate a fake dataset that matches the dimensions of MNIST.""" 102 | data = numpy.ndarray( 103 | shape=(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS), 104 | dtype=numpy.float32) 105 | labels = numpy.zeros(shape=(num_images,), dtype=numpy.int64) 106 | for image in xrange(num_images): 107 | label = image % 2 108 | data[image, :, :, 0] = label - 0.5 109 | labels[image] = label 110 | return data, labels 111 | 112 | 113 | def error_rate(predictions, labels): 114 | """Return the error rate based on dense predictions and sparse labels.""" 115 | return 100.0 - ( 116 | 100.0 * 117 | numpy.sum(numpy.argmax(predictions, 1) == labels) / 118 | predictions.shape[0]) 119 | 120 | 121 | def main(_): 122 | if FLAGS.self_test: 123 | print('Running self-test.') 124 | train_data, train_labels = fake_data(256) 125 | validation_data, validation_labels = fake_data(EVAL_BATCH_SIZE) 126 | test_data, test_labels = fake_data(EVAL_BATCH_SIZE) 127 | num_epochs = 1 128 | else: 129 | # Get the data. 130 | train_data_filename = maybe_download('train-images-idx3-ubyte.gz') 131 | train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz') 132 | test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz') 133 | test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz') 134 | 135 | # Extract it into numpy arrays. 136 | train_data = extract_data(train_data_filename, 60000) 137 | train_labels = extract_labels(train_labels_filename, 60000) 138 | test_data = extract_data(test_data_filename, 10000) 139 | test_labels = extract_labels(test_labels_filename, 10000) 140 | 141 | # Generate a validation set. 142 | validation_data = train_data[:VALIDATION_SIZE, ...] 
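# Take the label slice with the same boundary so images and labels stay aligned;
# the remaining examples are kept for training.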
143 | validation_labels = train_labels[:VALIDATION_SIZE] 144 | train_data = train_data[VALIDATION_SIZE:, ...] 145 | train_labels = train_labels[VALIDATION_SIZE:] 146 | num_epochs = NUM_EPOCHS 147 | train_size = train_labels.shape[0] 148 | 149 | # This is where training samples and labels are fed to the graph. 150 | # These placeholder nodes will be fed a batch of training data at each 151 | # training step using the {feed_dict} argument to the Run() call below. 152 | train_data_node = tf.placeholder( 153 | data_type(), 154 | shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)) 155 | train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,)) 156 | eval_data = tf.placeholder( 157 | data_type(), 158 | shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)) 159 | 160 | # The variables below hold all the trainable weights. They are passed an 161 | # initial value which will be assigned when we call: 162 | # {tf.global_variables_initializer().run()} 163 | conv1_weights = tf.Variable( 164 | tf.truncated_normal([5, 5, NUM_CHANNELS, 32], # 5x5 filter, depth 32. 165 | stddev=0.1, 166 | seed=SEED, dtype=data_type())) 167 | conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type())) 168 | conv2_weights = tf.Variable(tf.truncated_normal( 169 | [5, 5, 32, 64], stddev=0.1, 170 | seed=SEED, dtype=data_type())) 171 | conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type())) 172 | fc1_weights = tf.Variable( # fully connected, depth 512. 173 | tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512], 174 | stddev=0.1, 175 | seed=SEED, 176 | dtype=data_type())) 177 | fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type())) 178 | fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS], 179 | stddev=0.1, 180 | seed=SEED, 181 | dtype=data_type())) 182 | fc2_biases = tf.Variable(tf.constant( 183 | 0.1, shape=[NUM_LABELS], dtype=data_type())) 184 | 185 | # We will replicate the model structure for the training subgraph, as well 186 | # as the evaluation subgraphs, while sharing the trainable parameters. 187 | def model(data, train=False): 188 | """The Model definition.""" 189 | # 2D convolution, with 'SAME' padding (i.e. the output feature map has 190 | # the same size as the input). Note that {strides} is a 4D array whose 191 | # shape matches the data layout: [image index, y, x, depth]. 192 | conv = tf.nn.conv2d(data, 193 | conv1_weights, 194 | strides=[1, 1, 1, 1], 195 | padding='SAME') 196 | # Bias and rectified linear non-linearity. 197 | relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases)) 198 | # Max pooling. The kernel size spec {ksize} also follows the layout of 199 | # the data. Here we have a pooling window of 2, and a stride of 2. 200 | pool = tf.nn.max_pool(relu, 201 | ksize=[1, 2, 2, 1], 202 | strides=[1, 2, 2, 1], 203 | padding='SAME') 204 | conv = tf.nn.conv2d(pool, 205 | conv2_weights, 206 | strides=[1, 1, 1, 1], 207 | padding='SAME') 208 | relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases)) 209 | pool = tf.nn.max_pool(relu, 210 | ksize=[1, 2, 2, 1], 211 | strides=[1, 2, 2, 1], 212 | padding='SAME') 213 | # Reshape the feature map cuboid into a 2D matrix to feed it to the 214 | # fully connected layers. 215 | pool_shape = pool.get_shape().as_list() 216 | reshape = tf.reshape( 217 | pool, 218 | [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]]) 219 | # Fully connected layer. Note that the '+' operation automatically 220 | # broadcasts the biases. 
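# reshape has shape [batch, 7*7*64] here; fc1 maps it to 512 hidden units.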
221 | hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases) 222 | # Add a 50% dropout during training only. Dropout also scales 223 | # activations such that no rescaling is needed at evaluation time. 224 | if train: 225 | hidden = tf.nn.dropout(hidden, 0.5, seed=SEED) 226 | return tf.matmul(hidden, fc2_weights) + fc2_biases 227 | 228 | # Training computation: logits + cross-entropy loss. 229 | logits = model(train_data_node, True) 230 | loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits( 231 | labels=train_labels_node, logits=logits)) 232 | 233 | # L2 regularization for the fully connected parameters. 234 | regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) + 235 | tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases)) 236 | # Add the regularization term to the loss. 237 | loss += 5e-4 * regularizers 238 | 239 | # Optimizer: set up a variable that's incremented once per batch and 240 | # controls the learning rate decay. 241 | batch = tf.Variable(0, dtype=data_type()) 242 | # Decay once per epoch, using an exponential schedule starting at 0.01. 243 | learning_rate = tf.train.exponential_decay( 244 | 0.01, # Base learning rate. 245 | batch * BATCH_SIZE, # Current index into the dataset. 246 | train_size, # Decay step. 247 | 0.95, # Decay rate. 248 | staircase=True) 249 | # Use simple momentum for the optimization. 250 | optimizer = tf.train.MomentumOptimizer(learning_rate, 251 | 0.9).minimize(loss, 252 | global_step=batch) 253 | 254 | # Predictions for the current training minibatch. 255 | train_prediction = tf.nn.softmax(logits) 256 | 257 | # Predictions for the test and validation, which we'll compute less often. 258 | eval_prediction = tf.nn.softmax(model(eval_data)) 259 | 260 | # Small utility function to evaluate a dataset by feeding batches of data to 261 | # {eval_data} and pulling the results from {eval_predictions}. 262 | # Saves memory and enables this to run on smaller GPUs. 263 | def eval_in_batches(data, sess): 264 | """Get all predictions for a dataset by running it in small batches.""" 265 | size = data.shape[0] 266 | if size < EVAL_BATCH_SIZE: 267 | raise ValueError("batch size for evals larger than dataset: %d" % size) 268 | predictions = numpy.ndarray(shape=(size, NUM_LABELS), dtype=numpy.float32) 269 | for begin in xrange(0, size, EVAL_BATCH_SIZE): 270 | end = begin + EVAL_BATCH_SIZE 271 | if end <= size: 272 | predictions[begin:end, :] = sess.run( 273 | eval_prediction, 274 | feed_dict={eval_data: data[begin:end, ...]}) 275 | else: 276 | batch_predictions = sess.run( 277 | eval_prediction, 278 | feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]}) 279 | predictions[begin:, :] = batch_predictions[begin - size:, :] 280 | return predictions 281 | 282 | # Create a local session to run the training. 283 | start_time = time.time() 284 | with tf.Session() as sess: 285 | # Run all the initializers to prepare the trainable parameters. 286 | tf.global_variables_initializer().run() 287 | print('Initialized!') 288 | # Loop through training steps. 289 | for step in xrange(int(num_epochs * train_size) // BATCH_SIZE): 290 | # Compute the offset of the current minibatch in the data. 291 | # Note that we could use better randomization across epochs. 292 | offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE) 293 | batch_data = train_data[offset:(offset + BATCH_SIZE), ...] 
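# Slice the labels with the same offset so they stay aligned with batch_data.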
294 | batch_labels = train_labels[offset:(offset + BATCH_SIZE)] 295 | # This dictionary maps the batch data (as a numpy array) to the 296 | # node in the graph it should be fed to. 297 | feed_dict = {train_data_node: batch_data, 298 | train_labels_node: batch_labels} 299 | # Run the optimizer to update weights. 300 | sess.run(optimizer, feed_dict=feed_dict) 301 | # print some extra information once we reach the evaluation frequency 302 | if step % EVAL_FREQUENCY == 0: 303 | # fetch some extra nodes' data 304 | l, lr, predictions = sess.run([loss, learning_rate, train_prediction], 305 | feed_dict=feed_dict) 306 | elapsed_time = time.time() - start_time 307 | start_time = time.time() 308 | print('Step %d (epoch %.2f), %.1f ms' % 309 | (step, float(step) * BATCH_SIZE / train_size, 310 | 1000 * elapsed_time / EVAL_FREQUENCY)) 311 | print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr)) 312 | print('Minibatch error: %.1f%%' % error_rate(predictions, batch_labels)) 313 | print('Validation error: %.1f%%' % error_rate( 314 | eval_in_batches(validation_data, sess), validation_labels)) 315 | sys.stdout.flush() 316 | # Finally print the result! 317 | test_error = error_rate(eval_in_batches(test_data, sess), test_labels) 318 | print('Test error: %.1f%%' % test_error) 319 | if FLAGS.self_test: 320 | print('test_error', test_error) 321 | assert test_error == 0.0, 'expected 0.0 test_error, got %.2f' % ( 322 | test_error,) 323 | 324 | 325 | if __name__ == '__main__': 326 | parser = argparse.ArgumentParser() 327 | parser.add_argument( 328 | '--use_fp16', 329 | default=False, 330 | help='Use half floats instead of full floats if True.', 331 | action='store_true') 332 | parser.add_argument( 333 | '--self_test', 334 | default=False, 335 | action='store_true', 336 | help='True if running a self test.') 337 | 338 | FLAGS, unparsed = parser.parse_known_args() 339 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) 340 | --------------------------------------------------------------------------------