├── recipes
│   ├── Caffe
│   │   ├── Caffe-GPU
│   │   │   ├── preparation_script.sh
│   │   │   ├── lenet_solver.prototxt
│   │   │   ├── Readme.md
│   │   │   └── lenet_train_test.prototxt
│   │   └── README.md
│   ├── configuration.json.template
│   ├── Keras
│   │   ├── Keras-DSVM
│   │   │   ├── job.json
│   │   │   ├── Readme.md
│   │   │   ├── mnist_cnn.py
│   │   │   └── cli-instructions.md
│   │   └── README.md
│   ├── TensorFlow
│   │   ├── TensorFlow-GPU
│   │   │   ├── job.json
│   │   │   ├── Readme.md
│   │   │   ├── cli-instructions.md
│   │   │   ├── TensorFlow-GPU.ipynb
│   │   │   └── convolutional.py
│   │   ├── Readme.md
│   │   └── TensorFlow-GPU-Distributed
│   │       ├── job.json
│   │       ├── Readme.md
│   │       ├── cli-instructions.md
│   │       └── mnist_replica.py
│   ├── Chainer
│   │   ├── Chainer-GPU-Distributed
│   │   │   ├── job.json
│   │   │   ├── Readme.md
│   │   │   ├── docker
│   │   │   │   └── dockerfile
│   │   │   ├── cli-instructions.md
│   │   │   └── train_mnist.py
│   │   └── README.md
│   ├── CNTK
│   │   ├── CNTK-GPU-Python-Distributed
│   │   │   ├── CIFA-10_data_prepare.sh
│   │   │   ├── job.json
│   │   │   ├── Readme.md
│   │   │   ├── cli-instructions.md
│   │   │   ├── ConvNet_CIFAR10_DataAug.py
│   │   │   └── ConvNet_CIFAR10_DataAug_Distributed.py
│   │   ├── CNTK-GPU-Python
│   │   │   ├── job.json
│   │   │   ├── Readme.md
│   │   │   ├── cli-instructions.md
│   │   │   └── ConvNet_MNIST.py
│   │   ├── CNTK-GPU-BrainScript
│   │   │   ├── job.json
│   │   │   ├── Readme.md
│   │   │   ├── ConvNet_MNIST.cntk
│   │   │   └── cli-instructions.md
│   │   ├── CNTK-GPU-BrainScript-Distributed
│   │   │   ├── job.json
│   │   │   ├── Readme.md
│   │   │   ├── ConvNet_MNIST.cntk
│   │   │   └── cli-instructions.md
│   │   ├── CNTK-GPU-Python-Distrbuted-Infiniband
│   │   │   ├── job.json
│   │   │   ├── jobprep_cntk_distributed_ib.sh
│   │   │   ├── Readme.md
│   │   │   ├── cli-instructions.md
│   │   │   └── dockerfile
│   │   └── Readme.md
│   ├── Caffe2
│   │   ├── README.md
│   │   └── Caffe2-GPU-Distributed
│   │       └── Readme.md
│   ├── Horovod
│   │   ├── Horovod-Infiniband-Benchmark
│   │   │   ├── job.json
│   │   │   ├── jobprep_benchmark.sh
│   │   │   ├── Readme.md
│   │   │   └── cli-instructions.md
│   │   ├── Horovod
│   │   │   ├── job.json
│   │   │   ├── Readme.md
│   │   │   └── cli-instructions.md
│   │   └── Readme.md
│   ├── utilities.py
│   └── Readme.md
├── LICENSE
├── README.md
└── .gitignore

--------------------------------------------------------------------------------
/recipes/Caffe/Caffe-GPU/preparation_script.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/bash
2 | 
3 | sed 's,$AZ_BATCHAI_OUTPUT_MODEL,'$AZ_BATCHAI_OUTPUT_MODEL',g; s,$AZ_BATCHAI_INPUT_SAMPLE,'$AZ_BATCHAI_INPUT_SAMPLE',g' $AZ_BATCHAI_INPUT_SAMPLE/lenet_solver.prototxt.template > $AZ_BATCHAI_INPUT_SAMPLE/lenet_solver.prototxt
4 | sed 's,$AZ_BATCHAI_INPUT_SAMPLE,'$AZ_BATCHAI_INPUT_SAMPLE',g' $AZ_BATCHAI_INPUT_SAMPLE/lenet_train_test.prototxt.template > $AZ_BATCHAI_INPUT_SAMPLE/lenet_train_test.prototxt
5 | 
--------------------------------------------------------------------------------
/recipes/configuration.json.template:
--------------------------------------------------------------------------------
1 | {
2 |     "subscription_id": "",
3 |     "aad_client_id": "",
4 |     "aad_secret": "",
5 |     "aad_tenant": "",
6 |     "location": "eastus",
7 |     "base_url": "",
8 |     "resource_group": "",
9 |     "storage_account" : {
10 |         "name": "",
11 |         "key": ""
12 |     },
13 |     "admin_user" : {
14 |         "name": "",
15 |         "password": "",
16 |         "ssh_public_key": ""
17 |     }
18 | }
19 | 
--------------------------------------------------------------------------------
/recipes/Keras/Keras-DSVM/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 1,
4 |         "customToolkitSettings": {
5 |             "commandLine": "KERAS_BACKEND=cntk python $AZ_BATCHAI_INPUT_SCRIPT/mnist_cnn.py"
6 |         },
7 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
8 |         "inputDirectories": [{
9 |             "id": "SCRIPT",
10 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/keras_samples"
"$AZ_BATCHAI_MOUNT_ROOT/external/keras_samples" 11 | }] 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /recipes/TensorFlow/TensorFlow-GPU/job.json: -------------------------------------------------------------------------------- 1 | { 2 | "properties": { 3 | "nodeCount": 1, 4 | "tensorFlowSettings": { 5 | "pythonScriptFilePath": "$AZ_BATCHAI_INPUT_SCRIPT/convolutional.py", 6 | "masterCommandLineArgs": "-p" 7 | }, 8 | "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external", 9 | "inputDirectories": [{ 10 | "id": "SCRIPT", 11 | "path": "$AZ_BATCHAI_MOUNT_ROOT/external/tensorflow_samples" 12 | }], 13 | "containerSettings": { 14 | "imageSourceRegistry": { 15 | "image": "tensorflow/tensorflow:1.1.0-gpu" 16 | } 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /recipes/Caffe/README.md: -------------------------------------------------------------------------------- 1 | # Caffe 2 | 3 | Caffe is a deep learning framework made with expression, speed, and modularity in mind. It is developed by Berkeley AI Research (BAIR)/The Berkeley Vision and Learning Center (BVLC) and community contributors. 4 | 5 | See official Caffe GitHub page (https://github.com/BVLC/caffe). 6 | 7 | #### [Caffe-GPU](./Caffe-GPU) 8 | This Caffe-GPU recipe contains information on how to run Caffe training job on a GPU node with BatchAI. 9 | 10 | ## Help or Feedback 11 | -------------------- 12 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub. 13 | 14 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI. 15 | -------------------------------------------------------------------------------- /recipes/Chainer/Chainer-GPU-Distributed/job.json: -------------------------------------------------------------------------------- 1 | { 2 | "properties": { 3 | "nodeCount": 2, 4 | "chainerSettings": { 5 | "pythonScriptFilePath": "$AZ_BATCHAI_INPUT_SCRIPT/train_mnist.py", 6 | "masterCommandLineArgs": "-g -o $AZ_BATCHAI_OUTPUT_MODEL" 7 | }, 8 | "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external", 9 | "inputDirectories": [{ 10 | "id": "SCRIPT", 11 | "path": "$AZ_BATCHAI_MOUNT_ROOT/external/chainer_samples" 12 | }], 13 | "outputDirectories": [{ 14 | "id": "MODEL", 15 | "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external", 16 | "pathSuffix": "Models" 17 | }], 18 | "containerSettings": { 19 | "imageSourceRegistry": { 20 | "image": "batchaitraining/chainermn:openMPI" 21 | } 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-Python-Distributed/CIFA-10_data_prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | if [ ! -z $AZ_BATCHAI_JOB_TEMP ];then 3 | cd $AZ_BATCHAI_JOB_TEMP 4 | wget 'https://batchaisamples.blob.core.windows.net/samples/CIFAR-10_dataset.tar?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=b&sig=nFXsAp0Eq%2BoS5%2BKAEPnfyEGlCkBcKIadDvCPA%2BcX6lU%3D' -k -O 'CIFAR-10_dataset.tar' 5 | echo "untar CIFAR-10 dataset........." 
6 |     tar -xf CIFAR-10_dataset.tar
7 |     echo "done"
8 |     ROOT_DIR=`pwd`
9 |     files=( "train_map.txt" "test_map.txt" )
10 |     for file in "${files[@]}"
11 |     do
12 |         output=$ROOT_DIR"/"$file
13 |         if [ -f $output ];then
14 |             rm $output
15 |         fi
16 |         touch $output
17 |         while read -r line
18 |         do
19 |             name="$line"
20 |             echo "$ROOT_DIR$name" >> $output
21 |         done < $file".template"
22 |     done
23 | fi
24 | 
--------------------------------------------------------------------------------
/recipes/Caffe/Caffe-GPU/lenet_solver.prototxt:
--------------------------------------------------------------------------------
1 | # The train/test net protocol buffer definition
2 | net: "$AZ_BATCHAI_INPUT_SAMPLE/lenet_train_test.prototxt"
3 | # test_iter specifies how many forward passes the test should carry out.
4 | # In the case of MNIST, we have test batch size 100 and 100 test iterations,
5 | # covering the full 10,000 testing images.
6 | test_iter: 100
7 | # Carry out testing every 500 training iterations.
8 | test_interval: 500
9 | # The base learning rate, momentum and the weight decay of the network.
10 | base_lr: 0.01
11 | momentum: 0.9
12 | weight_decay: 0.0005
13 | # The learning rate policy
14 | lr_policy: "inv"
15 | gamma: 0.0001
16 | power: 0.75
17 | # Display every 100 iterations
18 | display: 100
19 | # The maximum number of iterations
20 | max_iter: 10000
21 | # snapshot intermediate results
22 | snapshot: 5000
23 | snapshot_prefix: "$AZ_BATCHAI_OUTPUT_MODEL/lenet"
24 | # solver mode: CPU or GPU
25 | solver_mode: GPU
26 | 
--------------------------------------------------------------------------------
/recipes/Caffe2/README.md:
--------------------------------------------------------------------------------
1 | # Caffe2
2 | 
3 | Caffe2 is a Python-based, lightweight, modular, and scalable deep learning framework. Building on the original Caffe, Caffe2 is designed with expression, speed, and modularity in mind.
4 | 
5 | See official Caffe2 GitHub page (https://github.com/caffe2/caffe2).
6 | 
7 | #### [Caffe2-GPU-Distributed](./Caffe2-GPU-Distributed)
8 | This Caffe2-GPU-Distributed recipe contains information on how to run a distributed Caffe2 training job across multiple GPU nodes with BatchAI, by setting up a single-node NFS file server.
9 | 
10 | ## Help or Feedback
11 | --------------------
12 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
13 | 
14 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
15 | 
--------------------------------------------------------------------------------
/recipes/Keras/README.md:
--------------------------------------------------------------------------------
1 | # Keras
2 | 
3 | Keras is a high-level neural networks API, written in Python and capable of running on top of TensorFlow, CNTK, or Theano. It was developed with a focus on enabling fast experimentation. Being able to go from idea to result with the least possible delay is key to doing good research.
4 | 
5 | See official Keras GitHub page (https://github.com/fchollet/keras).
6 | 
7 | #### [Keras-DSVM](./Keras-DSVM)
8 | This Keras-DSVM recipe contains information on how to run a Keras training job on a GPU data science node with BatchAI.
9 | 
10 | ## Help or Feedback
11 | --------------------
12 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
13 | 
14 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
15 | 
--------------------------------------------------------------------------------
/recipes/Horovod/Horovod-Infiniband-Benchmark/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 2,
4 |         "jobPreparation": {
5 |             "commandLine": "bash $AZ_BATCHAI_INPUT_SCRIPTS/jobprep_benchmark.sh"
6 |         },
7 |         "customToolkitSettings": {
8 |             "commandLine": "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh; cd $AZ_BATCHAI_JOB_TEMP/benchmarks/; mpirun -n 8 -ppn 4 -hosts $AZ_BATCH_HOST_LIST -env I_MPI_FABRICS=dapl -env I_MPI_DAPL_PROVIDER=ofa-v2-ib0 -env I_MPI_DYNAMIC_CONNECTION=0 python scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --model resnet101 --batch_size 64 --variable_update horovod"
9 |         },
10 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
11 |         "inputDirectories": [{
12 |             "id": "SCRIPTS",
13 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/horovod_samples"
14 |         }
15 |         ],
16 |         "containerSettings": {
17 |             "imageSourceRegistry": {
18 |                 "image": "tensorflow/tensorflow:1.4.0-gpu"
19 |             }
20 |         }
21 |     }
22 | }
23 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-Python/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 1,
4 |         "cntkSettings": {
5 |             "pythonScriptFilePath": "$AZ_BATCHAI_INPUT_SCRIPT/ConvNet_MNIST.py",
6 |             "commandLineArgs": "$AZ_BATCHAI_INPUT_DATASET $AZ_BATCHAI_OUTPUT_MODEL"
7 |         },
8 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
9 |         "inputDirectories": [{
10 |             "id": "DATASET",
11 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/mnist_database"
12 |         }, {
13 |             "id": "SCRIPT",
14 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/cntk_samples"
15 |         }],
16 |         "outputDirectories": [{
17 |             "id": "MODEL",
18 |             "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
19 |             "pathSuffix": "Models"
20 |         }],
21 |         "containerSettings": {
22 |             "imageSourceRegistry": {
23 |                 "image": "microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0"
24 |             }
25 |         }
26 |     }
27 | }
28 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-BrainScript/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 1,
4 |         "cntkSettings": {
5 |             "configFilePath": "$AZ_BATCHAI_INPUT_CONFIG/ConvNet_MNIST.cntk",
6 |             "commandLineArgs": "rootDir=. dataDir=$AZ_BATCHAI_INPUT_DATASET outputDir=$AZ_BATCHAI_OUTPUT_MODEL"
7 |         },
8 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
9 |         "inputDirectories": [{
10 |             "id": "DATASET",
11 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/mnist_database"
12 |         }, {
13 |             "id": "CONFIG",
14 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/cntk_samples"
15 |         }],
16 |         "outputDirectories": [{
17 |             "id": "MODEL",
18 |             "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
19 |             "pathSuffix": "Models"
20 |         }],
21 |         "containerSettings": {
22 |             "imageSourceRegistry": {
23 |                 "image": "microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0"
24 |             }
25 |         }
26 |     }
27 | }
28 | 
--------------------------------------------------------------------------------
/recipes/Horovod/Horovod/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 2,
4 |         "jobPreparation": {
5 |             "commandLine": "apt update; apt install mpi-default-dev mpi-default-bin -y; pip install horovod"
6 |         },
7 |         "customToolkitSettings": {
8 |             "commandLine": "mpirun -mca btl_tcp_if_exclude docker0,lo --allow-run-as-root --hostfile $AZ_BATCHAI_MPI_HOST_FILE python $AZ_BATCHAI_INPUT_SCRIPTS/tensorflow_mnist.py"
9 |         },
10 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
11 |         "outputDirectories": [
12 |             {
13 |                 "createNew": true,
14 |                 "id": "MODEL",
15 |                 "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
16 |                 "pathSuffix": "Models",
17 |                 "type": "custom"
18 |             }
19 |         ],
20 |         "inputDirectories": [{
21 |             "id": "SCRIPTS",
22 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/horovod_samples"
23 |         }
24 |         ],
25 |         "containerSettings": {
26 |             "imageSourceRegistry": {
27 |                 "image": "tensorflow/tensorflow:1.1.0-gpu"
28 |             }
29 |         }
30 |     }
31 | }
32 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-BrainScript-Distributed/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 2,
4 |         "cntkSettings": {
5 |             "configFilePath": "$AZ_BATCHAI_INPUT_CONFIG/DistributedConvNet_MNIST.cntk",
6 |             "commandLineArgs": "rootDir=. dataDir=$AZ_BATCHAI_INPUT_DATASET outputDir=$AZ_BATCHAI_OUTPUT_MODEL parallelTrain=true"
7 |         },
8 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
9 |         "inputDirectories": [{
10 |             "id": "DATASET",
11 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/mnist_database"
12 |         }, {
13 |             "id": "CONFIG",
14 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/cntk_samples"
15 |         }],
16 |         "outputDirectories": [{
17 |             "id": "MODEL",
18 |             "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
19 |             "pathSuffix": "Models"
20 |         }],
21 |         "containerSettings": {
22 |             "imageSourceRegistry": {
23 |                 "image": "microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0"
24 |             }
25 |         }
26 |     }
27 | }
28 | 
--------------------------------------------------------------------------------
/recipes/Horovod/Readme.md:
--------------------------------------------------------------------------------
1 | # Horovod
2 | 
3 | Horovod is a distributed training framework for TensorFlow. The goal of Horovod is to make distributed Deep Learning
4 | fast and easy to use.
5 | 
6 | See official [Horovod GitHub page](https://github.com/uber/horovod).
7 | 
8 | #### [Horovod](./Horovod)
9 | 
10 | This Horovod recipe contains information on how to run a Horovod training job on a GPU cluster with Batch AI.
11 | 
12 | #### [Horovod-Infiniband-Benchmark](./Horovod-Infiniband-Benchmark)
13 | 
14 | This Horovod-Infiniband-Benchmark recipe contains information on how to reproduce [Horovod distributed training benchmarks](https://github.com/uber/horovod/blob/master/docs/benchmarks.md) with Infiniband support using Batch AI.
15 | 
16 | 
17 | ## Help or Feedback
18 | --------------------
19 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
20 | 
21 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
22 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-Python-Distributed/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 2,
4 |         "cntkSettings": {
5 |             "pythonScriptFilePath": "$AZ_BATCHAI_INPUT_SCRIPT/ConvNet_CIFAR10_DataAug_Distributed.py",
6 |             "commandLineArgs": "--datadir $AZ_BATCHAI_JOB_TEMP -outputdir $AZ_BATCHAI_OUTPUT_MODEL -n 5",
7 |             "processCount": 2
8 |         },
9 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
10 |         "inputDirectories": [{
11 |             "id": "SCRIPT",
12 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/cntk_samples"
13 |         }],
14 |         "outputDirectories": [{
15 |             "id": "MODEL",
16 |             "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
17 |             "pathSuffix": "Models"
18 |         }],
19 |         "containerSettings": {
20 |             "imageSourceRegistry": {
21 |                 "image": "microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0"
22 |             }
23 |         },
24 |         "jobPreparation": {
25 |             "commandLine": "bash $AZ_BATCHAI_INPUT_SCRIPT/CIFA-10_data_prepare.sh"
26 |         }
27 |     }
28 | }
29 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-Python-Distrbuted-Infiniband/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 2,
4 |         "cntkSettings": {
5 |             "pythonScriptFilePath": "$AZ_BATCHAI_INPUT_SCRIPT/TrainResNet_CIFAR10_Distributed.py",
6 |             "commandLineArgs": "--datadir $AZ_BATCHAI_JOB_TEMP -outputdir $AZ_BATCHAI_OUTPUT_MODEL -n resnet110 -e 5",
7 |             "processCount": 8
8 |         },
9 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
10 |         "inputDirectories": [{
11 |             "id": "SCRIPT",
12 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/cntk_samples"
13 |         }],
14 |         "outputDirectories": [{
15 |             "id": "MODEL",
16 |             "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
17 |             "pathSuffix": ""
18 |         }],
19 |         "containerSettings": {
20 |             "imageSourceRegistry": {
21 |                 "image": "batchaitraining/cntk:2.3-gpu-1bitsgd-py36-cuda8-cudnn6-intelmpi"
22 |             }
23 |         },
24 |         "jobPreparation": {
25 |             "commandLine": "bash $AZ_BATCHAI_INPUT_SCRIPT/jobprep_cntk_distributed_ib.sh"
26 |         }
27 |     }
28 | }
29 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) Microsoft Corporation. All rights reserved.
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE
22 | 
--------------------------------------------------------------------------------
/recipes/TensorFlow/TensorFlow-GPU/Readme.md:
--------------------------------------------------------------------------------
1 | # TensorFlow GPU
2 | 
3 | This example demonstrates how to run the standard TensorFlow sample (https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py) on an Azure Batch AI cluster of one node.
4 | 
5 | ## Details
6 | 
7 | - For demonstration purposes, the official convolutional.py will be deployed at Azure File Share;
8 | - Standard output of the job will be stored on Azure File Share;
9 | 
10 | 
11 | ## Instructions to Run Recipe
12 | 
13 | ### Jupyter Notebook
14 | 
15 | You can find Jupyter Notebook for this recipe in [TensorFlow-GPU.ipynb](./TensorFlow-GPU.ipynb).
16 | 
17 | ### Azure CLI 2.0
18 | 
19 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
20 | 
21 | ## License Notice
22 | 
23 | Under construction...
24 | 
25 | ## Help or Feedback
26 | --------------------
27 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
28 | 
29 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
30 | 
--------------------------------------------------------------------------------
/recipes/Horovod/Horovod-Infiniband-Benchmark/jobprep_benchmark.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/bash
2 | apt-get update -y
3 | apt-get install -y -q -o Dpkg::Options::="--force-confold" --no-install-recommends cpio libdapl2 libmlx4-1 libsm6 libxext6 wget git
4 | 
5 | # download the benchmark scripts
6 | cd $AZ_BATCHAI_JOB_TEMP
7 | git clone https://github.com/alsrgv/benchmarks
8 | cd benchmarks
9 | git checkout horovod_v2
10 | 
11 | # install intel MPI
12 | cd /tmp
13 | wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz'
14 | tar zxvf l_mpi_2017.3.196.tgz
15 | sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg
16 | sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' /tmp/l_mpi_2017.3.196/silent.cfg
17 | sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg
18 | cd /tmp/l_mpi_2017.3.196
19 | ./install.sh -s silent.cfg
20 | cd ..
21 | rm -rf l_mpi_2017.3.196*
22 | echo "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" >> ~/.bashrc
23 | 
24 | # install horovod
25 | source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh
26 | pip install horovod
--------------------------------------------------------------------------------
/recipes/TensorFlow/Readme.md:
--------------------------------------------------------------------------------
1 | # TensorFlow
2 | 
3 | TensorFlow is a Python-based open source software library for numerical computation using data flow graphs. The graph nodes represent mathematical operations, while the graph edges represent the multidimensional data arrays (tensors) that flow between them. This flexible architecture lets you deploy computation to one or more CPUs or GPUs in a desktop, server, or mobile device without rewriting code. TensorFlow also includes TensorBoard, a data visualization toolkit.
4 | 
5 | See official TensorFlow GitHub page (https://github.com/tensorflow/tensorflow).
6 | 
7 | #### [TensorFlow-GPU-Distributed](./TensorFlow-GPU-Distributed)
8 | This TensorFlow-GPU-Distributed recipe contains information on how to run a distributed TensorFlow job across multiple GPU nodes with BatchAI.
9 | 
10 | #### [TensorFlow-GPU](./TensorFlow-GPU)
11 | This TensorFlow-GPU recipe contains information on how to run a TensorFlow job on a GPU node with BatchAI.
12 | 
13 | 
14 | ## Help or Feedback
15 | --------------------
16 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
17 | 
18 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
19 | 
--------------------------------------------------------------------------------
/recipes/Keras/Keras-DSVM/Readme.md:
--------------------------------------------------------------------------------
1 | # Keras DSVM
2 | 
3 | This recipe shows how to run Keras using Batch AI on DSVM. DSVM supports the tensorflow, cntk, and theano backends for running Keras. Currently only the tensorflow and cntk backends support running on GPU.
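
As a quick illustration of how that backend switch works: Keras reads the `KERAS_BACKEND` environment variable when it is first imported, which is why the recipe's job.json simply prefixes the command line with `KERAS_BACKEND=cntk`. A minimal sketch (illustrative, not one of the recipe's files):

```python
# Minimal sketch: pick the Keras backend before the first keras import.
# Setting the variable any later has no effect, because the backend is
# chosen at import time -- the recipe's job.json achieves the same thing
# by prefixing the command line with KERAS_BACKEND=cntk.
import os
os.environ['KERAS_BACKEND'] = 'cntk'  # or 'tensorflow'

from keras import backend as K
print('Keras is running on:', K.backend())  # -> 'cntk'
```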
4 | 
5 | ## Details
6 | 
7 | - DSVM has the Keras framework preinstalled;
8 | - The standard Keras sample script [mnist_cnn.py](https://raw.githubusercontent.com/fchollet/keras/master/examples/mnist_cnn.py) is used;
9 | - The script downloads the standard MNIST Database on its own;
10 | - Standard output of the job will be stored on Azure File Share.
11 | 
12 | ## Instructions to Run Recipe
13 | 
14 | ### Python Jupyter Notebook
15 | 
16 | You can find Jupyter Notebook for this recipe in [Keras-DSVM.ipynb](./Keras-DSVM.ipynb).
17 | 
18 | ### Azure CLI 2.0
19 | 
20 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
21 | 
22 | ## License Notice
23 | 
24 | Under construction...
25 | 
26 | ## Help or Feedback
27 | --------------------
28 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
29 | 
30 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
31 | 
--------------------------------------------------------------------------------
/recipes/Chainer/README.md:
--------------------------------------------------------------------------------
1 | # Chainer
2 | 
3 | Chainer is a Python-based deep learning framework aiming at flexibility. It provides automatic differentiation APIs based on the define-by-run approach (a.k.a. dynamic computational graphs) as well as object-oriented high-level APIs to build and train neural networks. It also supports CUDA/cuDNN using CuPy for high performance training and inference. For more details of Chainer, see the official documentation and join the community in Forum, Slack, and Twitter.
4 | 
5 | ChainerMN is an additional package for Chainer, a flexible deep learning framework. ChainerMN enables multi-node distributed deep learning.
6 | 
7 | See official GitHub pages for Chainer (https://github.com/chainer/chainer) and ChainerMN (https://github.com/chainer/chainermn)
8 | 
9 | #### [Chainer-GPU-Distributed](./Chainer-GPU-Distributed)
10 | This Chainer-GPU-Distributed recipe contains information on how to run a distributed Chainer training job across multiple GPU nodes with BatchAI.
11 | 
12 | ## Help or Feedback
13 | --------------------
14 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
15 | 
16 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
17 | 
--------------------------------------------------------------------------------
/recipes/Caffe2/Caffe2-GPU-Distributed/Readme.md:
--------------------------------------------------------------------------------
1 | # Caffe2 GPU Distributed
2 | 
3 | This example demonstrates how to run the standard Caffe2 resnet50_trainer.py example using Batch AI. You can run it on a single node or on multiple compute nodes.
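
The distributed shards have no parameter server to find each other through; instead they rendezvous via temp files on a shared NFS mount (see Details below). Conceptually, the coordination step looks like the following standard-library sketch (illustrative only, not Caffe2 API):

```python
# Conceptual illustration of file-based rendezvous on a shared NFS directory:
# every shard drops a marker file, then waits until all shards have checked in.
import os
import time

def rendezvous(nfs_dir, shard_id, num_shards):
    open(os.path.join(nfs_dir, 'shard_%d.ready' % shard_id), 'w').close()
    while len([f for f in os.listdir(nfs_dir) if f.endswith('.ready')]) < num_shards:
        time.sleep(1)  # poll until every shard has registered
```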
4 | 
5 | ## Details
6 | 
7 | - The standard Caffe2 sample script [resnet50_trainer.py](https://github.com/caffe2/caffe2/blob/master/caffe2/python/examples/resnet50_trainer.py) is used;
8 | - The MNIST Dataset has been translated into an lmdb database, and can be obtained at http://download.caffe2.ai/databases/mnist-lmdb.zip;
9 | - NFS will be used for rendezvous temp files to coordinate between each shard/node;
10 | - Standard output of the job will be stored on Azure File Share.
11 | 
12 | 
13 | ## Instructions to Run Recipe
14 | 
15 | ### Python Jupyter Notebook
16 | 
17 | You can find Jupyter Notebook for this sample in [Caffe2-GPU-Distributed.ipynb](./Caffe2-GPU-Distributed.ipynb).
18 | 
19 | ### Azure CLI 2.0
20 | 
21 | Under Construction...
22 | 
23 | ## License Notice
24 | 
25 | Under construction...
26 | 
27 | ## Help or Feedback
28 | --------------------
29 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
30 | 
31 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
32 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-BrainScript/Readme.md:
--------------------------------------------------------------------------------
1 | # BrainScript CNTK GPU
2 | 
3 | This example uses the MNIST dataset to demonstrate how to train a convolutional neural network (CNN) on an Azure Batch AI cluster of one node.
4 | 
5 | ## Details
6 | 
7 | - For demonstration purposes, the MNIST dataset and ConvNet_MNIST.cntk will be deployed at Azure File Share;
8 | - Standard output of the job and the model will be stored on Azure File Share;
9 | - The MNIST dataset (http://yann.lecun.com/exdb/mnist/) has been preprocessed using install_mnist.py, available [here](https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D).
10 | - The ConvNet_MNIST.cntk config file is available [here](./ConvNet_MNIST.cntk).
11 | 
12 | ## Instructions to Run Recipe
13 | 
14 | ### Jupyter Notebook
15 | 
16 | You can find Jupyter Notebook for this recipe in [CNTK-GPU-BrainScript.ipynb](./CNTK-GPU-BrainScript.ipynb).
17 | 
18 | ### Azure CLI 2.0
19 | 
20 | You can find Azure CLI instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
21 | 
22 | ## License Notice
23 | 
24 | Under construction...
25 | 
26 | ## Help or Feedback
27 | --------------------
28 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
29 | 
30 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
31 | 
--------------------------------------------------------------------------------
/recipes/Chainer/Chainer-GPU-Distributed/Readme.md:
--------------------------------------------------------------------------------
1 | # Chainer GPU Distributed
2 | 
3 | This example demonstrates how to run the standard Chainer [train_mnist.py](https://github.com/chainer/chainer/blob/master/examples/mnist/train_mnist.py) example using Batch AI.
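
Distributed execution is what ChainerMN adds on top of plain Chainer. A minimal sketch of the pattern such a training script follows (illustrative; `make_model()` is a placeholder, not a function from the recipe):

```python
# Minimal ChainerMN sketch: one MPI process per GPU, gradients averaged
# across all processes by the multi-node optimizer.
import chainer
import chainermn

comm = chainermn.create_communicator()
device = comm.intra_rank  # local GPU index for this process

model = make_model()  # placeholder for the network definition
optimizer = chainermn.create_multi_node_optimizer(chainer.optimizers.Adam(), comm)
optimizer.setup(model)

# Rank 0 loads MNIST once; scatter_dataset splits it across all workers.
if comm.rank == 0:
    train, test = chainer.datasets.get_mnist()
else:
    train, test = None, None
train = chainermn.scatter_dataset(train, comm)
```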
4 | 
5 | ## Details
6 | 
7 | - The batchaitraining/chainermn:openMPI docker image is used (see job.json above);
8 | - The standard chainer sample script [train_mnist.py](https://github.com/chainer/chainer/blob/master/examples/mnist/train_mnist.py) is used;
9 | - Chainer downloads the standard MNIST Database on its own;
10 | - Standard output of the job and the model will be stored on Azure File Share;
11 | 
12 | ## Instructions to Run Recipe
13 | 
14 | ### Python Jupyter Notebook
15 | 
16 | You can find Jupyter Notebook for this recipe in [Chainer-GPU-Distributed.ipynb](./Chainer-GPU-Distributed.ipynb).
17 | 
18 | ### Azure CLI 2.0
19 | 
20 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
21 | 
22 | ## Dockerfile
23 | 
24 | The `Dockerfile` for the Docker image used in this recipe can be found [here](./docker/dockerfile). The dockerfile is a modified version of the ChainerMN example at https://github.com/chainer/chainermn/pull/71
25 | 
26 | ## License Notice
27 | 
28 | Under construction...
29 | 
30 | ## Help or Feedback
31 | --------------------
32 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
33 | 
34 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
35 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-Python/Readme.md:
--------------------------------------------------------------------------------
1 | # Python CNTK GPU
2 | 
3 | This example uses the MNIST dataset to demonstrate how to train a convolutional neural network (CNN) on an Azure Batch AI cluster of one node.
4 | 
5 | ## Details
6 | 
7 | - For demonstration purposes, the MNIST dataset and ConvNet_MNIST.py will be deployed at Azure File Share;
8 | - Standard output of the job and the model will be stored on Azure File Share;
9 | - The MNIST dataset (http://yann.lecun.com/exdb/mnist/) has been preprocessed using install_mnist.py, available [here](https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D).
10 | - The original CNTK example (https://github.com/Microsoft/CNTK/blob/master/Examples/Image/Classification/ConvNet/Python/ConvNet_MNIST.py) has been modified to accept the CNTK dataset and model locations via command line arguments, and is available [here](./ConvNet_MNIST.py).
11 | 
12 | ## Instructions to Run Recipe
13 | 
14 | ### Jupyter Notebook
15 | 
16 | You can find Jupyter Notebook for this recipe in [CNTK-GPU-Python.ipynb](./CNTK-GPU-Python.ipynb).
17 | 
18 | ### Azure CLI 2.0
19 | 
20 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
21 | 
22 | ## License Notice
23 | 
24 | Under construction...
25 | 
26 | ## Help or Feedback
27 | --------------------
28 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
29 | 
30 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
31 | 
--------------------------------------------------------------------------------
/recipes/TensorFlow/TensorFlow-GPU-Distributed/job.json:
--------------------------------------------------------------------------------
1 | {
2 |     "properties": {
3 |         "nodeCount": 2,
4 |         "tensorFlowSettings": {
5 |             "parameterServerCount": 1,
6 |             "workerCount": 2,
7 |             "pythonScriptFilePath": "$AZ_BATCHAI_INPUT_SCRIPT/mnist_replica.py",
8 |             "masterCommandLineArgs": "--job_name=worker --num_gpus=1 --ps_hosts=$AZ_BATCHAI_PS_HOSTS --worker_hosts=$AZ_BATCHAI_WORKER_HOSTS --task_index=$AZ_BATCHAI_TASK_INDEX --data_dir=$AZ_BATCHAI_INPUT_DATASET --output_dir=$AZ_BATCHAI_OUTPUT_MODEL",
9 |             "workerCommandLineArgs": "--job_name=worker --num_gpus=1 --ps_hosts=$AZ_BATCHAI_PS_HOSTS --worker_hosts=$AZ_BATCHAI_WORKER_HOSTS --task_index=$AZ_BATCHAI_TASK_INDEX --data_dir=$AZ_BATCHAI_INPUT_DATASET --output_dir=$AZ_BATCHAI_OUTPUT_MODEL",
10 |             "parameterServerCommandLineArgs": "--job_name=ps --num_gpus=0 --ps_hosts=$AZ_BATCHAI_PS_HOSTS --worker_hosts=$AZ_BATCHAI_WORKER_HOSTS --task_index=$AZ_BATCHAI_TASK_INDEX --data_dir=$AZ_BATCHAI_INPUT_DATASET --output_dir=$AZ_BATCHAI_OUTPUT_MODEL"
11 |         },
12 |         "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
13 |         "inputDirectories": [{
14 |             "id": "DATASET",
15 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/mnist_dataset"
16 |         }, {
17 |             "id": "SCRIPT",
18 |             "path": "$AZ_BATCHAI_MOUNT_ROOT/external/tensorflow_samples"
19 |         }],
20 |         "outputDirectories": [{
21 |             "id": "MODEL",
22 |             "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/external",
23 |             "pathSuffix": "Models"
24 |         }],
25 |         "containerSettings": {
26 |             "imageSourceRegistry": {
27 |                 "image": "tensorflow/tensorflow:1.1.0-gpu"
28 |             }
29 |         }
30 |     }
31 | }
32 | 
--------------------------------------------------------------------------------
/recipes/Horovod/Horovod/Readme.md:
--------------------------------------------------------------------------------
1 | # Horovod
2 | 
3 | This recipe shows how to run the [Horovod](https://github.com/uber/horovod) distributed training framework using Batch AI.
4 | 
5 | Currently Batch AI has no native support for the Horovod framework, but it's easy to run it using the Batch AI custom toolkit.
6 | 
7 | 
8 | ## Details
9 | 
10 | - The standard Horovod [tensorflow_mnist.py](https://github.com/uber/horovod/blob/v0.9.10/examples/tensorflow_mnist.py) example will be used (a sketch of its Horovod setup follows this list);
11 | - tensorflow_mnist.py downloads training data on its own during execution;
12 | - The job will be run on the standard tensorflow container ```tensorflow/tensorflow:1.1.0-gpu```. You can run the same job directly on GPU nodes by choosing Ubuntu DSVM as an image and removing
13 | container settings from the job definition.
14 | - The Horovod framework will be installed in the container using the job preparation command line. Note that you can build your own docker image containing tensorflow and horovod instead.
15 | - Standard output of the job will be stored on Azure File Share.
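
For reference, the changes Horovod requires in a TensorFlow script are small. A minimal sketch of the pattern tensorflow_mnist.py follows (assumed from the standard Horovod examples, not a verbatim excerpt):

```python
# Sketch of the standard Horovod/TensorFlow integration pattern.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()  # one process per GPU, launched by mpirun

# Pin each process to a single GPU based on its local rank.
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())

# Scale the learning rate by the number of workers and wrap the optimizer
# so gradients are averaged across all processes with MPI allreduce.
opt = tf.train.AdamOptimizer(0.001 * hvd.size())
opt = hvd.DistributedOptimizer(opt)

# Broadcast initial variable states from rank 0 to all other processes.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]
```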
16 | 
17 | ## Instructions to Run Recipe
18 | 
19 | ### Python Jupyter Notebook
20 | 
21 | You can find Jupyter Notebook for this recipe in [Horovod.ipynb](./Horovod.ipynb).
22 | 
23 | ### Azure CLI 2.0
24 | 
25 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
26 | 
27 | ## License Notice
28 | 
29 | Under construction...
30 | 
31 | ## Help or Feedback
32 | --------------------
33 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
34 | 
35 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
36 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-Python-Distrbuted-Infiniband/jobprep_cntk_distributed_ib.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/bash
2 | 
3 | # Download the CIFAR-10 dataset from Azure Blob
4 | if [ ! -z $AZ_BATCHAI_JOB_TEMP ];then
5 |     cd $AZ_BATCHAI_JOB_TEMP
6 |     wget 'https://batchaisamples.blob.core.windows.net/samples/CIFAR-10_dataset.tar?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=b&sig=nFXsAp0Eq%2BoS5%2BKAEPnfyEGlCkBcKIadDvCPA%2BcX6lU%3D' -k -O 'CIFAR-10_dataset.tar'
7 |     echo "untar CIFAR-10 dataset........."
8 |     tar -xf CIFAR-10_dataset.tar
9 |     echo "done"
10 |     ROOT_DIR=`pwd`
11 |     files=( "train_map.txt" "test_map.txt" )
12 |     for file in "${files[@]}"
13 |     do
14 |         output=$ROOT_DIR"/"$file
15 |         if [ -f $output ];then
16 |             rm $output
17 |         fi
18 |         touch $output
19 |         while read -r line
20 |         do
21 |             name="$line"
22 |             echo "$ROOT_DIR$name" >> $output
23 |         done < $file".template"
24 |     done
25 | fi
26 | 
27 | # install intel MPI if Infiniband is used
28 | if [ -d /dev/infiniband ];then
29 |     cd /tmp
30 |     wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz'
31 |     tar zxvf l_mpi_2017.3.196.tgz
32 |     sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg
33 |     sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' /tmp/l_mpi_2017.3.196/silent.cfg
34 |     sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg
35 |     cd /tmp/l_mpi_2017.3.196
36 |     ./install.sh -s silent.cfg
37 |     cd ..
38 |     rm -rf l_mpi_2017.3.196*
39 |     echo "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" >> ~/.bashrc
40 | fi
41 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-Python-Distributed/Readme.md:
--------------------------------------------------------------------------------
1 | # Python CNTK GPU Distributed
2 | 
3 | This example uses the CIFAR-10 dataset to demonstrate how to train a convolutional neural network (CNN) on a multi-node multi-GPU cluster. You can run this recipe on a single node or on multiple nodes.
4 | 
5 | ## Details
6 | 
7 | - For demonstration purposes, the CIFAR-10 data preparation script and ConvNet_CIFAR10_DataAug_Distributed.py with its dependencies will be deployed at Azure File Share;
8 | - Standard output of the job and the model will be stored on Azure File Share;
9 | - The CIFAR-10 dataset (http://www.cs.toronto.edu/~kriz/cifar.html) has been preprocessed and is available [here](https://batchaisamples.blob.core.windows.net/samples/CIFAR-10_dataset.tar?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=b&sig=nFXsAp0Eq%2BoS5%2BKAEPnfyEGlCkBcKIadDvCPA%2BcX6lU%3D).
10 | - The official CNTK example ConvNet_CIFAR10_DataAug_Distributed.py (https://github.com/Microsoft/CNTK/blob/master/Examples/Image/Classification/ConvNet/Python/ConvNet_CIFAR10_DataAug_Distributed.py) is used.
11 | 
12 | 
13 | ## Instructions to Run Recipe
14 | 
15 | ### Python Jupyter Notebook
16 | 
17 | You can find Jupyter Notebook for this recipe in [CNTK-GPU-Python-Distrbuted.ipynb](./CNTK-GPU-Python-Distrbuted.ipynb).
18 | 
19 | ### Azure CLI 2.0
20 | 
21 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
22 | 
23 | ## License Notice
24 | 
25 | Under construction...
26 | 
27 | ## Help or Feedback
28 | --------------------
29 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
30 | 
31 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
32 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-BrainScript-Distributed/Readme.md:
--------------------------------------------------------------------------------
1 | # BrainScript CNTK GPU Distributed
2 | 
3 | This example uses the MNIST dataset to demonstrate how to train a convolutional neural network (CNN) on a GPU cluster. You can run this recipe on a single node or on multiple nodes.
4 | 
5 | ## Details
6 | 
7 | - For demonstration purposes, the MNIST dataset and ConvNet_MNIST.cntk will be deployed at Azure File Share;
8 | - Standard output of the job and the model will be stored on Azure File Share;
9 | - The MNIST dataset (http://yann.lecun.com/exdb/mnist/) has been preprocessed using install_mnist.py, available [here](https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D).
10 | - The ConvNet_MNIST.cntk config file has been modified from the official cntk sample (https://raw.githubusercontent.com/Microsoft/CNTK/master/Examples/Image/Classification/ConvNet/BrainScript/ConvNet_MNIST.cntk) for distributed training, and is available [here](./ConvNet_MNIST.cntk).
11 | 
12 | ## Instructions to Run Recipe
13 | 
14 | ### Jupyter Notebook
15 | 
16 | You can find Jupyter Notebook for this recipe in [CNTK-GPU-BrainScript-Distributed.ipynb](./CNTK-GPU-BrainScript-Distributed.ipynb).
17 | 
18 | ### Azure CLI 2.0
19 | 
20 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
21 | 
22 | ## License Notice
23 | 
24 | Under construction...
25 | 
26 | ## Help or Feedback
27 | --------------------
28 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
29 | 
30 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
31 | 
--------------------------------------------------------------------------------
/recipes/Caffe/Caffe-GPU/Readme.md:
--------------------------------------------------------------------------------
1 | # Caffe GPU
2 | 
3 | This example demonstrates how to run the standard Caffe lenet_solver.prototxt example using Batch AI. This recipe runs on a single GPU node.
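
The one subtlety of this recipe is that prototxt files cannot read environment variables, so a job preparation script expands them into the templates up front (see Details below). The sketch shows the idea in Python with `os.path.expandvars`; the mount path is a made-up example value, and the recipe itself does the same substitution with `sed`:

```python
# Illustrative only: what preparation_script.sh achieves with sed.
import os

# Hypothetical value; Batch AI sets this variable on the compute node.
os.environ['AZ_BATCHAI_INPUT_SAMPLE'] = '/mnt/batchai/inputs/sample'

template = 'net: "$AZ_BATCHAI_INPUT_SAMPLE/lenet_train_test.prototxt"'
print(os.path.expandvars(template))
# -> net: "/mnt/batchai/inputs/sample/lenet_train_test.prototxt"
```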
4 | 
5 | ## Details
6 | 
7 | - For demonstration purposes, the MNIST dataset and caffe configuration file will be deployed at Azure File Share;
8 | - Standard output of the job and the model will be stored on Azure File Share;
9 | - The MNIST dataset has been preprocessed according to http://caffe.berkeleyvision.org/gathered/examples/mnist.html and is available [here](https://batchaisamples.blob.core.windows.net/samples/mnist_lmdb.zip?st=2017-10-06T00%3A15%3A00Z&se=2100-01-01T00%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=b&sig=jKlQA8x190lLGDXloeHrSe6jpOtUEYLD1DRoyWuiAdQ%3D).
10 | - The original Caffe solver and net prototxt files have been modified to take the environment variables AZ_BATCHAI_INPUT_SAMPLE and AZ_BATCHAI_OUTPUT_MODEL, and are available here: lenet_solver.prototxt and lenet_train_test.prototxt.
11 | - Since prototxt files support neither command-line overloading nor environment variables, we use the job preparation task preparation_script.sh to expand the environment variables specified in the files, providing more flexibility in the job setup.
12 | 
13 | 
14 | ## Instructions to Run Recipe
15 | 
16 | ### Python Jupyter Notebook
17 | 
18 | You can find Jupyter Notebook for this sample in [Caffe-GPU.ipynb](./Caffe-GPU.ipynb).
19 | 
20 | ### Azure CLI 2.0
21 | 
22 | Under Construction...
23 | 
24 | ## License Notice
25 | 
26 | Under construction...
27 | 
28 | ## Help or Feedback
29 | --------------------
30 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
31 | 
32 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
33 | 
--------------------------------------------------------------------------------
/recipes/TensorFlow/TensorFlow-GPU-Distributed/Readme.md:
--------------------------------------------------------------------------------
1 | # TensorFlow Distributed GPU
2 | 
3 | ## Introduction
4 | 
5 | This example demonstrates how to run the standard TensorFlow sample (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dist_test/python/mnist_replica.py) on an Azure Batch AI cluster of 2 nodes.
6 | 
7 | ## Details
8 | 
9 | - For demonstration purposes, the MNIST dataset and `mnist_replica.py` will be deployed at Azure File Share;
10 | - Standard output of the job will be stored on Azure File Share;
11 | - The MNIST dataset (http://yann.lecun.com/exdb/mnist/) is archived and uploaded into the [blob](https://batchaisamples.blob.core.windows.net/samples/mnist_dataset_original.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=b&sig=Qc1RA3zsXIP4oeioXutkL1PXIrHJO0pHJlppS2rID3I%3D).
12 | - The recipe uses the official `mnist_replica.py` (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dist_test/python/mnist_replica.py); a sketch of its cluster setup follows this list.
13 | - Please refer to the [official tutorial](https://www.tensorflow.org/deploy/distributed) on distributed tensorflow training.
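
Batch AI generates `$AZ_BATCHAI_PS_HOSTS`, `$AZ_BATCHAI_WORKER_HOSTS`, and `$AZ_BATCHAI_TASK_INDEX` for each task, and the job.json above passes them to the script as `--ps_hosts`, `--worker_hosts`, and `--task_index`. A minimal sketch of how mnist_replica.py consumes them (standard TF 1.x between-graph replication; the host addresses are made-up example values):

```python
# Sketch of the standard distributed TensorFlow setup in TF 1.x.
import tensorflow as tf

ps_hosts = '10.0.0.4:2222'.split(',')                     # from --ps_hosts
worker_hosts = '10.0.0.5:2222,10.0.0.6:2222'.split(',')   # from --worker_hosts

cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
server = tf.train.Server(cluster, job_name='worker', task_index=0)

# Parameter-server tasks call server.join(); worker tasks build the model
# under tf.train.replica_device_setter(cluster=cluster) and run training.
```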
14 | 
15 | ## Instructions to Run Recipe
16 | 
17 | ### Python Jupyter Notebook
18 | 
19 | You can find Jupyter Notebook for this recipe in [TensorFlow-GPU-Distributed.ipynb](./TensorFlow-GPU-Distributed.ipynb).
20 | 
21 | ### Azure CLI 2.0
22 | 
23 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
24 | 
25 | ## License Notice
26 | 
27 | Under construction...
28 | 
29 | ## Help or Feedback
30 | --------------------
31 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
32 | 
33 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
34 | 
--------------------------------------------------------------------------------
/recipes/Horovod/Horovod-Infiniband-Benchmark/Readme.md:
--------------------------------------------------------------------------------
1 | # Horovod-Infiniband-Benchmark
2 | 
3 | This recipe shows how to reproduce [Horovod distributed training benchmarks](https://github.com/uber/horovod/blob/master/docs/benchmarks.md) using Azure Batch AI.
4 | 
5 | Currently Batch AI has no native support for the Horovod framework, but it's easy to run it using the Batch AI custom toolkit.
6 | 
7 | 
8 | ## Details
9 | 
10 | - The official Horovod Benchmark [scripts](https://github.com/alsrgv/benchmarks/tree/master/scripts/tf_cnn_benchmarks) will be used;
11 | - The job will be run on the standard tensorflow container ```tensorflow/tensorflow:1.4.0-gpu```;
12 | - The Horovod framework and Intel MPI will be installed in the container using the job preparation command line. Note that you can build your own docker image containing tensorflow and horovod instead.
13 | - Benchmark scripts will be downloaded to the GPU nodes by the job preparation command line as well, and stored in `$AZ_BATCHAI_JOB_TEMP` on each node;
14 | - This sample needs to use at least two `STANDARD_NC24r` nodes; please be sure you have enough quota;
15 | - Standard output of the job will be stored on Azure File Share.
16 | - This recipe ONLY reproduces the training results with synthetic data on NVIDIA K80 GPUs.
17 | 
18 | 
19 | ## Instructions to Run Recipe
20 | 
21 | ### Python Jupyter Notebook
22 | 
23 | You can find Jupyter Notebook for this recipe in [Horovod-Infiniband-Benchmark.ipynb](./Horovod-Infiniband-benchmark.ipynb).
24 | 
25 | ### Azure CLI 2.0
26 | 
27 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
28 | 
29 | ## License Notice
30 | 
31 | Under construction...
32 | 
33 | ## Help or Feedback
34 | --------------------
35 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
36 | 
37 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
38 | 
--------------------------------------------------------------------------------
/recipes/CNTK/Readme.md:
--------------------------------------------------------------------------------
1 | # CNTK
2 | 
3 | The Microsoft Cognitive Toolkit (https://cntk.ai) is a unified deep-learning toolkit that describes neural networks as a series of computational steps via a directed graph. CNTK can be included as a library in your Python, C#, or C++ programs, or used as a standalone machine learning tool through its own model description language (BrainScript).
4 | 
5 | See official CNTK GitHub page (https://github.com/Microsoft/CNTK/).
6 | 
7 | #### [CNTK-GPU-Python](./CNTK-GPU-Python)
8 | This CNTK-GPU-Python recipe contains information on how to run a Python CNTK learning job on a GPU VM.
9 | 
10 | #### [CNTK-GPU-Python-Distributed](./CNTK-GPU-Python-Distributed)
11 | This CNTK-GPU-Python-Distributed recipe contains information on how to run a Python CNTK learning job on GPU VMs, including execution across multiple compute nodes and multiple GPUs.
12 | 
13 | #### [CNTK-GPU-Python-Distrbuted-Infiniband](./CNTK-GPU-Python-Distrbuted-Infiniband)
14 | This CNTK-GPU-Python-Distrbuted-Infiniband recipe contains information on how to run a Python CNTK learning job on GPU VMs, including execution across multiple compute nodes and multiple GPUs connected by Infiniband networks.
15 | 
16 | #### [CNTK-GPU-BrainScript](./CNTK-GPU-BrainScript)
17 | This CNTK-GPU-BrainScript recipe contains information on how to run a CNTK learning job on a GPU VM with a BrainScript configuration file.
18 | 
19 | #### [CNTK-GPU-BrainScript-Distributed](./CNTK-GPU-BrainScript-Distributed)
20 | This CNTK-GPU-BrainScript-Distributed recipe contains information on how to run a BrainScript CNTK learning job on GPU VMs, including execution across multiple compute nodes and multiple GPUs.
21 | 
22 | ## Help or Feedback
23 | --------------------
24 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
25 | 
26 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
27 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-Python-Distrbuted-Infiniband/Readme.md:
--------------------------------------------------------------------------------
1 | # Distributed CNTK with GPU and Infiniband
2 | 
3 | This example uses the CIFAR-10 dataset to demonstrate how to train a Residual network (ResNet) on a multi-node multi-GPU cluster with Infiniband.
4 | 
5 | ## Details
6 | 
7 | - The official CNTK ResNet for CIFAR10 [example](https://github.com/Microsoft/CNTK/tree/master/Examples/Image/Classification/ResNet/Python) is used.
8 | - The CIFAR-10 dataset (http://www.cs.toronto.edu/~kriz/cifar.html) has been preprocessed and is available in [Azure storage](https://batchaisamples.blob.core.windows.net/samples/CIFAR-10_dataset.tar?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=b&sig=nFXsAp0Eq%2BoS5%2BKAEPnfyEGlCkBcKIadDvCPA%2BcX6lU%3D); it will be downloaded to the GPU node's local SSD.
9 | - The job will be run on a prebuilt CNTK container ```batchaitraining/cntk:2.3-gpu-1bitsgd-py36-cuda8-cudnn6-intelmpi``` based on this [dockerfile](./dockerfile). The Intel MPI package will be installed in the container using the job preparation command line.
10 | - For demonstration purposes, the CIFAR-10 data preparation script and CNTK job scripts will be deployed at Azure File Share.
11 | - Standard output of the job and the model will be stored on Azure File Share.
12 | - This sample needs to use at least two STANDARD_NC24r nodes; please be sure you have enough quota.
13 | - If you would like to conduct a performance comparison with a TCP network, you can create the cluster with VM size `STANDARD_NC24`, which does not support Infiniband.
14 | 
15 | ## Instructions to Run Recipe
16 | 
17 | ### Python Jupyter Notebook
18 | 
19 | You can find Jupyter Notebook for this recipe in [CNTK-GPU-Python-Distrbuted-Infiniband.ipynb](./CNTK-GPU-Python-Distrbuted-Infiniband.ipynb).
20 | 
21 | ### Azure CLI 2.0
22 | 
23 | You can find Azure CLI 2.0 instructions for this recipe in [cli-instructions.md](./cli-instructions.md).
24 | 
25 | ## License Notice
26 | 
27 | Under construction...
28 | 
29 | ## Help or Feedback
30 | --------------------
31 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
32 | 
33 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
34 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Azure Batch AI
2 | 
3 | Welcome to our documentation page at https://docs.microsoft.com/azure/batch-ai
4 | 
5 | ## Updates
6 | 
7 | 11.15.2017 Java SDK is [available](https://mvnrepository.com/artifact/com.microsoft.azure/azure-mgmt-batchai)
8 | 
9 | 11.08.2017 Node.js SDK is [available](https://www.npmjs.com/package/azure-arm-batchai)
10 | 
11 | 10.11.2017 C# nuget package Microsoft.Azure.Management.BatchAI is available on nuget.org.
12 | 
13 | 10.09.2017 Azure BatchAI starts public preview on October 9th, 2017!
14 | 
15 | ## Batch AI Recipes
16 | 
17 | We have created [recipes](/recipes/) for popular AI frameworks to help you get started with Batch AI and submit jobs without being an expert on Azure compute, storage, and networking.
18 | 
19 | [Microsoft Cognitive Toolkit](/recipes/CNTK/)
20 | 
21 | [TensorFlow](/recipes/TensorFlow/)
22 | 
23 | [Chainer/ChainerMN](/recipes/Chainer/)
24 | 
25 | [Caffe](/recipes/Caffe/)
26 | 
27 | [Caffe2](/recipes/Caffe2/)
28 | 
29 | [Horovod](/recipes/Horovod)
30 | 
31 | [Custom Toolkit -- a Keras example](/recipes/Keras/)
32 | 
33 | ## Contributing
34 | 
35 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
36 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
37 | the rights to use your contribution. For details, visit https://cla.microsoft.com.
38 | 
39 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
40 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
41 | provided by the bot. You will only need to do this once across all repos using our CLA.
42 | 
43 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
44 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
45 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
46 | 
47 | ## Help or Feedback
48 | --------------------
49 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
50 | 
51 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
52 | 
--------------------------------------------------------------------------------
/recipes/Keras/Keras-DSVM/mnist_cnn.py:
--------------------------------------------------------------------------------
1 | '''Trains a simple convnet on the MNIST dataset.
2 | 3 | Gets to 99.25% test accuracy after 12 epochs 4 | (there is still a lot of margin for parameter tuning). 5 | 16 seconds per epoch on a GRID K520 GPU. 6 | ''' 7 | 8 | from __future__ import print_function 9 | import keras 10 | from keras.datasets import mnist 11 | from keras.models import Sequential 12 | from keras.layers import Dense, Dropout, Flatten 13 | from keras.layers import Conv2D, MaxPooling2D 14 | from keras import backend as K 15 | 16 | batch_size = 128 17 | num_classes = 10 18 | epochs = 12 19 | 20 | # input image dimensions 21 | img_rows, img_cols = 28, 28 22 | 23 | # the data, shuffled and split between train and test sets 24 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 25 | 26 | if K.image_data_format() == 'channels_first': 27 | x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) 28 | x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) 29 | input_shape = (1, img_rows, img_cols) 30 | else: 31 | x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) 32 | x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) 33 | input_shape = (img_rows, img_cols, 1) 34 | 35 | x_train = x_train.astype('float32') 36 | x_test = x_test.astype('float32') 37 | x_train /= 255 38 | x_test /= 255 39 | print('x_train shape:', x_train.shape) 40 | print(x_train.shape[0], 'train samples') 41 | print(x_test.shape[0], 'test samples') 42 | 43 | # convert class vectors to binary class matrices 44 | y_train = keras.utils.to_categorical(y_train, num_classes) 45 | y_test = keras.utils.to_categorical(y_test, num_classes) 46 | 47 | model = Sequential() 48 | model.add(Conv2D(32, kernel_size=(3, 3), 49 | activation='relu', 50 | input_shape=input_shape)) 51 | model.add(Conv2D(64, (3, 3), activation='relu')) 52 | model.add(MaxPooling2D(pool_size=(2, 2))) 53 | model.add(Dropout(0.25)) 54 | model.add(Flatten()) 55 | model.add(Dense(128, activation='relu')) 56 | model.add(Dropout(0.5)) 57 | model.add(Dense(num_classes, activation='softmax')) 58 | 59 | model.compile(loss=keras.losses.categorical_crossentropy, 60 | optimizer=keras.optimizers.Adadelta(), 61 | metrics=['accuracy']) 62 | 63 | model.fit(x_train, y_train, 64 | batch_size=batch_size, 65 | epochs=epochs, 66 | verbose=1, 67 | validation_data=(x_test, y_test)) 68 | score = model.evaluate(x_test, y_test, verbose=0) 69 | print('Test loss:', score[0]) 70 | print('Test accuracy:', score[1]) 71 | -------------------------------------------------------------------------------- /recipes/Chainer/Chainer-GPU-Distributed/docker/dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04 2 | RUN apt-get update 3 | 4 | 5 | # disable interactive functions 6 | ENV DEBIAN_FRONTEND noninteractive 7 | 8 | 9 | #################Install MiniConda and other dependencies########## 10 | ENV CONDA_DIR /opt/conda 11 | ENV PATH $CONDA_DIR/bin:$PATH 12 | ENV OPENBLAS_NUM_THREADS $(nproc) 13 | 14 | RUN mkdir -p $CONDA_DIR && \ 15 | echo export PATH=$CONDA_DIR/bin:'$PATH' > /etc/profile.d/conda.sh && \ 16 | 17 | apt-get update -y && \ 18 | apt-get install -y \ 19 | 20 | wget \ 21 | vim \ 22 | git \ 23 | g++ \ 24 | graphviz \ 25 | 26 | software-properties-common \ 27 | python-software-properties \ 28 | python3-dev \ 29 | 30 | libhdf5-dev \ 31 | libopenblas-dev \ 32 | liblapack-dev \ 33 | libblas-dev \ 34 | gfortran && \ 35 | 36 | rm -rf /var/lib/apt/lists/* && \ 37 | 38 | 39 | wget --quiet 
https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 40 | /bin/bash /Miniconda3-latest-Linux-x86_64.sh -f -b -p $CONDA_DIR && \ 41 | rm Miniconda3-latest-Linux-x86_64.sh 42 | 43 | 44 | #########################MPI########################### 45 | RUN cd /tmp && \ 46 | wget "https://www.open-mpi.org/software/ompi/v2.1/downloads/openmpi-2.1.1.tar.gz" && \ 47 | tar xzf openmpi-2.1.1.tar.gz && \ 48 | cd openmpi-2.1.1 && \ 49 | ./configure --with-cuda && make -j"$(nproc)" install # && ldconfig 50 | 51 | 52 | 53 | #######################NCCL########################### 54 | ENV CPATH /usr/local/cuda/include:/usr/local/include:$CPATH 55 | RUN cd /usr/local && git clone https://github.com/NVIDIA/nccl.git && cd nccl && \ 56 | 57 | ######### Compile for devices with cuda compute compatibility 3 (e.g. GRID K520 on aws) 58 | # UNCOMMENT line below to compile for GPUs with cuda compute compatibility 3.0 59 | # sed -i '/NVCC_GENCODE ?=/a \ -gencode=arch=compute_30,code=sm_30 \\' Makefile && \ 60 | ########## 61 | 62 | make CUDA_HOME=/usr/local/cuda -j"$(nproc)" && \ 63 | make install && ldconfig 64 | 65 | 66 | ####################Python 3######################### 67 | ARG python_version=3.5.2 68 | RUN conda install -y python=${python_version} && \ 69 | pip install -U pip && \ 70 | 71 | conda install Pillow scikit-learn notebook pandas matplotlib mkl nose pyyaml six h5py && \ 72 | 73 | 74 | pip install mpi4py && \ 75 | pip install cython && \ 76 | 77 | pip install chainer && \ 78 | pip install chainercv && \ 79 | pip install chainermn && \ 80 | 81 | conda clean -yt 82 | 83 | ENV PYTHONPATH $CONDA_DIR/lib/python3.5/site-packages/:$PYTHONPATH 84 | 85 | ###################################################### 86 | 87 | ENV PYTHONPATH /src/:$PYTHONPATH -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-BrainScript/ConvNet_MNIST.cntk: -------------------------------------------------------------------------------- 1 | # ConvNet on MNIST dataset. 2 | 3 | command = trainNetwork:testNetwork 4 | 5 | precision = "float"; traceLevel = 1 ; deviceId = "auto" 6 | 7 | rootDir = "../../.." 
; dataDir = "$rootDir$/DataSets/MNIST" ; 8 | outputDir = "./Output" ; 9 | 10 | modelPath = "$outputDir$/Models/ConvNet_MNIST" 11 | #stderr = "$outputDir$/ConvNet_MNIST_bs_out" 12 | 13 | # TRAINING CONFIG 14 | trainNetwork = { 15 | action = "train" 16 | 17 | BrainScriptNetworkBuilder = { 18 | imageShape = 28:28:1 # image dimensions, 1 channel only 19 | labelDim = 10 # number of distinct labels 20 | featScale = 1/256 21 | Scale{f} = x => Constant(f) .* x 22 | 23 | model = Sequential ( 24 | Scale {featScale} : 25 | ConvolutionalLayer {32, (5:5), pad = true} : ReLU : 26 | MaxPoolingLayer {(3:3), stride=(2:2)} : 27 | ConvolutionalLayer {48, (3:3), pad = false} : ReLU : 28 | MaxPoolingLayer {(3:3), stride=(2:2)} : 29 | ConvolutionalLayer {64, (3:3), pad = false} : ReLU : 30 | DenseLayer {96} : Dropout : ReLU : 31 | LinearLayer {labelDim} 32 | ) 33 | 34 | # inputs 35 | features = Input {imageShape} 36 | labels = Input {labelDim} 37 | 38 | # apply model to features 39 | ol = model (features) 40 | 41 | # loss and error computation 42 | ce = CrossEntropyWithSoftmax (labels, ol) 43 | errs = ClassificationError (labels, ol) 44 | 45 | # declare special nodes 46 | featureNodes = (features) 47 | labelNodes = (labels) 48 | criterionNodes = (ce) 49 | evaluationNodes = (errs) 50 | outputNodes = (ol) 51 | } 52 | 53 | SGD = { 54 | epochSize = 60000 55 | minibatchSize = 64 56 | maxEpochs = 40 57 | learningRatesPerSample = 0.001*10:0.0005*10:0.0001 58 | dropoutRate = 0.5 59 | momentumAsTimeConstant = 0*5:1024 60 | 61 | numMBsToShowResult = 500 62 | } 63 | 64 | reader = { 65 | readerType = "CNTKTextFormatReader" 66 | # See ../README.md for details on getting the data (Train-28x28_cntk_text.txt). 67 | file = "$DataDir$/Train-28x28_cntk_text.txt" 68 | randomize = true 69 | keepDataInMemory = true 70 | input = { 71 | features = { dim = 784 ; format = "dense" } 72 | labels = { dim = 10 ; format = "dense" } 73 | } 74 | } 75 | } 76 | 77 | # TEST CONFIG 78 | testNetwork = { 79 | action = test 80 | minibatchSize = 1024 # reduce this if you run out of memory 81 | 82 | reader = { 83 | readerType = "CNTKTextFormatReader" 84 | file = "$DataDir$/Test-28x28_cntk_text.txt" 85 | input = { 86 | features = { dim = 784 ; format = "dense" } 87 | labels = { dim = 10 ; format = "dense" } 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /recipes/Caffe/Caffe-GPU/lenet_train_test.prototxt: -------------------------------------------------------------------------------- 1 | name: "LeNet" 2 | layer { 3 | name: "mnist" 4 | type: "Data" 5 | top: "data" 6 | top: "label" 7 | include { 8 | phase: TRAIN 9 | } 10 | transform_param { 11 | scale: 0.00390625 12 | } 13 | data_param { 14 | source: "$AZ_BATCHAI_INPUT_SAMPLE/mnist_train_lmdb" 15 | batch_size: 64 16 | backend: LMDB 17 | } 18 | } 19 | layer { 20 | name: "mnist" 21 | type: "Data" 22 | top: "data" 23 | top: "label" 24 | include { 25 | phase: TEST 26 | } 27 | transform_param { 28 | scale: 0.00390625 29 | } 30 | data_param { 31 | source: "$AZ_BATCHAI_INPUT_SAMPLE/mnist_test_lmdb" 32 | batch_size: 100 33 | backend: LMDB 34 | } 35 | } 36 | layer { 37 | name: "conv1" 38 | type: "Convolution" 39 | bottom: "data" 40 | top: "conv1" 41 | param { 42 | lr_mult: 1 43 | } 44 | param { 45 | lr_mult: 2 46 | } 47 | convolution_param { 48 | num_output: 20 49 | kernel_size: 5 50 | stride: 1 51 | weight_filler { 52 | type: "xavier" 53 | } 54 | bias_filler { 55 | type: "constant" 56 | } 57 | } 58 | } 59 | layer { 60 | name: "pool1" 61 | type: "Pooling" 
62 | bottom: "conv1" 63 | top: "pool1" 64 | pooling_param { 65 | pool: MAX 66 | kernel_size: 2 67 | stride: 2 68 | } 69 | } 70 | layer { 71 | name: "conv2" 72 | type: "Convolution" 73 | bottom: "pool1" 74 | top: "conv2" 75 | param { 76 | lr_mult: 1 77 | } 78 | param { 79 | lr_mult: 2 80 | } 81 | convolution_param { 82 | num_output: 50 83 | kernel_size: 5 84 | stride: 1 85 | weight_filler { 86 | type: "xavier" 87 | } 88 | bias_filler { 89 | type: "constant" 90 | } 91 | } 92 | } 93 | layer { 94 | name: "pool2" 95 | type: "Pooling" 96 | bottom: "conv2" 97 | top: "pool2" 98 | pooling_param { 99 | pool: MAX 100 | kernel_size: 2 101 | stride: 2 102 | } 103 | } 104 | layer { 105 | name: "ip1" 106 | type: "InnerProduct" 107 | bottom: "pool2" 108 | top: "ip1" 109 | param { 110 | lr_mult: 1 111 | } 112 | param { 113 | lr_mult: 2 114 | } 115 | inner_product_param { 116 | num_output: 500 117 | weight_filler { 118 | type: "xavier" 119 | } 120 | bias_filler { 121 | type: "constant" 122 | } 123 | } 124 | } 125 | layer { 126 | name: "relu1" 127 | type: "ReLU" 128 | bottom: "ip1" 129 | top: "ip1" 130 | } 131 | layer { 132 | name: "ip2" 133 | type: "InnerProduct" 134 | bottom: "ip1" 135 | top: "ip2" 136 | param { 137 | lr_mult: 1 138 | } 139 | param { 140 | lr_mult: 2 141 | } 142 | inner_product_param { 143 | num_output: 10 144 | weight_filler { 145 | type: "xavier" 146 | } 147 | bias_filler { 148 | type: "constant" 149 | } 150 | } 151 | } 152 | layer { 153 | name: "accuracy" 154 | type: "Accuracy" 155 | bottom: "ip2" 156 | bottom: "label" 157 | top: "accuracy" 158 | include { 159 | phase: TEST 160 | } 161 | } 162 | layer { 163 | name: "loss" 164 | type: "SoftmaxWithLoss" 165 | bottom: "ip2" 166 | bottom: "label" 167 | top: "loss" 168 | } 169 | -------------------------------------------------------------------------------- /recipes/Horovod/Horovod-Infiniband-Benchmark/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/recipes/Readme.md) to install Azure CLI 2.0, configure the default location, and create and configure the default resource group and storage account. 2 | 3 | 4 | ### Script Deployment 5 | 6 | - Create an Azure File Share with `horovod_samples` folder: 7 | ```sh 8 | az storage share create --name batchaisample 9 | az storage directory create --share-name batchaisample --name horovod_samples 10 | ``` 11 | Upload the job preparation script, which does the following tasks: 12 | - Install essential packages for InfiniBand support 13 | - Download benchmark scripts from https://github.com/alsrgv/benchmarks 14 | - Install the Intel MPI binary 15 | - Install the Horovod framework 16 | ```sh 17 | az storage file upload --share-name batchaisample --source jobprep_benchmark.sh --path horovod_samples 18 | ``` 19 | 20 | ### Cluster 21 | 22 | By default, for this recipe we will use a GPU cluster with two nodes (`min node = max node = 2`) of `Standard_NC24r` size (four GPUs with InfiniBand) 23 | with the latest Ubuntu 16.04-LTS image. 24 | 25 | Azure File share `batchaisample` is mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`.
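Since each `Standard_NC24r` node consumes 24 cores, it is worth confirming that your subscription has enough regional vCPU quota before creating the cluster. A quick check (a sketch using the generic `az vm list-usage` command; replace `eastus` with your configured default location):

```sh
# Show current core usage and limits for the region
az vm list-usage -l eastus -o table
```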
26 | 27 | #### Cluster Creation Command 28 | 29 | For GNU/Linux users: 30 | 31 | ```sh 32 | az batchai cluster create -n nc24r -s Standard_NC24r --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 33 | ``` 34 | 35 | For Windows users: 36 | 37 | ```sh 38 | az batchai cluster create -n nc24r -s Standard_NC24r --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u -p 39 | ``` 40 | 41 | ### Job 42 | 43 | The job creation parameters are in [job.json](./job.json): 44 | 45 | - An input directory with ID `SCRIPTS` to allow the job to find the job preparation script via environment variable `$AZ_BATCHAI_INPUT_SCRIPTS`; 46 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 47 | - nodeCount defines how many nodes will be used for the job execution; 48 | - The ```tensorflow/tensorflow:1.4.0-gpu``` standard tensorflow container will be used; 49 | - The ```Horovod``` framework, Intel MPI and the Horovod benchmark scripts will be downloaded/installed by the job preparation script; 50 | You can build and publish your own docker image containing tensorflow and Horovod instead; 51 | - The benchmark script (```tf_cnn_benchmarks.py```) will be executed with the custom toolkit; 52 | - If you are interested in using TCP instead, please replace ```-env I_MPI_FABRICS=dapl -env I_MPI_DAPL_PROVIDER=ofa-v2-ib0 -env I_MPI_DYNAMIC_CONNECTION=0``` with ```-env I_MPI_FABRICS=tcp``` in the command line. 53 | 54 | #### Job Creation Command 55 | 56 | ```sh 57 | az batchai job create -n horovod_benchmark --cluster-name nc24r -c job.json 58 | ``` 59 | 60 | Note, the job will start running when the cluster has finished allocation and initialization of the nodes. 61 | 62 | ### Get Help 63 | 64 | The Azure CLI has built-in help documentation, which you can run from the command line: 65 | 66 | ```sh 67 | az [command-group [command]] -h 68 | ``` 69 | 70 | For example, to get information about all Azure Batch AI categories, use: 71 | 72 | ```sh 73 | az batchai -h 74 | ``` 75 | 76 | To get help with the command to create a cluster, use: 77 | 78 | ```sh 79 | az batchai cluster create -h 80 | ``` 81 | 82 | You can use [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) as an end-to-end example of CLI usage. 83 | -------------------------------------------------------------------------------- /recipes/TensorFlow/TensorFlow-GPU/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI.
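If the CLI is already installed, configuring it typically amounts to signing in and selecting the subscription that Batch AI resources should be created under (a sketch; the subscription name below is a placeholder):

```sh
# Sign in interactively
az login

# Select the subscription to use (replace with your subscription name or id)
az account set -s "<my subscription>"
```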
2 | 3 | ### Create a Resource Group 4 | 5 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 6 | 7 | ```sh 8 | az group create -n batchaitests -l eastus 9 | ``` 10 | 11 | ### Create a Storage Account 12 | 13 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 14 | 15 | ```sh 16 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 17 | ``` 18 | 19 | 20 | ### Data Deployment 21 | 22 | - Download the convolutional.py sample script into the current folder: 23 | 24 | For GNU/Linux users: 25 | 26 | ```sh 27 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/TensorFlow/TensorFlow-GPU/convolutional.py?token=AcZzrZcCveHaaevWYBtN9wYREYDOJvY-ks5Z4c4QwA%3D%3D" -O convolutional.py 28 | ``` 29 | 30 | - Create an Azure File Share with `tensorflow_samples` folder and upload convolutional.py into it: 31 | 32 | ```sh 33 | az storage share create --name batchaisample --account-name 34 | az storage directory create --share-name batchaisample --name tensorflow_samples 35 | az storage file upload --share-name batchaisample --source convolutional.py --path tensorflow_samples 36 | ``` 37 | 38 | ### Cluster 39 | 40 | For this recipe we need a one-node GPU cluster (`min node = max node = 1`) of `Standard_NC6` size (one GPU) with standard Ubuntu LTS (`UbuntuLTS`) or Ubuntu DSVM (```UbuntuDSVM```) image and Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`. 41 | 42 | #### Cluster Creation Command 43 | 44 | For GNU/Linux users: 45 | 46 | ```sh 47 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 1 --max 1 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 48 | ``` 49 | 50 | For Windows users: 51 | 52 | ```sh 53 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 1 --max 1 --afs-name batchaisample --afs-mount-path external -u -p 54 | ``` 55 | 56 | ### Job 57 | 58 | The job creation parameters are in [job.json](./job.json): 59 | 60 | - An input directory with ID `SCRIPT` to allow the job to find the sample script via environment variable `$AZ_BATCHAI_INPUT_SCRIPT`; 61 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 62 | - nodeCount defines how many nodes will be used for the job execution; 63 | - path and parameters for running convolutional.py; 64 | - ```tensorflow/tensorflow:1.1.0-gpu``` docker image will be used for job execution. 65 | 66 | Note, you can delete the docker image information to run the job directly on DSVM. 67 | 68 | #### Job Creation Command 69 | 70 | ```sh 71 | az batchai job create -l eastus -g batchaitests -n tensorflow -r nc6 -c job.json 72 | ``` 73 | 74 | Note, the job will start running when the cluster has finished allocation and initialization of the node. 75 | 76 | ### Next Steps 77 | 78 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 79 | how to manage your clusters and jobs. 80 | 81 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 82 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results.
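Once the job is submitted, you can check its state from the CLI. A sketch, assuming the standard `az batchai job` listing and show commands in your CLI version (check `az batchai job -h` if the options differ):

```sh
# List Batch AI jobs in the resource group together with their execution state
az batchai job list -g batchaitests -o table

# Show details of this job
az batchai job show -g batchaitests -n tensorflow
```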
-------------------------------------------------------------------------------- /recipes/Chainer/Chainer-GPU-Distributed/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI. 2 | 3 | 4 | ### Create a Resource Group 5 | 6 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 7 | 8 | ```sh 9 | az group create -n batchaitests -l eastus 10 | ``` 11 | 12 | ### Create a Storage Account 13 | 14 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 15 | 16 | ```sh 17 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 18 | ``` 19 | 20 | ### Data Deployment 21 | 22 | - Download the train_mnist.py sample script into the current folder: 23 | 24 | For GNU/Linux users: 25 | 26 | ```sh 27 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/Chainer/Chainer-GPU-Distributed/train_mnist.py?token=AcZzrV-OFepRwpRSB1kyABIX-PLh2ZHqks5Z4eukwA%3D%3D" -O train_mnist.py 28 | ``` 29 | 30 | - Create an Azure File Share with `chainer_samples` folder and upload train_mnist.py into it: 31 | 32 | ```sh 33 | az storage share create --name batchaisample --account-name 34 | az storage directory create --share-name batchaisample --name chainer_samples 35 | az storage file upload --share-name batchaisample --source train_mnist.py --path chainer_samples 36 | ``` 37 | 38 | ### Cluster 39 | 40 | For this recipe we need a two-node GPU cluster (`min node = max node = 2`) of `Standard_NC6` size (one GPU) with standard Ubuntu LTS (`UbuntuLTS`) or Ubuntu DSVM (```UbuntuDSVM```) image and Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`. 41 | 42 | #### Cluster Creation Command 43 | 44 | For GNU/Linux users: 45 | 46 | ```sh 47 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 48 | ``` 49 | 50 | For Windows users: 51 | 52 | ```sh 53 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u -p 54 | ``` 55 | 56 | ### Job 57 | 58 | The job creation parameters are in [job.json](./job.json): 59 | 60 | - An input directory with ID `SCRIPT` to allow the job to find the sample script via environment variable `$AZ_BATCHAI_INPUT_SCRIPT`; 61 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 62 | - An output directory with ID `MODEL` to allow the job to find the output directory via environment variable `$AZ_BATCHAI_OUTPUT_MODEL`; 63 | - nodeCount defines how many nodes will be used for the job execution; 64 | - path and parameters for running train_mnist.py; 65 | - ```batchaitraining/chainermn:openMPI``` docker image will be used for job execution. 66 | 67 | #### Job Creation Command 68 | 69 | ```sh 70 | az batchai job create -l eastus -g batchaitests -n distributed_chainer -r nc6 -c job.json 71 | ``` 72 | 73 | Note, the job will start running when the cluster has finished allocation and initialization of the nodes.
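Because both the stdout/stderr streams and the `MODEL` output directory live on the `batchaisample` file share, you can inspect results with the regular storage commands. A sketch (the folder layout under the share is generated by Batch AI, so list the share first and drill down; `<storage account name>` is a placeholder):

```sh
# Browse the share to locate the job's output folders
az storage file list --share-name batchaisample -o table --account-name <storage account name>
```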
74 | 75 | ### Next Steps 76 | 77 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 78 | how to manage your clusters and jobs. 79 | 80 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 81 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results. 82 | -------------------------------------------------------------------------------- /recipes/Keras/Keras-DSVM/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI. 2 | 3 | ### Create a Resource Group 4 | 5 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 6 | 7 | ```sh 8 | az group create -n batchaitests -l eastus 9 | ``` 10 | 11 | ### Create a Storage Account 12 | 13 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 14 | 15 | ```sh 16 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 17 | ``` 18 | 19 | 20 | ### Data Deployment 21 | 22 | - Download the mnist_cnn.py sample script into the current folder: 23 | 24 | For GNU/Linux users: 25 | 26 | ```sh 27 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/Keras/Keras-DSVM/mnist_cnn.py?token=AcZzrU1mri0vNMxtUKL6GW6hSezGK7qBks5Z4ewWwA%3D%3D" -O mnist_cnn.py 28 | ``` 29 | 30 | - Create an Azure File Share with `keras_samples` folder and upload mnist_cnn.py 31 | into it: 32 | 33 | ```sh 34 | az storage share create --name batchaisample --account-name 35 | az storage directory create --share-name batchaisample --name keras_samples 36 | az storage file upload --share-name batchaisample --source mnist_cnn.py --path keras_samples 37 | ``` 38 | 39 | ### Cluster 40 | 41 | For this recipe we need a one-node GPU cluster (`min node = max node = 1`) of `Standard_NC6` size (one GPU) with Ubuntu DSVM (```UbuntuDSVM```) image and Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`. 42 | 43 | #### Cluster Creation Command 44 | 45 | For GNU/Linux users: 46 | 47 | ```sh 48 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 1 --max 1 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 49 | ``` 50 | 51 | For Windows users: 52 | 53 | ```sh 54 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 1 --max 1 --afs-name batchaisample --afs-mount-path external -u -p 55 | ``` 56 | 57 | ### Job 58 | 59 | The job creation parameters are in [job.json](./job.json): 60 | 61 | - An input directory with ID `SCRIPT` to allow the job to find the sample script via environment variable `$AZ_BATCHAI_INPUT_SCRIPT`; 62 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 63 | - nodeCount defines how many nodes will be used for the job execution; 64 | - Batch AI has no native support for Keras, but it can run it as a custom toolkit; 65 | - Keras in this recipe uses the cntk backend; DSVM supports cntk, tensorflow and theano backends for Keras; just change KERAS_BACKEND to "tensorflow" or "theano" to use the corresponding backend. Note, the theano backend will run on CPU.
66 | - The job will run on the DSVM directly, so no docker image is configured for it. 67 | 68 | 69 | #### Job Creation Command 70 | 71 | ```sh 72 | az batchai job create -l eastus -g batchaitests -n keras -r nc6 -c job.json 73 | ``` 74 | 75 | Note, the job will start running when the cluster has finished allocation and initialization of the node. 76 | 77 | ### Next Steps 78 | 79 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 80 | how to manage your clusters and jobs. 81 | 82 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 83 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results. -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-BrainScript-Distributed/ConvNet_MNIST.cntk: -------------------------------------------------------------------------------- 1 | # ConvNet on MNIST dataset. 2 | 3 | command = trainNetwork:testNetwork 4 | 5 | precision = "float"; traceLevel = 1 ; deviceId = "auto" 6 | 7 | rootDir = "../../.." ; dataDir = "$rootDir$/DataSets/MNIST" ; 8 | outputDir = "./Output" ; 9 | 10 | modelPath = "$outputDir$/Models/ConvNet_MNIST" 11 | #stderr = "$outputDir$/ConvNet_MNIST_bs_out" 12 | 13 | maxEpochs = 40 14 | minibatchSize = 64 15 | autoAdjustMinibatch = false 16 | 17 | # TRAINING CONFIG 18 | trainNetwork = { 19 | action = "train" 20 | 21 | BrainScriptNetworkBuilder = { 22 | imageShape = 28:28:1 # image dimensions, 1 channel only 23 | labelDim = 10 # number of distinct labels 24 | featScale = 1/256 25 | Scale{f} = x => Constant(f) .* x 26 | 27 | model = Sequential ( 28 | Scale {featScale} : 29 | ConvolutionalLayer {32, (5:5), pad = true} : ReLU : 30 | MaxPoolingLayer {(3:3), stride=(2:2)} : 31 | ConvolutionalLayer {48, (3:3), pad = false} : ReLU : 32 | MaxPoolingLayer {(3:3), stride=(2:2)} : 33 | ConvolutionalLayer {64, (3:3), pad = false} : ReLU : 34 | DenseLayer {96} : Dropout : ReLU : 35 | LinearLayer {labelDim} 36 | ) 37 | 38 | # inputs 39 | features = Input {imageShape} 40 | labels = Input {labelDim} 41 | 42 | # apply model to features 43 | ol = model (features) 44 | 45 | # loss and error computation 46 | ce = CrossEntropyWithSoftmax (labels, ol) 47 | errs = ClassificationError (labels, ol) 48 | 49 | # declare special nodes 50 | featureNodes = (features) 51 | labelNodes = (labels) 52 | criterionNodes = (ce) 53 | evaluationNodes = (errs) 54 | outputNodes = (ol) 55 | } 56 | 57 | SGD = { 58 | ParallelTrain = { 59 | parallelizationMethod = DataParallelASGD 60 | distributedMBReading = true 61 | syncPerfStats = 20 62 | DataParallelASGD = [ 63 | syncPeriodPerWorker=256 64 | usePipeline = true 65 | AdjustLearningRateAtBeginning = [ 66 | adjustCoefficient = 0.2 67 | adjustNBMiniBatch = 1024 68 | ] 69 | ] 70 | 71 | } 72 | AutoAdjust = { 73 | autoAdjustMinibatch = $autoAdjustMinibatch$ 74 | minibatchSizeTuningFrequency = 3 75 | } 76 | epochSize = 60000 77 | minibatchSize = $minibatchSize$ 78 | maxEpochs = $maxEpochs$ 79 | learningRatesPerSample = 0.001*10:0.0005*10:0.0001 80 | dropoutRate = 0.5 81 | momentumAsTimeConstant = 0*5:1024 82 | 83 | numMBsToShowResult = 500 84 | } 85 | 86 | reader = { 87 | readerType = "CNTKTextFormatReader" 88 | # See ../README.md for details on getting the data (Train-28x28_cntk_text.txt).
89 | file = "$DataDir$/Train-28x28_cntk_text.txt" 90 | randomize = true 91 | keepDataInMemory = true 92 | input = { 93 | features = { dim = 784 ; format = "dense" } 94 | labels = { dim = 10 ; format = "dense" } 95 | } 96 | } 97 | } 98 | 99 | # TEST CONFIG 100 | testNetwork = { 101 | action = test 102 | minibatchSize = 1024 # reduce this if you run out of memory 103 | 104 | reader = { 105 | readerType = "CNTKTextFormatReader" 106 | file = "$DataDir$/Test-28x28_cntk_text.txt" 107 | input = { 108 | features = { dim = 784 ; format = "dense" } 109 | labels = { dim = 10 ; format = "dense" } 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /recipes/Horovod/Horovod/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI. 2 | 3 | ### Create a Resource Group 4 | 5 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 6 | 7 | ```sh 8 | az group create -n batchaitests -l eastus 9 | ``` 10 | 11 | ### Create a Storage Account 12 | 13 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 14 | 15 | ```sh 16 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 17 | ``` 18 | 19 | ### Data Deployment 20 | 21 | - Download the tensorflow_mnist.py sample script into the current folder: 22 | 23 | For GNU/Linux users: 24 | 25 | ```sh 26 | wget https://raw.githubusercontent.com/uber/horovod/v0.9.10/examples/tensorflow_mnist.py 27 | ``` 28 | 29 | - Create an Azure File Share with `horovod_samples` folder and upload tensorflow_mnist.py into it: 30 | 31 | ```sh 32 | az storage share create --name batchaisample --account-name 33 | az storage directory create --share-name batchaisample --name horovod_samples 34 | az storage file upload --share-name batchaisample --source tensorflow_mnist.py --path horovod_samples 35 | ``` 36 | 37 | ### Cluster 38 | 39 | For this recipe we will use a GPU cluster with two nodes (`min node = max node = 2`) of `Standard_NC6` size (one GPU) 40 | with Ubuntu DSVM (```UbuntuDSVM```) image and Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`. 41 | 42 | #### Cluster Creation Command 43 | 44 | For GNU/Linux users: 45 | 46 | ```sh 47 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 48 | ``` 49 | 50 | For Windows users: 51 | 52 | ```sh 53 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u -p 54 | ``` 55 | 56 | ### Job 57 | 58 | The job creation parameters are in [job.json](./job.json): 59 | 60 | - An input directory with ID `SCRIPTS` to allow the job to find the sample script via environment variable `$AZ_BATCHAI_INPUT_SCRIPTS`; 61 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 62 | - nodeCount defines how many nodes will be used for the job execution; 63 | - ```tensorflow/tensorflow:1.1.0-gpu``` standard tensorflow container will be used and ```Horovod``` will be installed by the job preparation command line.
64 | You can build and publish your own docker image containing tensorflow and Horovod instead; 65 | - The ```tensorflow_mnist.py``` example will be executed with the custom toolkit. 66 | - To run the MPI task we will use the hostfile generated by Batch AI, available via the ```$AZ_BATCHAI_MPI_HOST_FILE``` environment variable. 67 | 68 | Note, you can delete ```containerSettings``` from the job definition to run the same job directly on the host DSVM. 69 | 70 | #### Job Creation Command 71 | 72 | ```sh 73 | az batchai job create -l eastus -g batchaitests --storage-account-name -n horovod -r nc6 -c job.json 74 | ``` 75 | 76 | Note, the job will start running when the cluster has finished allocation and initialization of the nodes. 77 | 78 | ### Next Steps 79 | 80 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 81 | how to manage your clusters and jobs. 82 | 83 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 84 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results. -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-Python-Distributed/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI. 2 | 3 | ### Create a Resource Group 4 | 5 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 6 | 7 | ```sh 8 | az group create -n batchaitests -l eastus 9 | ``` 10 | 11 | ### Create a Storage Account 12 | 13 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 14 | 15 | ```sh 16 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 17 | ``` 18 | 19 | ### Data Deployment 20 | 21 | - Download ConvNet_CIFAR10_DataAug_Distributed.py, ConvNet_CIFAR10_DataAug.py and CIFA-10_data_prepare.sh into the current folder: 22 | 23 | For GNU/Linux users: 24 | 25 | ```sh 26 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/CNTK/CNTK-GPU-Python-Distributed/ConvNet_CIFAR10_DataAug_Distributed.py?token=AcZzrbN1I34RrKn8MPnn5_dfy86I-XEIks5Z4cfswA%3D%3D" -O ConvNet_CIFAR10_DataAug_Distributed.py 27 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/CNTK/CNTK-GPU-Python-Distributed/ConvNet_CIFAR10_DataAug.py?token=AcZzrWAAVqoQXUtPR0JxBF7m4pXbUACzks5Z4cguwA%3D%3D" -O ConvNet_CIFAR10_DataAug.py 28 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/CNTK/CNTK-GPU-Python-Distributed/CIFA-10_data_prepare.sh?token=AcZzrdr1tTQK_Gr7EdVXvg-sUarpWMqnks5Z4chYwA%3D%3D" -O CIFA-10_data_prepare.sh 29 | ``` 30 | 31 | - Create an Azure File Share with `cntk_samples` folder and upload the scripts into it: 32 | 33 | ```sh 34 | az storage share create --name batchaisample --account-name 35 | az storage directory create --share-name batchaisample --name cntk_samples 36 | az storage file upload --share-name batchaisample --source ConvNet_CIFAR10_DataAug_Distributed.py --path cntk_samples 37 | az storage file upload --share-name batchaisample --source ConvNet_CIFAR10_DataAug.py --path cntk_samples 38 | az storage file upload --share-name batchaisample --source CIFA-10_data_prepare.sh --path cntk_samples 39 | ``` 40 | 41 | ### Cluster 42 | 43 |
For this recipe we need a two-node GPU cluster (`min node = max node = 2`) of `Standard_NC6` size (one GPU) with standard Ubuntu LTS (`UbuntuLTS`) or Ubuntu DSVM (```UbuntuDSVM```) image and Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`. 44 | 45 | #### Cluster Creation Command 46 | 47 | For GNU/Linux users: 48 | 49 | ```sh 50 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 51 | ``` 52 | 53 | For Windows users: 54 | 55 | ```sh 56 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u -p 57 | ``` 58 | 59 | ### Job 60 | 61 | The job creation parameters are in [job.json](./job.json): 62 | 63 | - An input directory with ID `SCRIPT` to allow the job to find the sample scripts via environment variable `$AZ_BATCHAI_INPUT_SCRIPT`; 64 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 65 | - An output directory with ID `MODEL` to allow the job to find the output directory for the model via `$AZ_BATCHAI_OUTPUT_MODEL` environment variable; 66 | - node_count defines how many nodes will be used for the job execution; 67 | - The job preparation task will execute the CIFA-10_data_prepare.sh script to download and preprocess the CIFAR-10 dataset on the local SSD (at $AZ_BATCHAI_JOB_TEMP); 68 | - path and parameters for running ConvNet_CIFAR10_DataAug_Distributed.py; 69 | - ```microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0``` docker image will be used for job execution. 70 | 71 | Note, you can delete the docker image information to run the job directly on DSVM. 72 | 73 | #### Job Creation Command 74 | 75 | ```sh 76 | az batchai job create -l eastus -g batchaitests -n distributed_cntk_python -r nc6 -c job.json 77 | ``` 78 | 79 | Note, the job will start running when the cluster has finished allocation and initialization of the nodes. 80 | 81 | ### Next Steps 82 | 83 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 84 | how to manage your clusters and jobs. 85 | 86 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 87 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results. -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-BrainScript/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI.
2 | 3 | ### Create a Resource Group 4 | 5 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 6 | 7 | ```sh 8 | az group create -n batchaitests -l eastus 9 | ``` 10 | 11 | ### Create a Storage Account 12 | 13 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 14 | 15 | ```sh 16 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 17 | ``` 18 | 19 | ### Data Deployment 20 | 21 | - Download and extract preprocessed MNIST Database from this [location](https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D) into the current folder. 22 | 23 | For GNU/Linux users: 24 | 25 | ```sh 26 | wget "https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D" -O mnist_dataset.zip 27 | unzip mnist_dataset.zip 28 | ``` 29 | 30 | - Download the ConvNet_MNIST.cntk config file into the current folder: 31 | 32 | For GNU/Linux users: 33 | 34 | ```sh 35 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/CNTK/CNTK-GPU-BrainScript/ConvNet_MNIST.cntk?token=AcZzrfNpH_TV0LwzeHO_iGt4Kuh04on8ks5Z4bFrwA%3D%3D" -O ConvNet_MNIST.cntk 36 | ``` 37 | 38 | - Create an Azure File Share with `mnist_database` and `cntk_samples` folders and upload the MNIST database and the BrainScript ConvNet_MNIST.cntk config file: 39 | 40 | ```sh 41 | az storage share create --name batchaisample --account-name 42 | az storage directory create --share-name batchaisample --name mnist_database 43 | az storage file upload --share-name batchaisample --source Train-28x28_cntk_text.txt --path mnist_database 44 | az storage file upload --share-name batchaisample --source Test-28x28_cntk_text.txt --path mnist_database 45 | az storage directory create --share-name batchaisample --name cntk_samples 46 | az storage file upload --share-name batchaisample --source ConvNet_MNIST.cntk --path cntk_samples 47 | ``` 48 | 49 | ### Cluster 50 | 51 | For this recipe we need a one-node GPU cluster (`min node = max node = 1`) of `Standard_NC6` size (one GPU) with standard Ubuntu LTS (`UbuntuLTS`) or Ubuntu DSVM (```UbuntuDSVM```) image and Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`.
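The GNU/Linux creation command below assumes an SSH public key at `~/.ssh/id_rsa.pub`. If you do not have one yet, you can generate a key pair first (a sketch using the standard OpenSSH tool):

```sh
# Generate an RSA key pair at the default location without a passphrase
ssh-keygen -t rsa -b 2048 -f ~/.ssh/id_rsa -N ""
```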
52 | 53 | #### Cluster Creation Command 54 | 55 | For GNU/Linux users: 56 | 57 | ```sh 58 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 1 --max 1 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 59 | ``` 60 | 61 | For Windows users: 62 | 63 | ```sh 64 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 1 --max 1 --afs-name batchaisample --afs-mount-path external -u -p 65 | ``` 66 | 67 | ### Job 68 | 69 | The job creation parameters are in [job.json](./job.json): 70 | 71 | - Two input directories with IDs `CONFIG` and `DATASET` to allow the job to find the sample config and MNIST Database via environment variables `$AZ_BATCHAI_INPUT_CONFIG` and `$AZ_BATCHAI_INPUT_DATASET`; 72 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 73 | - An output directory with ID `MODEL` to allow the job to find the output directory for the model via `$AZ_BATCHAI_OUTPUT_MODEL` environment variable; 74 | - node_count defines how many nodes will be used for the job execution; 75 | - ```microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0``` docker image will be used for job execution. 76 | 77 | Note, you can remove the docker image information to run the job directly on DSVM. 78 | 79 | #### Job Creation Command 80 | 81 | ```sh 82 | az batchai job create -l eastus -g batchaitests -n cntk -r nc6 -c job.json 83 | ``` 84 | 85 | Note, the job will start running when the cluster has finished allocation and initialization of the node. 86 | 87 | ### Next Steps 88 | 89 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 90 | how to manage your clusters and jobs. 91 | 92 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 93 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results. -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-Python/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI. 2 | 3 | ### Create a Resource Group 4 | 5 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 6 | 7 | ```sh 8 | az group create -n batchaitests -l eastus 9 | ``` 10 | 11 | ### Create a Storage Account 12 | 13 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 14 | 15 | ```sh 16 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 17 | ``` 18 | 19 | ### Data Deployment 20 | 21 | - Download and extract preprocessed MNIST Database from this [location](https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D) into the current folder.
22 | 23 | For GNU/Linux users: 24 | 25 | ```sh 26 | wget "https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D" -O mnist_dataset.zip 27 | unzip mnist_dataset.zip 28 | ``` 29 | 30 | - Download the ConvNet_MNIST.py example script into the current folder: 31 | 32 | For GNU/Linux users: 33 | 34 | ```sh 35 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/CNTK/CNTK-GPU-Python/ConvNet_MNIST.py?token=AcZzrejaokHC2Nj5ehsoMFe4t3LqFcThks5Z4bmEwA%3D%3D" -O ConvNet_MNIST.py 36 | ``` 37 | 38 | - Create an Azure File Share with `mnist_database` and `cntk_samples` folders and upload the MNIST database and the ConvNet_MNIST.py script: 39 | 40 | ```sh 41 | az storage share create --name batchaisample --account-name 42 | az storage directory create --share-name batchaisample --name mnist_database 43 | az storage file upload --share-name batchaisample --source Train-28x28_cntk_text.txt --path mnist_database 44 | az storage file upload --share-name batchaisample --source Test-28x28_cntk_text.txt --path mnist_database 45 | az storage directory create --share-name batchaisample --name cntk_samples 46 | az storage file upload --share-name batchaisample --source ConvNet_MNIST.py --path cntk_samples 47 | ``` 48 | 49 | ### Cluster 50 | 51 | For this recipe we need a one-node GPU cluster (`min node = max node = 1`) of `Standard_NC6` size (one GPU) with standard Ubuntu LTS (`UbuntuLTS`) or Ubuntu DSVM (```UbuntuDSVM```) image and Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`. 52 | 53 | #### Cluster Creation Command 54 | 55 | For GNU/Linux users: 56 | 57 | ```sh 58 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 1 --max 1 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 59 | ``` 60 | 61 | For Windows users: 62 | 63 | ```sh 64 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 1 --max 1 --afs-name batchaisample --afs-mount-path external -u -p 65 | ``` 66 | 67 | ### Job 68 | 69 | The job creation parameters are in [job.json](./job.json): 70 | 71 | - Two input directories with IDs `SCRIPT` and `DATASET` to allow the job to find the sample script and MNIST Database via environment variables `$AZ_BATCHAI_INPUT_SCRIPT` and `$AZ_BATCHAI_INPUT_DATASET`; 72 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 73 | - An output directory with ID `MODEL` to allow the job to find the output directory for the model via `$AZ_BATCHAI_OUTPUT_MODEL` environment variable; 74 | - node_count defines how many nodes will be used for the job execution; 75 | - path and parameters for running ConvNet_MNIST.py; 76 | - ```microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0``` docker image will be used for job execution. 77 | 78 | Note, you can remove the docker image information to run the job directly on DSVM. 79 | 80 | #### Job Creation Command 81 | 82 | ```sh 83 | az batchai job create -l eastus -g batchaitests -n cntk_python -r nc6 -c job.json 84 | ``` 85 | 86 | Note, the job will start running when the cluster has finished allocation and initialization of the node.
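The trained model is written to the `MODEL` output directory, which maps to a folder on the `batchaisample` share. A sketch for retrieving results once the job completes (the path under the share is generated by Batch AI, so list first to find it; `<storage account name>` and `<path to file>` are placeholders):

```sh
# Locate the generated output folders, then download a model file
az storage file list --share-name batchaisample -o table --account-name <storage account name>
az storage file download --share-name batchaisample --path "<path to file>" --account-name <storage account name>
```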
87 | 88 | ### Next Steps 89 | 90 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 91 | how to manage your clusters and jobs. 92 | 93 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 94 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results. -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-Python-Distrbuted-Infiniband/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/recipes/Readme.md) to install Azure CLI 2.0, configure the default location, and create and configure the default resource group and storage account. 2 | 3 | 4 | ### Data Deployment 5 | 6 | - Download resnet_models.py, TrainResNet_CIFAR10_Distributed.py and TrainResNet_CIFAR10.py into the current folder: 7 | 8 | For GNU/Linux users: 9 | 10 | ```sh 11 | wget "https://raw.githubusercontent.com/Microsoft/CNTK/v2.3/Examples/Image/Classification/ResNet/Python/resnet_models.py" -O resnet_models.py 12 | wget "https://raw.githubusercontent.com/Microsoft/CNTK/v2.3/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10_Distributed.py" -O TrainResNet_CIFAR10_Distributed.py 13 | wget "https://raw.githubusercontent.com/Microsoft/CNTK/v2.3/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10.py" -O TrainResNet_CIFAR10.py 14 | ``` 15 | 16 | Create an Azure File Share with `cntk_samples` folder and upload the scripts into it: 17 | 18 | ```sh 19 | az storage share create --name batchaisample 20 | az storage directory create --share-name batchaisample --name cntk_samples 21 | az storage file upload --share-name batchaisample --source TrainResNet_CIFAR10_Distributed.py --path cntk_samples 22 | az storage file upload --share-name batchaisample --source TrainResNet_CIFAR10.py --path cntk_samples 23 | az storage file upload --share-name batchaisample --source resnet_models.py --path cntk_samples 24 | ``` 25 | 26 | Upload the job preparation script, which does the following tasks: 27 | - Download the CIFAR-10 data set on all GPU nodes (under the ```$AZ_BATCHAI_JOB_TEMP``` directory) 28 | - Install the Intel MPI binary 29 | 30 | ```sh 31 | az storage file upload --share-name batchaisample --source jobprep_cntk_distributed_ib.sh --path cntk_samples 32 | ``` 33 | 34 | 35 | ### Cluster 36 | 37 | By default, for this recipe we will use a GPU cluster with two nodes (`min node = max node = 2`) of `Standard_NC24r` size (four GPUs with InfiniBand) 38 | with the latest Ubuntu 16.04-LTS image. 39 | 40 | Azure File share `batchaisample` is mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`.
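`Standard_NC24r` is only offered in a subset of regions, so you can confirm the size is available in your configured default location before creating the cluster (a sketch using the generic `az vm list-sizes` command; replace `eastus` with your location):

```sh
# List VM sizes available in the region and filter for NC24r
az vm list-sizes -l eastus -o table | grep NC24r
```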
41 | 42 | #### Cluster Creation Command 43 | 44 | For GNU/Linux users: 45 | 46 | ```sh 47 | az batchai cluster create -n nc24r -s Standard_NC24r --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 48 | ``` 49 | 50 | For Windows users: 51 | 52 | ```sh 53 | az batchai cluster create -n nc24r -s Standard_NC24r --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u -p 54 | ``` 55 | 56 | ### Job 57 | 58 | The job creation parameters are in [job.json](./job.json): 59 | 60 | - The job will use the `batchaitraining/cntk:2.3-gpu-1bitsgd-py36-cuda8-cudnn6-intelmpi` container that is built based on [dockerfile](./dockerfile); 61 | - Will use a job preparation task to execute the job preparation script (jobprep_cntk_distributed_ib.sh). The CIFAR-10 dataset will be downloaded and processed locally on the compute nodes (under the ```$AZ_BATCHAI_JOB_TEMP``` directory); 62 | - Will use the previously configured input and output directories; 63 | - Will run TrainResNet_CIFAR10_Distributed.py providing the CIFAR-10 dataset path as the first parameter and the desired model output path as the second. 64 | - Will set ```processCount``` to 8, so that all 8 GPUs from 2 NC24r nodes will be used; 65 | - An input directory with ID `SCRIPT` to allow the job to find the sample scripts via environment variable `$AZ_BATCHAI_INPUT_SCRIPT`; 66 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 67 | - An output directory with ID `MODEL` to allow the job to find the output directory for the model via `$AZ_BATCHAI_OUTPUT_MODEL` environment variable; 68 | - For illustration purposes, we will train a ResNet-110 and only run 5 epochs 69 | 70 | 71 | #### Job Creation Command 72 | 73 | ```sh 74 | az batchai job create -n distributed_cntk_ib --cluster-name nc24r -c job.json 75 | ``` 76 | 77 | Note, the job will start running when the cluster has finished allocation and initialization of the nodes. 78 | 79 | ### Get Help 80 | 81 | The Azure CLI has built-in help documentation, which you can run from the command line: 82 | 83 | ```sh 84 | az [command-group [command]] -h 85 | ``` 86 | 87 | For example, to get information about all Azure Batch AI categories, use: 88 | 89 | ```sh 90 | az batchai -h 91 | ``` 92 | 93 | To get help with the command to create a cluster, use: 94 | 95 | ```sh 96 | az batchai cluster create -h 97 | ``` 98 | 99 | You can use [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) as an end-to-end example of CLI usage. 100 | -------------------------------------------------------------------------------- /recipes/TensorFlow/TensorFlow-GPU-Distributed/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI.
2 | 3 | ### Create a Resource Group 4 | 5 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 6 | 7 | ```sh 8 | az group create -n batchaitests -l eastus 9 | ``` 10 | 11 | ### Create a Storage Account 12 | 13 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 14 | 15 | ```sh 16 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 17 | ``` 18 | 19 | ### Data Deployment 20 | 21 | - Download and extract preprocessed MNIST database: 22 | 23 | For GNU/Linux users: 24 | 25 | ```sh 26 | wget "https://batchaisamples.blob.core.windows.net/samples/mnist_dataset_original.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=b&sig=Qc1RA3zsXIP4oeioXutkL1PXIrHJO0pHJlppS2rID3I%3D" -O mnist_dataset_original.zip 27 | unzip mnist_dataset_original.zip 28 | ``` 29 | 30 | - Download the mnist_replica.py sample script into the current folder: 31 | 32 | For GNU/Linux users: 33 | 34 | ```sh 35 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/TensorFlow/TensorFlow-GPU-Distributed/mnist_replica.py?token=AcZzrcpJGDHCUzsCyjlWiKVNfBuDdkqwks5Z4dPrwA%3D%3D" -O mnist_replica.py 36 | ``` 37 | 38 | - Create an Azure File Share with `mnist_dataset` and `tensorflow_samples` folders and upload the MNIST database and mnist_replica.py into them: 39 | 40 | ```sh 41 | az storage share create --name batchaisample --account-name 42 | az storage directory create --share-name batchaisample --name mnist_dataset 43 | az storage file upload --share-name batchaisample --source t10k-images-idx3-ubyte.gz --path mnist_dataset 44 | az storage file upload --share-name batchaisample --source t10k-labels-idx1-ubyte.gz --path mnist_dataset 45 | az storage file upload --share-name batchaisample --source train-images-idx3-ubyte.gz --path mnist_dataset 46 | az storage file upload --share-name batchaisample --source train-labels-idx1-ubyte.gz --path mnist_dataset 47 | az storage directory create --share-name batchaisample --name tensorflow_samples 48 | az storage file upload --share-name batchaisample --source mnist_replica.py --path tensorflow_samples 49 | ``` 50 | 51 | ### Cluster 52 | 53 | For this recipe we need a two-node GPU cluster (`min node = max node = 2`) of `Standard_NC6` size (one GPU) with standard Ubuntu LTS (`UbuntuLTS`) or Ubuntu DSVM (```UbuntuDSVM```) image and Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`.
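Before creating the cluster, you can verify that the dataset and script uploads above succeeded (a sketch; add `--account-name` as in the commands above if you have not configured a default storage account):

```sh
# Confirm the MNIST files and mnist_replica.py landed in the right folders
az storage file list --share-name batchaisample --path mnist_dataset -o table
az storage file list --share-name batchaisample --path tensorflow_samples -o table
```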
54 | 55 | #### Cluster Creation Command 56 | 57 | For GNU/Linux users: 58 | 59 | ```sh 60 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub 61 | ``` 62 | 63 | For Windows users: 64 | 65 | ```sh 66 | az batchai cluster create -l eastus -g batchaitests --storage-account-name -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u -p 67 | ``` 68 | 69 | ### Job 70 | 71 | The job creation parameters are in [job.json](./job.json): 72 | 73 | - Two input directories with IDs `SCRIPT` and `DATASET` to allow the job to find the sample script and MNIST Database via environment variables `$AZ_BATCHAI_INPUT_SCRIPT` and `$AZ_BATCHAI_INPUT_DATASET`; 74 | - stdOutErrPathPrefix specifies that the job should use file share for standard output and error streams; 75 | - An output directory with ID `MODEL` to allow the job to find the output directory for the model via `$AZ_BATCHAI_OUTPUT_MODEL` environment variable; 76 | - nodeCount defines how many nodes will be used for the job execution; 77 | - path to mnist_replica.py and parameters for master, workers and parameter server; 78 | - ```tensorflow/tensorflow:1.1.0-gpu``` docker image will be used for job execution. 79 | 80 | Note, you can delete the docker image information to run the job directly on DSVM. 81 | 82 | #### Job Creation Command 83 | 84 | ```sh 85 | az batchai job create -l eastus -g batchaitests -n distributed_tensorflow -r nc6 -c job.json 86 | ``` 87 | 88 | Note, the job will start running when the cluster has finished allocation and initialization of the nodes. 89 | 90 | ### Next Steps 91 | 92 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 93 | how to manage your clusters and jobs. 94 | 95 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 96 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results. -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-BrainScript-Distributed/cli-instructions.md: -------------------------------------------------------------------------------- 1 | Please follow [instructions](/documentation/using-azure-cli-20.md) to install Azure CLI 2.0 and configure it for use with Batch AI. 2 | 3 | ### Create a Resource Group 4 | 5 | Create a resource group ```batchaitests``` (or choose your own resource name) which will be used for resource creation: 6 | 7 | ```sh 8 | az group create -n batchaitests -l eastus 9 | ``` 10 | 11 | ### Create a Storage Account 12 | 13 | Create a storage account with a unique name in the same region where you are going to use Batch AI: 14 | 15 | ```sh 16 | az storage account create -n --sku Standard_LRS -l eastus -g batchaitests 17 | ``` 18 | 19 | ### Data Deployment 20 | 21 | - Download and extract preprocessed MNIST Database from this [location](https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D) into the current folder.
22 | 
23 | For GNU/Linux users:
24 | 
25 | ```sh
26 | wget "https://batchaisamples.blob.core.windows.net/samples/mnist_dataset.zip?st=2017-09-29T18%3A29%3A00Z&se=2099-12-31T08%3A00%3A00Z&sp=rl&sv=2016-05-31&sr=c&sig=PmhL%2BYnYAyNTZr1DM2JySvrI12e%2F4wZNIwCtf7TRI%2BM%3D" -O mnist_dataset.zip
27 | unzip mnist_dataset.zip
28 | ```
29 | 
30 | - Download the ConvNet_MNIST.cntk config file into the current folder as DistributedConvNet_MNIST.cntk:
31 | 
32 | For GNU/Linux users:
33 | 
34 | ```sh
35 | wget "https://raw.githubusercontent.com/Azure/BatchAI/master/recipes/CNTK/CNTK-GPU-BrainScript-Distributed/ConvNet_MNIST.cntk?token=AcZzrWPVqDDfb6ig-y98_6af-Fj3R9piks5Z4b7rwA%3D%3D" -O DistributedConvNet_MNIST.cntk
36 | ```
37 | 
38 | - Create an Azure File Share with `mnist_database` and `cntk_samples` folders and upload the MNIST database and the BrainScript DistributedConvNet_MNIST.cntk config file:
39 | 
40 | ```sh
41 | az storage share create --name batchaisample --account-name <storage account name>
42 | az storage directory create --share-name batchaisample --name mnist_database
43 | az storage file upload --share-name batchaisample --source Train-28x28_cntk_text.txt --path mnist_database
44 | az storage file upload --share-name batchaisample --source Test-28x28_cntk_text.txt --path mnist_database
45 | az storage directory create --share-name batchaisample --name cntk_samples
46 | az storage file upload --share-name batchaisample --source DistributedConvNet_MNIST.cntk --path cntk_samples
47 | ```
48 | 
49 | ### Cluster
50 | 
51 | For this recipe we need a two-node GPU cluster (`min node = max node = 2`) of `Standard_NC6` size (one GPU per node) with a standard Ubuntu LTS (`UbuntuLTS`) or Ubuntu DSVM (`UbuntuDSVM`) image and the Azure File share `batchaisample` mounted at `$AZ_BATCHAI_MOUNT_ROOT/external`.
52 | 
53 | #### Cluster Creation Command
54 | 
55 | For GNU/Linux users:
56 | 
57 | ```sh
58 | az batchai cluster create -l eastus -g batchaitests --storage-account-name <storage account name> -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u $USER -k ~/.ssh/id_rsa.pub
59 | ```
60 | 
61 | For Windows users:
62 | 
63 | ```sh
64 | az batchai cluster create -l eastus -g batchaitests --storage-account-name <storage account name> -n nc6 -i UbuntuDSVM -s Standard_NC6 --min 2 --max 2 --afs-name batchaisample --afs-mount-path external -u <admin user name> -p <admin user password>
65 | ```
66 | 
67 | ### Job
68 | 
69 | The job creation parameters are in [job.json](./job.json):
70 | 
71 | - Two input directories with IDs `CONFIG` and `DATASET` to allow the job to find the sample config and MNIST database via the environment variables `$AZ_BATCHAI_INPUT_CONFIG` and `$AZ_BATCHAI_INPUT_DATASET`;
72 | - `stdOutErrPathPrefix` specifies that the job should use the file share for standard output and error streams;
73 | - An output directory with ID `MODEL` to allow the job to find the output directory for the model via the `$AZ_BATCHAI_OUTPUT_MODEL` environment variable;
74 | - `nodeCount` defining how many nodes will be used for the job execution;
75 | - the path and parameters for running DistributedConvNet_MNIST.cntk;
76 | - the `microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0` docker image will be used for job execution.
77 | 
78 | Note, you can remove the docker image information to run the job directly on the DSVM.
79 | 
80 | #### Job Creation Command
81 | 
82 | ```sh
83 | az batchai job create -l eastus -g batchaitests -n distributed_cntk -r nc6 -c job.json
84 | ```
85 | 
86 | Note, the job will start running once the cluster has finished allocation and initialization of the nodes.
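If you would like to follow the job from Python instead of the CLI, the repo's [utilities.py](/recipes/utilities.py) exposes `wait_for_job_completion`, which polls the cluster and job and tails an output file. A minimal sketch, assuming a filled-in configuration.json, the names used in this recipe, and that the output directory id Batch AI assigns to `stdOutErrPathPrefix` output is `stdouterr` (verify this against your Batch AI version):

```python
# Polls the 'distributed_cntk' job on the 'nc6' cluster and tails stdout.txt.
# 'stdouterr' as the output directory id for stdOutErrPathPrefix output is an
# assumption here; adjust if your Batch AI version names it differently.
from utilities import Configuration, create_batchai_client, wait_for_job_completion

cfg = Configuration('configuration.json')
client = create_batchai_client(cfg)
wait_for_job_completion(client, 'batchaitests', 'distributed_cntk', 'nc6',
                        output_directory_id='stdouterr', file_name='stdout.txt')
```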
87 | 88 | ### Next Steps 89 | 90 | Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) contains detailed information on 91 | how to manage your clusters and jobs. 92 | 93 | [CLI Quickstart](https://docs.microsoft.com/en-us/azure/batch-ai/quickstart-cli) contains an end-to-end example of using 94 | Azure CLI 2.0 for Batch AI cluster creation, job submission and checking job's execution results. -------------------------------------------------------------------------------- /recipes/Chainer/Chainer-GPU-Distributed/train_mnist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | 4 | import argparse 5 | 6 | import chainer 7 | import chainer.functions as F 8 | import chainer.links as L 9 | from chainer import training 10 | from chainer.training import extensions 11 | from mpi4py import MPI 12 | 13 | import chainermn 14 | 15 | 16 | class MLP(chainer.Chain): 17 | 18 | def __init__(self, n_units, n_out): 19 | super(MLP, self).__init__( 20 | # the size of the inputs to each layer will be inferred 21 | l1=L.Linear(784, n_units), # n_in -> n_units 22 | l2=L.Linear(n_units, n_units), # n_units -> n_units 23 | l3=L.Linear(n_units, n_out), # n_units -> n_out 24 | ) 25 | 26 | def __call__(self, x): 27 | h1 = F.relu(self.l1(x)) 28 | h2 = F.relu(self.l2(h1)) 29 | return self.l3(h2) 30 | 31 | 32 | def main(): 33 | parser = argparse.ArgumentParser(description='ChainerMN example: MNIST') 34 | parser.add_argument('--batchsize', '-b', type=int, default=100, 35 | help='Number of images in each mini-batch') 36 | parser.add_argument('--communicator', type=str, 37 | default='hierarchical', help='Type of communicator') 38 | parser.add_argument('--epoch', '-e', type=int, default=20, 39 | help='Number of sweeps over the dataset to train') 40 | parser.add_argument('--gpu', '-g', action='store_true', 41 | help='Use GPU') 42 | parser.add_argument('--out', '-o', default='result', 43 | help='Directory to output the result') 44 | parser.add_argument('--resume', '-r', default='', 45 | help='Resume the training from snapshot') 46 | parser.add_argument('--unit', '-u', type=int, default=1000, 47 | help='Number of units') 48 | args = parser.parse_args() 49 | 50 | # Prepare ChainerMN communicator. 51 | 52 | if args.gpu: 53 | if args.communicator == 'naive': 54 | print("Error: 'naive' communicator does not support GPU.\n") 55 | exit(-1) 56 | comm = chainermn.create_communicator(args.communicator) 57 | device = comm.intra_rank 58 | else: 59 | if args.communicator != 'naive': 60 | print('Warning: using naive communicator ' 61 | 'because only naive supports CPU-only execution') 62 | comm = chainermn.create_communicator('naive') 63 | device = -1 64 | 65 | if comm.mpi_comm.rank == 0: 66 | print('==========================================') 67 | print('Num process (COMM_WORLD): {}'.format(MPI.COMM_WORLD.Get_size())) 68 | if args.gpu: 69 | print('Using GPUs') 70 | print('Using {} communicator'.format(args.communicator)) 71 | print('Num unit: {}'.format(args.unit)) 72 | print('Num Minibatch-size: {}'.format(args.batchsize)) 73 | print('Num epoch: {}'.format(args.epoch)) 74 | print('==========================================') 75 | 76 | model = L.Classifier(MLP(args.unit, 10)) 77 | if device >= 0: 78 | chainer.cuda.get_device(device).use() 79 | model.to_gpu() 80 | 81 | # Create a multi node optimizer from a standard Chainer optimizer. 
82 | optimizer = chainermn.create_multi_node_optimizer( 83 | chainer.optimizers.Adam(), comm) 84 | optimizer.setup(model) 85 | 86 | # Split and distribute the dataset. Only worker 0 loads the whole dataset. 87 | # Datasets of worker 0 are evenly split and distributed to all workers. 88 | if comm.rank == 0: 89 | train, test = chainer.datasets.get_mnist() 90 | else: 91 | train, test = None, None 92 | train = chainermn.scatter_dataset(train, comm, shuffle=True) 93 | test = chainermn.scatter_dataset(test, comm, shuffle=True) 94 | 95 | train_iter = chainer.iterators.SerialIterator(train, args.batchsize) 96 | test_iter = chainer.iterators.SerialIterator(test, args.batchsize, 97 | repeat=False, shuffle=False) 98 | 99 | updater = training.StandardUpdater(train_iter, optimizer, device=device) 100 | trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out) 101 | 102 | # Create a multi node evaluator from a standard Chainer evaluator. 103 | evaluator = extensions.Evaluator(test_iter, model, device=device) 104 | evaluator = chainermn.create_multi_node_evaluator(evaluator, comm) 105 | trainer.extend(evaluator) 106 | 107 | # Some display and output extensions are necessary only for one worker. 108 | # (Otherwise, there would just be repeated outputs.) 109 | if comm.rank == 0: 110 | trainer.extend(extensions.dump_graph('main/loss')) 111 | trainer.extend(extensions.LogReport()) 112 | trainer.extend(extensions.PrintReport( 113 | ['epoch', 'main/loss', 'validation/main/loss', 114 | 'main/accuracy', 'validation/main/accuracy', 'elapsed_time'])) 115 | trainer.extend(extensions.ProgressBar()) 116 | 117 | if args.resume: 118 | chainer.serializers.load_npz(args.resume, trainer) 119 | 120 | trainer.run() 121 | 122 | 123 | if __name__ == '__main__': 124 | main() 125 | -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-Python/ConvNet_MNIST.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | 3 | # Licensed under the MIT license. See LICENSE.md file in the project root 4 | # for full license information. 5 | # ============================================================================== 6 | 7 | from __future__ import print_function 8 | import numpy as np 9 | import sys 10 | import os 11 | import cntk 12 | 13 | data_path = sys.argv[1] 14 | model_path = sys.argv[2] 15 | 16 | # Merge stdout and stderr 17 | sys.stdout = sys.stderr 18 | 19 | 20 | # Define the reader for both training and evaluation action. 
21 | def create_reader(path, is_training, input_dim, label_dim): 22 | return cntk.io.MinibatchSource(cntk.io.CTFDeserializer(path, cntk.io.StreamDefs( 23 | features = cntk.io.StreamDef(field='features', shape=input_dim), 24 | labels = cntk.io.StreamDef(field='labels', shape=label_dim) 25 | )), randomize=is_training, max_sweeps = cntk.io.INFINITELY_REPEAT if is_training else 1) 26 | 27 | 28 | # Creates and trains a feedforward classification model for MNIST images 29 | def convnet_mnist(debug_output=False): 30 | image_height = 28 31 | image_width = 28 32 | num_channels = 1 33 | input_dim = image_height * image_width * num_channels 34 | num_output_classes = 10 35 | 36 | # Input variables denoting the features and label data 37 | input_var = cntk.ops.input((num_channels, image_height, image_width), np.float32) 38 | label_var = cntk.ops.input(num_output_classes, np.float32) 39 | 40 | # Instantiate the feedforward classification model 41 | scaled_input = cntk.ops.element_times(cntk.ops.constant(0.00390625), input_var) 42 | 43 | with cntk.layers.default_options(activation=cntk.ops.relu, pad=False): 44 | conv1 = cntk.layers.Convolution2D((5,5), 32, pad=True)(scaled_input) 45 | pool1 = cntk.layers.MaxPooling((3,3), (2,2))(conv1) 46 | conv2 = cntk.layers.Convolution2D((3,3), 48)(pool1) 47 | pool2 = cntk.layers.MaxPooling((3,3), (2,2))(conv2) 48 | conv3 = cntk.layers.Convolution2D((3,3), 64)(pool2) 49 | f4 = cntk.layers.Dense(96)(conv3) 50 | drop4 = cntk.layers.Dropout(0.5)(f4) 51 | z = cntk.layers.Dense(num_output_classes, activation=None)(drop4) 52 | 53 | ce = cntk.losses.cross_entropy_with_softmax(z, label_var) 54 | pe = cntk.metrics.classification_error(z, label_var) 55 | 56 | reader_train = create_reader(os.path.join(data_path, 'Train-28x28_cntk_text.txt'), True, input_dim, num_output_classes) 57 | 58 | # training config 59 | epoch_size = 60000 # for now we manually specify epoch size 60 | minibatch_size = 64 61 | max_epochs = 40 62 | 63 | # Set learning parameters 64 | lr_per_sample = [0.001]*10 + [0.0005]*10 + [0.0001] 65 | lr_schedule = cntk.learning_rate_schedule(lr_per_sample, cntk.learners.UnitType.sample, epoch_size) 66 | mm_time_constant = [0]*5 + [1024] 67 | mm_schedule = cntk.learners.momentum_as_time_constant_schedule(mm_time_constant, epoch_size) 68 | 69 | # Instantiate the trainer object to drive the model training 70 | learner = cntk.learners.momentum_sgd(z.parameters, lr_schedule, mm_schedule) 71 | progress_printer = cntk.logging.ProgressPrinter(tag='Training', 72 | num_epochs=max_epochs) 73 | trainer = cntk.Trainer(z, (ce, pe), learner, progress_printer) 74 | 75 | # define mapping from reader streams to network inputs 76 | input_map = { 77 | input_var : reader_train.streams.features, 78 | label_var : reader_train.streams.labels 79 | } 80 | 81 | cntk.logging.log_number_of_parameters(z) ; print() 82 | 83 | # Get minibatches of images to train with and perform model training 84 | for epoch in range(max_epochs): # loop over epochs 85 | sample_count = 0 86 | while sample_count < epoch_size: # loop over minibatches in the epoch 87 | data = reader_train.next_minibatch(min(minibatch_size, epoch_size - sample_count), input_map=input_map) # fetch minibatch. 
88 |             trainer.train_minibatch(data)                                   # update model with it
89 |             sample_count += data[label_var].num_samples                     # count samples processed so far
90 | 
91 |         trainer.summarize_training_progress()
92 |         z.save(os.path.join(model_path, "ConvNet_MNIST_{}.dnn".format(epoch)))
93 | 
94 |     # Load test data
95 |     reader_test = create_reader(os.path.join(data_path, 'Test-28x28_cntk_text.txt'), False, input_dim, num_output_classes)
96 | 
97 |     input_map = {
98 |         input_var : reader_test.streams.features,
99 |         label_var : reader_test.streams.labels
100 |     }
101 | 
102 |     # Test data for trained model
103 |     epoch_size = 10000
104 |     minibatch_size = 1024
105 | 
106 |     # process minibatches and evaluate the model
107 |     metric_numer = 0
108 |     metric_denom = 0
109 |     sample_count = 0
110 |     minibatch_index = 0
111 | 
112 |     while sample_count < epoch_size:
113 |         current_minibatch = min(minibatch_size, epoch_size - sample_count)
114 | 
115 |         # Fetch the next test minibatch.
116 |         data = reader_test.next_minibatch(current_minibatch, input_map=input_map)
117 | 
118 |         # Evaluate the model on the minibatch (no parameter updates happen here).
119 |         metric_numer += trainer.test_minibatch(data) * current_minibatch
120 |         metric_denom += current_minibatch
121 | 
122 |         # Keep track of the number of samples processed so far.
123 |         sample_count += data[label_var].num_samples
124 |         minibatch_index += 1
125 | 
126 |     print("")
127 |     print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index+1, (metric_numer*100.0)/metric_denom, metric_denom))
128 |     print("")
129 | 
130 |     return metric_numer/metric_denom
131 | 
132 | if __name__=='__main__':
133 |     convnet_mnist()
134 | 
135 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | # MSTest test Results 33 | [Tt]est[Rr]esult*/ 34 | [Bb]uild[Ll]og.* 35 | 36 | # NUNIT 37 | *.VisualState.xml 38 | TestResult.xml 39 | 40 | # Build Results of an ATL Project 41 | [Dd]ebugPS/ 42 | [Rr]eleasePS/ 43 | dlldata.c 44 | 45 | # .NET Core 46 | project.lock.json 47 | project.fragment.lock.json 48 | artifacts/ 49 | **/Properties/launchSettings.json 50 | 51 | *_i.c 52 | *_p.c 53 | *_i.h 54 | *.ilk 55 | *.meta 56 | *.obj 57 | *.pch 58 | *.pdb 59 | *.pgc 60 | *.pgd 61 | *.rsp 62 | *.sbr 63 | *.tlb 64 | *.tli 65 | *.tlh 66 | *.tmp 67 | *.tmp_proj 68 | *.log 69 | *.vspscc 70 | *.vssscc 71 | .builds 72 | *.pidb 73 | *.svclog 74 | *.scc 75 | 76 | # Chutzpah Test files 77 | _Chutzpah* 78 | 79 | # Visual C++ cache files 80 | ipch/ 81 | *.aps 82 | *.ncb 83 | *.opendb 84 | *.opensdf 85 | *.sdf 86 | *.cachefile 87 | *.VC.db 88 | *.VC.VC.opendb 89 | 90 | # Visual Studio profiler 91 | *.psess 92 | *.vsp 93 | *.vspx 94 | *.sap 95 | 96 | # TFS 2012 Local Workspace 97 | $tf/ 98 | 99 | # Guidance Automation Toolkit 100 | *.gpState 101 | 102 | # ReSharper is a .NET coding add-in 103 | _ReSharper*/ 104 | *.[Rr]e[Ss]harper 105 | *.DotSettings.user 106 | 107 | # JustCode is a .NET coding add-in 108 | .JustCode 109 | 110 | # TeamCity is a build add-in 111 | _TeamCity* 112 | 113 | # DotCover is a Code Coverage Tool 114 | *.dotCover 115 | 116 | # Visual Studio code coverage results 117 | *.coverage 118 | *.coveragexml 119 | 120 | # NCrunch 121 | _NCrunch_* 122 | .*crunch*.local.xml 123 | nCrunchTemp_* 124 | 125 | # MightyMoose 126 | *.mm.* 127 | AutoTest.Net/ 128 | 129 | # Web workbench (sass) 130 | .sass-cache/ 131 | 132 | # Installshield output folder 133 | [Ee]xpress/ 134 | 135 | # DocProject is a documentation generator add-in 136 | DocProject/buildhelp/ 137 | DocProject/Help/*.HxT 138 | DocProject/Help/*.HxC 139 | DocProject/Help/*.hhc 140 | DocProject/Help/*.hhk 141 | DocProject/Help/*.hhp 142 | DocProject/Help/Html2 143 | DocProject/Help/html 144 | 145 | # Click-Once directory 146 | publish/ 147 | 148 | # Publish Web Output 149 | *.[Pp]ublish.xml 150 | *.azurePubxml 151 | # TODO: Comment the next line if you want to checkin your web deploy settings 152 | # but database connection strings (with potential passwords) will be unencrypted 153 | *.pubxml 154 | *.publishproj 155 | 156 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 157 | # checkin your Azure Web App publish settings, but sensitive information contained 158 | # in these scripts will be unencrypted 159 | PublishScripts/ 160 | 161 | # NuGet Packages 162 | *.nupkg 163 | # The packages folder can be ignored because of Package Restore 164 | **/packages/* 165 | # except build/, which is used as an MSBuild target. 
166 | !**/packages/build/ 167 | # Uncomment if necessary however generally it will be regenerated when needed 168 | #!**/packages/repositories.config 169 | # NuGet v3's project.json files produces more ignorable files 170 | *.nuget.props 171 | *.nuget.targets 172 | 173 | # Microsoft Azure Build Output 174 | csx/ 175 | *.build.csdef 176 | 177 | # Microsoft Azure Emulator 178 | ecf/ 179 | rcf/ 180 | 181 | # Windows Store app package directories and files 182 | AppPackages/ 183 | BundleArtifacts/ 184 | Package.StoreAssociation.xml 185 | _pkginfo.txt 186 | 187 | # Visual Studio cache files 188 | # files ending in .cache can be ignored 189 | *.[Cc]ache 190 | # but keep track of directories ending in .cache 191 | !*.[Cc]ache/ 192 | 193 | # Others 194 | ClientBin/ 195 | ~$* 196 | *~ 197 | *.dbmdl 198 | *.dbproj.schemaview 199 | *.jfm 200 | *.pfx 201 | *.publishsettings 202 | orleans.codegen.cs 203 | 204 | # Since there are multiple workflows, uncomment next line to ignore bower_components 205 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 206 | #bower_components/ 207 | 208 | # RIA/Silverlight projects 209 | Generated_Code/ 210 | 211 | # Backup & report files from converting an old project file 212 | # to a newer Visual Studio version. Backup files are not needed, 213 | # because we have git ;-) 214 | _UpgradeReport_Files/ 215 | Backup*/ 216 | UpgradeLog*.XML 217 | UpgradeLog*.htm 218 | 219 | # SQL Server files 220 | *.mdf 221 | *.ldf 222 | *.ndf 223 | 224 | # Business Intelligence projects 225 | *.rdl.data 226 | *.bim.layout 227 | *.bim_*.settings 228 | 229 | # Microsoft Fakes 230 | FakesAssemblies/ 231 | 232 | # GhostDoc plugin setting file 233 | *.GhostDoc.xml 234 | 235 | # Node.js Tools for Visual Studio 236 | .ntvs_analysis.dat 237 | node_modules/ 238 | 239 | # Typescript v1 declaration files 240 | typings/ 241 | 242 | # Visual Studio 6 build log 243 | *.plg 244 | 245 | # Visual Studio 6 workspace options file 246 | *.opt 247 | 248 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
249 | *.vbw 250 | 251 | # Visual Studio LightSwitch build output 252 | **/*.HTMLClient/GeneratedArtifacts 253 | **/*.DesktopClient/GeneratedArtifacts 254 | **/*.DesktopClient/ModelManifest.xml 255 | **/*.Server/GeneratedArtifacts 256 | **/*.Server/ModelManifest.xml 257 | _Pvt_Extensions 258 | 259 | # Paket dependency manager 260 | .paket/paket.exe 261 | paket-files/ 262 | 263 | # FAKE - F# Make 264 | .fake/ 265 | 266 | # JetBrains Rider 267 | .idea/ 268 | *.sln.iml 269 | 270 | # CodeRush 271 | .cr/ 272 | 273 | # Python Tools for Visual Studio (PTVS) 274 | __pycache__/ 275 | *.pyc 276 | 277 | # Cake - Uncomment if you are using it 278 | # tools/** 279 | # !tools/packages.config 280 | 281 | # Telerik's JustMock configuration file 282 | *.jmconfig 283 | 284 | # BizTalk build output 285 | *.btp.cs 286 | *.btm.cs 287 | *.odx.cs 288 | *.xsd.cs 289 | -------------------------------------------------------------------------------- /recipes/utilities.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import json 4 | import os 5 | import time 6 | 7 | import azure.mgmt.batchai as training 8 | import azure.mgmt.batchai.models as models 9 | import requests 10 | from azure.common.credentials import ServicePrincipalCredentials 11 | from azure.mgmt.resource import ResourceManagementClient 12 | 13 | POLLING_INTERVAL_SEC = 5 14 | 15 | 16 | def encode(value): 17 | if isinstance(value, type('str')): 18 | return value 19 | return value.encode('utf-8') 20 | 21 | 22 | class Configuration: 23 | """Configuration for recipes and notebooks""" 24 | 25 | def __init__(self, file_name): 26 | if not os.path.exists(file_name): 27 | raise ValueError('Cannot find configuration file "{0}"'. 28 | format(file_name)) 29 | 30 | with open(file_name, 'r') as f: 31 | conf = json.load(f) 32 | 33 | try: 34 | self.subscription_id = encode(conf['subscription_id']) 35 | self.aad_client_id = encode(conf['aad_client_id']) 36 | self.aad_secret_key = encode(conf['aad_secret']) 37 | self.aad_token_uri = 'https://login.microsoftonline.com/{0}/oauth2/token'.format(encode(conf['aad_tenant'])) 38 | self.location = encode(conf['location']) 39 | self.url = encode(conf['base_url']) 40 | self.resource_group = encode(conf['resource_group']) 41 | self.storage_account_name = encode(conf['storage_account']['name']) 42 | self.storage_account_key = encode(conf['storage_account']['key']) 43 | self.admin = encode(conf['admin_user']['name']) 44 | self.admin_password = conf['admin_user'].get('password', None) 45 | if self.admin_password: 46 | self.admin_password = encode(self.admin_password) 47 | self.admin_ssh_key = conf['admin_user'].get('ssh_public_key', None) 48 | if self.admin_ssh_key: 49 | self.admin_ssh_key = encode(self.admin_ssh_key) 50 | if not self.admin_password and not self.admin_ssh_key: 51 | raise AttributeError( 52 | 'Please provide admin user password or public ssh key') 53 | except KeyError as err: 54 | raise AttributeError( 55 | 'Please provide a value for "{0}" configuration key'.format( 56 | err.args[0])) 57 | 58 | 59 | class OutputStreamer: 60 | """Helper class to stream (tail -f) job's output files.""" 61 | 62 | def __init__(self, client, resource_group, job_name, output_directory_id, 63 | file_name): 64 | self.client = client 65 | self.resource_group = resource_group 66 | self.job_name = job_name 67 | self.output_directory_id = output_directory_id 68 | self.file_name = file_name 69 | self.url = None 70 | self.downloaded = 0 71 | # if no 
output_directory_id or file_name is specified, the tail call is
72 |         # a no-op
73 |         if self.output_directory_id is None or self.file_name is None:
74 |             self.tail = lambda: None
75 | 
76 |     def tail(self):
77 |         if not self.url:
78 |             files = self.client.jobs.list_output_files(
79 |                 self.resource_group, self.job_name,
80 |                 models.JobsListOutputFilesOptions(
81 |                     self.output_directory_id))
82 |             if not files:
83 |                 return
84 |             else:
85 |                 for f in list(files):
86 |                     if f.name == self.file_name:
87 |                         self.url = f.download_url
88 |         if self.url:
89 |             r = requests.get(self.url, headers={
90 |                 'Range': 'bytes={0}-'.format(self.downloaded)})
91 |             if int(r.status_code / 100) == 2:
92 |                 self.downloaded += len(r.content)
93 |                 print(r.content.decode(), end='')
94 | 
95 | 
96 | def create_batchai_client(configuration):
97 |     client = training.BatchAIManagementClient(
98 |         credentials = ServicePrincipalCredentials(client_id=configuration.aad_client_id, secret=configuration.aad_secret_key, token_uri=configuration.aad_token_uri),
99 |         subscription_id = configuration.subscription_id,
100 |         base_url = configuration.url)
101 |     return client
102 | 
103 | 
104 | def create_resource_group(configuration):
105 |     client = ResourceManagementClient(
106 |         credentials = ServicePrincipalCredentials(client_id=configuration.aad_client_id, secret=configuration.aad_secret_key, token_uri=configuration.aad_token_uri),
107 |         subscription_id = configuration.subscription_id, base_url = configuration.url)
108 |     resource = client.resource_groups.create_or_update(configuration.resource_group, {'location': configuration.location})
109 | 
110 | 
111 | def download_file(sas, destination):
112 |     dir_name = os.path.dirname(destination)
113 |     if dir_name:
114 |         os.makedirs(dir_name, exist_ok=True)
115 |     print('Downloading {0} ...'.format(sas), end='')
116 |     r = requests.get(sas, stream=True)
117 |     with open(destination, 'wb') as f:
118 |         for chunk in r.iter_content(chunk_size=512 * 1024):
119 |             if chunk:  # filter out keep-alive new chunks
120 |                 f.write(chunk)
121 |     f.close()
122 |     print('Done')
123 | 
124 | 
125 | def print_job_status(job):
126 |     failure_message = None
127 |     exit_code = 'None'
128 |     if job.execution_info is not None:
129 |         exit_code = job.execution_info.exit_code
130 |     if job.execution_state == models.ExecutionState.failed:
131 |         for error in job.execution_info.errors:
132 |             failure_message = \
133 |                 '\nErrorCode:{0}\nErrorMessage:{1}\n'. \
134 |                 format(error.code,
135 |                        error.message)
136 |             if error.details is not None:
137 |                 failure_message += 'Details:\n'
138 |                 for detail in error.details:
139 |                     failure_message += '{0}:{1}\n'.format(detail.name,
140 |                                                           detail.value)
141 |     print('Job state: {0} ExitCode: {1}'.format(job.execution_state.name,
142 |                                                 exit_code))
143 |     if failure_message:
144 |         print('FailureDetails: {0}'.format(failure_message))
145 | 
146 | 
147 | def print_cluster_status(cluster):
148 |     print(
149 |         'Cluster state: {0} Target: {1}; Allocated: {2}; Idle: {3}; '
150 |         'Unusable: {4}; Running: {5}; Preparing: {6}; Leaving: {7}'.format(
151 |             cluster.allocation_state,
152 |             cluster.scale_settings.manual.target_node_count,
153 |             cluster.current_node_count,
154 |             cluster.node_state_counts.idle_node_count,
155 |             cluster.node_state_counts.unusable_node_count,
156 |             cluster.node_state_counts.running_node_count,
157 |             cluster.node_state_counts.preparing_node_count,
158 |             cluster.node_state_counts.leaving_node_count))
159 |     if not cluster.errors:
160 |         return
161 |     for error in cluster.errors:
162 |         print('Cluster error: {0}: {1}'.format(error.code, error.message))
163 |         if error.details:
164 |             print('Details:')
165 |             for detail in error.details:
166 |                 print('{0}: {1}'.format(detail.name, detail.value))
167 | 
168 | 
169 | def wait_for_job_completion(client, resource_group, job_name, cluster_name,
170 |                             output_directory_id=None, file_name=None):
171 |     """
172 |     Waits for job completion and tails a file specified by output_directory_id
173 |     and file_name.
174 |     """
175 |     # Wait for job to start running
176 |     while True:
177 |         cluster = client.clusters.get(resource_group, cluster_name)
178 |         print_cluster_status(cluster)
179 |         job = client.jobs.get(resource_group, job_name)
180 |         print_job_status(job)
181 |         if job.execution_state != models.ExecutionState.queued:
182 |             break
183 |         time.sleep(POLLING_INTERVAL_SEC)
184 | 
185 |     print('Waiting for job output to become available...')
186 | 
187 |     # Tail the output file and wait for job to complete
188 |     streamer = OutputStreamer(client, resource_group, job_name,
189 |                               output_directory_id, file_name)
190 |     while True:
191 |         streamer.tail()
192 |         job = client.jobs.get(resource_group, job_name)
193 |         if job.execution_state == models.ExecutionState.succeeded or job.execution_state == models.ExecutionState.failed:
194 |             break
195 |         time.sleep(1)
196 |     streamer.tail()
197 |     print_job_status(job)
--------------------------------------------------------------------------------
/recipes/Readme.md:
--------------------------------------------------------------------------------
1 | # Getting Started with the Recipes
2 | 
3 | ## Prerequisites
4 | 
5 | 1. An Azure subscription. This can be a free trial subscription, MSDN, or the one you use for other work.
6 | 2. The Azure Python SDK and azure-mgmt-batchai, if you would like to run recipes using the Python Jupyter notebooks. See how to install the [Azure SDK](https://docs.microsoft.com/en-us/python/azure/python-sdk-azure-install?view=azure-python).
7 | 3. Azure CLI 2.0, if you would like to run recipes using the Azure CLI. See [Install Azure CLI 2.0](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest#install-on-windows) for instructions.
8 | 4. An Azure Storage Account in East US (required for all recipes). See [How to create Azure storage accounts](https://docs.microsoft.com/en-us/azure/storage/common/storage-create-storage-account?toc=%2fazure%2fstorage%2ffiles%2ftoc.json)
9 | 
10 | ## Make a Local Copy of Repo
11 | 
12 | To start, please clone or download this [repo](https://github.com/Azure/BatchAI).
13 | 
14 | ## Recipe Instructions
15 | 
16 | Use the following links for quick navigation:
17 | 
18 | 1. [Run Recipes Using Python Jupyter notebook](#jupyternotebook)
19 | 2. [Run Recipes Using Azure CLI 2.0](#azurecli)
20 | 
21 | ## Run Recipes Using Python Jupyter notebook
22 | 
23 | ### Create Credentials for Service Principal Authentication
24 | Jupyter notebook recipes require you to use service principal authentication rather than providing your account credentials.
25 | There are several ways to create a Service Principal, as described in the following sections:
26 | 
27 | #### Using Azure CLI 2.0
28 | 1. Log in to Azure CLI 2.0
29 | 2. Execute the following command
30 | ```sh
31 | $ az ad sp create-for-rbac
32 | ```
33 | Example output:
34 | ```
35 | {
36 |   "appId": "...",
37 |   "displayName": "azure-cli-2017-10-27-18-45-51",
38 |   "name": "http://azure-cli-2017-10-27-18-45-51",
39 |   "password": "...",
40 |   "tenant": "..."
41 | }
42 | ```
43 | Use the appId value as aad_client_id, password as aad_secret and tenant as aad_tenant when creating the configuration file later.
44 | 
45 | #### Using Portal
46 | 1. Log in to your Azure Account through the [Azure portal](https://portal.azure.com/).
47 | 2. Select *Azure Active Directory*.
48 | 3. To get the AAD tenant ID, select *Properties* and copy the *Directory ID*. This value is your **AAD tenant ID**.
49 | 4. Go back to *Azure Active Directory* and select *App registrations*.
50 | 5. Select *New application registration*.
51 | 6. Provide a name and URL for the application. After setting the values, select *Create*.
52 | 7. From *App registrations* in *Azure Active Directory*, select your application.
53 | 8. Copy the *Application ID*; this is your **AAD Client ID**.
54 | 9. To generate an authentication key, select *Keys*.
55 | 10. Provide a description and a duration for the key. When done, select *Save*. After saving the key, the value of the key is displayed. Copy this value because you will not be able to retrieve the key later. This is your **AAD Secret**.
56 | 11. To assign the newly created application, select the subscription you are going to use for Azure Batch AI. (You can find it from *More Services* -> *Subscriptions*)
57 | 12. Select *Access control (IAM)*
58 | 13. Select *Add*
59 | 14. Select *Contributor* as the *role*
60 | 15. Search for your application and select it.
61 | 16. Select *Save* to finish assigning the role. You will see your application in the list of users assigned to a role for that scope.
62 | 
63 | For a more detailed walk-through, please see [this link](https://docs.microsoft.com/en-us/azure/azure-resource-manager/resource-group-create-service-principal-portal).
64 | 
65 | ### Register BatchAI Resource Providers
66 | 1. Log in to your Azure Account through the [Azure portal](https://portal.azure.com/).
67 | 2. Select the subscription you are going to use for Azure Batch AI. (You can find it from *More Services* -> *Subscriptions*)
68 | 3. Select *Resource providers*
69 | 4. Register the **Microsoft.BatchAI** and **Microsoft.Batch** providers.
70 | 
71 | Note, a provider registration can take up to 15 minutes.
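If you prefer to script this step, the same azure-mgmt-resource client that utilities.py uses can register the providers. A minimal sketch; the angle-bracket values are placeholders for your own configuration values:

```python
# Registers the Batch AI resource providers programmatically; a sketch that
# assumes the AAD values below come from your configuration.json.
import time
from azure.common.credentials import ServicePrincipalCredentials
from azure.mgmt.resource import ResourceManagementClient

credentials = ServicePrincipalCredentials(
    client_id='<aad_client_id>', secret='<aad_secret>',
    token_uri='https://login.microsoftonline.com/<aad_tenant>/oauth2/token')
client = ResourceManagementClient(credentials, '<subscription_id>')

for namespace in ('Microsoft.BatchAI', 'Microsoft.Batch'):
    client.providers.register(namespace)
    # Poll until the provider reports 'Registered' (can take up to 15 minutes).
    while client.providers.get(namespace).registration_state != 'Registered':
        time.sleep(15)
```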
72 | 
73 | ### Grant Batch AI Network Contributor Role on Your Subscription
74 | You can use two different approaches:
75 | 
76 | #### Using Azure CLI 2.0
77 | ```sh
78 | az role assignment create --scope /subscriptions/<your subscription id> --role "Network Contributor" --assignee 9fcb3732-5f52-4135-8c08-9d4bbaf203ea
79 | ```
80 | Here `9fcb3732-5f52-4135-8c08-9d4bbaf203ea` is the service principal of Microsoft Azure BatchAI.
81 | 
82 | #### Using Portal
83 | 1. Select the subscription you are going to use for Azure Batch AI. (You can find it from *More Services* -> *Subscriptions*)
84 | 2. Select *Access control (IAM)*
85 | 3. Select *Add*
86 | 4. Select *Network Contributor* as the *role*
87 | 5. Search for the 'Microsoft Azure BatchAI' application and select it.
88 | 6. Select *Save* to finish assigning the role.
89 | 
90 | ### Create Configuration File for All Recipes
91 | 
92 | - Rename [configuration.json.template](/recipes/configuration.json.template) to configuration.json.
93 | - Fill in your subscription id and the AAD application information obtained in the step above.
94 | - Leave the "base_url" field empty.
95 | - Specify the name of your resource group; the recipes will automatically create the resource group if it does not exist.
96 | - Specify your Azure Storage account name and key; please see [this page](https://docs.microsoft.com/en-us/azure/storage/common/storage-create-storage-account?toc=%2fazure%2fstorage%2ffiles%2ftoc.json).
97 | - Batch AI creates an administrator user account on every compute node and enables ssh. You need to specify a user name and at least a password or an ssh public key for this account.
98 | 
99 | ### Helper functions in utilities.py
100 | 
101 | For your convenience, we provide a collection of helper functions in [utilities.py](./utilities.py) used by each recipe to:
102 | 
103 | - Read parameters from the configuration file
104 | - Create a Python client object (BatchAIManagementClient) to access the Azure Batch AI service
105 | - Create/Update a resource group
106 | - Download a file with a given shared access signature (SAS)
107 | - Print Job/Cluster status
108 | - Stream files
109 | 
110 | ### Install Azure Batch AI Management Client
111 | 
112 | Install the Batch AI management client using the following command:
113 | 
114 | ```sh
115 | pip install azure-mgmt-batchai
116 | ```
117 | 
118 | ### Install Azure Python SDK
119 | 
120 | Since all recipes utilize APIs from other Azure products (e.g., Azure storage, credentials), it is also required to install the full Azure Python SDK package:
121 | ```sh
122 | pip install azure
123 | ```
124 | 
125 | ### Install Jupyter Notebook
126 | 
127 | Please install Jupyter Notebook from https://jupyter.org/ or run
128 | 
129 | ```sh
130 | python -m pip install jupyter
131 | ```
132 | 
133 | ### Start to Run Recipes
134 | 
135 | - Navigate to the root of your cloned recipes directory
136 | ```sh
137 | cd /BatchAI/recipes
138 | ```
139 | 
140 | - Launch the Jupyter Notebook by
141 | ```sh
142 | jupyter notebook
143 | ```
144 | 
145 | - In the opened browser, navigate into the recipe of interest and start the *.ipynb file.
146 | 
147 | 
148 | ## Run Recipes Using Azure CLI 2.0
149 | 
150 | ### Install and Configure Azure CLI 2.0
151 | 
152 | Please follow the Azure CLI 2.0 Batch AI specific [documentation](/documentation/using-azure-cli-20.md) to install and
153 | configure Azure CLI 2.0 for use with Batch AI.
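Whether you drive the recipes from the CLI or from notebooks, the helper functions described above bootstrap everything from the configuration file. A minimal sketch, assuming configuration.json has been filled in and the script runs from the /recipes folder where utilities.py lives:

```python
# Minimal bootstrap used by the notebook recipes; assumes configuration.json
# is filled in and utilities.py is importable from the current folder.
from utilities import Configuration, create_batchai_client, create_resource_group

cfg = Configuration('configuration.json')  # read subscription/AAD/storage settings
create_resource_group(cfg)                 # create the resource group if it does not exist
client = create_batchai_client(cfg)        # BatchAIManagementClient for clusters and jobs
```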
154 | 
155 | ### Generate Authentication Key for SSH (for Cloud Shell and GNU/Linux Users)
156 | 
157 | During cluster and file server creation you will need to specify a name and authentication method for the administrator account which will be created on each compute node (you can use this account to ssh to the node).
158 | 
159 | You can provide a password and/or an ssh public key as the authentication method via the --password (-p) and --ssh-public-key (-k) parameters.
160 | 
161 | GNU/Linux users (including Cloud Shell users) can generate an authentication key for ssh using the `ssh-keygen` command.
162 | 
163 | Note, the GNU/Linux part of the recipes expects you to have a public ssh key at ~/.ssh/id_rsa.pub; if you prefer to use a different ssh key, please update the -k parameter value.
164 | 
165 | ### Install unzip package (for GNU/Linux Users)
166 | 
167 | Training data used in the recipes is compressed in `zip` archives and requires the `unzip` utility to be installed on the host; please install it using your distribution's package manager.
168 | 
169 | Cloud Shell has `unzip` already installed.
170 | 
171 | ### Run Recipes
172 | 
173 | Each recipe contains a `cli-instructions.md` file which describes the input data, cluster and job configuration, and provides instructions for cluster and job creation.
174 | 
175 | ## Help or Feedback
176 | --------------------
177 | If you have any problems or questions, you can reach the Batch AI team at [AzureBatchAITrainingPreview@service.microsoft.com](mailto:AzureBatchAITrainingPreview@service.microsoft.com) or you can create an issue on GitHub.
178 | 
179 | We also welcome your contributions of additional sample notebooks, scripts, or other examples of working with Batch AI.
180 | 
--------------------------------------------------------------------------------
/recipes/CNTK/CNTK-GPU-Python-Distributed/ConvNet_CIFAR10_DataAug.py:
--------------------------------------------------------------------------------
1 | # ==============================================================================
2 | # Copyright (c) Microsoft. All rights reserved.
3 | # Licensed under the MIT license. See LICENSE.md file in the project root
4 | # for full license information.
5 | # ============================================================================== 6 | 7 | from __future__ import print_function 8 | import os 9 | import math 10 | import numpy as np 11 | import cntk 12 | import _cntk_py 13 | import cntk.io.transforms as xforms 14 | 15 | from cntk.layers import Convolution2D, MaxPooling, AveragePooling, Dropout, BatchNormalization, Dense, default_options, identity, Sequential, For 16 | from cntk.layers.typing import * 17 | from cntk.io import MinibatchSource, ImageDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT 18 | from cntk import Trainer, use_default_device 19 | from cntk.learners import momentum_sgd, learning_rate_schedule, UnitType, momentum_as_time_constant_schedule 20 | from cntk import cross_entropy_with_softmax, classification_error, relu 21 | from cntk.ops import Function 22 | from cntk.debugging import set_computation_network_trace_level 23 | from cntk.logging import * 24 | 25 | ######################## 26 | # variables and paths # 27 | ######################## 28 | 29 | # paths (are relative to current python file) 30 | abs_path = os.path.dirname(os.path.abspath(__file__)) 31 | data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "CIFAR-10") 32 | model_path = os.path.join(abs_path, "Models") 33 | 34 | # model dimensions 35 | image_height = 32 36 | image_width = 32 37 | num_channels = 3 # RGB 38 | num_classes = 10 39 | 40 | ######################## 41 | # define the reader # 42 | ######################## 43 | 44 | def create_reader(map_file, mean_file, is_training): 45 | if not os.path.exists(map_file) or not os.path.exists(mean_file): 46 | raise RuntimeError("File '%s' or '%s' does not exist. Please run install_cifar10.py from DataSets/CIFAR-10 to fetch them" % 47 | (map_file, mean_file)) 48 | 49 | # transformation pipeline for the features has jitter/crop only when training 50 | transforms = [] 51 | if is_training: 52 | transforms += [ 53 | xforms.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter 54 | ] 55 | transforms += [ 56 | xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'), 57 | xforms.mean(mean_file) 58 | ] 59 | # deserializer 60 | return MinibatchSource(ImageDeserializer(map_file, StreamDefs( 61 | features=StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image' 62 | labels=StreamDef(field='label', shape=num_classes))), # and second as 'label' 63 | randomize=is_training, max_sweeps = INFINITELY_REPEAT if is_training else 1) 64 | 65 | ######################## 66 | # define the model # 67 | ######################## 68 | 69 | def create_convnet_cifar10_model(num_classes): 70 | with default_options(activation=relu, pad=True): 71 | return Sequential([ 72 | For(range(2), lambda : [ 73 | Convolution2D((3,3), 64), 74 | Convolution2D((3,3), 64), 75 | MaxPooling((3,3), strides=2) 76 | ]), 77 | For(range(2), lambda i: [ 78 | Dense([256,128][i]), 79 | Dropout(0.5) 80 | ]), 81 | Dense(num_classes, activation=None) 82 | ]) 83 | 84 | ######################## 85 | # define the criteria # 86 | ######################## 87 | 88 | # compose model function and criterion primitives into a criterion function 89 | # takes: Function: features -> prediction 90 | # returns: Function: (features, labels) -> (loss, metric) 91 | def create_criterion_function(model, normalize=identity): 92 | #@Function # Python 3 93 | #def criterion(x: Tensor[(num_channels, image_height, image_width)], y: Tensor[num_classes]): 94 | 
@Function 95 | @Signature(x = Tensor[(num_channels, image_height, image_width)], y = Tensor[num_classes]) 96 | def criterion(x, y): 97 | z = model(normalize(x)) 98 | ce = cross_entropy_with_softmax(z, y) 99 | errs = classification_error (z, y) 100 | return (ce, errs) 101 | return criterion 102 | 103 | ######################## 104 | # train action # 105 | ######################## 106 | 107 | def train_model(reader, model, criterion, epoch_size=50000, max_epochs=80): 108 | minibatch_size = 64 109 | 110 | # learning parameters 111 | learner = momentum_sgd(model.parameters, 112 | lr = learning_rate_schedule([0.0015625]*20+[0.00046875]*20+[0.00015625]*20+[0.000046875]*10+[0.000015625], minibatch_size=1, epoch_size=epoch_size), 113 | momentum = momentum_as_time_constant_schedule([0]*20+[600]*20+[1200], epoch_size=epoch_size), 114 | l2_regularization_weight = 0.002) 115 | 116 | # trainer object 117 | trainer = Trainer(None, criterion, learner) 118 | 119 | # perform model training 120 | log_number_of_parameters(model) ; print() 121 | progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs) 122 | 123 | for epoch in range(max_epochs): # loop over epochs 124 | sample_count = 0 125 | while sample_count < epoch_size: # loop over minibatches in the epoch 126 | mb = reader.next_minibatch(min(minibatch_size, epoch_size - sample_count)) # fetch minibatch. 127 | #trainer.train_minibatch(mb[reader.streams.features], mb[reader.streams.labels]) 128 | trainer.train_minibatch({criterion.arguments[0]: mb[reader.streams.features], criterion.arguments[1]: mb[reader.streams.labels]}) 129 | sample_count += mb[reader.streams.labels].num_samples # count samples processed so far 130 | progress_printer.update_with_trainer(trainer, with_metric=True) # log progress 131 | 132 | loss, metric, actual_samples = progress_printer.epoch_summary(with_metric=True) 133 | model.save(os.path.join(model_path, "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch))) 134 | 135 | # return evaluation error. 
136 |     return loss, metric  # return values from last epoch
137 | 
138 | ########################
139 | # eval action          #
140 | ########################
141 | 
142 | # helper function to create a dummy Trainer that one can call test_minibatch() on
143 | # TODO: replace by a proper such class once available
144 | def Evaluator(criterion):
145 |     loss, metric = Trainer._get_loss_metric(criterion)
146 |     parameters = set(loss.parameters)
147 |     if metric:
148 |         parameters |= set(metric.parameters)
149 |     dummy_learner = momentum_sgd(tuple(parameters),
150 |                                  lr = learning_rate_schedule(1, UnitType.minibatch),
151 |                                  momentum = momentum_as_time_constant_schedule(0))
152 |     return Trainer(None, (loss, metric), dummy_learner)
153 | 
154 | def evaluate(reader, criterion, device=None, minibatch_size=16, max_samples=None):
155 | 
156 |     # process minibatches and perform evaluation
157 |     if not device:
158 |         device = use_default_device()
159 | 
160 |     evaluator = Evaluator(criterion)
161 |     progress_printer = ProgressPrinter(tag='Evaluation', num_epochs=1)
162 | 
163 |     samples_evaluated = 0
164 |     while True:
165 |         if (max_samples and samples_evaluated >= max_samples):
166 |             break
167 | 
168 |         # Fetch minibatches until we hit the end
169 |         mb = reader.next_minibatch(minibatch_size)
170 |         if not mb:
171 |             break
172 | 
173 |         metric = evaluator.test_minibatch({criterion.arguments[0]: mb[reader.streams.features], criterion.arguments[1]: mb[reader.streams.labels]}, device=device)
174 |         samples_evaluated += minibatch_size
175 |         progress_printer.update(0, mb[reader.streams.labels].num_samples, metric)  # log progress
176 | 
177 |     loss, metric, actual_samples = progress_printer.epoch_summary(with_metric=True)
178 |     return loss, metric
179 | 
180 | #############################
181 | # main function boilerplate #
182 | #############################
183 | 
184 | if __name__=='__main__':
185 |     # create model
186 |     model = create_convnet_cifar10_model(num_classes=10)
187 |     # declare the model's input dimension
188 |     # Training does not require this, but it is needed for deployment.
189 |     model.update_signature((num_channels, image_height, image_width))
190 | 
191 |     # criterion function. This is what is being trained.
192 |     # Model gets "sandwiched" between normalization (not part of model proper) and criterion.
193 | criterion = create_criterion_function(model, normalize=lambda x: x / 256) 194 | 195 | # train 196 | reader = create_reader(os.path.join(data_path, 'train_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), True) 197 | train_model(reader, model, criterion, max_epochs=80) 198 | 199 | # save and load (as an illustration) 200 | path = data_path + "/model.cmf" 201 | model.save(path) 202 | 203 | # test 204 | model = Function.load(path) 205 | reader = create_reader(os.path.join(data_path, 'test_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), False) 206 | criterion = create_criterion_function(model, normalize=lambda x: x / 256) 207 | evaluate(reader, criterion) 208 | -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-Python-Distrbuted-Infiniband/dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04 2 | 3 | # install base system 4 | RUN apt-get update && apt-get install -y --no-install-recommends \ 5 | autotools-dev \ 6 | build-essential \ 7 | cmake \ 8 | git \ 9 | gfortran-multilib \ 10 | libavcodec-dev \ 11 | libavformat-dev \ 12 | libjasper-dev \ 13 | libjpeg-dev \ 14 | libpng-dev \ 15 | liblapacke-dev \ 16 | libswscale-dev \ 17 | libtiff-dev \ 18 | pkg-config \ 19 | wget \ 20 | zlib1g-dev \ 21 | # Protobuf 22 | ca-certificates \ 23 | curl \ 24 | unzip \ 25 | # For Kaldi 26 | python-dev \ 27 | automake \ 28 | libtool \ 29 | autoconf \ 30 | subversion \ 31 | # For Kaldi's dependencies 32 | libapr1 libaprutil1 libltdl-dev libltdl7 libserf-1-1 libsigsegv2 libsvn1 m4 \ 33 | # For Java Bindings 34 | openjdk-9-jdk-headless \ 35 | # For SWIG 36 | libpcre3-dev \ 37 | libpcre++-dev && \ 38 | apt-get install -y --no-install-recommends \ 39 | # Infiniband/RDMA 40 | cpio \ 41 | libmlx4-1 \ 42 | libmlx5-1 \ 43 | librdmacm1 \ 44 | libibverbs1 \ 45 | libmthca1 \ 46 | libdapl2 \ 47 | dapl2-utils 48 | 49 | # build and install libzip, cub, boost, openblas, opencv, protobuf 50 | RUN LIBZIP_VERSION=1.1.3 && \ 51 | wget -q -O - http://nih.at/libzip/libzip-${LIBZIP_VERSION}.tar.gz | tar -xzf - && \ 52 | cd libzip-${LIBZIP_VERSION} && \ 53 | ./configure --prefix=/usr/local && \ 54 | make -j"$(nproc)" install && \ 55 | ldconfig /usr/local/lib && \ 56 | cd .. && \ 57 | rm -rf /libzip-${LIBZIP_VERSION} && \ 58 | # boost 59 | BOOST_VERSION=1_60_0 && \ 60 | BOOST_DOTTED_VERSION=$(echo $BOOST_VERSION | tr _ .) && \ 61 | wget -q -O - https://sourceforge.net/projects/boost/files/boost/${BOOST_DOTTED_VERSION}/boost_${BOOST_VERSION}.tar.gz/download | tar -xzf - && \ 62 | cd boost_${BOOST_VERSION} && \ 63 | ./bootstrap.sh --prefix=/usr/local --with-libraries=filesystem,system,test && \ 64 | ./b2 -d0 -j"$(nproc)" install && \ 65 | ldconfig /usr/local/lib && \ 66 | cd .. && \ 67 | rm -rf /boost_${BOOST_VERSION} && \ 68 | # cub 69 | wget -q -O - https://github.com/NVlabs/cub/archive/1.4.1.tar.gz | tar -C /usr/local -xzf - && \ 70 | # openblas 71 | OPENBLAS_VERSION=0.2.19 && \ 72 | wget -q -O - https://github.com/xianyi/OpenBLAS/archive/v${OPENBLAS_VERSION}.tar.gz | tar -xzf - && \ 73 | cd OpenBLAS-${OPENBLAS_VERSION} && \ 74 | make -j"$(nproc)" USE_OPENMP=1 | tee make.log && \ 75 | grep -qF 'OpenBLAS build complete. (BLAS CBLAS LAPACK LAPACKE)' make.log && \ 76 | grep -qF 'Use OpenMP in the multithreading.' make.log && \ 77 | make PREFIX=/usr/local/openblas install && \ 78 | ldconfig /usr/local/openblas && \ 79 | cd .. 
&& \ 80 | rm -rf /OpenBLAS-${OPENBLAS_VERSION} && \ 81 | # opencv 82 | OPENCV_VERSION=3.1.0 && \ 83 | wget -q -O - https://github.com/opencv/opencv/archive/${OPENCV_VERSION}.tar.gz | tar -xzf - && \ 84 | cd opencv-${OPENCV_VERSION} && \ 85 | cmake -DWITH_CUDA=OFF -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_INSTALL_PREFIX=/usr/local/opencv-${OPENCV_VERSION} . && \ 86 | make -j"$(nproc)" install && \ 87 | ldconfig /usr/local/lib && \ 88 | cd .. && \ 89 | rm -rf /opencv-${OPENCV_VERSION} && \ 90 | # protocol buffers 91 | PROTOBUF_VERSION=3.1.0 \ 92 | PROTOBUF_STRING=protobuf-$PROTOBUF_VERSION && \ 93 | wget -O - --no-verbose https://github.com/google/protobuf/archive/v${PROTOBUF_VERSION}.tar.gz | tar -xzf - && \ 94 | cd $PROTOBUF_STRING && \ 95 | ./autogen.sh && \ 96 | ./configure CFLAGS=-fPIC CXXFLAGS=-fPIC --disable-shared --prefix=/usr/local/$PROTOBUF_STRING && \ 97 | make -j $(nproc) install && \ 98 | cd .. && \ 99 | rm -rf $PROTOBUF_STRING 100 | 101 | # set env vars 102 | ENV KALDI_VERSION=c024e8aa 103 | ENV MKLML_VERSION=mklml_lnx_2018.0.1.20171007 104 | ENV PATH=/root/anaconda3/envs/cntk-py36/bin:/usr/local/bin:/cntk/build-mkl/gpu/release/bin:${PATH} \ 105 | KALDI_PATH=/usr/local/kaldi-$KALDI_VERSION \ 106 | BLAS=/usr/local/openblas/lib/libopenblas.so \ 107 | LAPACK=/usr/local/openblas/lib/libopenblas.so \ 108 | MKL_PATH=/usr/local/CNTKCustomMKL \ 109 | PYTHONPATH=/cntk/bindings/python:$PYTHONPATH \ 110 | LD_LIBRARY_PATH=/usr/local/openblas/lib:/cntk/bindings/python/cntk/libs:$LD_LIBRARY_PATH 111 | 112 | # install cntk custom mkl, kaldi, swig and anaconda 113 | RUN mkdir ${MKL_PATH} && \ 114 | wget --no-verbose -O - https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VERSION}.tgz | \ 115 | tar -xzf - -C ${MKL_PATH} && \ 116 | # kaldi 117 | mkdir $KALDI_PATH && \ 118 | wget --no-verbose -O - https://github.com/kaldi-asr/kaldi/archive/$KALDI_VERSION.tar.gz | tar -xzf - --strip-components=1 -C $KALDI_PATH && \ 119 | cd $KALDI_PATH/tools && \ 120 | perl -pi -e 's/^# (OPENFST_VERSION = 1.4.1)$/\1/' Makefile && \ 121 | #/bin/bash extras/check_dependencies.sh && \ 122 | #make -j $(nproc) all && \ 123 | make -j $(nproc) sph2pipe atlas sclite openfst && \ 124 | cd ../src && \ 125 | ./configure --openblas-root=/usr/local/openblas --shared && \ 126 | make -j $(nproc) depend && \ 127 | make -j $(nproc) all && \ 128 | find $KALDI_PATH -name '*.o' -print0 | xargs -0 rm && \ 129 | for dir in $KALDI_PATH/src/*bin; do make -C $dir clean; done && \ 130 | # SWIG 131 | SWIG_VERSION=3.0.10 && \ 132 | cd /root && \ 133 | wget -q http://prdownloads.sourceforge.net/swig/swig-${SWIG_VERSION}.tar.gz -O - | tar xvfz - && \ 134 | cd swig-${SWIG_VERSION} && \ 135 | ./configure --without-alllang && \ 136 | make -j$(nproc) && \ 137 | make install && \ 138 | cd .. 
&& \ 139 | rm -rf swig-${SWIG_VERSION} && \ 140 | # Anaconda 141 | wget -q https://repo.continuum.io/archive/Anaconda3-4.4.0-Linux-x86_64.sh && \ 142 | bash Anaconda3-4.4.0-Linux-x86_64.sh -b && \ 143 | rm -f Anaconda3-4.4.0-Linux-x86_64.sh && \ 144 | # set paths for CNTK 145 | mkdir -p /usr/local/cudnn/cuda/include && \ 146 | ln -s /usr/include/cudnn.h /usr/local/cudnn/cuda/include/cudnn.h && \ 147 | mkdir -p /usr/local/cudnn/cuda/lib64 && \ 148 | ln -s /etc/alternatives/libcudnn_so /usr/local/cudnn/cuda/lib64/libcudnn.so && \ 149 | ln -s /usr/local/cuda/lib64/stubs/libnvidia-ml.so /usr/local/cuda/lib64/stubs/libnvidia-ml.so.1 && \ 150 | # update ldconfig 151 | ldconfig /usr/local/lib 152 | 153 | # set cntk dir 154 | WORKDIR /cntk 155 | 156 | # add intel mpi library and build cntk 157 | ENV MANPATH=/usr/share/man:/usr/local/man \ 158 | COMPILERVARS_ARCHITECTURE=intel64 \ 159 | COMPILERVARS_PLATFORM=linux \ 160 | INTEL_MPI_PATH=/opt/intel/compilers_and_libraries/linux/mpi 161 | RUN cd /tmp && \ 162 | wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz' && \ 163 | tar zxvf l_mpi_2017.3.196.tgz && \ 164 | sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg && \ 165 | sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' /tmp/l_mpi_2017.3.196/silent.cfg && \ 166 | sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg && \ 167 | cd /tmp/l_mpi_2017.3.196 && \ 168 | ./install.sh -s silent.cfg && \ 169 | cd .. 170 | 171 | # cntk makefiles use non-standard mpic++, symlink to mpicxx 172 | RUN ln -s ${INTEL_MPI_PATH}/${COMPILERVARS_ARCHITECTURE}/bin/mpicxx ${INTEL_MPI_PATH}/${COMPILERVARS_ARCHITECTURE}/bin/mpic++ && \ 173 | # build cntk 174 | CNTK_VERSION=v2.3 && \ 175 | cd /cntk && \ 176 | git clone --depth=1 --recursive -b ${CNTK_VERSION} --single-branch https://github.com/Microsoft/CNTK.git . 177 | 178 | # add cast in /cntk/Source/CNTKv2LibraryDll/Trainer.cpp to prevent build issue 179 | RUN sed -i 's|, unit)|, (int)unit)|g' /cntk/Source/CNTKv2LibraryDll/Trainer.cpp 180 | 181 | # set Anaconda environment 182 | RUN /root/anaconda3/bin/conda env create -p /root/anaconda3/envs/cntk-py36/ \ 183 | --file /cntk/Scripts/install/linux/conda-linux-cntk-py36-environment.yml && \ 184 | # source intel mpi vars 185 | . /opt/intel/bin/compilervars.sh && \ 186 | . 
/opt/intel/compilers_and_libraries/linux/mpi/bin64/mpivars.sh && \ 187 | # build gpu-mkl only 188 | CONFIGURE_OPTS="\ 189 | --1bitsgd=yes \ 190 | --with-mpi=${INTEL_MPI_PATH}/${COMPILERVARS_ARCHITECTURE} \ 191 | --with-cuda=/usr/local/cuda \ 192 | --with-gdk-include=/usr/local/cuda/include \ 193 | --with-gdk-nvml-lib=/usr/local/cuda/lib64/stubs \ 194 | --with-kaldi=${KALDI_PATH} \ 195 | --with-py36-path=/root/anaconda3/envs/cntk-py36 \ 196 | --with-cudnn=/usr/local/cudnn" && \ 197 | mkdir -p build-mkl/gpu/release && \ 198 | cd build-mkl/gpu/release && \ 199 | ../../../configure $CONFIGURE_OPTS --with-mkl=${MKL_PATH}/${MKLML_VERSION} && \ 200 | make -j"$(nproc)" 201 | 202 | # clean up 203 | RUN rm -rf /cntk/build-mkl/gpu/release/.build && \ 204 | rm -rf /cntk/.git && \ 205 | /root/anaconda3/bin/conda clean --all --yes && \ 206 | # create activate script 207 | echo "source /root/anaconda3/bin/activate /root/anaconda3/envs/cntk-py36" > /cntk/activate-cntk && \ 208 | # add cntk activate to root bashrc 209 | echo "source /cntk/activate-cntk" >> /root/.bashrc && \ 210 | # add LD_LIBRARY_PATH to root bashrc 211 | echo LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:'$LD_LIBRARY_PATH' >> /root/.bashrc && \ 212 | # remove intel components 213 | rm -rf /opt/intel 214 | -------------------------------------------------------------------------------- /recipes/CNTK/CNTK-GPU-Python-Distributed/ConvNet_CIFAR10_DataAug_Distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | 3 | # Licensed under the MIT license. See LICENSE.md file in the project root 4 | # for full license information. 5 | # ============================================================================== 6 | 7 | from __future__ import print_function 8 | import os 9 | import math 10 | import argparse 11 | import numpy as np 12 | import cntk as C 13 | import _cntk_py 14 | import cntk.io.transforms as xforms 15 | from cntk.train.training_session import * 16 | from cntk.logging import * 17 | from cntk.debugging import * 18 | 19 | # default Paths relative to current python file. 20 | abs_path = os.path.dirname(os.path.abspath(__file__)) 21 | sys.path.append(abs_path) 22 | model_path = os.path.join(abs_path, "Models") 23 | 24 | from ConvNet_CIFAR10_DataAug import create_convnet_cifar10_model 25 | 26 | # model dimensions 27 | image_height = 32 28 | image_width = 32 29 | num_channels = 3 # RGB 30 | num_classes = 10 31 | 32 | # Create a minibatch source. 33 | def create_image_mb_source(map_file, mean_file, train, total_number_of_samples): 34 | if not os.path.exists(map_file) or not os.path.exists(mean_file): 35 | raise RuntimeError("File '%s' or '%s' does not exist. 
Please run install_cifar10.py from DataSets/CIFAR-10 to fetch them" % 36 | (map_file, mean_file)) 37 | 38 | # transformation pipeline for the features has jitter/crop only when training 39 | transforms = [] 40 | if train: 41 | transforms += [ 42 | xforms.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter 43 | ] 44 | 45 | transforms += [ 46 | xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'), 47 | xforms.mean(mean_file) 48 | ] 49 | 50 | # deserializer 51 | return C.io.MinibatchSource( 52 | C.io.ImageDeserializer( 53 | map_file, 54 | C.io.StreamDefs(features=C.io.StreamDef(field='image', transforms=transforms), # 1st col in mapfile referred to as 'image' 55 | labels=C.io.StreamDef(field='label', shape=num_classes))), # and second as 'label' 56 | randomize=train, 57 | max_samples=total_number_of_samples, 58 | multithreaded_deserializer=True) 59 | 60 | # Create the network. 61 | def create_conv_network(): 62 | # Input variables denoting the features and label data 63 | feature_var = C.input_variable((num_channels, image_height, image_width)) 64 | label_var = C.input_variable((num_classes)) 65 | 66 | # apply model to input 67 | scaled_input = C.element_times(C.constant(0.00390625), feature_var) 68 | 69 | z = create_convnet_cifar10_model(num_classes)(scaled_input) 70 | 71 | # loss and metric 72 | ce = C.cross_entropy_with_softmax(z, label_var) 73 | pe = C.classification_error(z, label_var) 74 | 75 | C.logging.log_number_of_parameters(z) ; print() 76 | 77 | return { 78 | 'feature': feature_var, 79 | 'label': label_var, 80 | 'ce' : ce, 81 | 'pe' : pe, 82 | 'output': z 83 | } 84 | 85 | # Create trainer 86 | def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up, progress_writers): 87 | # Set learning parameters 88 | lr_per_sample = [0.0015625]*20 + [0.00046875]*20 + [0.00015625]*20 + [0.000046875]*10 + [0.000015625] 89 | lr_schedule = C.learning_rate_schedule(lr_per_sample, unit=C.learners.UnitType.sample, epoch_size=epoch_size) 90 | mm_time_constant = [0]*20 + [600]*20 + [1200] 91 | mm_schedule = C.learners.momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size) 92 | l2_reg_weight = 0.002 93 | 94 | # Create learner 95 | if block_size is not None and num_quantization_bits != 32: 96 | raise RuntimeError("Block momentum cannot be used with quantization, please remove the quantized_bits option.") 97 | 98 | local_learner = C.learners.momentum_sgd(network['output'].parameters, 99 | lr_schedule, mm_schedule, 100 | l2_regularization_weight=l2_reg_weight) 101 | 102 | if block_size is not None: 103 | parameter_learner = C.train.distributed.block_momentum_distributed_learner(local_learner, block_size=block_size) 104 | else: 105 | parameter_learner = C.train.distributed.data_parallel_distributed_learner(local_learner, 106 | num_quantization_bits=num_quantization_bits, 107 | distributed_after=warm_up) 108 | 109 | # Create trainer 110 | return C.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner, progress_writers) 111 | 112 | # Train and test 113 | def train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore, profiling=False): 114 | 115 | # define mapping from input streams to network inputs 116 | input_map = { 117 | network['feature']: train_source.streams.features, 118 | network['label']: train_source.streams.labels 119 | } 120 | 121 | # Train all minibatches 122 | if profiling: 123 | start_profiler(sync_gpu=True) 124 |
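# training_session below drives the whole training loop: it pulls minibatches of mb_size samples
# from train_source, reports progress once per epoch, checkpoints every epoch (resuming from the
# checkpoint when restore=True), and finishes with an evaluation pass over test_source.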
125 | training_session( 126 | trainer=trainer, mb_source = train_source, 127 | model_inputs_to_streams = input_map, 128 | mb_size = minibatch_size, 129 | progress_frequency=epoch_size, 130 | checkpoint_config = CheckpointConfig(frequency = epoch_size, 131 | filename = os.path.join(model_path, "ConvNet_CIFAR10_DataAug"), 132 | restore = restore), 133 | test_config = TestConfig(test_source, minibatch_size=minibatch_size) 134 | ).train() 135 | 136 | if profiling: 137 | stop_profiler() 138 | 139 | # Train and evaluate the network. 140 | def convnet_cifar10_dataaug(train_data, test_data, mean_data, minibatch_size=64, epoch_size=50000, num_quantization_bits=32, 141 | block_size=3200, warm_up=0, max_epochs=2, restore=False, log_to_file=None, 142 | num_mbs_per_log=None, gen_heartbeat=False, profiling=False, tensorboard_logdir=None): 143 | _cntk_py.set_computation_network_trace_level(0) 144 | 145 | network = create_conv_network() 146 | 147 | distributed_sync_report_freq = None 148 | if block_size is not None: 149 | distributed_sync_report_freq = 1 150 | 151 | progress_writers = [C.logging.ProgressPrinter( 152 | freq=num_mbs_per_log, 153 | tag='Training', 154 | log_to_file=log_to_file, 155 | rank=C.train.distributed.Communicator.rank(), 156 | gen_heartbeat=gen_heartbeat, 157 | num_epochs=max_epochs, 158 | distributed_freq=distributed_sync_report_freq)] 159 | 160 | if tensorboard_logdir is not None: 161 | progress_writers.append(C.logging.TensorBoardProgressWriter( 162 | freq=num_mbs_per_log, 163 | log_dir=tensorboard_logdir, 164 | rank=C.train.distributed.Communicator.rank(), 165 | model=network['output'])) 166 | 167 | trainer = create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up, progress_writers) 168 | train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size) 169 | test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=C.io.FULL_DATA_SWEEP) 170 | train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore, profiling) 171 | 172 | 173 | if __name__=='__main__': 174 | parser = argparse.ArgumentParser() 175 | data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "CIFAR-10") 176 | 177 | parser.add_argument('-datadir', '--datadir', help='Data directory where the CIFAR dataset is located', 178 | required=False, default=data_path) 179 | parser.add_argument('-outputdir', '--outputdir', help='Output directory for checkpoints and models', required=False, default=None) 180 | parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None) 181 | parser.add_argument('-tensorboard_logdir', '--tensorboard_logdir', help='Directory where TensorBoard logs should be created', 182 | required=False, default=None) 183 | parser.add_argument('-n', '--num_epochs', help='Total number of epochs to train', type=int, required=False, default='160') 184 | parser.add_argument('-m', '--minibatch_size', help='Minibatch size', type=int, required=False, default='64') 185 | parser.add_argument('-e', '--epoch_size', help='Epoch size', type=int, required=False, default='50000') 186 | parser.add_argument('-q', '--quantized_bits', help='Number of quantized bits used for gradient aggregation', type=int, 187 | required=False, default='32') 188 | parser.add_argument('-a', '--distributed_after', help='Number of samples to train with before running distributed', type=int, 189 | required=False, default='0') 190 | parser.add_argument('-b', 
'--block_samples', type=int, help="Number of samples per block for the block momentum (BM) distributed learner (if 0, the BM learner is not used)", 191 | required=False, default=None) 192 | parser.add_argument('-r', '--restart', help='Restart training from scratch (instead of restoring from a checkpoint file, which is the default)', 193 | action='store_true') 194 | parser.add_argument('-device', '--device', type=int, help="Force the script to run on the specified device", 195 | required=False, default=None) 196 | parser.add_argument('-profile', '--profile', help="Turn on profiling", action='store_true', default=False) 197 | 198 | args = vars(parser.parse_args()) 199 | 200 | if args['outputdir'] is not None: 201 | model_path = args['outputdir'] + "/models" 202 | if args['logdir'] is not None: 203 | log_dir = args['logdir'] 204 | if args['device'] is not None: 205 | C.device.try_set_default_device(C.device.gpu(args['device'])) 206 | 207 | data_path = args['datadir'] 208 | 209 | if not os.path.isdir(data_path): 210 | raise RuntimeError("Directory %s does not exist" % data_path) 211 | 212 | mean_data = os.path.join(data_path, 'CIFAR-10_mean.xml') 213 | train_data = os.path.join(data_path, 'train_map.txt') 214 | test_data = os.path.join(data_path, 'test_map.txt') 215 | 216 | convnet_cifar10_dataaug(train_data, test_data, mean_data, 217 | minibatch_size=args['minibatch_size'], 218 | epoch_size=args['epoch_size'], 219 | num_quantization_bits=args['quantized_bits'], 220 | block_size=args['block_samples'], 221 | warm_up=args['distributed_after'], 222 | max_epochs=args['num_epochs'], 223 | restore=not args['restart'], 224 | log_to_file=args['logdir'], 225 | num_mbs_per_log=100, 226 | gen_heartbeat=True, 227 | profiling=args['profile'], 228 | tensorboard_logdir=args['tensorboard_logdir']) 229 | # Must call MPI finalize when the process exits without exceptions 230 | C.train.distributed.Communicator.finalize() 231 | 232 | -------------------------------------------------------------------------------- /recipes/TensorFlow/TensorFlow-GPU-Distributed/mnist_replica.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Distributed MNIST training and validation, with model replicas. 17 | 18 | A simple softmax model with one hidden layer is defined. The parameters 19 | (weights and biases) are located on one parameter server (ps), while the ops 20 | are executed on two worker nodes by default. The TF sessions also run on the 21 | worker node. 22 | Multiple invocations of this script can be done in parallel, with different 23 | values for --task_index. There should be exactly one invocation with 24 | --task_index, which will create a master session that carries out variable 25 | initialization.
The other, non-master, sessions will wait for the master 26 | session to finish the initialization before proceeding to the training stage. 27 | 28 | The coordination between the multiple worker invocations occurs due to 29 | the definition of the parameters on the same ps devices. The parameter updates 30 | from one worker are visible to all other workers. As such, the workers can 31 | perform forward computation and gradient calculation in parallel, which 32 | should lead to increased training speed for the simple model. 33 | """ 34 | 35 | 36 | from __future__ import absolute_import 37 | from __future__ import division 38 | from __future__ import print_function 39 | 40 | import math 41 | import sys 42 | import tempfile 43 | import time 44 | 45 | import tensorflow as tf 46 | from tensorflow.examples.tutorials.mnist import input_data 47 | 48 | 49 | flags = tf.app.flags 50 | flags.DEFINE_string("data_dir", "/tmp/mnist-data", 51 | "Directory for storing mnist data") 52 | flags.DEFINE_boolean("download_only", False, 53 | "Only perform downloading of data; do not proceed to " 54 | "session preparation, model definition or training") 55 | flags.DEFINE_integer("task_index", None, 56 | "Worker task index, should be >= 0. task_index=0 is " 57 | "the master worker task that performs the variable " 58 | "initialization ") 59 | flags.DEFINE_integer("num_gpus", 1, 60 | "Total number of gpus for each machine. " 61 | "If you don't use GPU, please set it to '0'") 62 | flags.DEFINE_integer("replicas_to_aggregate", None, 63 | "Number of replicas to aggregate before a parameter update " 64 | "is applied (For sync_replicas mode only; default: " 65 | "num_workers)") 66 | flags.DEFINE_integer("hidden_units", 100, 67 | "Number of units in the hidden layer of the NN") 68 | flags.DEFINE_integer("train_steps", 200, 69 | "Number of (global) training steps to perform") 70 | flags.DEFINE_integer("batch_size", 100, "Training batch size") 71 | flags.DEFINE_float("learning_rate", 0.01, "Learning rate") 72 | flags.DEFINE_boolean("sync_replicas", False, 73 | "Use the sync_replicas (synchronized replicas) mode, " 74 | "wherein the parameter updates from workers are aggregated " 75 | "before being applied to avoid stale gradients") 76 | flags.DEFINE_boolean( 77 | "existing_servers", False, "Whether servers already exist. If True, " 78 | "will use the worker hosts via their GRPC URLs (one client process " 79 | "per worker host). Otherwise, will create an in-process TensorFlow " 80 | "server.") 81 | flags.DEFINE_string("ps_hosts", "localhost:2222", 82 | "Comma-separated list of hostname:port pairs") 83 | flags.DEFINE_string("worker_hosts", "localhost:2223,localhost:2224", 84 | "Comma-separated list of hostname:port pairs") 85 | flags.DEFINE_string("job_name", None, "job name: worker or ps") 86 | 87 | FLAGS = flags.FLAGS 88 | 89 | 90 | IMAGE_PIXELS = 28 91 | 92 | 93 | def main(unused_argv): 94 | mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True) 95 | if FLAGS.download_only: 96 | sys.exit(0) 97 | 98 | if FLAGS.job_name is None or FLAGS.job_name == "": 99 | raise ValueError("Must specify an explicit `job_name`") 100 | if FLAGS.task_index is None or FLAGS.task_index == "": 101 | raise ValueError("Must specify an explicit `task_index`") 102 | 103 | print("job name = %s" % FLAGS.job_name) 104 | print("task index = %d" % FLAGS.task_index) 105 | 106 | # Construct the cluster and start the server 107 | ps_spec = FLAGS.ps_hosts.split(",") 108 | worker_spec = FLAGS.worker_hosts.split(",") 109 | 110 | # Get the number of workers.
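# (In sync_replicas mode this count is also the default number of replicas
# to aggregate before each parameter update is applied.)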
111 | num_workers = len(worker_spec) 112 | 113 | cluster = tf.train.ClusterSpec({ 114 | "ps": ps_spec, 115 | "worker": worker_spec}) 116 | 117 | if not FLAGS.existing_servers: 118 | # Not using existing servers. Create an in-process server. 119 | server = tf.train.Server( 120 | cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index) 121 | if FLAGS.job_name == "ps": 122 | server.join() 123 | 124 | is_chief = (FLAGS.task_index == 0) 125 | if FLAGS.num_gpus > 0: 126 | # Avoid gpu allocation conflict: now allocate task_num -> #gpu 127 | # for each worker in the corresponding machine 128 | gpu = (FLAGS.task_index % FLAGS.num_gpus) 129 | worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu) 130 | elif FLAGS.num_gpus == 0: 131 | # Just allocate the CPU to worker server 132 | cpu = 0 133 | worker_device = "/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, cpu) 134 | # The device setter will automatically place Variables ops on separate 135 | # parameter servers (ps). The non-Variable ops will be placed on the workers. 136 | # The ps use CPU and workers use corresponding GPU 137 | with tf.device( 138 | tf.train.replica_device_setter( 139 | worker_device=worker_device, 140 | ps_device="/job:ps/cpu:0", 141 | cluster=cluster)): 142 | global_step = tf.Variable(0, name="global_step", trainable=False) 143 | 144 | # Variables of the hidden layer 145 | hid_w = tf.Variable( 146 | tf.truncated_normal( 147 | [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units], 148 | stddev=1.0 / IMAGE_PIXELS), 149 | name="hid_w") 150 | hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b") 151 | 152 | # Variables of the softmax layer 153 | sm_w = tf.Variable( 154 | tf.truncated_normal( 155 | [FLAGS.hidden_units, 10], 156 | stddev=1.0 / math.sqrt(FLAGS.hidden_units)), 157 | name="sm_w") 158 | sm_b = tf.Variable(tf.zeros([10]), name="sm_b") 159 | 160 | # Ops: located on the worker specified with FLAGS.task_index 161 | x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS]) 162 | y_ = tf.placeholder(tf.float32, [None, 10]) 163 | 164 | hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) 165 | hid = tf.nn.relu(hid_lin) 166 | 167 | y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b)) 168 | cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0))) 169 | 170 | opt = tf.train.AdamOptimizer(FLAGS.learning_rate) 171 | 172 | if FLAGS.sync_replicas: 173 | if FLAGS.replicas_to_aggregate is None: 174 | replicas_to_aggregate = num_workers 175 | else: 176 | replicas_to_aggregate = FLAGS.replicas_to_aggregate 177 | 178 | opt = tf.train.SyncReplicasOptimizer( 179 | opt, 180 | replicas_to_aggregate=replicas_to_aggregate, 181 | total_num_replicas=num_workers, 182 | name="mnist_sync_replicas") 183 | 184 | train_step = opt.minimize(cross_entropy, global_step=global_step) 185 | 186 | if FLAGS.sync_replicas: 187 | local_init_op = opt.local_step_init_op 188 | if is_chief: 189 | local_init_op = opt.chief_init_op 190 | 191 | ready_for_local_init_op = opt.ready_for_local_init_op 192 | 193 | # Initial token and chief queue runners required by the sync_replicas mode 194 | chief_queue_runner = opt.get_chief_queue_runner() 195 | sync_init_op = opt.get_init_tokens_op() 196 | 197 | init_op = tf.global_variables_initializer() 198 | train_dir = tempfile.mkdtemp() 199 | 200 | if FLAGS.sync_replicas: 201 | sv = tf.train.Supervisor( 202 | is_chief=is_chief, 203 | logdir=train_dir, 204 | init_op=init_op, 205 | local_init_op=local_init_op, 206 | ready_for_local_init_op=ready_for_local_init_op, 207 | 
recovery_wait_secs=1, 208 | global_step=global_step) 209 | else: 210 | sv = tf.train.Supervisor( 211 | is_chief=is_chief, 212 | logdir=train_dir, 213 | init_op=init_op, 214 | recovery_wait_secs=1, 215 | global_step=global_step) 216 | 217 | sess_config = tf.ConfigProto( 218 | allow_soft_placement=True, 219 | log_device_placement=False, 220 | device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index]) 221 | 222 | # The chief worker (task_index==0) will prepare the session, 223 | # while the remaining workers will wait for the preparation to complete. 224 | if is_chief: 225 | print("Worker %d: Initializing session..." % FLAGS.task_index) 226 | else: 227 | print("Worker %d: Waiting for session to be initialized..." % 228 | FLAGS.task_index) 229 | 230 | if FLAGS.existing_servers: 231 | server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index] 232 | print("Using existing server at: %s" % server_grpc_url) 233 | 234 | sess = sv.prepare_or_wait_for_session(server_grpc_url, 235 | config=sess_config) 236 | else: 237 | sess = sv.prepare_or_wait_for_session(server.target, config=sess_config) 238 | 239 | print("Worker %d: Session initialization complete." % FLAGS.task_index) 240 | 241 | if FLAGS.sync_replicas and is_chief: 242 | # Chief worker will start the chief queue runner and call the init op. 243 | sess.run(sync_init_op) 244 | sv.start_queue_runners(sess, [chief_queue_runner]) 245 | 246 | # Perform training 247 | time_begin = time.time() 248 | print("Training begins @ %f" % time_begin) 249 | 250 | local_step = 0 251 | while True: 252 | # Training feed 253 | batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size) 254 | train_feed = {x: batch_xs, y_: batch_ys} 255 | 256 | _, step = sess.run([train_step, global_step], feed_dict=train_feed) 257 | local_step += 1 258 | 259 | now = time.time() 260 | print("%f: Worker %d: training step %d done (global step: %d)" % 261 | (now, FLAGS.task_index, local_step, step)) 262 | 263 | if step >= FLAGS.train_steps: 264 | break 265 | 266 | time_end = time.time() 267 | print("Training ends @ %f" % time_end) 268 | training_time = time_end - time_begin 269 | print("Training elapsed time: %f s" % training_time) 270 | 271 | # Validation feed 272 | val_feed = {x: mnist.validation.images, y_: mnist.validation.labels} 273 | val_xent = sess.run(cross_entropy, feed_dict=val_feed) 274 | print("After %d training step(s), validation cross entropy = %g" % 275 | (FLAGS.train_steps, val_xent)) 276 | 277 | 278 | if __name__ == "__main__": 279 | tf.app.run() -------------------------------------------------------------------------------- /recipes/TensorFlow/TensorFlow-GPU/TensorFlow-GPU.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# TensorFlow GPU\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Introduction\n", 15 | "\n", 16 | "This example demonstrates how to run the standard TensorFlow sample (https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py) on a one-node Azure Batch AI cluster.\n", 17 | "\n", 18 | "## Details\n", 19 | "\n", 20 | "- For demonstration purposes, the official convolutional.py will be deployed to an Azure File Share;\n", 21 | "- Standard output of the job will be stored on the Azure File Share;" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Instructions\n", 29 | "\n", 30 | "### 
Install Dependencies and Create Configuration file.\n", 31 | "Follow [instructions](/recipes) to install all dependencies and create the configuration file." 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "### Read Configuration and Create Batch AI client" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "from __future__ import print_function\n", 50 | "\n", 51 | "import time\n", 52 | "from datetime import datetime\n", 53 | "import os\n", 54 | "import sys\n", 55 | "import zipfile\n", 56 | "\n", 57 | "from azure.storage.file import FileService, FilePermissions\n", 58 | "import azure.mgmt.batchai.models as models\n", 59 | "\n", 60 | "# utilities.py contains helper functions used by different notebooks\n", 61 | "sys.path.append('..\\..')\n", 62 | "import utilities\n", 63 | "\n", 64 | "cfg = utilities.Configuration('..\\..\\configuration.json')\n", 65 | "client = utilities.create_batchai_client(cfg)\n", 66 | "utilities.create_resource_group(cfg)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "### Create File Share\n", 74 | "\n", 75 | "For this example we will create a new File Share named `batchaisample` under your storage account.\n", 76 | "\n", 77 | "**Note** You don't need to create a new file share for every cluster; we do it in this sample to simplify resource management for you." 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "azure_file_share_name = 'batchaisample'\n", 89 | "service = FileService(cfg.storage_account_name, cfg.storage_account_key)\n", 90 | "service.create_share(azure_file_share_name, fail_on_exist=False)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "### Configure Compute Cluster\n", 98 | "\n", 99 | "- For this example we will use a GPU cluster of one `STANDARD_NC6` node. You can increase the number of nodes by changing the `nodes_count` variable;\n", 100 | "- We will mount the file share at a folder named `external`. 
The full path of this folder on a compute node will be `$AZ_BATCHAI_MOUNT_ROOT/external`;\n", 101 | "- We will call the cluster `nc6`;\n", 102 | "\n", 103 | "So, the cluster will have the following parameters:" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "azure_file_share = 'external'\n", 115 | "nodes_count = 1\n", 116 | "cluster_name = 'nc6'\n", 117 | "\n", 118 | "volumes = models.MountVolumes(\n", 119 | " azure_file_shares=[\n", 120 | " models.AzureFileShareReference(\n", 121 | " account_name=cfg.storage_account_name,\n", 122 | " credentials=models.AzureStorageCredentialsInfo(\n", 123 | " account_key=cfg.storage_account_key),\n", 124 | " azure_file_url = 'https://{0}.file.core.windows.net/{1}'.format(\n", 125 | " cfg.storage_account_name, azure_file_share_name),\n", 126 | " relative_mount_path=azure_file_share)\n", 127 | " ]\n", 128 | ")\n", 129 | "\n", 130 | "parameters = models.ClusterCreateParameters(\n", 131 | " location=cfg.location,\n", 132 | " vm_size=\"STANDARD_NC6\",\n", 133 | " scale_settings=models.ScaleSettings(\n", 134 | " manual=models.ManualScaleSettings(target_node_count=nodes_count)\n", 135 | " ),\n", 136 | " node_setup=models.NodeSetup(\n", 137 | " mount_volumes=volumes\n", 138 | " ),\n", 139 | " user_account_settings=models.UserAccountSettings(\n", 140 | " admin_user_name=cfg.admin,\n", 141 | " admin_user_password=cfg.admin_password,\n", 142 | " admin_user_ssh_public_key=cfg.admin_ssh_key\n", 143 | " )\n", 144 | ")" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "### Create Compute Cluster" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "_ = client.clusters.create(cfg.resource_group, cluster_name, parameters)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "### Monitor Cluster Creation\n", 170 | "\n", 171 | "Monitor the newly created cluster. utilities.py contains a helper function that prints the number of nodes in each state in the cluster." 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "cluster = client.clusters.get(cfg.resource_group, cluster_name)\n", 183 | "utilities.print_cluster_status(cluster)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "### Deploy Sample Script and Configure the Input Directories\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "- For each job we will create a folder containing a copy of the sample script. This allows running the same job with different scripts." 
198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "collapsed": true 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "mnist_script_directory = 'tensorflow_samples'\n", 209 | "service = FileService(cfg.storage_account_name, cfg.storage_account_key)\n", 210 | "service.create_directory(\n", 211 | " azure_file_share_name, mnist_script_directory, fail_on_exist=False)\n", 212 | "service.create_file_from_path(\n", 213 | " azure_file_share_name, mnist_script_directory, 'convolutional.py', 'convolutional.py')" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "The job needs to know where to find convolutional.py; the script downloads the input MNIST dataset by itself. We will create an input directory for the script:" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "input_directories = [\n", 232 | " models.InputDirectory(\n", 233 | " id='SCRIPT',\n", 234 | " path='$AZ_BATCHAI_MOUNT_ROOT/{0}/{1}'.format(azure_file_share, mnist_script_directory))]" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "The job will be able to reference that directory using the ```$AZ_BATCHAI_INPUT_SCRIPT``` environment variable." 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "### Configure Output Directories\n", 249 | "We will store the standard output and standard error of the job in the File Share:" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "collapsed": true 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "std_output_path_prefix = \"$AZ_BATCHAI_MOUNT_ROOT/{0}\".format(azure_file_share)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "### Configure Job\n", 268 | "\n", 269 | "- The job will use the `tensorflow/tensorflow:1.1.0-gpu` container.\n", 270 | "- It will use the previously configured input and output directories.\n", 271 | "- By removing container_settings, the job will run on the host VMs if you are using DSVM.\n", 272 | "\n", 273 | "**Note** You must agree to the following licenses before using this container:\n", 274 | "- [TensorFlow License](https://github.com/tensorflow/tensorflow/blob/master/LICENSE)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": { 281 | "collapsed": false 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "job_name = datetime.utcnow().strftime(\"tf_%m_%d_%Y_%H%M%S\")\n", 286 | "parameters = models.job_create_parameters.JobCreateParameters(\n", 287 | " location=cfg.location,\n", 288 | " cluster=models.ResourceId(cluster.id),\n", 289 | " node_count=nodes_count,\n", 290 | " input_directories=input_directories,\n", 291 | " std_out_err_path_prefix=std_output_path_prefix,\n", 292 | " container_settings=models.ContainerSettings(\n", 293 | " models.ImageSourceRegistry(image='tensorflow/tensorflow:1.1.0-gpu')),\n", 294 | " tensor_flow_settings=models.TensorFlowSettings(\n", 295 | " python_script_file_path='$AZ_BATCHAI_INPUT_SCRIPT/convolutional.py',\n", 296 | " master_command_line_args=\"-p\",\n", 297 | " )\n", 298 | ")" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "### Create a training Job and wait for Job completion\n" 306 | ] 307 | }, 308 | { 309 | "cell_type": 
"code", 310 | "execution_count": null, 311 | "metadata": { 312 | "collapsed": false 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "_ = client.jobs.create(cfg.resource_group, job_name, parameters) \n", 317 | "print('Created Job: {}'.format(job_name))" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "### Wait for Job to Finish\n", 325 | "The job will start running when the cluster will have enought idle nodes. The following code waits for job to start running printing the cluster state. During job run, the code prints current content of stdeout-0.txt (the output of the worker running on the first node)." 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "utilities.wait_for_job_completion(client, cfg.resource_group, job_name, cluster_name, 'stdOuterr', 'stdout-wk-0.txt')" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": { 342 | "collapsed": true 343 | }, 344 | "source": [ 345 | "### Download stdout.txt and stderr.txt files for the Job" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "collapsed": false 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "files = client.jobs.list_output_files(cfg.resource_group, job_name, models.JobsListOutputFilesOptions(\"stdOuterr\")) \n", 357 | "for file in list(files):\n", 358 | " utilities.download_file(file.download_url, file.name)\n", 359 | "print(\"All files Downloaded\")" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "### Delete the Job" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "collapsed": false 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "client.jobs.delete(cfg.resource_group, job_name)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "### Delete the Cluster\n", 385 | "When you are finished with the sample and don't want to submit any more jobs you can delete the cluster using the following code." 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": { 392 | "collapsed": false 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "client.clusters.delete(cfg.resource_group, cluster_name)" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": { 402 | "collapsed": true 403 | }, 404 | "source": [ 405 | "### Delete File Share\n", 406 | "When you are finished with the sample and don't want to submit any more jobs you can delete the file share completely with all files using the following code." 
407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": { 413 | "collapsed": true 414 | }, 415 | "outputs": [], 416 | "source": [ 417 | "service = FileService(cfg.storage_account_name, cfg.storage_account_key)\n", 418 | "service.delete_share(azure_file_share_name)" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": { 425 | "collapsed": true 426 | }, 427 | "outputs": [], 428 | "source": [] 429 | } 430 | ], 431 | "metadata": { 432 | "anaconda-cloud": {}, 433 | "kernelspec": { 434 | "display_name": "Python [Root]", 435 | "language": "python", 436 | "name": "Python [Root]" 437 | }, 438 | "language_info": { 439 | "codemirror_mode": { 440 | "name": "ipython", 441 | "version": 3 442 | }, 443 | "file_extension": ".py", 444 | "mimetype": "text/x-python", 445 | "name": "python", 446 | "nbconvert_exporter": "python", 447 | "pygments_lexer": "ipython3", 448 | "version": "3.5.2" 449 | } 450 | }, 451 | "nbformat": 4, 452 | "nbformat_minor": 1 453 | } 454 | -------------------------------------------------------------------------------- /recipes/TensorFlow/TensorFlow-GPU/convolutional.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Simple, end-to-end, LeNet-5-like convolutional MNIST model example. 17 | 18 | This should achieve a test error of 0.7%. Please keep this model as simple and 19 | linear as possible, it is meant as a tutorial for simple convolutional models. 20 | Run with --self_test on the command line to execute a short self-test. 21 | """ 22 | from __future__ import absolute_import 23 | from __future__ import division 24 | from __future__ import print_function 25 | 26 | import argparse 27 | import gzip 28 | import os 29 | import sys 30 | import time 31 | 32 | import numpy 33 | from six.moves import urllib 34 | from six.moves import xrange # pylint: disable=redefined-builtin 35 | import tensorflow as tf 36 | 37 | SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/' 38 | WORK_DIRECTORY = 'data' 39 | IMAGE_SIZE = 28 40 | NUM_CHANNELS = 1 41 | PIXEL_DEPTH = 255 42 | NUM_LABELS = 10 43 | VALIDATION_SIZE = 5000 # Size of the validation set. 44 | SEED = 66478 # Set to None for random seed. 45 | BATCH_SIZE = 64 46 | NUM_EPOCHS = 10 47 | EVAL_BATCH_SIZE = 64 48 | EVAL_FREQUENCY = 100 # Number of steps between evaluations. 
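# Note: extract_data() below rescales pixel values from [0, 255] to [-0.5, 0.5] using PIXEL_DEPTH,
# and the model's two 2x2 max-pooling layers reduce each 28x28 image to a 7x7 feature map, which is
# why the first fully connected layer sizes its input as IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64.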
49 | 50 | 51 | FLAGS = None 52 | 53 | 54 | def data_type(): 55 | """Return the type of the activations, weights, and placeholder variables.""" 56 | if FLAGS.use_fp16: 57 | return tf.float16 58 | else: 59 | return tf.float32 60 | 61 | 62 | def maybe_download(filename): 63 | """Download the data from Yann's website, unless it's already here.""" 64 | if not tf.gfile.Exists(WORK_DIRECTORY): 65 | tf.gfile.MakeDirs(WORK_DIRECTORY) 66 | filepath = os.path.join(WORK_DIRECTORY, filename) 67 | if not tf.gfile.Exists(filepath): 68 | filepath, _ = urllib.request.urlretrieve(SOURCE_URL + filename, filepath) 69 | with tf.gfile.GFile(filepath) as f: 70 | size = f.size() 71 | print('Successfully downloaded', filename, size, 'bytes.') 72 | return filepath 73 | 74 | 75 | def extract_data(filename, num_images): 76 | """Extract the images into a 4D tensor [image index, y, x, channels]. 77 | 78 | Values are rescaled from [0, 255] down to [-0.5, 0.5]. 79 | """ 80 | print('Extracting', filename) 81 | with gzip.open(filename) as bytestream: 82 | bytestream.read(16) 83 | buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images * NUM_CHANNELS) 84 | data = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.float32) 85 | data = (data - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH 86 | data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS) 87 | return data 88 | 89 | 90 | def extract_labels(filename, num_images): 91 | """Extract the labels into a vector of int64 label IDs.""" 92 | print('Extracting', filename) 93 | with gzip.open(filename) as bytestream: 94 | bytestream.read(8) 95 | buf = bytestream.read(1 * num_images) 96 | labels = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.int64) 97 | return labels 98 | 99 | 100 | def fake_data(num_images): 101 | """Generate a fake dataset that matches the dimensions of MNIST.""" 102 | data = numpy.ndarray( 103 | shape=(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS), 104 | dtype=numpy.float32) 105 | labels = numpy.zeros(shape=(num_images,), dtype=numpy.int64) 106 | for image in xrange(num_images): 107 | label = image % 2 108 | data[image, :, :, 0] = label - 0.5 109 | labels[image] = label 110 | return data, labels 111 | 112 | 113 | def error_rate(predictions, labels): 114 | """Return the error rate based on dense predictions and sparse labels.""" 115 | return 100.0 - ( 116 | 100.0 * 117 | numpy.sum(numpy.argmax(predictions, 1) == labels) / 118 | predictions.shape[0]) 119 | 120 | 121 | def main(_): 122 | if FLAGS.self_test: 123 | print('Running self-test.') 124 | train_data, train_labels = fake_data(256) 125 | validation_data, validation_labels = fake_data(EVAL_BATCH_SIZE) 126 | test_data, test_labels = fake_data(EVAL_BATCH_SIZE) 127 | num_epochs = 1 128 | else: 129 | # Get the data. 130 | train_data_filename = maybe_download('train-images-idx3-ubyte.gz') 131 | train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz') 132 | test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz') 133 | test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz') 134 | 135 | # Extract it into numpy arrays. 136 | train_data = extract_data(train_data_filename, 60000) 137 | train_labels = extract_labels(train_labels_filename, 60000) 138 | test_data = extract_data(test_data_filename, 10000) 139 | test_labels = extract_labels(test_labels_filename, 10000) 140 | 141 | # Generate a validation set. 142 | validation_data = train_data[:VALIDATION_SIZE, ...] 
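# Take the label slice with the same boundary so images and labels stay aligned;
# the remaining examples are kept for training.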
143 | validation_labels = train_labels[:VALIDATION_SIZE] 144 | train_data = train_data[VALIDATION_SIZE:, ...] 145 | train_labels = train_labels[VALIDATION_SIZE:] 146 | num_epochs = NUM_EPOCHS 147 | train_size = train_labels.shape[0] 148 | 149 | # This is where training samples and labels are fed to the graph. 150 | # These placeholder nodes will be fed a batch of training data at each 151 | # training step using the {feed_dict} argument to the Run() call below. 152 | train_data_node = tf.placeholder( 153 | data_type(), 154 | shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)) 155 | train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,)) 156 | eval_data = tf.placeholder( 157 | data_type(), 158 | shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)) 159 | 160 | # The variables below hold all the trainable weights. They are passed an 161 | # initial value which will be assigned when we call: 162 | # {tf.global_variables_initializer().run()} 163 | conv1_weights = tf.Variable( 164 | tf.truncated_normal([5, 5, NUM_CHANNELS, 32], # 5x5 filter, depth 32. 165 | stddev=0.1, 166 | seed=SEED, dtype=data_type())) 167 | conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type())) 168 | conv2_weights = tf.Variable(tf.truncated_normal( 169 | [5, 5, 32, 64], stddev=0.1, 170 | seed=SEED, dtype=data_type())) 171 | conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type())) 172 | fc1_weights = tf.Variable( # fully connected, depth 512. 173 | tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512], 174 | stddev=0.1, 175 | seed=SEED, 176 | dtype=data_type())) 177 | fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type())) 178 | fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS], 179 | stddev=0.1, 180 | seed=SEED, 181 | dtype=data_type())) 182 | fc2_biases = tf.Variable(tf.constant( 183 | 0.1, shape=[NUM_LABELS], dtype=data_type())) 184 | 185 | # We will replicate the model structure for the training subgraph, as well 186 | # as the evaluation subgraphs, while sharing the trainable parameters. 187 | def model(data, train=False): 188 | """The Model definition.""" 189 | # 2D convolution, with 'SAME' padding (i.e. the output feature map has 190 | # the same size as the input). Note that {strides} is a 4D array whose 191 | # shape matches the data layout: [image index, y, x, depth]. 192 | conv = tf.nn.conv2d(data, 193 | conv1_weights, 194 | strides=[1, 1, 1, 1], 195 | padding='SAME') 196 | # Bias and rectified linear non-linearity. 197 | relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases)) 198 | # Max pooling. The kernel size spec {ksize} also follows the layout of 199 | # the data. Here we have a pooling window of 2, and a stride of 2. 200 | pool = tf.nn.max_pool(relu, 201 | ksize=[1, 2, 2, 1], 202 | strides=[1, 2, 2, 1], 203 | padding='SAME') 204 | conv = tf.nn.conv2d(pool, 205 | conv2_weights, 206 | strides=[1, 1, 1, 1], 207 | padding='SAME') 208 | relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases)) 209 | pool = tf.nn.max_pool(relu, 210 | ksize=[1, 2, 2, 1], 211 | strides=[1, 2, 2, 1], 212 | padding='SAME') 213 | # Reshape the feature map cuboid into a 2D matrix to feed it to the 214 | # fully connected layers. 215 | pool_shape = pool.get_shape().as_list() 216 | reshape = tf.reshape( 217 | pool, 218 | [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]]) 219 | # Fully connected layer. Note that the '+' operation automatically 220 | # broadcasts the biases. 
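# reshape has shape [batch, 7*7*64] here; fc1 maps it to 512 hidden units.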
221 | hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases) 222 | # Add a 50% dropout during training only. Dropout also scales 223 | # activations such that no rescaling is needed at evaluation time. 224 | if train: 225 | hidden = tf.nn.dropout(hidden, 0.5, seed=SEED) 226 | return tf.matmul(hidden, fc2_weights) + fc2_biases 227 | 228 | # Training computation: logits + cross-entropy loss. 229 | logits = model(train_data_node, True) 230 | loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits( 231 | labels=train_labels_node, logits=logits)) 232 | 233 | # L2 regularization for the fully connected parameters. 234 | regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) + 235 | tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases)) 236 | # Add the regularization term to the loss. 237 | loss += 5e-4 * regularizers 238 | 239 | # Optimizer: set up a variable that's incremented once per batch and 240 | # controls the learning rate decay. 241 | batch = tf.Variable(0, dtype=data_type()) 242 | # Decay once per epoch, using an exponential schedule starting at 0.01. 243 | learning_rate = tf.train.exponential_decay( 244 | 0.01, # Base learning rate. 245 | batch * BATCH_SIZE, # Current index into the dataset. 246 | train_size, # Decay step. 247 | 0.95, # Decay rate. 248 | staircase=True) 249 | # Use simple momentum for the optimization. 250 | optimizer = tf.train.MomentumOptimizer(learning_rate, 251 | 0.9).minimize(loss, 252 | global_step=batch) 253 | 254 | # Predictions for the current training minibatch. 255 | train_prediction = tf.nn.softmax(logits) 256 | 257 | # Predictions for the test and validation, which we'll compute less often. 258 | eval_prediction = tf.nn.softmax(model(eval_data)) 259 | 260 | # Small utility function to evaluate a dataset by feeding batches of data to 261 | # {eval_data} and pulling the results from {eval_predictions}. 262 | # Saves memory and enables this to run on smaller GPUs. 263 | def eval_in_batches(data, sess): 264 | """Get all predictions for a dataset by running it in small batches.""" 265 | size = data.shape[0] 266 | if size < EVAL_BATCH_SIZE: 267 | raise ValueError("batch size for evals larger than dataset: %d" % size) 268 | predictions = numpy.ndarray(shape=(size, NUM_LABELS), dtype=numpy.float32) 269 | for begin in xrange(0, size, EVAL_BATCH_SIZE): 270 | end = begin + EVAL_BATCH_SIZE 271 | if end <= size: 272 | predictions[begin:end, :] = sess.run( 273 | eval_prediction, 274 | feed_dict={eval_data: data[begin:end, ...]}) 275 | else: 276 | batch_predictions = sess.run( 277 | eval_prediction, 278 | feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]}) 279 | predictions[begin:, :] = batch_predictions[begin - size:, :] 280 | return predictions 281 | 282 | # Create a local session to run the training. 283 | start_time = time.time() 284 | with tf.Session() as sess: 285 | # Run all the initializers to prepare the trainable parameters. 286 | tf.global_variables_initializer().run() 287 | print('Initialized!') 288 | # Loop through training steps. 289 | for step in xrange(int(num_epochs * train_size) // BATCH_SIZE): 290 | # Compute the offset of the current minibatch in the data. 291 | # Note that we could use better randomization across epochs. 292 | offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE) 293 | batch_data = train_data[offset:(offset + BATCH_SIZE), ...] 
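# Slice the labels with the same offset so they stay aligned with batch_data.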
294 | batch_labels = train_labels[offset:(offset + BATCH_SIZE)] 295 | # This dictionary maps the batch data (as a numpy array) to the 296 | # node in the graph it should be fed to. 297 | feed_dict = {train_data_node: batch_data, 298 | train_labels_node: batch_labels} 299 | # Run the optimizer to update weights. 300 | sess.run(optimizer, feed_dict=feed_dict) 301 | # print some extra information once we reach the evaluation frequency 302 | if step % EVAL_FREQUENCY == 0: 303 | # fetch some extra nodes' data 304 | l, lr, predictions = sess.run([loss, learning_rate, train_prediction], 305 | feed_dict=feed_dict) 306 | elapsed_time = time.time() - start_time 307 | start_time = time.time() 308 | print('Step %d (epoch %.2f), %.1f ms' % 309 | (step, float(step) * BATCH_SIZE / train_size, 310 | 1000 * elapsed_time / EVAL_FREQUENCY)) 311 | print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr)) 312 | print('Minibatch error: %.1f%%' % error_rate(predictions, batch_labels)) 313 | print('Validation error: %.1f%%' % error_rate( 314 | eval_in_batches(validation_data, sess), validation_labels)) 315 | sys.stdout.flush() 316 | # Finally print the result! 317 | test_error = error_rate(eval_in_batches(test_data, sess), test_labels) 318 | print('Test error: %.1f%%' % test_error) 319 | if FLAGS.self_test: 320 | print('test_error', test_error) 321 | assert test_error == 0.0, 'expected 0.0 test_error, got %.2f' % ( 322 | test_error,) 323 | 324 | 325 | if __name__ == '__main__': 326 | parser = argparse.ArgumentParser() 327 | parser.add_argument( 328 | '--use_fp16', 329 | default=False, 330 | help='Use half floats instead of full floats if True.', 331 | action='store_true') 332 | parser.add_argument( 333 | '--self_test', 334 | default=False, 335 | action='store_true', 336 | help='True if running a self test.') 337 | 338 | FLAGS, unparsed = parser.parse_known_args() 339 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) 340 | --------------------------------------------------------------------------------