├── .gitignore ├── .gitmodules ├── README.md ├── archetypes └── default.md ├── config.toml ├── content ├── _index.md ├── cleanup │ ├── _index.md │ └── clean_resources.md ├── intro │ ├── _index.md │ ├── addressing_challenges-1.md │ ├── addressing_challenges.md │ ├── challenges_solution.md │ └── horovod.md ├── kubernetes_dist_training │ ├── _index.md │ ├── build_container.md │ ├── fsx_lustre.md │ ├── install_cli.md │ ├── install_kubeflow.md │ ├── setup_eks.md │ ├── submit_job.md │ ├── verify_cluster.md │ └── workflow.md ├── sagemaker_dist_training │ ├── _index.md │ ├── monitoring_results.md │ ├── sagemaker_training.md │ ├── training_scrip_updates.md │ └── workflow.md ├── setup │ ├── _index.md │ ├── add_admin_policy.md │ ├── download_workshop.md │ └── sm_jupyter_instance.md └── update_code_dist_training │ ├── _index.md │ ├── distributed_training_script.md │ ├── prepare_dataset.md │ ├── problem_setup.md │ └── single_instance_script.md ├── layouts ├── 404.html ├── partials │ ├── custom-footer.html │ ├── favicon.html │ ├── footer.html │ ├── google.html │ ├── header.html │ ├── logo.html │ └── menu-footer.html └── shortcodes │ ├── cf-download.html │ ├── cf-launch.html │ ├── ghcontributors.html │ ├── github.html │ ├── mermaid.html │ ├── surveymonkey.html │ ├── tab.html │ ├── tabs.html │ └── year.html ├── notebooks ├── generate_cifar10_tfrecords.py ├── part-1-horovod │ ├── cifar10-distributed.ipynb │ ├── cifar10-single-instance.ipynb │ └── model_def.py ├── part-2-sagemaker │ ├── cifar10-sagemaker-distributed.ipynb │ └── code │ │ ├── cifar10-multi-gpu-horovod-sagemaker.py │ │ └── model_def.py └── part-3-kubernetes │ ├── Dockerfile.cpu │ ├── Dockerfile.gpu │ ├── code │ ├── cifar10-multi-gpu-horovod-k8s.py │ └── model_def.py │ ├── cpu_eks_cluster.sh │ ├── gpu_eks_cluster.sh │ └── specs │ ├── claim-fsx-s3.yaml │ ├── eks_tf_training_job-cpu.yaml │ ├── eks_tf_training_job-gpu.yaml │ ├── fsx_lustre_policy.json │ └── storage-class-fsx-s3-template.yaml └── static ├── 640px-Amazon_Web_Services_Logo.svg.png ├── AWS-Logo.svg ├── Amazon_Web_Services_Logo.svg ├── css ├── all.css ├── jquery-ui.min.css └── theme-mine.css ├── images ├── cleanup │ └── sm_cleanup.png ├── convert_script │ ├── distributed_script.png │ └── single_instance.png ├── eks │ ├── create_repo.png │ ├── eksctl_launch.png │ ├── get_container.png │ ├── job_yaml_container.png │ ├── push_commands.png │ ├── subnet_image.png │ ├── verify_eks.png │ └── workflow.png ├── intro │ ├── approaches.png │ ├── challenges.png │ ├── containers.png │ ├── containers_ecr.png │ ├── forward_backward.png │ ├── home.png │ ├── how_it_runs.png │ ├── mlinfra.png │ └── parallel_distributed.png ├── sagemaker │ ├── aws_console.png │ ├── sm_notebook.png │ ├── tensorboard.png │ └── workflow.png └── setup │ ├── admin_attach.png │ ├── attach_policy.png │ ├── go_to_IAM.png │ ├── launch_jupyter.png │ ├── launch_terminal.png │ ├── notebook_iam.png │ ├── setup_aws_console.png │ ├── setup_create_notebook.png │ ├── setup_fill_notebook.png │ └── setup_notebook.png ├── js ├── jquery-3.3.1.min.js └── jquery-ui-1.12.1.min.js ├── mermaid └── mermaid.min.js └── tf-world-distributed-training-workshop.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | .DS_Store 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | 
parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "themes/learn"] 2 | path = themes/learn 3 | url = https://github.com/matcornic/hugo-theme-learn.git 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AWS Distributed Training Workshop 2 |
3 | This repository contains code, webpages and config files accompanying the AWS Distributed Training Workshop 4 | 5 | 6 | * [Workshop content](https://distributed-training-workshop.go-aws.com/) 7 | 8 | * [Presentation slides](static/tf-world-distributed-training-workshop.pdf) 9 | -------------------------------------------------------------------------------- /archetypes/default.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "{{ replace .Name "-" " " | title }}" 3 | date: {{ .Date }} 4 | --- 5 | -------------------------------------------------------------------------------- /config.toml: -------------------------------------------------------------------------------- 1 | baseURL = "https://distributed-training-workshop.go-aws.com" 2 | languageCode = "en-us" 3 | defaultContentLanguage = "en" 4 | title = "Distributed training with Amazon SageMaker / Amazon EKS Workshop" 5 | theme = "learn" 6 | uglyurls = true 7 | googleAnalytics = "UA-151135045-1" 8 | sectionPagesMenu = "main" 9 | pygmentsCodeFences = true 10 | 11 | [blackfriday] 12 | hrefTargetBlank = true 13 | 14 | [params] 15 | themeVariant = "mine" 16 | showVisitedLinks = false 17 | author = "Shashank Prasanna" 18 | description = "Distributed training workshop with Amazon SageMaker and Amazon EKS" 19 | disableSearch = false 20 | disableAssetsBusting = false 21 | 22 | disableInlineCopyToClipBoard = false 23 | disableShortcutsTitle = false 24 | disableLanguageSwitchingButton = false 25 | disableBreadcrumb = true 26 | disableNextPrev = true 27 | ordersectionsby = "weight" 28 | 29 | [[menu.shortcuts]] 30 | name = " @shshnkp" 31 | identifier = "tw" 32 | url = "https://twitter.com/shshnkp" 33 | weight = 1 34 | 35 | [outputs] 36 | home = [ "HTML", "AMP", "RSS", "JSON"] 37 | page = [ "HTML", "AMP"] 38 | -------------------------------------------------------------------------------- /content/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Distributed Training Workshop" 3 | chapter: true 4 | weight: 1 5 | --- 6 | 7 | # Distributed Training Workshop 8 | 9 | ### Welcome to the distributed training workshop with TensorFlow on Amazon SageMaker and Amazon Elastic Kubernetes Service (EKS). 10 |
11 | #### **At the end of this workshop, you'll be able to:** 12 |
13 | #### - Identify when to consider distributed training 14 | #### - Describe different approaches to distributed training 15 | #### - Outline libraries and tools needed for distributing training workloads on large clusters 16 | #### - Demonstrate code changes required to go from single-GPU to multi-GPU distributed training 17 | #### - Demonstrate using Amazon SageMaker and Amazon EKS to run distributed training jobs 18 | #### - Apply these skills to your own deep learning problem 19 | -------------------------------------------------------------------------------- /content/cleanup/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Cleanup" 3 | date: 2019-10-27T15:25:09-07:00 4 | chapter: true 5 | weight: 6 6 | --- 7 | 8 | # Clean up resources 9 | In this section, we'll walk through the steps to clean up resources. 10 | 11 | -------------------------------------------------------------------------------- /content/cleanup/clean_resources.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Clean up resources" 3 | date: 2019-10-31T23:12:17-07:00 4 | --- 5 | 6 | ## Amazon EKS resources 7 | 8 | #### Kill all distributed training jobs 9 | ``` 10 | kubectl delete MPIJobs --all 11 | ``` 12 | 13 | #### Delete StorageClass, PersistentVolumeClaim and FSx for Lustre CSI Driver 14 | {{% notice tip %}} 15 | Note: This will automatically delete the FSx for Lustre file system. Your files are safe in Amazon S3. 16 | {{% /notice %}} 17 | ``` 18 | kubectl delete -f specs/storage-class-fsx-s3.yaml 19 | kubectl delete -f specs/claim-fsx-s3.yaml 20 | kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/aws-fsx-csi-driver/master/deploy/kubernetes/manifest.yaml 21 | ``` 22 | #### Delete security group 23 | ``` 24 | aws ec2 delete-security-group --group-id ${SECURITY_GROUP_ID} 25 | ``` 26 | 27 | #### Delete policies attached to the instance role 28 | These policies were automatically added to the node IAM roles, but we'll need to manually remove them. 29 | 30 | * Copy the role associated with the worker instances 31 | ``` 32 | echo $INSTANCE_ROLE_NAME 33 | ``` 34 | * Navigate to the IAM console 35 | * Click on Roles on the left pane 36 | * Search for the output of `echo $INSTANCE_ROLE_NAME` 37 | * Delete the two inline policies: 38 | * `iam_alb_ingress_policy` 39 | * `iam_csi_fsx_policy` 40 | 41 | #### Finally, delete the cluster 42 | ``` 43 | eksctl delete cluster aws-tf-cluster-cpu 44 | ``` 45 | 46 | ## SageMaker resources 47 | SageMaker resources are easier to clean up. 48 | Log in to the SageMaker console and click Dashboard. 49 | Make sure that you don't have any resources shown in **green**, as in the image below. Click on any resource shown in green and either stop or delete it. 50 | 51 | ![sm_dashboard](/images/cleanup/sm_cleanup.png) 52 | 53 | ## Other resources 54 | Finally, it's always a good idea to review your AWS account and make sure no other resources from this workshop are left running. 55 | 56 | -------------------------------------------------------------------------------- /content/intro/_index.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Introduction" 3 | date = 2019-10-27T15:22:24-07:00 4 | weight = 1 5 | chapter = true 6 | +++ 7 | 8 | # Introduction 9 | In a typical machine learning development workflow, there are two main stages where you can benefit from scaling out. 10 | 11 | ![parallel distributed](/images/intro/parallel_distributed.png) 12 | 13 | 1. 
Running large-scale parallel experiments: In this scenario our goal is to find the best model/hyperparameters/network architecture by exploring a space of possibilities. 14 | 1. Running distributed training of a single model: In this scenario our goal is to train a single model faster, by distributing its computation across nodes in a cluster. 15 | 16 | ### The focus of this workshop is distributed training of a single model 17 | -------------------------------------------------------------------------------- /content/intro/addressing_challenges-1.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Addressing scaling challenges - Infrastructure management" 3 | date: 2019-10-28T21:07:47-07:00 4 | weight: 4 5 | --- 6 | 7 | ### Infrastructure management 8 | 9 | ![containers](/images/intro/mlinfra.png) 10 | -------------------------------------------------------------------------------- /content/intro/addressing_challenges.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Addressing scaling challenges - software dependencies" 3 | date: 2019-10-28T21:02:29-07:00 4 | weight: 3 5 | --- 6 | 7 | ### Software dependencies 8 | 9 | Containers provide a consistent, lightweight, and portable environment that includes not just the training code but also its dependencies and configuration. 10 | ![containers](/images/intro/containers.png) 11 | 12 | Simply package up your code and push it to a container registry. 13 | The container image can then be pulled into a cluster and run at scale. 14 | ![containers](/images/intro/containers_ecr.png) 15 | -------------------------------------------------------------------------------- /content/intro/challenges_solution.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Challenges with scaling machine learning" 3 | date: 2019-10-28T20:56:39-07:00 4 | weight: 2 5 | --- 6 | ![challenges](/images/intro/challenges.png) 7 | 8 | There are two key challenges associated with scaling machine learning computation. 9 | 10 | 1. A development setup on a single computer or instance doesn't translate well when deploying to a cluster 11 | 2. Managing infrastructure is challenging for machine learning researchers, data scientists and developers without an IT/ops background 12 | -------------------------------------------------------------------------------- /content/intro/horovod.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Distributed training approaches" 3 | date: 2019-10-28T21:11:22-07:00 4 | weight: 5 5 | --- 6 | ![approaches](/images/intro/approaches.png) 7 | 8 | ## Horovod 9 | [horovod.ai](https://horovod.ai) 10 | 11 | Horovod is based on the MPI concepts: 12 | size, rank, local rank, allreduce, allgather, and broadcast; these are illustrated in the sketch after the list below. 13 | 14 | * Library for distributed deep learning with support for multiple frameworks including TensorFlow 15 | * Separates infrastructure concerns from ML engineering 16 | * Uses ring-allreduce and the Message Passing Interface (MPI), popular in the HPC community 17 | * Infrastructure services such as Amazon SageMaker and Amazon EKS provide the container and MPI environment 18 | 19 | ![allreduce](/images/intro/forward_backward.png) 20 | 21 | 1. Forward pass on each device 22 | 1. Backward pass to compute gradients 23 | 1. "All reduce" (average and broadcast) gradients across devices 24 | 1. Update local variables with "all reduced" gradients
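To make these concepts concrete, here is a minimal sketch (not one of the workshop scripts) of what each Horovod worker sees when it starts:

```python
# Minimal sketch of Horovod's MPI concepts; every worker runs this same script.
import horovod.tensorflow.keras as hvd

hvd.init()  # initialize Horovod (MPI)

# size: total number of workers across all hosts
# rank: this worker's unique global id (0 .. size-1)
# local_rank: this worker's id on its own host, typically used to pin one GPU per process
print('size: {}, rank: {}, local_rank: {}'.format(hvd.size(), hvd.rank(), hvd.local_rank()))
```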
25 | 26 | Horovod runs the same copy of the script on all hosts/servers/nodes/instances. 27 | 28 | ![mpi](/images/intro/how_it_runs.png) 29 | 30 | `horovodrun -np 16 -H server1:4,server2:4,server3:4,server4:4 python training_script.py` 31 | 32 | Here `-np 16` requests 16 worker processes in total, and `-H` lists each host along with the number of slots (typically one per GPU) it contributes. 33 | -------------------------------------------------------------------------------- /content/kubernetes_dist_training/_index.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Distributed Training with Amazon EKS" 3 | date = 2019-10-21T13:21:28-07:00 4 | weight = 5 5 | chapter = true 6 | #pre = "2. " 7 | +++ 8 | 9 | # Distributed Training with Amazon EKS 10 | 11 | In this section, we'll run distributed training on Amazon Elastic Kubernetes Service (Amazon EKS). Amazon EKS makes it easy to deploy, manage, and scale containerized applications using Kubernetes on AWS. To run deep learning workloads on Amazon EKS, we'll install Kubeflow. The Kubeflow project includes capabilities that make deployments of machine learning (ML) workflows on Kubernetes easy. With EKS and Kubeflow, you'll still need to manage the underlying CPU and GPU instances that form your cluster, but together they make it easy to manage and schedule machine learning workloads on that cluster. 12 | -------------------------------------------------------------------------------- /content/kubernetes_dist_training/build_container.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Build training container image and push it to ECR" 3 | date: 2019-10-28T16:51:02-07:00 4 | weight: 7 5 | --- 6 | 7 | #### Build a custom Docker image with our training code 8 | 9 | In our Dockerfile we start with an AWS Deep Learning TensorFlow container and copy our training code into the container. 10 | 11 | ``` 12 | cd ~/SageMaker/distributed-training-workshop/notebooks/part-3-kubernetes/ 13 | cat Dockerfile.cpu 14 | ``` 15 | `Dockerfile.cpu` Output: 16 | ``` 17 | FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:1.14.0-cpu-py36-ubuntu16.04 18 | COPY code /opt/training/ 19 | WORKDIR /opt/training 20 | ``` 21 | 22 | {{% notice tip %}} 23 | Replace with `Dockerfile.gpu` if you're going to be running training on a GPU cluster. 24 | {{% /notice %}} 25 | 26 | #### Create a new Elastic Container Registry repository 27 | 28 | * Navigate to [ECR and create a new repository](https://console.aws.amazon.com/ecr/home) 29 | * Click create repository 30 | * Provide a repository name 31 | * Click create 32 | 33 | {{% notice tip %}} 34 | By clicking on the **View push commands** button shown below, you can get access to the docker build and push commands, so you don't have to remember them. 35 | {{% /notice %}} 36 | ![create repo](/images/eks/create_repo.png) 37 | ![push commands](/images/eks/push_commands.png) 38 | #### Build and push the custom Docker container 39 | 40 | * Head over to the terminal on JupyterLab and log in to the AWS Deep Learning registry 41 | ``` 42 | $(aws ecr get-login --no-include-email --region us-west-2 --registry-ids 763104351884) 43 | ```
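(The `aws ecr get-login` call prints a ready-made `docker login` command; wrapping it in `$(...)` executes that printed command, which logs Docker in to the registry hosting the AWS Deep Learning Containers.)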
44 | * Run the `docker build` command in **Step 2** from the Docker push commands menu. Make sure to update it with the correct Dockerfile name for CPU or GPU: 45 | * For CPU container: `docker build -t <repository-name> -f Dockerfile.cpu .` 46 | * For GPU container: `docker build -t <repository-name> -f Dockerfile.gpu .` 47 | * Run the `docker tag` command in **Step 3** from the Docker push commands menu 48 | 49 | * Log in to your own Docker registry 50 | * `$(aws ecr get-login --no-include-email --region us-west-2)` 51 | 52 | * Run the `docker push` command in **Step 4** from the Docker push commands menu 53 | 54 | {{% notice tip %}} 55 | What happened? 56 | (1) You first logged in to the AWS Deep Learning container registry in order to pull the deep learning container. (2) You then built your container. (3) After the container was built, you added the tag needed to push it to ECR. (4) Then you logged in to your own registry. (5) Then you pushed the container to your registry. 57 | 58 | {{% /notice %}} 59 | -------------------------------------------------------------------------------- /content/kubernetes_dist_training/fsx_lustre.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Enable Amazon FSx for Lustre access" 3 | date: 2019-10-28T16:20:52-07:00 4 | weight: 6 5 | --- 6 | 7 | Amazon FSx for Lustre provides a high-performance file system optimized for fast processing of workloads such as deep learning. An FSx for Lustre file system transparently presents S3 objects as files and allows you to write results back to S3. 8 | 9 | #### Install the FSx CSI Driver 10 | ``` 11 | kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/aws-fsx-csi-driver/master/deploy/kubernetes/manifest.yaml 12 | ``` 13 | 14 | #### Get the VPC ID of your EKS cluster 15 | ``` 16 | VPC_ID=$(aws ec2 describe-vpcs --filters "Name=tag:Name,Values=eksctl-${CLUSTER_NAME}-cluster/VPC" --query "Vpcs[0].VpcId" --output text) 17 | ``` 18 | 19 | #### Get subnet ID from the EC2 console 20 | Navigate to the [AWS EC2 console](https://console.aws.amazon.com/ec2/v2/home) and click on **Instances**. 21 | Select one of the running instances whose name starts with the name of the EKS cluster. This instance is a node on the EKS cluster. 22 | Copy the subnet ID as shown in the image below. Click on the copy-to-clipboard icon shown next to the arrow. 23 | 24 | ![subnet](/images/eks/subnet_image.png) 25 | Paste the subnet ID below: 26 | ``` 27 | export SUBNET_ID= 28 | ``` 29 | 30 | #### Create your security group for the FSx file system 31 | ``` 32 | export SECURITY_GROUP_ID=$(aws ec2 create-security-group --group-name eks-fsx-security-group --vpc-id ${VPC_ID} --description "FSx for Lustre Security Group" --query "GroupId" --output text) 33 | ``` 34 | 35 | {{% notice warning %}} 36 | **Stop:** Make sure that the security group was created before proceeding. 37 | Confirm by running `echo $SECURITY_GROUP_ID`. Don't proceed if this is empty. 38 | {{% /notice %}}
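If the variable is empty, you can check whether the group actually exists before re-running the command above; one way to look it up (assuming your default region is configured):

```
aws ec2 describe-security-groups --filters Name=group-name,Values=eks-fsx-security-group --query "SecurityGroups[0].GroupId" --output text
```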
39 | 40 | #### Add an ingress rule that opens up port 988 from the 192.168.0.0/16 CIDR range 41 | ``` 42 | aws ec2 authorize-security-group-ingress --group-id ${SECURITY_GROUP_ID} --protocol tcp --port 988 --cidr 192.168.0.0/16 43 | ``` 44 | 45 | #### Update the environment variables in the storage class spec file 46 | Running `envsubst` will populate `SUBNET_ID`, `SECURITY_GROUP_ID`, and `BUCKET_NAME`: 47 | ``` 48 | cd ~/SageMaker/distributed-training-workshop/notebooks/part-3-kubernetes/ 49 | 50 | envsubst < specs/storage-class-fsx-s3-template.yaml > specs/storage-class-fsx-s3.yaml 51 | ``` 52 | 53 | #### Deploy the StorageClass and PersistentVolumeClaim 54 | ``` 55 | kubectl apply -f specs/storage-class-fsx-s3.yaml 56 | kubectl apply -f specs/claim-fsx-s3.yaml 57 | ``` 58 | 59 | This will take several minutes. You can check the status by running the following command. Hit `Ctrl+C` if you don't want the terminal to be blocked; to check manually, run the command without `-w`. 60 | 61 | ``` 62 | kubectl get pvc fsx-claim -w 63 | ``` 64 | -------------------------------------------------------------------------------- /content/kubernetes_dist_training/install_cli.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Install CLI tools" 3 | date: 2019-10-28T15:02:28-07:00 4 | weight: 2 5 | --- 6 | 7 | Navigate to the following directory for part 3 of the workshop: 8 | ``` 9 | cd ~/SageMaker/distributed-training-workshop/notebooks/part-3-kubernetes/ 10 | ``` 11 | 12 | 13 | #### Install `eksctl` 14 | 15 | To get started we'll first install the eksctl CLI tool. [eksctl](https://eksctl.io) simplifies the process of creating EKS clusters. 16 | 17 | ```bash 18 | pip install awscli --upgrade --user 19 | curl --silent --location "https://github.com/weaveworks/eksctl/releases/download/latest_release/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp 20 | 21 | ``` 22 | 23 | Move eksctl to /usr/local/bin so that it's on your PATH: 24 | 25 | ``` 26 | sudo mv /tmp/eksctl /usr/local/bin 27 | eksctl version 28 | 29 | ``` 30 | 31 | #### Install `kubectl` 32 | kubectl is a command-line interface for running commands against Kubernetes clusters.
Run the following to install kubectl: 33 | 34 | ```bash 35 | curl -o kubectl https://amazon-eks.s3-us-west-2.amazonaws.com/1.14.6/2019-08-22/bin/linux/amd64/kubectl 36 | chmod +x ./kubectl 37 | sudo mv ./kubectl /usr/local/bin 38 | kubectl version --short --client 39 | 40 | ``` 41 | 42 | #### Install `aws-iam-authenticator` 43 | 44 | ``` 45 | curl -o aws-iam-authenticator https://amazon-eks.s3-us-west-2.amazonaws.com/1.14.6/2019-08-22/bin/linux/amd64/aws-iam-authenticator 46 | 47 | chmod +x ./aws-iam-authenticator 48 | 49 | sudo mv aws-iam-authenticator /usr/local/bin 50 | ``` 51 | -------------------------------------------------------------------------------- /content/kubernetes_dist_training/install_kubeflow.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Install Kubeflow" 3 | date: 2019-10-28T15:42:44-07:00 4 | weight: 5 5 | --- 6 | 7 | #### Download the kfctl CLI tool 8 | 9 | ``` 10 | curl --silent --location https://github.com/kubeflow/kubeflow/releases/download/v0.7.0-rc.6/kfctl_v0.7.0-rc.5-7-gc66ebff3_linux.tar.gz | tar xz 11 | 12 | sudo mv kfctl /usr/local/bin 13 | ``` 14 | 15 | #### Get the latest Kubeflow configuration file 16 | 17 | ``` 18 | export CONFIG='https://raw.githubusercontent.com/kubeflow/manifests/v0.7-branch/kfdef/kfctl_aws.0.7.0.yaml' 19 | ``` 20 | 21 | #### Create environment and local variables 22 | 23 | ``` 24 | CLUSTER_NAME=$(eksctl get cluster --output=json | jq '.[0].name' --raw-output) 25 | 26 | INSTANCE_ROLE_NAME=$(eksctl get iamidentitymapping --name ${CLUSTER_NAME} --output=json | jq '.[0].rolearn' --raw-output | sed -e 's/.*\///') 27 | ``` 28 | 29 | {{% notice warning %}} 30 | Make sure that both environment variables are set before proceeding. 31 | Confirm by running `echo $CLUSTER_NAME` and `echo $INSTANCE_ROLE_NAME`. 32 | Make sure that these are not empty. 33 | {{% /notice %}} 34 | 35 | Add your S3 bucket name below: 36 | ``` 37 | export BUCKET_NAME= 38 | ``` 39 | 40 | {{% notice warning %}} 41 | **Stop:** Verify that you have the correct bucket name before proceeding. 42 | {{% /notice %}} 43 | 44 | ``` 45 | export KF_NAME=${CLUSTER_NAME} 46 | export KF_DIR=$PWD/${KF_NAME} 47 | ``` 48 | 49 | #### Build your configuration files 50 | We'll edit the configuration with the right names for the cluster and node groups before deploying Kubeflow. 51 | 52 | ``` 53 | mkdir -p ${KF_DIR} 54 | cd ${KF_DIR} 55 | kfctl build -V -f ${CONFIG} 56 | export CONFIG_FILE=${KF_DIR}/kfctl_aws.0.7.0.yaml 57 | 58 | ``` 59 | 60 | #### Edit the configuration file to include the correct instance role name and cluster name 61 | ``` 62 | sed -i "s@eksctl-kubeflow-aws-nodegroup-ng-a2-NodeInstanceRole-xxxxxxx@$INSTANCE_ROLE_NAME@" ${CONFIG_FILE} 63 | 64 | sed -i "s@kubeflow-aws@$CLUSTER_NAME@" ${CONFIG_FILE} 65 | 66 | ``` 67 | 68 | #### Apply the changes and deploy Kubeflow 69 | ``` 70 | cd ${KF_DIR} 71 | rm -rf kustomize/ 72 | kfctl apply -V -f ${CONFIG_FILE} 73 | ``` 74 | 75 | #### Wait for resources to become available 76 | 77 | Monitor the deployment by running the `kubectl get all` command on the `kubeflow` namespace: 78 | ``` 79 | kubectl -n kubeflow get all 80 | ```
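If you'd rather block until everything is up, a check along these lines should also work (illustrative; it assumes all Kubeflow pods are expected to reach the Ready state):

```
kubectl -n kubeflow wait --for=condition=Ready pods --all --timeout=600s
```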
81 | -------------------------------------------------------------------------------- /content/kubernetes_dist_training/setup_eks.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Setup an Amazon EKS cluster" 3 | date: 2019-10-28T15:14:12-07:00 4 | weight: 3 5 | --- 6 | 7 | Navigate to ***distributed-training-workshop > notebooks > part-3-kubernetes*** 8 | 9 | The `cpu_eks_cluster.sh` and `gpu_eks_cluster.sh` files include the necessary options to launch a CPU or GPU cluster. Take a look at the options used to launch an EKS cluster: 10 | 11 | ```bash 12 | cd ~/SageMaker/distributed-training-workshop/notebooks/part-3-kubernetes/ 13 | cat cpu_eks_cluster.sh 14 | ``` 15 | You should see the following output 16 | ``` 17 | Output: 18 | eksctl create cluster \ 19 | --name aws-tf-cluster-cpu \ 20 | --version 1.14 \ 21 | --region us-west-2 \ 22 | --nodegroup-name cpu-nodes \ 23 | --node-type c5.xlarge \ 24 | --nodes 2 \ 25 | --node-volume-size 50 \ 26 | --node-zones us-west-2a \ 27 | --timeout=40m \ 28 | --zones=us-west-2a,us-west-2b,us-west-2c \ 29 | --auto-kubeconfig 30 | ``` 31 | 32 | {{% notice tip %}} 33 | To launch a cluster with GPUs, use the script `gpu_eks_cluster.sh` instead. If you wish to launch a cluster with more than 2 nodes, update the `nodes` argument to the number of nodes you want in the cluster. 34 | {{% /notice %}} 35 | 36 | Now launch an EKS cluster: 37 | ``` 38 | sh cpu_eks_cluster.sh 39 | ``` 40 | 41 | You should see output similar to this. 42 | 43 | ![eks output](/images/eks/eksctl_launch.png) 44 | 45 | Creating a cluster may take about 15 mins. You can head over to the [AWS CloudFormation console](https://console.aws.amazon.com/cloudformation) to monitor the progress. 46 | -------------------------------------------------------------------------------- /content/kubernetes_dist_training/submit_job.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Submit distributed training job" 3 | date: 2019-10-28T17:14:05-07:00 4 | weight: 8 5 | --- 6 | #### Confirm that you are in the right directory 7 | ``` 8 | cd ~/SageMaker/distributed-training-workshop/notebooks/part-3-kubernetes/ 9 | ``` 10 | #### Copy the container image name 11 | 12 | ![copy container](/images/eks/get_container.png?width=60pc) 13 | 14 | 15 | #### Update the MPIJob spec file 16 | 17 | Open `specs/eks_tf_training_job-cpu.yaml` and update the `image:` field with the name of your container. 18 | 19 | ![update container](/images/eks/job_yaml_container.png?width=60pc) 20 | 21 | #### Submit a job run 22 | ``` 23 | kubectl apply -f specs/eks_tf_training_job-cpu.yaml 24 | ``` 25 | {{% notice tip %}} 26 | For GPU jobs use this instead: `eks_tf_training_job-gpu.yaml` 27 | {{% /notice %}} 28 | 29 | You should see output like this: 30 | ``` 31 | mpijob.kubeflow.org/eks-tf-distributed-training created 32 | ``` 33 | Running `kubectl get pods` should show you the worker pods plus one launcher pod. 34 | 35 | ```bash 36 | $ kubectl get pods 37 | NAME READY STATUS RESTARTS AGE 38 | eks-tf-distributed-training-launcher-6lgzg 1/1 Running 0 63s 39 | eks-tf-distributed-training-worker-0 1/1 Running 0 66s 40 | eks-tf-distributed-training-worker-1 1/1 Running 0 66s 41 | ```
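The launcher pod drives the MPI job while the worker pods execute the training script, so the consolidated training output is read from the launcher's logs.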
42 | 43 | To observe training logs, run `kubectl logs <pod-name>`, selecting the launcher pod from the list. You can use tab completion or copy the name of the pod from the output of `kubectl get pods`. 44 | 45 | ``` 46 | kubectl logs eks-tf-distributed-training-launcher- 47 | ``` 48 | 49 | output: 50 | ``` 51 | ... 52 | Epoch 1/30 53 | Epoch 1/30 54 | 3/78 [>.............................] - ETA: 4:05 - loss: 3.6816 - acc: 0.1172 3/724/78 [========>.....................] - ETA: 1:29 - loss: 2.7493 - acc: 0.161024/778/78 [==============================] - 128s 2s/step - loss: 2.1984 - acc: 0.2268 - val_loss: 2.1794 - val_acc: 0.1699 55 | Epoch 2/30 56 | 78/78 [==============================] - 129s 2s/step - loss: 2.2108 - acc: 0.2268 - val_loss: 2.1794 - val_acc: 0.1699 57 | Epoch 2/30 58 | ``` 59 | (The progress lines from the two workers are interleaved in the launcher's log.) 60 | -------------------------------------------------------------------------------- /content/kubernetes_dist_training/verify_cluster.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Verify installation and test cluster" 3 | date: 2019-10-28T15:33:52-07:00 4 | weight: 4 5 | --- 6 | 7 | Once the cluster is up and running, you should see a message that your cluster is now ready. 8 | ![verify eks](/images/eks/verify_eks.png) 9 | 10 | Update the kubeconfig file to point to our new cluster. 11 | If you chose a different name for your cluster (other than aws-tf-cluster-cpu), be sure to include the name of your cluster below. 12 | 13 | ``` 14 | aws eks --region us-west-2 update-kubeconfig --name aws-tf-cluster-cpu 15 | ``` 16 | 17 | Run the following to confirm that you can access the EKS cluster. 18 | 19 | You should see a list of Kubernetes namespaces: 20 | ``` 21 | kubectl get ns 22 | ``` 23 | ``` 24 | Output: 25 | NAME STATUS AGE 26 | default Active 12m 27 | kube-node-lease Active 13m 28 | kube-public Active 13m 29 | kube-system Active 13m 30 | ``` 31 | 32 | You should see the total number of nodes in your cluster: 33 | ``` 34 | kubectl get nodes 35 | ``` 36 | ``` 37 | Output: 38 | NAME STATUS ROLES AGE VERSION 39 | ip-192-168-10-211.us-west-2.compute.internal Ready 7m3s v1.14.7-eks-1861c5 40 | ip-192-168-10-229.us-west-2.compute.internal Ready 7m4s v1.14.7-eks-1861c5 41 | ``` 42 | -------------------------------------------------------------------------------- /content/kubernetes_dist_training/workflow.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Workflow" 3 | date: 2019-10-28T14:18:11-07:00 4 | weight: 1 5 | --- 6 | 7 | Navigate to 8 | ***distributed-training-workshop > notebooks > part-3-kubernetes*** 9 | You should see the following files: 10 | 11 | ```bash 12 | part-3-kubernetes/ 13 | ├── Dockerfile.cpu 14 | ├── Dockerfile.gpu 15 | ├── cpu_eks_cluster.sh 16 | ├── gpu_eks_cluster.sh 17 | ├── code 18 | │   ├── cifar10-multi-gpu-horovod-k8s.py 19 | │   └── model_def.py 20 | └── specs 21 | ├── claim-fsx-s3.yaml 22 | ├── eks_tf_training_job-cpu.yaml 23 | ├── eks_tf_training_job-gpu.yaml 24 | ├── fsx_lustre_policy.json 25 | └── storage-class-fsx-s3-template.yaml 26 | ``` 27 | 28 | |Files/directories|Description| 29 | |-----|-----| 30 | |Dockerfile.cpu, Dockerfile.gpu | Use these to build a custom container image for training on Amazon EKS| 31 | |cpu_eks_cluster.sh, gpu_eks_cluster.sh |Shell scripts that use the `eksctl` CLI tool to launch an Amazon EKS cluster| 32 | |code|Contains the training script and other training script dependencies| 33 | |specs|List of spec files required to configure Kubeflow| 34 | 35 | ![workflow](/images/eks/workflow.png) 36 | 37 | We'll first need to set up Amazon EKS and an Amazon FSx for Lustre file system, and install Kubeflow. 
This involves multiple steps, and we'll leverage various CLI tools to help install, configure, and interact with EKS. At a high level, we'll perform the following steps: 38 | 39 | 1. Install the `eksctl` CLI and use it to launch an Amazon EKS cluster 40 | 1. Install the `kubectl` CLI to interact with the Amazon EKS cluster 41 | 1. Install the `kfctl` CLI and use it to configure and install Kubeflow 42 | 1. Allow Amazon EKS to access an Amazon FSx for Lustre file system that's linked to an Amazon S3 bucket 43 | 1. Finally, launch a distributed training job 44 | -------------------------------------------------------------------------------- /content/sagemaker_dist_training/_index.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Distributed Training with Amazon SageMaker" 3 | date = 2019-10-21T13:21:01-07:00 4 | weight = 4 5 | chapter = true 6 | # pre = "1. " 7 | +++ 8 | 9 | # Distributed training with Amazon SageMaker 10 | 11 | In this section, we'll run distributed training on Amazon SageMaker. We'll provide SageMaker with our updated training script that includes the Horovod API, and SageMaker will take care of the rest: spinning up the requested number of CPU or GPU instances, copying the training code and dependencies to the training cluster, copying the dataset from Amazon S3 to the training cluster, keeping track of training progress, and shutting down the instances once training is done. Amazon SageMaker is a fully managed service, so you don't have to worry about managing instances. 12 | -------------------------------------------------------------------------------- /content/sagemaker_dist_training/monitoring_results.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Monitoring training progress" 3 | date: 2019-10-28T13:54:27-07:00 4 | weight: 4 5 | --- 6 | 7 | ### Monitoring training progress using TensorBoard 8 | 9 | The ***cifar10-sagemaker-distributed.ipynb*** notebook will automatically start a TensorBoard server for you when you run the following cell. TensorBoard runs locally on your Jupyter notebook instance, but reads the events from the Amazon S3 bucket where the Keras callback saved them. 10 | 11 | ```bash 12 | !S3_REGION=us-west-2 tensorboard --logdir s3://{bucket_name}/tensorboard_logs/ 13 | ``` 14 | 15 | Navigate to https://tfworld2019.notebook.us-west-2.sagemaker.aws/proxy/6006/ 16 | 17 | Replace `tfworld2019` with the name of your Jupyter notebook instance. 18 | ![tensorboard](/images/sagemaker/tensorboard.png) 19 | 20 | ### Monitoring training job status on the AWS SageMaker console 21 | 22 | Navigate to ***AWS management console > SageMaker console*** to see a full list of training jobs and their status. 23 | 24 | ![tensorboard](/images/sagemaker/aws_console.png) 25 | 26 | To view CloudWatch logs from the training instances, click on the ***training job name > Monitor > View logs***
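If you prefer the terminal, these CloudWatch logs can also be tailed with the AWS CLI. For example, with AWS CLI v2 (an assumption; the `tail` subcommand is not in CLI v1), SageMaker training logs live in the `/aws/sagemaker/TrainingJobs` log group:

```
aws logs tail /aws/sagemaker/TrainingJobs --follow
```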
 -------------------------------------------------------------------------------- /content/sagemaker_dist_training/sagemaker_training.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "SageMaker distributed training" 3 | date: 2019-10-28T13:17:44-07:00 4 | weight: 3 5 | --- 6 | 7 | Open `cifar10-sagemaker-distributed.ipynb` and run through the cells. The notebook is located at:
8 | ***distributed-training-workshop > notebooks > part-2-sagemaker > cifar10-sagemaker-distributed.ipynb*** 9 | 10 | ![sm_notebook](/images/sagemaker/sm_notebook.png) 11 | 12 | {{% notice warning %}} 13 | **Stop:** Do this section on JupyterLab. Below is a copy of the jupyter notebook for reference.
14 | {{% /notice %}} 15 | 16 | ---- 17 | 18 | 19 | ## Distributed training with Amazon SageMaker 20 | 21 | In this notebook we use the SageMaker Python SDK to set up and run a distributed training job. 22 | SageMaker makes it easy to train models across a cluster containing a large number of machines, without having to explicitly manage those resources. 23 | 24 | **Step 1:** Import essential packages, start a SageMaker session, and specify the bucket name you created in the prerequisites section of this workshop. 25 | 26 | 27 | ```python 28 | import os 29 | import time 30 | import numpy as np 31 | import sagemaker 32 | 33 | sagemaker_session = sagemaker.Session() 34 | role = sagemaker.get_execution_role() 35 | bucket_name = 'tfworld2019-<your-name>' 36 | ``` 37 | 38 | **Step 2:** Specify hyperparameters, the instance type, and the number of instances to distribute training across. The `hvd_processes_per_host` corresponds to the number of GPUs per instance. 39 | For example, if you choose: 40 | ``` 41 | hvd_instance_type = 'ml.p3.8xlarge' 42 | hvd_instance_count = 2 43 | hvd_processes_per_host = 4 44 | ``` 45 | 46 | Since a p3.8xlarge instance has 4 GPUs, we'll be distributing training across 8 workers, 1 per GPU. 47 | This is spread across 2 instances (or nodes). SageMaker automatically takes care of spinning up these instances and making sure they can communicate with each other. 48 | 49 | 50 | ```python 51 | hyperparameters = {'epochs': 100, 52 | 'learning-rate': 0.001, 53 | 'momentum': 0.9, 54 | 'weight-decay': 2e-4, 55 | 'optimizer': 'adam', 56 | 'batch-size' : 256} 57 | 58 | hvd_instance_type = 'ml.c5.xlarge' 59 | hvd_instance_count = 2 60 | hvd_processes_per_host = 1 61 | 62 | print('Distributed training with a total of {} workers'.format(hvd_processes_per_host*hvd_instance_count)) 63 | print('{} x {} instances with {} processes per instance'.format(hvd_instance_count, hvd_instance_type, hvd_processes_per_host)) 64 | ``` 65 | 66 | **Step 3:** In this cell we create a SageMaker estimator, by providing it with all the information it needs to launch instances and execute training on those instances. 67 | 68 | Since we're using Horovod for distributed training, we set `distributions` to use MPI, which Horovod relies on. 69 | 70 | In the TensorFlow estimator call, we specify the training script under `entry_point` and dependencies under the `code` directory (`source_dir`). SageMaker automatically copies these files into a TensorFlow container behind the scenes, and they are executed on the training instances. 
71 | 72 | 73 | ```python 74 | from sagemaker.tensorflow import TensorFlow 75 | 76 | output_path = 's3://{}/'.format(bucket_name) 77 | job_name = 'sm-dist-{}x{}-workers'.format(hvd_instance_count, hvd_processes_per_host) + time.strftime('%Y-%m-%d-%H-%M-%S-%j', time.gmtime()) 78 | model_dir = output_path + 'tensorboard_logs/' + job_name 79 | 80 | distributions = {'mpi': { 81 | 'enabled': True, 82 | 'processes_per_host': hvd_processes_per_host, 83 | 'custom_mpi_options': '-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none' 84 | } 85 | } 86 | 87 | estimator_hvd = TensorFlow(base_job_name='hvd-cifar10-tf', 88 | source_dir='code', 89 | entry_point='cifar10-multi-gpu-horovod-sagemaker.py', 90 | role=role, 91 | framework_version='1.14', 92 | py_version='py3', 93 | hyperparameters=hyperparameters, 94 | train_instance_count=hvd_instance_count, 95 | train_instance_type=hvd_instance_type, 96 | output_path=output_path, 97 | model_dir=model_dir, 98 | tags = [{'Key' : 'Project', 'Value' : 'cifar10'},{'Key' : 'TensorBoard', 'Value' : 'dist'}], 99 | metric_definitions=[{'Name': 'val_acc', 'Regex': 'val_acc: ([0-9\\.]+)'}], 100 | distributions=distributions) 101 | ``` 102 | 103 | **Step 4:** Specify dataset locations in Amazon S3 and then call the fit function. 104 | 105 | 106 | ```python 107 | train_path = 's3://{}/cifar10-dataset/train'.format(bucket_name) 108 | val_path = 's3://{}/cifar10-dataset/validation'.format(bucket_name) 109 | eval_path = 's3://{}/cifar10-dataset/eval/'.format(bucket_name) 110 | 111 | estimator_hvd.fit({'train': train_path,'validation': val_path,'eval': eval_path}, 112 | job_name=job_name, wait=False) 113 | ``` 114 | 115 | **Step 5:** Monitor progress on TensorBoard. Launch TensorBoard and open the link in a new tab to visualize training progress. 116 | 117 | 118 | ```python 119 | !S3_REGION=us-west-2 tensorboard --logdir s3://{bucket_name}/tensorboard_logs/ 120 | ``` 121 | 122 | Open a new browser tab and navigate to the following link to access TensorBoard: 
https://tfworld2019.notebook.us-west-2.sagemaker.aws/proxy/6006/ 124 |
Make sure that the name of the notebook instance is correct in the link above. 125 |
Don't forget the trailing slash at the end of the URL: `6006/` 126 | -------------------------------------------------------------------------------- /content/sagemaker_dist_training/training_scrip_updates.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Updates required to run on SageMaker" 3 | date: 2019-10-28T13:42:13-07:00 4 | weight: 2 5 | --- 6 | 7 | There are a few minor changes required to run a training script on Amazon SageMaker. 8 | 9 | 10 | ##### SageMaker hyperparameters 11 | * SageMaker passes hyperparameters to the training scripts as command-line arguments. Your script must be able to parse these arguments. 12 | 13 | ##### SageMaker environment variables 14 | * SageMaker makes several environment variables available inside the container that a training script can take advantage of to find the location of the training dataset, the number of GPUs in the instance, dataset channels, and more. A full list of environment variables can be found on the [SageMaker containers GitHub repository](https://github.com/aws/sagemaker-containers#important-environment-variables) 15 | 16 | ``` 17 | parser = argparse.ArgumentParser() 18 | 19 | # Hyper-parameters 20 | parser.add_argument('--epochs', type=int, default=15) 21 | parser.add_argument('--learning-rate', type=float, default=0.001) 22 | parser.add_argument('--batch-size', type=int, default=256) 23 | parser.add_argument('--weight-decay', type=float, default=2e-4) 24 | parser.add_argument('--momentum', type=float, default=0.9) 25 | parser.add_argument('--optimizer', type=str, default='adam') 26 | 27 | # SageMaker parameters 28 | parser.add_argument('--model_dir', type=str) 29 | parser.add_argument('--model_output_dir', type=str, default=os.environ['SM_MODEL_DIR']) 30 | parser.add_argument('--output_data_dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR']) 31 | 32 | # Data directories and other options 33 | parser.add_argument('--gpu-count', type=int, default=os.environ['SM_NUM_GPUS']) 34 | parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN']) 35 | parser.add_argument('--validation', type=str, default=os.environ['SM_CHANNEL_VALIDATION']) 36 | parser.add_argument('--eval', type=str, default=os.environ['SM_CHANNEL_EVAL']) 37 | 38 | args = parser.parse_args() 39 | ``` 40 | 41 | ##### (Optional) TensorBoard callback for real-time monitoring of training 42 | * Using a Keras callback we can upload TensorBoard files to Amazon S3 so that we can monitor progress in real time. TensorBoard already comes installed on the SageMaker JupyterLab instance, and has support for reading event files from Amazon S3. 
44 | 45 | `tensorboard --logdir s3://{bucket_name}/tensorboard_logs/` 46 | 47 | ``` 48 | class Sync2S3(tf.keras.callbacks.Callback): 49 | def __init__(self, logdir, s3logdir): 50 | super(Sync2S3, self).__init__() 51 | self.logdir = logdir 52 | self.s3logdir = s3logdir 53 | 54 | def on_epoch_end(self, epoch, logs=None): 55 | # Sync the local TensorBoard event files to S3 at the end of every epoch 56 | os.system('aws s3 sync '+self.logdir+' '+self.s3logdir) 57 | # ' >/dev/null 2>&1' 58 | ``` 59 | -------------------------------------------------------------------------------- /content/sagemaker_dist_training/workflow.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Workflow" 3 | date: 2019-10-28T12:59:15-07:00 4 | weight: 1 5 | --- 6 | Navigate to 7 | ***distributed-training-workshop > notebooks > part-2-sagemaker*** 8 | You should see the following files: 9 | 10 | ```bash 11 | part-2-sagemaker/ 12 | ├── cifar10-sagemaker-distributed.ipynb 13 | └── code 14 | ├── cifar10-multi-gpu-horovod-sagemaker.py 15 | └── model_def.py 16 | ``` 17 | 18 | |Files/directories|Description| 19 | |-----|-----| 20 | |cifar10-sagemaker-distributed.ipynb |This Jupyter notebook contains code to define and kick off a SageMaker training job| 21 | |code |This directory contains the training script and other training script dependencies| 22 | 23 | ![sagemaker_workflow](/images/sagemaker/workflow.png) 24 | 25 | SageMaker is a fully managed service, which means when you kick off a training job using the SageMaker SDK in the `cifar10-sagemaker-distributed.ipynb` notebook, a few different things happen behind the scenes: 26 | 27 | * SageMaker spins up the requested number of instances in a fully managed SageMaker cluster 28 | * SageMaker pulls the latest (or a specified) version of the TensorFlow container image, instantiates it on the new instances, and loads the contents of the `code` directory into the container 29 | * SageMaker runs the training script on each instance. Since we're running distributed training, SageMaker launches an `MPI` job with the right settings so that workers can communicate with each other. 30 | * SageMaker copies the dataset over from Amazon S3 and makes it available inside the container for training 31 | * SageMaker monitors the training and updates progress on the Amazon SageMaker console 32 | * SageMaker copies all the code and model artifacts to Amazon S3 after the training is finished 33 | 34 | In addition, SageMaker does a lot more to ensure that the jobs run optimally and you get the best performance out of the box. As a user you don't have to worry about managing machine learning infrastructure. 35 | -------------------------------------------------------------------------------- /content/setup/_index.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Prerequisites" 3 | date = 2019-10-21T13:30:44-07:00 4 | weight = 2 5 | chapter = true 6 | #pre = "0. " 7 | +++ 8 | 9 | # Getting Started 10 | In this section, we'll set up our development environment. 11 | We'll be using an Amazon SageMaker notebook instance, which is a fully managed compute instance running the Jupyter Notebook server. 
12 | -------------------------------------------------------------------------------- /content/setup/add_admin_policy.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Update notebook IAM role" 3 | date: 2019-10-27T23:41:36-07:00 4 | weight: 3 5 | --- 6 | 7 | ### Give your notebook instances admin privileges 8 | {{% notice warning %}} 9 | **Note:** We're providing admin privileges to the SageMaker notebook instance only because we'll be using the same instance to launch an Amazon EKS cluster in the later part of the workshop. If you're only going to be using the SageMaker managed cluster for training, the S3 full access policy should suffice. 10 | {{% /notice %}} 11 | 12 | * Click on the **tfworld2019** instance and you'll see additional details about it. Click on the IAM role link; this should take you to the IAM Management Console. Once there, click the **Attach policy** button. 13 | ![go_to_IAM](/images/setup/go_to_IAM.png) 14 | ![attach_policy](/images/setup/attach_policy.png) 15 | * Select **AdministratorAccess** and click on **Attach policy** 16 | ![admin attach](/images/setup/admin_attach.png) 17 | * Close the IAM Management Console window and head back to the SageMaker console. 18 | -------------------------------------------------------------------------------- /content/setup/download_workshop.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Download the workshop content" 3 | date: 2019-10-28T00:14:06-07:00 4 | weight: 4 5 | --- 6 | ### Launch JupyterLab client and clone the workshop repository 7 | * Your notebook instance should now be ready. Click *JupyterLab* to launch your client. 8 | ![launch jupyter](/images/setup/launch_jupyter.png) 9 | 10 | * Click *File > New > Terminal* to launch a terminal in your JupyterLab instance. 11 | ![Launch terminal](/images/setup/launch_terminal.png) 12 | 13 | * Download the workshop code and notebooks. Enter bash (optional), change directory to `~/SageMaker`, and clone the repository: 14 | ```bash 15 | bash 16 | cd ~/SageMaker 17 | git clone https://github.com/shashankprasanna/distributed-training-workshop.git 18 | ``` 19 | 20 | * Confirm that you're able to see the contents. You should see the three workshop parts (expected listing shown below the command): 21 | ``` 22 | ls distributed-training-workshop/notebooks 23 | ```
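Per the repository layout, the listing should show the dataset-generation script alongside the three parts:

```
generate_cifar10_tfrecords.py  part-1-horovod  part-2-sagemaker  part-3-kubernetes
```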
24 | -------------------------------------------------------------------------------- /content/setup/sm_jupyter_instance.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Launch a SageMaker notebook instance" 3 | date: 2019-10-27T22:39:43-07:00 4 | weight: 2 5 | --- 6 | 7 | {{% notice info %}} 8 | **Note:** In this workshop, we'll be using an Amazon SageMaker notebook instance for simplicity and convenience. You can use any local client to perform the steps detailed in this and subsequent sections. You'll just need to make sure you have the right privileges to access AWS services such as SageMaker, EKS, S3, ECR and others from your client, and you'll need the AWS Command Line Interface (AWS CLI), Python, boto3 and the SageMaker SDK installed. The SageMaker Jupyter notebook, on the other hand, is preconfigured and ready to use. 9 | {{% /notice %}} 10 | 11 | ### Launch an Amazon SageMaker notebook instance 12 | 13 | * Open the [AWS Management Console](https://console.aws.amazon.com/console/home) 14 | {{% notice info %}} 15 | **Note:** This workshop has been tested on the US West (Oregon) (us-west-2) region. Make sure that you see **Oregon** on the top right hand corner of your AWS Management Console. If you see a different region, click the dropdown menu and select US West (Oregon). 16 | {{% /notice %}} 17 | 18 | * In the AWS Console search bar, type SageMaker and select Amazon SageMaker to open the service console. 19 | ![SageMaker Console](/images/setup/setup_aws_console.png) 20 | * Click on Notebook Instances 21 | ![Launch notebook instance 1](/images/setup/setup_notebook.png) 22 | * From the Amazon SageMaker > Notebook instances page, select Create notebook instance. 23 | ![Launch notebook instance 2](/images/setup/setup_create_notebook.png) 24 | * In the Notebook instance name text box, enter a name for the notebook instance. 25 | * For this workshop select **"tfworld2019"** as the instance name 26 | * Choose ml.c5.xlarge. We'll only be using this instance to launch jobs. The training jobs themselves will run either on a SageMaker managed cluster or an Amazon EKS cluster. 27 | * Set the volume size to 50 GB. This is only needed for building Docker containers. During training, data is copied directly from Amazon S3 to the training cluster when using SageMaker. When using Amazon EKS, we'll set up an FSx for Lustre file system that worker nodes will use to get access to training data. 28 | ![Fill notebook instance](/images/setup/setup_fill_notebook.png) 29 | * To create an IAM role, from the IAM role drop-down list, select Create a new role. In the Create an IAM role dialog box, select Any S3 bucket. After that, select **Create role**. Amazon SageMaker creates the **AmazonSageMaker-ExecutionRole-\*\*\*** role. 30 | ![iam](/images/setup/notebook_iam.png) 31 | * Keep the default settings for the other options and click Create notebook instance. On the **Notebook instances** section you should see the status change from *Pending -> InService*. 32 | * While the notebook instance spins up, continue to work on the next section, and we'll come back and launch the instance when it's ready. 33 | -------------------------------------------------------------------------------- /content/update_code_dist_training/_index.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Prepare your training scripts" 3 | date = 2019-10-28T00:51:31-07:00 4 | weight = 3 5 | chapter = true 6 | +++ 7 | 8 | # Prepare your training scripts 9 | 10 | In this section, we'll walk through the process of modifying an existing TensorFlow-Keras training script so that it can perform training in a distributed environment. 11 | -------------------------------------------------------------------------------- /content/update_code_dist_training/distributed_training_script.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using horovod API for distributed training" 3 | date: 2019-10-28T02:47:30-07:00 4 | weight: 4 5 | --- 6 | 7 | ## Exercise 1: Convert training script to use horovod 8 | 9 |
In this section you'll update the training script with the Horovod API to run distributed training. 10 | 11 | Open `cifar10-distributed.ipynb` and run through the cells. The notebook is located at:
12 | ***distributed-training-workshop > notebooks > part-1-horovod*** 13 | 14 | ![singe_instance](/images/convert_script/distributed_script.png) 15 | 16 | {{% notice warning %}} 17 | **Stop:** Do this section on JupyterLab. Below is a copy of the jupyter notebook for reference.

18 | Open `cifar10-distributed.ipynb` and run these cells.

19 | Look for cells that say **Change X** and fill in those cells with the modifications - where **X** is the change number. There are a total of 8 changes. 20 | Click on **> Solution** to see the answers 21 | {{% /notice %}} 22 | 23 | You'll need to make the following modifications to your training script to use horovod for distributed training. 24 | 25 | 1. Run hvd.init() 26 | 2. Pin a server GPU to be used by this process using config.gpu_options.visible_device_list. 27 | 3. Scale the learning rate by the number of workers. 28 | 4. Wrap the optimizer in hvd.DistributedOptimizer. 29 | 5. Add hvd.callbacks.BroadcastGlobalVariablesCallback(0) to broadcast initial variable states from rank 0 to all other processes. 30 | 6. Modify your code to save checkpoints only on worker 0 to prevent other workers from corrupting them. 31 | 32 | 33 | 34 | #### Change 1: Import horovod and keras backend 35 | 36 | 37 | ```python 38 | import tensorflow as tf 39 | 40 | 41 | 42 | ``` 43 | 44 |
Solution 45 |
 46 | import horovod.tensorflow.keras as hvd
 47 | import tensorflow.keras.backend as K
 48 | 
49 |
50 | 51 | 52 | ```python 53 | from datetime import datetime 54 | import argparse 55 | import os 56 | import numpy as np 57 | import codecs 58 | import json 59 | 60 | from tensorflow import keras 61 | from tensorflow.keras.layers import Input, Dense, Flatten 62 | from tensorflow.keras.models import Model 63 | from tensorflow.keras.optimizers import Adam, SGD 64 | from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint 65 | from model_def import get_model 66 | 67 | HEIGHT = 32 68 | WIDTH = 32 69 | DEPTH = 3 70 | NUM_CLASSES = 10 71 | NUM_TRAIN_IMAGES = 40000 72 | NUM_VALID_IMAGES = 10000 73 | NUM_TEST_IMAGES = 10000 74 | ``` 75 | 76 | 77 | ```python 78 | def train_preprocess_fn(image): 79 | 80 | # Resize the image to add four extra pixels on each side. 81 | image = tf.image.resize_image_with_crop_or_pad(image, HEIGHT + 8, WIDTH + 8) 82 | 83 | # Randomly crop a [HEIGHT, WIDTH] section of the image. 84 | image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH]) 85 | 86 | # Randomly flip the image horizontally. 87 | image = tf.image.random_flip_left_right(image) 88 | 89 | return image 90 | ``` 91 | 92 | 93 | ```python 94 | def make_batch(filenames, batch_size): 95 | """Read the images and labels from 'filenames'.""" 96 | # Repeat infinitely. 97 | dataset = tf.data.TFRecordDataset(filenames).repeat() 98 | 99 | # Parse records. 100 | dataset = dataset.map(single_example_parser, num_parallel_calls=1) 101 | 102 | # Batch it up. 103 | dataset = dataset.batch(batch_size, drop_remainder=True) 104 | iterator = dataset.make_one_shot_iterator() 105 | 106 | image_batch, label_batch = iterator.get_next() 107 | return image_batch, label_batch 108 | ``` 109 | 110 | 111 | ```python 112 | def single_example_parser(serialized_example): 113 | """Parses a single tf.Example into image and label tensors.""" 114 | # Dimensions of the images in the CIFAR-10 dataset. 115 | # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the 116 | # input format. 117 | features = tf.parse_single_example( 118 | serialized_example, 119 | features={ 120 | 'image': tf.FixedLenFeature([], tf.string), 121 | 'label': tf.FixedLenFeature([], tf.int64), 122 | }) 123 | image = tf.decode_raw(features['image'], tf.uint8) 124 | image.set_shape([DEPTH * HEIGHT * WIDTH]) 125 | 126 | # Reshape from [depth * height * width] to [depth, height, width]. 127 | image = tf.cast( 128 | tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]), 129 | tf.float32) 130 | label = tf.cast(features['label'], tf.int32) 131 | 132 | image = train_preprocess_fn(image) 133 | label = tf.one_hot(label, NUM_CLASSES) 134 | 135 | return image, label 136 | ``` 137 | 138 | 139 | ```python 140 | # Hyper-parameters 141 | epochs = 1 142 | lr = 0.01 143 | batch_size = 128 144 | momentum = 0.9 145 | weight_decay = 2e-4 146 | optimizer = 'sgd' 147 | gpu_count = 1 148 | 149 | # Data directories and other options 150 | checkpoint_dir = '../ckpt_dir' 151 | if not os.path.exists(checkpoint_dir): 152 | os.makedirs(checkpoint_dir) 153 | 154 | train_dir = '../data/train' 155 | validation_dir = '../data/validation' 156 | eval_dir = '../data/eval' 157 | 158 | train_dataset = make_batch(train_dir+'/train.tfrecords', batch_size) 159 | val_dataset = make_batch(validation_dir+'/validation.tfrecords', batch_size) 160 | eval_dataset = make_batch(eval_dir+'/eval.tfrecords', batch_size) 161 | ``` 162 | 163 | #### Change 2: Initialize horovod and get the size of the cluster 164 | 165 | 166 | ```python 167 | 168 | 169 | 170 | 171 | 172 | 173 | ``` 174 | 175 |
Solution 176 |
177 | hvd.init()
178 | size = hvd.size()
179 | 
180 |
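If you want to sanity-check what `hvd.init()` and `hvd.size()` give you, here is a minimal, self-contained sketch (a hypothetical `check_hvd.py`, not part of the workshop files) that you could launch with `horovodrun`:

```python
# check_hvd.py -- hypothetical helper; run with e.g.: horovodrun -np 2 python check_hvd.py
import horovod.tensorflow.keras as hvd

hvd.init()

# size()       = total number of workers across all nodes
# rank()       = this worker's global index (0 .. size-1)
# local_rank() = this worker's index on its own node
print('worker {} of {} (local rank {})'.format(hvd.rank(), hvd.size(), hvd.local_rank()))
```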
181 | 182 | #### Change 3 - Pin GPU to be used to process local rank (one GPU per process) 183 | 184 | 185 | ```python 186 | 187 | 188 | 189 | 190 | 191 | ``` 192 | 193 |
Solution 194 |
195 | config = tf.ConfigProto()
196 | config.gpu_options.allow_growth = True
197 | config.gpu_options.visible_device_list = str(hvd.local_rank())
198 | K.set_session(tf.Session(config=config))
199 | 
200 |
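The `ConfigProto`/`Session` pinning above is TensorFlow 1.x API, which is what this workshop uses (TF 1.14). For reference only, a sketch of the equivalent pinning if you later port the script to TensorFlow 2.x:

```python
# Reference sketch for TensorFlow 2.x only -- not used in this workshop.
import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    # Pin this process to the single GPU matching its local rank.
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
```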
201 | 202 | 203 | ```python 204 | model = get_model(lr, weight_decay, optimizer, momentum) 205 | ``` 206 | 207 | #### Change 4: How will you update the learning rate for distributed training? What changes should you make to the following command? 208 | 209 | 210 | ```python 211 | opt = SGD(lr=lr, decay=weight_decay, momentum=momentum) 212 | ``` 213 | 214 |
Solution 215 |
216 | opt = SGD(lr=lr * size, decay=weight_decay, momentum=momentum)
217 | 
218 | You need to scale the learning rate by the size of the cluster (total number of workers)
219 | 
220 |
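To make the scaling rule concrete, here is the arithmetic with this script's defaults, assuming a hypothetical cluster of 8 workers (for example, 2 x p3.8xlarge instances with 4 GPUs each):

```python
# Each worker still sees batch_size = 128, so the effective global batch
# grows with the cluster; the learning rate is scaled linearly to match.
lr = 0.01
size = 8                      # what hvd.size() would return on 8 workers
effective_batch = 128 * size  # 1024 samples consumed per global step
scaled_lr = lr * size         # 0.08
print(effective_batch, scaled_lr)
```

The warmup callback added in Change 7 ramps the learning rate up to this scaled value over the first few epochs, which avoids instability early in training.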
221 | 222 | #### Change 6: How will you convert the optimizer to a distributed optimizer? 223 | 224 | 225 | ```python 226 | model.compile(loss='categorical_crossentropy', 227 | optimizer=opt, 228 | metrics=['accuracy']) 229 | ``` 230 | 231 |
Solution 232 |
233 | opt = hvd.DistributedOptimizer(opt)
234 | model.compile(loss='categorical_crossentropy',
235 |               optimizer=opt,
236 |               metrics=['accuracy'])
237 | 
238 |
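`hvd.DistributedOptimizer` wraps the underlying optimizer so that every training step averages gradients across all workers with allreduce before the update is applied. Order matters: scale the learning rate first, then wrap, then compile. A condensed sketch combining Changes 4 and 6:

```python
# Sketch of the intended ordering: scale, wrap, then compile.
opt = SGD(lr=lr * size, decay=weight_decay, momentum=momentum)
opt = hvd.DistributedOptimizer(opt)  # inserts allreduce gradient averaging
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])
```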
239 | 240 | #### Change 7: Add callbacks for syncing initial state, and saving checkpoints only on 1st worker 241 | 242 | 243 | ```python 244 | 245 | 246 | 247 | 248 | ``` 249 | 250 |
Solution 251 |
252 | callbacks = []
253 | callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
254 | callbacks.append(hvd.callbacks.MetricAverageCallback())
255 | callbacks.append(hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))
256 | callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
257 | if hvd.rank() == 0:
258 |     callbacks.append(ModelCheckpoint(checkpoint_dir + '/checkpoint-{epoch}.h5'))
259 | 
260 |
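The `hvd.rank() == 0` guard generalizes beyond checkpoints: any side effect that should happen once per cluster belongs behind it. For instance, a hypothetical TensorBoard callback (not part of this exercise) would be guarded the same way so workers don't write conflicting logs:

```python
# Hypothetical example: any once-per-cluster side effect follows the same pattern.
if hvd.rank() == 0:
    callbacks.append(TensorBoard(log_dir='./logs'))
```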
261 | 262 | #### Change 8: Update the number of steps/epoch 263 | 264 | 265 | ```python 266 | %%time 267 | # Train model 268 | history = model.fit(x=train_dataset[0], y=train_dataset[1], 269 | steps_per_epoch=NUM_TRAIN_IMAGES // batch_size, 270 | validation_data=val_dataset, 271 | validation_steps=NUM_VALID_IMAGES // batch_size, 272 | epochs=epochs, 273 | callbacks=[ModelCheckpoint(checkpoint_dir + '/checkpoint-{epoch}.h5')]) 274 | 275 | ``` 276 | 277 |
Solution 278 |
279 | history = model.fit(x=train_dataset[0], y=train_dataset[1],
280 |                     steps_per_epoch=(NUM_TRAIN_IMAGES // batch_size) // size,
281 |                     validation_data=val_dataset,
282 |                     validation_steps=(NUM_VALID_IMAGES // batch_size) // size,
283 |                     epochs=epochs, callbacks=callbacks)
284 | 
285 |
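Dividing the step counts by `size` keeps the total amount of data processed per epoch roughly constant: each worker covers only its share. With the defaults above and a hypothetical 8-worker cluster:

```python
# 40000 images / batch of 128 = 312 steps for a single worker;
# spread across 8 workers, each runs 312 // 8 = 39 steps per epoch.
steps_single_worker = 40000 // 128           # 312
steps_per_worker = steps_single_worker // 8  # 39
print(steps_single_worker, steps_per_worker)
```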
286 | 287 | 288 | ```python 289 | # Evaluate model performance 290 | score = model.evaluate(eval_dataset[0], 291 | eval_dataset[1], 292 | steps=NUM_TEST_IMAGES // batch_size, 293 | verbose=0) 294 | print('Test loss :', score[0]) 295 | print('Test accuracy:', score[1]) 296 | ``` 297 | 298 | Note: once these changes are made, you can convert the Jupyter notebook into a Python training script by running: 299 | $ jupyter nbconvert --to script notebook_name.ipynb 300 | -------------------------------------------------------------------------------- /content/update_code_dist_training/prepare_dataset.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Prepare training dataset" 3 | date: 2019-10-28T01:48:16-07:00 4 | weight: 2 5 | --- 6 | ### Download the CIFAR10 dataset and upload it to Amazon S3 7 | 8 | In a terminal window in the JupyterLab client, navigate to the notebook directory: 9 | 10 | ``` 11 | cd ~/SageMaker/distributed-training-workshop/notebooks/ 12 | ``` 13 | Activate the TensorFlow conda environment: 14 | ``` 15 | source activate tensorflow_p36 16 | ``` 17 | 18 | Download the CIFAR10 dataset and convert it to TFRecords format: 19 | ``` 20 | python generate_cifar10_tfrecords.py --data-dir dataset 21 | ``` 22 | Confirm that the dataset was downloaded successfully. Run: 23 | ``` 24 | sudo yum install tree -y 25 | tree dataset 26 | ``` 27 | You should see the following output: 28 | ``` 29 | dataset 30 | ├── eval 31 | │ └── eval.tfrecords 32 | ├── train 33 | │ └── train.tfrecords 34 | └── validation 35 | └── validation.tfrecords 36 | ``` 37 | 38 | Create a new S3 bucket and upload the dataset to it. Be sure to add a unique identifier, such as your name. 39 | ``` 40 | aws s3 mb s3:// 41 | ``` 42 | {{% notice warning %}} 43 | **Note:** Bucket names must be globally unique. If a bucket with the same name already exists, add another unique identifier such as today's date or your last name. 44 | {{% /notice %}} 45 | 46 | Proceed only if you don't see an error. Now, upload the dataset to S3: 47 | ``` 48 | aws s3 sync dataset/ s3:///cifar10-dataset/ 49 | ``` 50 | -------------------------------------------------------------------------------- /content/update_code_dist_training/problem_setup.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Problem statement" 3 | date: 2019-10-28T01:12:18-07:00 4 | weight: 1 5 | --- 6 | 7 | ### Converting a single CPU/GPU training script to a multi-node/distributed compatible training script 8 | **Frameworks:** This workshop currently uses TensorFlow 1.14, Keras and Horovod 0.18. 9 | 10 | **Dataset:** The CIFAR-10 dataset consists of 60,000 32x32 images belonging to 10 different classes (6,000 images per class). 11 |
The CIFAR-10 dataset includes: 12 | 13 | * 40,000 images for training 14 | * 10,000 images for validation 15 | * 10,000 images for test 16 | 17 | Here are the classes in the dataset, as well as 10 random images from each: 18 | ![cifar10](https://camo.githubusercontent.com/a426b9aca74c978ecc8b093dddc540f113591858/68747470733a2f2f6d616574333630382e6769746875622e696f2f6e7574732d6d6c2f5f696d616765732f636966617231302e706e67) 19 | 20 | {{% notice info %}} 21 | **Note:** Although the dataset is small and the problem is simple, all the steps we'll take can easily be applied to large datasets that don't fit in memory. Amazon SageMaker has native pipe-mode support to stream the dataset directly from S3 to the training instances. With Amazon EKS, we'll set up an Amazon FSx for Lustre file system that's accessible to every worker. 22 | {{% /notice %}} 23 | -------------------------------------------------------------------------------- /content/update_code_dist_training/single_instance_script.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Getting familiar with a single instance training script" 3 | date: 2019-10-28T02:38:32-07:00 4 | weight: 3 5 | --- 6 | 7 | ## Single CPU/GPU training on the local instance 8 | 9 |
In this section you'll get familiar with the training script we'll be converting for distributed training in the next notebook. 10 | 11 | Open `cifar10-single-instance.ipynb` and run through the cells. The notebook is located at:
12 | ***distributed-training-workshop > notebooks > part-1-horovod*** 13 | 14 | ![single_instance](/images/convert_script/single_instance.png) 15 | 16 | {{% notice warning %}} 17 | **Stop:** Do this section in JupyterLab. Below is a copy of the Jupyter notebook for reference.
18 | {{% /notice %}} 19 | 20 | 21 | ---- 22 | #### Below is a copy of the Jupyter notebook `cifar10-single-instance.ipynb` 23 | 24 | ---- 25 | 26 | This Jupyter notebook contains code that trains a DNN on the CIFAR10 dataset. 27 | The script was written for local training on a single instance. Run through this notebook either cell-by-cell or by hitting *Run > Run All Cells* 28 | 29 | Once you start to feel comfortable with what the script is doing, we'll then start to make changes to this script so that it can run on a cluster in a distributed fashion. 30 | 31 | **Step 1:** Import essentials packages and define constants 32 | 33 | 34 | ```python 35 | import tensorflow as tf 36 | import argparse 37 | from datetime import datetime 38 | import os 39 | from tensorflow import keras 40 | from tensorflow.keras.layers import Input, Dense, Flatten 41 | from tensorflow.keras.models import Model 42 | from tensorflow.keras.optimizers import Adam, SGD 43 | from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint 44 | 45 | # Import DNN model definition file 46 | from model_def import get_model 47 | 48 | HEIGHT = 32 49 | WIDTH = 32 50 | DEPTH = 3 51 | NUM_CLASSES = 10 52 | NUM_TRAIN_IMAGES = 40000 53 | NUM_VALID_IMAGES = 10000 54 | NUM_TEST_IMAGES = 10000 55 | ``` 56 | 57 | **Step 2:** Define functions used to load and prepare dataset for training. We incorporate 3 types of data augmentation schemes: random resize, random crop, random flip. Feel free to update this if you're comfortable. Leave the cell as it is if you aren't comfortable making changes. 58 | 59 | 60 | ```python 61 | def train_preprocess_fn(image): 62 | 63 | # Resize the image to add four extra pixels on each side. 64 | image = tf.image.resize_image_with_crop_or_pad(image, HEIGHT + 8, WIDTH + 8) 65 | 66 | # Randomly crop a [HEIGHT, WIDTH] section of the image. 67 | image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH]) 68 | 69 | # Randomly flip the image horizontally. 70 | image = tf.image.random_flip_left_right(image) 71 | 72 | return image 73 | 74 | def make_batch(filenames, batch_size): 75 | """Read the images and labels from 'filenames'.""" 76 | # Repeat infinitely. 77 | dataset = tf.data.TFRecordDataset(filenames).repeat() 78 | 79 | # Parse records. 80 | dataset = dataset.map(single_example_parser, num_parallel_calls=1) 81 | 82 | # Batch it up. 83 | dataset = dataset.batch(batch_size, drop_remainder=True) 84 | iterator = dataset.make_one_shot_iterator() 85 | 86 | image_batch, label_batch = iterator.get_next() 87 | return image_batch, label_batch 88 | 89 | def single_example_parser(serialized_example): 90 | """Parses a single tf.Example into image and label tensors.""" 91 | # Dimensions of the images in the CIFAR-10 dataset. 92 | # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the 93 | # input format. 94 | features = tf.parse_single_example( 95 | serialized_example, 96 | features={ 97 | 'image': tf.FixedLenFeature([], tf.string), 98 | 'label': tf.FixedLenFeature([], tf.int64), 99 | }) 100 | image = tf.decode_raw(features['image'], tf.uint8) 101 | image.set_shape([DEPTH * HEIGHT * WIDTH]) 102 | 103 | # Reshape from [depth * height * width] to [depth, height, width]. 
104 | image = tf.cast( 105 | tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]), 106 | tf.float32) 107 | label = tf.cast(features['label'], tf.int32) 108 | 109 | image = train_preprocess_fn(image) 110 | label = tf.one_hot(label, NUM_CLASSES) 111 | 112 | return image, label 113 | ``` 114 | 115 | **Step 3:** 116 | * Define hyperameters, directories for train, validation and test. 117 | * Load model from model_def.py 118 | * Compile model and fit 119 | 120 | 121 | ```python 122 | # Hyper-parameters 123 | epochs = 1 124 | lr = 0.01 125 | batch_size = 128 126 | momentum = 0.9 127 | weight_decay = 2e-4 128 | optimizer = 'sgd' 129 | gpu_count = 1 130 | 131 | # Data directories and other options 132 | checkpoint_dir = '../ckpt_dir' 133 | if not os.path.exists(checkpoint_dir): 134 | os.makedirs(checkpoint_dir) 135 | 136 | train_dir = '../dataset/train' 137 | validation_dir = '../dataset/validation' 138 | eval_dir = '../dataset/eval' 139 | 140 | train_dataset = make_batch(train_dir+'/train.tfrecords', batch_size) 141 | val_dataset = make_batch(validation_dir+'/validation.tfrecords', batch_size) 142 | eval_dataset = make_batch(eval_dir+'/eval.tfrecords', batch_size) 143 | ``` 144 | 145 | 146 | ```python 147 | model = get_model(lr, weight_decay, optimizer, momentum) 148 | opt = SGD(lr=lr, decay=weight_decay, momentum=momentum) 149 | ``` 150 | 151 | 152 | ```python 153 | model.compile(loss='categorical_crossentropy', 154 | optimizer=opt, 155 | metrics=['accuracy']) 156 | ``` 157 | 158 | 159 | ```python 160 | # Compile model 161 | model.compile(optimizer=SGD(lr=lr, decay=weight_decay, momentum=momentum), 162 | loss='categorical_crossentropy', 163 | metrics=['accuracy']) 164 | ``` 165 | 166 | 167 | ```python 168 | %%time 169 | # Train model 170 | history = model.fit(x=train_dataset[0], y=train_dataset[1], 171 | steps_per_epoch=NUM_TRAIN_IMAGES // batch_size, 172 | validation_data=val_dataset, 173 | validation_steps=NUM_VALID_IMAGES // batch_size, 174 | epochs=epochs, 175 | callbacks=[ModelCheckpoint(checkpoint_dir + '/checkpoint-{epoch}.h5')]) 176 | ``` 177 | 178 | 179 | ```python 180 | # Evaluate model performance 181 | score = model.evaluate(eval_dataset[0], 182 | eval_dataset[1], 183 | steps=NUM_TEST_IMAGES // batch_size, 184 | verbose=0) 185 | print('Test loss :', score[0]) 186 | print('Test accuracy:', score[1]) 187 | ``` 188 | 189 | 190 | ---- 191 | ##### Now that you have a successfully working training script, open `cifar10-distributed.ipynb` and start converting it for distributed training 192 | -------------------------------------------------------------------------------- /layouts/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ partial "meta.html" . }} {{ partial "favicon.html" . }} {{ .Scratch.Add "title" "" }}{{ if eq .Site.Data.titles .Title }}{{ .Scratch.Set "title" (index .Site.Data.titles .Title).title }}{{ else }}{{ .Scratch.Set "title" .Title}}{{end}} 6 | {{ .Scratch.Get "title" }} 7 | 8 | {{ $assetBusting := not .Site.Params.disableAssetsBusting }} 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | {{with .Site.Params.themeVariant}} 17 | 18 | {{end}} 19 | 34 | {{ partial "custom-header.html" . }} 35 | 36 | 37 | 38 | 39 | 40 | 41 |
42 |
43 |
44 |
45 |

{{T "title-404"}}

46 |

47 |

48 |

{{T "message-404"}}

49 |

50 |

{{T "Go-to-homepage"}}

51 |

52 |
53 |
54 | 55 |
56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /layouts/partials/custom-footer.html: -------------------------------------------------------------------------------- 1 | {{ template "_internal/google_analytics.html" . }} 2 | -------------------------------------------------------------------------------- /layouts/partials/favicon.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /layouts/partials/footer.html: -------------------------------------------------------------------------------- 1 | {{ if .Params.chapter }} 2 | 3 | {{ end }} 4 | 5 | {{ partial "custom-comments.html" . }} 6 | 7 | 8 | 50 | 51 | 52 | 53 |
54 |
55 |
56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 78 | {{ partial "custom-footer.html" . }} 79 | 80 | 81 | -------------------------------------------------------------------------------- /layouts/partials/google.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /layouts/partials/header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | {{ .Hugo.Generator }} 12 | {{ partial "meta.html" . }} 13 | {{ partial "favicon.html" . }} 14 | {{ partials "google/analytics" . }} 15 | {{ .Title }} :: {{ .Site.Title }} 16 | 17 | {{ $assetBusting := not .Site.Params.disableAssetsBusting }} 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | {{with .Site.Params.themeVariant}} 28 | 29 | {{end}} 30 | 31 | 32 | 33 | 36 | 37 | 47 | {{ partial "custom-header.html" . }} 48 | 49 | 50 | {{ partial "menu.html" . }} 51 |
52 |
53 |
54 | {{if not .IsHome}} 55 |
56 |
57 | {{ if and (or .IsPage .IsSection) .Site.Params.editURL }} 58 | {{ $File := .File }} 59 | {{ $Site := .Site }} 60 | {{with $File.Path }} 61 | 67 | {{ end }} 68 | {{ end }} 69 | {{$toc := (and (not .Params.disableToc) (not .Params.chapter))}} 70 | 83 | {{ if $toc }} 84 | {{ partial "toc.html" . }} 85 | {{ end }} 86 |
87 |
88 | {{ end }} 89 | 90 | {{ if .Params.chapter }} 91 |
92 | {{ end }} 93 |
94 | {{if and (not .IsHome) (not .Params.chapter) }} 95 |

{{.Title}}

96 | {{end}} 97 | 98 | {{define "breadcrumb"}} 99 | {{$parent := .page.Parent }} 100 | {{ if $parent }} 101 | {{ $value := (printf "%s > %s" $parent.URL $parent.Title .value) }} 102 | {{ template "breadcrumb" dict "page" $parent "value" $value }} 103 | {{else}} 104 | {{.value|safeHTML}} 105 | {{end}} 106 | {{end}} 107 | -------------------------------------------------------------------------------- /layouts/partials/logo.html: -------------------------------------------------------------------------------- 1 | AWS-Logo_White-Color 2 | -------------------------------------------------------------------------------- /layouts/partials/menu-footer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

GitHub Repo

4 | 5 |

Distributed training workshop

6 | Star 7 |  Fork 8 | 9 |
10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /layouts/shortcodes/cf-download.html: -------------------------------------------------------------------------------- 1 | 4 | 5 | Download 6 | 7 | -------------------------------------------------------------------------------- /layouts/shortcodes/cf-launch.html: -------------------------------------------------------------------------------- 1 | 4 | 5 | Launch 6 | 7 | -------------------------------------------------------------------------------- /layouts/shortcodes/ghcontributors.html: -------------------------------------------------------------------------------- 1 | 22 |
23 | {{ $url := .Get 0 }} 24 | {{ range getJSON $url }} 25 |
26 | 27 | 28 | {{.contributions}} commits 29 |
30 | {{ end }} 31 |
32 | -------------------------------------------------------------------------------- /layouts/shortcodes/github.html: -------------------------------------------------------------------------------- 1 | {{ .Get 0 }} 2 | -------------------------------------------------------------------------------- /layouts/shortcodes/mermaid.html: -------------------------------------------------------------------------------- 1 |
{{ safeHTML .Inner }}
2 | -------------------------------------------------------------------------------- /layouts/shortcodes/surveymonkey.html: -------------------------------------------------------------------------------- 1 | Create your own user feedback survey 2 | -------------------------------------------------------------------------------- /layouts/shortcodes/tab.html: -------------------------------------------------------------------------------- 1 | {{ if .Parent }} 2 | {{ $name := trim (.Get "name") " " }} 3 | {{ $include := trim (.Get "include") " "}} 4 | {{ $codelang := .Get "codelang" }} 5 | {{ if not (.Parent.Scratch.Get "tabs") }} 6 | {{ .Parent.Scratch.Set "tabs" slice }} 7 | {{ end }} 8 | {{ with .Inner }} 9 | {{ if $codelang }} 10 | {{ $.Parent.Scratch.Add "tabs" (dict "name" $name "content" (highlight . $codelang "") ) }} 11 | {{ else }} 12 | {{ $.Parent.Scratch.Add "tabs" (dict "name" $name "content" . ) }} 13 | {{ end }} 14 | {{ else }} 15 | {{ $.Parent.Scratch.Add "tabs" (dict "name" $name "include" $include "codelang" $codelang) }} 16 | {{ end }} 17 | {{ else }} 18 | {{- errorf "[%s] %q: tab shortcode missing its parent" .Page.Site.Language.Lang .Page.Path -}} 19 | {{ end}} -------------------------------------------------------------------------------- /layouts/shortcodes/tabs.html: -------------------------------------------------------------------------------- 1 | {{ .Page.Scratch.Add "tabset-counter" 1 }} 2 | {{ $tab_set_id := .Get "name" | default (printf "tabset-%s-%d" (.Page.RelPermalink) (.Page.Scratch.Get "tabset-counter") ) | anchorize }} 3 | {{ $tabs := .Scratch.Get "tabs" }} 4 | {{ if .Inner }}{{/* We don't use the inner content, but Hugo will complain if we don't reference it. */}}{{ end }} 5 |
6 |
    7 | {{ range $i, $e := $tabs }} 8 | {{ $id := printf "%s-%d" $tab_set_id $i }} 9 |
  • {{ trim .name " " }}
  • 10 | {{ end }} 11 |
12 | {{ range $i, $e := $tabs }} 13 | {{ $id := printf "%s-%d" $tab_set_id $i }} 14 |
15 | {{ with .content }} 16 | {{ . }} 17 | {{ else }} 18 | {{ if eq $.Page.BundleType "leaf" }} 19 | {{/* find the file somewhere inside the bundle. Note the use of double asterisk */}} 20 | {{ with $.Page.Resources.GetMatch (printf "**%s*" .include) }} 21 | {{ if ne .ResourceType "page" }} 22 | {{/* Assume it is a file that needs code highlighting. */}} 23 | {{ $codelang := $e.codelang | default ( path.Ext .Name | strings.TrimPrefix ".") }} 24 | {{ highlight .Content $codelang "" }} 25 | {{ else}} 26 | {{ .Content }} 27 | {{ end }} 28 | {{ end }} 29 | {{ else}} 30 | {{ $path := path.Join $.Page.Dir .include }} 31 | {{ $page := $.Page.Site.GetPage "page" $path }} 32 | {{ with $page }} 33 | {{ .Content }} 34 | {{ else }} 35 | {{ errorf "[%s] tabs include not found for path %q" $.Page.Site.Language.Lang $path}} 36 | {{ end }} 37 | {{ end }} 38 | {{ end }} 39 |
40 | {{ end }} 41 |
42 | {{ $elem := $tab_set_id | safeJS }} 43 | 44 | -------------------------------------------------------------------------------- /layouts/shortcodes/year.html: -------------------------------------------------------------------------------- 1 | {{ .Page.Now.Year }} 2 | 3 | -------------------------------------------------------------------------------- /notebooks/generate_cifar10_tfrecords.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Read CIFAR-10 data from pickled numpy arrays and writes TFRecords. 16 | 17 | Generates tf.train.Example protos and writes them to TFRecord files from the 18 | python version of the CIFAR-10 dataset downloaded from 19 | https://www.cs.toronto.edu/~kriz/cifar.html. 20 | """ 21 | 22 | from __future__ import absolute_import 23 | from __future__ import division 24 | from __future__ import print_function 25 | 26 | import argparse 27 | import os 28 | import sys 29 | 30 | import tarfile 31 | from six.moves import cPickle as pickle 32 | from six.moves import xrange # pylint: disable=redefined-builtin 33 | import tensorflow as tf 34 | 35 | tf.logging.set_verbosity(tf.logging.ERROR) 36 | if type(tf.contrib) != type(tf): tf.contrib._warning = None 37 | 38 | CIFAR_FILENAME = 'cifar-10-python.tar.gz' 39 | CIFAR_DOWNLOAD_URL = 'https://www.cs.toronto.edu/~kriz/' + CIFAR_FILENAME 40 | CIFAR_LOCAL_FOLDER = 'cifar-10-batches-py' 41 | 42 | 43 | def download_and_extract(data_dir): 44 | # download CIFAR-10 if not already downloaded. 
45 | tf.contrib.learn.datasets.base.maybe_download(CIFAR_FILENAME, data_dir, 46 | CIFAR_DOWNLOAD_URL) 47 | tarfile.open(os.path.join(data_dir, CIFAR_FILENAME), 48 | 'r:gz').extractall(data_dir) 49 | 50 | 51 | def _int64_feature(value): 52 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 53 | 54 | 55 | def _bytes_feature(value): 56 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 57 | 58 | 59 | def _get_file_names(): 60 | """Returns the file names expected to exist in the input_dir.""" 61 | file_names = {} 62 | file_names['train'] = ['data_batch_%d' % i for i in xrange(1, 5)] 63 | file_names['validation'] = ['data_batch_5'] 64 | file_names['eval'] = ['test_batch'] 65 | return file_names 66 | 67 | 68 | def read_pickle_from_file(filename): 69 | with tf.gfile.Open(filename, 'rb') as f: 70 | if sys.version_info >= (3, 0): 71 | data_dict = pickle.load(f, encoding='bytes') 72 | else: 73 | data_dict = pickle.load(f) 74 | return data_dict 75 | 76 | 77 | def convert_to_tfrecord(input_files, output_file): 78 | """Converts a file to TFRecords.""" 79 | print('Generating %s' % output_file) 80 | with tf.python_io.TFRecordWriter(output_file) as record_writer: 81 | for input_file in input_files: 82 | data_dict = read_pickle_from_file(input_file) 83 | data = data_dict[b'data'] 84 | labels = data_dict[b'labels'] 85 | 86 | num_entries_in_batch = len(labels) 87 | for i in range(num_entries_in_batch): 88 | example = tf.train.Example(features=tf.train.Features( 89 | feature={ 90 | 'image': _bytes_feature(data[i].tobytes()), 91 | 'label': _int64_feature(labels[i]) 92 | })) 93 | record_writer.write(example.SerializeToString()) 94 | 95 | 96 | def main(data_dir): 97 | print('Download from {} and extract.'.format(CIFAR_DOWNLOAD_URL)) 98 | download_and_extract(data_dir) 99 | file_names = _get_file_names() 100 | input_dir = os.path.join(data_dir, CIFAR_LOCAL_FOLDER) 101 | for mode, files in file_names.items(): 102 | input_files = [os.path.join(input_dir, f) for f in files] 103 | output_file = os.path.join(data_dir+'/'+mode, mode + '.tfrecords') 104 | if not os.path.exists(data_dir+'/'+mode): 105 | os.makedirs(data_dir+'/'+mode) 106 | try: 107 | os.remove(output_file) 108 | except OSError: 109 | pass 110 | # Convert to tf.train.Example and write the to TFRecords. 111 | convert_to_tfrecord(input_files, output_file) 112 | print('Done!') 113 | import shutil 114 | shutil.rmtree(data_dir+'/cifar-10-batches-py') 115 | os.remove(data_dir+'/cifar-10-python.tar.gz') 116 | 117 | 118 | if __name__ == '__main__': 119 | parser = argparse.ArgumentParser() 120 | parser.add_argument( 121 | '--data-dir', 122 | type=str, 123 | default='', 124 | help='Directory to download and extract CIFAR-10 to.') 125 | 126 | args = parser.parse_args() 127 | main(args.data_dir) 128 | -------------------------------------------------------------------------------- /notebooks/part-1-horovod/cifar10-distributed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Exercise 1: Convert training script to use horovod" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "You'll need to make the following modifications to your training script to use horovod for distributed training.\n", 15 | "\n", 16 | "1. Run hvd.init()\n", 17 | "2. Pin a server GPU to be used by this process using config.gpu_options.visible_device_list.\n", 18 | "3. 
Scale the learning rate by the number of workers.\n", 19 | "4. Wrap the optimizer in hvd.DistributedOptimizer.\n", 20 | "5. Add hvd.callbacks.BroadcastGlobalVariablesCallback(0) to broadcast initial variable states from rank 0 to all other processes.\n", 21 | "6. Modify your code to save checkpoints only on worker 0 to prevent other workers from corrupting them." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "Look for cells that say **Change X** and fill in those cells with the modifications - where **X** is the change number. There are a total of 8 changes.\n", 29 | "Click on **Solution** to see the answers.\n", 30 | "\n", 31 | "After you've finished making the necessary changes, run the script by hitting *Run > Run All Cells*.\n", 32 | "\n", 33 | "**Confirm that the script still runs after introducing the horovod API**" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "#### Change 1: Import horovod and keras backend" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import tensorflow as tf\n", 50 | "\n", 51 | "\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "
Solution\n", 59 | "
\n",
 60 |     "import horovod.tensorflow.keras as hvd\n",
 61 |     "import tensorflow.keras.backend as K\n",
 62 |     "
\n", 63 | "
" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "from datetime import datetime\n", 73 | "import argparse\n", 74 | "import os\n", 75 | "import numpy as np\n", 76 | "import codecs\n", 77 | "import json\n", 78 | "\n", 79 | "from tensorflow import keras\n", 80 | "from tensorflow.keras.layers import Input, Dense, Flatten\n", 81 | "from tensorflow.keras.models import Model\n", 82 | "from tensorflow.keras.optimizers import Adam, SGD\n", 83 | "from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint\n", 84 | "from model_def import get_model\n", 85 | " \n", 86 | "HEIGHT = 32\n", 87 | "WIDTH = 32\n", 88 | "DEPTH = 3\n", 89 | "NUM_CLASSES = 10\n", 90 | "NUM_TRAIN_IMAGES = 40000\n", 91 | "NUM_VALID_IMAGES = 10000\n", 92 | "NUM_TEST_IMAGES = 10000" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "def train_preprocess_fn(image):\n", 102 | "\n", 103 | " # Resize the image to add four extra pixels on each side.\n", 104 | " image = tf.image.resize_image_with_crop_or_pad(image, HEIGHT + 8, WIDTH + 8)\n", 105 | "\n", 106 | " # Randomly crop a [HEIGHT, WIDTH] section of the image.\n", 107 | " image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH])\n", 108 | "\n", 109 | " # Randomly flip the image horizontally.\n", 110 | " image = tf.image.random_flip_left_right(image)\n", 111 | "\n", 112 | " return image" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "def make_batch(filenames, batch_size):\n", 122 | " \"\"\"Read the images and labels from 'filenames'.\"\"\"\n", 123 | " # Repeat infinitely.\n", 124 | " dataset = tf.data.TFRecordDataset(filenames).repeat()\n", 125 | "\n", 126 | " # Parse records.\n", 127 | " dataset = dataset.map(single_example_parser, num_parallel_calls=1)\n", 128 | "\n", 129 | " # Batch it up.\n", 130 | " dataset = dataset.batch(batch_size, drop_remainder=True)\n", 131 | " iterator = dataset.make_one_shot_iterator()\n", 132 | "\n", 133 | " image_batch, label_batch = iterator.get_next()\n", 134 | " return image_batch, label_batch" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "def single_example_parser(serialized_example):\n", 144 | " \"\"\"Parses a single tf.Example into image and label tensors.\"\"\"\n", 145 | " # Dimensions of the images in the CIFAR-10 dataset.\n", 146 | " # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the\n", 147 | " # input format.\n", 148 | " features = tf.parse_single_example(\n", 149 | " serialized_example,\n", 150 | " features={\n", 151 | " 'image': tf.FixedLenFeature([], tf.string),\n", 152 | " 'label': tf.FixedLenFeature([], tf.int64),\n", 153 | " })\n", 154 | " image = tf.decode_raw(features['image'], tf.uint8)\n", 155 | " image.set_shape([DEPTH * HEIGHT * WIDTH])\n", 156 | "\n", 157 | " # Reshape from [depth * height * width] to [depth, height, width].\n", 158 | " image = tf.cast(\n", 159 | " tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]),\n", 160 | " tf.float32)\n", 161 | " label = tf.cast(features['label'], tf.int32)\n", 162 | " \n", 163 | " image = train_preprocess_fn(image)\n", 164 | " label = tf.one_hot(label, NUM_CLASSES)\n", 165 | " \n", 166 | " return image, label" 167 | ] 168 | }, 169 | { 170 | "cell_type": 
"code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# Hyper-parameters\n", 176 | "epochs = 1\n", 177 | "lr = 0.01\n", 178 | "batch_size = 128\n", 179 | "momentum = 0.9\n", 180 | "weight_decay = 2e-4\n", 181 | "optimizer = 'sgd'\n", 182 | "gpu_count = 1\n", 183 | "\n", 184 | "# Data directories and other options\n", 185 | "checkpoint_dir = '../ckpt_dir'\n", 186 | "if not os.path.exists(checkpoint_dir):\n", 187 | " os.makedirs(checkpoint_dir)\n", 188 | "\n", 189 | "train_dir = '../dataset/train'\n", 190 | "validation_dir = '../dataset/validation'\n", 191 | "eval_dir = '../dataset/eval'\n", 192 | "\n", 193 | "train_dataset = make_batch(train_dir+'/train.tfrecords', batch_size)\n", 194 | "val_dataset = make_batch(validation_dir+'/validation.tfrecords', batch_size)\n", 195 | "eval_dataset = make_batch(eval_dir+'/eval.tfrecords', batch_size)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "#### Change 2: Initialize horovod and get the size of the cluster" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "\n", 212 | "\n", 213 | "\n", 214 | "\n", 215 | "\n" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "
Solution\n", 223 | "
\n",
224 |     "hvd.init()\n",
225 |     "size = hvd.size()\n",
226 |     "
\n", 227 | "
" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "#### Change 3 - Pin GPU to be used to process local rank (one GPU per process)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "\n", 244 | "\n", 245 | "\n", 246 | "\n" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "
Solution\n", 254 | "
\n",
255 |     "config = tf.ConfigProto()\n",
256 |     "config.gpu_options.allow_growth = True\n",
257 |     "config.gpu_options.visible_device_list = str(hvd.local_rank())\n",
258 |     "K.set_session(tf.Session(config=config))\n",
259 |     "
\n", 260 | "
" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "model = get_model(lr, weight_decay, optimizer, momentum)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "#### Change 4: How will you update the learning rate for distributed training? What changes should you make to the following command?" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "opt = SGD(lr=lr, decay=weight_decay, momentum=momentum)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "
Solution\n", 293 | "
\n",
294 |     "opt = SGD(lr=lr * size, decay=weight_decay, momentum=momentum)\n",
295 |     "\n",
296 |     "You need to scale the learning using the size of the cluster (total number of workers)\n",
297 |     "
\n", 298 | "
" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "#### Change 6: How will you convert the optimizer to distributed optimizer?" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "model.compile(loss='categorical_crossentropy',\n", 315 | " optimizer=opt,\n", 316 | " metrics=['accuracy'])" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "
Solution\n", 324 | "
\n",
325 |     "opt = hvd.DistributedOptimizer(opt)\n",
326 |     "model.compile(loss='categorical_crossentropy',\n",
327 |     "              optimizer=opt,\n",
328 |     "              metrics=['accuracy'])\n",
329 |     "
\n", 330 | "
" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "#### Change 7: Add callbacks for syncing initial state, and saving checkpoints only on 1st worker" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "\n", 347 | "\n", 348 | "\n" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "
Solution\n", 356 | "
\n",
357 |     "callbacks = []\n",
358 |     "callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))\n",
359 |     "callbacks.append(hvd.callbacks.MetricAverageCallback())\n",
360 |     "callbacks.append(hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))\n",
361 |     "callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))\n",
362 |     "if hvd.rank() == 0:\n",
363 |     "    callbacks.append(ModelCheckpoint('ckpt_dir' + '/checkpoint-{epoch}.h5'))\n",
364 |     "
\n", 365 | "
" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "#### Change 8: Update the number of steps/epoch" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "%%time\n", 382 | "# Train model\n", 383 | "history = model.fit(x=train_dataset[0], y=train_dataset[1],\n", 384 | " steps_per_epoch=NUM_TRAIN_IMAGES // batch_size,\n", 385 | " validation_data=val_dataset,\n", 386 | " validation_steps=NUM_VALID_IMAGES // batch_size,\n", 387 | " epochs=epochs, \n", 388 | " callbacks=[ModelCheckpoint(checkpoint_dir + '/checkpoint-{epoch}.h5')])\n" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "
Solution\n", 396 | "
\n",
397 |     "history = model.fit(x=train_dataset[0], y=train_dataset[1],\n",
398 |     "                    steps_per_epoch=(NUM_TRAIN_IMAGES // batch_size) // size,\n",
399 |     "                    validation_data=val_dataset,\n",
400 |     "                    validation_steps=(NUM_VALID_IMAGES // batch_size) // size,\n",
401 |     "                    epochs=epochs, callbacks=callbacks)\n",
402 |     "
\n", 403 | "
" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "# Evaluate model performance\n", 413 | "score = model.evaluate(eval_dataset[0],\n", 414 | " eval_dataset[1],\n", 415 | " steps=NUM_TEST_IMAGES // batch_size,\n", 416 | " verbose=0)\n", 417 | "print('Test loss :', score[0])\n", 418 | "print('Test accuracy:', score[1])" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": {}, 424 | "source": [ 425 | "Note once these changes are made, you can convert the jupyter notebook into a python training script by running:\n", 426 | " $ jupyter nbconvert --to script notebook_name.ipynb " 427 | ] 428 | } 429 | ], 430 | "metadata": { 431 | "kernelspec": { 432 | "display_name": "conda_tensorflow_p36", 433 | "language": "python", 434 | "name": "conda_tensorflow_p36" 435 | }, 436 | "language_info": { 437 | "codemirror_mode": { 438 | "name": "ipython", 439 | "version": 3 440 | }, 441 | "file_extension": ".py", 442 | "mimetype": "text/x-python", 443 | "name": "python", 444 | "nbconvert_exporter": "python", 445 | "pygments_lexer": "ipython3", 446 | "version": "3.6.5" 447 | } 448 | }, 449 | "nbformat": 4, 450 | "nbformat_minor": 4 451 | } 452 | -------------------------------------------------------------------------------- /notebooks/part-1-horovod/cifar10-single-instance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Single CPU/GPU training on the local instance\n", 8 | "This Jupyter notebook contains code that trains a DNN on the CIFAR10 dataset.\n", 9 | "The CIFAR-10 dataset consists of 60,000 32x32 images belonging to 10 different classes (6,000 images per class).\n", 10 | "\n", 11 | "This script was written for local training on a single instance. Run through this notebook either cell-by-cell or by hitting *Run > Run All Cells*\n", 12 | "\n", 13 | "Once you start to feel comfortable with what the script is doing, we'll then start to make changes to this script so that it can run on a cluster in a distributed fashion." 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "**Step 1:** Import essentials packages and define constants" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import tensorflow as tf\n", 30 | "import argparse\n", 31 | "from datetime import datetime\n", 32 | "import os\n", 33 | "from tensorflow import keras\n", 34 | "from tensorflow.keras.layers import Input, Dense, Flatten\n", 35 | "from tensorflow.keras.models import Model\n", 36 | "from tensorflow.keras.optimizers import Adam, SGD\n", 37 | "from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint\n", 38 | "\n", 39 | "# Import DNN model definition file\n", 40 | "from model_def import get_model\n", 41 | "\n", 42 | "HEIGHT = 32\n", 43 | "WIDTH = 32\n", 44 | "DEPTH = 3\n", 45 | "NUM_CLASSES = 10\n", 46 | "NUM_TRAIN_IMAGES = 40000\n", 47 | "NUM_VALID_IMAGES = 10000\n", 48 | "NUM_TEST_IMAGES = 10000" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "**Step 2:** Define functions used to load and prepare dataset for training. We incorporate 3 types of data augmentation schemes: random resize, random crop, random flip. Feel free to update this if you're comfortable. 
Leave the cell as it is if you aren't comfortable making changes." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "def train_preprocess_fn(image):\n", 65 | "\n", 66 | " # Resize the image to add four extra pixels on each side.\n", 67 | " image = tf.image.resize_image_with_crop_or_pad(image, HEIGHT + 8, WIDTH + 8)\n", 68 | "\n", 69 | " # Randomly crop a [HEIGHT, WIDTH] section of the image.\n", 70 | " image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH])\n", 71 | "\n", 72 | " # Randomly flip the image horizontally.\n", 73 | " image = tf.image.random_flip_left_right(image)\n", 74 | "\n", 75 | " return image\n", 76 | "\n", 77 | "def make_batch(filenames, batch_size):\n", 78 | " \"\"\"Read the images and labels from 'filenames'.\"\"\"\n", 79 | " # Repeat infinitely.\n", 80 | " dataset = tf.data.TFRecordDataset(filenames).repeat()\n", 81 | "\n", 82 | " # Parse records.\n", 83 | " dataset = dataset.map(single_example_parser, num_parallel_calls=1)\n", 84 | "\n", 85 | " # Batch it up.\n", 86 | " dataset = dataset.batch(batch_size, drop_remainder=True)\n", 87 | " iterator = dataset.make_one_shot_iterator()\n", 88 | "\n", 89 | " image_batch, label_batch = iterator.get_next()\n", 90 | " return image_batch, label_batch\n", 91 | "\n", 92 | "def single_example_parser(serialized_example):\n", 93 | " \"\"\"Parses a single tf.Example into image and label tensors.\"\"\"\n", 94 | " # Dimensions of the images in the CIFAR-10 dataset.\n", 95 | " # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the\n", 96 | " # input format.\n", 97 | " features = tf.parse_single_example(\n", 98 | " serialized_example,\n", 99 | " features={\n", 100 | " 'image': tf.FixedLenFeature([], tf.string),\n", 101 | " 'label': tf.FixedLenFeature([], tf.int64),\n", 102 | " })\n", 103 | " image = tf.decode_raw(features['image'], tf.uint8)\n", 104 | " image.set_shape([DEPTH * HEIGHT * WIDTH])\n", 105 | "\n", 106 | " # Reshape from [depth * height * width] to [depth, height, width].\n", 107 | " image = tf.cast(\n", 108 | " tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]),\n", 109 | " tf.float32)\n", 110 | " label = tf.cast(features['label'], tf.int32)\n", 111 | " \n", 112 | " image = train_preprocess_fn(image)\n", 113 | " label = tf.one_hot(label, NUM_CLASSES)\n", 114 | " \n", 115 | " return image, label" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "**Step 3:** \n", 123 | "* Define hyperameters, directories for train, validation and test.\n", 124 | "* Load model from model_def.py\n", 125 | "* Compile model and fit" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# Hyper-parameters\n", 135 | "epochs = 1\n", 136 | "lr = 0.01\n", 137 | "batch_size = 128\n", 138 | "momentum = 0.9\n", 139 | "weight_decay = 2e-4\n", 140 | "optimizer = 'sgd'\n", 141 | "gpu_count = 1\n", 142 | "\n", 143 | "# Data directories and other options\n", 144 | "checkpoint_dir = '../ckpt_dir'\n", 145 | "if not os.path.exists(checkpoint_dir):\n", 146 | " os.makedirs(checkpoint_dir)\n", 147 | " \n", 148 | "train_dir = '../dataset/train'\n", 149 | "validation_dir = '../dataset/validation'\n", 150 | "eval_dir = '../dataset/eval'\n", 151 | "\n", 152 | "train_dataset = make_batch(train_dir+'/train.tfrecords', batch_size)\n", 153 | "val_dataset = 
make_batch(validation_dir+'/validation.tfrecords', batch_size)\n", 154 | "eval_dataset = make_batch(eval_dir+'/eval.tfrecords', batch_size)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "model = get_model(lr, weight_decay, optimizer, momentum)\n", 164 | "opt = SGD(lr=lr, decay=weight_decay, momentum=momentum)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "model.compile(loss='categorical_crossentropy',\n", 174 | " optimizer=opt,\n", 175 | " metrics=['accuracy'])" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "# Compile model\n", 185 | "model.compile(optimizer=SGD(lr=lr, decay=weight_decay, momentum=momentum),\n", 186 | " loss='categorical_crossentropy',\n", 187 | " metrics=['accuracy'])" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "%%time\n", 197 | "# Train model\n", 198 | "history = model.fit(x=train_dataset[0], y=train_dataset[1],\n", 199 | " steps_per_epoch=NUM_TRAIN_IMAGES // batch_size,\n", 200 | " validation_data=val_dataset,\n", 201 | " validation_steps=NUM_VALID_IMAGES // batch_size,\n", 202 | " epochs=epochs, \n", 203 | " callbacks=[ModelCheckpoint(checkpoint_dir + '/checkpoint-{epoch}.h5')])" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "# Evaluate model performance\n", 213 | "score = model.evaluate(eval_dataset[0],\n", 214 | " eval_dataset[1],\n", 215 | " steps=NUM_TEST_IMAGES // batch_size,\n", 216 | " verbose=0)\n", 217 | "print('Test loss :', score[0])\n", 218 | "print('Test accuracy:', score[1])" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "\n", 226 | "----\n", 227 | "##### Now that you have a successfully working training script, open `cifar10-distributed.ipynb` and start converting it for distributed training" 228 | ] 229 | } 230 | ], 231 | "metadata": { 232 | "kernelspec": { 233 | "display_name": "conda_tensorflow_p36", 234 | "language": "python", 235 | "name": "conda_tensorflow_p36" 236 | }, 237 | "language_info": { 238 | "codemirror_mode": { 239 | "name": "ipython", 240 | "version": 3 241 | }, 242 | "file_extension": ".py", 243 | "mimetype": "text/x-python", 244 | "name": "python", 245 | "nbconvert_exporter": "python", 246 | "pygments_lexer": "ipython3", 247 | "version": "3.6.5" 248 | } 249 | }, 250 | "nbformat": 4, 251 | "nbformat_minor": 4 252 | } 253 | -------------------------------------------------------------------------------- /notebooks/part-1-horovod/model_def.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.layers import Activation, Conv2D, Dense, Dropout, Flatten, MaxPooling2D, BatchNormalization 3 | from tensorflow.keras.models import Sequential 4 | from tensorflow.keras.optimizers import Adam, SGD, RMSprop 5 | 6 | HEIGHT = 32 7 | WIDTH = 32 8 | DEPTH = 3 9 | NUM_CLASSES = 10 10 | 11 | def get_model(learning_rate, weight_decay, optimizer, momentum): 12 | 13 | model = Sequential() 14 | model.add(Conv2D(32, (3, 3), padding='same', input_shape=(HEIGHT, WIDTH, DEPTH))) 15 | 
model.add(BatchNormalization()) 16 | model.add(Activation('relu')) 17 | model.add(Conv2D(32, (3, 3))) 18 | model.add(BatchNormalization()) 19 | model.add(Activation('relu')) 20 | model.add(MaxPooling2D(pool_size=(2, 2))) 21 | model.add(Dropout(0.2)) 22 | 23 | model.add(Conv2D(64, (3, 3), padding='same')) 24 | model.add(BatchNormalization()) 25 | model.add(Activation('relu')) 26 | model.add(Conv2D(64, (3, 3))) 27 | model.add(BatchNormalization()) 28 | model.add(Activation('relu')) 29 | model.add(MaxPooling2D(pool_size=(2, 2))) 30 | model.add(Dropout(0.3)) 31 | 32 | model.add(Conv2D(128, (3, 3), padding='same')) 33 | model.add(BatchNormalization()) 34 | model.add(Activation('relu')) 35 | model.add(Conv2D(128, (3, 3))) 36 | model.add(BatchNormalization()) 37 | model.add(Activation('relu')) 38 | model.add(MaxPooling2D(pool_size=(2, 2))) 39 | model.add(Dropout(0.4)) 40 | 41 | model.add(Flatten()) 42 | model.add(Dense(512)) 43 | model.add(Activation('relu')) 44 | model.add(Dropout(0.5)) 45 | model.add(Dense(NUM_CLASSES)) 46 | model.add(Activation('softmax')) 47 | 48 | return model 49 | 50 | -------------------------------------------------------------------------------- /notebooks/part-2-sagemaker/cifar10-sagemaker-distributed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Distributed training with Amazon SageMaker\n", 8 | "\n", 9 | "In this notebook we use the SageMaker Python SDK to set up and run a distributed training job.\n", 10 | "SageMaker makes it easy to train models across a cluster containing a large number of machines, without having to explicitly manage those resources. " 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "**Step 1:** Import essential packages, start a SageMaker session and specify the bucket name you created in the prerequisites section of this workshop." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import os\n", 27 | "import time\n", 28 | "import numpy as np\n", 29 | "import sagemaker\n", 30 | "\n", 31 | "sagemaker_session = sagemaker.Session()\n", 32 | "role = sagemaker.get_execution_role()\n", 33 | "\n", 34 | "bucket_name = ''" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "**Step 2:** Specify hyperparameters, instance type and number of instances to distribute training to. The `hvd_processes_per_host` corresponds to the number of GPUs per instance. \n", 42 | "For example, if you choose:\n", 43 | "```\n", 44 | "hvd_instance_type = 'ml.p3.8xlarge'\n", 45 | "hvd_instance_count = 2\n", 46 | "hvd_processes_per_host = 4\n", 47 | "```\n", 48 | "\n", 49 | "Since a p3.8xlarge instance has 4 GPUs, we'll be distributing training to 8 workers, 1 per GPU.\n", 50 | "This is spread across 2 instances (or nodes). SageMaker automatically takes care of spinning up these instances and making sure they can communicate with each other."
51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "hyperparameters = {'epochs': 100, \n", 60 | " 'learning-rate': 0.001,\n", 61 | " 'momentum': 0.9,\n", 62 | " 'weight-decay': 2e-4,\n", 63 | " 'optimizer': 'adam',\n", 64 | " 'batch-size' : 256}\n", 65 | "\n", 66 | "hvd_instance_type = 'ml.c5.xlarge'\n", 67 | "hvd_instance_count = 2\n", 68 | "hvd_processes_per_host = 1\n", 69 | "\n", 70 | "print('Distributed training with a total of {} workers'.format(hvd_processes_per_host*hvd_instance_count))\n", 71 | "print('{} x {} instances with {} processes per instance'.format(hvd_instance_count, hvd_instance_type, hvd_processes_per_host))" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "**Step 3:** In this cell we create a SageMaker estimator, by providing it with all the information it needs to launch instances and execute training on those instances.\n", 79 | "\n", 80 | "Since we're using horovod for distributed training, we set `distributions` to mpi, which is used by horovod.\n", 81 | "\n", 82 | "In the TensorFlow estimator call, we specify the training script under `entry_point` and its dependencies under `source_dir`. SageMaker automatically copies these files into a TensorFlow container behind the scenes, where they are executed on the training instances." 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "from sagemaker.tensorflow import TensorFlow\n", 92 | "\n", 93 | "output_path = 's3://{}/'.format(bucket_name)\n", 94 | "job_name = 'sm-dist-{}x{}-workers-'.format(hvd_instance_count, hvd_processes_per_host) + time.strftime('%Y-%m-%d-%H-%M-%S-%j', time.gmtime())\n", 95 | "model_dir = output_path + 'tensorboard_logs/' + job_name\n", 96 | "\n", 97 | "distributions = {'mpi': {\n", 98 | " 'enabled': True,\n", 99 | " 'processes_per_host': hvd_processes_per_host,\n", 100 | " 'custom_mpi_options': '-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none'\n", 101 | " }\n", 102 | " }\n", 103 | "\n", 104 | "estimator_hvd = TensorFlow(base_job_name='hvd-cifar10-tf',\n", 105 | " source_dir='code',\n", 106 | " entry_point='cifar10-multi-gpu-horovod-sagemaker.py', \n", 107 | " role=role,\n", 108 | " framework_version='1.14',\n", 109 | " py_version='py3',\n", 110 | " hyperparameters=hyperparameters,\n", 111 | " train_instance_count=hvd_instance_count, \n", 112 | " train_instance_type=hvd_instance_type,\n", 113 | " output_path=output_path,\n", 114 | " model_dir=model_dir,\n", 115 | " tags = [{'Key' : 'Project', 'Value' : 'cifar10'},{'Key' : 'TensorBoard', 'Value' : 'dist'}],\n", 116 | " metric_definitions=[{'Name': 'val_acc', 'Regex': 'val_acc: ([0-9\\\\.]+)'}],\n", 117 | " distributions=distributions)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "**Step 4:** Specify dataset locations in Amazon S3 and then call the fit function."
125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "train_path = 's3://{}/cifar10-dataset/train'.format(bucket_name)\n", 134 | "val_path = 's3://{}/cifar10-dataset/validation'.format(bucket_name)\n", 135 | "eval_path = 's3://{}/cifar10-dataset/eval/'.format(bucket_name)\n", 136 | "\n", 137 | "estimator_hvd.fit({'train': train_path,'validation': val_path,'eval': eval_path}, \n", 138 | " job_name=job_name, wait=False)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "**Note**: in the `estimator_hvd.fit()` call above, change `wait=False` to `wait=True` if you want to see the training output in the Jupyter notebook.\n", 146 | "The advantage of setting `wait=False` is that you can continue to run cells. \n", 147 | "Since we're unblocked with `wait=False`, we can now launch TensorBoard in the notebook and monitor progress." 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "**Step 5:** Monitor progress on TensorBoard. Launch TensorBoard and open the link in a new tab to visualize training progress." 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "!S3_REGION=us-west-2 tensorboard --logdir s3://{bucket_name}/tensorboard_logs/" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "Open a new browser tab and navigate to the following link to access TensorBoard:\n", 171 | "
https://***your_notebook_name***.notebook.us-west-2.sagemaker.aws/proxy/6006/\n", 172 | "

\n", 173 | "**Note:** Make sure to replace `your_notebook_name` with the name of the notebook instance. You can find the name of your notebook instance on the browser URL.\n", 174 | "
Don't forget the trailing slash at the end of the URL." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [] 183 | } 184 | ], 185 | "metadata": { 186 | "kernelspec": { 187 | "display_name": "conda_tensorflow_p36", 188 | "language": "python", 189 | "name": "conda_tensorflow_p36" 190 | }, 191 | "language_info": { 192 | "codemirror_mode": { 193 | "name": "ipython", 194 | "version": 3 195 | }, 196 | "file_extension": ".py", 197 | "mimetype": "text/x-python", 198 | "name": "python", 199 | "nbconvert_exporter": "python", 200 | "pygments_lexer": "ipython3", 201 | "version": "3.6.5" 202 | } 203 | }, 204 | "nbformat": 4, 205 | "nbformat_minor": 4 206 | } 207 | -------------------------------------------------------------------------------- /notebooks/part-2-sagemaker/code/cifar10-multi-gpu-horovod-sagemaker.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import horovod.tensorflow.keras as hvd 3 | 4 | from datetime import datetime 5 | import argparse 6 | import os 7 | import numpy as np 8 | import codecs 9 | import json 10 | import boto3 11 | 12 | import tensorflow.keras.backend as K 13 | from tensorflow import keras 14 | from tensorflow.keras.layers import Input, Dense, Flatten 15 | from tensorflow.keras.models import Model 16 | from tensorflow.keras.utils import multi_gpu_model 17 | from tensorflow.keras.optimizers import Adam, SGD 18 | from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint 19 | from model_def import get_model 20 | 21 | HEIGHT = 32 22 | WIDTH = 32 23 | DEPTH = 3 24 | NUM_CLASSES = 10 25 | NUM_TRAIN_IMAGES = 40000 26 | NUM_VALID_IMAGES = 10000 27 | NUM_TEST_IMAGES = 10000 28 | 29 | class Sync2S3(tf.keras.callbacks.Callback): 30 | def __init__(self, logdir, s3logdir): 31 | super(Sync2S3, self).__init__() 32 | self.logdir = logdir 33 | self.s3logdir = s3logdir 34 | 35 | def on_epoch_end(self, epoch, logs=None): 36 | os.system('aws s3 sync '+self.logdir+' '+self.s3logdir) 37 | # append ' >/dev/null 2>&1' above to silence the sync output 38 | 39 | def train_preprocess_fn(image): 40 | 41 | # Resize the image to add four extra pixels on each side. 42 | image = tf.image.resize_image_with_crop_or_pad(image, HEIGHT + 8, WIDTH + 8) 43 | 44 | # Randomly crop a [HEIGHT, WIDTH] section of the image. 45 | image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH]) 46 | 47 | # Randomly flip the image horizontally. 48 | image = tf.image.random_flip_left_right(image) 49 | 50 | return image 51 | 52 | 53 | def make_batch(filenames, batch_size): 54 | """Read the images and labels from 'filenames'.""" 55 | # Repeat infinitely. 56 | dataset = tf.data.TFRecordDataset(filenames).repeat() 57 | 58 | # Parse records. 59 | dataset = dataset.map(single_example_parser, num_parallel_calls=1) 60 | 61 | # Batch it up. 62 | dataset = dataset.batch(batch_size, drop_remainder=True) 63 | iterator = dataset.make_one_shot_iterator() 64 | 65 | image_batch, label_batch = iterator.get_next() 66 | return image_batch, label_batch 67 | 68 | 69 | def single_example_parser(serialized_example): 70 | """Parses a single tf.Example into image and label tensors.""" 71 | # Dimensions of the images in the CIFAR-10 dataset. 72 | # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the 73 | # input format.
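    # Concretely, each serialized record is assumed to carry two features:
    #   'image' -- the raw uint8 pixel buffer, DEPTH * HEIGHT * WIDTH = 3072 bytes
    #   'label' -- a scalar int64 class id in [0, NUM_CLASSES)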
74 | features = tf.parse_single_example( 75 | serialized_example, 76 | features={ 77 | 'image': tf.FixedLenFeature([], tf.string), 78 | 'label': tf.FixedLenFeature([], tf.int64), 79 | }) 80 | image = tf.decode_raw(features['image'], tf.uint8) 81 | image.set_shape([DEPTH * HEIGHT * WIDTH]) 82 | 83 | # Reshape from [depth * height * width] to [depth, height, width]. 84 | image = tf.cast( 85 | tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]), 86 | tf.float32) 87 | label = tf.cast(features['label'], tf.int32) 88 | 89 | image = train_preprocess_fn(image) 90 | label = tf.one_hot(label, NUM_CLASSES) 91 | 92 | return image, label 93 | 94 | def save_history(path, history): 95 | 96 | history_for_json = {} 97 | # transform float values that aren't json-serializable 98 | for key in list(history.history.keys()): 99 | if type(history.history[key]) == np.ndarray: 100 | history_for_json[key] = history.history[key].tolist() 101 | elif type(history.history[key]) == list: 102 | if type(history.history[key][0]) == np.float32 or type(history.history[key][0]) == np.float64: 103 | history_for_json[key] = list(map(float, history.history[key])) 104 | 105 | with codecs.open(path, 'w', encoding='utf-8') as f: 106 | json.dump(history_for_json, f, separators=(',', ':'), sort_keys=True, indent=4) 107 | 108 | 109 | def main(args): 110 | # Hyper-parameters 111 | epochs = args.epochs 112 | lr = args.learning_rate 113 | batch_size = args.batch_size 114 | momentum = args.momentum 115 | weight_decay = args.weight_decay 116 | optimizer = args.optimizer 117 | 118 | # SageMaker options 119 | gpu_count = args.gpu_count 120 | model_dir = args.model_dir 121 | training_dir = args.train 122 | validation_dir = args.validation 123 | eval_dir = args.eval 124 | 125 | # Change 2 126 | hvd.init() 127 | size = hvd.size() 128 | 129 | # Change 3 - pin GPU to be used to process local rank (one GPU per process) 130 | config = tf.ConfigProto() 131 | config.gpu_options.allow_growth = True 132 | config.gpu_options.visible_device_list = str(hvd.local_rank()) 133 | K.set_session(tf.Session(config=config)) 134 | 135 | train_dataset = make_batch(training_dir+'/train.tfrecords', batch_size) 136 | val_dataset = make_batch(validation_dir+'/validation.tfrecords', batch_size) 137 | eval_dataset = make_batch(eval_dir+'/eval.tfrecords', batch_size) 138 | 139 | input_shape = (HEIGHT, WIDTH, DEPTH) 140 | 141 | # Change 4 - update learning rate 142 | # Change 5 - update training code 143 | 144 | # Change 6 - update callbacks - sync initial state, checkpoint only on 1st worker 145 | callbacks = [] 146 | callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0)) 147 | callbacks.append(hvd.callbacks.MetricAverageCallback()) 148 | callbacks.append(hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1)) 149 | callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1)) 150 | if hvd.rank() == 0: 151 | callbacks.append(ModelCheckpoint(args.output_data_dir + '/checkpoint-{epoch}.h5')) 152 | logdir = args.output_data_dir + '/' + datetime.now().strftime("%Y%m%d-%H%M%S") 153 | callbacks.append(TensorBoard(log_dir=logdir, profile_batch=0)) 154 | callbacks.append(Sync2S3(logdir=logdir, s3logdir=model_dir)) 155 | 156 | model = get_model(lr, weight_decay, optimizer, momentum, hvd) 157 | 158 | # Train model 159 | history = model.fit(x=train_dataset[0], y=train_dataset[1], 160 | steps_per_epoch= (NUM_TRAIN_IMAGES // batch_size)// size, 161 | validation_data=val_dataset, 162 | validation_steps= (NUM_VALID_IMAGES //
batch_size)// size, 163 | epochs=epochs, callbacks=callbacks) 164 | 165 | # Evaluate model performance 166 | score = model.evaluate(eval_dataset[0], 167 | eval_dataset[1], 168 | steps=NUM_TEST_IMAGES // args.batch_size, 169 | verbose=0) 170 | print('Test loss :', score[0]) 171 | print('Test accuracy:', score[1]) 172 | 173 | if hvd.rank() == 0: 174 | save_history(args.output_data_dir + "/hvd_history.p", history) 175 | # Save model to model directory 176 | # bug: https://github.com/horovod/horovod/issues/1437 177 | # tf.contrib.saved_model.save_keras_model(model, args.model_output_dir) 178 | 179 | if __name__ == "__main__": 180 | 181 | parser = argparse.ArgumentParser() 182 | 183 | # Hyper-parameters 184 | parser.add_argument('--epochs', type=int, default=15) 185 | parser.add_argument('--learning-rate', type=float, default=0.001) 186 | parser.add_argument('--batch-size', type=int, default=256) 187 | parser.add_argument('--weight-decay', type=float, default=2e-4) 188 | parser.add_argument('--momentum', type=float, default='0.9') 189 | parser.add_argument('--optimizer', type=str, default='adam') 190 | 191 | # SageMaker parameters 192 | parser.add_argument('--model_dir', type=str) 193 | parser.add_argument('--model_output_dir', type=str, default=os.environ['SM_MODEL_DIR']) 194 | parser.add_argument('--output_data_dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR']) 195 | 196 | # Data directories and other options 197 | parser.add_argument('--gpu-count', type=int, default=os.environ['SM_NUM_GPUS']) 198 | parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN']) 199 | parser.add_argument('--validation', type=str, default=os.environ['SM_CHANNEL_VALIDATION']) 200 | parser.add_argument('--eval', type=str, default=os.environ['SM_CHANNEL_EVAL']) 201 | 202 | args = parser.parse_args() 203 | 204 | main(args) 205 | -------------------------------------------------------------------------------- /notebooks/part-2-sagemaker/code/model_def.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.layers import Activation, Conv2D, Dense, Dropout, Flatten, MaxPooling2D, BatchNormalization 3 | from tensorflow.keras.models import Sequential 4 | from tensorflow.keras.optimizers import Adam, SGD, RMSprop 5 | 6 | HEIGHT = 32 7 | WIDTH = 32 8 | DEPTH = 3 9 | NUM_CLASSES = 10 10 | 11 | def get_model(learning_rate, weight_decay, optimizer, momentum, hvd): 12 | 13 | model = Sequential() 14 | model.add(Conv2D(32, (3, 3), padding='same', input_shape=(HEIGHT, WIDTH, DEPTH))) 15 | model.add(BatchNormalization()) 16 | model.add(Activation('relu')) 17 | model.add(Conv2D(32, (3, 3))) 18 | model.add(BatchNormalization()) 19 | model.add(Activation('relu')) 20 | model.add(MaxPooling2D(pool_size=(2, 2))) 21 | model.add(Dropout(0.2)) 22 | 23 | model.add(Conv2D(64, (3, 3), padding='same')) 24 | model.add(BatchNormalization()) 25 | model.add(Activation('relu')) 26 | model.add(Conv2D(64, (3, 3))) 27 | model.add(BatchNormalization()) 28 | model.add(Activation('relu')) 29 | model.add(MaxPooling2D(pool_size=(2, 2))) 30 | model.add(Dropout(0.3)) 31 | 32 | model.add(Conv2D(128, (3, 3), padding='same')) 33 | model.add(BatchNormalization()) 34 | model.add(Activation('relu')) 35 | model.add(Conv2D(128, (3, 3))) 36 | model.add(BatchNormalization()) 37 | model.add(Activation('relu')) 38 | model.add(MaxPooling2D(pool_size=(2, 2))) 39 | model.add(Dropout(0.4)) 40 | 41 | model.add(Flatten()) 42 | model.add(Dense(512)) 43 | 
model.add(Activation('relu')) 44 | model.add(Dropout(0.5)) 45 | model.add(Dense(NUM_CLASSES)) 46 | model.add(Activation('softmax')) 47 | 48 | size = hvd.size() 49 | 50 | if optimizer.lower() == 'sgd': 51 | opt = SGD(lr=learning_rate * size, decay=weight_decay, momentum=momentum) 52 | elif optimizer.lower() == 'rmsprop': 53 | opt = RMSprop(lr=learning_rate * size, decay=weight_decay) 54 | else: 55 | opt = Adam(lr=learning_rate * size, decay=weight_decay) 56 | 57 | opt = hvd.DistributedOptimizer(opt) 58 | 59 | model.compile(loss='categorical_crossentropy', 60 | optimizer=opt, 61 | metrics=['accuracy']) 62 | 63 | return model 64 | 65 | -------------------------------------------------------------------------------- /notebooks/part-3-kubernetes/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:1.14.0-cpu-py36-ubuntu16.04 2 | 3 | COPY code /opt/training/ 4 | 5 | WORKDIR /opt/training 6 | 7 | -------------------------------------------------------------------------------- /notebooks/part-3-kubernetes/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:1.14.0-gpu-py36-cu100-ubuntu16.04 2 | 3 | COPY code /opt/training/ 4 | 5 | WORKDIR /opt/training 6 | 7 | -------------------------------------------------------------------------------- /notebooks/part-3-kubernetes/code/cifar10-multi-gpu-horovod-k8s.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | import horovod.tensorflow.keras as hvd 4 | 5 | from datetime import datetime 6 | import argparse 7 | import os 8 | import numpy as np 9 | import codecs 10 | import json 11 | 12 | import tensorflow.keras.backend as K 13 | from tensorflow import keras 14 | from tensorflow.keras.layers import Input, Dense, Flatten 15 | from tensorflow.keras.models import Model 16 | from tensorflow.keras.utils import multi_gpu_model 17 | from tensorflow.keras.optimizers import Adam, SGD 18 | from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint 19 | from model_def import get_model 20 | 21 | HEIGHT = 32 22 | WIDTH = 32 23 | DEPTH = 3 24 | NUM_CLASSES = 10 25 | NUM_TRAIN_IMAGES = 40000 26 | NUM_VALID_IMAGES = 10000 27 | NUM_TEST_IMAGES = 10000 28 | 29 | def train_preprocess_fn(image): 30 | 31 | # Resize the image to add four extra pixels on each side. 32 | image = tf.image.resize_image_with_crop_or_pad(image, HEIGHT + 8, WIDTH + 8) 33 | 34 | # Randomly crop a [HEIGHT, WIDTH] section of the image. 35 | image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH]) 36 | 37 | # Randomly flip the image horizontally. 38 | image = tf.image.random_flip_left_right(image) 39 | 40 | return image 41 | 42 | 43 | def make_batch(filenames, batch_size): 44 | """Read the images and labels from 'filenames'.""" 45 | # Repeat infinitely. 46 | dataset = tf.data.TFRecordDataset(filenames).repeat() 47 | 48 | # Parse records. 49 | dataset = dataset.map(single_example_parser, num_parallel_calls=1) 50 | 51 | # Batch it up. 
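    # drop_remainder=True (next line) keeps every batch exactly batch_size
    # examples so tensor shapes stay static; any final partial batch is
    # dropped, which is harmless here because the dataset repeats forever.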
52 | dataset = dataset.batch(batch_size, drop_remainder=True) 53 | iterator = dataset.make_one_shot_iterator() 54 | 55 | image_batch, label_batch = iterator.get_next() 56 | return image_batch, label_batch 57 | 58 | 59 | def single_example_parser(serialized_example): 60 | """Parses a single tf.Example into image and label tensors.""" 61 | # Dimensions of the images in the CIFAR-10 dataset. 62 | # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the 63 | # input format. 64 | features = tf.parse_single_example( 65 | serialized_example, 66 | features={ 67 | 'image': tf.FixedLenFeature([], tf.string), 68 | 'label': tf.FixedLenFeature([], tf.int64), 69 | }) 70 | image = tf.decode_raw(features['image'], tf.uint8) 71 | image.set_shape([DEPTH * HEIGHT * WIDTH]) 72 | 73 | # Reshape from [depth * height * width] to [depth, height, width]. 74 | image = tf.cast( 75 | tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]), 76 | tf.float32) 77 | label = tf.cast(features['label'], tf.int32) 78 | 79 | image = train_preprocess_fn(image) 80 | label = tf.one_hot(label, NUM_CLASSES) 81 | 82 | return image, label 83 | 84 | def save_history(path, history): 85 | 86 | history_for_json = {} 87 | # transform float values that aren't json-serializable 88 | for key in list(history.history.keys()): 89 | if type(history.history[key]) == np.ndarray: 90 | history_for_json[key] = history.history[key].tolist() 91 | elif type(history.history[key]) == list: 92 | if type(history.history[key][0]) == np.float32 or type(history.history[key][0]) == np.float64: 93 | history_for_json[key] = list(map(float, history.history[key])) 94 | 95 | with codecs.open(path, 'w', encoding='utf-8') as f: 96 | json.dump(history_for_json, f, separators=(',', ':'), sort_keys=True, indent=4) 97 | 98 | 99 | def main(args): 100 | # Hyper-parameters 101 | epochs = args.epochs 102 | lr = args.learning_rate 103 | batch_size = args.batch_size 104 | momentum = args.momentum 105 | weight_decay = args.weight_decay 106 | optimizer = args.optimizer 107 | 108 | # Data directories and other options 109 | gpu_count = args.gpu_count 110 | model_dir = args.model_dir 111 | training_dir = args.train 112 | validation_dir = args.validation 113 | eval_dir = args.eval 114 | 115 | # Change 2 116 | hvd.init() 117 | size = hvd.size() 118 | 119 | # Change 3 - pin GPU to be used to process local rank (one GPU per process) 120 | config = tf.ConfigProto() 121 | config.gpu_options.allow_growth = True 122 | config.gpu_options.visible_device_list = str(hvd.local_rank()) 123 | K.set_session(tf.Session(config=config)) 124 | 125 | train_dataset = make_batch(training_dir+'/train.tfrecords', batch_size) 126 | val_dataset = make_batch(validation_dir+'/validation.tfrecords', batch_size) 127 | eval_dataset = make_batch(eval_dir+'/eval.tfrecords', batch_size) 128 | 129 | input_shape = (HEIGHT, WIDTH, DEPTH) 130 | 131 | # Change 4 - update learning rate 132 | # Change 5 - update training code 133 | 134 | # Change 6 - update callbacks - sync initial state, checkpoint only on 1st worker 135 | callbacks = [] 136 | callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0)) 137 | callbacks.append(hvd.callbacks.MetricAverageCallback()) 138 | callbacks.append(hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1)) 139 | callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1)) 140 | if hvd.rank() == 0: 141 | callbacks.append(ModelCheckpoint(args.output_data_dir + '/checkpoint-{epoch}.h5')) 142 | logdir = args.output_data_dir
+ '/' + datetime.now().strftime("%Y%m%d-%H%M%S") 143 | callbacks.append(TensorBoard(log_dir=logdir, profile_batch=0)) 144 | 145 | model = get_model(lr, weight_decay, optimizer, momentum, hvd) 146 | 147 | # Train model 148 | history = model.fit(x=train_dataset[0], y=train_dataset[1], 149 | steps_per_epoch= (NUM_TRAIN_IMAGES // batch_size)// size, 150 | validation_data=val_dataset, 151 | validation_steps= (NUM_VALID_IMAGES // batch_size)// size, 152 | epochs=epochs, callbacks=callbacks) 153 | 154 | # Evaluate model performance 155 | score = model.evaluate(eval_dataset[0], 156 | eval_dataset[1], 157 | steps=NUM_TEST_IMAGES // args.batch_size, 158 | verbose=0) 159 | print('Test loss :', score[0]) 160 | print('Test accuracy:', score[1]) 161 | 162 | if hvd.rank() == 0: 163 | save_history(args.output_data_dir + "/hvd_history.p", history) 164 | # Save model to model directory 165 | #bug: https://github.com/horovod/horovod/issues/1437 166 | #tf.contrib.saved_model.save_keras_model(model, args.model_output_dir) 167 | 168 | if __name__ == "__main__": 169 | 170 | parser = argparse.ArgumentParser() 171 | 172 | # Hyper-parameters 173 | parser.add_argument('--epochs', type=int, default=15) 174 | parser.add_argument('--learning-rate', type=float, default=0.001) 175 | parser.add_argument('--batch-size', type=int, default=256) 176 | parser.add_argument('--weight-decay', type=float, default=2e-4) 177 | parser.add_argument('--momentum', type=float, default='0.9') 178 | parser.add_argument('--optimizer', type=str, default='adam') 179 | 180 | # Data directories and other options 181 | parser.add_argument('--gpu-count', type=int, default=0) 182 | parser.add_argument('--train', type=str) 183 | parser.add_argument('--validation', type=str) 184 | parser.add_argument('--eval', type=str) 185 | 186 | parser.add_argument('--model_dir', type=str) 187 | parser.add_argument('--model_output_dir', type=str) 188 | parser.add_argument('--output_data_dir', type=str) 189 | parser.add_argument('--tensorboard_dir', type=str) 190 | 191 | args = parser.parse_args() 192 | main(args) 193 | -------------------------------------------------------------------------------- /notebooks/part-3-kubernetes/code/model_def.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.layers import Activation, Conv2D, Dense, Dropout, Flatten, MaxPooling2D, BatchNormalization 3 | from tensorflow.keras.models import Sequential 4 | from tensorflow.keras.optimizers import Adam, SGD, RMSprop 5 | 6 | HEIGHT = 32 7 | WIDTH = 32 8 | DEPTH = 3 9 | NUM_CLASSES = 10 10 | 11 | def get_model(learning_rate, weight_decay, optimizer, momentum, hvd): 12 | 13 | model = Sequential() 14 | model.add(Conv2D(32, (3, 3), padding='same', input_shape=(HEIGHT, WIDTH, DEPTH))) 15 | model.add(BatchNormalization()) 16 | model.add(Activation('relu')) 17 | model.add(Conv2D(32, (3, 3))) 18 | model.add(BatchNormalization()) 19 | model.add(Activation('relu')) 20 | model.add(MaxPooling2D(pool_size=(2, 2))) 21 | model.add(Dropout(0.2)) 22 | 23 | model.add(Conv2D(64, (3, 3), padding='same')) 24 | model.add(BatchNormalization()) 25 | model.add(Activation('relu')) 26 | model.add(Conv2D(64, (3, 3))) 27 | model.add(BatchNormalization()) 28 | model.add(Activation('relu')) 29 | model.add(MaxPooling2D(pool_size=(2, 2))) 30 | model.add(Dropout(0.3)) 31 | 32 | model.add(Conv2D(128, (3, 3), padding='same')) 33 | model.add(BatchNormalization()) 34 | model.add(Activation('relu')) 35 | model.add(Conv2D(128, (3, 3))) 36 
| model.add(BatchNormalization()) 37 | model.add(Activation('relu')) 38 | model.add(MaxPooling2D(pool_size=(2, 2))) 39 | model.add(Dropout(0.4)) 40 | 41 | model.add(Flatten()) 42 | model.add(Dense(512)) 43 | model.add(Activation('relu')) 44 | model.add(Dropout(0.5)) 45 | model.add(Dense(NUM_CLASSES)) 46 | model.add(Activation('softmax')) 47 | 48 | size = hvd.size() 49 | 50 | if optimizer.lower() == 'sgd': 51 | opt = SGD(lr=learning_rate * size, decay=weight_decay, momentum=momentum) 52 | elif optimizer.lower() == 'rmsprop': 53 | opt = RMSprop(lr=learning_rate * size, decay=weight_decay) 54 | else: 55 | opt = Adam(lr=learning_rate * size, decay=weight_decay) 56 | 57 | opt = hvd.DistributedOptimizer(opt) 58 | 59 | model.compile(loss='categorical_crossentropy', 60 | optimizer=opt, 61 | metrics=['accuracy']) 62 | 63 | return model 64 | -------------------------------------------------------------------------------- /notebooks/part-3-kubernetes/cpu_eks_cluster.sh: -------------------------------------------------------------------------------- 1 | eksctl create cluster \ 2 | --name aws-tf-cluster-cpu \ 3 | --version 1.14 \ 4 | --region us-west-2 \ 5 | --nodegroup-name cpu-nodes \ 6 | --node-type c5.xlarge \ 7 | --nodes 2 \ 8 | --node-volume-size 50 \ 9 | --node-zones us-west-2a \ 10 | --timeout=40m \ 11 | --zones=us-west-2a,us-west-2b,us-west-2c \ 12 | --auto-kubeconfig -------------------------------------------------------------------------------- /notebooks/part-3-kubernetes/gpu_eks_cluster.sh: -------------------------------------------------------------------------------- 1 | eksctl create cluster \ 2 | --name aws-tf-cluster \ 3 | --version 1.14 \ 4 | --region us-west-2 \ 5 | --nodegroup-name gpu-nodes \ 6 | --node-type p3.2xlarge \ 7 | --nodes 2 \ 8 | --node-volume-size 100 \ 9 | --node-zones us-west-2a \ 10 | --timeout=40m \ 11 | --zones=us-west-2a,us-west-2b,us-west-2c \ 12 | --auto-kubeconfig -------------------------------------------------------------------------------- /notebooks/part-3-kubernetes/specs/claim-fsx-s3.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: fsx-claim 5 | spec: 6 | accessModes: 7 | - ReadWriteMany 8 | storageClassName: fsx-sc 9 | resources: 10 | requests: 11 | storage: 1200Gi -------------------------------------------------------------------------------- /notebooks/part-3-kubernetes/specs/eks_tf_training_job-cpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1alpha1 2 | kind: MPIJob 3 | metadata: 4 | name: eks-tf-distributed-training 5 | spec: 6 | replicas: 2 7 | template: 8 | metadata: 9 | annotations: 10 | sidecar.istio.io/inject: "false" 11 | spec: 12 | restartPolicy: Never 13 | containers: 14 | - name: eks-tf-dist-job 15 | image: 16 | env: 17 | - name: HDF5_USE_FILE_LOCKING 18 | value: 'FALSE' 19 | command: 20 | - mpirun 21 | - --allow-run-as-root 22 | - -mca 23 | - btl_tcp_if_exclude 24 | - lo 25 | - -mca 26 | - pml 27 | - ob1 28 | - -mca 29 | - btl 30 | - ^openib 31 | - --bind-to 32 | - none 33 | - -map-by 34 | - slot 35 | - -x 36 | - LD_LIBRARY_PATH 37 | - -x 38 | - PATH 39 | - -x 40 | - NCCL_DEBUG=INFO 41 | - python 42 | - cifar10-multi-gpu-horovod-k8s.py 43 | - --epochs=30 44 | - --learning-rate=0.01 45 | - --batch-size=256 46 | - --weight-decay=0.0002 47 | - --momentum=0.9 48 | - --optimizer=sgd 49 | - --train=/training-data/cifar10-dataset/train 50 | - 
--eval=/training-data/cifar10-dataset/eval 51 | - --validation=/training-data/cifar10-dataset/validation 52 | - --model_dir=/training-data/eks-output 53 | - --output_data_dir=/training-data/eks-output 54 | - --tensorboard_dir=/training-data/eks-output 55 | volumeMounts: 56 | - mountPath: /training-data 57 | name: persistent-storage 58 | volumes: 59 | - name: persistent-storage 60 | persistentVolumeClaim: 61 | claimName: fsx-claim -------------------------------------------------------------------------------- /notebooks/part-3-kubernetes/specs/eks_tf_training_job-gpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1alpha1 2 | kind: MPIJob 3 | metadata: 4 | name: eks-tf-distributed-training 5 | spec: 6 | replicas: 2 7 | template: 8 | metadata: 9 | annotations: 10 | sidecar.istio.io/inject: "false" 11 | spec: 12 | restartPolicy: Never 13 | containers: 14 | - name: eks-tf-dist-job 15 | image: 16 | env: 17 | - name: HDF5_USE_FILE_LOCKING 18 | value: 'FALSE' 19 | command: 20 | - mpirun 21 | - --allow-run-as-root 22 | - -mca 23 | - btl_tcp_if_exclude 24 | - lo 25 | - -mca 26 | - pml 27 | - ob1 28 | - -mca 29 | - btl 30 | - ^openib 31 | - --bind-to 32 | - none 33 | - -map-by 34 | - slot 35 | - -x 36 | - LD_LIBRARY_PATH 37 | - -x 38 | - PATH 39 | - -x 40 | - NCCL_DEBUG=INFO 41 | - python 42 | - cifar10-multi-gpu-horovod-k8s.py 43 | - --epochs=30 44 | - --learning-rate=0.01 45 | - --batch-size=256 46 | - --weight-decay=0.0002 47 | - --momentum=0.9 48 | - --optimizer=sgd 49 | - --train=/training-data/cifar10-dataset/train 50 | - --eval=/training-data/cifar10-dataset/eval 51 | - --validation=/training-data/cifar10-dataset/validation 52 | - --model_dir=/training-data/eks-output 53 | - --output_data_dir=/training-data/eks-output 54 | - --tensorboard_dir=/training-data/eks-output 55 | resources: 56 | limits: 57 | nvidia.com/gpu: 1 58 | volumeMounts: 59 | - mountPath: /training-data 60 | name: persistent-storage 61 | volumes: 62 | - name: persistent-storage 63 | persistentVolumeClaim: 64 | claimName: fsx-claim -------------------------------------------------------------------------------- /notebooks/part-3-kubernetes/specs/fsx_lustre_policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "iam:CreateServiceLinkedRole", 8 | "iam:AttachRolePolicy", 9 | "iam:PutRolePolicy" 10 | ], 11 | "Resource": "arn:aws:iam::*:role/aws-service-role/s3.data-source.lustre.fsx.amazonaws.com/*" 12 | }, 13 | { 14 | "Effect": "Allow", 15 | "Action": [ 16 | "s3:*", 17 | "fsx:*" 18 | ], 19 | "Resource": ["*"] 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /notebooks/part-3-kubernetes/specs/storage-class-fsx-s3-template.yaml: -------------------------------------------------------------------------------- 1 | kind: StorageClass 2 | apiVersion: storage.k8s.io/v1 3 | metadata: 4 | name: fsx-sc 5 | provisioner: fsx.csi.aws.com 6 | parameters: 7 | subnetId: $SUBNET_ID 8 | securityGroupIds: $SECURITY_GROUP_ID 9 | s3ImportPath: s3://$BUCKET_NAME/ 10 | s3ExportPath: s3://$BUCKET_NAME/ -------------------------------------------------------------------------------- /static/640px-Amazon_Web_Services_Logo.svg.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/640px-Amazon_Web_Services_Logo.svg.png -------------------------------------------------------------------------------- /static/AWS-Logo.svg: -------------------------------------------------------------------------------- 1 | AWS-Logo_White-Color -------------------------------------------------------------------------------- /static/Amazon_Web_Services_Logo.svg: -------------------------------------------------------------------------------- (SVG markup not captured in this dump; no text content recoverable) -------------------------------------------------------------------------------- /static/css/jquery-ui.min.css: -------------------------------------------------------------------------------- 1 | /*! jQuery UI - v1.12.1 - 2018-10-14 2 | * http://jqueryui.com 3 | * Includes: draggable.css, core.css, resizable.css, selectable.css, sortable.css, accordion.css, autocomplete.css, menu.css, button.css, controlgroup.css, checkboxradio.css, datepicker.css, dialog.css, progressbar.css, selectmenu.css, slider.css, spinner.css, tabs.css, tooltip.css, theme.css 4 | * To view and modify this theme, visit http://jqueryui.com/themeroller/?scope=&folderName=base&cornerRadiusShadow=8px&offsetLeftShadow=0px&offsetTopShadow=0px&thicknessShadow=5px&opacityShadow=30&bgImgOpacityShadow=0&bgTextureShadow=flat&bgColorShadow=666666&opacityOverlay=30&bgImgOpacityOverlay=0&bgTextureOverlay=flat&bgColorOverlay=aaaaaa&iconColorError=cc0000&fcError=5f3f3f&borderColorError=f1a899&bgTextureError=flat&bgColorError=fddfdf&iconColorHighlight=777620&fcHighlight=777620&borderColorHighlight=dad55e&bgTextureHighlight=flat&bgColorHighlight=fffa90&iconColorActive=ffffff&fcActive=ffffff&borderColorActive=003eff&bgTextureActive=flat&bgColorActive=007fff&iconColorHover=555555&fcHover=2b2b2b&borderColorHover=cccccc&bgTextureHover=flat&bgColorHover=ededed&iconColorDefault=777777&fcDefault=454545&borderColorDefault=c5c5c5&bgTextureDefault=flat&bgColorDefault=f6f6f6&iconColorContent=444444&fcContent=333333&borderColorContent=dddddd&bgTextureContent=flat&bgColorContent=ffffff&iconColorHeader=444444&fcHeader=333333&borderColorHeader=dddddd&bgTextureHeader=flat&bgColorHeader=e9e9e9&cornerRadius=3px&fwDefault=normal&fsDefault=1em&ffDefault=Arial%2CHelvetica%2Csans-serif 5 | * Copyright jQuery Foundation and other contributors; Licensed MIT */ 6 | 7 | .ui-draggable-handle{-ms-touch-action:none;touch-action:none}.ui-helper-hidden{display:none}.ui-helper-hidden-accessible{border:0;clip:rect(0 0 0
0);height:1px;margin:-1px;overflow:hidden;padding:0;position:absolute;width:1px}.ui-helper-reset{margin:0;padding:0;border:0;outline:0;line-height:1.3;text-decoration:none;font-size:100%;list-style:none}.ui-helper-clearfix:before,.ui-helper-clearfix:after{content:"";display:table;border-collapse:collapse}.ui-helper-clearfix:after{clear:both}.ui-helper-zfix{width:100%;height:100%;top:0;left:0;position:absolute;opacity:0;filter:Alpha(Opacity=0)}.ui-front{z-index:100}.ui-state-disabled{cursor:default!important;pointer-events:none}.ui-icon{display:inline-block;vertical-align:middle;margin-top:-.25em;position:relative;text-indent:-99999px;overflow:hidden;background-repeat:no-repeat}.ui-widget-icon-block{left:50%;margin-left:-8px;display:block}.ui-widget-overlay{position:fixed;top:0;left:0;width:100%;height:100%}.ui-resizable{position:relative}.ui-resizable-handle{position:absolute;font-size:0.1px;display:block;-ms-touch-action:none;touch-action:none}.ui-resizable-disabled .ui-resizable-handle,.ui-resizable-autohide .ui-resizable-handle{display:none}.ui-resizable-n{cursor:n-resize;height:7px;width:100%;top:-5px;left:0}.ui-resizable-s{cursor:s-resize;height:7px;width:100%;bottom:-5px;left:0}.ui-resizable-e{cursor:e-resize;width:7px;right:-5px;top:0;height:100%}.ui-resizable-w{cursor:w-resize;width:7px;left:-5px;top:0;height:100%}.ui-resizable-se{cursor:se-resize;width:12px;height:12px;right:1px;bottom:1px}.ui-resizable-sw{cursor:sw-resize;width:9px;height:9px;left:-5px;bottom:-5px}.ui-resizable-nw{cursor:nw-resize;width:9px;height:9px;left:-5px;top:-5px}.ui-resizable-ne{cursor:ne-resize;width:9px;height:9px;right:-5px;top:-5px}.ui-selectable{-ms-touch-action:none;touch-action:none}.ui-selectable-helper{position:absolute;z-index:100;border:1px dotted black}.ui-sortable-handle{-ms-touch-action:none;touch-action:none}.ui-accordion .ui-accordion-header{display:block;cursor:pointer;position:relative;margin:2px 0 0 0;padding:.5em .5em .5em .7em;font-size:100%}.ui-accordion .ui-accordion-content{padding:1em 2.2em;border-top:0;overflow:auto}.ui-autocomplete{position:absolute;top:0;left:0;cursor:default}.ui-menu{list-style:none;padding:0;margin:0;display:block;outline:0}.ui-menu .ui-menu{position:absolute}.ui-menu .ui-menu-item{margin:0;cursor:pointer;list-style-image:url("data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7")}.ui-menu .ui-menu-item-wrapper{position:relative;padding:3px 1em 3px .4em}.ui-menu .ui-menu-divider{margin:5px 0;height:0;font-size:0;line-height:0;border-width:1px 0 0 0}.ui-menu .ui-state-focus,.ui-menu .ui-state-active{margin:-1px}.ui-menu-icons{position:relative}.ui-menu-icons .ui-menu-item-wrapper{padding-left:2em}.ui-menu .ui-icon{position:absolute;top:0;bottom:0;left:.2em;margin:auto 0}.ui-menu .ui-menu-icon{left:auto;right:0}.ui-button{padding:.4em 1em;display:inline-block;position:relative;line-height:normal;margin-right:.1em;cursor:pointer;vertical-align:middle;text-align:center;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;overflow:visible}.ui-button,.ui-button:link,.ui-button:visited,.ui-button:hover,.ui-button:active{text-decoration:none}.ui-button-icon-only{width:2em;box-sizing:border-box;text-indent:-9999px;white-space:nowrap}input.ui-button.ui-button-icon-only{text-indent:0}.ui-button-icon-only .ui-icon{position:absolute;top:50%;left:50%;margin-top:-8px;margin-left:-8px}.ui-button.ui-icon-notext 
.ui-icon{padding:0;width:2.1em;height:2.1em;text-indent:-9999px;white-space:nowrap}input.ui-button.ui-icon-notext .ui-icon{width:auto;height:auto;text-indent:0;white-space:normal;padding:.4em 1em}input.ui-button::-moz-focus-inner,button.ui-button::-moz-focus-inner{border:0;padding:0}.ui-controlgroup{vertical-align:middle;display:inline-block}.ui-controlgroup > .ui-controlgroup-item{float:left;margin-left:0;margin-right:0}.ui-controlgroup > .ui-controlgroup-item:focus,.ui-controlgroup > .ui-controlgroup-item.ui-visual-focus{z-index:9999}.ui-controlgroup-vertical > .ui-controlgroup-item{display:block;float:none;width:100%;margin-top:0;margin-bottom:0;text-align:left}.ui-controlgroup-vertical .ui-controlgroup-item{box-sizing:border-box}.ui-controlgroup .ui-controlgroup-label{padding:.4em 1em}.ui-controlgroup .ui-controlgroup-label span{font-size:80%}.ui-controlgroup-horizontal .ui-controlgroup-label + .ui-controlgroup-item{border-left:none}.ui-controlgroup-vertical .ui-controlgroup-label + .ui-controlgroup-item{border-top:none}.ui-controlgroup-horizontal .ui-controlgroup-label.ui-widget-content{border-right:none}.ui-controlgroup-vertical .ui-controlgroup-label.ui-widget-content{border-bottom:none}.ui-controlgroup-vertical .ui-spinner-input{width:75%;width:calc( 100% - 2.4em )}.ui-controlgroup-vertical .ui-spinner .ui-spinner-up{border-top-style:solid}.ui-checkboxradio-label .ui-icon-background{box-shadow:inset 1px 1px 1px #ccc;border-radius:.12em;border:none}.ui-checkboxradio-radio-label .ui-icon-background{width:16px;height:16px;border-radius:1em;overflow:visible;border:none}.ui-checkboxradio-radio-label.ui-checkboxradio-checked .ui-icon,.ui-checkboxradio-radio-label.ui-checkboxradio-checked:hover .ui-icon{background-image:none;width:8px;height:8px;border-width:4px;border-style:solid}.ui-checkboxradio-disabled{pointer-events:none}.ui-datepicker{width:17em;padding:.2em .2em 0;display:none}.ui-datepicker .ui-datepicker-header{position:relative;padding:.2em 0}.ui-datepicker .ui-datepicker-prev,.ui-datepicker .ui-datepicker-next{position:absolute;top:2px;width:1.8em;height:1.8em}.ui-datepicker .ui-datepicker-prev-hover,.ui-datepicker .ui-datepicker-next-hover{top:1px}.ui-datepicker .ui-datepicker-prev{left:2px}.ui-datepicker .ui-datepicker-next{right:2px}.ui-datepicker .ui-datepicker-prev-hover{left:1px}.ui-datepicker .ui-datepicker-next-hover{right:1px}.ui-datepicker .ui-datepicker-prev span,.ui-datepicker .ui-datepicker-next span{display:block;position:absolute;left:50%;margin-left:-8px;top:50%;margin-top:-8px}.ui-datepicker .ui-datepicker-title{margin:0 2.3em;line-height:1.8em;text-align:center}.ui-datepicker .ui-datepicker-title select{font-size:1em;margin:1px 0}.ui-datepicker select.ui-datepicker-month,.ui-datepicker select.ui-datepicker-year{width:45%}.ui-datepicker table{width:100%;font-size:.9em;border-collapse:collapse;margin:0 0 .4em}.ui-datepicker th{padding:.7em .3em;text-align:center;font-weight:bold;border:0}.ui-datepicker td{border:0;padding:1px}.ui-datepicker td span,.ui-datepicker td a{display:block;padding:.2em;text-align:right;text-decoration:none}.ui-datepicker .ui-datepicker-buttonpane{background-image:none;margin:.7em 0 0 0;padding:0 .2em;border-left:0;border-right:0;border-bottom:0}.ui-datepicker .ui-datepicker-buttonpane button{float:right;margin:.5em .2em .4em;cursor:pointer;padding:.2em .6em .3em .6em;width:auto;overflow:visible}.ui-datepicker .ui-datepicker-buttonpane 
button.ui-datepicker-current{float:left}.ui-datepicker.ui-datepicker-multi{width:auto}.ui-datepicker-multi .ui-datepicker-group{float:left}.ui-datepicker-multi .ui-datepicker-group table{width:95%;margin:0 auto .4em}.ui-datepicker-multi-2 .ui-datepicker-group{width:50%}.ui-datepicker-multi-3 .ui-datepicker-group{width:33.3%}.ui-datepicker-multi-4 .ui-datepicker-group{width:25%}.ui-datepicker-multi .ui-datepicker-group-last .ui-datepicker-header,.ui-datepicker-multi .ui-datepicker-group-middle .ui-datepicker-header{border-left-width:0}.ui-datepicker-multi .ui-datepicker-buttonpane{clear:left}.ui-datepicker-row-break{clear:both;width:100%;font-size:0}.ui-datepicker-rtl{direction:rtl}.ui-datepicker-rtl .ui-datepicker-prev{right:2px;left:auto}.ui-datepicker-rtl .ui-datepicker-next{left:2px;right:auto}.ui-datepicker-rtl .ui-datepicker-prev:hover{right:1px;left:auto}.ui-datepicker-rtl .ui-datepicker-next:hover{left:1px;right:auto}.ui-datepicker-rtl .ui-datepicker-buttonpane{clear:right}.ui-datepicker-rtl .ui-datepicker-buttonpane button{float:left}.ui-datepicker-rtl .ui-datepicker-buttonpane button.ui-datepicker-current,.ui-datepicker-rtl .ui-datepicker-group{float:right}.ui-datepicker-rtl .ui-datepicker-group-last .ui-datepicker-header,.ui-datepicker-rtl .ui-datepicker-group-middle .ui-datepicker-header{border-right-width:0;border-left-width:1px}.ui-datepicker .ui-icon{display:block;text-indent:-99999px;overflow:hidden;background-repeat:no-repeat;left:.5em;top:.3em}.ui-dialog{position:absolute;top:0;left:0;padding:.2em;outline:0}.ui-dialog .ui-dialog-titlebar{padding:.4em 1em;position:relative}.ui-dialog .ui-dialog-title{float:left;margin:.1em 0;white-space:nowrap;width:90%;overflow:hidden;text-overflow:ellipsis}.ui-dialog .ui-dialog-titlebar-close{position:absolute;right:.3em;top:50%;width:20px;margin:-10px 0 0 0;padding:1px;height:20px}.ui-dialog .ui-dialog-content{position:relative;border:0;padding:.5em 1em;background:none;overflow:auto}.ui-dialog .ui-dialog-buttonpane{text-align:left;border-width:1px 0 0 0;background-image:none;margin-top:.5em;padding:.3em 1em .5em .4em}.ui-dialog .ui-dialog-buttonpane .ui-dialog-buttonset{float:right}.ui-dialog .ui-dialog-buttonpane button{margin:.5em .4em .5em 0;cursor:pointer}.ui-dialog .ui-resizable-n{height:2px;top:0}.ui-dialog .ui-resizable-e{width:2px;right:0}.ui-dialog .ui-resizable-s{height:2px;bottom:0}.ui-dialog .ui-resizable-w{width:2px;left:0}.ui-dialog .ui-resizable-se,.ui-dialog .ui-resizable-sw,.ui-dialog .ui-resizable-ne,.ui-dialog .ui-resizable-nw{width:7px;height:7px}.ui-dialog .ui-resizable-se{right:0;bottom:0}.ui-dialog .ui-resizable-sw{left:0;bottom:0}.ui-dialog .ui-resizable-ne{right:0;top:0}.ui-dialog .ui-resizable-nw{left:0;top:0}.ui-draggable .ui-dialog-titlebar{cursor:move}.ui-progressbar{height:2em;text-align:left;overflow:hidden}.ui-progressbar .ui-progressbar-value{margin:-1px;height:100%}.ui-progressbar 
.ui-progressbar-overlay{background:url("data:image/gif;base64,R0lGODlhKAAoAIABAAAAAP///yH/C05FVFNDQVBFMi4wAwEAAAAh+QQJAQABACwAAAAAKAAoAAACkYwNqXrdC52DS06a7MFZI+4FHBCKoDeWKXqymPqGqxvJrXZbMx7Ttc+w9XgU2FB3lOyQRWET2IFGiU9m1frDVpxZZc6bfHwv4c1YXP6k1Vdy292Fb6UkuvFtXpvWSzA+HycXJHUXiGYIiMg2R6W459gnWGfHNdjIqDWVqemH2ekpObkpOlppWUqZiqr6edqqWQAAIfkECQEAAQAsAAAAACgAKAAAApSMgZnGfaqcg1E2uuzDmmHUBR8Qil95hiPKqWn3aqtLsS18y7G1SzNeowWBENtQd+T1JktP05nzPTdJZlR6vUxNWWjV+vUWhWNkWFwxl9VpZRedYcflIOLafaa28XdsH/ynlcc1uPVDZxQIR0K25+cICCmoqCe5mGhZOfeYSUh5yJcJyrkZWWpaR8doJ2o4NYq62lAAACH5BAkBAAEALAAAAAAoACgAAAKVDI4Yy22ZnINRNqosw0Bv7i1gyHUkFj7oSaWlu3ovC8GxNso5fluz3qLVhBVeT/Lz7ZTHyxL5dDalQWPVOsQWtRnuwXaFTj9jVVh8pma9JjZ4zYSj5ZOyma7uuolffh+IR5aW97cHuBUXKGKXlKjn+DiHWMcYJah4N0lYCMlJOXipGRr5qdgoSTrqWSq6WFl2ypoaUAAAIfkECQEAAQAsAAAAACgAKAAAApaEb6HLgd/iO7FNWtcFWe+ufODGjRfoiJ2akShbueb0wtI50zm02pbvwfWEMWBQ1zKGlLIhskiEPm9R6vRXxV4ZzWT2yHOGpWMyorblKlNp8HmHEb/lCXjcW7bmtXP8Xt229OVWR1fod2eWqNfHuMjXCPkIGNileOiImVmCOEmoSfn3yXlJWmoHGhqp6ilYuWYpmTqKUgAAIfkECQEAAQAsAAAAACgAKAAAApiEH6kb58biQ3FNWtMFWW3eNVcojuFGfqnZqSebuS06w5V80/X02pKe8zFwP6EFWOT1lDFk8rGERh1TTNOocQ61Hm4Xm2VexUHpzjymViHrFbiELsefVrn6XKfnt2Q9G/+Xdie499XHd2g4h7ioOGhXGJboGAnXSBnoBwKYyfioubZJ2Hn0RuRZaflZOil56Zp6iioKSXpUAAAh+QQJAQABACwAAAAAKAAoAAACkoQRqRvnxuI7kU1a1UU5bd5tnSeOZXhmn5lWK3qNTWvRdQxP8qvaC+/yaYQzXO7BMvaUEmJRd3TsiMAgswmNYrSgZdYrTX6tSHGZO73ezuAw2uxuQ+BbeZfMxsexY35+/Qe4J1inV0g4x3WHuMhIl2jXOKT2Q+VU5fgoSUI52VfZyfkJGkha6jmY+aaYdirq+lQAACH5BAkBAAEALAAAAAAoACgAAAKWBIKpYe0L3YNKToqswUlvznigd4wiR4KhZrKt9Upqip61i9E3vMvxRdHlbEFiEXfk9YARYxOZZD6VQ2pUunBmtRXo1Lf8hMVVcNl8JafV38aM2/Fu5V16Bn63r6xt97j09+MXSFi4BniGFae3hzbH9+hYBzkpuUh5aZmHuanZOZgIuvbGiNeomCnaxxap2upaCZsq+1kAACH5BAkBAAEALAAAAAAoACgAAAKXjI8By5zf4kOxTVrXNVlv1X0d8IGZGKLnNpYtm8Lr9cqVeuOSvfOW79D9aDHizNhDJidFZhNydEahOaDH6nomtJjp1tutKoNWkvA6JqfRVLHU/QUfau9l2x7G54d1fl995xcIGAdXqMfBNadoYrhH+Mg2KBlpVpbluCiXmMnZ2Sh4GBqJ+ckIOqqJ6LmKSllZmsoq6wpQAAAh+QQJAQABACwAAAAAKAAoAAAClYx/oLvoxuJDkU1a1YUZbJ59nSd2ZXhWqbRa2/gF8Gu2DY3iqs7yrq+xBYEkYvFSM8aSSObE+ZgRl1BHFZNr7pRCavZ5BW2142hY3AN/zWtsmf12p9XxxFl2lpLn1rseztfXZjdIWIf2s5dItwjYKBgo9yg5pHgzJXTEeGlZuenpyPmpGQoKOWkYmSpaSnqKileI2FAAACH5BAkBAAEALAAAAAAoACgAAAKVjB+gu+jG4kORTVrVhRlsnn2dJ3ZleFaptFrb+CXmO9OozeL5VfP99HvAWhpiUdcwkpBH3825AwYdU8xTqlLGhtCosArKMpvfa1mMRae9VvWZfeB2XfPkeLmm18lUcBj+p5dnN8jXZ3YIGEhYuOUn45aoCDkp16hl5IjYJvjWKcnoGQpqyPlpOhr3aElaqrq56Bq7VAAAOw==");height:100%;filter:alpha(opacity=25);opacity:0.25}.ui-progressbar-indeterminate .ui-progressbar-value{background-image:none}.ui-selectmenu-menu{padding:0;margin:0;position:absolute;top:0;left:0;display:none}.ui-selectmenu-menu .ui-menu{overflow:auto;overflow-x:hidden;padding-bottom:1px}.ui-selectmenu-menu .ui-menu .ui-selectmenu-optgroup{font-size:1em;font-weight:bold;line-height:1.5;padding:2px 0.4em;margin:0.5em 0 0 0;height:auto;border:0}.ui-selectmenu-open{display:block}.ui-selectmenu-text{display:block;margin-right:20px;overflow:hidden;text-overflow:ellipsis}.ui-selectmenu-button.ui-button{text-align:left;white-space:nowrap;width:14em}.ui-selectmenu-icon.ui-icon{float:right;margin-top:0}.ui-slider{position:relative;text-align:left}.ui-slider .ui-slider-handle{position:absolute;z-index:2;width:1.2em;height:1.2em;cursor:default;-ms-touch-action:none;touch-action:none}.ui-slider .ui-slider-range{position:absolute;z-index:1;font-size:.7em;display:block;border:0;background-position:0 0}.ui-slider.ui-state-disabled .ui-slider-handle,.ui-slider.ui-state-disabled .ui-slider-range{filter:inherit}.ui-slider-horizontal{height:.8em}.ui-slider-horizontal 
.ui-slider-handle{top:-.3em;margin-left:-.6em}.ui-slider-horizontal .ui-slider-range{top:0;height:100%}.ui-slider-horizontal .ui-slider-range-min{left:0}.ui-slider-horizontal .ui-slider-range-max{right:0}.ui-slider-vertical{width:.8em;height:100px}.ui-slider-vertical .ui-slider-handle{left:-.3em;margin-left:0;margin-bottom:-.6em}.ui-slider-vertical .ui-slider-range{left:0;width:100%}.ui-slider-vertical .ui-slider-range-min{bottom:0}.ui-slider-vertical .ui-slider-range-max{top:0}.ui-spinner{position:relative;display:inline-block;overflow:hidden;padding:0;vertical-align:middle}.ui-spinner-input{border:none;background:none;color:inherit;padding:.222em 0;margin:.2em 0;vertical-align:middle;margin-left:.4em;margin-right:2em}.ui-spinner-button{width:1.6em;height:50%;font-size:.5em;padding:0;margin:0;text-align:center;position:absolute;cursor:default;display:block;overflow:hidden;right:0}.ui-spinner a.ui-spinner-button{border-top-style:none;border-bottom-style:none;border-right-style:none}.ui-spinner-up{top:0}.ui-spinner-down{bottom:0}.ui-tabs{position:relative;padding:.2em}.ui-tabs .ui-tabs-nav{margin:0;padding:.2em .2em 0}.ui-tabs .ui-tabs-nav li{list-style:none;float:left;position:relative;top:0;margin:1px .2em 0 0;border-bottom-width:0;padding:0;white-space:nowrap}.ui-tabs .ui-tabs-nav .ui-tabs-anchor{float:left;padding:.5em 1em;text-decoration:none}.ui-tabs .ui-tabs-nav li.ui-tabs-active{margin-bottom:-1px;padding-bottom:1px}.ui-tabs .ui-tabs-nav li.ui-tabs-active .ui-tabs-anchor,.ui-tabs .ui-tabs-nav li.ui-state-disabled .ui-tabs-anchor,.ui-tabs .ui-tabs-nav li.ui-tabs-loading .ui-tabs-anchor{cursor:text}.ui-tabs-collapsible .ui-tabs-nav li.ui-tabs-active .ui-tabs-anchor{cursor:pointer}.ui-tabs .ui-tabs-panel{display:block;border-width:0;padding:1em 1.4em;background:none}.ui-tooltip{padding:8px;position:absolute;z-index:9999;max-width:300px}body .ui-tooltip{border-width:2px}.ui-widget{font-family:Arial,Helvetica,sans-serif;font-size:1em}.ui-widget .ui-widget{font-size:1em}.ui-widget input,.ui-widget select,.ui-widget textarea,.ui-widget button{font-family:Arial,Helvetica,sans-serif;font-size:1em}.ui-widget.ui-widget-content{border:1px solid #c5c5c5}.ui-widget-content{border:1px solid #ddd;background:#fff;color:#333}.ui-widget-content a{color:#333}.ui-widget-header{border:1px solid #ddd;background:#e9e9e9;color:#333;font-weight:bold}.ui-widget-header a{color:#333}.ui-state-default,.ui-widget-content .ui-state-default,.ui-widget-header .ui-state-default,.ui-button,html .ui-button.ui-state-disabled:hover,html .ui-button.ui-state-disabled:active{border:1px solid #c5c5c5;background:#f6f6f6;font-weight:normal;color:#454545}.ui-state-default a,.ui-state-default a:link,.ui-state-default a:visited,a.ui-button,a:link.ui-button,a:visited.ui-button,.ui-button{color:#454545;text-decoration:none}.ui-state-hover,.ui-widget-content .ui-state-hover,.ui-widget-header .ui-state-hover,.ui-state-focus,.ui-widget-content .ui-state-focus,.ui-widget-header .ui-state-focus,.ui-button:hover,.ui-button:focus{border:1px solid #ccc;background:#ededed;font-weight:normal;color:#2b2b2b}.ui-state-hover a,.ui-state-hover a:hover,.ui-state-hover a:link,.ui-state-hover a:visited,.ui-state-focus a,.ui-state-focus a:hover,.ui-state-focus a:link,.ui-state-focus a:visited,a.ui-button:hover,a.ui-button:focus{color:#2b2b2b;text-decoration:none}.ui-visual-focus{box-shadow:0 0 3px 1px rgb(94,158,214)}.ui-state-active,.ui-widget-content .ui-state-active,.ui-widget-header 
.ui-state-active,a.ui-button:active,.ui-button:active,.ui-button.ui-state-active:hover{border:1px solid #003eff;background:#007fff;font-weight:normal;color:#fff}.ui-icon-background,.ui-state-active .ui-icon-background{border:#003eff;background-color:#fff}.ui-state-active a,.ui-state-active a:link,.ui-state-active a:visited{color:#fff;text-decoration:none}.ui-state-highlight,.ui-widget-content .ui-state-highlight,.ui-widget-header .ui-state-highlight{border:1px solid #dad55e;background:#fffa90;color:#777620}.ui-state-checked{border:1px solid #dad55e;background:#fffa90}.ui-state-highlight a,.ui-widget-content .ui-state-highlight a,.ui-widget-header .ui-state-highlight a{color:#777620}.ui-state-error,.ui-widget-content .ui-state-error,.ui-widget-header .ui-state-error{border:1px solid #f1a899;background:#fddfdf;color:#5f3f3f}.ui-state-error a,.ui-widget-content .ui-state-error a,.ui-widget-header .ui-state-error a{color:#5f3f3f}.ui-state-error-text,.ui-widget-content .ui-state-error-text,.ui-widget-header .ui-state-error-text{color:#5f3f3f}.ui-priority-primary,.ui-widget-content .ui-priority-primary,.ui-widget-header .ui-priority-primary{font-weight:bold}.ui-priority-secondary,.ui-widget-content .ui-priority-secondary,.ui-widget-header .ui-priority-secondary{opacity:.7;filter:Alpha(Opacity=70);font-weight:normal}.ui-state-disabled,.ui-widget-content .ui-state-disabled,.ui-widget-header .ui-state-disabled{opacity:.35;filter:Alpha(Opacity=35);background-image:none}.ui-state-disabled .ui-icon{filter:Alpha(Opacity=35)}.ui-icon{width:16px;height:16px}.ui-icon,.ui-widget-content .ui-icon{background-image:url("images/ui-icons_444444_256x240.png")}.ui-widget-header .ui-icon{background-image:url("images/ui-icons_444444_256x240.png")}.ui-state-hover .ui-icon,.ui-state-focus .ui-icon,.ui-button:hover .ui-icon,.ui-button:focus .ui-icon{background-image:url("images/ui-icons_555555_256x240.png")}.ui-state-active .ui-icon,.ui-button:active .ui-icon{background-image:url("images/ui-icons_ffffff_256x240.png")}.ui-state-highlight .ui-icon,.ui-button .ui-state-highlight.ui-icon{background-image:url("images/ui-icons_777620_256x240.png")}.ui-state-error .ui-icon,.ui-state-error-text .ui-icon{background-image:url("images/ui-icons_cc0000_256x240.png")}.ui-button .ui-icon{background-image:url("images/ui-icons_777777_256x240.png")}.ui-icon-blank{background-position:16px 16px}.ui-icon-caret-1-n{background-position:0 0}.ui-icon-caret-1-ne{background-position:-16px 0}.ui-icon-caret-1-e{background-position:-32px 0}.ui-icon-caret-1-se{background-position:-48px 0}.ui-icon-caret-1-s{background-position:-65px 0}.ui-icon-caret-1-sw{background-position:-80px 0}.ui-icon-caret-1-w{background-position:-96px 0}.ui-icon-caret-1-nw{background-position:-112px 0}.ui-icon-caret-2-n-s{background-position:-128px 0}.ui-icon-caret-2-e-w{background-position:-144px 0}.ui-icon-triangle-1-n{background-position:0 -16px}.ui-icon-triangle-1-ne{background-position:-16px -16px}.ui-icon-triangle-1-e{background-position:-32px -16px}.ui-icon-triangle-1-se{background-position:-48px -16px}.ui-icon-triangle-1-s{background-position:-65px -16px}.ui-icon-triangle-1-sw{background-position:-80px -16px}.ui-icon-triangle-1-w{background-position:-96px -16px}.ui-icon-triangle-1-nw{background-position:-112px -16px}.ui-icon-triangle-2-n-s{background-position:-128px -16px}.ui-icon-triangle-2-e-w{background-position:-144px -16px}.ui-icon-arrow-1-n{background-position:0 -32px}.ui-icon-arrow-1-ne{background-position:-16px 
-32px}.ui-icon-arrow-1-e{background-position:-32px -32px}.ui-icon-arrow-1-se{background-position:-48px -32px}.ui-icon-arrow-1-s{background-position:-65px -32px}.ui-icon-arrow-1-sw{background-position:-80px -32px}.ui-icon-arrow-1-w{background-position:-96px -32px}.ui-icon-arrow-1-nw{background-position:-112px -32px}.ui-icon-arrow-2-n-s{background-position:-128px -32px}.ui-icon-arrow-2-ne-sw{background-position:-144px -32px}.ui-icon-arrow-2-e-w{background-position:-160px -32px}.ui-icon-arrow-2-se-nw{background-position:-176px -32px}.ui-icon-arrowstop-1-n{background-position:-192px -32px}.ui-icon-arrowstop-1-e{background-position:-208px -32px}.ui-icon-arrowstop-1-s{background-position:-224px -32px}.ui-icon-arrowstop-1-w{background-position:-240px -32px}.ui-icon-arrowthick-1-n{background-position:1px -48px}.ui-icon-arrowthick-1-ne{background-position:-16px -48px}.ui-icon-arrowthick-1-e{background-position:-32px -48px}.ui-icon-arrowthick-1-se{background-position:-48px -48px}.ui-icon-arrowthick-1-s{background-position:-64px -48px}.ui-icon-arrowthick-1-sw{background-position:-80px -48px}.ui-icon-arrowthick-1-w{background-position:-96px -48px}.ui-icon-arrowthick-1-nw{background-position:-112px -48px}.ui-icon-arrowthick-2-n-s{background-position:-128px -48px}.ui-icon-arrowthick-2-ne-sw{background-position:-144px -48px}.ui-icon-arrowthick-2-e-w{background-position:-160px -48px}.ui-icon-arrowthick-2-se-nw{background-position:-176px -48px}.ui-icon-arrowthickstop-1-n{background-position:-192px -48px}.ui-icon-arrowthickstop-1-e{background-position:-208px -48px}.ui-icon-arrowthickstop-1-s{background-position:-224px -48px}.ui-icon-arrowthickstop-1-w{background-position:-240px -48px}.ui-icon-arrowreturnthick-1-w{background-position:0 -64px}.ui-icon-arrowreturnthick-1-n{background-position:-16px -64px}.ui-icon-arrowreturnthick-1-e{background-position:-32px -64px}.ui-icon-arrowreturnthick-1-s{background-position:-48px -64px}.ui-icon-arrowreturn-1-w{background-position:-64px -64px}.ui-icon-arrowreturn-1-n{background-position:-80px -64px}.ui-icon-arrowreturn-1-e{background-position:-96px -64px}.ui-icon-arrowreturn-1-s{background-position:-112px -64px}.ui-icon-arrowrefresh-1-w{background-position:-128px -64px}.ui-icon-arrowrefresh-1-n{background-position:-144px -64px}.ui-icon-arrowrefresh-1-e{background-position:-160px -64px}.ui-icon-arrowrefresh-1-s{background-position:-176px -64px}.ui-icon-arrow-4{background-position:0 -80px}.ui-icon-arrow-4-diag{background-position:-16px -80px}.ui-icon-extlink{background-position:-32px -80px}.ui-icon-newwin{background-position:-48px -80px}.ui-icon-refresh{background-position:-64px -80px}.ui-icon-shuffle{background-position:-80px -80px}.ui-icon-transfer-e-w{background-position:-96px -80px}.ui-icon-transferthick-e-w{background-position:-112px -80px}.ui-icon-folder-collapsed{background-position:0 -96px}.ui-icon-folder-open{background-position:-16px -96px}.ui-icon-document{background-position:-32px -96px}.ui-icon-document-b{background-position:-48px -96px}.ui-icon-note{background-position:-64px -96px}.ui-icon-mail-closed{background-position:-80px -96px}.ui-icon-mail-open{background-position:-96px -96px}.ui-icon-suitcase{background-position:-112px -96px}.ui-icon-comment{background-position:-128px -96px}.ui-icon-person{background-position:-144px -96px}.ui-icon-print{background-position:-160px -96px}.ui-icon-trash{background-position:-176px -96px}.ui-icon-locked{background-position:-192px -96px}.ui-icon-unlocked{background-position:-208px 
-96px}.ui-icon-bookmark{background-position:-224px -96px}.ui-icon-tag{background-position:-240px -96px}.ui-icon-home{background-position:0 -112px}.ui-icon-flag{background-position:-16px -112px}.ui-icon-calendar{background-position:-32px -112px}.ui-icon-cart{background-position:-48px -112px}.ui-icon-pencil{background-position:-64px -112px}.ui-icon-clock{background-position:-80px -112px}.ui-icon-disk{background-position:-96px -112px}.ui-icon-calculator{background-position:-112px -112px}.ui-icon-zoomin{background-position:-128px -112px}.ui-icon-zoomout{background-position:-144px -112px}.ui-icon-search{background-position:-160px -112px}.ui-icon-wrench{background-position:-176px -112px}.ui-icon-gear{background-position:-192px -112px}.ui-icon-heart{background-position:-208px -112px}.ui-icon-star{background-position:-224px -112px}.ui-icon-link{background-position:-240px -112px}.ui-icon-cancel{background-position:0 -128px}.ui-icon-plus{background-position:-16px -128px}.ui-icon-plusthick{background-position:-32px -128px}.ui-icon-minus{background-position:-48px -128px}.ui-icon-minusthick{background-position:-64px -128px}.ui-icon-close{background-position:-80px -128px}.ui-icon-closethick{background-position:-96px -128px}.ui-icon-key{background-position:-112px -128px}.ui-icon-lightbulb{background-position:-128px -128px}.ui-icon-scissors{background-position:-144px -128px}.ui-icon-clipboard{background-position:-160px -128px}.ui-icon-copy{background-position:-176px -128px}.ui-icon-contact{background-position:-192px -128px}.ui-icon-image{background-position:-208px -128px}.ui-icon-video{background-position:-224px -128px}.ui-icon-script{background-position:-240px -128px}.ui-icon-alert{background-position:0 -144px}.ui-icon-info{background-position:-16px -144px}.ui-icon-notice{background-position:-32px -144px}.ui-icon-help{background-position:-48px -144px}.ui-icon-check{background-position:-64px -144px}.ui-icon-bullet{background-position:-80px -144px}.ui-icon-radio-on{background-position:-96px -144px}.ui-icon-radio-off{background-position:-112px -144px}.ui-icon-pin-w{background-position:-128px -144px}.ui-icon-pin-s{background-position:-144px -144px}.ui-icon-play{background-position:0 -160px}.ui-icon-pause{background-position:-16px -160px}.ui-icon-seek-next{background-position:-32px -160px}.ui-icon-seek-prev{background-position:-48px -160px}.ui-icon-seek-end{background-position:-64px -160px}.ui-icon-seek-start{background-position:-80px -160px}.ui-icon-seek-first{background-position:-80px -160px}.ui-icon-stop{background-position:-96px -160px}.ui-icon-eject{background-position:-112px -160px}.ui-icon-volume-off{background-position:-128px -160px}.ui-icon-volume-on{background-position:-144px -160px}.ui-icon-power{background-position:0 -176px}.ui-icon-signal-diag{background-position:-16px -176px}.ui-icon-signal{background-position:-32px -176px}.ui-icon-battery-0{background-position:-48px -176px}.ui-icon-battery-1{background-position:-64px -176px}.ui-icon-battery-2{background-position:-80px -176px}.ui-icon-battery-3{background-position:-96px -176px}.ui-icon-circle-plus{background-position:0 -192px}.ui-icon-circle-minus{background-position:-16px -192px}.ui-icon-circle-close{background-position:-32px -192px}.ui-icon-circle-triangle-e{background-position:-48px -192px}.ui-icon-circle-triangle-s{background-position:-64px -192px}.ui-icon-circle-triangle-w{background-position:-80px -192px}.ui-icon-circle-triangle-n{background-position:-96px -192px}.ui-icon-circle-arrow-e{background-position:-112px 
-192px}.ui-icon-circle-arrow-s{background-position:-128px -192px}.ui-icon-circle-arrow-w{background-position:-144px -192px}.ui-icon-circle-arrow-n{background-position:-160px -192px}.ui-icon-circle-zoomin{background-position:-176px -192px}.ui-icon-circle-zoomout{background-position:-192px -192px}.ui-icon-circle-check{background-position:-208px -192px}.ui-icon-circlesmall-plus{background-position:0 -208px}.ui-icon-circlesmall-minus{background-position:-16px -208px}.ui-icon-circlesmall-close{background-position:-32px -208px}.ui-icon-squaresmall-plus{background-position:-48px -208px}.ui-icon-squaresmall-minus{background-position:-64px -208px}.ui-icon-squaresmall-close{background-position:-80px -208px}.ui-icon-grip-dotted-vertical{background-position:0 -224px}.ui-icon-grip-dotted-horizontal{background-position:-16px -224px}.ui-icon-grip-solid-vertical{background-position:-32px -224px}.ui-icon-grip-solid-horizontal{background-position:-48px -224px}.ui-icon-gripsmall-diagonal-se{background-position:-64px -224px}.ui-icon-grip-diagonal-se{background-position:-80px -224px}.ui-corner-all,.ui-corner-top,.ui-corner-left,.ui-corner-tl{border-top-left-radius:3px}.ui-corner-all,.ui-corner-top,.ui-corner-right,.ui-corner-tr{border-top-right-radius:3px}.ui-corner-all,.ui-corner-bottom,.ui-corner-left,.ui-corner-bl{border-bottom-left-radius:3px}.ui-corner-all,.ui-corner-bottom,.ui-corner-right,.ui-corner-br{border-bottom-right-radius:3px}.ui-widget-overlay{background:#aaa;opacity:.3;filter:Alpha(Opacity=30)}.ui-widget-shadow{-webkit-box-shadow:0 0 5px #666;box-shadow:0 0 5px #666} -------------------------------------------------------------------------------- /static/css/theme-mine.css: --------------------------------------------------------------------------------
1 | 
2 | :root{
3 | 
4 | --MAIN-TEXT-color:#323235; /* Color of text by default */
5 | --MAIN-TITLES-TEXT-color: #778ba5; /* Color of titles h2-h3-h4-h5 */
6 | --MAIN-LINK-color:#4881cd; /* Color of links */
7 | --MAIN-LINK-HOVER-color:#599af1; /* Color of hovered links */
8 | --MAIN-ANCHOR-color: #4881cd; /* Color of anchors on titles */
9 | 
10 | --MENU-HEADER-BG-color:#283e5b; /* Background color of menu header */
11 | --MENU-HEADER-BORDER-color:#435c7c; /* Color of menu header border */
12 | 
13 | --MENU-SEARCH-BG-color:#202c3c; /* Search field background color (by default borders + icons) */
14 | --MENU-SEARCH-BOX-color: #4d6584; /* Override search field border color */
15 | --MENU-SEARCH-BOX-ICONS-color: #4d6584; /* Override search field icons color */
16 | 
17 | --MENU-SECTIONS-ACTIVE-BG-color:#0a0c0e; /* Background color of the active section and its children */
18 | --MENU-SECTIONS-BG-color:#1c222a; /* Background color of other sections */
19 | --MENU-SECTIONS-LINK-color: #ccc; /* Color of links in menu */
20 | --MENU-SECTIONS-LINK-HOVER-color: #e6e6e6; /* Color of links in menu, when hovered */
21 | --MENU-SECTION-ACTIVE-CATEGORY-color: #777; /* Color of active category text */
22 | --MENU-SECTION-ACTIVE-CATEGORY-BG-color: #fff; /* Color of background for the active category (only) */
23 | 
24 | --MENU-VISITED-color: #33a1ff; /* Color of 'page visited' icons in menu */
25 | --MENU-SECTION-HR-color: #20272b; /* Color of <hr> separator in menu */
26 | 
27 | }
28 | 
29 | body {
30 | color: var(--MAIN-TEXT-color) !important;
31 | }
32 | 
33 | textarea:focus, input[type="email"]:focus, input[type="number"]:focus, input[type="password"]:focus, input[type="search"]:focus, input[type="tel"]:focus, input[type="text"]:focus, input[type="url"]:focus, input[type="color"]:focus, input[type="date"]:focus, input[type="datetime"]:focus, input[type="datetime-local"]:focus, input[type="month"]:focus, input[type="time"]:focus, input[type="week"]:focus, select[multiple=multiple]:focus {
34 | border-color: transparent;
35 | box-shadow: none;
36 | }
37 | 
38 | h2, h3, h4, h5 {
39 | color: var(--MAIN-TITLES-TEXT-color) !important;
40 | }
41 | 
42 | a {
43 | color: var(--MAIN-LINK-color);
44 | }
45 | 
46 | .anchor {
47 | color: var(--MAIN-ANCHOR-color);
48 | }
49 | 
50 | a:hover {
51 | color: var(--MAIN-LINK-HOVER-color);
52 | }
53 | 
54 | #sidebar ul li.visited > a .read-icon {
55 | color: var(--MENU-VISITED-color);
56 | }
57 | 
58 | #sidebar #footer {
59 | padding-top: 20px !important;
60 | }
61 | 
62 | #sidebar #footer h2.github-title {
63 | font-size: 20px;
64 | color: #fd9827 !important;
65 | margin: 10px 0px 5px;
66 | padding: 0px;
67 | font-weight: normal !important;
68 | margin-top: 10px;
69 | padding-top: 30px;
70 | border-top: 1px dotted #384657;
71 | }
72 | 
73 | #sidebar #footer h3.github-title {
74 | font-size: 14px;
75 | margin: 10px 0px 5px;
76 | padding: 0px;
77 | text-transform: uppercase;
78 | letter-spacing: .15px;
79 | }
80 | 
81 | #sidebar #footer h5.copyright, #sidebar #footer p.build-number {
82 | font-size: 10px;
83 | letter-spacing: .15px;
84 | line-height: 150% !important;
85 | }
86 | 
87 | #body a.highlight:after {
88 | display: block;
89 | content: "";
90 | height: 1px;
91 | width: 0%;
92 | -webkit-transition: width 0.5s ease;
93 | -moz-transition: width 0.5s ease;
94 | -ms-transition: width 0.5s ease;
95 | transition: width 0.5s ease;
96 | background-color: var(--MAIN-LINK-HOVER-color);
97 | }
98 | #sidebar {
99 | background-color: var(--MENU-SECTIONS-BG-color);
100 | }
101 | #sidebar #header-wrapper {
102 | background: var(--MENU-HEADER-BG-color);
103 | color: var(--MENU-SEARCH-BOX-color);
104 | border-color: var(--MENU-HEADER-BORDER-color);
105 | }
106 | #sidebar .searchbox {
107 | border-color: var(--MENU-SEARCH-BOX-color);
108 | background: var(--MENU-SEARCH-BG-color);
109 | }
110 | #sidebar ul.topics > li.parent, #sidebar ul.topics > li.active {
111 | background: var(--MENU-SECTIONS-ACTIVE-BG-color);
112 | }
113 | #sidebar .searchbox * {
114 | color: var(--MENU-SEARCH-BOX-ICONS-color);
115 | }
116 | 
117 | #sidebar a {
118 | color: var(--MENU-SECTIONS-LINK-color);
119 | }
120 | 
121 | #sidebar a:hover {
122 | color: var(--MENU-SECTIONS-LINK-HOVER-color);
123 | }
124 | 
125 | #sidebar ul li.active > a {
126 | background: var(--MENU-SECTION-ACTIVE-CATEGORY-BG-color);
127 | color: var(--MENU-SECTION-ACTIVE-CATEGORY-color) !important;
128 | }
129 | 
130 | #sidebar hr {
131 | border-color: var(--MENU-SECTION-HR-color);
132 | }
133 | 
134 | #navigation a.nav-prev, #navigation a.nav-next {
135 | color: #f19e39 !important;
136 | }
137 | 
138 | #navigation a.nav-prev:hover, #navigation a.nav-next:hover {
139 | color: #e07d04 !important;
140 | }
141 | 
142 | div.notices p:first-child:before {
143 | position: absolute;
144 | top: 2px;
145 | color: #fff;
146 | font-family: 'Font Awesome\ 5 Free';
147 | content: "\f06a";
148 | font-weight: 900; /* Fix version 5.0.9 */
149 | left: 10px;
150 | }
151 | 
152 | .ui-state-default, .ui-widget-content .ui-state-default, .ui-widget-header .ui-state-default, .ui-button, html .ui-button.ui-state-disabled:hover, html .ui-button.ui-state-disabled:active {
153 | border: 1px solid #dddddd;
154 | font-weight: normal;
155 | color: #454545;
156 | }
157 | 
158 | .ui-state-active, .ui-widget-content .ui-state-active, .ui-widget-header .ui-state-active, a.ui-button:active, .ui-button:active, .ui-button.ui-state-active:hover {
159 | border: 1px solid var(--MENU-HEADER-BG-color);
160 | background: var(--MENU-HEADER-BG-color);
161 | font-weight: normal;
162 | color: #fff;
163 | }
164 | 
165 | .ui-widget.ui-widget-content {
166 | border: 1px solid #eeeeee;
167 | }
168 | 
169 | .ui-widget-header {
170 | border: 1px solid #eeeeee;
171 | }
172 | 
173 | .hljs {
174 | background-color: transparent;
175 | }
176 | 
177 | pre {
178 | background-color: var(--MENU-SECTIONS-BG-color) !important;
179 | }
180 | 
181 | div.notices.info p {
182 | border-top: 30px solid #fd9827;
183 | background: #FFF2DB;
184 | }
185 | 
-------------------------------------------------------------------------------- /static/images/cleanup/sm_cleanup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/cleanup/sm_cleanup.png -------------------------------------------------------------------------------- /static/images/convert_script/distributed_script.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/convert_script/distributed_script.png -------------------------------------------------------------------------------- /static/images/convert_script/single_instance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/convert_script/single_instance.png -------------------------------------------------------------------------------- /static/images/eks/create_repo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/eks/create_repo.png -------------------------------------------------------------------------------- /static/images/eks/eksctl_launch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/eks/eksctl_launch.png -------------------------------------------------------------------------------- /static/images/eks/get_container.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/eks/get_container.png -------------------------------------------------------------------------------- /static/images/eks/job_yaml_container.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/eks/job_yaml_container.png -------------------------------------------------------------------------------- /static/images/eks/push_commands.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/eks/push_commands.png -------------------------------------------------------------------------------- /static/images/eks/subnet_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/eks/subnet_image.png -------------------------------------------------------------------------------- /static/images/eks/verify_eks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/eks/verify_eks.png -------------------------------------------------------------------------------- /static/images/eks/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/eks/workflow.png -------------------------------------------------------------------------------- /static/images/intro/approaches.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/intro/approaches.png -------------------------------------------------------------------------------- /static/images/intro/challenges.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/intro/challenges.png -------------------------------------------------------------------------------- /static/images/intro/containers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/intro/containers.png -------------------------------------------------------------------------------- /static/images/intro/containers_ecr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/intro/containers_ecr.png -------------------------------------------------------------------------------- /static/images/intro/forward_backward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/intro/forward_backward.png -------------------------------------------------------------------------------- /static/images/intro/home.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/intro/home.png -------------------------------------------------------------------------------- /static/images/intro/how_it_runs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/intro/how_it_runs.png -------------------------------------------------------------------------------- /static/images/intro/mlinfra.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/intro/mlinfra.png -------------------------------------------------------------------------------- /static/images/intro/parallel_distributed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/intro/parallel_distributed.png -------------------------------------------------------------------------------- /static/images/sagemaker/aws_console.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/sagemaker/aws_console.png -------------------------------------------------------------------------------- /static/images/sagemaker/sm_notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/sagemaker/sm_notebook.png -------------------------------------------------------------------------------- /static/images/sagemaker/tensorboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/sagemaker/tensorboard.png -------------------------------------------------------------------------------- /static/images/sagemaker/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/sagemaker/workflow.png -------------------------------------------------------------------------------- /static/images/setup/admin_attach.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/setup/admin_attach.png -------------------------------------------------------------------------------- /static/images/setup/attach_policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/setup/attach_policy.png 
-------------------------------------------------------------------------------- /static/images/setup/go_to_IAM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/setup/go_to_IAM.png -------------------------------------------------------------------------------- /static/images/setup/launch_jupyter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/setup/launch_jupyter.png -------------------------------------------------------------------------------- /static/images/setup/launch_terminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/setup/launch_terminal.png -------------------------------------------------------------------------------- /static/images/setup/notebook_iam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/setup/notebook_iam.png -------------------------------------------------------------------------------- /static/images/setup/setup_aws_console.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/setup/setup_aws_console.png -------------------------------------------------------------------------------- /static/images/setup/setup_create_notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/setup/setup_create_notebook.png -------------------------------------------------------------------------------- /static/images/setup/setup_fill_notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/setup/setup_fill_notebook.png -------------------------------------------------------------------------------- /static/images/setup/setup_notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/images/setup/setup_notebook.png -------------------------------------------------------------------------------- /static/tf-world-distributed-training-workshop.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/distributed-training-workshop/c3bf8bbf7ad4c04069b0b982e75ddaba837f9b94/static/tf-world-distributed-training-workshop.pdf --------------------------------------------------------------------------------
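
Note on the theme palette above: every color used by the site funnels through the CSS custom properties declared in the :root block of static/css/theme-mine.css, and the rules that follow only ever consume them via var(). Re-branding the site therefore means overriding those variables, not editing the individual rules. A minimal sketch of such an override, loaded in a stylesheet after theme-mine.css so it wins the cascade (the selector and variable names come from the file above; the two replacement colors are illustrative, not values the workshop ships):

:root {
    --MAIN-LINK-color: #e07d04;       /* illustrative: swap the base link color */
    --MAIN-LINK-HOVER-color: #f19e39; /* illustrative: swap the hover color */
}

Because existing rules such as "a { color: var(--MAIN-LINK-color); }" resolve the variable at use time, the override takes effect everywhere without touching the rest of the theme.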