3 | This repository contains the code, web pages, and config files accompanying the AWS Distributed Training Workshop.
4 |
5 |
6 | * [Workshop content](https://distributed-training-workshop.go-aws.com/)
7 |
8 | * [Presentation slides](static/tf-world-distributed-training-workshop.pdf)
9 |
--------------------------------------------------------------------------------
/archetypes/default.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "{{ replace .Name "-" " " | title }}"
3 | date: {{ .Date }}
4 | ---
5 |
--------------------------------------------------------------------------------
/config.toml:
--------------------------------------------------------------------------------
1 | baseURL = "https://distributed-training-workshop.go-aws.com"
2 | languageCode = "en-us"
3 | defaultContentLanguage = "en"
4 | title = "Distributed training with Amazon SageMaker / Amazon EKS Workshop"
5 | theme = "learn"
6 | uglyurls = true
7 | googleAnalytics = "UA-151135045-1"
8 | sectionPagesMenu = "main"
9 | pygmentsCodeFences = true
10 |
11 | [blackfriday]
12 | hrefTargetBlank = true
13 |
14 | [params]
15 | themeVariant = "mine"
16 | showVisitedLinks = false
17 | author = "Shashank Prasanna"
18 | description = "Distributed training workshop with Amazon SageMaker and Amazon EKS"
19 | disableSearch = false
20 | disableAssetsBusting = false
21 |
22 | disableInlineCopyToClipBoard = false
23 | disableShortcutsTitle = false
24 | disableLanguageSwitchingButton = false
25 | disableBreadcrumb = true
26 | disableNextPrev = true
27 | ordersectionsby = "weight"
28 |
29 | [[menu.shortcuts]]
30 | name = " @shshnkp"
31 | identifier = "tw"
32 | url = "https://twitter.com/shshnkp"
33 | weight = 1
34 |
35 | [outputs]
36 | home = [ "HTML", "AMP", "RSS", "JSON"]
37 | page = [ "HTML", "AMP"]
38 |
--------------------------------------------------------------------------------
/content/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Distributed Training Workshop"
3 | chapter: true
4 | weight: 1
5 | ---
6 |
7 | # Distributed Training Workshop
8 |
9 | ### Welcome to the distributed training workshop with TensorFlow on Amazon SageMaker and Amazon Elastic Kubernetes Service (EKS).
10 |
11 | #### **At the end of this workshop, you'll be able to:**
12 |
13 | #### - Identify when to consider distributed training
14 | #### - Describe different approaches to distributed training
15 | #### - Outline libraries and tools needed for distributing training workloads on large clusters
16 | #### - Demonstrate code changes required to go from single-GPU to multi-GPU distributed training
17 | #### - Demonstrate using Amazon SageMaker and Amazon EKS to run distributed training jobs
18 | #### - Apply these skills to your own deep learning problem
19 |
--------------------------------------------------------------------------------
/content/cleanup/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Cleanup"
3 | date: 2019-10-27T15:25:09-07:00
4 | chapter: true
5 | weight: 6
6 | ---
7 |
8 | # Clean up resources
9 | In this section, we'll walk through the steps to clean up resources.
10 |
11 |
--------------------------------------------------------------------------------
/content/cleanup/clean_resources.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Clean up resources"
3 | date: 2019-10-31T23:12:17-07:00
4 | ---
5 |
6 | ## Amazon EKS resources
7 |
8 | #### Kill all distributed training jobs
9 | ```
10 | kubectl delete MPIJobs --all
11 | ```
12 |
13 | #### Delete StorageClass, PersistentVolumeClaim and FSx for Lustre CSI Driver
14 | {{% notice tip %}}
15 | Note: This will automatically delete the FSx for Lustre file system. Your files are safe in Amazon S3.
16 | {{% /notice %}}
17 | ```
18 | kubectl delete -f specs/storage-class-fsx-s3.yaml
19 | kubectl delete -f specs/claim-fsx-s3.yaml
20 | kubectl delete -f https://raw.githubusercontent.com/kubernetes-sigs/aws-fsx-csi-driver/master/deploy/kubernetes/manifest.yaml
21 | ```
22 | #### Delete security group
23 | ```
24 | aws ec2 delete-security-group --group-id ${SECURITY_GROUP_ID}
25 | ```
26 |
27 | #### Delete policies attached to the instance role
28 | These policies were automatically added to the node IAM roles, but we'll need to manually remove them.
29 |
30 | * Copy the role associated with the worker instances
31 | ```
32 | echo $INSTANCE_ROLE_NAME
33 | ```
34 | * Navigate to IAM console
35 | * Click on Roles on the left pane
36 | * Search for the output of `echo $INSTANCE_ROLE_NAME`
37 | * Delete the two inline policies.
38 | * `iam_alb_ingress_policy`
39 | * `iam_csi_fsx_policy`
40 |
41 | #### Finally, delete the cluster
42 | ```
43 | eksctl delete cluster aws-tf-cluster-cpu
44 | ```
45 |
46 | ## SageMaker resources
47 | SageMaker resources are easier to clean up.
48 | Log in to the SageMaker console and click **Dashboard**.
49 | Make sure that you don't have any resources shown in **green** as below. Click on any resource that is shown in green and either stop or delete it.
50 |
51 | 
52 |
53 | ## Other resources
54 | It's always a good idea to ensure that no other workshop resources are left running.
55 |
56 |
--------------------------------------------------------------------------------
/content/intro/_index.md:
--------------------------------------------------------------------------------
1 | +++
2 | title = "Introduction"
3 | date = 2019-10-27T15:22:24-07:00
4 | weight = 1
5 | chapter = true
6 | +++
7 |
8 | # Introduction
9 | In a typical machine learning development workflow, there are two main stages where you can benefit from scaling out.
10 |
11 | 
12 |
13 | 1. Running large-scale parallel experiments: In this scenario our goal is to find the best model/hyperparameters/network architecture by exploring a space of possibilities.
14 | 1. Running distributed training of a single model: In this scenario our goal is to train a single model faster, by distributing its computation across nodes in a cluster.
15 |
16 | ### The focus of this workshop is distributed training of a single model
17 |
--------------------------------------------------------------------------------
/content/intro/addressing_challenges-1.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Addressing scaling challenges - Infrastructure management"
3 | date: 2019-10-28T21:07:47-07:00
4 | weight: 4
5 | ---
6 |
7 | ### Infrastructure management
8 |
9 | 
10 |
--------------------------------------------------------------------------------
/content/intro/addressing_challenges.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Addressing scaling challenges - software dependencies"
3 | date: 2019-10-28T21:02:29-07:00
4 | weight: 3
5 | ---
6 |
7 | ### Software dependencies
8 |
9 | Containers provide a consistent, lightweight, and portable environment that includes not just the training code but also its dependencies and configuration.
10 | 
11 |
12 | Simply package up your code and push it to a container registry.
13 | The container image can then be pulled into a cluster and run at scale.
14 | 
15 |
--------------------------------------------------------------------------------
/content/intro/challenges_solution.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Challenges with scaling machine learning"
3 | date: 2019-10-28T20:56:39-07:00
4 | weight: 2
5 | ---
6 | 
7 |
8 | There are two key challenges associated with scaling machine learning computation.
9 |
10 | 1. A development setup on a single computer or instance doesn't translate well when deploying to a cluster
11 | 2. Managing infrastructure is challenging for machine learning researchers, data scientists, and developers without an IT/ops background
12 |
--------------------------------------------------------------------------------
/content/intro/horovod.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Distributed training approaches"
3 | date: 2019-10-28T21:11:22-07:00
4 | weight: 5
5 | ---
6 | 
7 |
8 | ## Horovod
9 | [horovod.ai](https://horovod.ai)
10 |
11 | Horovod is based on the MPI concepts:
12 | size, rank, local rank, allreduce, allgather, and broadcast.
13 |
14 | * Library for distributed deep learning with support for multiple frameworks, including TensorFlow
15 | * Abstracts the infrastructure details away from ML engineers
16 | * Uses ring-allreduce and the Message Passing Interface (MPI), popular in the HPC community
17 | * Infrastructure services such as Amazon SageMaker and Amazon EKS provide the container and MPI environment
18 |
19 | 
20 |
21 | 1. Forward pass on each device
22 | 1. Backward pass to compute gradients
23 | 1. ”All reduce” (average and broadcast) gradients across devices
24 | 1. Update local variables with “all reduced” gradients
25 |
26 | Horovod runs the same copy of the training script on all hosts/servers/nodes/instances. For example, the following command launches 16 worker processes across 4 servers, 4 per server:
27 |
28 | 
29 |
30 | `horovodrun -np 16 -H server1:4,server2:4,server3:4,server4:4 python training_script.py`
31 |
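To make these steps concrete, here is a minimal sketch of what a Horovod-enabled tf.keras training script might look like. The model, data, and hyperparameter values are illustrative placeholders rather than the workshop's actual training script (covered in a later section), and it assumes one GPU per worker process:

```python
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
import horovod.tensorflow.keras as hvd

hvd.init()  # each process learns its rank and the total number of workers

# Pin this process to a single GPU (one worker process per GPU)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
K.set_session(tf.Session(config=config))

# Placeholder model and data; replace with your own
model = tf.keras.Sequential(
    [tf.keras.layers.Dense(10, activation='softmax', input_shape=(784,))])
x = np.random.rand(512, 784).astype('float32')
y = tf.keras.utils.to_categorical(np.random.randint(10, size=512), 10)

# Scale the learning rate by the number of workers, then wrap the optimizer
# so gradients are averaged across all workers with ring-allreduce
opt = tf.keras.optimizers.SGD(lr=0.01 * hvd.size(), momentum=0.9)
opt = hvd.DistributedOptimizer(opt)

model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

# Broadcast initial variables from rank 0 so all workers start from the same state
callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
# Save checkpoints only on rank 0 to prevent workers from corrupting them
if hvd.rank() == 0:
    callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

model.fit(x, y, batch_size=64, epochs=2, callbacks=callbacks,
          verbose=1 if hvd.rank() == 0 else 0)
```

Launched with `horovodrun` as shown above (or by SageMaker and Kubeflow in the following sections), the same script runs on every worker and Horovod handles the gradient exchange.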
--------------------------------------------------------------------------------
/content/kubernetes_dist_training/_index.md:
--------------------------------------------------------------------------------
1 | +++
2 | title = "Distributed Training with Amazon EKS"
3 | date = 2019-10-21T13:21:28-07:00
4 | weight = 5
5 | chapter = true
6 | #pre = "2. "
7 | +++
8 |
9 | # Distributed Training with Amazon EKS
10 |
11 | In this section, we’ll run distributed training on Amazon Elastic Kubernetes Service (Amazon EKS). Amazon EKS makes it easy to deploy, manage, and scale containerized applications using Kubernetes on AWS. To run deep learning workloads on Amazon EKS, we'll install Kubeflow. The Kubeflow project includes capabilities that make deployments of machine learning (ML) workflows on Kubernetes easy. With EKS and Kubeflow, you'll still need to manage the underlying CPU and GPU instances that form your cluster. EKS and Kubeflow make it easy to manage and schedule machine learning workloads on your cluster.
12 |
--------------------------------------------------------------------------------
/content/kubernetes_dist_training/build_container.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Build training container image and push it to ECR"
3 | date: 2019-10-28T16:51:02-07:00
4 | weight: 7
5 | ---
6 |
7 | #### Build a custom docker image with our training code
8 |
9 | In our Dockerfile we start with an AWS Deep Learning TensorFlow container and copy our training code into the container.
10 |
11 | ```
12 | cd ~/SageMaker/distributed-training-workshop/notebooks/part-3-kubernetes/
13 | cat Dockerfile.cpu
14 | ```
15 | `Dockerfile.cpu` Output:
16 | ```
17 | FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:1.14.0-cpu-py36-ubuntu16.04
18 | COPY code /opt/training/
19 | WORKDIR /opt/training
20 | ```
21 |
22 | {{% notice tip %}}
23 | Replace with `Dockerfile.gpu` if you're going to be running training on a GPU cluster.
24 | {{% /notice %}}
25 |
26 | #### Create a new Amazon ECR repository
27 |
28 | * Navigate to [ECR and create a new repository](https://console.aws.amazon.com/ecr/home)
29 | * Click create repository
30 | * Provide a repository name
31 | * Click create
32 |
33 | {{% notice tip %}}
34 | By clicking the **View push commands** button, you get the docker build and push commands, so you don't have to remember them.
35 | {{% /notice %}}
36 | 
37 | 
38 | #### Build and push the custom Docker container
39 |
40 | * Head over to the terminal on JupyterLab and log in to the AWS Deep Learning container registry
41 | ```
42 | $(aws ecr get-login --no-include-email --region us-west-2 --registry-ids 763104351884)
43 | ```
44 | * Run the `docker build` command in **Step 2** from the Docker push commands menu. Make sure to update it with the correct Dockerfile name for CPU or GPU:
45 | * For the CPU container: `docker build -t <repository-name> -f Dockerfile.cpu .`
46 | * For the GPU container: `docker build -t <repository-name> -f Dockerfile.gpu .`
47 | * Run the `docker tag` command in **Step 3** from the Docker push commands menu
48 |
49 | * Log in to your docker registry
50 | * `$(aws ecr get-login --no-include-email --region us-west-2)`
51 |
52 | * Run `docker push` command in **Step 4** from the Docker push commands menu
53 |
54 | {{% notice tip %}}
55 | What happened?
56 | (1) You first logged in to the AWS Deep Learning container registry in order to pull the deep learning container. (2) You then built your container. (3) After the container was built, you added the tag needed to push it to ECR. (4) You then logged in to your own registry. (5) Finally, you pushed the container to your registry.
57 |
58 | {{% /notice %}}
59 |
--------------------------------------------------------------------------------
/content/kubernetes_dist_training/fsx_lustre.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Enable Amazon FSx for Lustre access"
3 | date: 2019-10-28T16:20:52-07:00
4 | weight: 6
5 | ---
6 |
7 | Amazon FSx for Lustre provides a high-performance file system optimized for fast processing of workloads such as deep learning. An FSx for Lustre file system transparently presents S3 objects as files and allows you to write results back to S3.
8 |
9 | #### Install the FSx CSI Driver
10 | ```
11 | kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/aws-fsx-csi-driver/master/deploy/kubernetes/manifest.yaml
12 | ```
13 |
14 | #### Get the VPC ID of your EKS cluster
15 | ```
16 | VPC_ID=$(aws ec2 describe-vpcs --filters "Name=tag:Name,Values=eksctl-${CLUSTER_NAME}-cluster/VPC" --query "Vpcs[0].VpcId" --output text)
17 | ```
18 |
19 | #### Get subnet ID from the EC2 console
20 | Navigate to [AWS EC2 console](https://console.aws.amazon.com/ec2/v2/home) and click on **Instances**.
21 | Select one of the running instances whose name starts with the name of the EKS cluster. This instance is a node of the EKS cluster.
22 | Copy the subnet ID as shown in the image below. Click on the copy-to-clipboard icon shown next to the arrow.
23 |
24 | 
25 | Paste the subnet ID below:
26 | ```
27 | export SUBNET_ID=
28 | ```
29 |
30 | #### Create your security group for the FSx file system
31 | ```
32 | export SECURITY_GROUP_ID=$(aws ec2 create-security-group --group-name eks-fsx-security-group --vpc-id ${VPC_ID} --description "FSx for Lustre Security Group" --query "GroupId" --output text)
33 | ```
34 |
35 | {{% notice warning %}}
36 | **Stop:** Make sure that the security group was created before proceeding.
37 | Confirm by running `echo $SECURITY_GROUP_ID`. Don't proceed if this is empty.
38 | {{% /notice %}}
39 |
40 | #### Add an ingress rule that opens up port 988 from the 192.168.0.0/16 CIDR range
41 | ```
42 | aws ec2 authorize-security-group-ingress --group-id ${SECURITY_GROUP_ID} --protocol tcp --port 988 --cidr 192.168.0.0/16
43 | ```
44 |
45 | #### Update the environment variables in the storage class spec file
46 | Running `envsubst` will populate `SUBNET_ID`, `SECURITY_GROUP_ID`, and `BUCKET_NAME` in the spec:
47 | ```
48 | cd ~/SageMaker/distributed-training-workshop/notebooks/part-3-kubernetes/
49 |
50 | envsubst < specs/storage-class-fsx-s3-template.yaml > specs/storage-class-fsx-s3.yaml
51 | ```
52 |
53 | #### Deploy the StorageClass and PersistentVolumeClaim
54 | ```
55 | kubectl apply -f specs/storage-class-fsx-s3.yaml
56 | kubectl apply -f specs/claim-fsx-s3.yaml
57 | ```
58 |
59 | This will take several minutes. You can check the status by running the following command. Hit `Ctrl+C` if you don't want the terminal to be blocked. To check manually, run the command without the `-w` flag.
60 |
61 | ```
62 | kubectl get pvc fsx-claim -w
63 | ```
64 |
--------------------------------------------------------------------------------
/content/kubernetes_dist_training/install_cli.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Install CLI tools"
3 | date: 2019-10-28T15:02:28-07:00
4 | weight: 2
5 | ---
6 |
7 | Navigate to the following directory for part 3 of the workshop
8 | ```
9 | cd ~/SageMaker/distributed-training-workshop/notebooks/part-3-kubernetes/
10 | ```
11 |
12 |
13 | #### Install `eksctl`
14 |
15 | To get started, we'll first install the eksctl CLI tool. [eksctl](https://eksctl.io) simplifies the process of creating EKS clusters.
16 |
17 | ```bash
18 | pip install awscli --upgrade --user
19 | curl --silent --location "https://github.com/weaveworks/eksctl/releases/download/latest_release/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
20 |
21 | ```
22 |
23 | Move eksctl to /usr/local/bin so that it's on the PATH
24 |
25 | ```
26 | sudo mv /tmp/eksctl /usr/local/bin
27 | eksctl version
28 |
29 | ```
30 |
31 | #### Install `kubectl`
32 | kubectl is a command line interface for running commands against Kubernetes clusters. Run the following to install kubectl:
33 |
34 | ```bash
35 | curl -o kubectl https://amazon-eks.s3-us-west-2.amazonaws.com/1.14.6/2019-08-22/bin/linux/amd64/kubectl
36 | chmod +x ./kubectl
37 | sudo mv ./kubectl /usr/local/bin
38 | kubectl version --short --client
39 |
40 | ```
41 |
42 | #### Install `aws-iam-authenticator`
43 |
44 | ```
45 | curl -o aws-iam-authenticator https://amazon-eks.s3-us-west-2.amazonaws.com/1.14.6/2019-08-22/bin/linux/amd64/aws-iam-authenticator
46 |
47 | chmod +x ./aws-iam-authenticator
48 |
49 | sudo mv aws-iam-authenticator /usr/local/bin
50 | ```
51 |
--------------------------------------------------------------------------------
/content/kubernetes_dist_training/install_kubeflow.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Install Kubeflow"
3 | date: 2019-10-28T15:42:44-07:00
4 | weight: 5
5 | ---
6 |
7 | #### Download the kfctl CLI tool
8 |
9 | ```
10 | curl --silent --location https://github.com/kubeflow/kubeflow/releases/download/v0.7.0-rc.6/kfctl_v0.7.0-rc.5-7-gc66ebff3_linux.tar.gz | tar xz
11 |
12 | sudo mv kfctl /usr/local/bin
13 | ```
14 |
15 | #### Get the latest Kubeflow configuration file
16 |
17 | ```
18 | export CONFIG='https://raw.githubusercontent.com/kubeflow/manifests/v0.7-branch/kfdef/kfctl_aws.0.7.0.yaml'
19 | ```
20 |
21 | #### Create environment and local variables
22 |
23 | ```
24 | CLUSTER_NAME=$(eksctl get cluster --output=json | jq '.[0].name' --raw-output)
25 |
26 | INSTANCE_ROLE_NAME=$(eksctl get iamidentitymapping --name ${CLUSTER_NAME} --output=json | jq '.[0].rolearn' --raw-output | sed -e 's/.*\///')
27 | ```
28 |
29 | {{% notice warning %}}
30 | Make sure that both environment variables are set before proceeding.
31 | Confirm by running `echo $CLUSTER_NAME` and `echo $INSTANCE_ROLE_NAME`.
32 | Make sure that these are not empty.
33 | {{% /notice %}}
34 |
35 | Add your S3 bucket name below:
36 | ```
37 | export BUCKET_NAME=
38 | ```
39 |
40 | {{% notice warning %}}
41 | **Stop:** Verify that you have the correct bucket name before proceeding.
42 | {{% /notice %}}
43 |
44 | ```
45 | export KF_NAME=${CLUSTER_NAME}
46 | export KF_DIR=$PWD/${KF_NAME}
47 | ```
48 |
49 | #### Build your configuration files
50 | We'll edit the configuration with the right names for the cluster and node groups before deploying Kubeflow.
51 |
52 | ```
53 | mkdir -p ${KF_DIR}
54 | cd ${KF_DIR}
55 | kfctl build -V -f ${CONFIG}
56 | export CONFIG_FILE=${KF_DIR}/kfctl_aws.0.7.0.yaml
57 |
58 | ```
59 |
60 | #### Edit the configuration file to include the correct instance role name and cluster name
61 | ```
62 | sed -i "s@eksctl-kubeflow-aws-nodegroup-ng-a2-NodeInstanceRole-xxxxxxx@$INSTANCE_ROLE_NAME@" ${CONFIG_FILE}
63 |
64 | sed -i "s@kubeflow-aws@$CLUSTER_NAME@" ${CONFIG_FILE}
65 |
66 | ```
67 |
68 | #### Apply the changes and deploy Kubeflow
69 | ```
70 | cd ${KF_DIR}
71 | rm -rf kustomize/
72 | kfctl apply -V -f ${CONFIG_FILE}
73 | ```
74 |
75 | #### Wait for resources to become available
76 |
77 | Monitor progress by running the following command, which lists all resources in the `kubeflow` namespace.
78 | ```
79 | kubectl -n kubeflow get all
80 | ```
81 |
--------------------------------------------------------------------------------
/content/kubernetes_dist_training/setup_eks.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Setup an Amazon EKS cluster"
3 | date: 2019-10-28T15:14:12-07:00
4 | weight: 3
5 | ---
6 |
7 | Navigate to ***distributed-training-workshop > notebooks > part-3-kubernetes***
8 | 
9 | The `cpu_eks_cluster.sh` and `gpu_eks_cluster.sh` files include the necessary options to launch a CPU or GPU cluster. Take a look at the options by printing the contents of the script:
10 |
11 | ```bash
12 | cd ~/SageMaker/distributed-training-workshop/notebooks/part-3-kubernetes/
13 | cat cpu_eks_cluster.sh
14 | ```
15 | You should see the following output
16 | ```
17 | Output:
18 | eksctl create cluster \
19 | --name aws-tf-cluster-cpu \
20 | --version 1.14 \
21 | --region us-west-2 \
22 | --nodegroup-name cpu-nodes \
23 | --node-type c5.xlarge \
24 | --nodes 2 \
25 | --node-volume-size 50 \
26 | --node-zones us-west-2a \
27 | --timeout=40m \
28 | --zones=us-west-2a,us-west-2b,us-west-2c \
29 | --auto-kubeconfig
30 | ```
31 |
32 | {{% notice tip %}}
33 | To launch a cluster with GPUs, use the script `gpu_eks_cluster.sh` instead. If you wish to launch a cluster with more than 2 nodes, update the `--nodes` argument to the number of nodes you want in the cluster.
34 | {{% /notice %}}
35 |
36 | Now launch an EKS cluster:
37 | ```
38 | sh cpu_eks_cluster.sh
39 | ```
40 |
41 | You should see output similar to this.
42 |
43 | 
44 |
45 | Creating a cluster may take about 15 minutes. You can head over to the [AWS CloudFormation console](https://console.aws.amazon.com/cloudformation) to monitor the progress.
46 |
--------------------------------------------------------------------------------
/content/kubernetes_dist_training/submit_job.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Submit distributed training job"
3 | date: 2019-10-28T17:14:05-07:00
4 | weight: 8
5 | ---
6 | #### Confirm that you are in the right directory
7 | ```
8 | cd ~/SageMaker/distributed-training-workshop/notebooks/part-3-kubernetes/
9 | ```
10 | #### Copy the container image name
11 |
12 | 
13 |
14 |
15 | #### Update the MPIJob spec file
16 |
17 | Open `specs/eks_tf_training_job-cpu.yaml` and update `image: ` with the name of your container.
18 |
19 | 
20 |
21 | #### Submit a job run:
22 | ```
23 | kubectl apply -f specs/eks_tf_training_job-cpu.yaml
24 | ```
25 | {{% notice tip %}}
26 | For GPU jobs use this instead: `eks_tf_training_job-gpu.yaml`
27 | {{% /notice %}}
28 |
29 | You should see an output something like this:
30 | ```
31 | mpijob.kubeflow.org/eks-tf-distributed-training created
32 | ```
33 | Running `kubectl get pods` should show you a number of pods equal to the number of workers + 1 (the launcher).
34 |
35 | ```bash
36 | $ kubectl get pods
37 | NAME READY STATUS RESTARTS AGE
38 | eks-tf-distributed-training-launcher-6lgzg 1/1 Running 0 63s
39 | eks-tf-distributed-training-worker-0 1/1 Running 0 66s
40 | eks-tf-distributed-training-worker-1 1/1 Running 0 66s
41 | ```
42 |
43 | To observe the training logs, run `kubectl logs ` followed by the name of the launcher pod from the list. You can use tab completion or copy the name of the pod from the output of `kubectl get pods`.
44 |
45 | ```
46 | kubectl logs eks-tf-distributed-training-launcher-
47 | ```
48 |
49 | output:
50 | ```
51 | ...
52 | Epoch 1/30
53 | Epoch 1/30
54 | 3/78 [>.............................] - ETA: 4:05 - loss: 3.6816 - acc: 0.1172 3/724/78 [========>.....................] - ETA: 1:29 - loss: 2.7493 - acc: 0.161024/778/78 [==============================] - 128s 2s/step - loss: 2.1984 - acc: 0.2268 - val_loss: 2.1794 - val_acc: 0.1699
55 | Epoch 2/30
56 | 78/78 [==============================] - 129s 2s/step - loss: 2.2108 - acc: 0.2268 - val_loss: 2.1794 - val_acc: 0.1699
57 | Epoch 2/30
58 | ```
59 |
--------------------------------------------------------------------------------
/content/kubernetes_dist_training/verify_cluster.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Verify installation and test cluster"
3 | date: 2019-10-28T15:33:52-07:00
4 | weight: 4
5 | ---
6 |
7 | Once the cluster is up and running, you should see a message that your cluster is now ready.
8 | 
9 |
10 | Update the kubeconfig file to point to our new cluster.
11 | If you chose a different name for your cluster (other than aws-tf-cluster-cpu), be sure to use that name below.
12 |
13 | ```
14 | aws eks --region us-west-2 update-kubeconfig --name aws-tf-cluster-cpu
15 | ```
16 |
17 | Run the following to confirm that you can access the EKS cluster:
18 |
19 | You should see a list of kubernetes namespaces:
20 | ```
21 | kubectl get ns
22 | ```
23 | ```
24 | Output:
25 | NAME STATUS AGE
26 | default Active 12m
27 | kube-node-lease Active 13m
28 | kube-public Active 13m
29 | kube-system Active 13m
30 | ```
31 |
32 | You should see the total number of nodes in your cluster:
33 | ```
34 | kubectl get nodes
35 | ```
36 | ```
37 | Output:
38 | NAME STATUS ROLES AGE VERSION
39 | ip-192-168-10-211.us-west-2.compute.internal Ready 7m3s v1.14.7-eks-1861c5
40 | ip-192-168-10-229.us-west-2.compute.internal Ready 7m4s v1.14.7-eks-1861c5
41 | ```
42 |
--------------------------------------------------------------------------------
/content/kubernetes_dist_training/workflow.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Workflow"
3 | date: 2019-10-28T14:18:11-07:00
4 | weight: 1
5 | ---
6 |
7 | Navigate to
8 | ***distributed-training-workshop > notebooks > part-3-kubernetes***
9 | You should see the following files:
10 |
11 | ```bash
12 | part-3-kubernetes/
13 | ├── Dockerfile
14 | ├── cpu_eks_cluster.sh
15 | ├── gpu_eks_cluster.sh
16 | ├── code
17 | │   ├── cifar10-multi-gpu-horovod-k8s.py
18 | │   └── model_def.py
19 | └── specs
20 |     ├── claim-fsx-s3.yaml
21 |     ├── eks_tf_training_job.yaml
22 |     ├── fsx_lustre_policy.json
23 |     └── storage-class-fsx-s3-template.yaml
24 | ```
25 |
26 | |Files/directories|Description|
27 | |-----|-----|
28 | |Dockerfile | Use this to build a custom container image for training on Amazon EKS|
29 | |cpu_eks_cluster.sh, gpu_eks_cluster.sh |Shell scripts that use the `eksctl` CLI tool to launch an Amazon EKS cluster|
30 | |code|Contains the training script and other training script dependencies|
31 | |specs|List of spec files required to configure Kubeflow|
32 |
33 | 
34 |
35 | We'll first need to set up Amazon EKS and an Amazon FSx for Lustre file system, and install Kubeflow. This involves multiple steps, and we'll leverage various CLI tools to help install, configure, and interact with EKS. At a high level, we'll perform the following steps:
36 |
37 | 1. Install `eksctl` CLI and use it to launch an Amazon EKS cluster
38 | 1. Install `kubectl` CLI to interact with the Amazon EKS cluster
39 | 1. Install `kfctl` CLI and use it to configure and install Kubeflow
40 | 1. Allow Amazon EKS to access Amazon FSx for Lustre file system that's linked to an Amazon S3 bucket
41 | 1. Finally, launch a distributed training job
42 |
--------------------------------------------------------------------------------
/content/sagemaker_dist_training/_index.md:
--------------------------------------------------------------------------------
1 | +++
2 | title = "Distributed Training with Amazon Sagemaker"
3 | date = 2019-10-21T13:21:01-07:00
4 | weight = 4
5 | chapter = true
6 | # pre = "1. "
7 | +++
8 |
9 | # Distributed training with Amazon SageMaker
10 |
11 | In this section, we'll run distributed training on Amazon SageMaker. We'll provide SageMaker with our updated training script that uses the horovod API, and SageMaker will take care of the rest: spinning up the requested number of CPU or GPU instances, copying the training code and dependencies to the training cluster, copying the dataset from Amazon S3 to the training cluster, keeping track of training progress, and shutting down the instances once training is done. Amazon SageMaker is a fully managed service, so you don't have to worry about managing instances.
12 |
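As a preview, the core of the notebook covered in the next pages boils down to a few lines of the SageMaker Python SDK. This is a condensed sketch, not the full notebook: the bucket name is a placeholder, and the script name and instance settings mirror the ones used later.

```python
import sagemaker
from sagemaker.tensorflow import TensorFlow

role = sagemaker.get_execution_role()

# Horovod workers are launched via MPI: 2 instances x 1 process per instance
distributions = {'mpi': {'enabled': True, 'processes_per_host': 1}}

estimator = TensorFlow(entry_point='cifar10-multi-gpu-horovod-sagemaker.py',
                       source_dir='code',
                       role=role,
                       framework_version='1.14',
                       py_version='py3',
                       train_instance_count=2,
                       train_instance_type='ml.c5.xlarge',
                       distributions=distributions)

# Point each channel at the dataset in S3 and kick off the managed training job
estimator.fit({'train': 's3://<your-bucket>/cifar10-dataset/train',
               'validation': 's3://<your-bucket>/cifar10-dataset/validation',
               'eval': 's3://<your-bucket>/cifar10-dataset/eval'})
```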
--------------------------------------------------------------------------------
/content/sagemaker_dist_training/monitoring_results.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Monitoring training progress"
3 | date: 2019-10-28T13:54:27-07:00
4 | weight: 4
5 | ---
6 |
7 | ### Monitoring training progress using tensorboard
8 |
9 | The ***cifar10-sagemaker-distributed.ipynb*** notebook will automatically start a TensorBoard server for you when you run the following cell. TensorBoard runs locally on your Jupyter notebook instance, but reads the events from the Amazon S3 bucket where we saved them using the Keras callback.
10 |
11 | ```bash
12 | !S3_REGION=us-west-2 tensorboard --logdir s3://{bucket_name}/tensorboard_logs/
13 | ```
14 |
15 | Navigate to https://tfworld2019.notebook.us-west-2.sagemaker.aws/proxy/6006/
16 |
17 | Replace `tfworld2019` with the name of your Jupyter notebook instance.
18 | 
19 |
20 | ### Monitoring training job status on the AWS SageMaker console
21 |
22 | Navigate to ***AWS management console > SageMaker console*** to see a full list of training jobs and their status.
23 |
24 | 
25 |
26 | To view cloudwatch logs from the training instances, click on the ***training job name > Monitor > View logs***
27 |
--------------------------------------------------------------------------------
/content/sagemaker_dist_training/sagemaker_training.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "SageMaker distributed training"
3 | date: 2019-10-28T13:17:44-07:00
4 | weight: 3
5 | ---
6 |
7 | Open `cifar10-sagemaker-distributed.ipynb` and run through the cells. The following notebook is located at:
8 | ***distributed-training-workshop > notebooks > part-2-sagemaker > cifar10-sagemaker-distributed.ipynb***
9 |
10 | 
11 |
12 | {{% notice warning %}}
13 | **Stop:** Do this section on JupyterLab. Below is a copy of the jupyter notebook for reference.
14 | {{% /notice %}}
15 |
16 | ----
17 |
18 |
19 | ## Distributed training with Amazon SageMaker
20 |
21 | In this notebook we use the SageMaker Python SDK to setup and run a distributed training job.
22 | SageMaker makes it easy to train models across a cluster containing a large number of machines, without having to explicitly manage those resources.
23 |
24 | **Step 1:** Import essential packages, start a SageMaker session, and specify the bucket name you created in the prerequisites section of this workshop.
25 |
26 |
27 | ```python
28 | import os
29 | import time
30 | import numpy as np
31 | import sagemaker
32 |
33 | sagemaker_session = sagemaker.Session()
34 | role = sagemaker.get_execution_role()
35 | bucket_name = 'tfworld2019-'
36 | ```
37 |
38 | **Step 2:** Specify the hyperparameters, instance type, and number of instances to distribute training across. `hvd_processes_per_host` corresponds to the number of GPUs per instance.
39 | For example, if you choose:
40 | ```
41 | hvd_instance_type = 'ml.p3.8xlarge'
42 | hvd_instance_count = 2
43 | hvd_processes_per_host = 4
44 | ```
45 |
46 | Since a p3.8xlarge instance has 4 GPUs, we'll be distributing training across 8 workers, 1 per GPU.
47 | This is spread across 2 instances (or nodes). SageMaker automatically takes care of spinning up these instances and making sure they can communicate with each other.
48 |
49 |
50 | ```python
51 | hyperparameters = {'epochs': 100,
52 |                    'learning-rate': 0.001,
53 |                    'momentum': 0.9,
54 |                    'weight-decay': 2e-4,
55 |                    'optimizer': 'adam',
56 |                    'batch-size': 256}
57 |
58 | hvd_instance_type = 'ml.c5.xlarge'
59 | hvd_instance_count = 2
60 | hvd_processes_per_host = 1
61 |
62 | print('Distributed training with a total of {} workers'.format(hvd_processes_per_host*hvd_instance_count))
63 | print('{} x {} instances with {} processes per instance'.format(hvd_instance_count, hvd_instance_type, hvd_processes_per_host))
64 | ```
65 |
66 | **Step 3:** In this cell we create a SageMaker estimator by providing it with all the information it needs to launch instances and execute training on those instances.
67 | 
68 | Since we're using horovod for distributed training, we set `distributions` to use MPI, which horovod relies on.
69 | 
70 | In the TensorFlow estimator call, we specify the training script under `entry_point` and its dependencies under `source_dir` (the `code` directory). SageMaker automatically copies these files into a TensorFlow container behind the scenes and executes them on the training instances.
71 |
72 |
73 | ```python
74 | from sagemaker.tensorflow import TensorFlow
75 |
76 | output_path = 's3://{}/'.format(bucket_name)
77 | job_name = 'sm-dist-{}x{}-workers'.format(hvd_instance_count, hvd_processes_per_host) + time.strftime('%Y-%m-%d-%H-%M-%S-%j', time.gmtime())
78 | model_dir = output_path + 'tensorboard_logs/' + job_name
79 |
80 | distributions = {'mpi': {
81 |                     'enabled': True,
82 |                     'processes_per_host': hvd_processes_per_host,
83 |                     'custom_mpi_options': '-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none'
84 |                     }
85 |                  }
86 | 
87 | estimator_hvd = TensorFlow(base_job_name='hvd-cifar10-tf',
88 |                            source_dir='code',
89 |                            entry_point='cifar10-multi-gpu-horovod-sagemaker.py',
90 |                            role=role,
91 |                            framework_version='1.14',
92 |                            py_version='py3',
93 |                            hyperparameters=hyperparameters,
94 |                            train_instance_count=hvd_instance_count,
95 |                            train_instance_type=hvd_instance_type,
96 |                            output_path=output_path,
97 |                            model_dir=model_dir,
98 |                            tags=[{'Key': 'Project', 'Value': 'cifar10'}, {'Key': 'TensorBoard', 'Value': 'dist'}],
99 |                            metric_definitions=[{'Name': 'val_acc', 'Regex': 'val_acc: ([0-9\\.]+)'}],
100 |                            distributions=distributions)
101 | ```
102 |
103 | **Step 4:** Specify dataset locations in Amazon S3 and then call the fit function.
104 |
105 |
106 | ```python
107 | train_path = 's3://{}/cifar10-dataset/train'.format(bucket_name)
108 | val_path = 's3://{}/cifar10-dataset/validation'.format(bucket_name)
109 | eval_path = 's3://{}/cifar10-dataset/eval/'.format(bucket_name)
110 |
111 | estimator_hvd.fit({'train': train_path,'validation': val_path,'eval': eval_path},
112 | job_name=job_name, wait=False)
113 | ```
114 |
115 | **Step 5:** Monitor progress with TensorBoard. Run the following cell to launch TensorBoard, then open the link below in a new tab to visualize training progress.
116 |
117 |
118 | ```python
119 | !S3_REGION=us-west-2 tensorboard --logdir s3://{bucket_name}/tensorboard_logs/
120 | ```
121 |
122 | Open a new browser tab and navigate to the following link to access TensorBoard:
123 | https://tfworld2019.notebook.us-west-2.sagemaker.aws/proxy/6006/
124 | Make sure that the name of the notebook instance is correct in the link above.
125 | Don't forget the trailing slash at the end of the URL (`.../proxy/6006/`).
126 |
--------------------------------------------------------------------------------
/content/sagemaker_dist_training/training_scrip_updates.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Updates required to run on SageMaker"
3 | date: 2019-10-28T13:42:13-07:00
4 | weight: 2
5 | ---
6 |
7 | There are a few minor changes required to run a training script on Amazon SageMaker.
8 |
9 |
10 | ##### SageMaker hyperparameters
11 | * SageMaker passes hyperparameters to the training script as command-line arguments, so your script must be able to parse them. For example, a hyperparameter entry `'epochs': 30` arrives as `--epochs 30` on the command line.
12 |
13 | ##### SageMaker environment variables
14 | * SageMaker makes several environment variables available inside the container that a training script can use to find the location of the training dataset, the number of GPUs in the instance, the dataset channels, and more. A full list of environment variables can be found in the [SageMaker containers GitHub repository](https://github.com/aws/sagemaker-containers#important-environment-variables).
15 |
16 | ```
17 | parser = argparse.ArgumentParser()
18 |
19 | # Hyper-parameters
20 | parser.add_argument('--epochs', type=int, default=15)
21 | parser.add_argument('--learning-rate', type=float, default=0.001)
22 | parser.add_argument('--batch-size', type=int, default=256)
23 | parser.add_argument('--weight-decay', type=float, default=2e-4)
24 | parser.add_argument('--momentum', type=float, default=0.9)
25 | parser.add_argument('--optimizer', type=str, default='adam')
26 |
27 | # SageMaker parameters
28 | parser.add_argument('--model_dir', type=str)
29 | parser.add_argument('--model_output_dir', type=str, default=os.environ['SM_MODEL_DIR'])
30 | parser.add_argument('--output_data_dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
31 |
32 | # Data directories and other options
33 | parser.add_argument('--gpu-count', type=int, default=os.environ['SM_NUM_GPUS'])
34 | parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
35 | parser.add_argument('--validation', type=str, default=os.environ['SM_CHANNEL_VALIDATION'])
36 | parser.add_argument('--eval', type=str, default=os.environ['SM_CHANNEL_EVAL'])
37 |
38 | args = parser.parse_args()
39 | ```
40 |
41 | ##### (Optional) TensorBoard callback for real-time monitoring of training
42 | * Using a Keras callback, we can upload TensorBoard event files to Amazon S3 so that we can monitor progress in real time.
43 | TensorBoard already comes installed on the SageMaker JupyterLab instance and supports reading event files from Amazon S3.
44 |
45 | `tensorboard --logdir s3://{bucket_name}/tensorboard_logs/`
46 |
47 | ```
48 | class Sync2S3(tf.keras.callbacks.Callback):
49 |     def __init__(self, logdir, s3logdir):
50 |         super(Sync2S3, self).__init__()
51 |         self.logdir = logdir
52 |         self.s3logdir = s3logdir
53 | 
54 |     def on_epoch_end(self, batch, logs={}):
55 |         os.system('aws s3 sync '+self.logdir+' '+self.s3logdir)
56 |         # append ' >/dev/null 2>&1' above to silence the sync output
57 | ```
58 |
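A sketch of how this callback might be attached during training; the local log directory is a placeholder, and `bucket_name`, `model`, and the training data are assumed to be defined earlier in the script:

```python
logdir = '/tmp/tb_logs'                                      # local TensorBoard event files
s3logdir = 's3://{}/tensorboard_logs/'.format(bucket_name)   # synced copy in S3

callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir=logdir),  # writes event files locally
    Sync2S3(logdir=logdir, s3logdir=s3logdir),       # pushes them to S3 after each epoch
]

model.fit(x_train, y_train,
          validation_data=(x_val, y_val),
          epochs=args.epochs,
          callbacks=callbacks)
```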
--------------------------------------------------------------------------------
/content/sagemaker_dist_training/workflow.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Workflow"
3 | date: 2019-10-28T12:59:15-07:00
4 | weight: 1
5 | ---
6 | Navigate to
7 | ***distributed-training-workshop > notebooks > part-2-sagemaker***
8 | You should see the following files:
9 |
10 | ```bash
11 | part-2-sagemaker/
12 | ├── cifar10-sagemaker-distributed.ipynb
13 | └── code
14 |     ├── cifar10-multi-gpu-horovod-sagemaker.py
15 |     └── model_def.py
16 | ```
17 |
18 | |Files/directories|Description|
19 | |-----|-----|
20 | |cifar10-sagemaker-distributed.ipynb |This jupyter notebook contains code to define and kick off a SageMaker training job|
21 | |code |This directory contains the training script and other training script dependencies|
22 |
23 | 
24 |
25 | SageMaker is a fully-managed service, which means that when you kick off a training job using the SageMaker SDK in the `cifar10-sagemaker-distributed.ipynb` notebook, a few different things happen behind the scenes:
26 |
27 | * SageMaker spins up the requested number of instances in a fully-managed SageMaker cluster
28 | * SageMaker pulls the latest (or specified) version of the TensorFlow container image, instantiates it on the new instances, and loads the contents of the `code` directory into the container
29 | * SageMaker runs the training script on each instance. Since we're running distributed training, SageMaker launches an MPI job with the right settings so that workers can communicate with each other.
30 | * SageMaker copies the dataset over from Amazon S3 and makes it available inside the container for training
31 | * SageMaker monitors the training and updates progress on the Amazon SageMaker console
32 | * SageMaker copies all the code and model artifacts to Amazon S3 after the training is finished
33 |
34 | In addition, SageMaker does a lot more to ensure that the jobs run optimally and you get the best performance out of the box. As a user, you don't have to worry about managing machine learning infrastructure.
35 |
--------------------------------------------------------------------------------
/content/setup/_index.md:
--------------------------------------------------------------------------------
1 | +++
2 | title = "Prerequisites"
3 | date = 2019-10-21T13:30:44-07:00
4 | weight = 2
5 | chapter = true
6 | #pre = "0. "
7 | +++
8 |
9 | # Getting Started
10 | In this section, we'll set up our development environment.
11 | We'll be using an Amazon SageMaker notebook instance which is a fully managed compute instance running the Jupyter Notebook server.
12 |
--------------------------------------------------------------------------------
/content/setup/add_admin_policy.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Update notebook IAM role"
3 | date: 2019-10-27T23:41:36-07:00
4 | weight: 3
5 | ---
6 |
7 | ### Give your notebook instances admin privileges
8 | {{% notice warning %}}
9 | **Note:** We're providing admin privileges to the SageMaker notebook instance only because we'll be using the same instance to launch an Amazon EKS cluster in the later part of the workshop. If you're only going to be using a SageMaker managed cluster for training, an S3 full access policy should suffice.
10 | {{% /notice %}}
11 |
12 | * Click on the **tfworld2019** instance and you'll see additional details about it. Click on the IAM role link; this should take you to the IAM Management Console. Once there, click the **Attach policies** button.
13 | 
14 | 
15 | * Select **AdministratorAccess** and click on **Attach policy**
16 | 
17 | * Close the IAM Management Console window and head back to the SageMaker console.
18 |
--------------------------------------------------------------------------------
/content/setup/download_workshop.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Download the workshop content"
3 | date: 2019-10-28T00:14:06-07:00
4 | weight: 4
5 | ---
6 | ### Launch JupyterLab client and clone the workshop repository
7 | * Your notebook instance should now be ready. Click *JupyterLab* to launch your client.
8 | 
9 |
10 | * Click *File > New > Terminal* to launch terminal in your JupyterLab instance.
11 | 
12 |
13 | * Download the workshop code and notebooks. Enter bash (optional), change directory to ~/SageMaker, and clone the repository:
14 | ```bash
15 | bash
16 | cd ~/SageMaker
17 | git clone https://github.com/shashankprasanna/distributed-training-workshop.git
18 | ```
19 |
20 | * Confirm that you're able to see the contents. You should see 3 parts:
21 | ```
22 | ls distributed-training-workshop/notebooks
23 | ```
24 |
--------------------------------------------------------------------------------
/content/setup/sm_jupyter_instance.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Launch a SageMaker notebook instance"
3 | date: 2019-10-27T22:39:43-07:00
4 | weight: 2
5 | ---
6 |
7 | {{% notice info %}}
8 | **Note:** In this workshop, we'll be using an Amazon SageMaker notebook instance for simplicity and convenience. You can use any local client to perform the steps detailed in this and subsequent sections. You'll just need to make sure you have the right privileges to access AWS services such as SageMaker, EKS, S3, ECR, and others from your client. You'll also need the AWS Command Line Interface (AWS CLI), Python, boto3, and the SageMaker SDK installed. The SageMaker Jupyter notebook, on the other hand, is preconfigured and ready to use.
9 | {{% /notice %}}
10 |
11 | ### Launch an Amazon SageMaker notebook instance
12 |
13 | * Open the [AWS Management Console](https://console.aws.amazon.com/console/home)
14 | {{% notice info %}}
15 | **Note:** This workshop has been tested on the US West (Oregon) (us-west-2) region. Make sure that you see **Oregon** on the top right hand corner of your AWS Management Console. If you see a different region, click the dropdown menu and select US West (Oregon)
16 | {{% /notice %}}
17 |
18 | * In the AWS Console search bar, type SageMaker and select Amazon SageMaker to open the service console.
19 | 
20 | * Click on Notebook Instances
21 | 
22 | * From the Amazon SageMaker > Notebook instances page, select Create notebook instance.
23 | 
24 | * In the Notebook instance name text box, enter a name for the notebook instance.
25 | * For this workshop select **"tfworld2019"** as the instance name
26 | * Choose ml.c5.xlarge. We'll only be using this instance to launch jobs. The training jobs themselves will run either on a SageMaker managed cluster or an Amazon EKS cluster
27 | * Set the volume size to 50 GB - this is only needed for building docker containers. During training, data is copied directly from Amazon S3 to the training cluster when using SageMaker. When using Amazon EKS, we'll set up an FSx for Lustre file system that worker nodes will use to access the training data.
28 | 
29 | * To create an IAM role, from the IAM role drop-down list, select Create a new role. In the Create an IAM role dialog box, select Any S3 bucket. Then select **Create role**. Amazon SageMaker creates a role named **AmazonSageMaker-ExecutionRole-\***.
30 | 
31 | * Keep the default settings for the other options and click Create notebook instance. In the **Notebook instances** section, you should see the status change from *Pending* to *InService*
32 | * While the notebook instance spins up, continue to work on the next section, and we'll come back and launch the instance when it's ready.
33 |
--------------------------------------------------------------------------------
/content/update_code_dist_training/_index.md:
--------------------------------------------------------------------------------
1 | +++
2 | title = "Prepare your training scripts"
3 | date = 2019-10-28T00:51:31-07:00
4 | weight = 3
5 | chapter = true
6 | +++
7 |
8 | # Prepare your training scripts
9 |
10 | In this section, we'll walk through the process of modifying an existing TensorFlow-Keras training script so that it can perform training in a distributed environment.
11 |
--------------------------------------------------------------------------------
/content/update_code_dist_training/distributed_training_script.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Using horovod API for distributed training"
3 | date: 2019-10-28T02:47:30-07:00
4 | weight: 4
5 | ---
6 |
7 | ## Exercise 1: Convert training script to use horovod
8 |
9 | In this section, you'll update the training script with the horovod API to run distributed training.
10 |
11 | Open `cifar10-distributed.ipynb` and run through the cells. The following notebook is located at:
12 | ***distributed-training-workshop > notebooks > part-1-horovod***
13 |
14 | 
15 |
16 | {{% notice warning %}}
17 | **Stop:** Do this section on JupyterLab. Below is a copy of the jupyter notebook for reference.
18 | Open `cifar10-distributed.ipynb` and run these cells.
19 | Look for cells that say **Change X** and fill in those cells with the modifications - where **X** is the change number. There are a total of 8 changes.
20 | Click on **> Solution** to see the answers
21 | {{% /notice %}}
22 |
23 | You'll need to make the following modifications to your training script to use horovod for distributed training.
24 |
25 | 1. Run hvd.init()
26 | 2. Pin a server GPU to be used by this process using config.gpu_options.visible_device_list.
27 | 3. Scale the learning rate by the number of workers.
28 | 4. Wrap the optimizer in hvd.DistributedOptimizer.
29 | 5. Add hvd.callbacks.BroadcastGlobalVariablesCallback(0) to broadcast initial variable states from rank 0 to all other processes.
30 | 6. Modify your code to save checkpoints only on worker 0 to prevent other workers from corrupting them.
31 |
32 |
33 |
34 | #### Change 1: Import horovod and keras backend
35 |
36 |
37 | ```python
38 | import tensorflow as tf
39 |
40 |
41 |
42 | ```
43 |
44 | Solution
45 |
46 | import horovod.tensorflow.keras as hvd
47 | import tensorflow.keras.backend as K
48 |
200 |
201 |
202 |
203 | ```python
204 | model = get_model(lr, weight_decay, optimizer, momentum)
205 | ```
206 |
207 | #### Change 4: How will you update the learning rate for distributed training? What changes should you make to the following command?
208 |
209 |
210 | ```python
211 | opt = SGD(lr=lr, decay=weight_decay, momentum=momentum)
212 | ```
213 |
214 | Solution
215 |
216 | opt = SGD(lr=lr * size, decay=weight_decay, momentum=momentum)
217 |
218 | You need to scale the learning rate by the size of the cluster (the total number of workers). For example, with 8 workers and a base learning rate of 0.001, each worker would use a learning rate of 0.008.
219 |
220 |
221 |
222 | #### Change 6: How will you convert the optimizer to a distributed optimizer?
223 |
224 |
225 | ```python
226 | model.compile(loss='categorical_crossentropy',
227 | optimizer=opt,
228 | metrics=['accuracy'])
229 | ```
230 |
231 | Solution
232 |
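Based on modification 4 in the list above, a sketch of the expected solution is to wrap the existing optimizer with Horovod's distributed optimizer before compiling the model (the notebook's solution cell may differ slightly):

```python
opt = hvd.DistributedOptimizer(opt)

model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])
```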