├── .env
├── .gitignore
├── .util
    ├── cloud9.cfn
    └── resize.sh
├── 1-create-cluster
    ├── .gitignore
    ├── 1-1-configure.sh
    ├── 1-2-install-tools.sh
    ├── 1-3-create-cluster.sh
    ├── 1-4-deploy-packages.sh
    ├── eks.yaml.template
    ├── etcd
    │   ├── deploy.sh
    │   ├── etcd-deployment.yaml
    │   └── remove.sh
    └── kubeflow-training-operator
    │   ├── clusterrole-hpa-access.yaml
    │   ├── clusterrolebinding-training-operator-hpa-access.yaml
    │   ├── deploy.sh
    │   └── remove.sh
├── 2-create-volume
    ├── .gitignore
    ├── 2-1-create-efs.sh
    ├── 2-2-create-pvc.sh
    ├── efs-pv.yaml.template
    ├── efs-pvc.yaml
    └── efs-sc.yaml.template
├── 3-build-container
    ├── 3-1-build.sh
    ├── 3-2-push.sh
    ├── Dockerfile-cpu
    ├── Dockerfile-gpu
    ├── cifar10-model-test.py
    ├── cifar10-model-train.py
    ├── cnn_model.py
    ├── data-prep.sh
    └── utils.py
├── 4-get-data
    ├── .gitignore
    ├── 4-1-get-data.sh
    ├── 4-2-show-status.sh
    ├── 4-3-show-log.sh
    └── efs-data-copy.yaml.template
├── 5-train-model
    ├── .gitignore
    ├── 5-1-generate-pytorchjob.sh
    ├── 5-2-launch-pytorchjob.sh
    ├── 5-3-show-status.sh
    ├── 5-4-show-utilization.sh
    ├── 5-5-show-logs.sh
    ├── 5-6-delete-pytorchjob.sh
    ├── cleanup.yaml.template
    └── train.yaml.template
├── 6-test-model
    ├── .gitignore
    ├── 6-1-generate-job.sh
    ├── 6-2-launch-job.sh
    ├── 6-3-show-status.sh
    ├── 6-4-show-log.sh
    ├── 6-5-delete-job.sh
    └── test.yaml.template
├── 7-cleanup
    ├── 7-1-delete-efs.sh
    └── 7-2-delete-cluster.sh
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── SETUP.md
├── THIRD-PARTY-LICENSES
└── img
    ├── aws-console-cloud9-link.png
    ├── aws-console-my-account.png
    ├── aws-console-signin-iam-user.png
    ├── aws-console-signin.png
    ├── cloud9-configure-settings.png
    ├── cloud9-credentials-dialog.png
    ├── cloud9-credentials-disable.png
    ├── cloud9-ide-manage-ec2.png
    ├── cloud9-instance-storage.png
    ├── cloud9-landing-page.png
    ├── cloud9-managed-credentials.png
    ├── cloud9-modify-volume.png
    ├── cloud9-name-environment.png
    ├── cloud9-reboot-instance.png
    ├── cloud9-resized-volume.png
    ├── cloud9-volume-actions.png
    ├── cloud9-volume-optimizing.png
    ├── iam-add-group.png
    ├── iam-add-user-access-key.png
    ├── iam-add-user-admins-group.png
    ├── iam-add-user.png
    ├── iam-create-group.png
    ├── step-1-create-cluster.png
    ├── step-2-create-volume.png
    ├── step-3-build-container.png
    ├── step-4-get-data.png
    ├── step-5-train-model.png
    ├── step-6-test-model.png
    ├── step-7-cleanup.png
    └── workshop-architecture.png


/.env:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | export AWS_PROFILE=workshop
 4 | export CLUSTER_NAME=do-eks
 5 | export REGION=us-west-2
 6 | export AZ1=us-west-2a
 7 | export AZ2=us-west-2b
 8 | export NODE_TYPE=c5.4xlarge
 9 | export NODE_COUNT=2
10 | # PROCESSOR - target processor for training, PROCESSOR=cpu(default)|gpu
11 | export PROCESSOR=cpu
12 | # CPU_LIMIT - number of CPUs per node to use
13 | export CPU_LIMIT=15
14 | # GPU_LIMIT - number of GPUs per node to use. Must be 0 if PROCESSOR=cpu
15 | export GPU_LIMIT=0
16 | export ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
17 | export REGISTRY=${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com/
18 | export IMAGE=pytorch-${PROCESSOR}
19 | export TAG=:latest
20 | export MOUNT_PATH=/efs-shared
21 | export EPOCHS=10
22 | export BATCH_SIZE=128
23 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | 


--------------------------------------------------------------------------------
/.util/cloud9.cfn:
--------------------------------------------------------------------------------
  1 | Parameters:
  2 |   LabName:
  3 |     Type: String
  4 |     AllowedPattern: ".+"
  5 |     Default: "Workshop"
  6 |   VolSizeGB:
  7 |     Type: String
  8 |     AllowedPattern: "[0-9]+"
  9 |     Default: "100" 
 10 | 
 11 | Resources:
 12 |   Cloud9Lab:
 13 |     Type: AWS::Cloud9::EnvironmentEC2
 14 |     Properties:
 15 |       AutomaticStopTimeMinutes: 1440
 16 |       ImageId: amazonlinux-2-x86_64
 17 |       InstanceType: "c5.4xlarge"
 18 |       Name: !Ref LabName
 19 |       OwnerArn: !Sub 'arn:aws:sts::${AWS::AccountId}:assumed-role/TeamRole/MasterKey'
 20 | 
 21 |   Cloud9Role:
 22 |     Type: AWS::IAM::Role
 23 |     Properties:
 24 |       AssumeRolePolicyDocument:
 25 |         Version: "2012-10-17"
 26 |         Statement:
 27 |           - Effect: Allow
 28 |             Principal:
 29 |               Service:
 30 |                 - ec2.amazonaws.com
 31 |             Action:
 32 |               - 'sts:AssumeRole'
 33 |       ManagedPolicyArns:
 34 |         - arn:aws:iam::aws:policy/AdministratorAccess
 35 |       MaxSessionDuration: 28800
 36 |       Path: /
 37 |       RoleName: 'workshop-admin'
 38 | 
 39 |   Cloud9InstanceProfile:
 40 |     Type: 'AWS::IAM::InstanceProfile'
 41 |     Properties:
 42 |       Path: /
 43 |       Roles:
 44 |         - !Ref Cloud9Role
 45 | 
 46 |   LambdaRole:
 47 |     Type: AWS::IAM::Role
 48 |     Properties:
 49 |       AssumeRolePolicyDocument:
 50 |         Version: "2012-10-17"
 51 |         Statement:
 52 |           - Effect: Allow
 53 |             Principal:
 54 |               Service:
 55 |                 - lambda.amazonaws.com
 56 |             Action:
 57 |               - 'sts:AssumeRole'
 58 |       ManagedPolicyArns:
 59 |         - arn:aws:iam::aws:policy/AmazonEC2FullAccess
 60 |         - arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole
 61 |         - arn:aws:iam::aws:policy/CloudWatchLogsFullAccess
 62 |         - arn:aws:iam::aws:policy/IAMFullAccess
 63 |       MaxSessionDuration: 3600
 64 |       Path: /
 65 |       RoleName: 'hpcworkshop-lambda'
 66 | 
 67 | 
 68 |   LambdaCloud9:
 69 |     DependsOn: Cloud9Lab
 70 |     Type: AWS::Lambda::Function
 71 |     Properties:
 72 |       Environment:
 73 |         Variables:
 74 |           LAB_NAME: !Ref LabName
 75 |           VOL_SIZE_GB: !Ref VolSizeGB
 76 |       Architectures:
 77 |         - 'x86_64'
 78 |       Code:
 79 |         ZipFile: |
 80 |           #!/usr/bin/env python3
 81 | 
 82 |           import boto3
 83 |           import cfnresponse
 84 |           import os
 85 |           import json
 86 |           import time
 87 | 
 88 |           client = boto3.Session().client('ec2')
 89 | 
 90 |           def lambda_handler(event, context):
 91 |               responseValue = event['ResourceProperties']['ProfileArn']
 92 |               responseData = {}
 93 |               responseData['Data'] = responseValue
 94 |               cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData, "CustomResourcePhysicalID")
 95 |               main(responseValue)
 96 |               return {'statusCode': 200, 'body': json.dumps('Cloud9')}
 97 | 
 98 |           def get_modification_state(volume_id):
 99 |               resp = client.describe_volumes_modifications(
100 |                   VolumeIds=[
101 |                       volume_id
102 |                   ]
103 |               ) 
104 |               return resp['VolumesModifications'][0]['ModificationState']
105 | 
106 |           def main(instance_profile_arn):
107 |               response = client.describe_instances(Filters=[
108 |                   {
109 |                       'Name': 'tag:Name',
110 |                       'Values': [
111 |                           'aws-cloud9-' + os.environ.get('LAB_NAME','Workshop') + '-*',
112 |                       ]
113 |                   },
114 |                   {
115 |                   'Name': 'instance-state-name',
116 |                   'Values': ["pending", "running"]
117 |                   }
118 |               ])
119 | 
120 |               ec2 = boto3.resource('ec2')
121 | 
122 |               instance_id = response['Reservations'][0]['Instances'][0]['InstanceId']
123 |               volume_id = response['Reservations'][0]['Instances'][0]['BlockDeviceMappings'][0]['Ebs']['VolumeId']
124 |               IamInstanceProfile = {'Name': instance_profile_arn}
125 |               instance = ec2.Instance(instance_id)
126 |               instance.wait_until_running()
127 |               response = client.describe_iam_instance_profile_associations(Filters=[
128 |                   {
129 |                       'Name': 'instance-id',
130 |                       'Values': [
131 |                           instance_id,
132 |                       ]
133 |                   },
134 |               ])
135 |               if len(response['IamInstanceProfileAssociations']) > 0:
136 |                   instance_profile_association_id = response[
137 |                       'IamInstanceProfileAssociations'][0]['AssociationId']
138 |                   response = client.replace_iam_instance_profile_association(
139 |                       IamInstanceProfile=IamInstanceProfile,
140 |                       AssociationId=instance_profile_association_id)
141 |               else:
142 |                   response = client.associate_iam_instance_profile(
143 |                       IamInstanceProfile=IamInstanceProfile, InstanceId=instance_id)
144 |                       
145 |               # Modify volume size
146 |               volume_size_str = os.environ.get('VOL_SIZE_GB', '100')
147 |               volume_size = int(volume_size_str)
148 |               modify_volume_response = client.modify_volume(VolumeId=volume_id,Size=volume_size)
149 |               while True:
150 |                   state = get_modification_state(volume_id)
151 |                   if state == 'completed' or state == None or state == 'optimizing':
152 |                       break
153 |                   elif state == 'failed':
154 |                       raise Exception('Failed to modify volume size')
155 |                   else:
156 |                       time.sleep(15)
157 |                       
158 |               # Reboot ec2 instance so the new volume size takes effect
159 |               reboot_instance_reponse = client.reboot_instances(
160 |                 InstanceIds=[instance_id]
161 |               )              
162 | 
163 |       Handler: index.lambda_handler
164 |       MemorySize: 128
165 |       PackageType: 'Zip'
166 |       Role: !GetAtt LambdaRole.Arn
167 |       Runtime: 'python3.9'
168 |       Timeout: 300
169 | 
170 |   Primerinvoke:
171 |     Type: AWS::CloudFormation::CustomResource
172 |     DependsOn: LambdaCloud9
173 |     Version: "1.0"
174 |     Properties:
175 |       ServiceToken: !GetAtt LambdaCloud9.Arn
176 |       ProfileArn: !Ref Cloud9InstanceProfile
177 | 
178 | Outputs:
179 |   Cloud9URl:
180 |     Description: URL of your AWS Cloud9 Instance
181 |     Value: !Join ['', ['https://', !Ref 'AWS::Region','.console.aws.amazon.com/cloud9/ide/', !Ref Cloud9Lab ] ]
182 | 
183 | 
184 | 


--------------------------------------------------------------------------------
/.util/resize.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Specify the desired volume size in GiB as a command line argument. If not specified, default to 20 GiB.
 4 | SIZE=${1:-20}
 5 | 
 6 | # Get the ID of the environment host Amazon EC2 instance.
 7 | INSTANCEID=$(curl http://169.254.169.254/latest/meta-data/instance-id)
 8 | REGION=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone | sed 's/\(.*\)[a-z]/\1/')
 9 | 
10 | # Get the ID of the Amazon EBS volume associated with the instance.
11 | VOLUMEID=$(aws ec2 describe-instances \
12 |   --instance-id $INSTANCEID \
13 |   --query "Reservations[0].Instances[0].BlockDeviceMappings[0].Ebs.VolumeId" \
14 |   --output text \
15 |   --region $REGION)
16 | 
17 | # Resize the EBS volume.
18 | aws ec2 modify-volume --volume-id $VOLUMEID --size $SIZE
19 | 
20 | # Wait for the resize to finish.
21 | while [ \
22 |   "$(aws ec2 describe-volumes-modifications \
23 |     --volume-id $VOLUMEID \
24 |     --filters Name=modification-state,Values="optimizing","completed" \
25 |     --query "length(VolumesModifications)"\
26 |     --output text)" != "1" ]; do
27 | sleep 1
28 | done
29 | 
30 | #Check if we're on an NVMe filesystem
31 | if [[ -e "/dev/xvda" && $(readlink -f /dev/xvda) = "/dev/xvda" ]]
32 | then
33 |   # Rewrite the partition table so that the partition takes up all the space that it can.
34 |   sudo growpart /dev/xvda 1
35 | 
36 |   # Expand the size of the file system.
37 |   # Check if we're on AL2
38 |   STR=$(cat /etc/os-release)
39 |   SUB="VERSION_ID=\"2\""
40 |   if [[ "$STR" == *"$SUB"* ]]
41 |   then
42 |     sudo xfs_growfs -d /
43 |   else
44 |     sudo resize2fs /dev/xvda1
45 |   fi
46 | 
47 | else
48 |   # Rewrite the partition table so that the partition takes up all the space that it can.
49 |   sudo growpart /dev/nvme0n1 1
50 | 
51 |   # Expand the size of the file system.
52 |   # Check if we're on AL2
53 |   STR=$(cat /etc/os-release)
54 |   SUB="VERSION_ID=\"2\""
55 |   if [[ "$STR" == *"$SUB"* ]]
56 |   then
57 |     sudo xfs_growfs -d /
58 |   else
59 |     sudo resize2fs /dev/nvme0n1p1
60 |   fi
61 | fi
62 | 


--------------------------------------------------------------------------------
/1-create-cluster/.gitignore:
--------------------------------------------------------------------------------
1 | eks.yaml
2 | 


--------------------------------------------------------------------------------
/1-create-cluster/1-1-configure.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | . ../.env
 4 | 
 5 | echo ""
 6 | echo "Configuring AWS client ..."
 7 | aws configure --profile $AWS_PROFILE
 8 | 
 9 | echo ""
10 | echo "Generating cluster configuration eks.yaml ..."
11 | cat eks.yaml.template | envsubst > eks.yaml
12 | 
13 | 


--------------------------------------------------------------------------------
/1-create-cluster/1-2-install-tools.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Install tools
 4 | 
 5 | # eksctl
 6 | echo ""
 7 | echo "Installing eksctl ..."
 8 | curl --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
 9 | 
10 | sudo mv /tmp/eksctl /usr/local/bin
11 | eksctl version
12 | 
13 | # kubectl
14 | echo ""
15 | echo "Installing kubectl ..."
16 | curl -O https://s3.us-west-2.amazonaws.com/amazon-eks/1.26.2/2023-03-17/bin/linux/amd64/kubectl
17 | chmod +x ./kubectl
18 | sudo mv ./kubectl /usr/local/bin
19 | kubectl version --client
20 | 
21 | # kubectx
22 | echo ""
23 | echo "Installing kubectx ..."
24 | pushd /tmp
25 | git clone https://github.com/ahmetb/kubectx
26 | sudo mv kubectx /opt
27 | sudo ln -s /opt/kubectx/kubectx /usr/local/bin/kubectx
28 | sudo ln -s /opt/kubectx/kubens /usr/local/bin/kubens
29 | popd
30 | 
31 | # kubetail
32 | echo ""
33 | echo "Installing kubetail ..."
34 | curl -o /tmp/kubetail https://raw.githubusercontent.com/johanhaleby/kubetail/master/kubetail
35 | chmod +x /tmp/kubetail
36 | sudo mv /tmp/kubetail /usr/local/bin/kubetail
37 | 
38 | # kubeshell
39 | echo ""
40 | echo "Installing kubeshell ..."
41 | curl -LO https://github.com/kvaps/kubectl-node-shell/raw/master/kubectl-node_shell
42 | chmod +x ./kubectl-node_shell
43 | sudo mv ./kubectl-node_shell /usr/local/bin/kubectl-node_shell
44 | 
45 | # jq
46 | echo ""
47 | echo "Installing jq ..."
48 | sudo yum install -y jq
49 | 
50 | # yq
51 | echo ""
52 | echo "Installing yq ..."
53 | pip3 install yq
54 | 
55 | # Set up aliases
56 | echo ""
57 | echo "Setting up aliases ..."
58 | cat << EOF >> ~/.bashrc
59 | alias ll='ls -alh --color=auto'
60 | alias k='kubectl'
61 | alias kc='kubectx'
62 | alias kn='kubens'
63 | alias kt='kubetail'
64 | alias ks='kubectl node-shell'
65 | EOF
66 | 
67 | echo ""
68 | echo "Done setting up tools."
69 | echo ""
70 | 
71 | 
72 | 


--------------------------------------------------------------------------------
/1-create-cluster/1-3-create-cluster.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | . ../.env
 4 | 
 5 | echo ""
 6 | echo "Creating EKS cluster ..."
 7 | echo ""
 8 | echo "... using configuration from ./eks.yaml ..."
 9 | echo ""
10 | cat ./eks.yaml
11 | echo ""
12 | date
13 | CMD="eksctl create cluster -f ./eks.yaml"
14 | echo "${CMD}"
15 | ${CMD}
16 | echo ""
17 | date
18 | echo "Done creating EKS cluster"
19 | 
20 | echo ""
21 | echo "Updating kubeconfig ..."
22 | aws eks update-kubeconfig --name $CLUSTER_NAME
23 | echo ""
24 | 
25 | echo ""
26 | echo "Displaying cluster nodes ..."
27 | kubectl get nodes
28 | 
29 | 


--------------------------------------------------------------------------------
/1-create-cluster/1-4-deploy-packages.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Deploy Kuberntes Packages
 4 | 
 5 | # Metrics server
 6 | echo ""
 7 | echo "Deploying Kubernetes Metrics Server ..."
 8 | kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml
 9 | 
10 | # Kubeflow Training Operator
11 | echo ""
12 | echo "Deploying Kubeflow Training Operator ..."
13 | pushd ./kubeflow-training-operator
14 | ./deploy.sh
15 | popd
16 | 
17 | # Etcd
18 | echo ""
19 | echo "Deploying etcd ..."
20 | kubectl apply -f etcd/etcd-deployment.yaml
21 | 
22 | 


--------------------------------------------------------------------------------
/1-create-cluster/eks.yaml.template:
--------------------------------------------------------------------------------
 1 | apiVersion: eksctl.io/v1alpha5
 2 | kind: ClusterConfig
 3 | 
 4 | metadata:
 5 |   name: ${CLUSTER_NAME}
 6 |   version: "1.26"
 7 |   region: ${REGION}
 8 | 
 9 | availabilityZones:
10 |   - ${AZ1}
11 |   - ${AZ2}
12 | 
13 | iam:
14 |   withOIDC: true
15 | 
16 | managedNodeGroups:
17 |   - name: wks-node
18 |     instanceType: ${NODE_TYPE}
19 |     instancePrefix: workshop
20 |     privateNetworking: true
21 |     availabilityZones: ["${AZ1}","${AZ2}"]
22 |     efaEnabled: false
23 |     minSize: 0
24 |     desiredCapacity: ${NODE_COUNT}
25 |     maxSize: 10
26 |     volumeSize: 900
27 |     iam:
28 |       withAddonPolicies:
29 |         cloudWatch: true
30 |         autoScaler: true
31 |         ebs: true
32 | 


--------------------------------------------------------------------------------
/1-create-cluster/etcd/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | kubectl apply -f ./etcd-deployment.yaml
4 | 
5 | 


--------------------------------------------------------------------------------
/1-create-cluster/etcd/etcd-deployment.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: etcd-service
 5 |   #namespace: elastic-job
 6 | spec:
 7 |   ports:
 8 |     - name: etcd-client-port
 9 |       port: 2379
10 |       protocol: TCP
11 |       targetPort: 2379
12 |   selector:
13 |     app: etcd
14 | 
15 | ---
16 | apiVersion: apps/v1
17 | kind: Deployment
18 | metadata:
19 |   labels:
20 |     app: etcd
21 |   name: etcd
22 |   #namespace: elastic-job
23 | spec:
24 |   replicas: 1
25 |   selector:
26 |     matchLabels:
27 |       app: etcd
28 |   template:
29 |     metadata:
30 |       labels:
31 |         app: etcd
32 |     spec:
33 |       containers:
34 |         - name: etcd
35 |           command: ["/usr/local/bin/etcd"]
36 |           args:
37 |             - "--data-dir"
38 |             - "/var/lib/etcd"
39 |             - "--enable-v2"
40 |             - "--listen-client-urls"
41 |             - "http://0.0.0.0:2379"
42 |             - "--advertise-client-urls"
43 |             - "http://0.0.0.0:2379"
44 |             - "--initial-cluster-state"
45 |             - "new"
46 |           image: quay.io/coreos/etcd:latest
47 |           ports:
48 |             - containerPort: 2379
49 |               name: client
50 |               protocol: TCP
51 |             - containerPort: 2380
52 |               name: server
53 |               protocol: TCP
54 |       restartPolicy: Always
55 | 


--------------------------------------------------------------------------------
/1-create-cluster/etcd/remove.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | kubectl delete -f ./etcd-deployment.yaml
4 | 
5 | 


--------------------------------------------------------------------------------
/1-create-cluster/kubeflow-training-operator/clusterrole-hpa-access.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRole
 3 | metadata:
 4 |   # "namespace" omitted since ClusterRoles are not namespaced
 5 |   name: hpa-access
 6 | rules:
 7 | - apiGroups: ["autoscaling"]
 8 |   #
 9 |   # at the HTTP level, the name of the resource for accessing Secret
10 |   # objects is "secrets"
11 |   resources: ["horizontalpodautoscalers"]
12 |   verbs: ["get", "watch", "list", "create", "delete"]
13 | 


--------------------------------------------------------------------------------
/1-create-cluster/kubeflow-training-operator/clusterrolebinding-training-operator-hpa-access.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | # This cluster role binding allows anyone in the "manager" group to read secrets in any namespace.
 3 | kind: ClusterRoleBinding
 4 | metadata:
 5 |   name: training-operator-hpa-access
 6 | subjects:
 7 | - kind: ServiceAccount
 8 |   name: training-operator # Name is case sensitive
 9 |   namespace: kubeflow
10 |   apiGroup: ""
11 | roleRef:
12 |   kind: ClusterRole
13 |   name: hpa-access
14 |   apiGroup: rbac.authorization.k8s.io
15 | 


--------------------------------------------------------------------------------
/1-create-cluster/kubeflow-training-operator/deploy.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Deploy Kubeflow training operator
 4 | 
 5 | kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.7.0"
 6 | 
 7 | # Configure RBAC resources
 8 | 
 9 | kubectl apply -f ./clusterrole-hpa-access.yaml
10 | 
11 | kubectl apply -f ./clusterrolebinding-training-operator-hpa-access.yaml
12 | 
13 | 


--------------------------------------------------------------------------------
/1-create-cluster/kubeflow-training-operator/remove.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Remove RBAC resources
 4 | 
 5 | kubectl delete -f ./clusterrolebinding-training-operator-hpa-access.yaml
 6 | 
 7 | kubectl delete -f ./clusterrole-hpa-access.yaml
 8 | 
 9 | # Remove Kubeflow training operator
10 | 
11 | kubectl delete -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.5.0"
12 | 
13 | 


--------------------------------------------------------------------------------
/2-create-volume/.gitignore:
--------------------------------------------------------------------------------
1 | efs-pv.yaml
2 | efs-sc.yaml
3 | 


--------------------------------------------------------------------------------
/2-create-volume/2-1-create-efs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | . ../.env
 4 | 
 5 | # This script follows the following eks workshop
 6 | # https://www.eksworkshop.com/beginner/190_efs/launching-efs/
 7 | 
 8 | # Assume the cluster name is the first cluster in the list
 9 | echo ""
10 | echo 'Cluster name: ' $CLUSTER_NAME
11 | VPC_ID=$(aws eks describe-cluster --name $CLUSTER_NAME --query "cluster.resourcesVpcConfig.vpcId" --output text)
12 | CIDR_BLOCK=$(aws ec2 describe-vpcs --vpc-ids $VPC_ID --query "Vpcs[].CidrBlock" --output text)
13 | 
14 | echo 'VPC: ' $VPC_ID
15 | echo 'CIDR: ' $CIDR_BLOCK
16 | 
17 | echo ""
18 | echo "Creating security group ..."
19 | MOUNT_TARGET_GROUP_NAME="eks-efs-group-${CLUSTER_NAME}"
20 | MOUNT_TARGET_GROUP_DESC="NFS access to EFS from EKS worker nodes"
21 | aws ec2 create-security-group --group-name $MOUNT_TARGET_GROUP_NAME --description "$MOUNT_TARGET_GROUP_DESC" --vpc-id $VPC_ID
22 | sleep 5
23 | 
24 | MOUNT_TARGET_GROUP_ID=$(aws ec2 describe-security-groups --filter Name=vpc-id,Values=$VPC_ID Name=group-name,Values=$MOUNT_TARGET_GROUP_NAME --query 'SecurityGroups[*].[GroupId]' --output text)
25 | echo $MOUNT_TARGET_GROUP_NAME $MOUNT_TARGET_GROUP_DESC $MOUNT_TARGET_GROUP_ID
26 | 
27 | aws ec2 authorize-security-group-ingress --group-id $MOUNT_TARGET_GROUP_ID --protocol tcp --port 2049 --cidr $CIDR_BLOCK
28 | sleep 2
29 | 
30 | echo ""
31 | echo "Creating EFS volume ..."
32 | FILE_SYSTEM_ID=$(aws efs create-file-system | jq --raw-output '.FileSystemId')
33 | echo $FILE_SYSTEM_ID
34 | sleep 10
35 | 
36 | TAG1=tag:alpha.eksctl.io/cluster-name
37 | TAG2=tag:kubernetes.io/role/elb
38 | SUBNETS=$(aws ec2 describe-subnets --filter Name=$TAG1,Values=$CLUSTER_NAME Name=$TAG2,Values=1 --query 'Subnets[*].SubnetId' --output text)
39 | echo $SUBNETS
40 | 
41 | for subnet in ${SUBNETS}
42 | do
43 |     echo "Creating mount target in subnet " $subnet " , security group " $MOUNT_TARGET_GROUP_ID " ,for efs id " $FILE_SYSTEM_ID
44 |     aws efs create-mount-target --file-system-id $FILE_SYSTEM_ID --subnet-id $subnet --security-groups $MOUNT_TARGET_GROUP_ID
45 |     sleep 2
46 | done
47 | sleep 30
48 | 
49 | echo ""
50 | echo "Mount points state ..."
51 | aws efs describe-mount-targets --file-system-id $FILE_SYSTEM_ID | jq --raw-output '.MountTargets[].LifeCycleState'
52 | 
53 | echo ""
54 | echo "Done."
55 | echo ""
56 | 


--------------------------------------------------------------------------------
/2-create-volume/2-2-create-pvc.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | . ../.env
 4 | 
 5 | # This script mostly follows this eks workshop
 6 | # https://www.eksworkshop.com/beginner/190_efs/launching-efs/
 7 | 
 8 | echo ""
 9 | echo "Checking EFS File System ..."
10 | 
11 | # if the pvc already exists, exit
12 | PV_EXISTS=$(kubectl get pv -o json | jq --raw-output '.items[].spec.storageClassName')
13 | for pv in ${PV_EXISTS}
14 | do
15 |     if [ "$pv" == "efs-sc" ]; then
16 |         echo "Persistant Volume already exists"
17 |         kubectl get pv
18 |         exit 0
19 |     fi
20 | done
21 | 
22 | # Assign file system id. Create EFS file system if needed. If more than one filesystem exists, take first one in the list
23 | FILE_SYSTEM_ID=$(aws efs describe-file-systems --query 'FileSystems[*].FileSystemId' --output json | jq -r .[0])
24 | if [ "$FILE_SYSTEM_ID" == "null" ]; then
25 |         echo ""
26 |         echo "No EFS file system found. Setting up new EFS File System ..."
27 |         ./2-1-create-efs.sh
28 |         FILE_SYSTEM_ID=$(aws efs describe-file-systems --query 'FileSystems[*].FileSystemId' --output json | jq -r .[0])
29 | fi
30 | echo 'EFS volume id' $FILE_SYSTEM_ID
31 | 
32 | echo ""
33 | echo "Deploying EFS CSI Driver ..."
34 | kubectl apply -k "github.com/kubernetes-sigs/aws-efs-csi-driver/deploy/kubernetes/overlays/stable/?ref=release-1.3"
35 | sleep 5
36 | kubectl get pods -n kube-system | grep efs
37 | 
38 | echo ""
39 | echo "Generating efs-sc.yaml ..."
40 | cat efs-sc.yaml.template | sed -e "s/EFS_VOLUME_ID/$FILE_SYSTEM_ID/g" > efs-sc.yaml
41 | echo ""
42 | echo "Applying efs-sc.yaml ..."
43 | kubectl apply -f efs-sc.yaml
44 | kubectl get sc
45 | 
46 | echo ""
47 | echo "Generating efs-pv.yaml ..."
48 | cat efs-pv.yaml.template | sed -e "s/EFS_VOLUME_ID/$FILE_SYSTEM_ID/g" > efs-pv.yaml
49 | echo "Applying efs-pv.yaml ..."
50 | kubectl apply -f efs-pv.yaml
51 | sleep 10
52 | kubectl get pv
53 | 
54 | echo ""
55 | echo "Creating persistent volume claim efs-pvc ..."
56 | kubectl apply -f efs-pvc.yaml
57 | kubectl get pvc
58 | 
59 | echo ""
60 | echo "Done."
61 | echo ""
62 | 


--------------------------------------------------------------------------------
/2-create-volume/efs-pv.yaml.template:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolume
 3 | metadata:
 4 |   name: efs-pv
 5 | spec:
 6 |   capacity:
 7 |     storage: 5Gi
 8 |   volumeMode: Filesystem
 9 |   accessModes:
10 |     - ReadWriteMany
11 |   persistentVolumeReclaimPolicy: Retain
12 |   storageClassName: efs-sc
13 |   csi:
14 |     driver: efs.csi.aws.com
15 |     volumeHandle: EFS_VOLUME_ID
16 | 


--------------------------------------------------------------------------------
/2-create-volume/efs-pvc.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolumeClaim
 3 | metadata:
 4 |   name: efs-pvc
 5 | spec:
 6 |   accessModes:
 7 |     - ReadWriteMany
 8 |   storageClassName: efs-sc
 9 |   volumeName: efs-pv
10 |   resources:
11 |     requests:
12 |       storage: 5Gi
13 | 


--------------------------------------------------------------------------------
/2-create-volume/efs-sc.yaml.template:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 |   
 3 | kind: StorageClass
 4 | apiVersion: storage.k8s.io/v1
 5 | metadata:
 6 |   name: efs-sc
 7 | provisioner: efs.csi.aws.com
 8 | parameters:
 9 |   provisioningMode: efs-ap
10 |   fileSystemId: EFS_VOLUME_ID
11 |   directoryPerms: "700"
12 | 


--------------------------------------------------------------------------------
/3-build-container/3-1-build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | . ../.env
4 | 
5 | # Build Docker image
6 | docker image build -f Dockerfile-${PROCESSOR} -t ${REGISTRY}${IMAGE}${TAG} .
7 | 
8 | 


--------------------------------------------------------------------------------
/3-build-container/3-2-push.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | . ../.env
 4 | 
 5 | # Create registry if needed
 6 | REGISTRY_COUNT=$(aws ecr describe-repositories | grep ${IMAGE} | wc -l)
 7 | if [ "$REGISTRY_COUNT" == "0" ]; then
 8 | 	aws ecr create-repository --repository-name ${IMAGE}
 9 | fi
10 | 
11 | # Login to container registry
12 | echo "Logging in to $REGISTRY ..."
13 | aws ecr get-login-password | docker login --username AWS --password-stdin $REGISTRY
14 | 
15 | # Push image to registry
16 | echo "Pushing ${IMAGE}${TAG} to registry ..."
17 | docker push ${REGISTRY}${IMAGE}${TAG}
18 | 
19 | 


--------------------------------------------------------------------------------
/3-build-container/Dockerfile-cpu:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:22.04
 2 | 
 3 | RUN apt-get update && apt-get install -y wget unzip python3 python3-pip htop
 4 | 
 5 | RUN pip3 install python-etcd 
 6 | RUN pip3 install torch torchvision --extra-index-url https://download.pytorch.org/whl/cpu
 7 | RUN pip3 install tensorboard debugpy
 8 | 
 9 | RUN mkdir -p /workspace/
10 | ADD cifar10-model-train.py /workspace/
11 | ADD cifar10-model-test.py /workspace/
12 | ADD cnn_model.py /workspace/
13 | ADD utils.py /workspace/
14 | ADD data-prep.sh /workspace/
15 | 


--------------------------------------------------------------------------------
/3-build-container/Dockerfile-gpu:
--------------------------------------------------------------------------------
 1 | FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
 2 | #FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime
 3 | 
 4 | RUN apt-get update && apt-get install -y wget unzip python3 python3-pip htop
 5 | 
 6 | RUN pip3 install python-etcd 
 7 | #RUN pip3 install torch torchvision --extra-index-url https://download.pytorch.org/whl/gpu
 8 | RUN pip3 install tensorboard debugpy
 9 | 
10 | RUN mkdir -p /workspace/
11 | ADD cifar10-model-train.py /workspace/
12 | ADD cifar10-model-test.py /workspace/
13 | ADD cnn_model.py /workspace/
14 | ADD utils.py /workspace/
15 | ADD data-prep.sh /workspace/
16 | 


--------------------------------------------------------------------------------
/3-build-container/cifar10-model-test.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | 
 4 | import torch
 5 | from torch.utils.data import DataLoader
 6 | 
 7 | from cnn_model import MyCnnModel # custom cnn model
 8 | from utils import *
 9 | 
10 | parser = argparse.ArgumentParser(description="PyTorch Elastic cifar10 Training")
11 | parser.add_argument("data", metavar="DIR", help="path to dataset")
12 | parser.add_argument('--workers', default=1, type=int,
13 |                     help='number of data loading workers (default: 1)')
14 | parser.add_argument('--batch-size', default=128, type=int,
15 |                     help='mini-batch size on each node (default: 128)')
16 | parser.add_argument('--model-file', default='/efs-shared/cifar10_model.pth', type=str,
17 |                     help='filename with path to save model (default: /efs-shared/cifar10_model.pth')
18 | 
19 | 
20 | def cifar10_test_dataloader(data_dir, batch_size, num_data_workers):
21 |     test_images, test_labels = unpickle(data_dir + 'test_batch')
22 | 
23 |     # convert numpy arrays to torch TensorDataset
24 |     test_dataset = get_tensordataset(test_images, test_labels)
25 |     
26 |     test_loader = DataLoader(
27 |         test_dataset,
28 |         batch_size=batch_size,
29 |         shuffle=False,
30 |         num_workers=num_data_workers,
31 |         pin_memory=True,
32 |     )
33 | 
34 |     return test_loader
35 | 
36 | 
37 | def main():
38 |     args = parser.parse_args()
39 |     print("reading", args.data)
40 |     test_loader = cifar10_test_dataloader(args.data, args.batch_size, args.workers)
41 |     print('loading model', args.model_file)
42 |     model = MyCnnModel()
43 |     model.load_state_dict(torch.load(args.model_file))
44 | 
45 |     classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
46 |     correct = 0
47 |     total = 0
48 | 
49 |     # prepare to count predictions for each class
50 |     correct_pred = {classname: 0 for classname in classes}
51 |     total_pred = {classname: 0 for classname in classes}
52 | 
53 |     # since we're not training, we don't need to calculate the gradients for our outputs
54 |     with torch.no_grad():
55 |         for data in test_loader:
56 |             images, labels = data
57 |             # calculate outputs by running images through the network
58 |             outputs = model(images)
59 |             _, predictions = torch.max(outputs, 1)        
60 | 
61 |             total += labels.size(0)
62 |             correct += (predictions == labels).sum().item()
63 | 
64 |             # collect the correct predictions for each class
65 |             for label, prediction in zip(labels, predictions):
66 |                 if label == prediction:
67 |                     correct_pred[classes[label]] += 1
68 |                 total_pred[classes[label]] += 1
69 | 
70 |             
71 |     print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')
72 | 
73 |     for classname, correct_count in correct_pred.items():
74 |         accuracy = 100 * float(correct_count) / total_pred[classname]
75 |         print(f'Accuracy for class: {classname:5s} is {accuracy:.1f} %')
76 | 
77 | 
78 | if __name__ == "__main__":
79 |     main()
80 |     print('Finished Testing')
81 | 


--------------------------------------------------------------------------------
/3-build-container/cifar10-model-train.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from datetime import timedelta
  3 | import argparse
  4 | 
  5 | import torch
  6 | from torch.nn import CrossEntropyLoss
  7 | from torch.optim import SGD
  8 | from torch.utils.data import DataLoader
  9 | from torch.nn.parallel import DistributedDataParallel
 10 | from torch.distributed.elastic.utils.data import ElasticDistributedSampler
 11 | from torch.distributed import init_process_group
 12 | from torch.utils.tensorboard import SummaryWriter
 13 | 
 14 | from cnn_model import MyCnnModel # custom cnn model
 15 | from utils import *
 16 | 
 17 | parser = argparse.ArgumentParser(description="PyTorch Elastic cifar10 Training")
 18 | parser.add_argument("data", help="path to dataset")
 19 | parser.add_argument('--workers', default=32, type=int,
 20 |                     help='number of data loading workers (default: 32)')
 21 | parser.add_argument('--epochs', default=10, type=int,
 22 |                     help='number of total epochs to run (default: 10)')
 23 | parser.add_argument('--batch-size', default=256, type=int,
 24 |                     help='mini-batch size on each node (default: 256)')
 25 | parser.add_argument('--learning-rate', default=0.001, type=float,
 26 |                     help='learning rate (default: 0.001')
 27 | parser.add_argument('--momentum', default=0.9, type=float,
 28 |                     help='momentum (default: 0.9)')
 29 | parser.add_argument('--weight-decay', default=1e-4, type=float,
 30 |                     help='weight decay (default: 1e-4)')
 31 | parser.add_argument('--print-freq', default=5, type=int,
 32 |                     help='print frequency (default: 5)')
 33 | parser.add_argument('--model-file', default='/efs-shared/cifar10_model.pth', type=str,
 34 |                     help='filename with path to save model (default: /efs-shared/cifar10_model.pth')
 35 | parser.add_argument("--checkpoint-file", default="/efs-shared/checkpoint.pth.tar", type=str,
 36 |                     help="checkpoint file path, to load and save to")
 37 | 
 38 | 
 39 | def cifar10_train_dataloader(data_dir, batch_size, num_data_workers):
 40 |     files = ['data_batch_'+str(i+1) for i in range(5)]
 41 | 
 42 |     train_images = []
 43 |     train_labels = []
 44 |     for file in files:
 45 |         images, labels = unpickle(data_dir + file)
 46 |         train_images.extend(images)
 47 |         train_labels.extend(labels)
 48 | 
 49 |     # convert numpy arrays to torch TensorDataset
 50 |     train_dataset = get_tensordataset(train_images, train_labels)
 51 | 
 52 |     train_sampler = ElasticDistributedSampler(train_dataset)
 53 |     train_loader = DataLoader(
 54 |         train_dataset,
 55 |         batch_size=batch_size,
 56 |         num_workers=num_data_workers,
 57 |         pin_memory=True,
 58 |         sampler=train_sampler,
 59 |     )
 60 | 
 61 |     return train_loader
 62 | 
 63 | 
 64 | def initialize_model(lr, momentum, weight_decay):
 65 |     model = MyCnnModel()
 66 |     model = DistributedDataParallel(model)
 67 | 
 68 |     criterion = CrossEntropyLoss()
 69 |     optimizer = SGD(model.parameters(), lr, momentum=momentum, weight_decay=weight_decay)
 70 | 
 71 |     return model, criterion, optimizer
 72 | 
 73 | 
 74 | def main():
 75 |     print("Main function called ...")
 76 |     #import debugpy; debugpy.listen(('0.0.0.0',5678)); debugpy.wait_for_client(); breakpoint()
 77 |     init_process_group(backend="gloo", init_method="env://", timeout=timedelta(seconds=10))
 78 |     args = parser.parse_args()
 79 |     rank = int(os.environ["RANK"])
 80 |     modelFile = args.model_file
 81 |     tensorDir = "%s/runs/"%(os.path.dirname(modelFile))
 82 |     writer = SummaryWriter(log_dir=tensorDir)
 83 | 
 84 |     print("reading", args.data)
 85 |     train_loader = cifar10_train_dataloader(args.data, args.batch_size, args.workers)
 86 |     model, criterion, optimizer = initialize_model(args.learning_rate, args.momentum, args.weight_decay)
 87 | 
 88 |     processor = os.getenv("PROCESSOR","cpu")
 89 |     print("Desired processor type: %s"%processor)
 90 | 
 91 |     device_type="cpu"
 92 |     device=torch.device("cpu")
 93 | 
 94 |     if processor == "gpu":
 95 |         if torch.cuda.is_available():
 96 |             device=torch.device("cuda")
 97 |             device_type="gpu"
 98 |             model.to(device)
 99 |         else:
100 |             print("torch.cuda.is_available() returned False!")
101 | 
102 |     print("Running on processor type: %s"%(device_type))
103 |     
104 |     start_epoch = 0
105 |     # load previously stored checkpoint if it exists.
106 |     start_epoch = load_checkpoint(args.checkpoint_file, model, optimizer)
107 | 
108 |     model.train()
109 |     for epoch in range(start_epoch, args.epochs):  # loop over the dataset multiple times
110 | 
111 |         running_loss = 0.0
112 |         for i, (inputs, labels) in enumerate(train_loader):
113 |             # zero the parameter gradients
114 |             optimizer.zero_grad()
115 | 
116 |             # forward + backward + optimize
117 |             if device_type == "gpu":
118 |               inputs = inputs.to(device)
119 |               labels = labels.to(device)
120 |             outputs = model(inputs)
121 |             loss = criterion(outputs, labels)
122 |             loss.backward()
123 |             optimizer.step()
124 | 
125 |             # print statistics
126 |             running_loss += loss.item()
127 |             if i % args.print_freq == args.print_freq-1:    # print every args.print_freq mini-batches
128 |                 print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / args.print_freq:.3f}')
129 |                 writer.add_scalar('Epoch', epoch+1)
130 |                 writer.add_scalar('Iteration', i+1)
131 |                 writer.add_scalar('Loss', running_loss / args.print_freq)
132 |                 running_loss = 0.0
133 |             
134 |         if rank==0: # Only one pod will save the checkpoint
135 |             save_checkpoint(args.checkpoint_file, epoch, model, optimizer)
136 | 
137 |     if rank==0:  # Only one pod will save the final model
138 |         print('saving final model:', args.model_file)
139 |         torch.save(model.module.state_dict(), args.model_file)
140 |     
141 |     writer.close()
142 | 
143 | if __name__ == "__main__":
144 |     main()
145 |     print('Finished Training')
146 | 


--------------------------------------------------------------------------------
/3-build-container/cnn_model.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | 
 3 | class MyCnnModel(nn.Module):
 4 |     def __init__(self):
 5 |         super().__init__()
 6 |         self.network = nn.Sequential(
 7 |             nn.Conv2d(3, 32, 3, padding='same'),
 8 |             nn.ReLU(),
 9 |             nn.Conv2d(32, 64, 3, padding='same'),
10 |             nn.ReLU(),
11 |             nn.MaxPool2d(2, 2),
12 |             nn.BatchNorm2d(64),
13 | 
14 |             nn.Conv2d(64, 128, 3, padding='same'),
15 |             nn.ReLU(),
16 |             nn.Conv2d(128, 128, 3, padding='same'),
17 |             nn.ReLU(),
18 |             nn.MaxPool2d(2, 2),
19 |             nn.BatchNorm2d(128),
20 | 
21 |             nn.Conv2d(128, 256, 3, padding='same'),
22 |             nn.ReLU(),
23 |             nn.Conv2d(256, 256, 3, padding='same'),
24 |             nn.ReLU(),
25 |             nn.MaxPool2d(2, 2),
26 |             nn.BatchNorm2d(256),
27 | 
28 |             nn.Flatten(), 
29 |             nn.Linear(256*4*4, 512),
30 |             nn.ReLU(),
31 |             nn.Linear(512, 10))
32 |         
33 |     def forward(self, xb):
34 |         return self.network(xb)
35 | 


--------------------------------------------------------------------------------
/3-build-container/data-prep.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo ""
 4 | echo "Shared path - ${1}"
 5 | 
 6 | mkdir -p ${1}
 7 | cd ${1}
 8 | 
 9 | echo ""
10 | echo "Downloading data ..."
11 | wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
12 | tar -xzf cifar-10-python.tar.gz
13 | rm cifar-10-python.tar.gz
14 | 
15 | echo ""
16 | echo "Setting permissions ..."
17 | chown -R 1000:100 ${1}
18 | 
19 | echo ""
20 | echo "Done"
21 | echo ""
22 | 
23 | 


--------------------------------------------------------------------------------
/3-build-container/utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import numpy as np
 4 | import torch
 5 | from torch.utils.data import TensorDataset
 6 | 
 7 | def unpickle(file):
 8 |     with open(file, 'rb') as fo:
 9 |         data = pickle.load(fo, encoding='bytes')
10 |     return data[b'data'], data[b'labels']
11 | 
12 | def get_tensordataset(images, labels):
13 |     images_arr = np.array(images)
14 |     images_arr = np.reshape(images_arr, (-1,3,32,32))
15 |     labels_arr = np.array(labels)
16 |     images_arr = images_arr/255.  # normalize
17 | 
18 |     tensor_X = torch.tensor(images_arr, dtype=torch.float32)
19 |     tensor_y = torch.tensor(labels_arr, dtype=torch.long)
20 |     dataset = TensorDataset(tensor_X, tensor_y)
21 | 
22 |     return dataset
23 | 
24 | def save_checkpoint(checkpoint_file, epoch, model, optimizer):
25 |     checkpoint_dir = os.path.dirname(checkpoint_file)
26 |     os.makedirs(checkpoint_dir, exist_ok=True)
27 | 
28 |     snapshot = {
29 |         "epoch": epoch,
30 |         "state_dict":  model.state_dict(),
31 |         "optimizer": optimizer.state_dict()
32 |     }
33 | 
34 |     torch.save(snapshot, checkpoint_file)
35 |     print(f"=> saved checkpoint for epoch {epoch+1} at {checkpoint_file}")
36 | 
37 | def load_checkpoint(checkpoint_file, model, optimizer):
38 |     if os.path.isfile(checkpoint_file):
39 |         print('loading checkpoint file:', checkpoint_file)
40 |         snapshot = torch.load(checkpoint_file)
41 |         epoch = snapshot["epoch"] + 1 # start from next epoch
42 |         model.load_state_dict(snapshot["state_dict"])
43 |         optimizer.load_state_dict(snapshot["optimizer"])
44 |         print("Restored model from previous checkpoint")
45 |     else:
46 |         epoch = 0
47 | 
48 |     return epoch
49 | 


--------------------------------------------------------------------------------
/4-get-data/.gitignore:
--------------------------------------------------------------------------------
1 | efs-data-copy.yaml
2 | 


--------------------------------------------------------------------------------
/4-get-data/4-1-get-data.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | . ../.env
 4 | 
 5 | echo ""
 6 | echo "Generating pod manifest ..."
 7 | cat efs-data-copy.yaml.template | envsubst > efs-data-copy.yaml
 8 | 
 9 | echo ""
10 | echo "Creating efs-data-prep pod ..."
11 | kubectl apply -f efs-data-copy.yaml
12 | sleep 3
13 | kubectl get pods | grep data-prep
14 | 
15 | 


--------------------------------------------------------------------------------
/4-get-data/4-2-show-status.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo ""
 4 | echo "Describing data prep pod ..."
 5 | kubectl describe pod efs-data-prep-pod
 6 | 
 7 | echo ""
 8 | echo "Showing status of data prep pod ..."
 9 | kubectl get pods | grep data-prep
10 | 
11 | 


--------------------------------------------------------------------------------
/4-get-data/4-3-show-log.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | kubectl logs -f efs-data-prep-pod
4 | 
5 | 


--------------------------------------------------------------------------------
/4-get-data/efs-data-copy.yaml.template:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: efs-data-prep-pod
 5 |   annotations:
 6 |     sidecar.istio.io/inject: "false"
 7 | spec:
 8 |   containers:
 9 |   - name: efs-data-prep-pod
10 |     image: ${REGISTRY}${IMAGE}${TAG}
11 |     imagePullPolicy: Always
12 |     command: ["/bin/bash"]
13 |     args: ["-c", "/workspace/data-prep.sh ${MOUNT_PATH}"]
14 |     volumeMounts:
15 |     - name: efs-pv
16 |       mountPath: ${MOUNT_PATH}
17 |   volumes:
18 |   - name: efs-pv
19 |     persistentVolumeClaim:
20 |       claimName: efs-pvc
21 |   restartPolicy: Never
22 | 


--------------------------------------------------------------------------------
/5-train-model/.gitignore:
--------------------------------------------------------------------------------
1 | train.yaml
2 | cleanup.yaml
3 | 


--------------------------------------------------------------------------------
/5-train-model/5-1-generate-pytorchjob.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | . ../.env
 4 | 
 5 | echo ""
 6 | echo "Generating ElasticJob manifest ..."
 7 | cat train.yaml.template | envsubst > train.yaml
 8 | echo ""
 9 | echo "Generating Checkpoint Cleanup job ..."
10 | cat cleanup.yaml.template | envsubst > cleanup.yaml
11 | echo ""
12 | echo "ElasticJob Manifest:"
13 | echo ""
14 | cat train.yaml
15 | echo ""
16 | 
17 | 


--------------------------------------------------------------------------------
/5-train-model/5-2-launch-pytorchjob.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | echo ""
4 | echo "Launching PyTorchJob ..."
5 | kubectl apply -f ./train.yaml
6 | 
7 | 


--------------------------------------------------------------------------------
/5-train-model/5-3-show-status.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | kubectl get pods -o wide | grep train
4 | 
5 | 


--------------------------------------------------------------------------------
/5-train-model/5-4-show-utilization.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | kubectl top nodes
4 | 
5 | 


--------------------------------------------------------------------------------
/5-train-model/5-5-show-logs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | kubetail cifar10-train
4 | 
5 | 


--------------------------------------------------------------------------------
/5-train-model/5-6-delete-pytorchjob.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo ""
 4 | echo "Deleting PyTorchJob ..."
 5 | kubectl delete -f ./train.yaml
 6 | 
 7 | echo ""
 8 | echo "Restarting etcd ..."
 9 | kubectl delete pod $(kubectl get pods | grep etcd | cut -d ' ' -f 1)
10 | 
11 | echo ""
12 | echo "Cleaning up model checkpoint ..."
13 | echo ""
14 | kubectl apply -f ./cleanup.yaml
15 | echo ""
16 | while true; do
17 | 	JOB="$(kubectl get job | grep cleanup)"
18 | 	COMPLETED=$(echo $JOB | awk -e '{print $2}' | cut -d '/' -f 1)
19 | 	if [ "$COMPLETED" == "1" ]; then
20 | 		kubectl logs $(kubectl get pods | grep cleanup | cut -d ' ' -f 1)
21 | 		break;
22 | 	else
23 | 		echo "$JOB"
24 | 		sleep 1
25 | 	fi
26 | done
27 | echo ""
28 | kubectl delete -f ./cleanup.yaml
29 | echo ""
30 | 
31 | 


--------------------------------------------------------------------------------
/5-train-model/cleanup.yaml.template:
--------------------------------------------------------------------------------
 1 | apiVersion: batch/v1
 2 | kind: Job
 3 | metadata:
 4 |   name: cifar10-cleanup
 5 | spec:
 6 |   template:
 7 |     metadata:
 8 |       annotations:
 9 |         sidecar.istio.io/inject: "false"
10 |     spec:
11 |       restartPolicy: Never
12 |       nodeSelector:
13 |         beta.kubernetes.io/instance-type: ${NODE_TYPE}
14 |       containers:
15 |         - name: test
16 |           image: ${REGISTRY}${IMAGE}${TAG}
17 |           imagePullPolicy: Always
18 |           command: ["/bin/bash", "-c", "if [ -f ${MOUNT_PATH}/checkpoint.pth.tar ]; then echo Cleaning up checkpoint; rm -vf ${MOUNT_PATH}/checkpoint.pth.tar; else echo Checkpoint is already clean; fi; if [ -d ${MOUNT_PATH}/runs ]; then echo Cleaning up tensorboard logs; rm -rvf ${MOUNT_PATH}/runs; else echo Tensorboard logs are already clean; fi"]
19 |           volumeMounts:
20 |             - name: efs-pv
21 |               mountPath: ${MOUNT_PATH} 
22 |       volumes:
23 |         - name: efs-pv
24 |           persistentVolumeClaim:
25 |             claimName: efs-pvc
26 | 
27 | 


--------------------------------------------------------------------------------
/5-train-model/train.yaml.template:
--------------------------------------------------------------------------------
 1 | apiVersion: "kubeflow.org/v1"
 2 | kind: PyTorchJob
 3 | metadata:
 4 |   name: cifar10-train
 5 | spec:
 6 |   elasticPolicy:
 7 |     rdzvBackend: etcd
 8 |     rdzvHost: etcd-service
 9 |     rdzvPort: 2379
10 |     minReplicas: 1
11 |     maxReplicas: 128
12 |     maxRestarts: 100
13 |     metrics:
14 |       - type: Resource
15 |         resource:
16 |           name: cpu
17 |           target:
18 |             type: Utilization
19 |             averageUtilization: 80
20 |   pytorchReplicaSpecs:
21 |     Worker:
22 |       replicas: ${NODE_COUNT}
23 |       restartPolicy: OnFailure
24 |       template:
25 |         spec:
26 |           containers:
27 |             - name: pytorch
28 |               image: ${REGISTRY}${IMAGE}${TAG}
29 |               imagePullPolicy: IfNotPresent
30 |               env:
31 |               - name: PROCESSOR
32 |                 value: "${PROCESSOR}"
33 |               command:
34 |                 - python3
35 |                 - -m
36 |                 - torch.distributed.run
37 |                 - /workspace/cifar10-model-train.py
38 |                 - "--epochs=${EPOCHS}"
39 |                 - "--batch-size=${BATCH_SIZE}"
40 |                 - "--workers=${CPU_LIMIT}"
41 |                 - "--model-file=${MOUNT_PATH}/cifar10-model.pth"
42 |                 - "${MOUNT_PATH}/cifar-10-batches-py/"
43 |               volumeMounts:
44 |                 - name: efs-pv
45 |                   mountPath: ${MOUNT_PATH}
46 |                 # The following enables the worker pods to use increased shared memory 
47 |                 # which is required when specifying more than 0 data loader workers
48 |                 - name: dshm
49 |                   mountPath: /dev/shm
50 |           volumes:
51 |             - name: efs-pv
52 |               persistentVolumeClaim:
53 |                 claimName: efs-pvc
54 |             - name: dshm
55 |               emptyDir:     
56 |                 medium: Memory
57 | 


--------------------------------------------------------------------------------
/6-test-model/.gitignore:
--------------------------------------------------------------------------------
1 | test.yaml
2 | 


--------------------------------------------------------------------------------
/6-test-model/6-1-generate-job.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | . ../.env
 4 | 
 5 | echo ""
 6 | echo "Generating test job manifest ..."
 7 | cat test.yaml.template | envsubst > test.yaml
 8 | cat test.yaml
 9 | echo ""
10 | 


--------------------------------------------------------------------------------
/6-test-model/6-2-launch-job.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | echo ""
4 | echo "Launching test job ..."
5 | kubectl apply -f ./test.yaml
6 | 


--------------------------------------------------------------------------------
/6-test-model/6-3-show-status.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | echo ""
4 | echo "Showing test job status ..."
5 | kubectl get pods | grep test
6 | 
7 | 


--------------------------------------------------------------------------------
/6-test-model/6-4-show-log.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | echo ""
4 | echo "Showing cifar10-test log ..."
5 | echo ""
6 | 
7 | kubectl logs -f $(kubectl get pods | grep cifar10-test | cut -d ' ' -f 1 | head -n 1)
8 | 
9 | 


--------------------------------------------------------------------------------
/6-test-model/6-5-delete-job.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | echo ""
4 | echo "Deleting test job ..."
5 | kubectl delete -f ./test.yaml
6 | 


--------------------------------------------------------------------------------
/6-test-model/test.yaml.template:
--------------------------------------------------------------------------------
 1 | apiVersion: batch/v1
 2 | kind: Job
 3 | metadata:
 4 |   name: cifar10-test
 5 | spec:
 6 |   template:
 7 |     spec:
 8 |       restartPolicy: Never
 9 |       nodeSelector:
10 |         beta.kubernetes.io/instance-type: ${NODE_TYPE}
11 |       containers:
12 |         - name: test
13 |           image: ${REGISTRY}${IMAGE}${TAG}
14 |           imagePullPolicy: Always
15 |           command: ["python3"]
16 |           args:
17 |             - "/workspace/cifar10-model-test.py"
18 |             - "--model-file=${MOUNT_PATH}/cifar10-model.pth"
19 |             - "--batch-size=128"
20 |             - "--workers=1"
21 |             - "${MOUNT_PATH}/cifar-10-batches-py/"
22 |           volumeMounts:
23 |             - name: efs-pv
24 |               mountPath: ${MOUNT_PATH} 
25 |       volumes:
26 |         - name: efs-pv
27 |           persistentVolumeClaim:
28 |             claimName: efs-pvc
29 | 


--------------------------------------------------------------------------------
/7-cleanup/7-1-delete-efs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | . ../.env
 4 | 
 5 | echo ""
 6 | FILE_SYSTEM_ID=$(aws efs describe-file-systems --query 'FileSystems[*].FileSystemId' --output json | jq -r .[0] )
 7 | if [ "$FILE_SYSTEM_ID" == "" ]; then
 8 |         echo "No EFS Filesystems found."
 9 | else
10 |         echo "Deleting EFS mount targets for File System $FILE_SYSTEM_ID ..."
11 |         MOUNT_TARGETS="$(aws efs describe-mount-targets --file-system-id $FILE_SYSTEM_ID --query MountTargets[].MountTargetId --output text)"
12 |         MT=$(echo $MOUNT_TARGETS)
13 |         for t in $MT; do echo Deleting mount target $t; aws efs delete-mount-target --mount-target-id $t; done 
14 |         sleep 10
15 |         echo "Deleting EFS file system $FILE_SYSTEM_ID ..."
16 |         aws efs delete-file-system --file-system-id $FILE_SYSTEM_ID
17 | fi
18 | 
19 | echo ""
20 | echo 'Done.'
21 | 
22 | 


--------------------------------------------------------------------------------
/7-cleanup/7-2-delete-cluster.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | . ../.env
 4 | 
 5 | echo ""
 6 | read -p "Deleting cluster $CLUSTER_NAME. Proceed? [Y/n]: " PROCEED
 7 | if [ "$PROCEED" == "Y" ]; then
 8 |         echo "Confirmed ..."
 9 |         eksctl delete cluster -f ../1-create-cluster/eks.yaml
10 |         echo "Please note that the cluster will be fully deleted when the Cloud Formation stack completes its removal"
11 |         echo "Only after the process in Cloud Formation is finished, you will be able to create a new cluster with the same name"
12 | elif [ "$PROCEED" == "n" ]; then
13 |         echo "Cancelling. Cluster will not be deleted."
14 | else
15 |         echo "$PROCEED is not a valid response"
16 |         echo "Please run the script again and choose Y or n (case sensitive)"
17 | fi
18 | echo ""
19 | 
20 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing Guidelines
 2 | 
 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
 4 | documentation, we greatly value feedback and contributions from our community.
 5 | 
 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
 7 | information to effectively respond to your bug report or contribution.
 8 | 
 9 | 
10 | ## Reporting Bugs/Feature Requests
11 | 
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 | 
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 | 
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 | 
22 | 
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 | 
26 | 1. You are working against the latest source on the *main* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 | 
30 | To send us a pull request, please:
31 | 
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 | 
39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 | 
42 | 
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 | 
46 | 
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 | 
52 | 
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 | 
56 | 
57 | ## Licensing
58 | 
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 4 | this software and associated documentation files (the "Software"), to deal in
 5 | the Software without restriction, including without limitation the rights to
 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 7 | the Software, and to permit persons to whom the Software is furnished to do so.
 8 | 
 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Distributed Model Training Workshop for AWS EKS
  2 | 
  3 | Welcome! By completing this workshop you will learn how to run distributed data parallel model training on [AWS EKS](https://aws.amazon.com/eks) using [PyTorch](https://pytorch.org). 
  4 | The only prerequisite for this workshop is access to an AWS account. The steps included here will walk you through creating and AWS EKS cluster, a shared data volume, building a model training container image, downloading and pre-processing data, running distributed training of an image classification model, and finally running the model with new images to test it.
  5 | 
  6 | The workshop architecture at a high level can be visualized by the diagram below.
  7 | 
  8 | <center><img src="img/workshop-architecture.png" width="80%"/> </br>
  9 | 
 10 | Fig. 1 - Workshop Infrastructure Architecture
 11 | </center>
 12 | 
 13 | The workshop is designed to introduce the concepts of deploying this architecture and running small-scale distributed training for educational purposes, however the same architecture can be applied for training at large scale by adjusting the number and type of nodes used in the EKS cluster, using accelerators ([NVIDIA GPUs](https://aws.amazon.com/nvidia/), [AWS Trainium](https://aws.amazon.com/machine-learning/trainium/), [Intel Habana Gaudi](https://aws.amazon.com/ec2/instance-types/dl1/)), and high-performance shared storage like [FSx for Lustre](https://aws.amazon.com/fsx/lustre/). Further information and scripts that help deploy distributed training on EKS using GPUs and FSx can be found in the [aws-do-eks](https://github.com/aws-samples/aws-do-eks) open-source project.
 14 | 
 15 | This workshop is organized in a number of sequential steps. The scripts that belong to each step are organized in folders with corresponding names. To execute a step, we will change the current directory accordingly and execute scripts in their designated order. The prerequisites section is required, but there are no scripts associated with it. We will complete setting up prerequisites by following instructions. Steps 1 through 6 are required to complete the workshop. Step 7-Cleanup is optional. 
 16 | 
 17 | ## 0. Prerequisites
 18 | Before we get started, we need to set up an AWS account and Cloud9 IDE from which we will execute all the steps in the workshop. You will not be required to install anything on your computer. All of the steps in the workshop will be completed on the cloud through your browser. To set up your account and IDE, please follow the instructions in [SETUP.md](SETUP.md).
 19 | 
 20 | ## 1. Create EKS Cluster
 21 | 
 22 | <center><img src="img/step-1-create-cluster.png" width="80%"/> </br>
 23 | 
 24 | Fig. 1.0 - Step 1 - Create EKS cluster
 25 | </center>
 26 | 
 27 | In this step we will execute scripts to create a managed [Kubernetes](https://kubernetes.io) cluster using the Amazon Elastic Kubernetes Service ([EKS](https://aws.amazon.com/eks)). Later we will use this cluster to run our distributed model training job.
 28 | 
 29 | In the last part of your prerequisites setup, you cloned the workshop code into your Cloud9 IDE. To build our distributed training infrastructure on EKS, we will start by changing the current directory to `1-create-cluster`.
 30 | 
 31 | ```console
 32 | cd 1-create-cluster
 33 | ```
 34 | 
 35 | ### 1.1. Configure AWS client and EKS cluster
 36 | Many of the scripts provided in the workshop use the [AWS CLI](https://aws.amazon.com/cli/) to access the AWS APIs in the account. That is why the AWS CLI needs to be configured with the credentials (access key id and secret access key) we saved previously. The configuration of the EKS cluster is specified by a .yaml file which we will also generate in this step. 
 37 | 
 38 | Execute:
 39 | ```console
 40 | ./1-1-configure.sh
 41 | ```
 42 | 
 43 | Output:
 44 | ```
 45 | The config profile (workshop) could not be found
 46 | 
 47 | Configuring AWS client ...
 48 | AWS Access Key ID [None]: ************
 49 | AWS Secret Access Key [None]: ****************************************
 50 | Default region name [None]: us-west-2
 51 | Default output format [None]: json
 52 | 
 53 | Generating cluster configuration eks.yaml ...
 54 | ```
 55 | 
 56 | By default, Cloud9 uses AWS managed temporary credentials, which we override with the script. If the managed temporary credentials setting has not been disabled, as soon as the script completes, Cloud9 will display the following dialog.
 57 | 
 58 | <center>
 59 | <img src="img/cloud9-credentials-dialog.png" width="40%"/> 
 60 | &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 
 61 | <img src="img/cloud9-credentials-disable.png" width="40%"/> 
 62 | </br>
 63 | 
 64 | Fig. 1.1 Cloud9 credentials dialogs
 65 | </center>
 66 | 
 67 | Please click **Cancel** in this dialog, immediatlely another dialog appears. Please click **Permanently disable** in the second dialog. If these dialogs do not appear, then AWS managed temporary credentials have already been disabled in your Cloud9 IDE and you may proceed to the next step.
 68 | 
 69 | ### 1.2. Install tools
 70 | 
 71 | The Cloud9 IDE comes with Docker pre-installed. In order to provision an EKS cluster, we will install [eksctl](https://eksctl.io/). To be able to execute commands against Kubernetes, we will install [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/). We will also install other miscellaneous utilities like [kubectx](https://github.com/ahmetb/kubectx), [kubetail](https://github.com/johanhaleby/kubetail), [jq](https://github.com/stedolan/jq), [yq](https://kislyuk.github.io/yq/), and will set up some shorthand command aliases (ll='ls -alh', k=kubectl, kc=kubectx, kn=kubens, kt=kubetail, ks=kubeshell) for convenience.
 72 | 
 73 | Execute:
 74 | ```console
 75 | ./1-2-install-tools.sh
 76 | ```
 77 | 
 78 | Output:
 79 | ```
 80 | Installing eksctl ...
 81 |   % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
 82 |                                  Dload  Upload   Total   Spent    Left  Speed
 83 |   0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 84 | 100 18.6M  100 18.6M    0     0  19.1M      0 --:--:-- --:--:-- --:--:-- 31.5M
 85 | 0.66.0
 86 | 
 87 | Installing kubectl ...
 88 |   % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
 89 |                                  Dload  Upload   Total   Spent    Left  Speed
 90 | 100 57.4M  100 57.4M    0     0  93.6M      0 --:--:-- --:--:-- --:--:-- 93.6M
 91 | Client Version: version.Info{Major:"1", Minor:"19+", GitVersion:"v1.19.6-eks-49a6c0", GitCommit:"49a6c0bf091506e7bafcdb1b142351b69363355a", GitTreeState:"clean", BuildDate:"2020-12-23T22:13:28Z", GoVersion:"go1.15.5", Compiler:"gc", Platform:"linux/amd64"}
 92 | 
 93 | ...
 94 | 
 95 | Setting up aliases ...
 96 | 
 97 | Done setting up tools.
 98 | ```
 99 | 
100 | ### 1.3. Launch cluster
101 | 
102 | We will use `eksctl` and the generated `eks.yaml` configuration to launch a new EKS cluster.
103 | 
104 | Execute:
105 | ```console
106 | ./1-3-create-cluster.sh
107 | ```
108 | 
109 | Output:
110 | ```
111 | Creating EKS cluster ...
112 | 
113 | ... using configuration from ./eks.yaml ...
114 | 
115 | apiVersion: eksctl.io/v1alpha5
116 | kind: ClusterConfig
117 | 
118 | metadata:
119 |   name: do-eks
120 |   version: "1.26"
121 |   region: us-west-2
122 | 
123 | availabilityZones:
124 |   - us-west-2a
125 |   - us-west-2b
126 | 
127 | iam:
128 |   withOIDC: true
129 | 
130 | managedNodeGroups:
131 |   - name: wks-node
132 |     instanceType: c5.4xlarge
133 |     instancePrefix: workshop
134 |     privateNetworking: true
135 |     availabilityZones: ["us-west-2a","us-west-2b"]
136 |     efaEnabled: false
137 |     minSize: 0
138 |     desiredCapacity: 2
139 |     maxSize: 10
140 |     volumeSize: 900
141 |     iam:
142 |       withAddonPolicies:
143 |         cloudWatch: true
144 |         autoScaler: true
145 |         ebs: true
146 | 
147 | Sat Jun  4 06:06:16 UTC 2022
148 | eksctl create cluster -f ./eks.yaml
149 | 2022-06-04 06:06:16 [ℹ]  eksctl version 0.66.0
150 | 2022-06-04 06:06:16 [ℹ]  using region us-west-2
151 | 2022-06-04 06:06:16 [ℹ]  subnets for us-west-2a - public:192.168.0.0/19 private:192.168.64.0/19
152 | 2022-06-04 06:06:16 [ℹ]  subnets for us-west-2b - public:192.168.32.0/19 private:192.168.96.0/19
153 | 2022-06-04 06:06:16 [ℹ]  nodegroup "wks-node" will use "" [AmazonLinux2/1.21]
154 | 2022-06-04 06:06:16 [ℹ]  using Kubernetes version 1.21
155 | 2022-06-04 06:06:16 [ℹ]  creating EKS cluster "do-eks" in "us-west-2" region with managed nodes
156 | 2022-06-04 06:06:16 [ℹ]  1 nodegroup (wks-node) was included (based on the include/exclude rules)
157 | 2022-06-04 06:06:16 [ℹ]  will create a CloudFormation stack for cluster itself and 0 nodegroup stack(s)
158 | 2022-06-04 06:06:16 [ℹ]  will create a CloudFormation stack for cluster itself and 1 managed nodegroup stack(s)
159 | 2022-06-04 06:06:16 [ℹ]  if you encounter any issues, check CloudFormation console or try 'eksctl utils describe-stacks --region=us-west-2 --cluster=do-eks'
160 | 2022-06-04 06:06:16 [ℹ]  CloudWatch logging will not be enabled for cluster "do-eks" in "us-west-2"
161 | 2022-06-04 06:06:16 [ℹ]  you can enable it with 'eksctl utils update-cluster-logging --enable-types={SPECIFY-YOUR-LOG-TYPES-HERE (e.g. all)} --region=us-west-2 --cluster=do-eks'
162 | 2022-06-04 06:06:16 [ℹ]  Kubernetes API endpoint access will use default of {publicAccess=true, privateAccess=false} for cluster "do-eks" in "us-west-2"
163 | 2022-06-04 06:06:16 [ℹ]  2 sequential tasks: { create cluster control plane "do-eks", 3 sequential sub-tasks: { 4 sequential sub-tasks: { wait for control plane to become ready, associate IAM OIDC provider, 2 sequential sub-tasks: { create IAM role for serviceaccount "kube-system/aws-node", create serviceaccount "kube-system/aws-node" }, restart daemonset "kube-system/aws-node" }, 1 task: { create addons }, create managed nodegroup "wks-node" } }
164 | 2022-06-04 06:06:16 [ℹ]  building cluster stack "eksctl-do-eks-cluster"
165 | 2022-06-04 06:06:16 [ℹ]  deploying stack "eksctl-do-eks-cluster"
166 | 
167 | ...
168 | 
169 | 2022-06-04 06:27:59 [ℹ]  waiting for CloudFormation stack "eksctl-do-eks-nodegroup-wks-node"
170 | 2022-06-04 06:27:59 [ℹ]  waiting for the control plane availability...
171 | 2022-06-04 06:27:59 [✔]  saved kubeconfig as "/home/ec2-user/.kube/config"
172 | 2022-06-04 06:27:59 [ℹ]  no tasks
173 | 2022-06-04 06:27:59 [✔]  all EKS cluster resources for "do-eks" have been created
174 | 2022-06-04 06:30:01 [ℹ]  kubectl command should work with "/home/ec2-user/.kube/config", try 'kubectl get nodes'
175 | 2022-06-04 06:30:01 [✔]  EKS cluster "do-eks" in "us-west-2" region is ready
176 | 
177 | Sat Jun  4 06:30:01 UTC 2022
178 | Done creating EKS cluster
179 | 
180 | Updating kubeconfig ...
181 | Added new context arn:aws:eks:us-west-2:620266777012:cluster/do-eks to /home/ec2-user/.kube/config
182 | 
183 | 
184 | Displaying cluster nodes ...
185 | NAME                                            STATUS   ROLES    AGE    VERSION
186 | ip-192-168-111-138.us-west-2.compute.internal   Ready    <none>   3m3s   v1.21.12-eks-5308cf7
187 | ip-192-168-90-82.us-west-2.compute.internal     Ready    <none>   3m3s   v1.21.12-eks-5308cf7
188 | 
189 | ```
190 | 
191 | The `eksctl` command uses Cloud Formation behind the scenes. In addition to the command output, provisioning progress can be seen in [CloudFormation](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#).
192 | 
193 | Please expect that creation of the cluster may take up to 30 min.
194 | 
195 | ### 1.4. Deploy packages to cluster
196 | We are going to use [TorchElastic Job Controller](https://github.com/pytorch/elastic/tree/master/kubernetes) for Kubernetes to launch a distributed training job using an ElasticJob custom resource. We will also use [Kubernetes Metrics Server](https://github.com/kubernetes-sigs/metrics-server) to monitor node resource utilization in the cluster during training. To deploy both to the EKS cluster, execute:
197 | 
198 | ```console
199 | ./1-4-deploy-packages.sh
200 | ```
201 | 
202 | Output:
203 | ```
204 | Deploying Kubernetes Metrics Server ...
205 | serviceaccount/metrics-server created
206 | clusterrole.rbac.authorization.k8s.io/system:aggregated-metrics-reader created
207 | clusterrole.rbac.authorization.k8s.io/system:metrics-server created
208 | rolebinding.rbac.authorization.k8s.io/metrics-server-auth-reader created
209 | clusterrolebinding.rbac.authorization.k8s.io/metrics-server:system:auth-delegator created
210 | clusterrolebinding.rbac.authorization.k8s.io/system:metrics-server created
211 | service/metrics-server created
212 | deployment.apps/metrics-server created
213 | apiservice.apiregistration.k8s.io/v1beta1.metrics.k8s.io created
214 | 
215 | Deploying Kubeflow Training Operator ...
216 | ~/update-workshop/1-create-cluster/kubeflow-training-operator ~/update-workshop/1-create-cluster
217 | namespace/kubeflow created
218 | customresourcedefinition.apiextensions.k8s.io/mpijobs.kubeflow.org created
219 | customresourcedefinition.apiextensions.k8s.io/mxjobs.kubeflow.org created
220 | customresourcedefinition.apiextensions.k8s.io/pytorchjobs.kubeflow.org created
221 | customresourcedefinition.apiextensions.k8s.io/tfjobs.kubeflow.org created
222 | customresourcedefinition.apiextensions.k8s.io/xgboostjobs.kubeflow.org created
223 | serviceaccount/training-operator created
224 | clusterrole.rbac.authorization.k8s.io/training-operator created
225 | clusterrolebinding.rbac.authorization.k8s.io/training-operator created
226 | service/training-operator created
227 | deployment.apps/training-operator created
228 | clusterrole.rbac.authorization.k8s.io/hpa-access created
229 | clusterrolebinding.rbac.authorization.k8s.io/training-operator-hpa-access created
230 | ~/update-workshop/1-create-cluster
231 | 
232 | Deploying etcd ...
233 | service/etcd-service created
234 | deployment.apps/etcd created
235 | ```
236 | 
237 | The EKS cluster is now provisioned and prepared to run distributed training jobs.
238 | 
239 | ## 2. Create Shared Volume
240 | 
241 | <center><img src="img/step-2-create-volume.png" width="80%"/> </br>
242 | 
243 | Fig. 2.0 - Step 2 - Create shared volume
244 | </center>
245 | 
246 | With distributed data parallel training, all workers need to have access to the training data. We can achieve that by creating a shared volume which can be mounted in each of the worker pods.
247 | 
248 | To create a shared volume, we will use the scripts in the directory for step 2.
249 | 
250 | ```console
251 | cd ../2-create-volume
252 | ```
253 | 
254 | ### 2.1. Create EFS file system
255 | First we will use the AWS CLI to provision an EFS file system.
256 | 
257 | Execute:
258 | ```console
259 | ./2-1-create-efs.sh
260 | ```
261 | 
262 | Output:
263 | ```
264 | Cluster name do-eks
265 | VPC vpc-0ecd59e0bf1426491
266 | Creating security group ...
267 | {
268 |     "GroupId": "sg-0ab73460e1a1b3e67"
269 | }
270 | eks-efs-group NFS access to EFS from EKS worker nodes sg-0ab73460e1a1b3e67
271 | 
272 | ...
273 | 
274 | Creating EFS volume ...
275 | fs-0b15155937d1c6b83
276 | subnet-07767ca17e93fe901 subnet-04859dc111ed82685
277 | Creating mount target in subnet-07767ca17e93fe901 in security group sg-0ab73460e1a1b3e67 for efs fs-0b15155937d1c6b83
278 | 
279 | ...
280 | 
281 | Done.
282 | ```
283 | 
284 | The EFS file system is now created and configured so that it can be accessed from the EKS cluster.
285 | 
286 | ### 2.2. Create Kubernetes Persistent Volume Claim
287 | In order to create a Kubernetes persistent volume claim (PVC) agains the EFS file system, we need to deploy the EFS container storage interface (CSI) driver to the cluster, then create a storage class and a persistent volume (PV). To do that, execute:
288 | 
289 | ```console
290 | ./2-2-create-pvc.sh
291 | ```
292 | 
293 | Output:
294 | ```
295 | Checking EFS File System ...
296 | EFS volume id fs-0b15155937d1c6b83
297 | 
298 | Deploying EFS CSI Driver ...
299 | serviceaccount/efs-csi-controller-sa created
300 | serviceaccount/efs-csi-node-sa created
301 | clusterrole.rbac.authorization.k8s.io/efs-csi-external-provisioner-role created
302 | clusterrolebinding.rbac.authorization.k8s.io/efs-csi-provisioner-binding created
303 | deployment.apps/efs-csi-controller created
304 | daemonset.apps/efs-csi-node created
305 | csidriver.storage.k8s.io/efs.csi.aws.com configured
306 | efs-csi-controller-66fcf64846-4dcbv   0/3     ContainerCreating   0          6s
307 | efs-csi-controller-66fcf64846-df6p9   0/3     ContainerCreating   0          6s
308 | efs-csi-node-7cnkt                    0/3     ContainerCreating   0          6s
309 | efs-csi-node-9ljw2                    0/3     ContainerCreating   0          6s
310 | 
311 | Generating efs-sc.yaml ...
312 | 
313 | Applying efs-sc.yaml ...
314 | storageclass.storage.k8s.io/efs-sc created
315 | NAME            PROVISIONER             RECLAIMPOLICY   VOLUMEBINDINGMODE      ALLOWVOLUMEEXPANSION   AGE
316 | efs-sc          efs.csi.aws.com         Delete          Immediate              false                  0s
317 | gp2 (default)   kubernetes.io/aws-ebs   Delete          WaitForFirstConsumer   false                  94m
318 | 
319 | Generating efs-pv.yaml ...
320 | Applying efs-pv.yaml ...
321 | persistentvolume/efs-pv created
322 | NAME     CAPACITY   ACCESS MODES   RECLAIM POLICY   STATUS      CLAIM   STORAGECLASS   REASON   AGE
323 | efs-pv   5Gi        RWX            Retain           Available           efs-sc                  11s
324 | 
325 | Creating persistent volume claim efs-pvc ...
326 | persistentvolumeclaim/efs-pvc created
327 | NAME      STATUS   VOLUME   CAPACITY   ACCESS MODES   STORAGECLASS   AGE
328 | efs-pvc   Bound    efs-pv   5Gi        RWX            efs-sc         1s
329 | 
330 | Done.
331 | 
332 | ```
333 | 
334 | ## 3. Build Deep Learning Container
335 | 
336 | <center><img src="img/step-3-build-container.png" width="80%"/> </br>
337 | 
338 | Fig. 3.0 - Step 3 - Build deep learning container
339 | </center>
340 | 
341 | In this step, we will build a container that has code to train our PyTorch model. 
342 | To do that we need to change the current directory to `3-build-container`.
343 | 
344 | ```console
345 | cd ../3-build-container
346 | ```
347 | 
348 | Please note that this folder contains a Dockerfile, python and shell scripts. We will only need to execute the scripts that start with `3-*`.
349 | 
350 | ### 3.1. Build container image
351 | 
352 | To build the container image, execute:
353 | ```console
354 | ./3-1-build.sh
355 | ```
356 | 
357 | Output:
358 | ```
359 | inflating: aws/dist/awscli/data/dax/2017-04-19/completions-1.json  
360 |    creating: aws/dist/awscli/data/health/2016-08-04/
361 |  
362 |  16650K .......... .......... .......... .......... .......... 98% 29.1M 0s
363 |  16700K .......... .......... .......... .......... .......... 99% 23.6M 0s
364 |  16750K .......... .......... .......... .......... .......... 99% 16.3M 0s
365 |  16800K .......... .......... .......... .......... .......... 99% 25.4M 0s
366 |  16850K .......... .......... .....                           100%  268M=1.3s
367 | 
368 | 2022-06-04 07:56:41 (12.3 MB/s) - '/tmp/etcd-v3.4.3/etcd-v3.4.3-linux-amd64.tar.gz' saved [17280028/17280028]
369 | 
370 | ------------------------
371 | etcdctl version: 3.4.3
372 | API version: 3.4
373 | ------------------------
374 | Finished installing etcd v3.4.3. To use: /usr/local/bin/(etcd | etcdctl)
375 | Removing intermediate container 71951321d43d
376 |  
377 |  ...
378 |  
379 |  tep 12/15 : ADD cifar10-model-train.py /workspace/
380 |  ---> 622630ffa5b7
381 | Step 13/15 : ADD cifar10-model-test.py /workspace/
382 |  ---> 33974972d759
383 | Step 14/15 : ADD cnn_model.py /workspace/
384 |  ---> 8d1492e4f0a1
385 | Step 15/15 : ADD data-prep.sh /workspace/
386 |  ---> b1ec9d533050
387 | Successfully built b1ec9d533050
388 | Successfully tagged 620266777012.dkr.ecr.us-west-2.amazonaws.com/pytorch-cpu:latest
389 | 
390 | ```
391 | 
392 | ### 3.2 Push container image to ECR
393 | After it is built, the image needs to be pushed to ECR so it can be used by Kubernetes nodes.
394 | 
395 | Execute:
396 | ```console
397 | ./3-2-push.sh
398 | ```
399 | 
400 | Output:
401 | ```
402 | Logging in to 620266777012.dkr.ecr.us-west-2.amazonaws.com/ ...
403 | WARNING! Your password will be stored unencrypted in /home/ec2-user/.docker/config.json.
404 | Configure a credential helper to remove this warning. See
405 | https://docs.docker.com/engine/reference/commandline/login/#credentials-store
406 | 
407 | Login Succeeded
408 | Pushing pytorch-cpu:latest to registry ...
409 | The push refers to repository [620266777012.dkr.ecr.us-west-2.amazonaws.com/pytorch-cpu]
410 | 85fb7c19f7ba: Pushed 
411 | 1915f933c51f: Pushed 
412 | 69f193e41d27: Pushed 
413 | fac272423a4b: Pushed 
414 | 3c8419b41ef5: Pushed 
415 | 0f550fa492fc: Pushed 
416 | ff0f8f83e19d: Pushed 
417 | 11c114e08199: Pushed 
418 | e9b65af3368a: Pushed 
419 | bf8cedc62fb3: Layer already exists 
420 | latest: digest: sha256:a7bc0842b2681a84ebbfeda35096d8d8f09baffdb0e8ce9d42d6b3f9d983ac6d size: 3459
421 | 
422 | ```
423 | 
424 | ## 4. Download and Preprocess Image Dataset
425 | 
426 | <center><img src="img/step-4-get-data.png" width="80%"/> </br>
427 | 
428 | Fig. 4.0 - Step 4 - Download data
429 | </center>
430 | 
431 | In this step we will run a pod which mounts the persistent volume and downloads the [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset on it.
432 | We will execute the scripts from directory `4-get-data`.
433 | 
434 | ```console
435 | cd ../4-get-data
436 | ```
437 | 
438 | ### 4.1. Launch download pod
439 | The [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) condists of images with size 32x32 pixels, grouped in 10 classes (airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck) with 6,000 images per class. To download this dataset and save it to the shared volume, execute: 
440 | 
441 | ```console
442 | ./4-1-get-data.sh
443 | ```
444 | 
445 | Output:
446 | ```
447 | 
448 | Generating pod manifest ...
449 | 
450 | Creating efs-data-prep pod ...
451 | pod/efs-data-prep-pod created
452 | efs-data-prep-pod       0/1     ContainerCreating   0          3s
453 | 
454 | ```
455 | 
456 | ### 4.2. Show data prep pod status
457 | The data-prep pod status changes from ContainerCreating, to Running, to Complete. To show the current status, execute:
458 | 
459 | ```console
460 | ./4-2-show-status.sh
461 | ```
462 | 
463 | Output:
464 | ```
465 | Describing data prep pod ...
466 | Name:         efs-data-prep-pod
467 | Namespace:    default
468 | Priority:     0
469 | 
470 | ...
471 | 
472 | Tolerations:                 node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
473 |                              node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
474 | Events:
475 |   Type    Reason     Age    From               Message
476 |   ----    ------     ----   ----               -------
477 |   Normal  Scheduled  3m23s  default-scheduler  Successfully assigned default/efs-data-prep-pod to ip-192-168-111-138.us-west-2.compute.internal
478 |   Normal  Pulling    3m16s  kubelet            Pulling image "620266777012.dkr.ecr.us-west-2.amazonaws.com/pytorch-cpu:latest"
479 |   Normal  Pulled     2m57s  kubelet            Successfully pulled image "620266777012.dkr.ecr.us-west-2.amazonaws.com/pytorch-cpu:latest" in 19.458971841s
480 |   Normal  Created    2m43s  kubelet            Created container efs-data-prep-pod
481 |   Normal  Started    2m43s  kubelet            Started container efs-data-prep-pod
482 | 
483 | Showing status of data prep pod ...
484 | efs-data-prep-pod       0/1     Completed   0          3m23s
485 | ```
486 | 
487 | ### 4.3. Show data-prep log
488 | When the pod enters the Running or Completed status, you can display its log by executing:
489 | 
490 | ```console
491 | ./4-3-show-log.sh
492 | ```
493 | 
494 | Output:
495 | ```
496 | Shared path - /efs-shared
497 | --2022-06-05 06:50:53--  https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
498 | Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30
499 | Connecting to www.cs.toronto.edu (www.cs.toronto.edu)|128.100.3.30|:443... connected.
500 | HTTP request sent, awaiting response... 200 OK
501 | Length: 170498071 (163M) [application/x-gzip]
502 | Saving to: 'cifar-10-python.tar.gz'
503 | 
504 |      0K .......... .......... .......... .......... ..........  0%  350K 7m56s
505 |     50K .......... .......... .......... .......... ..........  0%  695K 5m58s
506 |    100K .......... .......... .......... .......... ..........  0%  693K 5m18s
507 |    150K .......... .......... .......... .......... ..........  0% 18.6M 4m1s
508 |    200K .......... .......... .......... .......... ..........  0% 21.5M 3m14s
509 |    250K .......... .......... .......... .......... ..........  0%  732K 3m20s
510 |    300K .......... .......... .......... .......... ..........  0% 66.4M 2m51s
511 |    350K .......... .......... .......... .......... ..........  0% 63.0M 2m30s
512 |    400K .......... .......... .......... .......... ..........  0% 18.6M 2m15s
513 |    450K .......... .......... .......... .......... ..........  0% 60.7M 2m1s
514 |    500K .......... .......... .......... .......... ..........  0% 80.4M 1m50s
515 |    550K .......... .......... .......... .......... ..........  0%  745K 2m0s
516 |    600K .......... .......... .......... .......... ..........  0% 78.2M 1m51s
517 | 
518 |    ...
519 | 
520 | 166250K .......... .......... .......... .......... .......... 99%  118M 0s
521 | 166300K .......... .......... .......... .......... .......... 99%  129M 0s
522 | 166350K .......... .......... .......... .......... .......... 99% 4.00M 0s
523 | 166400K .......... .......... .......... .......... .......... 99%  100M 0s
524 | 166450K .......... .......... .......... .......... .......... 99%  137M 0s
525 | 166500K ..                                                    100% 3858G=4.8s
526 | 
527 | 2022-06-05 06:50:59 (33.9 MB/s) - 'cifar-10-python.tar.gz' saved [170498071/170498071]
528 | ```
529 | 
530 | The last message showing the dataset was saved, indicates a successful download.
531 | 
532 | ## 5. Train Image Classification Model
533 | 
534 | <center><img src="img/step-5-train-model.png" width="80%"/> </br>
535 | 
536 | Fig. 5.0 - Step 5 - Distributed data-parallel model training
537 | </center>
538 | 
539 | Next we will execute the model training scripts from directory `5-train-model`.
540 | 
541 | ```console
542 | cd ../5-train-model
543 | ```
544 | ### 5.1. Generate PyTorchJob
545 | The Kubernetes manifests in this workshop are generated from templates, based on the configuration stored in file [`./env`](.env). To generate the PyTorchJob manifest for our distributed training, execute:
546 | 
547 | ```console
548 | ./5-1-generate-pytorchjob.sh
549 | ```
550 | 
551 | Output:
552 | ```
553 | Generating PyTorchJob manifest ...
554 | 
555 | apiVersion: "kubeflow.org/v1"
556 | kind: PyTorchJob
557 | metadata:
558 |   name: cifar10-train
559 | spec:
560 |   elasticPolicy:
561 |     rdzvBackend: etcd
562 |     rdzvHost: etcd-service
563 |     rdzvPort: 2379
564 |     minReplicas: 1
565 |     maxReplicas: 128
566 |     maxRestarts: 100
567 |     metrics:
568 |       - type: Resource
569 |         resource:
570 |           name: cpu
571 |           target:
572 |             type: Utilization
573 |             averageUtilization: 80
574 |   pytorchReplicaSpecs:
575 |     Worker:
576 |       replicas: 2
577 |       restartPolicy: OnFailure
578 |       template:
579 |         spec:
580 |           containers:
581 |             - name: pytorch
582 |               image: 999701187340.dkr.ecr.us-west-2.amazonaws.com/pytorch-cpu:latest
583 |               imagePullPolicy: IfNotPresent
584 |               env:
585 |               - name: PROCESSOR
586 |                 value: "cpu"
587 |               command:
588 |                 - python3
589 |                 - -m
590 |                 - torch.distributed.run
591 |                 - /workspace/cifar10-model-train.py
592 |                 - "--epochs=10"
593 |                 - "--batch-size=128"
594 |                 - "--workers=15"
595 |                 - "--model-file=/efs-shared/cifar10-model.pth"
596 |                 - "/efs-shared/cifar-10-batches-py/"
597 |               volumeMounts:
598 |                 - name: efs-pv
599 |                   mountPath: /efs-shared
600 |                 # The following enables the worker pods to use increased shared memory
601 |                 # which is required when specifying more than 0 data loader workers
602 |                 - name: dshm
603 |                   mountPath: /dev/shm
604 |           volumes:
605 |             - name: efs-pv
606 |               persistentVolumeClaim:
607 |                 claimName: efs-pvc
608 |             - name: dshm
609 |               emptyDir:
610 |                 medium: Memory
611 | ```
612 | 
613 | The manifest specifies an elastic job named **cifar10-train**. The job is configured to communicate with rendez-vous end point `etcd-service:2379` which is the etcd service we launched in the same namespace. It is also configured to run two workers, each of them on a separate node. Each worker will execute the `torchrun` command and run training for 10 epochs. 
614 | 
615 | ### 5.2. Launch PyTorchJob
616 | Next we will launch the PyTorchJob by applying the generated manifest.
617 | 
618 | Execute:
619 | ```console
620 | ./5-2-launch-pytorchjob.sh
621 | ```
622 | 
623 | Output:
624 | ```
625 | Launching PyTorchJob ...
626 | pytorchjob.kubeflow.org/cifar10-train created
627 | ```
628 | 
629 | ### 5.3. Show training worker pods status
630 | Each launched worker is represented by a pod in the cluster. To see the status of the worker pods, execute:
631 | 
632 | ```console
633 | ./5-3-show-status.sh
634 | ```
635 | 
636 | Output:
637 | ```
638 | cifar10-train-worker-0   1/1     Running     0          47s   192.168.109.172   ip-192-168-111-138.us-west-2.compute.internal   <none>           <none>
639 | cifar10-train-worker-1   1/1     Running     0          47s   192.168.93.104    ip-192-168-90-82.us-west-2.compute.internal     <none>           <none>
640 | ```
641 | 
642 | ### 5.4. Show node utilization
643 | Once the training starts, you will be able to see the CPU utilization of the two nodes rise.
644 | 
645 | Execute:
646 | ```console
647 | ./5-4-show-utilization.sh
648 | ```
649 | 
650 | Output:
651 | ```
652 | NAME                                            CPU(cores)   CPU%   MEMORY(bytes)   MEMORY%   
653 | ip-192-168-111-138.us-west-2.compute.internal   18246m       50%    2306Mi          3%        
654 | ip-192-168-90-82.us-west-2.compute.internal     17936m       50%    2322Mi          3%     
655 | ```
656 | 
657 | ### 5.5. Show training logs
658 | After the worker pods have been created, we can see their combined logs using the **kubetail** tool. 
659 | 
660 | Execute:
661 | ```console
662 | ./5-5-show-logs.sh
663 | ```
664 | 
665 | Output:
666 | ```
667 | Will tail 2 logs...
668 | cifar10-train-worker-0
669 | cifar10-train-worker-1
670 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,775 Keep-alive key /torchelastic/p2p/run_cifar10-train/rdzv/v_1/rank_0 is not renewed.
671 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,775 Rendevous version 1 is incomplete. 
672 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,775 Attempting to destroy it.
673 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,776 Destroyed rendezvous version 1 successfully.
674 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,776 Previously existing rendezvous state changed. Will re-try joining.
675 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,776 Attempting to join next rendezvous
676 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,780 New rendezvous state created: {'status': 'joinable', 'version': '2', 'participants': []}
677 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,869 Joined rendezvous version 2 as rank 0. Full state: {'status': 'joinable', 'version': '2', 'participants': [0]}
678 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,869 Rank 0 is responsible for join last call.
679 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:22,776 Keep-alive key /torchelastic/p2p/run_cifar10-train/rdzv/v_1/rank_0 is not renewed.
680 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:22,776 Rendevous version 1 is incomplete. 
681 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:22,777 Attempting to destroy it.
682 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:22,778 Rendezvous attempt failed, will retry. Reason: Compare failed : [{"status": "final", "version": "1", "participants": [0], "keep_alives": ["/torchelastic/p2p/run_cifar10-train/rdzv/v_1/rank_0"], "num_workers_waiting": 2} != {"status": "setup"}]
683 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:23,779 Attempting to join next rendezvous
684 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:23,784 Observed existing rendezvous state: {'status': 'joinable', 'version': '2', 'participants': [0]}
685 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:23,816 Joined rendezvous version 2 as rank 1. Full state: {'status': 'joinable', 'version': '2', 'participants': [0, 1]}
686 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:23,816 Waiting for remaining peers.
687 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:53,867 Rank 0 finished join last call.
688 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:53,869 All peers arrived. Confirming membership.
689 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:53,867 Waiting for remaining peers.
690 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:53,867 All peers arrived. Confirming membership.
691 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:53,890 Waiting for confirmations from all peers.
692 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:53,913 Waiting for confirmations from all peers.
693 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:53,913 Rendezvous version 2 is complete. Final state: {'status': 'final', 'version': '2', 'participants': [0, 1], 'keep_alives': ['/torchelastic/p2p/run_cifar10-train/rdzv/v_2/rank_0', '/torchelastic/p2p/run_cifar10-train/rdzv/v_2/rank_1'], 'num_workers_waiting': 0}
694 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:53,915 Rendezvous version 2 is complete. Final state: {'status': 'final', 'version': '2', 'participants': [0, 1], 'keep_alives': ['/torchelastic/p2p/run_cifar10-train/rdzv/v_2/rank_0', '/torchelastic/p2p/run_cifar10-train/rdzv/v_2/rank_1'], 'num_workers_waiting': 0}
695 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:53,915 Creating EtcdStore as the c10d::Store implementation
696 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:53,913 Creating EtcdStore as the c10d::Store implementation
697 | [cifar10-train-worker-0] reading /efs-shared/cifar-10-batches-py/
698 | [cifar10-train-worker-1] reading /efs-shared/cifar-10-batches-py/
699 | [cifar10-train-worker-1] [1,     5] loss: 2.335
700 | [cifar10-train-worker-0] [1,     5] loss: 2.323
701 | [cifar10-train-worker-1] [1,    10] loss: 2.247
702 | [cifar10-train-worker-0] [1,    10] loss: 2.225
703 | [cifar10-train-worker-1] [1,    15] loss: 2.168
704 | [cifar10-train-worker-0] [1,    15] loss: 2.163
705 | [cifar10-train-worker-1] [1,    20] loss: 2.061
706 | [cifar10-train-worker-0] [1,    20] loss: 2.077
707 | [cifar10-train-worker-1] [1,    25] loss: 2.011
708 | [cifar10-train-worker-0] [1,    25] loss: 2.010
709 | [cifar10-train-worker-1] [1,    30] loss: 1.963
710 | [cifar10-train-worker-0] [1,    30] loss: 1.938
711 | ...
712 | [cifar10-train-worker-1] [6,   180] loss: 0.496
713 | [cifar10-train-worker-0] [6,   185] loss: 0.499
714 | [cifar10-train-worker-1] [6,   185] loss: 0.503
715 | [cifar10-train-worker-1] [6,   190] loss: 0.504
716 | [cifar10-train-worker-0] [6,   190] loss: 0.594
717 | [cifar10-train-worker-0] [6,   195] loss: 0.536
718 | [cifar10-train-worker-1] [6,   195] loss: 0.522
719 | [cifar10-train-worker-0] [7,     5] loss: 0.470
720 | [cifar10-train-worker-1] [7,     5] loss: 0.464
721 | [cifar10-train-worker-0] [7,    10] loss: 0.510
722 | [cifar10-train-worker-1] [7,    10] loss: 0.465
723 | [cifar10-train-worker-0] [7,    15] loss: 0.525
724 | [cifar10-train-worker-1] [7,    15] loss: 0.489
725 | [cifar10-train-worker-0] [7,    20] loss: 0.479
726 | [cifar10-train-worker-1] [7,    20] loss: 0.478
727 | [cifar10-train-worker-0] [7,    25] loss: 0.523
728 | [cifar10-train-worker-1] [7,    25] loss: 0.520
729 | ...
730 | [cifar10-train-worker-0] [10,   190] loss: 0.247
731 | [cifar10-train-worker-1] [10,   190] loss: 0.185
732 | [cifar10-train-worker-0] [10,   195] loss: 0.200
733 | [cifar10-train-worker-1] [10,   195] loss: 0.202
734 | [cifar10-train-worker-0] saving model: /efs-shared/cifar10-model.pth
735 | [cifar10-train-worker-1] saving model: /efs-shared/cifar10-model.pth
736 | [cifar10-train-worker-1] Finished Training
737 | [cifar10-train-worker-0] Finished Training
738 | ```
739 | 
740 | In the beginning of the logs you will see the workers registering with the rendez-vous endpoint to coordinate their work, then they will train collaboratively over 10 epochs. Each epoch has 400 iterations. Since we are training with two workers, the work is split in two and each of the workers executes only 200 iterations from the epoch. As the training progresses, you will see the `loss` decrease, which indicates that the model is converging. At the end of the 10th epoch, we save the model to the shared volume.
741 | 
742 | Press `Ctrl-C` to stop tailing the logs at any time. 
743 | 
744 | ## 5.6. Delete ElasticJob (*Optional*)
745 | If you wish to run another instance of the elastic job, please delete the current job first.
746 | 
747 | Execute:
748 | ```console
749 | ./5-6-delete-pytorchjob.sh
750 | ```
751 | 
752 | Output:
753 | ```
754 | Deleting PyTorchJob ...
755 | pytorchjob.kubeflow.org "cifar10-train" deleted
756 | ```
757 | 
758 | Note: when starting a new job instance if the workers fail to start with errors indicating failure to connect to the rendez-vous service, please delete the etcd pod as well before starting the elastic job.
759 | 
760 | ## 6. Test Model using New Images
761 | 
762 | <center><img src="img/step-6-test-model.png" width="80%"/> </br>
763 | 
764 | Fig. 6.0 - Step 6 - Test model
765 | </center>
766 | 
767 | This step will be executed from directory `6-test-model`.
768 | 
769 | ```console
770 | cd ../6-test-model
771 | ```
772 | 
773 | ### 6.1 Generate test job
774 | We are going to use a standard Kubernetes job manifest (as opposed to an ElasticJob manifest, which we used for training) since we do not need to run the test in a distributed manner. To generate the job manifest, execute:
775 | 
776 | ```console
777 | ./6-1-generate-job.sh
778 | ```
779 | 
780 | Output:
781 | ```
782 | Generating test job manifest ...
783 | apiVersion: batch/v1
784 | kind: Job
785 | metadata:
786 |   name: cifar10-test
787 | spec:
788 |   template:
789 |     spec:
790 |       restartPolicy: Never
791 |       nodeSelector:
792 |         beta.kubernetes.io/instance-type: c5.4xlarge
793 |       containers:
794 |         - name: test
795 |           image: 042407962002.dkr.ecr.us-west-2.amazonaws.com/pytorch-cpu:latest
796 |           imagePullPolicy: Always
797 |           command: ["python3"]
798 |           args:
799 |             - "/workspace/cifar10-model-test.py"
800 |             - "--model-file=/efs-shared/cifar10-model.pth"
801 |             - "/efs-shared/cifar-10-batches-py/"
802 |           volumeMounts:
803 |             - name: efs-pv
804 |               mountPath: /efs-shared 
805 |       volumes:
806 |         - name: efs-pv
807 |           persistentVolumeClaim:
808 |             claimName: efs-pvc
809 | ```
810 | 
811 | As evident from the manifest above, we will create a single pod named **cifar10-test** and execute the `cifar10-model-test.py` script in it, passing the model file that we saved from the training step.
812 | 
813 | ### 6.2. Launch job
814 | The test job will take 10,000 images that were not used during training and use the model to classify them. Then it will calculate accuracy measurements.
815 | 
816 | Execute:
817 | ```console
818 | ./6-2-launch-job.sh
819 | ```
820 | 
821 | Output:
822 | ```
823 | Launching test job ...
824 | job.batch/cifar10-test created
825 | ```
826 | 
827 | ### 6.3. Show job status
828 | When the job manifest is applied, a pod is created and runs to completion. To see the pod status, execute:
829 | 
830 | ```console
831 | ./6-3-show-status.sh
832 | ```
833 | 
834 | Output:
835 | ```
836 | Showing test job status ...
837 | cifar10-test-tlnjn       1/1     Running     0          4s
838 | ```
839 | 
840 | ### 6.4. Show test log
841 | The results from the test are written to the pod log.
842 | 
843 | Execute:
844 | ```console
845 | ./6-4-show-log.sh
846 | ```
847 | 
848 | Output:
849 | ```
850 | Showing cifar10-test log ...
851 | 
852 | reading /efs-shared/cifar-10-batches-py/
853 | loading model /efs-shared/cifar10-model.pth
854 | Accuracy of the network on the 10000 test images: 74 %
855 | Accuracy for class: plane is 77.5 %
856 | Accuracy for class: car   is 85.2 %
857 | Accuracy for class: bird  is 64.4 %
858 | Accuracy for class: cat   is 53.3 %
859 | Accuracy for class: deer  is 71.3 %
860 | Accuracy for class: dog   is 66.6 %
861 | Accuracy for class: frog  is 85.9 %
862 | Accuracy for class: horse is 80.5 %
863 | Accuracy for class: ship  is 82.8 %
864 | Accuracy for class: truck is 82.4 %
865 | Finished Testing
866 | ```
867 | 
868 | As we can see the model classified the images into 10 different categories with overall accuracy of 74%.
869 | 
870 | ### 6.5. Delete test job
871 | In the event that the test job needs to be run again for a different model, the old job needs to be deleted first.
872 | 
873 | Execute:
874 | ```console
875 | ./6-5-delete-job.sh
876 | ```
877 | 
878 | Output:
879 | ```
880 | Deleting test job ...
881 | job.batch "cifar10-test" deleted
882 | ```
883 | 
884 | ### 6.6. Optional exercise
885 | We have run distributed training on two nodes. Edit the autoscaling group to set the desired number of nodes to 4, then modify the configuration file `.env` to reflect the new number of nodes and re-run the training job. You will notice that the time to run 10 epochs decreases as the workload gets distributed among more nodes.
886 | 
887 | ## 7. Cleanup (optional)
888 | 
889 | <center><img src="img/step-7-cleanup.png" width="80%"/> </br>
890 | 
891 | Fig. 7.0 - Step 7 - Cleanup
892 | </center>
893 | 
894 | Optionally you can execute the scripts in the cleanup folder to delete the shared storage volume and the EKS cluster you created for this workshop.
895 | 
896 | ```console
897 | cd ../7-cleanup
898 | ```
899 | 
900 | ### 7.1. Delete EFS volume
901 | The EFS file system needs to be deleted first since it is associated with subnets within the VPC used by the EKS cluster.
902 | 
903 | Execute:
904 | ```console
905 | ./7-1-delete-efs.sh
906 | ```
907 | 
908 | Output:
909 | ```
910 | Deleting EFS mount targets for File System fs-070041b9153fa56b8 ...
911 | Deleting mount target fsmt-02128c3560394ce31
912 | Deleting mount target fsmt-0f80225b4ba7580b0
913 | Deleting EFS file system fs-070041b9153fa56b8 ...
914 | 
915 | Done.
916 | ```
917 | 
918 | Note: If an error occurs during the deletion of the file system, please wait for a minute and run the script again. The EFS file system can only be deleted after the mount targets are fully deleted.
919 | 
920 | ### 7.2. Delete EKS cluster
921 | Performing this step deletes all of the remaining infrastructure that was used in this workshop. This includes the node groups, cluster, NAT gateways, subnets, and VPC.
922 | 
923 | Execute:
924 | ```console
925 | ./7-2-delete-cluster.sh
926 | ```
927 | 
928 | Output:
929 | ```
930 | Deleting cluster do-eks. Proceed? [Y/n]: Y
931 | Confirmed ...
932 | 2022-06-07 02:03:19 [ℹ]  eksctl version 0.66.0
933 | 2022-06-07 02:03:19 [ℹ]  using region us-west-2
934 | 2022-06-07 02:03:19 [ℹ]  deleting EKS cluster "do-eks"
935 | 2022-06-07 02:03:20 [ℹ]  deleted 0 Fargate profile(s)
936 | 2022-06-07 02:03:20 [✔]  kubeconfig has been updated
937 | 2022-06-07 02:03:20 [ℹ]  cleaning up AWS load balancers created by Kubernetes objects of Kind Service or Ingress
938 | 2022-06-07 02:03:27 [ℹ]  3 sequential tasks: { delete nodegroup "wks-node", 2 sequential sub-tasks: { 2 sequential sub-tasks: { delete IAM role for serviceaccount "kube-system/aws-node", delete serviceaccount "kube-system/aws-node" }, delete IAM OIDC provider }, delete cluster control plane "do-eks" [async] }
939 | 2022-06-07 02:03:27 [ℹ]  will delete stack "eksctl-do-eks-nodegroup-wks-node"
940 | 2022-06-07 02:03:27 [ℹ]  waiting for stack "eksctl-do-eks-nodegroup-wks-node" to get deleted
941 | 2022-06-07 02:03:27 [ℹ]  waiting for CloudFormation stack "eksctl-do-eks-nodegroup-wks-node"
942 | ...
943 | 2022-06-07 02:07:17 [ℹ]  waiting for CloudFormation stack "eksctl-do-eks-nodegroup-wks-node"
944 | 2022-06-07 02:07:17 [ℹ]  will delete stack "eksctl-do-eks-addon-iamserviceaccount-kube-system-aws-node"
945 | 2022-06-07 02:07:17 [ℹ]  waiting for stack "eksctl-do-eks-addon-iamserviceaccount-kube-system-aws-node" to get deleted
946 | 2022-06-07 02:07:17 [ℹ]  waiting for CloudFormation stack "eksctl-do-eks-addon-iamserviceaccount-kube-system-aws-node"
947 | 2022-06-07 02:07:34 [ℹ]  waiting for CloudFormation stack "eksctl-do-eks-addon-iamserviceaccount-kube-system-aws-node"
948 | 2022-06-07 02:07:34 [ℹ]  deleted serviceaccount "kube-system/aws-node"
949 | 2022-06-07 02:07:34 [ℹ]  will delete stack "eksctl-do-eks-cluster"
950 | 2022-06-07 02:07:34 [✔]  all cluster resources were deleted
951 | Please note that the cluster will be fully deleted when the Cloud Formation stack completes its removal
952 | Only after the process in Cloud Formation is finished, you will be able to create a new cluster with the same name
953 | ```
954 | 
955 | # Conclusion
956 | Congratulations on completing this Distributed Model Training workshop!
957 | You now have experience with building and running a distributed model training architecture on AWS EKS.
958 | The techniques demonstrated here are generic and can be applied for your own distributed model trainig needs and at larger scale.
959 | 
960 | # License
961 | This repository is released under the MIT-0 License. See the [LICENSE](LICENSE) file for details.
962 | 
963 | # References
964 | * [Docker](https://docker.com)
965 | * [PyTorch](https://pytorch.org)
966 | * [Kubernetes](https://kubernetes.io)
967 | * [Amazon Web Services (AWS)](https://aws.amazon.com/)
968 | * [Amazon Elastic Kubernetes Service (EKS)](https://aws.amazon.com/eks)
969 | * [Do Framework](https://bit.ly/do-framework)
970 | * [Do EKS Project](https://github.com/aws-samples/aws-do-eks)
971 | 


--------------------------------------------------------------------------------
/SETUP.md:
--------------------------------------------------------------------------------
  1 | ## 0. Prerequisites setup
  2 | This document describes how to set up your AWS account and Cloud9 IDE, which will be used to execute all of the steps in the workshop.
  3 | 
  4 | ### 0.1. Setup AWS Account
  5 | For this workshop, you may use your own AWS account, or use an account generated by AWS Event Engine. If you are using your own AWS account, please proceed to Section 0.2. If you would like to receive a temporary AWS Account through AWS Event Engine follow these steps:
  6 | 
  7 |     1. Go to the Event Engine link provided by your workshop host
  8 |     2. Follow the on-screen instructions to gain access to your temporary AWS account
  9 | 
 10 | Once you have logged in successfully, proceed to create an IAM user.
 11 | 
 12 | ### 0.2. Create IAM user with admin rights
 13 | Once logged into the account through the [AWS console](https://console.aws.amazon.com/console/home?region=us-west-2#), navigate to [IAM Users](https://console.aws.amazon.com/iamv2/home?#/users) and add a new user by clicking the **Add users** button and filling out the form as shown below. Use **inferentia_user** as the User Name. 
 14 | <div align="center">
 15 | <img src="img/iam-add-user.png" width="80%">
 16 | <br/>
 17 | Fig. 0.1 - Add user screen
 18 | </div>
 19 | <br/>
 20 | 
 21 | Click **Next: Permissions** and click the **Create group** button on the screen. 
 22 | 
 23 | <div align="center">
 24 | <img src="img/iam-create-group.png" width="80%">
 25 | <br/>
 26 | Fig. 0.2 - Set permissions screen
 27 | </div>
 28 | <br/>
 29 | 
 30 | Provide group name **admins** and select the **AdministratorAccess** policy as shown below.
 31 | 
 32 | <div align="center">
 33 | <img src="img/iam-add-group.png" width="80%">
 34 | <br/>
 35 | Fig. 0.3 - Create group
 36 | </div>
 37 | <br/>
 38 | 
 39 | Click the **Create group** button and you will be brought back to the **Set permissions** screen. Select the **admins** group as shown on the figure below, then click **Next: Tags** .
 40 | <div align="center">
 41 | <img src="img/iam-add-user-admins-group.png" width="80%">
 42 | <br/>
 43 | Fig. 0.4 - Add user to admins group
 44 | </div>
 45 | 
 46 | 
 47 | Follow the wizard through to the end to create the user (remaining options can be left as default). When the user is added successfully, you will see a confirmation screen from which you can copy the user's Access Key and Secret Access Key. 
 48 | <div align="center">
 49 | <img src="img/iam-add-user-access-key.png" width="80%">
 50 | <br/>
 51 | Fig. 0.5 - Confirmation screen with access key information for new user
 52 | </div>
 53 | <br/>
 54 | 
 55 | Click the **Download .csv** button to download the user's credentials as a `.csv` file. Alternatively you can press the **Show** link and copy/paste the **Access key ID** and **Secret access key** locally. You will need to enter the crecentials later while you are completing the exercises in this workshop. This is the only time these credentials will be available for download or display. You will be able to generate new credentials if necessary.
 56 | 
 57 | ### 0.3) Sign into the AWS Console
 58 | In this step you will sign in to the AWS Console as the user you just created.
 59 | Pull down the user menu from your current AWS Console screen and copy the Account number displayed next to **My Account** as shown on the figure below.
 60 | <div align="center">
 61 | <img src="img/aws-console-my-account.png" width="40%">
 62 | <br/>
 63 | Fig. 0.6 - Sign out of AWS Console
 64 | </div>
 65 | <br/>
 66 | 
 67 | Once you have copied the account number, cick **Sign Out**, then click **Sign In to the Console**.
 68 | 
 69 | <div align="center">
 70 | <img src="img/aws-console-signin.png" width="60%">
 71 | <br/>
 72 | Fig. 0.7 - Sign in landing screen
 73 | </div>
 74 | <br/>
 75 | 
 76 | On the **Sign in** screen select **IAM user**, enter the **Account ID** that you just copied, and click Next.
 77 | 
 78 | <div align="center">
 79 | <img src="img/aws-console-signin-iam-user.png" width="60%">
 80 | <br/>
 81 | Fig. 0.8 - Sign in as IAM user
 82 | </div>
 83 | <br/>
 84 | 
 85 | When presented with the login screen shown below, fill in the IAM username and password that you created in the previous step.
 86 | 
 87 | Next, click the **Sign in** button and sign in as the new IAM user.
 88 | 
 89 | ### 0.4. Setup Cloud9 IDE
 90 | 
 91 | Please verify that the `us-west-2` region **Oregon** is selected in your console and is showing in the upper right corner of your browser as highlighted in the figure below. We will use [Cloud9](https://us-west-2.console.aws.amazon.com/cloud9/home/product) to execute the steps in this workshop. To provision a Cloud9 IDE, click on the **Services** menu (from the top left of the screen) then select **Developer Tools** and choose **Cloud9**, or just open the following <a href="https://us-west-2.console.aws.amazon.com/cloud9/home/product" target="_blank">link to Cloud9</a>.
 92 | 
 93 | <div align="center">
 94 | <img src="img/aws-console-cloud9-link.png" width="100%">
 95 | <br/>
 96 | Fig. 0.9 - Cloud9 link
 97 | </div>
 98 | <br/>
 99 | 
100 | Following the link will open the Cloud9 landing page.
101 | 
102 | <div align="center">
103 | <img src="img/cloud9-landing-page.png" width="100%">
104 | <br/>
105 | Fig. 0.10 - Cloud9 landing page
106 | </div>
107 | <br/>
108 | 
109 | Click on the `Create environment` button.
110 | 
111 | <div align="center">
112 | <img src="img/cloud9-name-environment.png" width="100%">
113 | <br/>
114 | Fig. 0.11 - Cloud9 name environment screen
115 | </div>
116 | <br/>
117 | 
118 | Type a name for your Cloud9 environment, then click `Next`.
119 | 
120 | <div align="center">
121 | <img src="img/cloud9-configure-settings.png" width="100%">
122 | <br/>
123 | Fig. 0.12 - Cloud9 configure settings screen
124 | </div>
125 | <br/>
126 | 
127 | Under **Instance type** selcet `Other instance type` and `c5.9xlarge`. Then click **Next step** and **Create environment**.
128 | This will launch your Cloud9 instance. Provisioning of the instance can take a few minutes.
129 | 
130 | <div align="center">
131 | <img src="img/cloud9-configure-settings.png" width="100%">
132 | <br/>
133 | Fig. 0.13 - Cloud9 instance
134 | </div>
135 | <br/>
136 | 
137 | The default Cloud9 instance comes with a root EBS volume that is only 10GB in size. 
138 | 
139 | <div align="center">
140 | <img src="img/cloud9-ide-manage-ec2.png" width="100%">
141 | <br/>
142 | Fig. 0.14 - Cloud9 Manage EC2 Instance
143 | </div>
144 | <br/>
145 | 
146 | We will increase the root volume size, to avoid running out of space later in the workshop.
147 | Click on the user icon in the upper-right corner and select **Manage EC2 Instance**.
148 | 
149 | <div align="center">
150 | <img src="img/cloud9-instance-storage.png" width="100%">
151 | <br/>
152 | Fig. 0.15 - Cloud9 EC2 Instance Storage
153 | </div>
154 | <br/>
155 | 
156 | Select the instance, then click on the **Storage** tab and click on the link under **Volume ID** to select the current root volume. 
157 | 
158 | <div align="center">
159 | <img src="img/cloud9-volume-actions.png" width="100%">
160 | <br/>
161 | Fig. 0.16 - Cloud9 EC2 Instance Volume
162 | </div>
163 | <br/>
164 | 
165 | Select the volume, then click on Actions and select **Modify volume**.
166 | 
167 | <div align="center">
168 | <img src="img/cloud9-modify-volume.png" width="100%">
169 | <br/>
170 | Fig. 0.16 - Cloud9 Modify Volume
171 | </div>
172 | <br/>
173 | 
174 | Increase the size of the volume by typing the desired size in the **Size (GiB)** field, then click **Modify**, and confirm.
175 | 
176 | <div align="center">
177 | <img src="img/cloud9-volume-optimizing.png" width="100%">
178 | <br/>
179 | Fig. 0.17 - Cloud9 volume optimizing
180 | </div>
181 | <br/>
182 | 
183 | The volume status changes to **In-use - modifying** and in a few seconds becomes **In-use - optimizing**. As soon as the status changes to **optimizing** we need to reboot the instance in order for the resized volume to become available in Cloud9.
184 | 
185 | <div align="center">
186 | <img src="img/cloud9-reboot-instance.png" width="100%">
187 | <br/>
188 | Fig. 0.18 - Cloud9 reboot instance
189 | </div>
190 | <br/>
191 | 
192 | To reboot the instance, select **Instances** from the console navigation menu, then highlight the instance and select **Instance state -> Reboot**.
193 | 
194 | <div align="center">
195 | <img src="img/cloud9-resized-volume.png" width="100%">
196 | <br/>
197 | Fig. 0.19 - Cloud9 IDE with resized volume
198 | </div>
199 | <br/>
200 | 
201 | Once the instance is restarted, refresh the Cloud9 IDE window and type `df -h` in the terminal window. You should see that the root volume has the size you specified earlier.
202 | 
203 | Open the IDE Preferences by clicking on the settings icon in the upper-right corner of the screen, or by clicking the Cloud9 icon in the menu and selecting Preferences. Scroll the list of preferences down and selct the `AWS Settings` section. Disable the `AWS managed temporary credentials` setting as shown below.
204 | 
205 | <div align="center">
206 | <img src="img/cloud9-managed-credentials.png" width="100%">
207 | <br/>
208 | Fig. 0.20 - Disable Cloud9 IDE `AWS managed temporary credentials` setting
209 | </div>
210 | <br/>
211 | 
212 | 
213 | ### 0.5 Clone workshop repository
214 | 
215 | ```
216 | git clone https://github.com/aws-samples/aws-distributed-training-workshop-eks.git
217 | ```
218 | 
219 | and
220 | 
221 | ```
222 | cd aws-distributed-training-workshop-eks
223 | ```
224 | 
225 | Your Cloud9 work environment is now completely set up and you are ready to dive into the [Distributed Model Training Workshop for AWS EKS](README.md).
226 | 


--------------------------------------------------------------------------------
/THIRD-PARTY-LICENSES:
--------------------------------------------------------------------------------
  1 | The AWS Distributed Training Workshop for EKS includes the following third-party software/licensing:
  2 | 
  3 | ** PyTorch - https://pytorch.org
  4 | 
  5 | License URL: https://github.com/pytorch/pytorch/blob/master/LICENSE
  6 | 
  7 | From PyTorch:
  8 | 
  9 | Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
 10 | Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
 11 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
 12 | Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
 13 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
 14 | Copyright (c) 2011-2013 NYU                      (Clement Farabet)
 15 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
 16 | Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
 17 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
 18 | 
 19 | From Caffe2:
 20 | 
 21 | Copyright (c) 2016-present, Facebook Inc. All rights reserved.
 22 | 
 23 | All contributions by Facebook:
 24 | Copyright (c) 2016 Facebook Inc.
 25 | 
 26 | All contributions by Google:
 27 | Copyright (c) 2015 Google Inc.
 28 | All rights reserved.
 29 | 
 30 | All contributions by Yangqing Jia:
 31 | Copyright (c) 2015 Yangqing Jia
 32 | All rights reserved.
 33 | 
 34 | All contributions by Kakao Brain:
 35 | Copyright 2019-2020 Kakao Brain
 36 | 
 37 | All contributions by Cruise LLC:
 38 | Copyright (c) 2022 Cruise LLC.
 39 | All rights reserved.
 40 | 
 41 | All contributions from Caffe:
 42 | Copyright(c) 2013, 2014, 2015, the respective contributors
 43 | All rights reserved.
 44 | 
 45 | All other contributions:
 46 | Copyright(c) 2015, 2016 the respective contributors
 47 | All rights reserved.
 48 | 
 49 | Caffe2 uses a copyright model similar to Caffe: each contributor holds
 50 | copyright over their contributions to Caffe2. The project versioning records
 51 | all such contribution and copyright details. If a contributor wants to further
 52 | mark their specific copyright on a particular contribution, they should
 53 | indicate their copyright solely in the commit message of the change when it is
 54 | committed.
 55 | 
 56 | All rights reserved.
 57 | 
 58 | Redistribution and use in source and binary forms, with or without
 59 | modification, are permitted provided that the following conditions are met:
 60 | 
 61 | 1. Redistributions of source code must retain the above copyright
 62 |    notice, this list of conditions and the following disclaimer.
 63 | 
 64 | 2. Redistributions in binary form must reproduce the above copyright
 65 |    notice, this list of conditions and the following disclaimer in the
 66 |    documentation and/or other materials provided with the distribution.
 67 | 
 68 | 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
 69 |    and IDIAP Research Institute nor the names of its contributors may be
 70 |    used to endorse or promote products derived from this software without
 71 |    specific prior written permission.
 72 | 
 73 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 74 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 75 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 76 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 77 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 78 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 79 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 80 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 81 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 82 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 83 | POSSIBILITY OF SUCH DAMAGE.
 84 | 
 85 | ------------------------------------------------------------
 86 | 
 87 | ** TorchElastic - https://github.com/pytorch/elastic/
 88 | 
 89 | https://github.com/pytorch/elastic/blob/master/LICENSE
 90 | 
 91 | BSD 3-Clause License
 92 | 
 93 | Copyright (c) 2019-present, Facebook, Inc.
 94 | All rights reserved.
 95 | 
 96 | Redistribution and use in source and binary forms, with or without
 97 | modification, are permitted provided that the following conditions are met:
 98 | 
 99 | * Redistributions of source code must retain the above copyright notice, this
100 |   list of conditions and the following disclaimer.
101 | 
102 | * Redistributions in binary form must reproduce the above copyright notice,
103 |   this list of conditions and the following disclaimer in the documentation
104 |   and/or other materials provided with the distribution.
105 | 
106 | * Neither the name of the copyright holder nor the names of its
107 |   contributors may be used to endorse or promote products derived from
108 |   this software without specific prior written permission.
109 | 
110 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
111 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
112 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
113 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
114 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
115 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
116 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
117 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
118 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
119 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
120 | 
121 | ------------------------------------------------------------
122 | ** kubetail - https://github.com/johanhaleby/kubetail
123 | ** kubectx - https://github.com/ahmetb/kubectx
124 | ** yq - https://github.com/kislyuk/yq
125 | ** Kubernetes metrics server - https://github.com/kubernetes-sigs/metrics-server
126 | 
127 |                                  Apache License
128 |                            Version 2.0, January 2004
129 |                         http://www.apache.org/licenses/
130 | 
131 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
132 | 
133 |    1. Definitions.
134 | 
135 |       "License" shall mean the terms and conditions for use, reproduction,
136 |       and distribution as defined by Sections 1 through 9 of this document.
137 | 
138 |       "Licensor" shall mean the copyright owner or entity authorized by
139 |       the copyright owner that is granting the License.
140 | 
141 |       "Legal Entity" shall mean the union of the acting entity and all
142 |       other entities that control, are controlled by, or are under common
143 |       control with that entity. For the purposes of this definition,
144 |       "control" means (i) the power, direct or indirect, to cause the
145 |       direction or management of such entity, whether by contract or
146 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
147 |       outstanding shares, or (iii) beneficial ownership of such entity.
148 | 
149 |       "You" (or "Your") shall mean an individual or Legal Entity
150 |       exercising permissions granted by this License.
151 | 
152 |       "Source" form shall mean the preferred form for making modifications,
153 |       including but not limited to software source code, documentation
154 |       source, and configuration files.
155 | 
156 |       "Object" form shall mean any form resulting from mechanical
157 |       transformation or translation of a Source form, including but
158 |       not limited to compiled object code, generated documentation,
159 |       and conversions to other media types.
160 | 
161 |       "Work" shall mean the work of authorship, whether in Source or
162 |       Object form, made available under the License, as indicated by a
163 |       copyright notice that is included in or attached to the work
164 |       (an example is provided in the Appendix below).
165 | 
166 |       "Derivative Works" shall mean any work, whether in Source or Object
167 |       form, that is based on (or derived from) the Work and for which the
168 |       editorial revisions, annotations, elaborations, or other modifications
169 |       represent, as a whole, an original work of authorship. For the purposes
170 |       of this License, Derivative Works shall not include works that remain
171 |       separable from, or merely link (or bind by name) to the interfaces of,
172 |       the Work and Derivative Works thereof.
173 | 
174 |       "Contribution" shall mean any work of authorship, including
175 |       the original version of the Work and any modifications or additions
176 |       to that Work or Derivative Works thereof, that is intentionally
177 |       submitted to Licensor for inclusion in the Work by the copyright owner
178 |       or by an individual or Legal Entity authorized to submit on behalf of
179 |       the copyright owner. For the purposes of this definition, "submitted"
180 |       means any form of electronic, verbal, or written communication sent
181 |       to the Licensor or its representatives, including but not limited to
182 |       communication on electronic mailing lists, source code control systems,
183 |       and issue tracking systems that are managed by, or on behalf of, the
184 |       Licensor for the purpose of discussing and improving the Work, but
185 |       excluding communication that is conspicuously marked or otherwise
186 |       designated in writing by the copyright owner as "Not a Contribution."
187 | 
188 |       "Contributor" shall mean Licensor and any individual or Legal Entity
189 |       on behalf of whom a Contribution has been received by Licensor and
190 |       subsequently incorporated within the Work.
191 | 
192 |    2. Grant of Copyright License. Subject to the terms and conditions of
193 |       this License, each Contributor hereby grants to You a perpetual,
194 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
195 |       copyright license to reproduce, prepare Derivative Works of,
196 |       publicly display, publicly perform, sublicense, and distribute the
197 |       Work and such Derivative Works in Source or Object form.
198 | 
199 |    3. Grant of Patent License. Subject to the terms and conditions of
200 |       this License, each Contributor hereby grants to You a perpetual,
201 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
202 |       (except as stated in this section) patent license to make, have made,
203 |       use, offer to sell, sell, import, and otherwise transfer the Work,
204 |       where such license applies only to those patent claims licensable
205 |       by such Contributor that are necessarily infringed by their
206 |       Contribution(s) alone or by combination of their Contribution(s)
207 |       with the Work to which such Contribution(s) was submitted. If You
208 |       institute patent litigation against any entity (including a
209 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
210 |       or a Contribution incorporated within the Work constitutes direct
211 |       or contributory patent infringement, then any patent licenses
212 |       granted to You under this License for that Work shall terminate
213 |       as of the date such litigation is filed.
214 | 
215 |    4. Redistribution. You may reproduce and distribute copies of the
216 |       Work or Derivative Works thereof in any medium, with or without
217 |       modifications, and in Source or Object form, provided that You
218 |       meet the following conditions:
219 | 
220 |       (a) You must give any other recipients of the Work or
221 |           Derivative Works a copy of this License; and
222 | 
223 |       (b) You must cause any modified files to carry prominent notices
224 |           stating that You changed the files; and
225 | 
226 |       (c) You must retain, in the Source form of any Derivative Works
227 |           that You distribute, all copyright, patent, trademark, and
228 |           attribution notices from the Source form of the Work,
229 |           excluding those notices that do not pertain to any part of
230 |           the Derivative Works; and
231 | 
232 |       (d) If the Work includes a "NOTICE" text file as part of its
233 |           distribution, then any Derivative Works that You distribute must
234 |           include a readable copy of the attribution notices contained
235 |           within such NOTICE file, excluding those notices that do not
236 |           pertain to any part of the Derivative Works, in at least one
237 |           of the following places: within a NOTICE text file distributed
238 |           as part of the Derivative Works; within the Source form or
239 |           documentation, if provided along with the Derivative Works; or,
240 |           within a display generated by the Derivative Works, if and
241 |           wherever such third-party notices normally appear. The contents
242 |           of the NOTICE file are for informational purposes only and
243 |           do not modify the License. You may add Your own attribution
244 |           notices within Derivative Works that You distribute, alongside
245 |           or as an addendum to the NOTICE text from the Work, provided
246 |           that such additional attribution notices cannot be construed
247 |           as modifying the License.
248 | 
249 |       You may add Your own copyright statement to Your modifications and
250 |       may provide additional or different license terms and conditions
251 |       for use, reproduction, or distribution of Your modifications, or
252 |       for any such Derivative Works as a whole, provided Your use,
253 |       reproduction, and distribution of the Work otherwise complies with
254 |       the conditions stated in this License.
255 | 
256 |    5. Submission of Contributions. Unless You explicitly state otherwise,
257 |       any Contribution intentionally submitted for inclusion in the Work
258 |       by You to the Licensor shall be under the terms and conditions of
259 |       this License, without any additional terms or conditions.
260 |       Notwithstanding the above, nothing herein shall supersede or modify
261 |       the terms of any separate license agreement you may have executed
262 |       with Licensor regarding such Contributions.
263 | 
264 |    6. Trademarks. This License does not grant permission to use the trade
265 |       names, trademarks, service marks, or product names of the Licensor,
266 |       except as required for reasonable and customary use in describing the
267 |       origin of the Work and reproducing the content of the NOTICE file.
268 | 
269 |    7. Disclaimer of Warranty. Unless required by applicable law or
270 |       agreed to in writing, Licensor provides the Work (and each
271 |       Contributor provides its Contributions) on an "AS IS" BASIS,
272 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
273 |       implied, including, without limitation, any warranties or conditions
274 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
275 |       PARTICULAR PURPOSE. You are solely responsible for determining the
276 |       appropriateness of using or redistributing the Work and assume any
277 |       risks associated with Your exercise of permissions under this License.
278 | 
279 |    8. Limitation of Liability. In no event and under no legal theory,
280 |       whether in tort (including negligence), contract, or otherwise,
281 |       unless required by applicable law (such as deliberate and grossly
282 |       negligent acts) or agreed to in writing, shall any Contributor be
283 |       liable to You for damages, including any direct, indirect, special,
284 |       incidental, or consequential damages of any character arising as a
285 |       result of this License or out of the use or inability to use the
286 |       Work (including but not limited to damages for loss of goodwill,
287 |       work stoppage, computer failure or malfunction, or any and all
288 |       other commercial damages or losses), even if such Contributor
289 |       has been advised of the possibility of such damages.
290 | 
291 |    9. Accepting Warranty or Additional Liability. While redistributing
292 |       the Work or Derivative Works thereof, You may choose to offer,
293 |       and charge a fee for, acceptance of support, warranty, indemnity,
294 |       or other liability obligations and/or rights consistent with this
295 |       License. However, in accepting such obligations, You may act only
296 |       on Your own behalf and on Your sole responsibility, not on behalf
297 |       of any other Contributor, and only if You agree to indemnify,
298 |       defend, and hold each Contributor harmless for any liability
299 |       incurred by, or claims asserted against, such Contributor by reason
300 |       of your accepting any such warranty or additional liability.
301 | 
302 |    END OF TERMS AND CONDITIONS
303 | 
304 | ------------------------------------------------------------
305 | 
306 | ** jq - https://github.com/stedolan/jq
307 | 
308 | jq is copyright (C) 2012 Stephen Dolan
309 | 
310 | Permission is hereby granted, free of charge, to any person obtaining
311 | a copy of this software and associated documentation files (the
312 | "Software"), to deal in the Software without restriction, including
313 | without limitation the rights to use, copy, modify, merge, publish,
314 | distribute, sublicense, and/or sell copies of the Software, and to
315 | permit persons to whom the Software is furnished to do so, subject to
316 | the following conditions:
317 | 
318 | The above copyright notice and this permission notice shall be
319 | included in all copies or substantial portions of the Software.
320 | 
321 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
322 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
323 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
324 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
325 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
326 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
327 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
328 | 
329 | 
330 | 
331 | jq's documentation (everything found under the docs/ subdirectory in
332 | the source tree) is licensed under the Creative Commons CC BY 3.0
333 | license, which can be found at:
334 | 
335 |          https://creativecommons.org/licenses/by/3.0/
336 | 
337 | The documentation website includes a copy of Twitter's Boostrap and
338 | relies on Bonsai, Liquid templates and various other projects, look
339 | them up for detailed licensing conditions.
340 | 
341 | 
342 | 
343 | jq incorporates David M. Gay's dtoa.c and g_fmt.c, which bear the
344 | following notices:
345 | 
346 | dtoa.c:
347 | The author of this software is David M. Gay.
348 | 
349 | Copyright (c) 1991, 2000, 2001 by Lucent Technologies.
350 | 
351 | Permission to use, copy, modify, and distribute this software for any
352 | purpose without fee is hereby granted, provided that this entire notice
353 | is included in all copies of any software which is or includes a copy
354 | or modification of this software and in all copies of the supporting
355 | documentation for such software.
356 | 
357 | THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
358 | WARRANTY.  IN PARTICULAR, NEITHER THE AUTHOR NOR LUCENT MAKES ANY
359 | REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
360 | OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
361 | 
362 | g_fmt.c:
363 | The author of this software is David M. Gay.
364 | 
365 | Copyright (c) 1991, 1996 by Lucent Technologies.
366 | 
367 | Permission to use, copy, modify, and distribute this software for any
368 | purpose without fee is hereby granted, provided that this entire notice
369 | is included in all copies of any software which is or includes a copy
370 | or modification of this software and in all copies of the supporting
371 | documentation for such software.
372 | 
373 | THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
374 | WARRANTY.  IN PARTICULAR, NEITHER THE AUTHOR NOR LUCENT MAKES ANY
375 | REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
376 | OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
377 | 
378 | 
379 | 
380 | jq uses parts of the open source C library "decNumber", which is distribured
381 | under the following license:
382 | 
383 | 
384 | ICU License - ICU 1.8.1 and later
385 | 
386 | COPYRIGHT AND PERMISSION NOTICE
387 | 
388 | Copyright (c) 1995-2005 International Business Machines Corporation and others
389 | All rights reserved.
390 | 
391 | Permission is hereby granted, free of charge, to any person obtaining a
392 | copy of this software and associated documentation files (the
393 | "Software"), to deal in the Software without restriction, including
394 | without limitation the rights to use, copy, modify, merge, publish,
395 | distribute, and/or sell copies of the Software, and to permit persons
396 | to whom the Software is furnished to do so, provided that the above
397 | copyright notice(s) and this permission notice appear in all copies of
398 | the Software and that both the above copyright notice(s) and this
399 | permission notice appear in supporting documentation.
400 | 
401 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
402 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
403 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
404 | OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
405 | HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
406 | INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
407 | FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
408 | NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
409 | WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
410 | 
411 | Except as contained in this notice, the name of a copyright holder
412 | shall not be used in advertising or otherwise to promote the sale, use
413 | or other dealings in this Software without prior written authorization
414 | of the copyright holder.
415 | 
416 | Portions Copyright (c) 2016 Kungliga Tekniska Högskolan
417 | (Royal Institute of Technology, Stockholm, Sweden).
418 | All rights reserved.
419 | 
420 | Redistribution and use in source and binary forms, with or without
421 | modification, are permitted provided that the following conditions
422 | are met:
423 | 
424 | 1. Redistributions of source code must retain the above copyright
425 |    notice, this list of conditions and the following disclaimer.
426 | 
427 | 2. Redistributions in binary form must reproduce the above copyright
428 |    notice, this list of conditions and the following disclaimer in the
429 |    documentation and/or other materials provided with the distribution.
430 | 
431 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
432 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
433 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
434 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
435 | COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
436 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
437 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
438 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
439 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
440 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
441 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
442 | OF THE POSSIBILITY OF SUCH DAMAGE.
443 | 
444 | 


--------------------------------------------------------------------------------
/img/aws-console-cloud9-link.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/aws-console-cloud9-link.png


--------------------------------------------------------------------------------
/img/aws-console-my-account.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/aws-console-my-account.png


--------------------------------------------------------------------------------
/img/aws-console-signin-iam-user.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/aws-console-signin-iam-user.png


--------------------------------------------------------------------------------
/img/aws-console-signin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/aws-console-signin.png


--------------------------------------------------------------------------------
/img/cloud9-configure-settings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-configure-settings.png


--------------------------------------------------------------------------------
/img/cloud9-credentials-dialog.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-credentials-dialog.png


--------------------------------------------------------------------------------
/img/cloud9-credentials-disable.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-credentials-disable.png


--------------------------------------------------------------------------------
/img/cloud9-ide-manage-ec2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-ide-manage-ec2.png


--------------------------------------------------------------------------------
/img/cloud9-instance-storage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-instance-storage.png


--------------------------------------------------------------------------------
/img/cloud9-landing-page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-landing-page.png


--------------------------------------------------------------------------------
/img/cloud9-managed-credentials.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-managed-credentials.png


--------------------------------------------------------------------------------
/img/cloud9-modify-volume.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-modify-volume.png


--------------------------------------------------------------------------------
/img/cloud9-name-environment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-name-environment.png


--------------------------------------------------------------------------------
/img/cloud9-reboot-instance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-reboot-instance.png


--------------------------------------------------------------------------------
/img/cloud9-resized-volume.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-resized-volume.png


--------------------------------------------------------------------------------
/img/cloud9-volume-actions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-volume-actions.png


--------------------------------------------------------------------------------
/img/cloud9-volume-optimizing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-volume-optimizing.png


--------------------------------------------------------------------------------
/img/iam-add-group.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/iam-add-group.png


--------------------------------------------------------------------------------
/img/iam-add-user-access-key.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/iam-add-user-access-key.png


--------------------------------------------------------------------------------
/img/iam-add-user-admins-group.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/iam-add-user-admins-group.png


--------------------------------------------------------------------------------
/img/iam-add-user.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/iam-add-user.png


--------------------------------------------------------------------------------
/img/iam-create-group.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/iam-create-group.png


--------------------------------------------------------------------------------
/img/step-1-create-cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/step-1-create-cluster.png


--------------------------------------------------------------------------------
/img/step-2-create-volume.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/step-2-create-volume.png


--------------------------------------------------------------------------------
/img/step-3-build-container.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/step-3-build-container.png


--------------------------------------------------------------------------------
/img/step-4-get-data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/step-4-get-data.png


--------------------------------------------------------------------------------
/img/step-5-train-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/step-5-train-model.png


--------------------------------------------------------------------------------
/img/step-6-test-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/step-6-test-model.png


--------------------------------------------------------------------------------
/img/step-7-cleanup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/step-7-cleanup.png


--------------------------------------------------------------------------------
/img/workshop-architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/workshop-architecture.png


--------------------------------------------------------------------------------