├── .env ├── .gitignore ├── .util ├── cloud9.cfn └── resize.sh ├── 1-create-cluster ├── .gitignore ├── 1-1-configure.sh ├── 1-2-install-tools.sh ├── 1-3-create-cluster.sh ├── 1-4-deploy-packages.sh ├── eks.yaml.template ├── etcd │ ├── deploy.sh │ ├── etcd-deployment.yaml │ └── remove.sh └── kubeflow-training-operator │ ├── clusterrole-hpa-access.yaml │ ├── clusterrolebinding-training-operator-hpa-access.yaml │ ├── deploy.sh │ └── remove.sh ├── 2-create-volume ├── .gitignore ├── 2-1-create-efs.sh ├── 2-2-create-pvc.sh ├── efs-pv.yaml.template ├── efs-pvc.yaml └── efs-sc.yaml.template ├── 3-build-container ├── 3-1-build.sh ├── 3-2-push.sh ├── Dockerfile-cpu ├── Dockerfile-gpu ├── cifar10-model-test.py ├── cifar10-model-train.py ├── cnn_model.py ├── data-prep.sh └── utils.py ├── 4-get-data ├── .gitignore ├── 4-1-get-data.sh ├── 4-2-show-status.sh ├── 4-3-show-log.sh └── efs-data-copy.yaml.template ├── 5-train-model ├── .gitignore ├── 5-1-generate-pytorchjob.sh ├── 5-2-launch-pytorchjob.sh ├── 5-3-show-status.sh ├── 5-4-show-utilization.sh ├── 5-5-show-logs.sh ├── 5-6-delete-pytorchjob.sh ├── cleanup.yaml.template └── train.yaml.template ├── 6-test-model ├── .gitignore ├── 6-1-generate-job.sh ├── 6-2-launch-job.sh ├── 6-3-show-status.sh ├── 6-4-show-log.sh ├── 6-5-delete-job.sh └── test.yaml.template ├── 7-cleanup ├── 7-1-delete-efs.sh └── 7-2-delete-cluster.sh ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SETUP.md ├── THIRD-PARTY-LICENSES └── img ├── aws-console-cloud9-link.png ├── aws-console-my-account.png ├── aws-console-signin-iam-user.png ├── aws-console-signin.png ├── cloud9-configure-settings.png ├── cloud9-credentials-dialog.png ├── cloud9-credentials-disable.png ├── cloud9-ide-manage-ec2.png ├── cloud9-instance-storage.png ├── cloud9-landing-page.png ├── cloud9-managed-credentials.png ├── cloud9-modify-volume.png ├── cloud9-name-environment.png ├── cloud9-reboot-instance.png ├── cloud9-resized-volume.png ├── cloud9-volume-actions.png ├── cloud9-volume-optimizing.png ├── iam-add-group.png ├── iam-add-user-access-key.png ├── iam-add-user-admins-group.png ├── iam-add-user.png ├── iam-create-group.png ├── step-1-create-cluster.png ├── step-2-create-volume.png ├── step-3-build-container.png ├── step-4-get-data.png ├── step-5-train-model.png ├── step-6-test-model.png ├── step-7-cleanup.png └── workshop-architecture.png /.env: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export AWS_PROFILE=workshop 4 | export CLUSTER_NAME=do-eks 5 | export REGION=us-west-2 6 | export AZ1=us-west-2a 7 | export AZ2=us-west-2b 8 | export NODE_TYPE=c5.4xlarge 9 | export NODE_COUNT=2 10 | # PROCESSOR - target processor for training, PROCESSOR=cpu(default)|gpu 11 | export PROCESSOR=cpu 12 | # CPU_LIMIT - number of CPUs per node to use 13 | export CPU_LIMIT=15 14 | # GPU_LIMIT - number of GPUs per node to use. Must be 0 if PROCESSOR=cpu 15 | export GPU_LIMIT=0 16 | export ACCOUNT=$(aws sts get-caller-identity --query Account --output text) 17 | export REGISTRY=${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com/ 18 | export IMAGE=pytorch-${PROCESSOR} 19 | export TAG=:latest 20 | export MOUNT_PATH=/efs-shared 21 | export EPOCHS=10 22 | export BATCH_SIZE=128 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /.util/cloud9.cfn: -------------------------------------------------------------------------------- 1 | Parameters: 2 | LabName: 3 | Type: String 4 | AllowedPattern: ".+" 5 | Default: "Workshop" 6 | VolSizeGB: 7 | Type: String 8 | AllowedPattern: "[0-9]+" 9 | Default: "100" 10 | 11 | Resources: 12 | Cloud9Lab: 13 | Type: AWS::Cloud9::EnvironmentEC2 14 | Properties: 15 | AutomaticStopTimeMinutes: 1440 16 | ImageId: amazonlinux-2-x86_64 17 | InstanceType: "c5.4xlarge" 18 | Name: !Ref LabName 19 | OwnerArn: !Sub 'arn:aws:sts::${AWS::AccountId}:assumed-role/TeamRole/MasterKey' 20 | 21 | Cloud9Role: 22 | Type: AWS::IAM::Role 23 | Properties: 24 | AssumeRolePolicyDocument: 25 | Version: "2012-10-17" 26 | Statement: 27 | - Effect: Allow 28 | Principal: 29 | Service: 30 | - ec2.amazonaws.com 31 | Action: 32 | - 'sts:AssumeRole' 33 | ManagedPolicyArns: 34 | - arn:aws:iam::aws:policy/AdministratorAccess 35 | MaxSessionDuration: 28800 36 | Path: / 37 | RoleName: 'workshop-admin' 38 | 39 | Cloud9InstanceProfile: 40 | Type: 'AWS::IAM::InstanceProfile' 41 | Properties: 42 | Path: / 43 | Roles: 44 | - !Ref Cloud9Role 45 | 46 | LambdaRole: 47 | Type: AWS::IAM::Role 48 | Properties: 49 | AssumeRolePolicyDocument: 50 | Version: "2012-10-17" 51 | Statement: 52 | - Effect: Allow 53 | Principal: 54 | Service: 55 | - lambda.amazonaws.com 56 | Action: 57 | - 'sts:AssumeRole' 58 | ManagedPolicyArns: 59 | - arn:aws:iam::aws:policy/AmazonEC2FullAccess 60 | - arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole 61 | - arn:aws:iam::aws:policy/CloudWatchLogsFullAccess 62 | - arn:aws:iam::aws:policy/IAMFullAccess 63 | MaxSessionDuration: 3600 64 | Path: / 65 | RoleName: 'hpcworkshop-lambda' 66 | 67 | 68 | LambdaCloud9: 69 | DependsOn: Cloud9Lab 70 | Type: AWS::Lambda::Function 71 | Properties: 72 | Environment: 73 | Variables: 74 | LAB_NAME: !Ref LabName 75 | VOL_SIZE_GB: !Ref VolSizeGB 76 | Architectures: 77 | - 'x86_64' 78 | Code: 79 | ZipFile: | 80 | #!/usr/bin/env python3 81 | 82 | import boto3 83 | import cfnresponse 84 | import os 85 | import json 86 | import time 87 | 88 | client = boto3.Session().client('ec2') 89 | 90 | def lambda_handler(event, context): 91 | responseValue = event['ResourceProperties']['ProfileArn'] 92 | responseData = {} 93 | responseData['Data'] = responseValue 94 | cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData, "CustomResourcePhysicalID") 95 | main(responseValue) 96 | return {'statusCode': 200, 'body': json.dumps('Cloud9')} 97 | 98 | def get_modification_state(volume_id): 99 | resp = client.describe_volumes_modifications( 100 | VolumeIds=[ 101 | volume_id 102 | ] 103 | ) 104 | return resp['VolumesModifications'][0]['ModificationState'] 105 | 106 | def main(instance_profile_arn): 107 | response = client.describe_instances(Filters=[ 108 | { 109 | 'Name': 'tag:Name', 110 | 'Values': [ 111 | 'aws-cloud9-' + os.environ.get('LAB_NAME','Workshop') + '-*', 112 | ] 113 | }, 114 | { 115 | 'Name': 'instance-state-name', 116 | 'Values': ["pending", "running"] 117 | } 118 | ]) 119 | 120 | ec2 = boto3.resource('ec2') 121 | 122 | instance_id = response['Reservations'][0]['Instances'][0]['InstanceId'] 123 | volume_id = response['Reservations'][0]['Instances'][0]['BlockDeviceMappings'][0]['Ebs']['VolumeId'] 124 | IamInstanceProfile = {'Name': instance_profile_arn} 125 | instance = ec2.Instance(instance_id) 126 | instance.wait_until_running() 127 | response = client.describe_iam_instance_profile_associations(Filters=[ 128 | { 129 | 'Name': 'instance-id', 130 | 'Values': [ 131 | instance_id, 132 | ] 133 | }, 134 | ]) 135 | if len(response['IamInstanceProfileAssociations']) > 0: 136 | instance_profile_association_id = response[ 137 | 'IamInstanceProfileAssociations'][0]['AssociationId'] 138 | response = client.replace_iam_instance_profile_association( 139 | IamInstanceProfile=IamInstanceProfile, 140 | AssociationId=instance_profile_association_id) 141 | else: 142 | response = client.associate_iam_instance_profile( 143 | IamInstanceProfile=IamInstanceProfile, InstanceId=instance_id) 144 | 145 | # Modify volume size 146 | volume_size_str = os.environ.get('VOL_SIZE_GB', '100') 147 | volume_size = int(volume_size_str) 148 | modify_volume_response = client.modify_volume(VolumeId=volume_id,Size=volume_size) 149 | while True: 150 | state = get_modification_state(volume_id) 151 | if state == 'completed' or state == None or state == 'optimizing': 152 | break 153 | elif state == 'failed': 154 | raise Exception('Failed to modify volume size') 155 | else: 156 | time.sleep(15) 157 | 158 | # Reboot ec2 instance so the new volume size takes effect 159 | reboot_instance_reponse = client.reboot_instances( 160 | InstanceIds=[instance_id] 161 | ) 162 | 163 | Handler: index.lambda_handler 164 | MemorySize: 128 165 | PackageType: 'Zip' 166 | Role: !GetAtt LambdaRole.Arn 167 | Runtime: 'python3.9' 168 | Timeout: 300 169 | 170 | Primerinvoke: 171 | Type: AWS::CloudFormation::CustomResource 172 | DependsOn: LambdaCloud9 173 | Version: "1.0" 174 | Properties: 175 | ServiceToken: !GetAtt LambdaCloud9.Arn 176 | ProfileArn: !Ref Cloud9InstanceProfile 177 | 178 | Outputs: 179 | Cloud9URl: 180 | Description: URL of your AWS Cloud9 Instance 181 | Value: !Join ['', ['https://', !Ref 'AWS::Region','.console.aws.amazon.com/cloud9/ide/', !Ref Cloud9Lab ] ] 182 | 183 | 184 | -------------------------------------------------------------------------------- /.util/resize.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Specify the desired volume size in GiB as a command line argument. If not specified, default to 20 GiB. 4 | SIZE=${1:-20} 5 | 6 | # Get the ID of the environment host Amazon EC2 instance. 7 | INSTANCEID=$(curl http://169.254.169.254/latest/meta-data/instance-id) 8 | REGION=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone | sed 's/\(.*\)[a-z]/\1/') 9 | 10 | # Get the ID of the Amazon EBS volume associated with the instance. 11 | VOLUMEID=$(aws ec2 describe-instances \ 12 | --instance-id $INSTANCEID \ 13 | --query "Reservations[0].Instances[0].BlockDeviceMappings[0].Ebs.VolumeId" \ 14 | --output text \ 15 | --region $REGION) 16 | 17 | # Resize the EBS volume. 18 | aws ec2 modify-volume --volume-id $VOLUMEID --size $SIZE 19 | 20 | # Wait for the resize to finish. 21 | while [ \ 22 | "$(aws ec2 describe-volumes-modifications \ 23 | --volume-id $VOLUMEID \ 24 | --filters Name=modification-state,Values="optimizing","completed" \ 25 | --query "length(VolumesModifications)"\ 26 | --output text)" != "1" ]; do 27 | sleep 1 28 | done 29 | 30 | #Check if we're on an NVMe filesystem 31 | if [[ -e "/dev/xvda" && $(readlink -f /dev/xvda) = "/dev/xvda" ]] 32 | then 33 | # Rewrite the partition table so that the partition takes up all the space that it can. 34 | sudo growpart /dev/xvda 1 35 | 36 | # Expand the size of the file system. 37 | # Check if we're on AL2 38 | STR=$(cat /etc/os-release) 39 | SUB="VERSION_ID=\"2\"" 40 | if [[ "$STR" == *"$SUB"* ]] 41 | then 42 | sudo xfs_growfs -d / 43 | else 44 | sudo resize2fs /dev/xvda1 45 | fi 46 | 47 | else 48 | # Rewrite the partition table so that the partition takes up all the space that it can. 49 | sudo growpart /dev/nvme0n1 1 50 | 51 | # Expand the size of the file system. 52 | # Check if we're on AL2 53 | STR=$(cat /etc/os-release) 54 | SUB="VERSION_ID=\"2\"" 55 | if [[ "$STR" == *"$SUB"* ]] 56 | then 57 | sudo xfs_growfs -d / 58 | else 59 | sudo resize2fs /dev/nvme0n1p1 60 | fi 61 | fi 62 | -------------------------------------------------------------------------------- /1-create-cluster/.gitignore: -------------------------------------------------------------------------------- 1 | eks.yaml 2 | -------------------------------------------------------------------------------- /1-create-cluster/1-1-configure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ../.env 4 | 5 | echo "" 6 | echo "Configuring AWS client ..." 7 | aws configure --profile $AWS_PROFILE 8 | 9 | echo "" 10 | echo "Generating cluster configuration eks.yaml ..." 11 | cat eks.yaml.template | envsubst > eks.yaml 12 | 13 | -------------------------------------------------------------------------------- /1-create-cluster/1-2-install-tools.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install tools 4 | 5 | # eksctl 6 | echo "" 7 | echo "Installing eksctl ..." 8 | curl --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp 9 | 10 | sudo mv /tmp/eksctl /usr/local/bin 11 | eksctl version 12 | 13 | # kubectl 14 | echo "" 15 | echo "Installing kubectl ..." 16 | curl -O https://s3.us-west-2.amazonaws.com/amazon-eks/1.26.2/2023-03-17/bin/linux/amd64/kubectl 17 | chmod +x ./kubectl 18 | sudo mv ./kubectl /usr/local/bin 19 | kubectl version --client 20 | 21 | # kubectx 22 | echo "" 23 | echo "Installing kubectx ..." 24 | pushd /tmp 25 | git clone https://github.com/ahmetb/kubectx 26 | sudo mv kubectx /opt 27 | sudo ln -s /opt/kubectx/kubectx /usr/local/bin/kubectx 28 | sudo ln -s /opt/kubectx/kubens /usr/local/bin/kubens 29 | popd 30 | 31 | # kubetail 32 | echo "" 33 | echo "Installing kubetail ..." 34 | curl -o /tmp/kubetail https://raw.githubusercontent.com/johanhaleby/kubetail/master/kubetail 35 | chmod +x /tmp/kubetail 36 | sudo mv /tmp/kubetail /usr/local/bin/kubetail 37 | 38 | # kubeshell 39 | echo "" 40 | echo "Installing kubeshell ..." 41 | curl -LO https://github.com/kvaps/kubectl-node-shell/raw/master/kubectl-node_shell 42 | chmod +x ./kubectl-node_shell 43 | sudo mv ./kubectl-node_shell /usr/local/bin/kubectl-node_shell 44 | 45 | # jq 46 | echo "" 47 | echo "Installing jq ..." 48 | sudo yum install -y jq 49 | 50 | # yq 51 | echo "" 52 | echo "Installing yq ..." 53 | pip3 install yq 54 | 55 | # Set up aliases 56 | echo "" 57 | echo "Setting up aliases ..." 58 | cat << EOF >> ~/.bashrc 59 | alias ll='ls -alh --color=auto' 60 | alias k='kubectl' 61 | alias kc='kubectx' 62 | alias kn='kubens' 63 | alias kt='kubetail' 64 | alias ks='kubectl node-shell' 65 | EOF 66 | 67 | echo "" 68 | echo "Done setting up tools." 69 | echo "" 70 | 71 | 72 | -------------------------------------------------------------------------------- /1-create-cluster/1-3-create-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ../.env 4 | 5 | echo "" 6 | echo "Creating EKS cluster ..." 7 | echo "" 8 | echo "... using configuration from ./eks.yaml ..." 9 | echo "" 10 | cat ./eks.yaml 11 | echo "" 12 | date 13 | CMD="eksctl create cluster -f ./eks.yaml" 14 | echo "${CMD}" 15 | ${CMD} 16 | echo "" 17 | date 18 | echo "Done creating EKS cluster" 19 | 20 | echo "" 21 | echo "Updating kubeconfig ..." 22 | aws eks update-kubeconfig --name $CLUSTER_NAME 23 | echo "" 24 | 25 | echo "" 26 | echo "Displaying cluster nodes ..." 27 | kubectl get nodes 28 | 29 | -------------------------------------------------------------------------------- /1-create-cluster/1-4-deploy-packages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Deploy Kuberntes Packages 4 | 5 | # Metrics server 6 | echo "" 7 | echo "Deploying Kubernetes Metrics Server ..." 8 | kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml 9 | 10 | # Kubeflow Training Operator 11 | echo "" 12 | echo "Deploying Kubeflow Training Operator ..." 13 | pushd ./kubeflow-training-operator 14 | ./deploy.sh 15 | popd 16 | 17 | # Etcd 18 | echo "" 19 | echo "Deploying etcd ..." 20 | kubectl apply -f etcd/etcd-deployment.yaml 21 | 22 | -------------------------------------------------------------------------------- /1-create-cluster/eks.yaml.template: -------------------------------------------------------------------------------- 1 | apiVersion: eksctl.io/v1alpha5 2 | kind: ClusterConfig 3 | 4 | metadata: 5 | name: ${CLUSTER_NAME} 6 | version: "1.26" 7 | region: ${REGION} 8 | 9 | availabilityZones: 10 | - ${AZ1} 11 | - ${AZ2} 12 | 13 | iam: 14 | withOIDC: true 15 | 16 | managedNodeGroups: 17 | - name: wks-node 18 | instanceType: ${NODE_TYPE} 19 | instancePrefix: workshop 20 | privateNetworking: true 21 | availabilityZones: ["${AZ1}","${AZ2}"] 22 | efaEnabled: false 23 | minSize: 0 24 | desiredCapacity: ${NODE_COUNT} 25 | maxSize: 10 26 | volumeSize: 900 27 | iam: 28 | withAddonPolicies: 29 | cloudWatch: true 30 | autoScaler: true 31 | ebs: true 32 | -------------------------------------------------------------------------------- /1-create-cluster/etcd/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kubectl apply -f ./etcd-deployment.yaml 4 | 5 | -------------------------------------------------------------------------------- /1-create-cluster/etcd/etcd-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: etcd-service 5 | #namespace: elastic-job 6 | spec: 7 | ports: 8 | - name: etcd-client-port 9 | port: 2379 10 | protocol: TCP 11 | targetPort: 2379 12 | selector: 13 | app: etcd 14 | 15 | --- 16 | apiVersion: apps/v1 17 | kind: Deployment 18 | metadata: 19 | labels: 20 | app: etcd 21 | name: etcd 22 | #namespace: elastic-job 23 | spec: 24 | replicas: 1 25 | selector: 26 | matchLabels: 27 | app: etcd 28 | template: 29 | metadata: 30 | labels: 31 | app: etcd 32 | spec: 33 | containers: 34 | - name: etcd 35 | command: ["/usr/local/bin/etcd"] 36 | args: 37 | - "--data-dir" 38 | - "/var/lib/etcd" 39 | - "--enable-v2" 40 | - "--listen-client-urls" 41 | - "http://0.0.0.0:2379" 42 | - "--advertise-client-urls" 43 | - "http://0.0.0.0:2379" 44 | - "--initial-cluster-state" 45 | - "new" 46 | image: quay.io/coreos/etcd:latest 47 | ports: 48 | - containerPort: 2379 49 | name: client 50 | protocol: TCP 51 | - containerPort: 2380 52 | name: server 53 | protocol: TCP 54 | restartPolicy: Always 55 | -------------------------------------------------------------------------------- /1-create-cluster/etcd/remove.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kubectl delete -f ./etcd-deployment.yaml 4 | 5 | -------------------------------------------------------------------------------- /1-create-cluster/kubeflow-training-operator/clusterrole-hpa-access.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | # "namespace" omitted since ClusterRoles are not namespaced 5 | name: hpa-access 6 | rules: 7 | - apiGroups: ["autoscaling"] 8 | # 9 | # at the HTTP level, the name of the resource for accessing Secret 10 | # objects is "secrets" 11 | resources: ["horizontalpodautoscalers"] 12 | verbs: ["get", "watch", "list", "create", "delete"] 13 | -------------------------------------------------------------------------------- /1-create-cluster/kubeflow-training-operator/clusterrolebinding-training-operator-hpa-access.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | # This cluster role binding allows anyone in the "manager" group to read secrets in any namespace. 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: training-operator-hpa-access 6 | subjects: 7 | - kind: ServiceAccount 8 | name: training-operator # Name is case sensitive 9 | namespace: kubeflow 10 | apiGroup: "" 11 | roleRef: 12 | kind: ClusterRole 13 | name: hpa-access 14 | apiGroup: rbac.authorization.k8s.io 15 | -------------------------------------------------------------------------------- /1-create-cluster/kubeflow-training-operator/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Deploy Kubeflow training operator 4 | 5 | kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.7.0" 6 | 7 | # Configure RBAC resources 8 | 9 | kubectl apply -f ./clusterrole-hpa-access.yaml 10 | 11 | kubectl apply -f ./clusterrolebinding-training-operator-hpa-access.yaml 12 | 13 | -------------------------------------------------------------------------------- /1-create-cluster/kubeflow-training-operator/remove.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Remove RBAC resources 4 | 5 | kubectl delete -f ./clusterrolebinding-training-operator-hpa-access.yaml 6 | 7 | kubectl delete -f ./clusterrole-hpa-access.yaml 8 | 9 | # Remove Kubeflow training operator 10 | 11 | kubectl delete -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.5.0" 12 | 13 | -------------------------------------------------------------------------------- /2-create-volume/.gitignore: -------------------------------------------------------------------------------- 1 | efs-pv.yaml 2 | efs-sc.yaml 3 | -------------------------------------------------------------------------------- /2-create-volume/2-1-create-efs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ../.env 4 | 5 | # This script follows the following eks workshop 6 | # https://www.eksworkshop.com/beginner/190_efs/launching-efs/ 7 | 8 | # Assume the cluster name is the first cluster in the list 9 | echo "" 10 | echo 'Cluster name: ' $CLUSTER_NAME 11 | VPC_ID=$(aws eks describe-cluster --name $CLUSTER_NAME --query "cluster.resourcesVpcConfig.vpcId" --output text) 12 | CIDR_BLOCK=$(aws ec2 describe-vpcs --vpc-ids $VPC_ID --query "Vpcs[].CidrBlock" --output text) 13 | 14 | echo 'VPC: ' $VPC_ID 15 | echo 'CIDR: ' $CIDR_BLOCK 16 | 17 | echo "" 18 | echo "Creating security group ..." 19 | MOUNT_TARGET_GROUP_NAME="eks-efs-group-${CLUSTER_NAME}" 20 | MOUNT_TARGET_GROUP_DESC="NFS access to EFS from EKS worker nodes" 21 | aws ec2 create-security-group --group-name $MOUNT_TARGET_GROUP_NAME --description "$MOUNT_TARGET_GROUP_DESC" --vpc-id $VPC_ID 22 | sleep 5 23 | 24 | MOUNT_TARGET_GROUP_ID=$(aws ec2 describe-security-groups --filter Name=vpc-id,Values=$VPC_ID Name=group-name,Values=$MOUNT_TARGET_GROUP_NAME --query 'SecurityGroups[*].[GroupId]' --output text) 25 | echo $MOUNT_TARGET_GROUP_NAME $MOUNT_TARGET_GROUP_DESC $MOUNT_TARGET_GROUP_ID 26 | 27 | aws ec2 authorize-security-group-ingress --group-id $MOUNT_TARGET_GROUP_ID --protocol tcp --port 2049 --cidr $CIDR_BLOCK 28 | sleep 2 29 | 30 | echo "" 31 | echo "Creating EFS volume ..." 32 | FILE_SYSTEM_ID=$(aws efs create-file-system | jq --raw-output '.FileSystemId') 33 | echo $FILE_SYSTEM_ID 34 | sleep 10 35 | 36 | TAG1=tag:alpha.eksctl.io/cluster-name 37 | TAG2=tag:kubernetes.io/role/elb 38 | SUBNETS=$(aws ec2 describe-subnets --filter Name=$TAG1,Values=$CLUSTER_NAME Name=$TAG2,Values=1 --query 'Subnets[*].SubnetId' --output text) 39 | echo $SUBNETS 40 | 41 | for subnet in ${SUBNETS} 42 | do 43 | echo "Creating mount target in subnet " $subnet " , security group " $MOUNT_TARGET_GROUP_ID " ,for efs id " $FILE_SYSTEM_ID 44 | aws efs create-mount-target --file-system-id $FILE_SYSTEM_ID --subnet-id $subnet --security-groups $MOUNT_TARGET_GROUP_ID 45 | sleep 2 46 | done 47 | sleep 30 48 | 49 | echo "" 50 | echo "Mount points state ..." 51 | aws efs describe-mount-targets --file-system-id $FILE_SYSTEM_ID | jq --raw-output '.MountTargets[].LifeCycleState' 52 | 53 | echo "" 54 | echo "Done." 55 | echo "" 56 | -------------------------------------------------------------------------------- /2-create-volume/2-2-create-pvc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ../.env 4 | 5 | # This script mostly follows this eks workshop 6 | # https://www.eksworkshop.com/beginner/190_efs/launching-efs/ 7 | 8 | echo "" 9 | echo "Checking EFS File System ..." 10 | 11 | # if the pvc already exists, exit 12 | PV_EXISTS=$(kubectl get pv -o json | jq --raw-output '.items[].spec.storageClassName') 13 | for pv in ${PV_EXISTS} 14 | do 15 | if [ "$pv" == "efs-sc" ]; then 16 | echo "Persistant Volume already exists" 17 | kubectl get pv 18 | exit 0 19 | fi 20 | done 21 | 22 | # Assign file system id. Create EFS file system if needed. If more than one filesystem exists, take first one in the list 23 | FILE_SYSTEM_ID=$(aws efs describe-file-systems --query 'FileSystems[*].FileSystemId' --output json | jq -r .[0]) 24 | if [ "$FILE_SYSTEM_ID" == "null" ]; then 25 | echo "" 26 | echo "No EFS file system found. Setting up new EFS File System ..." 27 | ./2-1-create-efs.sh 28 | FILE_SYSTEM_ID=$(aws efs describe-file-systems --query 'FileSystems[*].FileSystemId' --output json | jq -r .[0]) 29 | fi 30 | echo 'EFS volume id' $FILE_SYSTEM_ID 31 | 32 | echo "" 33 | echo "Deploying EFS CSI Driver ..." 34 | kubectl apply -k "github.com/kubernetes-sigs/aws-efs-csi-driver/deploy/kubernetes/overlays/stable/?ref=release-1.3" 35 | sleep 5 36 | kubectl get pods -n kube-system | grep efs 37 | 38 | echo "" 39 | echo "Generating efs-sc.yaml ..." 40 | cat efs-sc.yaml.template | sed -e "s/EFS_VOLUME_ID/$FILE_SYSTEM_ID/g" > efs-sc.yaml 41 | echo "" 42 | echo "Applying efs-sc.yaml ..." 43 | kubectl apply -f efs-sc.yaml 44 | kubectl get sc 45 | 46 | echo "" 47 | echo "Generating efs-pv.yaml ..." 48 | cat efs-pv.yaml.template | sed -e "s/EFS_VOLUME_ID/$FILE_SYSTEM_ID/g" > efs-pv.yaml 49 | echo "Applying efs-pv.yaml ..." 50 | kubectl apply -f efs-pv.yaml 51 | sleep 10 52 | kubectl get pv 53 | 54 | echo "" 55 | echo "Creating persistent volume claim efs-pvc ..." 56 | kubectl apply -f efs-pvc.yaml 57 | kubectl get pvc 58 | 59 | echo "" 60 | echo "Done." 61 | echo "" 62 | -------------------------------------------------------------------------------- /2-create-volume/efs-pv.yaml.template: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: efs-pv 5 | spec: 6 | capacity: 7 | storage: 5Gi 8 | volumeMode: Filesystem 9 | accessModes: 10 | - ReadWriteMany 11 | persistentVolumeReclaimPolicy: Retain 12 | storageClassName: efs-sc 13 | csi: 14 | driver: efs.csi.aws.com 15 | volumeHandle: EFS_VOLUME_ID 16 | -------------------------------------------------------------------------------- /2-create-volume/efs-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: efs-pvc 5 | spec: 6 | accessModes: 7 | - ReadWriteMany 8 | storageClassName: efs-sc 9 | volumeName: efs-pv 10 | resources: 11 | requests: 12 | storage: 5Gi 13 | -------------------------------------------------------------------------------- /2-create-volume/efs-sc.yaml.template: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | 3 | kind: StorageClass 4 | apiVersion: storage.k8s.io/v1 5 | metadata: 6 | name: efs-sc 7 | provisioner: efs.csi.aws.com 8 | parameters: 9 | provisioningMode: efs-ap 10 | fileSystemId: EFS_VOLUME_ID 11 | directoryPerms: "700" 12 | -------------------------------------------------------------------------------- /3-build-container/3-1-build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ../.env 4 | 5 | # Build Docker image 6 | docker image build -f Dockerfile-${PROCESSOR} -t ${REGISTRY}${IMAGE}${TAG} . 7 | 8 | -------------------------------------------------------------------------------- /3-build-container/3-2-push.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ../.env 4 | 5 | # Create registry if needed 6 | REGISTRY_COUNT=$(aws ecr describe-repositories | grep ${IMAGE} | wc -l) 7 | if [ "$REGISTRY_COUNT" == "0" ]; then 8 | aws ecr create-repository --repository-name ${IMAGE} 9 | fi 10 | 11 | # Login to container registry 12 | echo "Logging in to $REGISTRY ..." 13 | aws ecr get-login-password | docker login --username AWS --password-stdin $REGISTRY 14 | 15 | # Push image to registry 16 | echo "Pushing ${IMAGE}${TAG} to registry ..." 17 | docker push ${REGISTRY}${IMAGE}${TAG} 18 | 19 | -------------------------------------------------------------------------------- /3-build-container/Dockerfile-cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | RUN apt-get update && apt-get install -y wget unzip python3 python3-pip htop 4 | 5 | RUN pip3 install python-etcd 6 | RUN pip3 install torch torchvision --extra-index-url https://download.pytorch.org/whl/cpu 7 | RUN pip3 install tensorboard debugpy 8 | 9 | RUN mkdir -p /workspace/ 10 | ADD cifar10-model-train.py /workspace/ 11 | ADD cifar10-model-test.py /workspace/ 12 | ADD cnn_model.py /workspace/ 13 | ADD utils.py /workspace/ 14 | ADD data-prep.sh /workspace/ 15 | -------------------------------------------------------------------------------- /3-build-container/Dockerfile-gpu: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime 2 | #FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime 3 | 4 | RUN apt-get update && apt-get install -y wget unzip python3 python3-pip htop 5 | 6 | RUN pip3 install python-etcd 7 | #RUN pip3 install torch torchvision --extra-index-url https://download.pytorch.org/whl/gpu 8 | RUN pip3 install tensorboard debugpy 9 | 10 | RUN mkdir -p /workspace/ 11 | ADD cifar10-model-train.py /workspace/ 12 | ADD cifar10-model-test.py /workspace/ 13 | ADD cnn_model.py /workspace/ 14 | ADD utils.py /workspace/ 15 | ADD data-prep.sh /workspace/ 16 | -------------------------------------------------------------------------------- /3-build-container/cifar10-model-test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import torch 5 | from torch.utils.data import DataLoader 6 | 7 | from cnn_model import MyCnnModel # custom cnn model 8 | from utils import * 9 | 10 | parser = argparse.ArgumentParser(description="PyTorch Elastic cifar10 Training") 11 | parser.add_argument("data", metavar="DIR", help="path to dataset") 12 | parser.add_argument('--workers', default=1, type=int, 13 | help='number of data loading workers (default: 1)') 14 | parser.add_argument('--batch-size', default=128, type=int, 15 | help='mini-batch size on each node (default: 128)') 16 | parser.add_argument('--model-file', default='/efs-shared/cifar10_model.pth', type=str, 17 | help='filename with path to save model (default: /efs-shared/cifar10_model.pth') 18 | 19 | 20 | def cifar10_test_dataloader(data_dir, batch_size, num_data_workers): 21 | test_images, test_labels = unpickle(data_dir + 'test_batch') 22 | 23 | # convert numpy arrays to torch TensorDataset 24 | test_dataset = get_tensordataset(test_images, test_labels) 25 | 26 | test_loader = DataLoader( 27 | test_dataset, 28 | batch_size=batch_size, 29 | shuffle=False, 30 | num_workers=num_data_workers, 31 | pin_memory=True, 32 | ) 33 | 34 | return test_loader 35 | 36 | 37 | def main(): 38 | args = parser.parse_args() 39 | print("reading", args.data) 40 | test_loader = cifar10_test_dataloader(args.data, args.batch_size, args.workers) 41 | print('loading model', args.model_file) 42 | model = MyCnnModel() 43 | model.load_state_dict(torch.load(args.model_file)) 44 | 45 | classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') 46 | correct = 0 47 | total = 0 48 | 49 | # prepare to count predictions for each class 50 | correct_pred = {classname: 0 for classname in classes} 51 | total_pred = {classname: 0 for classname in classes} 52 | 53 | # since we're not training, we don't need to calculate the gradients for our outputs 54 | with torch.no_grad(): 55 | for data in test_loader: 56 | images, labels = data 57 | # calculate outputs by running images through the network 58 | outputs = model(images) 59 | _, predictions = torch.max(outputs, 1) 60 | 61 | total += labels.size(0) 62 | correct += (predictions == labels).sum().item() 63 | 64 | # collect the correct predictions for each class 65 | for label, prediction in zip(labels, predictions): 66 | if label == prediction: 67 | correct_pred[classes[label]] += 1 68 | total_pred[classes[label]] += 1 69 | 70 | 71 | print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %') 72 | 73 | for classname, correct_count in correct_pred.items(): 74 | accuracy = 100 * float(correct_count) / total_pred[classname] 75 | print(f'Accuracy for class: {classname:5s} is {accuracy:.1f} %') 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | print('Finished Testing') 81 | -------------------------------------------------------------------------------- /3-build-container/cifar10-model-train.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import timedelta 3 | import argparse 4 | 5 | import torch 6 | from torch.nn import CrossEntropyLoss 7 | from torch.optim import SGD 8 | from torch.utils.data import DataLoader 9 | from torch.nn.parallel import DistributedDataParallel 10 | from torch.distributed.elastic.utils.data import ElasticDistributedSampler 11 | from torch.distributed import init_process_group 12 | from torch.utils.tensorboard import SummaryWriter 13 | 14 | from cnn_model import MyCnnModel # custom cnn model 15 | from utils import * 16 | 17 | parser = argparse.ArgumentParser(description="PyTorch Elastic cifar10 Training") 18 | parser.add_argument("data", help="path to dataset") 19 | parser.add_argument('--workers', default=32, type=int, 20 | help='number of data loading workers (default: 32)') 21 | parser.add_argument('--epochs', default=10, type=int, 22 | help='number of total epochs to run (default: 10)') 23 | parser.add_argument('--batch-size', default=256, type=int, 24 | help='mini-batch size on each node (default: 256)') 25 | parser.add_argument('--learning-rate', default=0.001, type=float, 26 | help='learning rate (default: 0.001') 27 | parser.add_argument('--momentum', default=0.9, type=float, 28 | help='momentum (default: 0.9)') 29 | parser.add_argument('--weight-decay', default=1e-4, type=float, 30 | help='weight decay (default: 1e-4)') 31 | parser.add_argument('--print-freq', default=5, type=int, 32 | help='print frequency (default: 5)') 33 | parser.add_argument('--model-file', default='/efs-shared/cifar10_model.pth', type=str, 34 | help='filename with path to save model (default: /efs-shared/cifar10_model.pth') 35 | parser.add_argument("--checkpoint-file", default="/efs-shared/checkpoint.pth.tar", type=str, 36 | help="checkpoint file path, to load and save to") 37 | 38 | 39 | def cifar10_train_dataloader(data_dir, batch_size, num_data_workers): 40 | files = ['data_batch_'+str(i+1) for i in range(5)] 41 | 42 | train_images = [] 43 | train_labels = [] 44 | for file in files: 45 | images, labels = unpickle(data_dir + file) 46 | train_images.extend(images) 47 | train_labels.extend(labels) 48 | 49 | # convert numpy arrays to torch TensorDataset 50 | train_dataset = get_tensordataset(train_images, train_labels) 51 | 52 | train_sampler = ElasticDistributedSampler(train_dataset) 53 | train_loader = DataLoader( 54 | train_dataset, 55 | batch_size=batch_size, 56 | num_workers=num_data_workers, 57 | pin_memory=True, 58 | sampler=train_sampler, 59 | ) 60 | 61 | return train_loader 62 | 63 | 64 | def initialize_model(lr, momentum, weight_decay): 65 | model = MyCnnModel() 66 | model = DistributedDataParallel(model) 67 | 68 | criterion = CrossEntropyLoss() 69 | optimizer = SGD(model.parameters(), lr, momentum=momentum, weight_decay=weight_decay) 70 | 71 | return model, criterion, optimizer 72 | 73 | 74 | def main(): 75 | print("Main function called ...") 76 | #import debugpy; debugpy.listen(('0.0.0.0',5678)); debugpy.wait_for_client(); breakpoint() 77 | init_process_group(backend="gloo", init_method="env://", timeout=timedelta(seconds=10)) 78 | args = parser.parse_args() 79 | rank = int(os.environ["RANK"]) 80 | modelFile = args.model_file 81 | tensorDir = "%s/runs/"%(os.path.dirname(modelFile)) 82 | writer = SummaryWriter(log_dir=tensorDir) 83 | 84 | print("reading", args.data) 85 | train_loader = cifar10_train_dataloader(args.data, args.batch_size, args.workers) 86 | model, criterion, optimizer = initialize_model(args.learning_rate, args.momentum, args.weight_decay) 87 | 88 | processor = os.getenv("PROCESSOR","cpu") 89 | print("Desired processor type: %s"%processor) 90 | 91 | device_type="cpu" 92 | device=torch.device("cpu") 93 | 94 | if processor == "gpu": 95 | if torch.cuda.is_available(): 96 | device=torch.device("cuda") 97 | device_type="gpu" 98 | model.to(device) 99 | else: 100 | print("torch.cuda.is_available() returned False!") 101 | 102 | print("Running on processor type: %s"%(device_type)) 103 | 104 | start_epoch = 0 105 | # load previously stored checkpoint if it exists. 106 | start_epoch = load_checkpoint(args.checkpoint_file, model, optimizer) 107 | 108 | model.train() 109 | for epoch in range(start_epoch, args.epochs): # loop over the dataset multiple times 110 | 111 | running_loss = 0.0 112 | for i, (inputs, labels) in enumerate(train_loader): 113 | # zero the parameter gradients 114 | optimizer.zero_grad() 115 | 116 | # forward + backward + optimize 117 | if device_type == "gpu": 118 | inputs = inputs.to(device) 119 | labels = labels.to(device) 120 | outputs = model(inputs) 121 | loss = criterion(outputs, labels) 122 | loss.backward() 123 | optimizer.step() 124 | 125 | # print statistics 126 | running_loss += loss.item() 127 | if i % args.print_freq == args.print_freq-1: # print every args.print_freq mini-batches 128 | print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / args.print_freq:.3f}') 129 | writer.add_scalar('Epoch', epoch+1) 130 | writer.add_scalar('Iteration', i+1) 131 | writer.add_scalar('Loss', running_loss / args.print_freq) 132 | running_loss = 0.0 133 | 134 | if rank==0: # Only one pod will save the checkpoint 135 | save_checkpoint(args.checkpoint_file, epoch, model, optimizer) 136 | 137 | if rank==0: # Only one pod will save the final model 138 | print('saving final model:', args.model_file) 139 | torch.save(model.module.state_dict(), args.model_file) 140 | 141 | writer.close() 142 | 143 | if __name__ == "__main__": 144 | main() 145 | print('Finished Training') 146 | -------------------------------------------------------------------------------- /3-build-container/cnn_model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | class MyCnnModel(nn.Module): 4 | def __init__(self): 5 | super().__init__() 6 | self.network = nn.Sequential( 7 | nn.Conv2d(3, 32, 3, padding='same'), 8 | nn.ReLU(), 9 | nn.Conv2d(32, 64, 3, padding='same'), 10 | nn.ReLU(), 11 | nn.MaxPool2d(2, 2), 12 | nn.BatchNorm2d(64), 13 | 14 | nn.Conv2d(64, 128, 3, padding='same'), 15 | nn.ReLU(), 16 | nn.Conv2d(128, 128, 3, padding='same'), 17 | nn.ReLU(), 18 | nn.MaxPool2d(2, 2), 19 | nn.BatchNorm2d(128), 20 | 21 | nn.Conv2d(128, 256, 3, padding='same'), 22 | nn.ReLU(), 23 | nn.Conv2d(256, 256, 3, padding='same'), 24 | nn.ReLU(), 25 | nn.MaxPool2d(2, 2), 26 | nn.BatchNorm2d(256), 27 | 28 | nn.Flatten(), 29 | nn.Linear(256*4*4, 512), 30 | nn.ReLU(), 31 | nn.Linear(512, 10)) 32 | 33 | def forward(self, xb): 34 | return self.network(xb) 35 | -------------------------------------------------------------------------------- /3-build-container/data-prep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | echo "Shared path - ${1}" 5 | 6 | mkdir -p ${1} 7 | cd ${1} 8 | 9 | echo "" 10 | echo "Downloading data ..." 11 | wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz 12 | tar -xzf cifar-10-python.tar.gz 13 | rm cifar-10-python.tar.gz 14 | 15 | echo "" 16 | echo "Setting permissions ..." 17 | chown -R 1000:100 ${1} 18 | 19 | echo "" 20 | echo "Done" 21 | echo "" 22 | 23 | -------------------------------------------------------------------------------- /3-build-container/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | import torch 5 | from torch.utils.data import TensorDataset 6 | 7 | def unpickle(file): 8 | with open(file, 'rb') as fo: 9 | data = pickle.load(fo, encoding='bytes') 10 | return data[b'data'], data[b'labels'] 11 | 12 | def get_tensordataset(images, labels): 13 | images_arr = np.array(images) 14 | images_arr = np.reshape(images_arr, (-1,3,32,32)) 15 | labels_arr = np.array(labels) 16 | images_arr = images_arr/255. # normalize 17 | 18 | tensor_X = torch.tensor(images_arr, dtype=torch.float32) 19 | tensor_y = torch.tensor(labels_arr, dtype=torch.long) 20 | dataset = TensorDataset(tensor_X, tensor_y) 21 | 22 | return dataset 23 | 24 | def save_checkpoint(checkpoint_file, epoch, model, optimizer): 25 | checkpoint_dir = os.path.dirname(checkpoint_file) 26 | os.makedirs(checkpoint_dir, exist_ok=True) 27 | 28 | snapshot = { 29 | "epoch": epoch, 30 | "state_dict": model.state_dict(), 31 | "optimizer": optimizer.state_dict() 32 | } 33 | 34 | torch.save(snapshot, checkpoint_file) 35 | print(f"=> saved checkpoint for epoch {epoch+1} at {checkpoint_file}") 36 | 37 | def load_checkpoint(checkpoint_file, model, optimizer): 38 | if os.path.isfile(checkpoint_file): 39 | print('loading checkpoint file:', checkpoint_file) 40 | snapshot = torch.load(checkpoint_file) 41 | epoch = snapshot["epoch"] + 1 # start from next epoch 42 | model.load_state_dict(snapshot["state_dict"]) 43 | optimizer.load_state_dict(snapshot["optimizer"]) 44 | print("Restored model from previous checkpoint") 45 | else: 46 | epoch = 0 47 | 48 | return epoch 49 | -------------------------------------------------------------------------------- /4-get-data/.gitignore: -------------------------------------------------------------------------------- 1 | efs-data-copy.yaml 2 | -------------------------------------------------------------------------------- /4-get-data/4-1-get-data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ../.env 4 | 5 | echo "" 6 | echo "Generating pod manifest ..." 7 | cat efs-data-copy.yaml.template | envsubst > efs-data-copy.yaml 8 | 9 | echo "" 10 | echo "Creating efs-data-prep pod ..." 11 | kubectl apply -f efs-data-copy.yaml 12 | sleep 3 13 | kubectl get pods | grep data-prep 14 | 15 | -------------------------------------------------------------------------------- /4-get-data/4-2-show-status.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | echo "Describing data prep pod ..." 5 | kubectl describe pod efs-data-prep-pod 6 | 7 | echo "" 8 | echo "Showing status of data prep pod ..." 9 | kubectl get pods | grep data-prep 10 | 11 | -------------------------------------------------------------------------------- /4-get-data/4-3-show-log.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kubectl logs -f efs-data-prep-pod 4 | 5 | -------------------------------------------------------------------------------- /4-get-data/efs-data-copy.yaml.template: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: efs-data-prep-pod 5 | annotations: 6 | sidecar.istio.io/inject: "false" 7 | spec: 8 | containers: 9 | - name: efs-data-prep-pod 10 | image: ${REGISTRY}${IMAGE}${TAG} 11 | imagePullPolicy: Always 12 | command: ["/bin/bash"] 13 | args: ["-c", "/workspace/data-prep.sh ${MOUNT_PATH}"] 14 | volumeMounts: 15 | - name: efs-pv 16 | mountPath: ${MOUNT_PATH} 17 | volumes: 18 | - name: efs-pv 19 | persistentVolumeClaim: 20 | claimName: efs-pvc 21 | restartPolicy: Never 22 | -------------------------------------------------------------------------------- /5-train-model/.gitignore: -------------------------------------------------------------------------------- 1 | train.yaml 2 | cleanup.yaml 3 | -------------------------------------------------------------------------------- /5-train-model/5-1-generate-pytorchjob.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ../.env 4 | 5 | echo "" 6 | echo "Generating ElasticJob manifest ..." 7 | cat train.yaml.template | envsubst > train.yaml 8 | echo "" 9 | echo "Generating Checkpoint Cleanup job ..." 10 | cat cleanup.yaml.template | envsubst > cleanup.yaml 11 | echo "" 12 | echo "ElasticJob Manifest:" 13 | echo "" 14 | cat train.yaml 15 | echo "" 16 | 17 | -------------------------------------------------------------------------------- /5-train-model/5-2-launch-pytorchjob.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | echo "Launching PyTorchJob ..." 5 | kubectl apply -f ./train.yaml 6 | 7 | -------------------------------------------------------------------------------- /5-train-model/5-3-show-status.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kubectl get pods -o wide | grep train 4 | 5 | -------------------------------------------------------------------------------- /5-train-model/5-4-show-utilization.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kubectl top nodes 4 | 5 | -------------------------------------------------------------------------------- /5-train-model/5-5-show-logs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | kubetail cifar10-train 4 | 5 | -------------------------------------------------------------------------------- /5-train-model/5-6-delete-pytorchjob.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | echo "Deleting PyTorchJob ..." 5 | kubectl delete -f ./train.yaml 6 | 7 | echo "" 8 | echo "Restarting etcd ..." 9 | kubectl delete pod $(kubectl get pods | grep etcd | cut -d ' ' -f 1) 10 | 11 | echo "" 12 | echo "Cleaning up model checkpoint ..." 13 | echo "" 14 | kubectl apply -f ./cleanup.yaml 15 | echo "" 16 | while true; do 17 | JOB="$(kubectl get job | grep cleanup)" 18 | COMPLETED=$(echo $JOB | awk -e '{print $2}' | cut -d '/' -f 1) 19 | if [ "$COMPLETED" == "1" ]; then 20 | kubectl logs $(kubectl get pods | grep cleanup | cut -d ' ' -f 1) 21 | break; 22 | else 23 | echo "$JOB" 24 | sleep 1 25 | fi 26 | done 27 | echo "" 28 | kubectl delete -f ./cleanup.yaml 29 | echo "" 30 | 31 | -------------------------------------------------------------------------------- /5-train-model/cleanup.yaml.template: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: cifar10-cleanup 5 | spec: 6 | template: 7 | metadata: 8 | annotations: 9 | sidecar.istio.io/inject: "false" 10 | spec: 11 | restartPolicy: Never 12 | nodeSelector: 13 | beta.kubernetes.io/instance-type: ${NODE_TYPE} 14 | containers: 15 | - name: test 16 | image: ${REGISTRY}${IMAGE}${TAG} 17 | imagePullPolicy: Always 18 | command: ["/bin/bash", "-c", "if [ -f ${MOUNT_PATH}/checkpoint.pth.tar ]; then echo Cleaning up checkpoint; rm -vf ${MOUNT_PATH}/checkpoint.pth.tar; else echo Checkpoint is already clean; fi; if [ -d ${MOUNT_PATH}/runs ]; then echo Cleaning up tensorboard logs; rm -rvf ${MOUNT_PATH}/runs; else echo Tensorboard logs are already clean; fi"] 19 | volumeMounts: 20 | - name: efs-pv 21 | mountPath: ${MOUNT_PATH} 22 | volumes: 23 | - name: efs-pv 24 | persistentVolumeClaim: 25 | claimName: efs-pvc 26 | 27 | -------------------------------------------------------------------------------- /5-train-model/train.yaml.template: -------------------------------------------------------------------------------- 1 | apiVersion: "kubeflow.org/v1" 2 | kind: PyTorchJob 3 | metadata: 4 | name: cifar10-train 5 | spec: 6 | elasticPolicy: 7 | rdzvBackend: etcd 8 | rdzvHost: etcd-service 9 | rdzvPort: 2379 10 | minReplicas: 1 11 | maxReplicas: 128 12 | maxRestarts: 100 13 | metrics: 14 | - type: Resource 15 | resource: 16 | name: cpu 17 | target: 18 | type: Utilization 19 | averageUtilization: 80 20 | pytorchReplicaSpecs: 21 | Worker: 22 | replicas: ${NODE_COUNT} 23 | restartPolicy: OnFailure 24 | template: 25 | spec: 26 | containers: 27 | - name: pytorch 28 | image: ${REGISTRY}${IMAGE}${TAG} 29 | imagePullPolicy: IfNotPresent 30 | env: 31 | - name: PROCESSOR 32 | value: "${PROCESSOR}" 33 | command: 34 | - python3 35 | - -m 36 | - torch.distributed.run 37 | - /workspace/cifar10-model-train.py 38 | - "--epochs=${EPOCHS}" 39 | - "--batch-size=${BATCH_SIZE}" 40 | - "--workers=${CPU_LIMIT}" 41 | - "--model-file=${MOUNT_PATH}/cifar10-model.pth" 42 | - "${MOUNT_PATH}/cifar-10-batches-py/" 43 | volumeMounts: 44 | - name: efs-pv 45 | mountPath: ${MOUNT_PATH} 46 | # The following enables the worker pods to use increased shared memory 47 | # which is required when specifying more than 0 data loader workers 48 | - name: dshm 49 | mountPath: /dev/shm 50 | volumes: 51 | - name: efs-pv 52 | persistentVolumeClaim: 53 | claimName: efs-pvc 54 | - name: dshm 55 | emptyDir: 56 | medium: Memory 57 | -------------------------------------------------------------------------------- /6-test-model/.gitignore: -------------------------------------------------------------------------------- 1 | test.yaml 2 | -------------------------------------------------------------------------------- /6-test-model/6-1-generate-job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ../.env 4 | 5 | echo "" 6 | echo "Generating test job manifest ..." 7 | cat test.yaml.template | envsubst > test.yaml 8 | cat test.yaml 9 | echo "" 10 | -------------------------------------------------------------------------------- /6-test-model/6-2-launch-job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | echo "Launching test job ..." 5 | kubectl apply -f ./test.yaml 6 | -------------------------------------------------------------------------------- /6-test-model/6-3-show-status.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | echo "Showing test job status ..." 5 | kubectl get pods | grep test 6 | 7 | -------------------------------------------------------------------------------- /6-test-model/6-4-show-log.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | echo "Showing cifar10-test log ..." 5 | echo "" 6 | 7 | kubectl logs -f $(kubectl get pods | grep cifar10-test | cut -d ' ' -f 1 | head -n 1) 8 | 9 | -------------------------------------------------------------------------------- /6-test-model/6-5-delete-job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "" 4 | echo "Deleting test job ..." 5 | kubectl delete -f ./test.yaml 6 | -------------------------------------------------------------------------------- /6-test-model/test.yaml.template: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: cifar10-test 5 | spec: 6 | template: 7 | spec: 8 | restartPolicy: Never 9 | nodeSelector: 10 | beta.kubernetes.io/instance-type: ${NODE_TYPE} 11 | containers: 12 | - name: test 13 | image: ${REGISTRY}${IMAGE}${TAG} 14 | imagePullPolicy: Always 15 | command: ["python3"] 16 | args: 17 | - "/workspace/cifar10-model-test.py" 18 | - "--model-file=${MOUNT_PATH}/cifar10-model.pth" 19 | - "--batch-size=128" 20 | - "--workers=1" 21 | - "${MOUNT_PATH}/cifar-10-batches-py/" 22 | volumeMounts: 23 | - name: efs-pv 24 | mountPath: ${MOUNT_PATH} 25 | volumes: 26 | - name: efs-pv 27 | persistentVolumeClaim: 28 | claimName: efs-pvc 29 | -------------------------------------------------------------------------------- /7-cleanup/7-1-delete-efs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ../.env 4 | 5 | echo "" 6 | FILE_SYSTEM_ID=$(aws efs describe-file-systems --query 'FileSystems[*].FileSystemId' --output json | jq -r .[0] ) 7 | if [ "$FILE_SYSTEM_ID" == "" ]; then 8 | echo "No EFS Filesystems found." 9 | else 10 | echo "Deleting EFS mount targets for File System $FILE_SYSTEM_ID ..." 11 | MOUNT_TARGETS="$(aws efs describe-mount-targets --file-system-id $FILE_SYSTEM_ID --query MountTargets[].MountTargetId --output text)" 12 | MT=$(echo $MOUNT_TARGETS) 13 | for t in $MT; do echo Deleting mount target $t; aws efs delete-mount-target --mount-target-id $t; done 14 | sleep 10 15 | echo "Deleting EFS file system $FILE_SYSTEM_ID ..." 16 | aws efs delete-file-system --file-system-id $FILE_SYSTEM_ID 17 | fi 18 | 19 | echo "" 20 | echo 'Done.' 21 | 22 | -------------------------------------------------------------------------------- /7-cleanup/7-2-delete-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ../.env 4 | 5 | echo "" 6 | read -p "Deleting cluster $CLUSTER_NAME. Proceed? [Y/n]: " PROCEED 7 | if [ "$PROCEED" == "Y" ]; then 8 | echo "Confirmed ..." 9 | eksctl delete cluster -f ../1-create-cluster/eks.yaml 10 | echo "Please note that the cluster will be fully deleted when the Cloud Formation stack completes its removal" 11 | echo "Only after the process in Cloud Formation is finished, you will be able to create a new cluster with the same name" 12 | elif [ "$PROCEED" == "n" ]; then 13 | echo "Cancelling. Cluster will not be deleted." 14 | else 15 | echo "$PROCEED is not a valid response" 16 | echo "Please run the script again and choose Y or n (case sensitive)" 17 | fi 18 | echo "" 19 | 20 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributed Model Training Workshop for AWS EKS 2 | 3 | Welcome! By completing this workshop you will learn how to run distributed data parallel model training on [AWS EKS](https://aws.amazon.com/eks) using [PyTorch](https://pytorch.org). 4 | The only prerequisite for this workshop is access to an AWS account. The steps included here will walk you through creating and AWS EKS cluster, a shared data volume, building a model training container image, downloading and pre-processing data, running distributed training of an image classification model, and finally running the model with new images to test it. 5 | 6 | The workshop architecture at a high level can be visualized by the diagram below. 7 | 8 |

9 | 10 | Fig. 1 - Workshop Infrastructure Architecture 11 |
12 | 13 | The workshop is designed to introduce the concepts of deploying this architecture and running small-scale distributed training for educational purposes, however the same architecture can be applied for training at large scale by adjusting the number and type of nodes used in the EKS cluster, using accelerators ([NVIDIA GPUs](https://aws.amazon.com/nvidia/), [AWS Trainium](https://aws.amazon.com/machine-learning/trainium/), [Intel Habana Gaudi](https://aws.amazon.com/ec2/instance-types/dl1/)), and high-performance shared storage like [FSx for Lustre](https://aws.amazon.com/fsx/lustre/). Further information and scripts that help deploy distributed training on EKS using GPUs and FSx can be found in the [aws-do-eks](https://github.com/aws-samples/aws-do-eks) open-source project. 14 | 15 | This workshop is organized in a number of sequential steps. The scripts that belong to each step are organized in folders with corresponding names. To execute a step, we will change the current directory accordingly and execute scripts in their designated order. The prerequisites section is required, but there are no scripts associated with it. We will complete setting up prerequisites by following instructions. Steps 1 through 6 are required to complete the workshop. Step 7-Cleanup is optional. 16 | 17 | ## 0. Prerequisites 18 | Before we get started, we need to set up an AWS account and Cloud9 IDE from which we will execute all the steps in the workshop. You will not be required to install anything on your computer. All of the steps in the workshop will be completed on the cloud through your browser. To set up your account and IDE, please follow the instructions in [SETUP.md](SETUP.md). 19 | 20 | ## 1. Create EKS Cluster 21 | 22 |

23 | 24 | Fig. 1.0 - Step 1 - Create EKS cluster 25 |
26 | 27 | In this step we will execute scripts to create a managed [Kubernetes](https://kubernetes.io) cluster using the Amazon Elastic Kubernetes Service ([EKS](https://aws.amazon.com/eks)). Later we will use this cluster to run our distributed model training job. 28 | 29 | In the last part of your prerequisites setup, you cloned the workshop code into your Cloud9 IDE. To build our distributed training infrastructure on EKS, we will start by changing the current directory to `1-create-cluster`. 30 | 31 | ```console 32 | cd 1-create-cluster 33 | ``` 34 | 35 | ### 1.1. Configure AWS client and EKS cluster 36 | Many of the scripts provided in the workshop use the [AWS CLI](https://aws.amazon.com/cli/) to access the AWS APIs in the account. That is why the AWS CLI needs to be configured with the credentials (access key id and secret access key) we saved previously. The configuration of the EKS cluster is specified by a .yaml file which we will also generate in this step. 37 | 38 | Execute: 39 | ```console 40 | ./1-1-configure.sh 41 | ``` 42 | 43 | Output: 44 | ``` 45 | The config profile (workshop) could not be found 46 | 47 | Configuring AWS client ... 48 | AWS Access Key ID [None]: ************ 49 | AWS Secret Access Key [None]: **************************************** 50 | Default region name [None]: us-west-2 51 | Default output format [None]: json 52 | 53 | Generating cluster configuration eks.yaml ... 54 | ``` 55 | 56 | By default, Cloud9 uses AWS managed temporary credentials, which we override with the script. If the managed temporary credentials setting has not been disabled, as soon as the script completes, Cloud9 will display the following dialog. 57 | 58 |
59 | 60 |        61 | 62 |
63 | 64 | Fig. 1.1 Cloud9 credentials dialogs 65 |
66 | 67 | Please click **Cancel** in this dialog, immediatlely another dialog appears. Please click **Permanently disable** in the second dialog. If these dialogs do not appear, then AWS managed temporary credentials have already been disabled in your Cloud9 IDE and you may proceed to the next step. 68 | 69 | ### 1.2. Install tools 70 | 71 | The Cloud9 IDE comes with Docker pre-installed. In order to provision an EKS cluster, we will install [eksctl](https://eksctl.io/). To be able to execute commands against Kubernetes, we will install [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/). We will also install other miscellaneous utilities like [kubectx](https://github.com/ahmetb/kubectx), [kubetail](https://github.com/johanhaleby/kubetail), [jq](https://github.com/stedolan/jq), [yq](https://kislyuk.github.io/yq/), and will set up some shorthand command aliases (ll='ls -alh', k=kubectl, kc=kubectx, kn=kubens, kt=kubetail, ks=kubeshell) for convenience. 72 | 73 | Execute: 74 | ```console 75 | ./1-2-install-tools.sh 76 | ``` 77 | 78 | Output: 79 | ``` 80 | Installing eksctl ... 81 | % Total % Received % Xferd Average Speed Time Time Time Current 82 | Dload Upload Total Spent Left Speed 83 | 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0 84 | 100 18.6M 100 18.6M 0 0 19.1M 0 --:--:-- --:--:-- --:--:-- 31.5M 85 | 0.66.0 86 | 87 | Installing kubectl ... 88 | % Total % Received % Xferd Average Speed Time Time Time Current 89 | Dload Upload Total Spent Left Speed 90 | 100 57.4M 100 57.4M 0 0 93.6M 0 --:--:-- --:--:-- --:--:-- 93.6M 91 | Client Version: version.Info{Major:"1", Minor:"19+", GitVersion:"v1.19.6-eks-49a6c0", GitCommit:"49a6c0bf091506e7bafcdb1b142351b69363355a", GitTreeState:"clean", BuildDate:"2020-12-23T22:13:28Z", GoVersion:"go1.15.5", Compiler:"gc", Platform:"linux/amd64"} 92 | 93 | ... 94 | 95 | Setting up aliases ... 96 | 97 | Done setting up tools. 98 | ``` 99 | 100 | ### 1.3. Launch cluster 101 | 102 | We will use `eksctl` and the generated `eks.yaml` configuration to launch a new EKS cluster. 103 | 104 | Execute: 105 | ```console 106 | ./1-3-create-cluster.sh 107 | ``` 108 | 109 | Output: 110 | ``` 111 | Creating EKS cluster ... 112 | 113 | ... using configuration from ./eks.yaml ... 114 | 115 | apiVersion: eksctl.io/v1alpha5 116 | kind: ClusterConfig 117 | 118 | metadata: 119 | name: do-eks 120 | version: "1.26" 121 | region: us-west-2 122 | 123 | availabilityZones: 124 | - us-west-2a 125 | - us-west-2b 126 | 127 | iam: 128 | withOIDC: true 129 | 130 | managedNodeGroups: 131 | - name: wks-node 132 | instanceType: c5.4xlarge 133 | instancePrefix: workshop 134 | privateNetworking: true 135 | availabilityZones: ["us-west-2a","us-west-2b"] 136 | efaEnabled: false 137 | minSize: 0 138 | desiredCapacity: 2 139 | maxSize: 10 140 | volumeSize: 900 141 | iam: 142 | withAddonPolicies: 143 | cloudWatch: true 144 | autoScaler: true 145 | ebs: true 146 | 147 | Sat Jun 4 06:06:16 UTC 2022 148 | eksctl create cluster -f ./eks.yaml 149 | 2022-06-04 06:06:16 [ℹ] eksctl version 0.66.0 150 | 2022-06-04 06:06:16 [ℹ] using region us-west-2 151 | 2022-06-04 06:06:16 [ℹ] subnets for us-west-2a - public:192.168.0.0/19 private:192.168.64.0/19 152 | 2022-06-04 06:06:16 [ℹ] subnets for us-west-2b - public:192.168.32.0/19 private:192.168.96.0/19 153 | 2022-06-04 06:06:16 [ℹ] nodegroup "wks-node" will use "" [AmazonLinux2/1.21] 154 | 2022-06-04 06:06:16 [ℹ] using Kubernetes version 1.21 155 | 2022-06-04 06:06:16 [ℹ] creating EKS cluster "do-eks" in "us-west-2" region with managed nodes 156 | 2022-06-04 06:06:16 [ℹ] 1 nodegroup (wks-node) was included (based on the include/exclude rules) 157 | 2022-06-04 06:06:16 [ℹ] will create a CloudFormation stack for cluster itself and 0 nodegroup stack(s) 158 | 2022-06-04 06:06:16 [ℹ] will create a CloudFormation stack for cluster itself and 1 managed nodegroup stack(s) 159 | 2022-06-04 06:06:16 [ℹ] if you encounter any issues, check CloudFormation console or try 'eksctl utils describe-stacks --region=us-west-2 --cluster=do-eks' 160 | 2022-06-04 06:06:16 [ℹ] CloudWatch logging will not be enabled for cluster "do-eks" in "us-west-2" 161 | 2022-06-04 06:06:16 [ℹ] you can enable it with 'eksctl utils update-cluster-logging --enable-types={SPECIFY-YOUR-LOG-TYPES-HERE (e.g. all)} --region=us-west-2 --cluster=do-eks' 162 | 2022-06-04 06:06:16 [ℹ] Kubernetes API endpoint access will use default of {publicAccess=true, privateAccess=false} for cluster "do-eks" in "us-west-2" 163 | 2022-06-04 06:06:16 [ℹ] 2 sequential tasks: { create cluster control plane "do-eks", 3 sequential sub-tasks: { 4 sequential sub-tasks: { wait for control plane to become ready, associate IAM OIDC provider, 2 sequential sub-tasks: { create IAM role for serviceaccount "kube-system/aws-node", create serviceaccount "kube-system/aws-node" }, restart daemonset "kube-system/aws-node" }, 1 task: { create addons }, create managed nodegroup "wks-node" } } 164 | 2022-06-04 06:06:16 [ℹ] building cluster stack "eksctl-do-eks-cluster" 165 | 2022-06-04 06:06:16 [ℹ] deploying stack "eksctl-do-eks-cluster" 166 | 167 | ... 168 | 169 | 2022-06-04 06:27:59 [ℹ] waiting for CloudFormation stack "eksctl-do-eks-nodegroup-wks-node" 170 | 2022-06-04 06:27:59 [ℹ] waiting for the control plane availability... 171 | 2022-06-04 06:27:59 [✔] saved kubeconfig as "/home/ec2-user/.kube/config" 172 | 2022-06-04 06:27:59 [ℹ] no tasks 173 | 2022-06-04 06:27:59 [✔] all EKS cluster resources for "do-eks" have been created 174 | 2022-06-04 06:30:01 [ℹ] kubectl command should work with "/home/ec2-user/.kube/config", try 'kubectl get nodes' 175 | 2022-06-04 06:30:01 [✔] EKS cluster "do-eks" in "us-west-2" region is ready 176 | 177 | Sat Jun 4 06:30:01 UTC 2022 178 | Done creating EKS cluster 179 | 180 | Updating kubeconfig ... 181 | Added new context arn:aws:eks:us-west-2:620266777012:cluster/do-eks to /home/ec2-user/.kube/config 182 | 183 | 184 | Displaying cluster nodes ... 185 | NAME STATUS ROLES AGE VERSION 186 | ip-192-168-111-138.us-west-2.compute.internal Ready 3m3s v1.21.12-eks-5308cf7 187 | ip-192-168-90-82.us-west-2.compute.internal Ready 3m3s v1.21.12-eks-5308cf7 188 | 189 | ``` 190 | 191 | The `eksctl` command uses Cloud Formation behind the scenes. In addition to the command output, provisioning progress can be seen in [CloudFormation](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#). 192 | 193 | Please expect that creation of the cluster may take up to 30 min. 194 | 195 | ### 1.4. Deploy packages to cluster 196 | We are going to use [TorchElastic Job Controller](https://github.com/pytorch/elastic/tree/master/kubernetes) for Kubernetes to launch a distributed training job using an ElasticJob custom resource. We will also use [Kubernetes Metrics Server](https://github.com/kubernetes-sigs/metrics-server) to monitor node resource utilization in the cluster during training. To deploy both to the EKS cluster, execute: 197 | 198 | ```console 199 | ./1-4-deploy-packages.sh 200 | ``` 201 | 202 | Output: 203 | ``` 204 | Deploying Kubernetes Metrics Server ... 205 | serviceaccount/metrics-server created 206 | clusterrole.rbac.authorization.k8s.io/system:aggregated-metrics-reader created 207 | clusterrole.rbac.authorization.k8s.io/system:metrics-server created 208 | rolebinding.rbac.authorization.k8s.io/metrics-server-auth-reader created 209 | clusterrolebinding.rbac.authorization.k8s.io/metrics-server:system:auth-delegator created 210 | clusterrolebinding.rbac.authorization.k8s.io/system:metrics-server created 211 | service/metrics-server created 212 | deployment.apps/metrics-server created 213 | apiservice.apiregistration.k8s.io/v1beta1.metrics.k8s.io created 214 | 215 | Deploying Kubeflow Training Operator ... 216 | ~/update-workshop/1-create-cluster/kubeflow-training-operator ~/update-workshop/1-create-cluster 217 | namespace/kubeflow created 218 | customresourcedefinition.apiextensions.k8s.io/mpijobs.kubeflow.org created 219 | customresourcedefinition.apiextensions.k8s.io/mxjobs.kubeflow.org created 220 | customresourcedefinition.apiextensions.k8s.io/pytorchjobs.kubeflow.org created 221 | customresourcedefinition.apiextensions.k8s.io/tfjobs.kubeflow.org created 222 | customresourcedefinition.apiextensions.k8s.io/xgboostjobs.kubeflow.org created 223 | serviceaccount/training-operator created 224 | clusterrole.rbac.authorization.k8s.io/training-operator created 225 | clusterrolebinding.rbac.authorization.k8s.io/training-operator created 226 | service/training-operator created 227 | deployment.apps/training-operator created 228 | clusterrole.rbac.authorization.k8s.io/hpa-access created 229 | clusterrolebinding.rbac.authorization.k8s.io/training-operator-hpa-access created 230 | ~/update-workshop/1-create-cluster 231 | 232 | Deploying etcd ... 233 | service/etcd-service created 234 | deployment.apps/etcd created 235 | ``` 236 | 237 | The EKS cluster is now provisioned and prepared to run distributed training jobs. 238 | 239 | ## 2. Create Shared Volume 240 | 241 |

242 | 243 | Fig. 2.0 - Step 2 - Create shared volume 244 |
245 | 246 | With distributed data parallel training, all workers need to have access to the training data. We can achieve that by creating a shared volume which can be mounted in each of the worker pods. 247 | 248 | To create a shared volume, we will use the scripts in the directory for step 2. 249 | 250 | ```console 251 | cd ../2-create-volume 252 | ``` 253 | 254 | ### 2.1. Create EFS file system 255 | First we will use the AWS CLI to provision an EFS file system. 256 | 257 | Execute: 258 | ```console 259 | ./2-1-create-efs.sh 260 | ``` 261 | 262 | Output: 263 | ``` 264 | Cluster name do-eks 265 | VPC vpc-0ecd59e0bf1426491 266 | Creating security group ... 267 | { 268 | "GroupId": "sg-0ab73460e1a1b3e67" 269 | } 270 | eks-efs-group NFS access to EFS from EKS worker nodes sg-0ab73460e1a1b3e67 271 | 272 | ... 273 | 274 | Creating EFS volume ... 275 | fs-0b15155937d1c6b83 276 | subnet-07767ca17e93fe901 subnet-04859dc111ed82685 277 | Creating mount target in subnet-07767ca17e93fe901 in security group sg-0ab73460e1a1b3e67 for efs fs-0b15155937d1c6b83 278 | 279 | ... 280 | 281 | Done. 282 | ``` 283 | 284 | The EFS file system is now created and configured so that it can be accessed from the EKS cluster. 285 | 286 | ### 2.2. Create Kubernetes Persistent Volume Claim 287 | In order to create a Kubernetes persistent volume claim (PVC) agains the EFS file system, we need to deploy the EFS container storage interface (CSI) driver to the cluster, then create a storage class and a persistent volume (PV). To do that, execute: 288 | 289 | ```console 290 | ./2-2-create-pvc.sh 291 | ``` 292 | 293 | Output: 294 | ``` 295 | Checking EFS File System ... 296 | EFS volume id fs-0b15155937d1c6b83 297 | 298 | Deploying EFS CSI Driver ... 299 | serviceaccount/efs-csi-controller-sa created 300 | serviceaccount/efs-csi-node-sa created 301 | clusterrole.rbac.authorization.k8s.io/efs-csi-external-provisioner-role created 302 | clusterrolebinding.rbac.authorization.k8s.io/efs-csi-provisioner-binding created 303 | deployment.apps/efs-csi-controller created 304 | daemonset.apps/efs-csi-node created 305 | csidriver.storage.k8s.io/efs.csi.aws.com configured 306 | efs-csi-controller-66fcf64846-4dcbv 0/3 ContainerCreating 0 6s 307 | efs-csi-controller-66fcf64846-df6p9 0/3 ContainerCreating 0 6s 308 | efs-csi-node-7cnkt 0/3 ContainerCreating 0 6s 309 | efs-csi-node-9ljw2 0/3 ContainerCreating 0 6s 310 | 311 | Generating efs-sc.yaml ... 312 | 313 | Applying efs-sc.yaml ... 314 | storageclass.storage.k8s.io/efs-sc created 315 | NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE 316 | efs-sc efs.csi.aws.com Delete Immediate false 0s 317 | gp2 (default) kubernetes.io/aws-ebs Delete WaitForFirstConsumer false 94m 318 | 319 | Generating efs-pv.yaml ... 320 | Applying efs-pv.yaml ... 321 | persistentvolume/efs-pv created 322 | NAME CAPACITY ACCESS MODES RECLAIM POLICY STATUS CLAIM STORAGECLASS REASON AGE 323 | efs-pv 5Gi RWX Retain Available efs-sc 11s 324 | 325 | Creating persistent volume claim efs-pvc ... 326 | persistentvolumeclaim/efs-pvc created 327 | NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE 328 | efs-pvc Bound efs-pv 5Gi RWX efs-sc 1s 329 | 330 | Done. 331 | 332 | ``` 333 | 334 | ## 3. Build Deep Learning Container 335 | 336 |

337 | 338 | Fig. 3.0 - Step 3 - Build deep learning container 339 |
340 | 341 | In this step, we will build a container that has code to train our PyTorch model. 342 | To do that we need to change the current directory to `3-build-container`. 343 | 344 | ```console 345 | cd ../3-build-container 346 | ``` 347 | 348 | Please note that this folder contains a Dockerfile, python and shell scripts. We will only need to execute the scripts that start with `3-*`. 349 | 350 | ### 3.1. Build container image 351 | 352 | To build the container image, execute: 353 | ```console 354 | ./3-1-build.sh 355 | ``` 356 | 357 | Output: 358 | ``` 359 | inflating: aws/dist/awscli/data/dax/2017-04-19/completions-1.json 360 | creating: aws/dist/awscli/data/health/2016-08-04/ 361 | 362 | 16650K .......... .......... .......... .......... .......... 98% 29.1M 0s 363 | 16700K .......... .......... .......... .......... .......... 99% 23.6M 0s 364 | 16750K .......... .......... .......... .......... .......... 99% 16.3M 0s 365 | 16800K .......... .......... .......... .......... .......... 99% 25.4M 0s 366 | 16850K .......... .......... ..... 100% 268M=1.3s 367 | 368 | 2022-06-04 07:56:41 (12.3 MB/s) - '/tmp/etcd-v3.4.3/etcd-v3.4.3-linux-amd64.tar.gz' saved [17280028/17280028] 369 | 370 | ------------------------ 371 | etcdctl version: 3.4.3 372 | API version: 3.4 373 | ------------------------ 374 | Finished installing etcd v3.4.3. To use: /usr/local/bin/(etcd | etcdctl) 375 | Removing intermediate container 71951321d43d 376 | 377 | ... 378 | 379 | tep 12/15 : ADD cifar10-model-train.py /workspace/ 380 | ---> 622630ffa5b7 381 | Step 13/15 : ADD cifar10-model-test.py /workspace/ 382 | ---> 33974972d759 383 | Step 14/15 : ADD cnn_model.py /workspace/ 384 | ---> 8d1492e4f0a1 385 | Step 15/15 : ADD data-prep.sh /workspace/ 386 | ---> b1ec9d533050 387 | Successfully built b1ec9d533050 388 | Successfully tagged 620266777012.dkr.ecr.us-west-2.amazonaws.com/pytorch-cpu:latest 389 | 390 | ``` 391 | 392 | ### 3.2 Push container image to ECR 393 | After it is built, the image needs to be pushed to ECR so it can be used by Kubernetes nodes. 394 | 395 | Execute: 396 | ```console 397 | ./3-2-push.sh 398 | ``` 399 | 400 | Output: 401 | ``` 402 | Logging in to 620266777012.dkr.ecr.us-west-2.amazonaws.com/ ... 403 | WARNING! Your password will be stored unencrypted in /home/ec2-user/.docker/config.json. 404 | Configure a credential helper to remove this warning. See 405 | https://docs.docker.com/engine/reference/commandline/login/#credentials-store 406 | 407 | Login Succeeded 408 | Pushing pytorch-cpu:latest to registry ... 409 | The push refers to repository [620266777012.dkr.ecr.us-west-2.amazonaws.com/pytorch-cpu] 410 | 85fb7c19f7ba: Pushed 411 | 1915f933c51f: Pushed 412 | 69f193e41d27: Pushed 413 | fac272423a4b: Pushed 414 | 3c8419b41ef5: Pushed 415 | 0f550fa492fc: Pushed 416 | ff0f8f83e19d: Pushed 417 | 11c114e08199: Pushed 418 | e9b65af3368a: Pushed 419 | bf8cedc62fb3: Layer already exists 420 | latest: digest: sha256:a7bc0842b2681a84ebbfeda35096d8d8f09baffdb0e8ce9d42d6b3f9d983ac6d size: 3459 421 | 422 | ``` 423 | 424 | ## 4. Download and Preprocess Image Dataset 425 | 426 |

427 | 428 | Fig. 4.0 - Step 4 - Download data 429 |
430 | 431 | In this step we will run a pod which mounts the persistent volume and downloads the [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset on it. 432 | We will execute the scripts from directory `4-get-data`. 433 | 434 | ```console 435 | cd ../4-get-data 436 | ``` 437 | 438 | ### 4.1. Launch download pod 439 | The [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) condists of images with size 32x32 pixels, grouped in 10 classes (airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck) with 6,000 images per class. To download this dataset and save it to the shared volume, execute: 440 | 441 | ```console 442 | ./4-1-get-data.sh 443 | ``` 444 | 445 | Output: 446 | ``` 447 | 448 | Generating pod manifest ... 449 | 450 | Creating efs-data-prep pod ... 451 | pod/efs-data-prep-pod created 452 | efs-data-prep-pod 0/1 ContainerCreating 0 3s 453 | 454 | ``` 455 | 456 | ### 4.2. Show data prep pod status 457 | The data-prep pod status changes from ContainerCreating, to Running, to Complete. To show the current status, execute: 458 | 459 | ```console 460 | ./4-2-show-status.sh 461 | ``` 462 | 463 | Output: 464 | ``` 465 | Describing data prep pod ... 466 | Name: efs-data-prep-pod 467 | Namespace: default 468 | Priority: 0 469 | 470 | ... 471 | 472 | Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s 473 | node.kubernetes.io/unreachable:NoExecute op=Exists for 300s 474 | Events: 475 | Type Reason Age From Message 476 | ---- ------ ---- ---- ------- 477 | Normal Scheduled 3m23s default-scheduler Successfully assigned default/efs-data-prep-pod to ip-192-168-111-138.us-west-2.compute.internal 478 | Normal Pulling 3m16s kubelet Pulling image "620266777012.dkr.ecr.us-west-2.amazonaws.com/pytorch-cpu:latest" 479 | Normal Pulled 2m57s kubelet Successfully pulled image "620266777012.dkr.ecr.us-west-2.amazonaws.com/pytorch-cpu:latest" in 19.458971841s 480 | Normal Created 2m43s kubelet Created container efs-data-prep-pod 481 | Normal Started 2m43s kubelet Started container efs-data-prep-pod 482 | 483 | Showing status of data prep pod ... 484 | efs-data-prep-pod 0/1 Completed 0 3m23s 485 | ``` 486 | 487 | ### 4.3. Show data-prep log 488 | When the pod enters the Running or Completed status, you can display its log by executing: 489 | 490 | ```console 491 | ./4-3-show-log.sh 492 | ``` 493 | 494 | Output: 495 | ``` 496 | Shared path - /efs-shared 497 | --2022-06-05 06:50:53-- https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz 498 | Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30 499 | Connecting to www.cs.toronto.edu (www.cs.toronto.edu)|128.100.3.30|:443... connected. 500 | HTTP request sent, awaiting response... 200 OK 501 | Length: 170498071 (163M) [application/x-gzip] 502 | Saving to: 'cifar-10-python.tar.gz' 503 | 504 | 0K .......... .......... .......... .......... .......... 0% 350K 7m56s 505 | 50K .......... .......... .......... .......... .......... 0% 695K 5m58s 506 | 100K .......... .......... .......... .......... .......... 0% 693K 5m18s 507 | 150K .......... .......... .......... .......... .......... 0% 18.6M 4m1s 508 | 200K .......... .......... .......... .......... .......... 0% 21.5M 3m14s 509 | 250K .......... .......... .......... .......... .......... 0% 732K 3m20s 510 | 300K .......... .......... .......... .......... .......... 0% 66.4M 2m51s 511 | 350K .......... .......... .......... .......... .......... 0% 63.0M 2m30s 512 | 400K .......... .......... .......... .......... .......... 0% 18.6M 2m15s 513 | 450K .......... .......... .......... .......... .......... 0% 60.7M 2m1s 514 | 500K .......... .......... .......... .......... .......... 0% 80.4M 1m50s 515 | 550K .......... .......... .......... .......... .......... 0% 745K 2m0s 516 | 600K .......... .......... .......... .......... .......... 0% 78.2M 1m51s 517 | 518 | ... 519 | 520 | 166250K .......... .......... .......... .......... .......... 99% 118M 0s 521 | 166300K .......... .......... .......... .......... .......... 99% 129M 0s 522 | 166350K .......... .......... .......... .......... .......... 99% 4.00M 0s 523 | 166400K .......... .......... .......... .......... .......... 99% 100M 0s 524 | 166450K .......... .......... .......... .......... .......... 99% 137M 0s 525 | 166500K .. 100% 3858G=4.8s 526 | 527 | 2022-06-05 06:50:59 (33.9 MB/s) - 'cifar-10-python.tar.gz' saved [170498071/170498071] 528 | ``` 529 | 530 | The last message showing the dataset was saved, indicates a successful download. 531 | 532 | ## 5. Train Image Classification Model 533 | 534 |

535 | 536 | Fig. 5.0 - Step 5 - Distributed data-parallel model training 537 |
538 | 539 | Next we will execute the model training scripts from directory `5-train-model`. 540 | 541 | ```console 542 | cd ../5-train-model 543 | ``` 544 | ### 5.1. Generate PyTorchJob 545 | The Kubernetes manifests in this workshop are generated from templates, based on the configuration stored in file [`./env`](.env). To generate the PyTorchJob manifest for our distributed training, execute: 546 | 547 | ```console 548 | ./5-1-generate-pytorchjob.sh 549 | ``` 550 | 551 | Output: 552 | ``` 553 | Generating PyTorchJob manifest ... 554 | 555 | apiVersion: "kubeflow.org/v1" 556 | kind: PyTorchJob 557 | metadata: 558 | name: cifar10-train 559 | spec: 560 | elasticPolicy: 561 | rdzvBackend: etcd 562 | rdzvHost: etcd-service 563 | rdzvPort: 2379 564 | minReplicas: 1 565 | maxReplicas: 128 566 | maxRestarts: 100 567 | metrics: 568 | - type: Resource 569 | resource: 570 | name: cpu 571 | target: 572 | type: Utilization 573 | averageUtilization: 80 574 | pytorchReplicaSpecs: 575 | Worker: 576 | replicas: 2 577 | restartPolicy: OnFailure 578 | template: 579 | spec: 580 | containers: 581 | - name: pytorch 582 | image: 999701187340.dkr.ecr.us-west-2.amazonaws.com/pytorch-cpu:latest 583 | imagePullPolicy: IfNotPresent 584 | env: 585 | - name: PROCESSOR 586 | value: "cpu" 587 | command: 588 | - python3 589 | - -m 590 | - torch.distributed.run 591 | - /workspace/cifar10-model-train.py 592 | - "--epochs=10" 593 | - "--batch-size=128" 594 | - "--workers=15" 595 | - "--model-file=/efs-shared/cifar10-model.pth" 596 | - "/efs-shared/cifar-10-batches-py/" 597 | volumeMounts: 598 | - name: efs-pv 599 | mountPath: /efs-shared 600 | # The following enables the worker pods to use increased shared memory 601 | # which is required when specifying more than 0 data loader workers 602 | - name: dshm 603 | mountPath: /dev/shm 604 | volumes: 605 | - name: efs-pv 606 | persistentVolumeClaim: 607 | claimName: efs-pvc 608 | - name: dshm 609 | emptyDir: 610 | medium: Memory 611 | ``` 612 | 613 | The manifest specifies an elastic job named **cifar10-train**. The job is configured to communicate with rendez-vous end point `etcd-service:2379` which is the etcd service we launched in the same namespace. It is also configured to run two workers, each of them on a separate node. Each worker will execute the `torchrun` command and run training for 10 epochs. 614 | 615 | ### 5.2. Launch PyTorchJob 616 | Next we will launch the PyTorchJob by applying the generated manifest. 617 | 618 | Execute: 619 | ```console 620 | ./5-2-launch-pytorchjob.sh 621 | ``` 622 | 623 | Output: 624 | ``` 625 | Launching PyTorchJob ... 626 | pytorchjob.kubeflow.org/cifar10-train created 627 | ``` 628 | 629 | ### 5.3. Show training worker pods status 630 | Each launched worker is represented by a pod in the cluster. To see the status of the worker pods, execute: 631 | 632 | ```console 633 | ./5-3-show-status.sh 634 | ``` 635 | 636 | Output: 637 | ``` 638 | cifar10-train-worker-0 1/1 Running 0 47s 192.168.109.172 ip-192-168-111-138.us-west-2.compute.internal 639 | cifar10-train-worker-1 1/1 Running 0 47s 192.168.93.104 ip-192-168-90-82.us-west-2.compute.internal 640 | ``` 641 | 642 | ### 5.4. Show node utilization 643 | Once the training starts, you will be able to see the CPU utilization of the two nodes rise. 644 | 645 | Execute: 646 | ```console 647 | ./5-4-show-utilization.sh 648 | ``` 649 | 650 | Output: 651 | ``` 652 | NAME CPU(cores) CPU% MEMORY(bytes) MEMORY% 653 | ip-192-168-111-138.us-west-2.compute.internal 18246m 50% 2306Mi 3% 654 | ip-192-168-90-82.us-west-2.compute.internal 17936m 50% 2322Mi 3% 655 | ``` 656 | 657 | ### 5.5. Show training logs 658 | After the worker pods have been created, we can see their combined logs using the **kubetail** tool. 659 | 660 | Execute: 661 | ```console 662 | ./5-5-show-logs.sh 663 | ``` 664 | 665 | Output: 666 | ``` 667 | Will tail 2 logs... 668 | cifar10-train-worker-0 669 | cifar10-train-worker-1 670 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,775 Keep-alive key /torchelastic/p2p/run_cifar10-train/rdzv/v_1/rank_0 is not renewed. 671 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,775 Rendevous version 1 is incomplete. 672 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,775 Attempting to destroy it. 673 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,776 Destroyed rendezvous version 1 successfully. 674 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,776 Previously existing rendezvous state changed. Will re-try joining. 675 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,776 Attempting to join next rendezvous 676 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,780 New rendezvous state created: {'status': 'joinable', 'version': '2', 'participants': []} 677 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,869 Joined rendezvous version 2 as rank 0. Full state: {'status': 'joinable', 'version': '2', 'participants': [0]} 678 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:22,869 Rank 0 is responsible for join last call. 679 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:22,776 Keep-alive key /torchelastic/p2p/run_cifar10-train/rdzv/v_1/rank_0 is not renewed. 680 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:22,776 Rendevous version 1 is incomplete. 681 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:22,777 Attempting to destroy it. 682 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:22,778 Rendezvous attempt failed, will retry. Reason: Compare failed : [{"status": "final", "version": "1", "participants": [0], "keep_alives": ["/torchelastic/p2p/run_cifar10-train/rdzv/v_1/rank_0"], "num_workers_waiting": 2} != {"status": "setup"}] 683 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:23,779 Attempting to join next rendezvous 684 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:23,784 Observed existing rendezvous state: {'status': 'joinable', 'version': '2', 'participants': [0]} 685 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:23,816 Joined rendezvous version 2 as rank 1. Full state: {'status': 'joinable', 'version': '2', 'participants': [0, 1]} 686 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:23,816 Waiting for remaining peers. 687 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:53,867 Rank 0 finished join last call. 688 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:53,869 All peers arrived. Confirming membership. 689 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:53,867 Waiting for remaining peers. 690 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:53,867 All peers arrived. Confirming membership. 691 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:53,890 Waiting for confirmations from all peers. 692 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:53,913 Waiting for confirmations from all peers. 693 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:53,913 Rendezvous version 2 is complete. Final state: {'status': 'final', 'version': '2', 'participants': [0, 1], 'keep_alives': ['/torchelastic/p2p/run_cifar10-train/rdzv/v_2/rank_0', '/torchelastic/p2p/run_cifar10-train/rdzv/v_2/rank_1'], 'num_workers_waiting': 0} 694 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:53,915 Rendezvous version 2 is complete. Final state: {'status': 'final', 'version': '2', 'participants': [0, 1], 'keep_alives': ['/torchelastic/p2p/run_cifar10-train/rdzv/v_2/rank_0', '/torchelastic/p2p/run_cifar10-train/rdzv/v_2/rank_1'], 'num_workers_waiting': 0} 695 | [cifar10-train-worker-1] INFO 2022-06-06 21:38:53,915 Creating EtcdStore as the c10d::Store implementation 696 | [cifar10-train-worker-0] INFO 2022-06-06 21:38:53,913 Creating EtcdStore as the c10d::Store implementation 697 | [cifar10-train-worker-0] reading /efs-shared/cifar-10-batches-py/ 698 | [cifar10-train-worker-1] reading /efs-shared/cifar-10-batches-py/ 699 | [cifar10-train-worker-1] [1, 5] loss: 2.335 700 | [cifar10-train-worker-0] [1, 5] loss: 2.323 701 | [cifar10-train-worker-1] [1, 10] loss: 2.247 702 | [cifar10-train-worker-0] [1, 10] loss: 2.225 703 | [cifar10-train-worker-1] [1, 15] loss: 2.168 704 | [cifar10-train-worker-0] [1, 15] loss: 2.163 705 | [cifar10-train-worker-1] [1, 20] loss: 2.061 706 | [cifar10-train-worker-0] [1, 20] loss: 2.077 707 | [cifar10-train-worker-1] [1, 25] loss: 2.011 708 | [cifar10-train-worker-0] [1, 25] loss: 2.010 709 | [cifar10-train-worker-1] [1, 30] loss: 1.963 710 | [cifar10-train-worker-0] [1, 30] loss: 1.938 711 | ... 712 | [cifar10-train-worker-1] [6, 180] loss: 0.496 713 | [cifar10-train-worker-0] [6, 185] loss: 0.499 714 | [cifar10-train-worker-1] [6, 185] loss: 0.503 715 | [cifar10-train-worker-1] [6, 190] loss: 0.504 716 | [cifar10-train-worker-0] [6, 190] loss: 0.594 717 | [cifar10-train-worker-0] [6, 195] loss: 0.536 718 | [cifar10-train-worker-1] [6, 195] loss: 0.522 719 | [cifar10-train-worker-0] [7, 5] loss: 0.470 720 | [cifar10-train-worker-1] [7, 5] loss: 0.464 721 | [cifar10-train-worker-0] [7, 10] loss: 0.510 722 | [cifar10-train-worker-1] [7, 10] loss: 0.465 723 | [cifar10-train-worker-0] [7, 15] loss: 0.525 724 | [cifar10-train-worker-1] [7, 15] loss: 0.489 725 | [cifar10-train-worker-0] [7, 20] loss: 0.479 726 | [cifar10-train-worker-1] [7, 20] loss: 0.478 727 | [cifar10-train-worker-0] [7, 25] loss: 0.523 728 | [cifar10-train-worker-1] [7, 25] loss: 0.520 729 | ... 730 | [cifar10-train-worker-0] [10, 190] loss: 0.247 731 | [cifar10-train-worker-1] [10, 190] loss: 0.185 732 | [cifar10-train-worker-0] [10, 195] loss: 0.200 733 | [cifar10-train-worker-1] [10, 195] loss: 0.202 734 | [cifar10-train-worker-0] saving model: /efs-shared/cifar10-model.pth 735 | [cifar10-train-worker-1] saving model: /efs-shared/cifar10-model.pth 736 | [cifar10-train-worker-1] Finished Training 737 | [cifar10-train-worker-0] Finished Training 738 | ``` 739 | 740 | In the beginning of the logs you will see the workers registering with the rendez-vous endpoint to coordinate their work, then they will train collaboratively over 10 epochs. Each epoch has 400 iterations. Since we are training with two workers, the work is split in two and each of the workers executes only 200 iterations from the epoch. As the training progresses, you will see the `loss` decrease, which indicates that the model is converging. At the end of the 10th epoch, we save the model to the shared volume. 741 | 742 | Press `Ctrl-C` to stop tailing the logs at any time. 743 | 744 | ## 5.6. Delete ElasticJob (*Optional*) 745 | If you wish to run another instance of the elastic job, please delete the current job first. 746 | 747 | Execute: 748 | ```console 749 | ./5-6-delete-pytorchjob.sh 750 | ``` 751 | 752 | Output: 753 | ``` 754 | Deleting PyTorchJob ... 755 | pytorchjob.kubeflow.org "cifar10-train" deleted 756 | ``` 757 | 758 | Note: when starting a new job instance if the workers fail to start with errors indicating failure to connect to the rendez-vous service, please delete the etcd pod as well before starting the elastic job. 759 | 760 | ## 6. Test Model using New Images 761 | 762 |

763 | 764 | Fig. 6.0 - Step 6 - Test model 765 |
766 | 767 | This step will be executed from directory `6-test-model`. 768 | 769 | ```console 770 | cd ../6-test-model 771 | ``` 772 | 773 | ### 6.1 Generate test job 774 | We are going to use a standard Kubernetes job manifest (as opposed to an ElasticJob manifest, which we used for training) since we do not need to run the test in a distributed manner. To generate the job manifest, execute: 775 | 776 | ```console 777 | ./6-1-generate-job.sh 778 | ``` 779 | 780 | Output: 781 | ``` 782 | Generating test job manifest ... 783 | apiVersion: batch/v1 784 | kind: Job 785 | metadata: 786 | name: cifar10-test 787 | spec: 788 | template: 789 | spec: 790 | restartPolicy: Never 791 | nodeSelector: 792 | beta.kubernetes.io/instance-type: c5.4xlarge 793 | containers: 794 | - name: test 795 | image: 042407962002.dkr.ecr.us-west-2.amazonaws.com/pytorch-cpu:latest 796 | imagePullPolicy: Always 797 | command: ["python3"] 798 | args: 799 | - "/workspace/cifar10-model-test.py" 800 | - "--model-file=/efs-shared/cifar10-model.pth" 801 | - "/efs-shared/cifar-10-batches-py/" 802 | volumeMounts: 803 | - name: efs-pv 804 | mountPath: /efs-shared 805 | volumes: 806 | - name: efs-pv 807 | persistentVolumeClaim: 808 | claimName: efs-pvc 809 | ``` 810 | 811 | As evident from the manifest above, we will create a single pod named **cifar10-test** and execute the `cifar10-model-test.py` script in it, passing the model file that we saved from the training step. 812 | 813 | ### 6.2. Launch job 814 | The test job will take 10,000 images that were not used during training and use the model to classify them. Then it will calculate accuracy measurements. 815 | 816 | Execute: 817 | ```console 818 | ./6-2-launch-job.sh 819 | ``` 820 | 821 | Output: 822 | ``` 823 | Launching test job ... 824 | job.batch/cifar10-test created 825 | ``` 826 | 827 | ### 6.3. Show job status 828 | When the job manifest is applied, a pod is created and runs to completion. To see the pod status, execute: 829 | 830 | ```console 831 | ./6-3-show-status.sh 832 | ``` 833 | 834 | Output: 835 | ``` 836 | Showing test job status ... 837 | cifar10-test-tlnjn 1/1 Running 0 4s 838 | ``` 839 | 840 | ### 6.4. Show test log 841 | The results from the test are written to the pod log. 842 | 843 | Execute: 844 | ```console 845 | ./6-4-show-log.sh 846 | ``` 847 | 848 | Output: 849 | ``` 850 | Showing cifar10-test log ... 851 | 852 | reading /efs-shared/cifar-10-batches-py/ 853 | loading model /efs-shared/cifar10-model.pth 854 | Accuracy of the network on the 10000 test images: 74 % 855 | Accuracy for class: plane is 77.5 % 856 | Accuracy for class: car is 85.2 % 857 | Accuracy for class: bird is 64.4 % 858 | Accuracy for class: cat is 53.3 % 859 | Accuracy for class: deer is 71.3 % 860 | Accuracy for class: dog is 66.6 % 861 | Accuracy for class: frog is 85.9 % 862 | Accuracy for class: horse is 80.5 % 863 | Accuracy for class: ship is 82.8 % 864 | Accuracy for class: truck is 82.4 % 865 | Finished Testing 866 | ``` 867 | 868 | As we can see the model classified the images into 10 different categories with overall accuracy of 74%. 869 | 870 | ### 6.5. Delete test job 871 | In the event that the test job needs to be run again for a different model, the old job needs to be deleted first. 872 | 873 | Execute: 874 | ```console 875 | ./6-5-delete-job.sh 876 | ``` 877 | 878 | Output: 879 | ``` 880 | Deleting test job ... 881 | job.batch "cifar10-test" deleted 882 | ``` 883 | 884 | ### 6.6. Optional exercise 885 | We have run distributed training on two nodes. Edit the autoscaling group to set the desired number of nodes to 4, then modify the configuration file `.env` to reflect the new number of nodes and re-run the training job. You will notice that the time to run 10 epochs decreases as the workload gets distributed among more nodes. 886 | 887 | ## 7. Cleanup (optional) 888 | 889 |

890 | 891 | Fig. 7.0 - Step 7 - Cleanup 892 |
893 | 894 | Optionally you can execute the scripts in the cleanup folder to delete the shared storage volume and the EKS cluster you created for this workshop. 895 | 896 | ```console 897 | cd ../7-cleanup 898 | ``` 899 | 900 | ### 7.1. Delete EFS volume 901 | The EFS file system needs to be deleted first since it is associated with subnets within the VPC used by the EKS cluster. 902 | 903 | Execute: 904 | ```console 905 | ./7-1-delete-efs.sh 906 | ``` 907 | 908 | Output: 909 | ``` 910 | Deleting EFS mount targets for File System fs-070041b9153fa56b8 ... 911 | Deleting mount target fsmt-02128c3560394ce31 912 | Deleting mount target fsmt-0f80225b4ba7580b0 913 | Deleting EFS file system fs-070041b9153fa56b8 ... 914 | 915 | Done. 916 | ``` 917 | 918 | Note: If an error occurs during the deletion of the file system, please wait for a minute and run the script again. The EFS file system can only be deleted after the mount targets are fully deleted. 919 | 920 | ### 7.2. Delete EKS cluster 921 | Performing this step deletes all of the remaining infrastructure that was used in this workshop. This includes the node groups, cluster, NAT gateways, subnets, and VPC. 922 | 923 | Execute: 924 | ```console 925 | ./7-2-delete-cluster.sh 926 | ``` 927 | 928 | Output: 929 | ``` 930 | Deleting cluster do-eks. Proceed? [Y/n]: Y 931 | Confirmed ... 932 | 2022-06-07 02:03:19 [ℹ] eksctl version 0.66.0 933 | 2022-06-07 02:03:19 [ℹ] using region us-west-2 934 | 2022-06-07 02:03:19 [ℹ] deleting EKS cluster "do-eks" 935 | 2022-06-07 02:03:20 [ℹ] deleted 0 Fargate profile(s) 936 | 2022-06-07 02:03:20 [✔] kubeconfig has been updated 937 | 2022-06-07 02:03:20 [ℹ] cleaning up AWS load balancers created by Kubernetes objects of Kind Service or Ingress 938 | 2022-06-07 02:03:27 [ℹ] 3 sequential tasks: { delete nodegroup "wks-node", 2 sequential sub-tasks: { 2 sequential sub-tasks: { delete IAM role for serviceaccount "kube-system/aws-node", delete serviceaccount "kube-system/aws-node" }, delete IAM OIDC provider }, delete cluster control plane "do-eks" [async] } 939 | 2022-06-07 02:03:27 [ℹ] will delete stack "eksctl-do-eks-nodegroup-wks-node" 940 | 2022-06-07 02:03:27 [ℹ] waiting for stack "eksctl-do-eks-nodegroup-wks-node" to get deleted 941 | 2022-06-07 02:03:27 [ℹ] waiting for CloudFormation stack "eksctl-do-eks-nodegroup-wks-node" 942 | ... 943 | 2022-06-07 02:07:17 [ℹ] waiting for CloudFormation stack "eksctl-do-eks-nodegroup-wks-node" 944 | 2022-06-07 02:07:17 [ℹ] will delete stack "eksctl-do-eks-addon-iamserviceaccount-kube-system-aws-node" 945 | 2022-06-07 02:07:17 [ℹ] waiting for stack "eksctl-do-eks-addon-iamserviceaccount-kube-system-aws-node" to get deleted 946 | 2022-06-07 02:07:17 [ℹ] waiting for CloudFormation stack "eksctl-do-eks-addon-iamserviceaccount-kube-system-aws-node" 947 | 2022-06-07 02:07:34 [ℹ] waiting for CloudFormation stack "eksctl-do-eks-addon-iamserviceaccount-kube-system-aws-node" 948 | 2022-06-07 02:07:34 [ℹ] deleted serviceaccount "kube-system/aws-node" 949 | 2022-06-07 02:07:34 [ℹ] will delete stack "eksctl-do-eks-cluster" 950 | 2022-06-07 02:07:34 [✔] all cluster resources were deleted 951 | Please note that the cluster will be fully deleted when the Cloud Formation stack completes its removal 952 | Only after the process in Cloud Formation is finished, you will be able to create a new cluster with the same name 953 | ``` 954 | 955 | # Conclusion 956 | Congratulations on completing this Distributed Model Training workshop! 957 | You now have experience with building and running a distributed model training architecture on AWS EKS. 958 | The techniques demonstrated here are generic and can be applied for your own distributed model trainig needs and at larger scale. 959 | 960 | # License 961 | This repository is released under the MIT-0 License. See the [LICENSE](LICENSE) file for details. 962 | 963 | # References 964 | * [Docker](https://docker.com) 965 | * [PyTorch](https://pytorch.org) 966 | * [Kubernetes](https://kubernetes.io) 967 | * [Amazon Web Services (AWS)](https://aws.amazon.com/) 968 | * [Amazon Elastic Kubernetes Service (EKS)](https://aws.amazon.com/eks) 969 | * [Do Framework](https://bit.ly/do-framework) 970 | * [Do EKS Project](https://github.com/aws-samples/aws-do-eks) 971 | -------------------------------------------------------------------------------- /SETUP.md: -------------------------------------------------------------------------------- 1 | ## 0. Prerequisites setup 2 | This document describes how to set up your AWS account and Cloud9 IDE, which will be used to execute all of the steps in the workshop. 3 | 4 | ### 0.1. Setup AWS Account 5 | For this workshop, you may use your own AWS account, or use an account generated by AWS Event Engine. If you are using your own AWS account, please proceed to Section 0.2. If you would like to receive a temporary AWS Account through AWS Event Engine follow these steps: 6 | 7 | 1. Go to the Event Engine link provided by your workshop host 8 | 2. Follow the on-screen instructions to gain access to your temporary AWS account 9 | 10 | Once you have logged in successfully, proceed to create an IAM user. 11 | 12 | ### 0.2. Create IAM user with admin rights 13 | Once logged into the account through the [AWS console](https://console.aws.amazon.com/console/home?region=us-west-2#), navigate to [IAM Users](https://console.aws.amazon.com/iamv2/home?#/users) and add a new user by clicking the **Add users** button and filling out the form as shown below. Use **inferentia_user** as the User Name. 14 |
15 | 16 |
17 | Fig. 0.1 - Add user screen 18 |
19 |
20 | 21 | Click **Next: Permissions** and click the **Create group** button on the screen. 22 | 23 |
24 | 25 |
26 | Fig. 0.2 - Set permissions screen 27 |
28 |
29 | 30 | Provide group name **admins** and select the **AdministratorAccess** policy as shown below. 31 | 32 |
33 | 34 |
35 | Fig. 0.3 - Create group 36 |
37 |
38 | 39 | Click the **Create group** button and you will be brought back to the **Set permissions** screen. Select the **admins** group as shown on the figure below, then click **Next: Tags** . 40 |
41 | 42 |
43 | Fig. 0.4 - Add user to admins group 44 |
45 | 46 | 47 | Follow the wizard through to the end to create the user (remaining options can be left as default). When the user is added successfully, you will see a confirmation screen from which you can copy the user's Access Key and Secret Access Key. 48 |
49 | 50 |
51 | Fig. 0.5 - Confirmation screen with access key information for new user 52 |
53 |
54 | 55 | Click the **Download .csv** button to download the user's credentials as a `.csv` file. Alternatively you can press the **Show** link and copy/paste the **Access key ID** and **Secret access key** locally. You will need to enter the crecentials later while you are completing the exercises in this workshop. This is the only time these credentials will be available for download or display. You will be able to generate new credentials if necessary. 56 | 57 | ### 0.3) Sign into the AWS Console 58 | In this step you will sign in to the AWS Console as the user you just created. 59 | Pull down the user menu from your current AWS Console screen and copy the Account number displayed next to **My Account** as shown on the figure below. 60 |
61 | 62 |
63 | Fig. 0.6 - Sign out of AWS Console 64 |
65 |
66 | 67 | Once you have copied the account number, cick **Sign Out**, then click **Sign In to the Console**. 68 | 69 |
70 | 71 |
72 | Fig. 0.7 - Sign in landing screen 73 |
74 |
75 | 76 | On the **Sign in** screen select **IAM user**, enter the **Account ID** that you just copied, and click Next. 77 | 78 |
79 | 80 |
81 | Fig. 0.8 - Sign in as IAM user 82 |
83 |
84 | 85 | When presented with the login screen shown below, fill in the IAM username and password that you created in the previous step. 86 | 87 | Next, click the **Sign in** button and sign in as the new IAM user. 88 | 89 | ### 0.4. Setup Cloud9 IDE 90 | 91 | Please verify that the `us-west-2` region **Oregon** is selected in your console and is showing in the upper right corner of your browser as highlighted in the figure below. We will use [Cloud9](https://us-west-2.console.aws.amazon.com/cloud9/home/product) to execute the steps in this workshop. To provision a Cloud9 IDE, click on the **Services** menu (from the top left of the screen) then select **Developer Tools** and choose **Cloud9**, or just open the following link to Cloud9. 92 | 93 |
94 | 95 |
96 | Fig. 0.9 - Cloud9 link 97 |
98 |
99 | 100 | Following the link will open the Cloud9 landing page. 101 | 102 |
103 | 104 |
105 | Fig. 0.10 - Cloud9 landing page 106 |
107 |
108 | 109 | Click on the `Create environment` button. 110 | 111 |
112 | 113 |
114 | Fig. 0.11 - Cloud9 name environment screen 115 |
116 |
117 | 118 | Type a name for your Cloud9 environment, then click `Next`. 119 | 120 |
121 | 122 |
123 | Fig. 0.12 - Cloud9 configure settings screen 124 |
125 |
126 | 127 | Under **Instance type** selcet `Other instance type` and `c5.9xlarge`. Then click **Next step** and **Create environment**. 128 | This will launch your Cloud9 instance. Provisioning of the instance can take a few minutes. 129 | 130 |
131 | 132 |
133 | Fig. 0.13 - Cloud9 instance 134 |
135 |
136 | 137 | The default Cloud9 instance comes with a root EBS volume that is only 10GB in size. 138 | 139 |
140 | 141 |
142 | Fig. 0.14 - Cloud9 Manage EC2 Instance 143 |
144 |
145 | 146 | We will increase the root volume size, to avoid running out of space later in the workshop. 147 | Click on the user icon in the upper-right corner and select **Manage EC2 Instance**. 148 | 149 |
150 | 151 |
152 | Fig. 0.15 - Cloud9 EC2 Instance Storage 153 |
154 |
155 | 156 | Select the instance, then click on the **Storage** tab and click on the link under **Volume ID** to select the current root volume. 157 | 158 |
159 | 160 |
161 | Fig. 0.16 - Cloud9 EC2 Instance Volume 162 |
163 |
164 | 165 | Select the volume, then click on Actions and select **Modify volume**. 166 | 167 |
168 | 169 |
170 | Fig. 0.16 - Cloud9 Modify Volume 171 |
172 |
173 | 174 | Increase the size of the volume by typing the desired size in the **Size (GiB)** field, then click **Modify**, and confirm. 175 | 176 |
177 | 178 |
179 | Fig. 0.17 - Cloud9 volume optimizing 180 |
181 |
182 | 183 | The volume status changes to **In-use - modifying** and in a few seconds becomes **In-use - optimizing**. As soon as the status changes to **optimizing** we need to reboot the instance in order for the resized volume to become available in Cloud9. 184 | 185 |
186 | 187 |
188 | Fig. 0.18 - Cloud9 reboot instance 189 |
190 |
191 | 192 | To reboot the instance, select **Instances** from the console navigation menu, then highlight the instance and select **Instance state -> Reboot**. 193 | 194 |
195 | 196 |
197 | Fig. 0.19 - Cloud9 IDE with resized volume 198 |
199 |
200 | 201 | Once the instance is restarted, refresh the Cloud9 IDE window and type `df -h` in the terminal window. You should see that the root volume has the size you specified earlier. 202 | 203 | Open the IDE Preferences by clicking on the settings icon in the upper-right corner of the screen, or by clicking the Cloud9 icon in the menu and selecting Preferences. Scroll the list of preferences down and selct the `AWS Settings` section. Disable the `AWS managed temporary credentials` setting as shown below. 204 | 205 |
206 | 207 |
208 | Fig. 0.20 - Disable Cloud9 IDE `AWS managed temporary credentials` setting 209 |
210 |
211 | 212 | 213 | ### 0.5 Clone workshop repository 214 | 215 | ``` 216 | git clone https://github.com/aws-samples/aws-distributed-training-workshop-eks.git 217 | ``` 218 | 219 | and 220 | 221 | ``` 222 | cd aws-distributed-training-workshop-eks 223 | ``` 224 | 225 | Your Cloud9 work environment is now completely set up and you are ready to dive into the [Distributed Model Training Workshop for AWS EKS](README.md). 226 | -------------------------------------------------------------------------------- /THIRD-PARTY-LICENSES: -------------------------------------------------------------------------------- 1 | The AWS Distributed Training Workshop for EKS includes the following third-party software/licensing: 2 | 3 | ** PyTorch - https://pytorch.org 4 | 5 | License URL: https://github.com/pytorch/pytorch/blob/master/LICENSE 6 | 7 | From PyTorch: 8 | 9 | Copyright (c) 2016- Facebook, Inc (Adam Paszke) 10 | Copyright (c) 2014- Facebook, Inc (Soumith Chintala) 11 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) 12 | Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) 13 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) 14 | Copyright (c) 2011-2013 NYU (Clement Farabet) 15 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) 16 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio) 17 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) 18 | 19 | From Caffe2: 20 | 21 | Copyright (c) 2016-present, Facebook Inc. All rights reserved. 22 | 23 | All contributions by Facebook: 24 | Copyright (c) 2016 Facebook Inc. 25 | 26 | All contributions by Google: 27 | Copyright (c) 2015 Google Inc. 28 | All rights reserved. 29 | 30 | All contributions by Yangqing Jia: 31 | Copyright (c) 2015 Yangqing Jia 32 | All rights reserved. 33 | 34 | All contributions by Kakao Brain: 35 | Copyright 2019-2020 Kakao Brain 36 | 37 | All contributions by Cruise LLC: 38 | Copyright (c) 2022 Cruise LLC. 39 | All rights reserved. 40 | 41 | All contributions from Caffe: 42 | Copyright(c) 2013, 2014, 2015, the respective contributors 43 | All rights reserved. 44 | 45 | All other contributions: 46 | Copyright(c) 2015, 2016 the respective contributors 47 | All rights reserved. 48 | 49 | Caffe2 uses a copyright model similar to Caffe: each contributor holds 50 | copyright over their contributions to Caffe2. The project versioning records 51 | all such contribution and copyright details. If a contributor wants to further 52 | mark their specific copyright on a particular contribution, they should 53 | indicate their copyright solely in the commit message of the change when it is 54 | committed. 55 | 56 | All rights reserved. 57 | 58 | Redistribution and use in source and binary forms, with or without 59 | modification, are permitted provided that the following conditions are met: 60 | 61 | 1. Redistributions of source code must retain the above copyright 62 | notice, this list of conditions and the following disclaimer. 63 | 64 | 2. Redistributions in binary form must reproduce the above copyright 65 | notice, this list of conditions and the following disclaimer in the 66 | documentation and/or other materials provided with the distribution. 67 | 68 | 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America 69 | and IDIAP Research Institute nor the names of its contributors may be 70 | used to endorse or promote products derived from this software without 71 | specific prior written permission. 72 | 73 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 74 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 75 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 76 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 77 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 78 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 79 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 80 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 81 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 82 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 83 | POSSIBILITY OF SUCH DAMAGE. 84 | 85 | ------------------------------------------------------------ 86 | 87 | ** TorchElastic - https://github.com/pytorch/elastic/ 88 | 89 | https://github.com/pytorch/elastic/blob/master/LICENSE 90 | 91 | BSD 3-Clause License 92 | 93 | Copyright (c) 2019-present, Facebook, Inc. 94 | All rights reserved. 95 | 96 | Redistribution and use in source and binary forms, with or without 97 | modification, are permitted provided that the following conditions are met: 98 | 99 | * Redistributions of source code must retain the above copyright notice, this 100 | list of conditions and the following disclaimer. 101 | 102 | * Redistributions in binary form must reproduce the above copyright notice, 103 | this list of conditions and the following disclaimer in the documentation 104 | and/or other materials provided with the distribution. 105 | 106 | * Neither the name of the copyright holder nor the names of its 107 | contributors may be used to endorse or promote products derived from 108 | this software without specific prior written permission. 109 | 110 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 111 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 112 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 113 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 114 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 115 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 116 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 117 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 118 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 119 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 120 | 121 | ------------------------------------------------------------ 122 | ** kubetail - https://github.com/johanhaleby/kubetail 123 | ** kubectx - https://github.com/ahmetb/kubectx 124 | ** yq - https://github.com/kislyuk/yq 125 | ** Kubernetes metrics server - https://github.com/kubernetes-sigs/metrics-server 126 | 127 | Apache License 128 | Version 2.0, January 2004 129 | http://www.apache.org/licenses/ 130 | 131 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 132 | 133 | 1. Definitions. 134 | 135 | "License" shall mean the terms and conditions for use, reproduction, 136 | and distribution as defined by Sections 1 through 9 of this document. 137 | 138 | "Licensor" shall mean the copyright owner or entity authorized by 139 | the copyright owner that is granting the License. 140 | 141 | "Legal Entity" shall mean the union of the acting entity and all 142 | other entities that control, are controlled by, or are under common 143 | control with that entity. For the purposes of this definition, 144 | "control" means (i) the power, direct or indirect, to cause the 145 | direction or management of such entity, whether by contract or 146 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 147 | outstanding shares, or (iii) beneficial ownership of such entity. 148 | 149 | "You" (or "Your") shall mean an individual or Legal Entity 150 | exercising permissions granted by this License. 151 | 152 | "Source" form shall mean the preferred form for making modifications, 153 | including but not limited to software source code, documentation 154 | source, and configuration files. 155 | 156 | "Object" form shall mean any form resulting from mechanical 157 | transformation or translation of a Source form, including but 158 | not limited to compiled object code, generated documentation, 159 | and conversions to other media types. 160 | 161 | "Work" shall mean the work of authorship, whether in Source or 162 | Object form, made available under the License, as indicated by a 163 | copyright notice that is included in or attached to the work 164 | (an example is provided in the Appendix below). 165 | 166 | "Derivative Works" shall mean any work, whether in Source or Object 167 | form, that is based on (or derived from) the Work and for which the 168 | editorial revisions, annotations, elaborations, or other modifications 169 | represent, as a whole, an original work of authorship. For the purposes 170 | of this License, Derivative Works shall not include works that remain 171 | separable from, or merely link (or bind by name) to the interfaces of, 172 | the Work and Derivative Works thereof. 173 | 174 | "Contribution" shall mean any work of authorship, including 175 | the original version of the Work and any modifications or additions 176 | to that Work or Derivative Works thereof, that is intentionally 177 | submitted to Licensor for inclusion in the Work by the copyright owner 178 | or by an individual or Legal Entity authorized to submit on behalf of 179 | the copyright owner. For the purposes of this definition, "submitted" 180 | means any form of electronic, verbal, or written communication sent 181 | to the Licensor or its representatives, including but not limited to 182 | communication on electronic mailing lists, source code control systems, 183 | and issue tracking systems that are managed by, or on behalf of, the 184 | Licensor for the purpose of discussing and improving the Work, but 185 | excluding communication that is conspicuously marked or otherwise 186 | designated in writing by the copyright owner as "Not a Contribution." 187 | 188 | "Contributor" shall mean Licensor and any individual or Legal Entity 189 | on behalf of whom a Contribution has been received by Licensor and 190 | subsequently incorporated within the Work. 191 | 192 | 2. Grant of Copyright License. Subject to the terms and conditions of 193 | this License, each Contributor hereby grants to You a perpetual, 194 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 195 | copyright license to reproduce, prepare Derivative Works of, 196 | publicly display, publicly perform, sublicense, and distribute the 197 | Work and such Derivative Works in Source or Object form. 198 | 199 | 3. Grant of Patent License. Subject to the terms and conditions of 200 | this License, each Contributor hereby grants to You a perpetual, 201 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 202 | (except as stated in this section) patent license to make, have made, 203 | use, offer to sell, sell, import, and otherwise transfer the Work, 204 | where such license applies only to those patent claims licensable 205 | by such Contributor that are necessarily infringed by their 206 | Contribution(s) alone or by combination of their Contribution(s) 207 | with the Work to which such Contribution(s) was submitted. If You 208 | institute patent litigation against any entity (including a 209 | cross-claim or counterclaim in a lawsuit) alleging that the Work 210 | or a Contribution incorporated within the Work constitutes direct 211 | or contributory patent infringement, then any patent licenses 212 | granted to You under this License for that Work shall terminate 213 | as of the date such litigation is filed. 214 | 215 | 4. Redistribution. You may reproduce and distribute copies of the 216 | Work or Derivative Works thereof in any medium, with or without 217 | modifications, and in Source or Object form, provided that You 218 | meet the following conditions: 219 | 220 | (a) You must give any other recipients of the Work or 221 | Derivative Works a copy of this License; and 222 | 223 | (b) You must cause any modified files to carry prominent notices 224 | stating that You changed the files; and 225 | 226 | (c) You must retain, in the Source form of any Derivative Works 227 | that You distribute, all copyright, patent, trademark, and 228 | attribution notices from the Source form of the Work, 229 | excluding those notices that do not pertain to any part of 230 | the Derivative Works; and 231 | 232 | (d) If the Work includes a "NOTICE" text file as part of its 233 | distribution, then any Derivative Works that You distribute must 234 | include a readable copy of the attribution notices contained 235 | within such NOTICE file, excluding those notices that do not 236 | pertain to any part of the Derivative Works, in at least one 237 | of the following places: within a NOTICE text file distributed 238 | as part of the Derivative Works; within the Source form or 239 | documentation, if provided along with the Derivative Works; or, 240 | within a display generated by the Derivative Works, if and 241 | wherever such third-party notices normally appear. The contents 242 | of the NOTICE file are for informational purposes only and 243 | do not modify the License. You may add Your own attribution 244 | notices within Derivative Works that You distribute, alongside 245 | or as an addendum to the NOTICE text from the Work, provided 246 | that such additional attribution notices cannot be construed 247 | as modifying the License. 248 | 249 | You may add Your own copyright statement to Your modifications and 250 | may provide additional or different license terms and conditions 251 | for use, reproduction, or distribution of Your modifications, or 252 | for any such Derivative Works as a whole, provided Your use, 253 | reproduction, and distribution of the Work otherwise complies with 254 | the conditions stated in this License. 255 | 256 | 5. Submission of Contributions. Unless You explicitly state otherwise, 257 | any Contribution intentionally submitted for inclusion in the Work 258 | by You to the Licensor shall be under the terms and conditions of 259 | this License, without any additional terms or conditions. 260 | Notwithstanding the above, nothing herein shall supersede or modify 261 | the terms of any separate license agreement you may have executed 262 | with Licensor regarding such Contributions. 263 | 264 | 6. Trademarks. This License does not grant permission to use the trade 265 | names, trademarks, service marks, or product names of the Licensor, 266 | except as required for reasonable and customary use in describing the 267 | origin of the Work and reproducing the content of the NOTICE file. 268 | 269 | 7. Disclaimer of Warranty. Unless required by applicable law or 270 | agreed to in writing, Licensor provides the Work (and each 271 | Contributor provides its Contributions) on an "AS IS" BASIS, 272 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 273 | implied, including, without limitation, any warranties or conditions 274 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 275 | PARTICULAR PURPOSE. You are solely responsible for determining the 276 | appropriateness of using or redistributing the Work and assume any 277 | risks associated with Your exercise of permissions under this License. 278 | 279 | 8. Limitation of Liability. In no event and under no legal theory, 280 | whether in tort (including negligence), contract, or otherwise, 281 | unless required by applicable law (such as deliberate and grossly 282 | negligent acts) or agreed to in writing, shall any Contributor be 283 | liable to You for damages, including any direct, indirect, special, 284 | incidental, or consequential damages of any character arising as a 285 | result of this License or out of the use or inability to use the 286 | Work (including but not limited to damages for loss of goodwill, 287 | work stoppage, computer failure or malfunction, or any and all 288 | other commercial damages or losses), even if such Contributor 289 | has been advised of the possibility of such damages. 290 | 291 | 9. Accepting Warranty or Additional Liability. While redistributing 292 | the Work or Derivative Works thereof, You may choose to offer, 293 | and charge a fee for, acceptance of support, warranty, indemnity, 294 | or other liability obligations and/or rights consistent with this 295 | License. However, in accepting such obligations, You may act only 296 | on Your own behalf and on Your sole responsibility, not on behalf 297 | of any other Contributor, and only if You agree to indemnify, 298 | defend, and hold each Contributor harmless for any liability 299 | incurred by, or claims asserted against, such Contributor by reason 300 | of your accepting any such warranty or additional liability. 301 | 302 | END OF TERMS AND CONDITIONS 303 | 304 | ------------------------------------------------------------ 305 | 306 | ** jq - https://github.com/stedolan/jq 307 | 308 | jq is copyright (C) 2012 Stephen Dolan 309 | 310 | Permission is hereby granted, free of charge, to any person obtaining 311 | a copy of this software and associated documentation files (the 312 | "Software"), to deal in the Software without restriction, including 313 | without limitation the rights to use, copy, modify, merge, publish, 314 | distribute, sublicense, and/or sell copies of the Software, and to 315 | permit persons to whom the Software is furnished to do so, subject to 316 | the following conditions: 317 | 318 | The above copyright notice and this permission notice shall be 319 | included in all copies or substantial portions of the Software. 320 | 321 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 322 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 323 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 324 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 325 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 326 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 327 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 328 | 329 | 330 | 331 | jq's documentation (everything found under the docs/ subdirectory in 332 | the source tree) is licensed under the Creative Commons CC BY 3.0 333 | license, which can be found at: 334 | 335 | https://creativecommons.org/licenses/by/3.0/ 336 | 337 | The documentation website includes a copy of Twitter's Boostrap and 338 | relies on Bonsai, Liquid templates and various other projects, look 339 | them up for detailed licensing conditions. 340 | 341 | 342 | 343 | jq incorporates David M. Gay's dtoa.c and g_fmt.c, which bear the 344 | following notices: 345 | 346 | dtoa.c: 347 | The author of this software is David M. Gay. 348 | 349 | Copyright (c) 1991, 2000, 2001 by Lucent Technologies. 350 | 351 | Permission to use, copy, modify, and distribute this software for any 352 | purpose without fee is hereby granted, provided that this entire notice 353 | is included in all copies of any software which is or includes a copy 354 | or modification of this software and in all copies of the supporting 355 | documentation for such software. 356 | 357 | THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED 358 | WARRANTY. IN PARTICULAR, NEITHER THE AUTHOR NOR LUCENT MAKES ANY 359 | REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY 360 | OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. 361 | 362 | g_fmt.c: 363 | The author of this software is David M. Gay. 364 | 365 | Copyright (c) 1991, 1996 by Lucent Technologies. 366 | 367 | Permission to use, copy, modify, and distribute this software for any 368 | purpose without fee is hereby granted, provided that this entire notice 369 | is included in all copies of any software which is or includes a copy 370 | or modification of this software and in all copies of the supporting 371 | documentation for such software. 372 | 373 | THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED 374 | WARRANTY. IN PARTICULAR, NEITHER THE AUTHOR NOR LUCENT MAKES ANY 375 | REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY 376 | OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. 377 | 378 | 379 | 380 | jq uses parts of the open source C library "decNumber", which is distribured 381 | under the following license: 382 | 383 | 384 | ICU License - ICU 1.8.1 and later 385 | 386 | COPYRIGHT AND PERMISSION NOTICE 387 | 388 | Copyright (c) 1995-2005 International Business Machines Corporation and others 389 | All rights reserved. 390 | 391 | Permission is hereby granted, free of charge, to any person obtaining a 392 | copy of this software and associated documentation files (the 393 | "Software"), to deal in the Software without restriction, including 394 | without limitation the rights to use, copy, modify, merge, publish, 395 | distribute, and/or sell copies of the Software, and to permit persons 396 | to whom the Software is furnished to do so, provided that the above 397 | copyright notice(s) and this permission notice appear in all copies of 398 | the Software and that both the above copyright notice(s) and this 399 | permission notice appear in supporting documentation. 400 | 401 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 402 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 403 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 404 | OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 405 | HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL 406 | INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING 407 | FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, 408 | NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION 409 | WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 410 | 411 | Except as contained in this notice, the name of a copyright holder 412 | shall not be used in advertising or otherwise to promote the sale, use 413 | or other dealings in this Software without prior written authorization 414 | of the copyright holder. 415 | 416 | Portions Copyright (c) 2016 Kungliga Tekniska Högskolan 417 | (Royal Institute of Technology, Stockholm, Sweden). 418 | All rights reserved. 419 | 420 | Redistribution and use in source and binary forms, with or without 421 | modification, are permitted provided that the following conditions 422 | are met: 423 | 424 | 1. Redistributions of source code must retain the above copyright 425 | notice, this list of conditions and the following disclaimer. 426 | 427 | 2. Redistributions in binary form must reproduce the above copyright 428 | notice, this list of conditions and the following disclaimer in the 429 | documentation and/or other materials provided with the distribution. 430 | 431 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 432 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 433 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 434 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 435 | COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 436 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 437 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 438 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 439 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 440 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 441 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 442 | OF THE POSSIBILITY OF SUCH DAMAGE. 443 | 444 | -------------------------------------------------------------------------------- /img/aws-console-cloud9-link.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/aws-console-cloud9-link.png -------------------------------------------------------------------------------- /img/aws-console-my-account.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/aws-console-my-account.png -------------------------------------------------------------------------------- /img/aws-console-signin-iam-user.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/aws-console-signin-iam-user.png -------------------------------------------------------------------------------- /img/aws-console-signin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/aws-console-signin.png -------------------------------------------------------------------------------- /img/cloud9-configure-settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-configure-settings.png -------------------------------------------------------------------------------- /img/cloud9-credentials-dialog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-credentials-dialog.png -------------------------------------------------------------------------------- /img/cloud9-credentials-disable.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-credentials-disable.png -------------------------------------------------------------------------------- /img/cloud9-ide-manage-ec2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-ide-manage-ec2.png -------------------------------------------------------------------------------- /img/cloud9-instance-storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-instance-storage.png -------------------------------------------------------------------------------- /img/cloud9-landing-page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-landing-page.png -------------------------------------------------------------------------------- /img/cloud9-managed-credentials.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-managed-credentials.png -------------------------------------------------------------------------------- /img/cloud9-modify-volume.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-modify-volume.png -------------------------------------------------------------------------------- /img/cloud9-name-environment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-name-environment.png -------------------------------------------------------------------------------- /img/cloud9-reboot-instance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-reboot-instance.png -------------------------------------------------------------------------------- /img/cloud9-resized-volume.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-resized-volume.png -------------------------------------------------------------------------------- /img/cloud9-volume-actions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-volume-actions.png -------------------------------------------------------------------------------- /img/cloud9-volume-optimizing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/cloud9-volume-optimizing.png -------------------------------------------------------------------------------- /img/iam-add-group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/iam-add-group.png -------------------------------------------------------------------------------- /img/iam-add-user-access-key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/iam-add-user-access-key.png -------------------------------------------------------------------------------- /img/iam-add-user-admins-group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/iam-add-user-admins-group.png -------------------------------------------------------------------------------- /img/iam-add-user.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/iam-add-user.png -------------------------------------------------------------------------------- /img/iam-create-group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/iam-create-group.png -------------------------------------------------------------------------------- /img/step-1-create-cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/step-1-create-cluster.png -------------------------------------------------------------------------------- /img/step-2-create-volume.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/step-2-create-volume.png -------------------------------------------------------------------------------- /img/step-3-build-container.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/step-3-build-container.png -------------------------------------------------------------------------------- /img/step-4-get-data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/step-4-get-data.png -------------------------------------------------------------------------------- /img/step-5-train-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/step-5-train-model.png -------------------------------------------------------------------------------- /img/step-6-test-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/step-6-test-model.png -------------------------------------------------------------------------------- /img/step-7-cleanup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/step-7-cleanup.png -------------------------------------------------------------------------------- /img/workshop-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-distributed-training-workshop-eks/5fb3f4e1887d9c9236f5bf9c6d744e88190bd53b/img/workshop-architecture.png --------------------------------------------------------------------------------