├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── amd-neuron-provisioner.yaml ├── amd-nvidia-provisioner.yaml ├── app ├── Dockerfile-assets ├── Dockerfile.template ├── appsimulator.sh ├── assemble_multiarch_image.sh ├── bert_gpu_benchmark.py ├── build-assets.sh ├── build.sh ├── call-model.sh ├── compile-model.sh ├── compile-sd2.py ├── config ├── create_node_port_svc.sh ├── diffusion_benchmarker-0.0.1.tar.gz ├── gpt2_gpu_benchmark.py ├── node_port_svc_template.yaml ├── optimum-neuron-sd2-compile.sh ├── optimum-neuron.py ├── optimum-neuron.sh ├── run-model.sh ├── run-sd2.py ├── run.py ├── run1-model.sh ├── run1.py ├── sd2_512_benchmark.py ├── sd2_512_compile.py └── start.sh ├── appsimulator_sa.yaml ├── infer-in-region-g5.png ├── infer-in-region.png ├── infra-build ├── README.md ├── deploy-cluster.sh ├── eks-cluster-stack.ts ├── eks-cluster.ts └── vpc-resource-provider.ts ├── model-ci-build ├── README.md ├── deploy-pipeline.sh ├── pipeline-stack.ts └── pipeline.ts ├── neuron-top.png ├── sd2-512-cuda-compile-job.yaml ├── sd2-512-cuda-serve-deploy.yaml ├── sd2-512-xla-compile-job.yaml └── sd2-512-xla-serve-deploy.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | infra-build/package* 3 | infra-build/cdk* 4 | infra-build/node_modules* 5 | .github 6 | .idea 7 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/aws-samples/edge_diffusion_on_eks/issues), or [recently closed](https://github.com/aws-samples/edge_diffusion_on_eks/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. 
You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws-samples/spotable-game-server/labels/help%20wanted) issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](https://github.com/aws-samples/edge_diffusion_on_eks/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # edge_diffusion inferences 2 | Images, audio, and video content in augmented reality (AR) applications must be generated within milliseconds. AR applications therefore generate digital content on-device, but quality is limited by device capabilities. Content created on a remote server with enough resources, by contrast, can be served in sub-seconds. As on-device models grow richer, this trend pushes inference back to the cloud, within the low-latency window that cloud edge services such as CDNs and Local Zones offer. 3 | 4 | This example shows how AR app developers can decouple content quality from device hardware by hosting models like Stable Diffusion by Stability AI on NVIDIA or Neuron-based AI accelerators as close to the user device as possible. 5 | 6 | You compile and deploy Stable Diffusion 2.1 on EKS in a Local Zone to 1/ reduce deploy time by caching the model's 20GB of graph artifacts in the Local Zone: the compiled model is stored on S3 and loaded with an InitContainer prior to endpoint startup; 2/ simplify a secured network path between the user device and the remote server with a K8s NodePort service; and finally 3/ run the model on any compatible and available AI accelerator. 7 | 8 | [build-time] This sample starts with the build pipeline that compiles the PyTorch code into optimized, lower-level, hardware-specific code to accelerate inference on GPU and Neuron-enabled instances. The model compiler utilizes Neuron (torch_neuronx) or GPU-specific features such as mixed-precision support, performance-optimized kernels, and minimized communication between the CPU and the AI accelerator. The output Docker images are stored in regional image registries (Amazon ECR), ready to deploy. We use Volcano, a Kubernetes-native batch scheduler, to improve inference pipeline orchestration. 9 | 10 | /*The build phase compiles the model and stores it in S3. In [Dockerfile-assets](./app/Dockerfile-assets), models are pulled from S3 and stored as Docker image layers, i.e., the Neuron model is pulled for Inf2 images and the CUDA model for GPU images with the same Dockerfile. Note that using an `if` statement in a `RUN` instruction would not cache the model layer; in our case that layer is created by the line `RUN wget https://sdinfer.s3.us-west-2.amazonaws.com/sd2_compile_dir_512_${VAR}.tar.gz -O /model.tar.gz`. 11 | 12 | 13 | ``` 14 | ARG ai_chip 15 | 16 | FROM public.ecr.aws/docker/library/python:latest as base 17 | 18 | FROM base AS assets-amd64-cuda 19 | ENV VAR=cuda 20 | 21 | FROM base AS assets-amd64-neuron 22 | ENV VAR=xla 23 | 24 | FROM assets-${ai_chip} AS final 25 | RUN wget https://sdinfer.s3.us-west-2.amazonaws.com/sd2_compile_dir_512_${VAR}.tar.gz -O /model.tar.gz 26 | ``` 27 | */ 28 | The next build stage then starts from the relevant [AWS deep-learning containers](https://github.com/aws/deep-learning-containers/blob/master/available_images.md), which provide the SDK binaries.
Specifically, we used: 29 | `763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:2.0.1-gpu-py310-cu118-ubuntu20.04-ec2` for G5 instances and `763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference-neuronx:1.13.1-transformers4.34.1-neuronx-py310-sdk2.15.0-ubuntu20.04` for Inf2 instances. 30 | 31 | [deploy-time] Next, EKS will instantiate the Docker image on EC2 instances launched by Karpenter based on availability, performance, and cost policies. The inference endpoint uses a NodePort-based K8s service endpoint behind an EC2 security group. Each available endpoint is published to an inference endpoint inventory that is pulled by the user device for ad-hoc inference. 32 | 33 | [run-time] KEDA controls the K8s Deployment size based on AI accelerator usage at run-time. Karpenter then terminates the unused nodes to reclaim compute capacity. 34 | 35 | ## Setup 36 | * Install the cdk8s CLI 37 | ```bash 38 | npm install -g cdk8s-cli 39 | ``` 40 | * [Create EKS cluster and deploy Karpenter](https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/) 41 | * Use the Service Quotas console to raise the Amazon Elastic Compute Cloud (Amazon EC2) "Running On-Demand Inf instances" and "Running On-Demand G and VT instances" limits. 42 | 43 | * Deploy the [NVIDIA device plugin for Kubernetes](https://github.com/NVIDIA/k8s-device-plugin) 44 | ```bash 45 | kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.1/nvidia-device-plugin.yml 46 | ``` 47 | * Deploy the [Neuron device plugin for Kubernetes](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/containers/tutorials/k8s-setup.html#tutorial-k8s-env-setup-for-neuron) 48 | 49 | ## Build multi-arch CPU and accelerator image 50 | The build process creates OCI images for x86-based instances. You add another build step to create OCI images for Graviton-based instances. This new build process creates an OCI image manifest list that references both OCI images. The container runtime (Docker Engine or containerd) will pull the correct platform-specific image at deployment time. To automate the OCI image build process, we use AWS CodePipeline. AWS CodePipeline starts by building an OCI image from the code in AWS CodeBuild, which is pushed to Amazon Elastic Container Registry (Amazon ECR).
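A minimal sketch of how the per-architecture images are assembled into a single manifest list, mirroring [assemble_multiarch_image.sh](./app/assemble_multiarch_image.sh) (the `AWS_*`, `BASE_REPO`, and `BASE_IMAGE_*` variables are assumed to be provided by the pipeline environment):

```bash
# Tags of the per-architecture images pushed by the earlier CodeBuild stages
BASE_IMAGE=$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$BASE_REPO:$BASE_IMAGE_TAG
BASE_ARM_IMAGE=$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$BASE_REPO:$BASE_IMAGE_ARM_TAG
BASE_AMD_IMAGE=$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$BASE_REPO:$BASE_IMAGE_AMD_TAG

# Authenticate to the private ECR registry
aws ecr get-login-password --region $AWS_REGION | docker login --username AWS --password-stdin $BASE_IMAGE

# Create a manifest list that references both images and push it; the container
# runtime resolves the right image for each node's architecture at pull time.
docker manifest create $BASE_IMAGE --amend $BASE_ARM_IMAGE --amend $BASE_AMD_IMAGE
docker manifest push $BASE_IMAGE
```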
51 | 52 | * [Deploy the CI-pipeline of the Stable Diffusion image](./model-ci-build) 53 | 54 | ## Deploy the inference pipeline 55 | * Deploy Karpenter NodePools for Inf2 and G instances 56 | ```bash 57 | kubectl apply -f amd-nvidia-provisioner.yaml 58 | kubectl apply -f amd-neuron-provisioner.yaml 59 | ``` 60 | 61 | The model file is stored in S3 between compiling the model and deploying it as a Docker asset image, so you need to grant access to S3 via a K8s service account 62 | ```bash 63 | kubectl apply -f appsimulator_sa.yaml 64 | ``` 65 | TBD - need to set up EKS Pod Identities or IRSA 66 | ```bash 67 | aws iam create-policy --policy-name allow-access-to-model-assets --policy-document file://allow-access-to-model-assets.json 68 | eksctl create iamserviceaccount --name appsimulator --namespace default --cluster tlvsummit-demo --role-name appsimulator \ 69 | --attach-policy-arn arn:aws:iam::891377065549:policy/allow-access-to-model-assets --approve 70 | ``` 71 | 72 | * Compile the model in a region (batch/v1 Job) 73 | ```bash 74 | kubectl apply -f sd2-512-cuda-compile-job.yaml 75 | kubectl apply -f sd2-512-xla-compile-job.yaml 76 | ``` 77 | * Deploy the model in a region (apps/v1 Deployment) 78 | ```bash 79 | kubectl apply -f sd2-512-xla-serve-deploy.yaml 80 | ``` 81 | 82 | * Discover the inference endpoint 83 | ```bash 84 | kubectl get svc 85 | ``` 86 | e.g., 87 | ``` 88 | $ kubectl get svc 89 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 90 | kubernetes ClusterIP 10.100.0.1 443/TCP 64d 91 | stablediffusion-serve-inf-56dbffc68c-zcphj-svc-18-246-11-46 NodePort 10.100.228.62 7860:32697/TCP 2d20h 92 | ``` 93 | The endpoint is `http://18.246.11.46:32697/`. Observe the AI chip utilization with, e.g., neuron-top: 94 | 95 | ```bash 96 | kubectl exec -it stablediffusion-serve-inf-56dbffc68c-zcphj -- neuron-top 97 | ``` 98 | Fill in the prompt and enjoy the generated images. Note the processing time; we will need it for the Local Zone case. 99 | ![neuron-top](./neuron-top.png) 100 | ![inferenced-image](./infer-in-region.png) 101 | 102 | * Deploy the inference endpoint with NVIDIA G5 (G4dn is not supported by Stable Diffusion) 103 | ```bash 104 | kubectl apply -f sd2-512-cuda-serve-deploy.yaml 105 | ``` 106 | Wait a few minutes for node provisioning and pod startup, then discover the new service 107 | ```bash 108 | kubectl get svc 109 | ``` 110 | e.g., 111 | ``` 112 | kubectl get svc 113 | NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE 114 | kubernetes ClusterIP 10.100.0.1 443/TCP 66d 115 | stablediffusion-serve-gpu-857c86776d-2wpb6-svc-35-90-0-175 NodePort 10.100.117.207 7860:31071/TCP 9m18s 116 | stablediffusion-serve-inf-56dbffc68c-zcphj-svc-18-246-11-46 NodePort 10.100.228.62 7860:32697/TCP 4d17h 117 | ``` 118 | The relevant service is `stablediffusion-serve-gpu-857c86776d-2wpb6-svc-35-90-0-175`. The endpoint is `http://35.90.0.175:31071`. 119 | 120 | Observe the NVIDIA GPU usage while generating an image: 121 | 122 | ```bash 123 | watch kubectl exec -it stablediffusion-serve-gpu-857c86776d-2wpb6 -- nvidia-smi 124 | 125 | Fri Dec 1 16:50:41 2023 126 | +---------------------------------------------------------------------------------------+ 127 | | NVIDIA-SMI 535.54.03 Driver Version: 535.54.03 CUDA Version: 12.2 | 128 | |-----------------------------------------+----------------------+----------------------+ 129 | | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | 130 | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | 131 | | | | MIG M.
| 132 | |=========================================+======================+======================| 133 | | 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 | 134 | | 0% 33C P0 222W / 300W | 3930MiB / 23028MiB | 99% Default | 135 | | | | N/A | 136 | +-----------------------------------------+----------------------+----------------------+ 137 | | 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 | 138 | | 0% 16C P8 18W / 300W | 7MiB / 23028MiB | 0% Default | 139 | | | | N/A | 140 | +-----------------------------------------+----------------------+----------------------+ 141 | | 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 | 142 | | 0% 17C P8 15W / 300W | 7MiB / 23028MiB | 0% Default | 143 | | | | N/A | 144 | +-----------------------------------------+----------------------+----------------------+ 145 | | 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 | 146 | | 0% 16C P8 9W / 300W | 7MiB / 23028MiB | 0% Default | 147 | | | | N/A | 148 | +-----------------------------------------+----------------------+----------------------+ 149 | 150 | +---------------------------------------------------------------------------------------+ 151 | | Processes: | 152 | | GPU GI CI PID Type Process name GPU Memory | 153 | | ID ID Usage | 154 | |=======================================================================================| 155 | +---------------------------------------------------------------------------------------+ 156 | ``` 157 | Note the first GPU core and memory utilization. 158 | 159 | ![infer-in-region-on-g5](./infer-in-region-g5.png) 160 | 161 | * Deploy node pools on LocalZone 162 | TBD 163 | -------------------------------------------------------------------------------- /amd-neuron-provisioner.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.sh/v1alpha5 2 | kind: Provisioner 3 | metadata: 4 | name: amd-neuron 5 | spec: 6 | requirements: 7 | - key: karpenter.k8s.aws/instance-family 8 | operator: In 9 | values: ["inf2"] 10 | - key: kubernetes.io/arch 11 | operator: In 12 | values: ["amd64"] 13 | limits: 14 | providerRef: 15 | name: amd-neuron 16 | ttlSecondsAfterEmpty: 30 17 | --- 18 | apiVersion: karpenter.k8s.aws/v1alpha1 19 | kind: AWSNodeTemplate 20 | metadata: 21 | name: amd-neuron 22 | spec: 23 | subnetSelector: 24 | karpenter.sh/subnet/discovery: ai-usw2-public 25 | securityGroupSelector: 26 | karpenter.sh/discovery: ai-usw2 27 | # amiSelector: 28 | # aws::ids: "ami-051f84ff16e7d78c4" 29 | tags: 30 | InstanceType: "inferentia" 31 | blockDeviceMappings: 32 | - deviceName: /dev/xvda 33 | ebs: 34 | volumeSize: 300Gi 35 | volumeType: gp3 36 | encrypted: true 37 | -------------------------------------------------------------------------------- /amd-nvidia-provisioner.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: karpenter.sh/v1alpha5 2 | kind: Provisioner 3 | metadata: 4 | name: amd-nvidia 5 | spec: 6 | requirements: 7 | - key: karpenter.k8s.aws/instance-family 8 | operator: In 9 | values: ["g5","p5"] 10 | - key: kubernetes.io/arch 11 | operator: In 12 | values: ["amd64"] 13 | limits: 14 | providerRef: 15 | name: amd-nvidia 16 | ttlSecondsAfterEmpty: 30 17 | --- 18 | apiVersion: karpenter.k8s.aws/v1alpha1 19 | kind: AWSNodeTemplate 20 | metadata: 21 | name: amd-nvidia 22 | spec: 23 | subnetSelector: 24 | karpenter.sh/discovery: ai-usw2 25 | securityGroupSelector: 26 | karpenter.sh/discovery: ai-usw2 27 | # amiSelector: 28 | # aws::ids: "ami-051f84ff16e7d78c4" 29 | tags: 30 | InstanceType: 
"nvidia" 31 | blockDeviceMappings: 32 | - deviceName: /dev/xvda 33 | ebs: 34 | volumeSize: 300Gi 35 | volumeType: gp3 36 | encrypted: true 37 | -------------------------------------------------------------------------------- /app/Dockerfile-assets: -------------------------------------------------------------------------------- 1 | ARG ai_chip 2 | 3 | FROM public.ecr.aws/docker/library/python:latest as base 4 | RUN apt-get update -y --fix-missing 5 | RUN apt-get install -y python3-venv g++ gettext-base jq 6 | RUN mkdir -p /etc/apt/keyrings/ 7 | RUN curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.29/deb/Release.key | gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg 8 | RUN echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.29/deb/ /' | tee /etc/apt/sources.list.d/kubernetes.list 9 | RUN apt-get update 10 | RUN apt-get install -y kubectl 11 | RUN kubectl version --client 12 | RUN python -m pip install wget 13 | RUN python -m pip install awscli 14 | RUN mkdir /root/.aws 15 | ADD config /root/.aws 16 | 17 | FROM base AS assets-amd64-cuda 18 | ENV VAR=cuda 19 | 20 | FROM base AS assets-arm64-cuda 21 | ENV VAR=cuda 22 | 23 | FROM base AS assets-amd64-neuron 24 | ENV VAR=xla 25 | 26 | FROM assets-${ai_chip} AS final 27 | ENV model="stable-diffusion-2-1-base" 28 | #RUN wget https://sdinfer.s3.us-west-2.amazonaws.com/${model}_${VAR}.tar.gz -O /model.tar.gz 29 | RUN echo "VAR is equal to ${VAR}" 30 | RUN echo "model is equal to ${model}" 31 | ADD call-model.sh /call-model.sh 32 | ADD appsimulator.sh /appsimulator.sh 33 | -------------------------------------------------------------------------------- /app/Dockerfile.template: -------------------------------------------------------------------------------- 1 | FROM $BASE_IMAGE as base 2 | 3 | #FROM $ASSETS_IMAGE AS assets 4 | 5 | #FROM base AS model 6 | #COPY --from=1 /model.tar.gz /model.tar.gz 7 | 8 | RUN apt-get update --fix-missing 9 | RUN apt-get install -y apt-transport-https ca-certificates curl gpg net-tools gettext-base python3-venv g++ 10 | RUN python -m pip install wget 11 | RUN python -m pip install awscli 12 | RUN python -m pip install gradio 13 | RUN python -m pip install "uvicorn[standard]" 14 | RUN python -m pip install fastapi 15 | 16 | #RUN curl -sS https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor | tee /etc/apt/trusted.gpg.d/kubernetes-archive-keyring.gpg 17 | #RUN echo "deb [signed-by=/etc/apt/trusted.gpg.d/kubernetes-archive-keyring.gpg] https://apt.kubernetes.io/ kubernetes-xenial main" | tee /etc/apt/sources.list.d/kubernetes.list 18 | RUN apt-get update 19 | RUN mkdir -p /etc/apt/keyrings/ 20 | RUN curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.29/deb/Release.key | gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg 21 | RUN echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.29/deb/ /' | tee /etc/apt/sources.list.d/kubernetes.list 22 | RUN apt-get update 23 | RUN apt-get install -y kubectl 24 | RUN kubectl version --client 25 | COPY * / 26 | -------------------------------------------------------------------------------- /app/appsimulator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | #restore simulator state from SQS in the case of previous run 3 | sqs_file="/tmp/"$RANDOM".json" 4 | aws sqs receive-message --queue-url ${QUEUE_URL} > $sqs_file 5 | echo "sqs exit code="$? 
6 | if (( $?>0 )) 7 | then 8 | echo "ERR-SQS" 9 | j=0 10 | else 11 | receipt_handle=`cat $sqs_file | jq '.Messages[].ReceiptHandle'|sed 's/"//g'` 12 | j=`cat $sqs_file | jq '.Messages[].Body'|sed 's/"//g'` 13 | if [ -z "$j" ] 14 | then 15 | echo "EMPTY-SQS" 16 | j=0 17 | else 18 | aws sqs delete-message --queue-url ${QUEUE_URL} --receipt-handle $receipt_handle 19 | fi 20 | fi 21 | rm -f $sqs_file 22 | 23 | prev_clients=0 24 | 25 | #simulator sine wave range. From $j to 3.14 in 0.1 increments 26 | _seq=`seq $j $RADIAN_INTERVAL $RADIAN_MAX` 27 | #_seq=`seq 0.01 0.168 3.14` 28 | echo "first seq is "$_seq 29 | while true; do 30 | for i in $_seq; do 31 | sqs_file="/tmp/"$RANDOM".json" 32 | aws sqs receive-message --queue-url ${QUEUE_URL} > $sqs_file 33 | if (( $?<=0 )); then 34 | receipt_handle=`cat $sqs_file | jq '.Messages[].ReceiptHandle'|sed 's/"//g'` 35 | if [ -n "$receipt_handle" ]; then 36 | echo "delete msg receipt_handle="$receipt_handle 37 | aws sqs delete-message --queue-url ${QUEUE_URL} --receipt-handle $receipt_handle 38 | fi 39 | fi 40 | rm -f $sqs_file 41 | x=`echo $i|awk '{print $1}'` 42 | sinx=`echo $i|awk '{print int(sin($1)*70)}'` 43 | echo "sinx=" $sinx 44 | echo "i=" $i 45 | aws sqs send-message --queue-url ${QUEUE_URL} --message-body "$i" 46 | 47 | clients=`echo $(( (sinx * $CLIENT_SCALE_RATIO) + $MIN_AT_CYCLE_START ))` 48 | 49 | kubectl scale -n $CLIENT_DEPLOY_NS deploy/$CLIENT_DEPLOY_PREFIX --replicas=$clients 50 | aws cloudwatch put-metric-data --metric-name app_workers --namespace ${DEPLOY_NAME} --value ${clients} 51 | echo "app_workers(clients)="$clients" sinx="$sinx 52 | 53 | prev_clients=$clients 54 | sleeptime=`awk -v min=$MIN_SLEEP_BETWEEN_CYCLE -v max=$MAX_SLEEP_BETWEEN_CYCLE 'BEGIN{srand(); print int(min+rand()*(max-min+1))}'` 55 | echo "cleanning not ready nodes and faulty pods" 56 | kubectl delete po `kubectl get po | egrep 'Evicted|CrashLoopBackOff|CreateContainerError|ExitCode|OOMKilled|RunContainerError'|awk '{print $1}'` 57 | sleep $sleeptime"m" 58 | done 59 | #longer cycle _seq=`seq 0.01 0.021 3.14` 60 | j=0 61 | _seq=`seq $j $RADIAN_INTERVAL $RADIAN_MAX` 62 | #_seq=`seq 0.01 0.168 3.14` 63 | echo "new cycle "$_seq 64 | done 65 | -------------------------------------------------------------------------------- /app/assemble_multiarch_image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | BASE_IMAGE=$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$BASE_REPO:$BASE_IMAGE_TAG 4 | BASE_ARM_IMAGE=$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$BASE_REPO:$BASE_IMAGE_ARM_TAG 5 | BASE_AMD_IMAGE=$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$BASE_REPO:$BASE_IMAGE_AMD_TAG 6 | aws ecr get-login-password --region $AWS_REGION | docker login --username AWS --password-stdin $BASE_IMAGE 7 | 8 | docker manifest create $BASE_IMAGE --amend $BASE_ARM_IMAGE --amend $BASE_AMD_IMAGE 9 | docker manifest push $BASE_IMAGE 10 | -------------------------------------------------------------------------------- /app/bert_gpu_benchmark.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer, BertForQuestionAnswering 2 | import torch 3 | import time 4 | import math 5 | 6 | # Define model name and tokenizer 7 | model_name = "bert-base-uncased" 8 | task = "question-answering" 9 | # Load tokenizer and model 10 | tokenizer = BertTokenizer.from_pretrained(model_name) 11 | # Load model and move to GPU 12 | model = 
BertForQuestionAnswering.from_pretrained(model_name).cuda() 13 | 14 | # Define benchmark parameters 15 | n_runs = 1000 16 | 17 | # Define sample question and context 18 | question = "What is the capital of France?" 19 | context = "France is a country located in Western Europe. Its capital is Paris." 20 | context = "The French Republic is a country located in Western Europe; the capital city of its republic is Paris, which is located in the heart of the French Republic. France is considered to be a developed country and has the fifth-largest economy in the world. It is also a member of the G20 and a founding member of the European Union. France is also a member of the United Nations, NATO, and other international organizations." 21 | context = "The French Republic is a country located in Western Europe; the capital city of its republic is Paris, which is located in the heart of the French Republic. France is considered to be a developed country and has the fifth-largest economy in the world. It is also a member of the G20 and a founding member of the European Union. France is also a member of the United Nations, NATO, and other international organizations. France is a popular tourist destination and is home to some of the world's most famous landmarks. French cuisine is also renowned for its excellence and sophistication. France is also renowned for its wine and fashion industries. French culture and art are also highly respected around the world. French citizens are also known for their strong sense of patriotism and loyalty to their country. France is also a leader in scientific research and technology. France is also a founding member of the European Union and the United Nations. France is also a founding member of the Francophonie, an organization of Francophone countries. All in all, France is a country renowned for its culture, people, and accomplishments." 22 | context = "The French Republic is a country located in Western Europe; the capital city of its republic is Paris, which is located in the heart of the French Republic. France is considered to be a developed country and has the fifth-largest economy in the world. It is also a member of the G20 and a founding member of the European Union. France is also a member of the United Nations, NATO, and other international organizations. France is a popular tourist destination and is home to some of the world's most famous landmarks. French cuisine is also renowned for its excellence and sophistication. France is also renowned for its wine and fashion industries. French culture and art are also highly respected around the world. French citizens are also known for their strong sense of patriotism and loyalty to their country. France is also a leader in scientific research and technology. France is also a founding member of the European Union and the United Nations. France is also a founding member of the Francophonie, an organization of Francophone countries. All in all, France is a country renowned for its culture, people, and accomplishments. France is also known for its rich history, with many iconic landmarks and monuments that attract millions of tourists every year. France is also a country renowned for its cuisine, with a wide range of traditional dishes that are enjoyed around the world. All in all, France is a country that is rightfully proud of its past, present, and future. However, France is also a country with a lot of problems. 
France has a high unemployment rate, and the French government is struggling to find ways to create jobs. France also has a high level of public debt, and the French government is struggling to find ways to pay off its debt." 23 | context = "The French Republic is a country located in Western Europe; the capital city of its republic is Paris, which is located in the heart of the French Republic. France is considered to be a developed country and has the fifth-largest economy in the world. It is also a member of the G20 and a founding member of the European Union. France is also a member of the United Nations, NATO, and other international organizations. France is a popular tourist destination and is home to some of the world's most famous landmarks. French cuisine is also renowned for its excellence and sophistication. France is also renowned for its wine and fashion industries. French culture and art are also highly respected around the world. French citizens are also known for their strong sense of patriotism and loyalty to their country. France is also a leader in scientific research and technology. France is also a founding member of the European Union and the United Nations. France is also a founding member of the Francophonie, an organization of Francophone countries. All in all, France is a country renowned for its culture, people, and accomplishments. France is also known for its rich history, with many iconic landmarks and monuments that attract millions of tourists every year. France is also a country renowned for its cuisine, with a wide range of traditional dishes that are enjoyed around the world. All in all, France is a country that is rightfully proud of its past, present, and future. However, France is also a country with a lot of problems. France has a high unemployment rate, and the French government is struggling to find ways to create jobs. France also has a high level of public debt, and the French government is struggling to find ways to pay off its debt. France is also facing a wave of terrorist attacks, and the French government is struggling to keep its citizens safe. Despite all of these problems, France remains a strong nation, with a proud history, a strong economy, and a strong sense of national identity. France is also a country that is unified in its resolve to address its problems, and is taking steps to do so. The government is also working to create jobs, reduce poverty, and improve public services. It is also committed to protecting the environment and promoting renewable energy sources. France is a nation that is determined to overcome its challenges and build a brighter future for its citizens." 
24 | # Generate random input data 25 | inputs = tokenizer(question, context, return_tensors="pt").to(model.device) 26 | 27 | # Benchmark method 28 | def benchmark(n_runs, test_name, model, model_inputs): 29 | warmup_run = model(**model_inputs) 30 | latency_collector = LatencyCollector() 31 | 32 | for _ in range(n_runs): 33 | latency_collector.pre_hook() 34 | res = model(**model_inputs) 35 | #print(f'res={res}') 36 | latency_collector.hook() 37 | 38 | p0_latency_ms = latency_collector.percentile(0) * 1000 39 | p50_latency_ms = latency_collector.percentile(50) * 1000 40 | p90_latency_ms = latency_collector.percentile(90) * 1000 41 | p95_latency_ms = latency_collector.percentile(95) * 1000 42 | p99_latency_ms = latency_collector.percentile(99) * 1000 43 | p100_latency_ms = latency_collector.percentile(100) * 1000 44 | 45 | report_dict = dict() 46 | report_dict["Latency P0"] = f'{p0_latency_ms:.1f}' 47 | report_dict["P50"]=f'{p50_latency_ms:.1f}' 48 | report_dict["P90"]=f'{p90_latency_ms:.1f}' 49 | report_dict["P95"]=f'{p95_latency_ms:.1f}' 50 | report_dict["P99"]=f'{p99_latency_ms:.1f}' 51 | report_dict["P100"]=f'{p100_latency_ms:.1f}' 52 | 53 | report = f'RESULT FOR {test_name}; n_runs {n_runs}:' 54 | for key, value in report_dict.items(): 55 | report += f' {key}={value}' 56 | print(report) 57 | 58 | class LatencyCollector: 59 | def __init__(self): 60 | self.start = None 61 | self.latency_list = [] 62 | 63 | def pre_hook(self, *args): 64 | self.start = time.time() 65 | 66 | def hook(self, *args): 67 | self.latency_list.append(time.time() - self.start) 68 | 69 | def percentile(self, percent): 70 | latency_list = self.latency_list 71 | pos_float = len(latency_list) * percent / 100 72 | max_pos = len(latency_list) - 1 73 | pos_floor = min(math.floor(pos_float), max_pos) 74 | pos_ceil = min(math.ceil(pos_float), max_pos) 75 | latency_list = sorted(latency_list) 76 | return latency_list[pos_ceil] if pos_float - pos_floor > 0.5 else latency_list[pos_floor] 77 | 78 | benchmark(n_runs,model_name,model,inputs) 79 | -------------------------------------------------------------------------------- /app/build-assets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | if [ "$IMAGE_TAG" == "amd64-neuron" ]; then 4 | aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com 5 | docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:1.13.1-neuronx-py310-sdk2.17.0-ubuntu20.04 6 | dlc_xla_image_id=$(docker images | grep 763104351884 | grep 1.13.1-neuronx-py310-sdk2.17.0-ubuntu20.04 | awk '{print $3}') 7 | docker tag $dlc_xla_image_id $AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$BASE_REPO:1.13.1-neuronx-py310-sdk2.17.0-ubuntu20.04 8 | docker logout 9 | aws ecr get-login-password --region $AWS_REGION | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$BASE_REPO 10 | docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$BASE_REPO:1.13.1-neuronx-py310-sdk2.17.0-ubuntu20.04 11 | fi 12 | if [ "$IMAGE_TAG" == "amd64-cuda" ]; then 13 | aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com 14 | docker pull 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:2.0.1-gpu-py310-cu118-ubuntu20.04-ec2 15 | dlc_cuda_image_id=$(docker images | grep 763104351884 | grep 2.0.1-gpu-py310-cu118-ubuntu20.04-ec2 | 
awk '{print $3}') 16 | docker tag $dlc_cuda_image_id $AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$BASE_REPO:2.0.1-gpu-py310-cu118-ubuntu20.04-ec2 17 | docker logout 18 | aws ecr get-login-password --region $AWS_REGION | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$BASE_REPO 19 | docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$BASE_REPO:2.0.1-gpu-py310-cu118-ubuntu20.04-ec2 20 | fi 21 | docker images 22 | 23 | ASSETS="-assets" 24 | export IMAGE=$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$BASE_REPO:$IMAGE_TAG$ASSETS 25 | aws ecr get-login-password --region $AWS_REGION | docker login --username AWS --password-stdin $IMAGE 26 | docker build -t $IMAGE --build-arg ai_chip=$IMAGE_TAG -f Dockerfile-assets . 27 | docker push $IMAGE 28 | -------------------------------------------------------------------------------- /app/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | #docker logout 4 | #aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com 5 | #docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:1.13.1-neuronx-py310-sdk2.17.0-ubuntu20.04 6 | #dlc_xla_image_id=$(docker images | grep 763104351884 | grep 1.13.1-neuronx-py310-sdk2.17.0-ubuntu20.04 | awk '{print $3}') 7 | #docker logout 8 | #aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com 9 | #docker pull 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:2.0.1-gpu-py310-cu118-ubuntu20.04-ec2 10 | #dlc_cuda_image_id=$(docker images | grep 763104351884 | grep 2.0.1-gpu-py310-cu118-ubuntu20.04-ec2 | awk '{print $3}') 11 | #docker images 12 | #docker logout 13 | 14 | export AWS_ACCOUNT_ID=$(aws sts get-caller-identity --output text --query Account) 15 | 16 | ASSETS="-assets" 17 | export BASE_IMAGE=$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$BASE_REPO:$BASE_IMAGE_TAG 18 | export ASSETS_IMAGE=$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$BASE_REPO:$IMAGE_TAG$ASSETS 19 | export IMAGE=$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/$BASE_REPO:$IMAGE_TAG 20 | 21 | #if [ "$IMAGE_TAG" == "1.13.1-neuronx-py310-sdk2.17.0-ubuntu20.04" ]; then 22 | # docker tag $dlc_xla_image_id $BASE_IMAGE 23 | #fi 24 | #if [ "$IMAGE_TAG" == "2.0.1-gpu-py310-cu118-ubuntu20.04-ec2" ]; then 25 | # docker tag $dlc_cuda_image_id $BASE_IMAGE 26 | #fi 27 | #docker images 28 | 29 | cat Dockerfile.template | envsubst > Dockerfile 30 | cat Dockerfile 31 | aws ecr get-login-password --region $AWS_REGION | docker login --username AWS --password-stdin $BASE_IMAGE 32 | docker build -t $IMAGE . 33 | docker push $IMAGE 34 | -------------------------------------------------------------------------------- /app/call-model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SERVE_ENDPOINT=$SERVE_ENDPOINT 4 | NUM_OF_ITERATIONS=$NUM_OF_ITERATIONS 5 | SLEEP_TIME=$SLEEP_TIME 6 | while true 7 | do 8 | curl $SERVE_ENDPOINT/load/$NUM_OF_ITERATIONS/infer/$NUM_OF_INF 9 | sleep $SLEEP_TIME 10 | done 11 | -------------------------------------------------------------------------------- /app/compile-model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | . 
/root/.bashrc 3 | pip install --upgrade pip 4 | if [ "$(uname -i)" = "x86_64" ]; then 5 | if [ $DEVICE="xla" ]; then 6 | /optimum-neuron-sd2-compile.sh 7 | fi 8 | fi 9 | tar -czvf /${COMPILER_WORKDIR_ROOT}/${MODEL_FILE}.tar.gz /${COMPILER_WORKDIR_ROOT}/ 10 | aws s3 cp /${COMPILER_WORKDIR_ROOT}/${MODEL_FILE}.tar.gz s3://${BUCKET}/${MODEL_FILE}_${DEVICE}_bsize_${BATCH_SIZE}.tar.gz 11 | while true; do sleep 1000; done 12 | -------------------------------------------------------------------------------- /app/compile-sd2.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["NEURON_FUSE_SOFTMAX"] = "1" 3 | model_id=os.environ['MODEL_ID'] 4 | device=os.environ["DEVICE"] 5 | model_dir=os.environ['COMPILER_WORKDIR_ROOT'] 6 | height=int(os.environ['HEIGHT']) 7 | width=int(os.environ['WIDTH']) 8 | batch_size=int(os.environ['BATCH_SIZE']) 9 | from optimum.neuron import NeuronStableDiffusionPipeline 10 | 11 | compiler_args = {"auto_cast": "matmul", "auto_cast_type": "bf16","inline_weights_to_neff": "True"} 12 | input_shapes = {"batch_size": batch_size, "height": height, "width": width} 13 | stable_diffusion = NeuronStableDiffusionPipeline.from_pretrained(model_id, export=True, **compiler_args, **input_shapes) 14 | stable_diffusion.save_pretrained(model_dir) 15 | 16 | -------------------------------------------------------------------------------- /app/config: -------------------------------------------------------------------------------- 1 | [default] 2 | retry_mode = adaptive 3 | max_attempts = 10 4 | region = us-west-2 5 | -------------------------------------------------------------------------------- /app/create_node_port_svc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | kubectl label pod $POD_NAME inferencepod=$POD_NAME 3 | service_name=$POD_NAME-svc-$(kubectl get no -o wide `kubectl get po $POD_NAME -o wide | awk '{print $7}'|grep -v NODE`| awk '{print $7}' | grep -v EXTERNAL-IP|sed "s/\./-/g") 4 | export SVC_NAME=$service_name 5 | cat /node_port_svc_template.yaml | envsubst | kubectl apply -f - 6 | -------------------------------------------------------------------------------- /app/diffusion_benchmarker-0.0.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/edge_diffusion_on_eks/05e4dfefe30cc1965f7d9771e99513570f62d867/app/diffusion_benchmarker-0.0.1.tar.gz -------------------------------------------------------------------------------- /app/gpt2_gpu_benchmark.py: -------------------------------------------------------------------------------- 1 | from transformers import GPT2LMHeadModel, AutoTokenizer 2 | import torch 3 | import time 4 | import math 5 | 6 | # Define model name and tokenizer 7 | model_name = "openai-community/gpt2" 8 | tokenizer = AutoTokenizer.from_pretrained(model_name) 9 | 10 | # Define benchmark parameters 11 | sequence_length = 1024 12 | batch_size = 2 13 | n_runs = 100 14 | 15 | # Load model and move to GPU 16 | model = GPT2LMHeadModel.from_pretrained(model_name).cuda() 17 | 18 | # Generate random input data 19 | input_ids = torch.randint(0, tokenizer.vocab_size, size=(batch_size, sequence_length)).cuda() 20 | 21 | # Benchmark method 22 | def benchmark(n_runs, test_name, model, model_inputs): 23 | if not isinstance(model_inputs, tuple): 24 | model_inputs = (model_inputs,) 25 | 26 | warmup_run = model(*model_inputs) 27 | 28 | latency_collector = LatencyCollector() 29 | 
30 | for _ in range(n_runs): 31 | latency_collector.pre_hook() 32 | res = model(*model_inputs) 33 | #print(f'res={res}') 34 | latency_collector.hook() 35 | 36 | p0_latency_ms = latency_collector.percentile(0) * 1000 37 | p50_latency_ms = latency_collector.percentile(50) * 1000 38 | p90_latency_ms = latency_collector.percentile(90) * 1000 39 | p95_latency_ms = latency_collector.percentile(95) * 1000 40 | p99_latency_ms = latency_collector.percentile(99) * 1000 41 | p100_latency_ms = latency_collector.percentile(100) * 1000 42 | 43 | report_dict = dict() 44 | report_dict["Latency P0"] = f'{p0_latency_ms:.1f}' 45 | report_dict["Latency P50"]=f'{p50_latency_ms:.1f}' 46 | report_dict["Latency P90"]=f'{p90_latency_ms:.1f}' 47 | report_dict["Latency P95"]=f'{p95_latency_ms:.1f}' 48 | report_dict["Latency P99"]=f'{p99_latency_ms:.1f}' 49 | report_dict["Latency P100"]=f'{p100_latency_ms:.1f}' 50 | 51 | report = f'RESULT FOR {test_name}; n_runs {n_runs}; batch_size {batch_size}; sequence_length {sequence_length}:' 52 | for key, value in report_dict.items(): 53 | report += f' {key}={value}' 54 | print(report) 55 | 56 | class LatencyCollector: 57 | def __init__(self): 58 | self.start = None 59 | self.latency_list = [] 60 | 61 | def pre_hook(self, *args): 62 | self.start = time.time() 63 | 64 | def hook(self, *args): 65 | self.latency_list.append(time.time() - self.start) 66 | 67 | def percentile(self, percent): 68 | latency_list = self.latency_list 69 | pos_float = len(latency_list) * percent / 100 70 | max_pos = len(latency_list) - 1 71 | pos_floor = min(math.floor(pos_float), max_pos) 72 | pos_ceil = min(math.ceil(pos_float), max_pos) 73 | latency_list = sorted(latency_list) 74 | return latency_list[pos_ceil] if pos_float - pos_floor > 0.5 else latency_list[pos_floor] 75 | 76 | benchmark(n_runs,model_name,model,input_ids) 77 | -------------------------------------------------------------------------------- /app/node_port_svc_template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: $SVC_NAME 5 | spec: 6 | selector: 7 | inferencepod: $POD_NAME 8 | ports: 9 | - protocol: TCP 10 | port: 8000 11 | targetPort: 8000 12 | type: NodePort 13 | -------------------------------------------------------------------------------- /app/optimum-neuron-sd2-compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install --upgrade pip 4 | pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com 5 | pip install "optimum[neuronx, diffusers]" 6 | pip uninstall awscli -y 7 | pip uninstall botocore s3transfer -y 8 | pip install awscli 9 | python /compile-sd2.py 10 | #optimum-cli export neuron \ 11 | # --model $MODEL_ID \ 12 | # --task stable-diffusion \ 13 | # --batch_size $BATCH_SIZE \ 14 | # --height $HEIGHT \ 15 | # --width $WIDTH \ 16 | # --auto_cast matmul \ 17 | # --auto_cast_type bf16 \ 18 | # $COMPILER_WORKDIR_ROOT/ 19 | 20 | -------------------------------------------------------------------------------- /app/optimum-neuron.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | from optimum.neuron import NeuronModelForSequenceClassification 3 | 4 | model = NeuronModelForSequenceClassification.from_pretrained("distilbert_base_uncased_finetuned_sst2_english_neuron") 5 | 6 | tokenizer = 
AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english") 7 | inputs = tokenizer("Hamilton is considered to be the best musical of past years.", return_tensors="pt") 8 | logits = model(**inputs).logits 9 | print(model.config.id2label[logits.argmax().item()]) 10 | -------------------------------------------------------------------------------- /app/optimum-neuron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install --upgrade pip 4 | pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com 5 | pip install "optimum[neuronx, diffusers]" 6 | optimum-cli export neuron \ 7 | --model stabilityai/stable-diffusion-2-1 \ 8 | --task stable-diffusion \ 9 | --batch_size 1 \ 10 | --height 512 \ 11 | --width 512 \ 12 | --auto_cast matmul \ 13 | --auto_cast_type bf16 \ 14 | sd_neuron_sd_21/ 15 | 16 | while true; do sleep 1000; done 17 | -------------------------------------------------------------------------------- /app/run-model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | pip install --upgrade pip 3 | if [ "$(uname -i)" = "x86_64" ]; then 4 | if [ "$DEVICE" == "xla" ]; then 5 | pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com 6 | pip install "optimum[neuronx, diffusers]" 7 | #pip install diffusers==0.20.2 transformers==4.33.1 accelerate==0.22.0 safetensors==0.3.1 matplotlib Pillow ipython -U 8 | #pip install click 9 | #pip install /diffusion_benchmarker-0.0.1.tar.gz 10 | #diffusion_benchmarker t2i --pipeline neuronx_t2i root=/app/sd2_compile_dir_512 pretrained_model_name_or_path=stabilityai/stable-diffusion-2-1 torch_dtype=bfloat16 11 | elif [ "$DEVICE" == "cuda" ]; then 12 | pip install environment_kernels 13 | pip install diffusers transformers accelerate safetensors matplotlib Pillow ipython torch -U 14 | pip install click nvitop 15 | #pip install /diffusion_benchmarker-0.0.1.tar.gz 16 | #diffusion_benchmarker t2i --pipeline inductor_t2i pretrained_model_name_or_path=stabilityai/stable-diffusion-2-1 torch_dtype=bfloat16 17 | fi 18 | #uvicorn run:app --host=0.0.0.0 19 | uvicorn run-sd2:app --host=0.0.0.0 20 | fi 21 | #while true; do sleep 1000; done 22 | -------------------------------------------------------------------------------- /app/run-sd2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import time 4 | import random 5 | import gradio as gr 6 | from matplotlib import image as mpimg 7 | from fastapi import FastAPI 8 | import torch 9 | 10 | pod_name=os.environ['POD_NAME'] 11 | model_id=os.environ['MODEL_ID'] 12 | device=os.environ["DEVICE"] 13 | model_dir=os.environ['COMPILER_WORKDIR_ROOT'] 14 | number_of_runs_per_inference=os.environ['NUM_OF_RUNS_INF'] 15 | 16 | # Define datatype 17 | DTYPE = torch.bfloat16 18 | 19 | if device=='xla': 20 | from optimum.neuron import NeuronStableDiffusionPipeline 21 | elif device=='cuda': 22 | from diffusers import StableDiffusionPipeline, EulerAncestralDiscreteScheduler 23 | 24 | def benchmark(n_runs, test_name, model, model_inputs): 25 | if not isinstance(model_inputs, tuple): 26 | model_inputs = model_inputs 27 | 28 | warmup_run = model(**model_inputs) 29 | 30 | latency_collector = LatencyCollector() 31 | 32 | for _ in range(n_runs): 33 | latency_collector.pre_hook() 34 | res = model(**model_inputs) 35 | latency_collector.hook() 36 | 37 | p0_latency_ms = latency_collector.percentile(0) * 1000 
38 | p50_latency_ms = latency_collector.percentile(50) * 1000 39 | p90_latency_ms = latency_collector.percentile(90) * 1000 40 | p95_latency_ms = latency_collector.percentile(95) * 1000 41 | p99_latency_ms = latency_collector.percentile(99) * 1000 42 | p100_latency_ms = latency_collector.percentile(100) * 1000 43 | 44 | report_dict = dict() 45 | report_dict["Latency P0"] = f'{p0_latency_ms:.1f}' 46 | report_dict["Latency P50"]=f'{p50_latency_ms:.1f}' 47 | report_dict["Latency P90"]=f'{p90_latency_ms:.1f}' 48 | report_dict["Latency P95"]=f'{p95_latency_ms:.1f}' 49 | report_dict["Latency P99"]=f'{p99_latency_ms:.1f}' 50 | report_dict["Latency P100"]=f'{p100_latency_ms:.1f}' 51 | 52 | report = f'RESULT FOR {test_name} on {pod_name}:' 53 | for key, value in report_dict.items(): 54 | report += f' {key}={value}' 55 | print(report) 56 | return report 57 | 58 | class LatencyCollector: 59 | def __init__(self): 60 | self.start = None 61 | self.latency_list = [] 62 | 63 | def pre_hook(self, *args): 64 | self.start = time.time() 65 | 66 | def hook(self, *args): 67 | self.latency_list.append(time.time() - self.start) 68 | 69 | def percentile(self, percent): 70 | latency_list = self.latency_list 71 | pos_float = len(latency_list) * percent / 100 72 | max_pos = len(latency_list) - 1 73 | pos_floor = min(math.floor(pos_float), max_pos) 74 | pos_ceil = min(math.ceil(pos_float), max_pos) 75 | latency_list = sorted(latency_list) 76 | return latency_list[pos_ceil] if pos_float - pos_floor > 0.5 else latency_list[pos_floor] 77 | 78 | if device=='xla': 79 | pipe = NeuronStableDiffusionPipeline.from_pretrained(model_dir) 80 | elif device=='cuda': 81 | pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=DTYPE) 82 | pipe = pipe.to("cuda") 83 | pipe.enable_attention_slicing 84 | 85 | def text2img(prompt): 86 | start_time = time.time() 87 | image = pipe(prompt).images[0] 88 | total_time = time.time()-start_time 89 | r1 = random.randint(0,99999) 90 | imgname="image"+str(r1)+".png" 91 | image.save(imgname) 92 | image = mpimg.imread(imgname) 93 | return image, str(total_time) 94 | 95 | #warmup 96 | prompt = "a photo of an astronaut riding a horse on mars" 97 | num_inference_steps=2 98 | model_args={'prompt': prompt,'num_inference_steps': num_inference_steps,} 99 | image = pipe(**model_args).images[0] 100 | 101 | app = FastAPI() 102 | io = gr.Interface(fn=text2img,inputs=["text"], 103 | outputs = [gr.Image(height=512, width=512), "text"], 104 | title = 'Stable Diffusion 2.1 in AWS EC2 ' + device + ' instance') 105 | 106 | @app.get("/") 107 | def read_main(): 108 | return {"message": "This is Stable Diffusion 2.1 pod " + pod_name + " in AWS EC2 " + device + " instance; try /load/{n_runs}/infer/{n_inf} or /serve"} 109 | 110 | @app.get("/load/{n_runs}/infer/{n_inf}") 111 | def load(n_runs: int,n_inf: int): 112 | prompt = "a photo of an astronaut riding a horse on mars" 113 | num_inference_steps = n_inf 114 | model_args={'prompt': prompt,'num_inference_steps': num_inference_steps,} 115 | report=benchmark(n_runs, "stable_diffusion_512", pipe, model_args) 116 | return {"message": "benchmark report:"+report} 117 | 118 | @app.get("/health") 119 | def healthy(): 120 | return {"message": pod_name + "is healthy"} 121 | 122 | @app.get("/readiness") 123 | def ready(): 124 | return {"message": pod_name + "is ready"} 125 | 126 | app = gr.mount_gradio_app(app, io, path="/serve") 127 | -------------------------------------------------------------------------------- /app/run.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | os.environ["NEURON_FUSE_SOFTMAX"] = "1" 3 | pod_name=os.environ['POD_NAME'] 4 | model_id=os.environ['MODEL_ID'] 5 | device=os.environ["DEVICE"] 6 | model_dir=os.environ['COMPILER_WORKDIR_ROOT'] 7 | number_of_runs_per_inference=os.environ['NUM_OF_RUNS_INF'] 8 | import gradio as gr 9 | from fastapi import FastAPI 10 | import random 11 | from matplotlib import image as mpimg 12 | from matplotlib import pyplot as plt 13 | import torch 14 | import torch.nn as nn 15 | if device=='xla': 16 | import torch_neuronx 17 | 18 | from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler 19 | from diffusers.models.unet_2d_condition import UNet2DConditionOutput 20 | 21 | import time 22 | import math 23 | 24 | # Define datatype 25 | DTYPE = torch.bfloat16 26 | 27 | # Specialized benchmarking class for stable diffusion. 28 | # We cannot use any of the pre-existing benchmarking utilities to benchmark E2E stable diffusion performance, 29 | # because the top-level StableDiffusionPipeline cannot be serialized into a single Torchscript object. 30 | # All of the pre-existing benchmarking utilities (in neuronperf or torch_neuronx) require the model to be a 31 | # traced Torchscript. 32 | def benchmark(n_runs, test_name, model, model_inputs): 33 | if not isinstance(model_inputs, tuple): 34 | model_inputs = model_inputs 35 | 36 | warmup_run = model(**model_inputs) 37 | 38 | latency_collector = LatencyCollector() 39 | # can't use register_forward_pre_hook or register_forward_hook because StableDiffusionPipeline is not a torch.nn.Module 40 | 41 | for _ in range(n_runs): 42 | latency_collector.pre_hook() 43 | res = model(**model_inputs) 44 | latency_collector.hook() 45 | 46 | p0_latency_ms = latency_collector.percentile(0) * 1000 47 | p50_latency_ms = latency_collector.percentile(50) * 1000 48 | p90_latency_ms = latency_collector.percentile(90) * 1000 49 | p95_latency_ms = latency_collector.percentile(95) * 1000 50 | p99_latency_ms = latency_collector.percentile(99) * 1000 51 | p100_latency_ms = latency_collector.percentile(100) * 1000 52 | 53 | report_dict = dict() 54 | report_dict["Latency P0"] = f'{p0_latency_ms:.1f}' 55 | report_dict["Latency P50"]=f'{p50_latency_ms:.1f}' 56 | report_dict["Latency P90"]=f'{p90_latency_ms:.1f}' 57 | report_dict["Latency P95"]=f'{p95_latency_ms:.1f}' 58 | report_dict["Latency P99"]=f'{p99_latency_ms:.1f}' 59 | report_dict["Latency P100"]=f'{p100_latency_ms:.1f}' 60 | 61 | report = f'RESULT FOR {test_name} on {pod_name}:' 62 | for key, value in report_dict.items(): 63 | report += f' {key}={value}' 64 | print(report) 65 | return report 66 | 67 | class LatencyCollector: 68 | def __init__(self): 69 | self.start = None 70 | self.latency_list = [] 71 | 72 | def pre_hook(self, *args): 73 | self.start = time.time() 74 | 75 | def hook(self, *args): 76 | self.latency_list.append(time.time() - self.start) 77 | 78 | def percentile(self, percent): 79 | latency_list = self.latency_list 80 | pos_float = len(latency_list) * percent / 100 81 | max_pos = len(latency_list) - 1 82 | pos_floor = min(math.floor(pos_float), max_pos) 83 | pos_ceil = min(math.ceil(pos_float), max_pos) 84 | latency_list = sorted(latency_list) 85 | return latency_list[pos_ceil] if pos_float - pos_floor > 0.5 else latency_list[pos_floor] 86 | 87 | 88 | class UNetWrap(nn.Module): 89 | def __init__(self, unet): 90 | super().__init__() 91 | self.unet = unet 92 | 93 | def forward(self, 
sample, timestep, encoder_hidden_states, cross_attention_kwargs=None): 94 | out_tuple = self.unet(sample, timestep, encoder_hidden_states, return_dict=False) 95 | return out_tuple 96 | 97 | class NeuronUNet(nn.Module): 98 | def __init__(self, unetwrap): 99 | super().__init__() 100 | self.unetwrap = unetwrap 101 | self.config = unetwrap.unet.config 102 | self.in_channels = unetwrap.unet.in_channels 103 | self.device = unetwrap.unet.device 104 | 105 | def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None, return_dict=False): 106 | sample = self.unetwrap(sample, timestep.to(dtype=DTYPE).expand((sample.shape[0],)), encoder_hidden_states)[0] 107 | return UNet2DConditionOutput(sample=sample) 108 | 109 | class NeuronTextEncoder(nn.Module): 110 | def __init__(self, text_encoder): 111 | super().__init__() 112 | self.neuron_text_encoder = text_encoder 113 | self.config = text_encoder.config 114 | self.dtype = text_encoder.dtype 115 | self.device = text_encoder.device 116 | 117 | def forward(self, emb, attention_mask = None): 118 | return [self.neuron_text_encoder(emb)['last_hidden_state']] 119 | 120 | def decode_latents(self, latents): 121 | latents = latents.to(torch.float) 122 | latents = 1 / self.vae.config.scaling_factor * latents 123 | image = self.vae.decode(latents).sample 124 | image = (image / 2 + 0.5).clamp(0, 1) 125 | image = image.cpu().permute(0, 2, 3, 1).float().numpy() 126 | return image 127 | 128 | StableDiffusionPipeline.decode_latents = decode_latents 129 | 130 | # --- Load all compiled models and benchmark pipeline --- 131 | COMPILER_WORKDIR_ROOT = model_dir 132 | #model_id = "stabilityai/stable-diffusion-2-1-base" 133 | text_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder/model.pt') 134 | decoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder/model.pt') 135 | unet_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'unet/model.pt') 136 | post_quant_conv_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv/model.pt') 137 | 138 | pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=DTYPE) 139 | if device=='cuda': 140 | pipe = pipe.to("cuda") 141 | pipe.enable_attention_slicing 142 | #pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) 143 | 144 | if device=='xla': 145 | pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) 146 | 147 | #if device=='cuda': 148 | # pipe.unet.to(memory_format=torch.channels_last) 149 | # pipe.vae.to(memory_format=torch.channels_last) 150 | # pipe.unet = torch.compile(pipe.unet, fullgraph=True, mode="max-autotune") 151 | 152 | # pipe.text_encoder = torch.compile( 153 | # pipe.text_encoder, 154 | # fullgraph=True, 155 | # mode="max-autotune", 156 | # ) 157 | # pipe.vae.decoder = torch.compile( 158 | # pipe.vae.decoder, 159 | # fullgraph=True, 160 | # mode="max-autotune", 161 | # ) 162 | # pipe.vae.post_quant_conv = torch.compile( 163 | # pipe.vae.post_quant_conv, 164 | # fullgraph=True, 165 | # mode="max-autotune-no-cudagraphs", 166 | # ) 167 | 168 | if device=='xla': 169 | # Load the compiled UNet onto two neuron cores. 
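# A short sketch of what the next lines do, based on the artifacts produced by
# sd2_512_compile.py: torch.jit.load() restores the compiled TorchScript UNet and
# torch_neuronx.DataParallel replicates it on NeuronCores 0 and 1, so the batch-of-2
# UNet input created by classifier-free guidance is served one sample per core;
# set_dynamic_batching=False keeps the fixed batch-1 shape the UNet was traced with.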
170 | pipe.unet = NeuronUNet(UNetWrap(pipe.unet)) 171 | device_ids = [0,1] 172 | pipe.unet.unetwrap = torch_neuronx.DataParallel(torch.jit.load(unet_filename), device_ids, set_dynamic_batching=False) 173 | 174 | class NeuronTypeConversionWrapper(nn.Module): 175 | def __init__(self, network): 176 | super().__init__() 177 | self.network = network 178 | 179 | def forward(self, x): 180 | return self.network(x.float()) 181 | 182 | if device=='xla': 183 | # Load other compiled models onto a single neuron core. 184 | pipe.text_encoder = NeuronTextEncoder(pipe.text_encoder) 185 | pipe.text_encoder.neuron_text_encoder = torch.jit.load(text_encoder_filename) 186 | pipe.vae.decoder = NeuronTypeConversionWrapper(torch.jit.load(decoder_filename)) 187 | pipe.vae.post_quant_conv = NeuronTypeConversionWrapper(torch.jit.load(post_quant_conv_filename)) 188 | 189 | def text2img(PROMPT): 190 | start_time = time.time() 191 | image = pipe(PROMPT).images[0] 192 | total_time = time.time()-start_time 193 | r1 = random.randint(0,99999) 194 | imgname="image"+str(r1)+".png" 195 | image.save(imgname) 196 | image = mpimg.imread(imgname) 197 | return image, str(total_time) 198 | #warmup 199 | prompt = "a photo of an astronaut riding a horse on mars" 200 | num_inference_steps = 10 201 | height = 512 202 | width = 512 203 | n_runs = 10 204 | model_args={'prompt': prompt, 'height': height, 'width': width, 'num_inference_steps': num_inference_steps,} 205 | report=benchmark(n_runs, "stable_diffusion_512", pipe, model_args) 206 | print(f'model warmup {report}') 207 | app = FastAPI() 208 | io = gr.Interface(fn=text2img,inputs=["text"], 209 | outputs = [gr.Image(height=512, width=512), "text"], 210 | title = 'Stable Diffusion 2.1 pod ' + pod_name + ' in AWS EC2 ' + device + ' instance') 211 | @app.get("/") 212 | def read_main(): 213 | return {"message": "This is Stable Diffusion 2.1 pod " + pod_name + " in AWS EC2 " + device + " instance; try /load/{n_runs}/infer/{n_inf} or /serve"} 214 | @app.get("/load/{n_runs}/infer/{n_inf}") 215 | def load(n_runs: int,n_inf: int): 216 | prompt = "a photo of an astronaut riding a horse on mars" 217 | num_inference_steps = n_inf 218 | height = 512 219 | width = 512 220 | model_args={'prompt': prompt, 'height': height, 'width': width, 'num_inference_steps': num_inference_steps,} 221 | report=benchmark(n_runs, "stable_diffusion_512", pipe, model_args) 222 | return {"message": "benchmark report:"+report} 223 | @app.get("/health") 224 | def healthy(): 225 | return {"message": pod_name + " is healthy"} 226 | @app.get("/readiness") 227 | def ready(): 228 | return {"message": pod_name + " is ready"} 229 | app = gr.mount_gradio_app(app, io, path="/serve") 230 | -------------------------------------------------------------------------------- /app/run1-model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | tar -xzf /app/model.tar.gz 4 | pip install --upgrade pip 5 | if [ "$(uname -i)" = "x86_64" ]; then 6 | if [ "$DEVICE" = "xla" ]; then 7 | pip install diffusers==0.20.2 transformers==4.33.1 accelerate==0.22.0 safetensors==0.3.1 matplotlib Pillow ipython -U 8 | elif [ "$DEVICE" = "cuda" ]; then 9 | pip install environment_kernels 10 | pip install diffusers transformers accelerate safetensors matplotlib Pillow ipython torch -U 11 | fi 12 | python /run1.py 13 | fi 14 | while true; do sleep 1000; done 15 | -------------------------------------------------------------------------------- /app/run1.py:
-------------------------------------------------------------------------------- 1 | import os 2 | os.environ["NEURON_FUSE_SOFTMAX"] = "1" 3 | model_id=os.environ['MODEL_ID'] 4 | device=os.environ["DEVICE"] 5 | model_dir=os.environ['COMPILER_WORKDIR_ROOT'] 6 | model_id = "stabilityai/stable-diffusion-2-inpainting" 7 | import gradio as gr 8 | from fastapi import FastAPI 9 | import random 10 | from matplotlib import image as mpimg 11 | from matplotlib import pyplot as plt 12 | import torch 13 | import torch.nn as nn 14 | if device=='xla': 15 | import torch_neuronx 16 | 17 | #from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler 18 | #from diffusers.models.unet_2d_condition import UNet2DConditionOutput 19 | from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler 20 | from diffusers.models.unet_2d_condition import UNet2DConditionOutput 21 | from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation 22 | 23 | import time 24 | import math 25 | 26 | import copy 27 | from IPython.display import clear_output 28 | from PIL import Image 29 | import requests 30 | # Define datatype 31 | DTYPE = torch.bfloat16 32 | 33 | # Specialized benchmarking class for stable diffusion. 34 | # We cannot use any of the pre-existing benchmarking utilities to benchmark E2E stable diffusion performance, 35 | # because the top-level StableDiffusionPipeline cannot be serialized into a single Torchscript object. 36 | # All of the pre-existing benchmarking utilities (in neuronperf or torch_neuronx) require the model to be a 37 | # traced Torchscript. 38 | def benchmark(n_runs, test_name, model, model_inputs): 39 | if not isinstance(model_inputs, tuple): 40 | model_inputs = (model_inputs,) 41 | 42 | warmup_run = model(*model_inputs) 43 | 44 | latency_collector = LatencyCollector() 45 | # can't use register_forward_pre_hook or register_forward_hook because StableDiffusionPipeline is not a torch.nn.Module 46 | 47 | for _ in range(n_runs): 48 | latency_collector.pre_hook() 49 | res = model(*model_inputs) 50 | latency_collector.hook() 51 | 52 | p0_latency_ms = latency_collector.percentile(0) * 1000 53 | p50_latency_ms = latency_collector.percentile(50) * 1000 54 | p90_latency_ms = latency_collector.percentile(90) * 1000 55 | p95_latency_ms = latency_collector.percentile(95) * 1000 56 | p99_latency_ms = latency_collector.percentile(99) * 1000 57 | p100_latency_ms = latency_collector.percentile(100) * 1000 58 | 59 | report_dict = dict() 60 | report_dict["Latency P0"] = f'{p0_latency_ms:.1f}' 61 | report_dict["Latency P50"]=f'{p50_latency_ms:.1f}' 62 | report_dict["Latency P90"]=f'{p90_latency_ms:.1f}' 63 | report_dict["Latency P95"]=f'{p95_latency_ms:.1f}' 64 | report_dict["Latency P99"]=f'{p99_latency_ms:.1f}' 65 | report_dict["Latency P100"]=f'{p100_latency_ms:.1f}' 66 | 67 | report = f'RESULT FOR {test_name}:' 68 | for key, value in report_dict.items(): 69 | report += f' {key}={value}' 70 | print(report) 71 | return report 72 | 73 | class LatencyCollector: 74 | def __init__(self): 75 | self.start = None 76 | self.latency_list = [] 77 | 78 | def pre_hook(self, *args): 79 | self.start = time.time() 80 | 81 | def hook(self, *args): 82 | self.latency_list.append(time.time() - self.start) 83 | 84 | def percentile(self, percent): 85 | latency_list = self.latency_list 86 | pos_float = len(latency_list) * percent / 100 87 | max_pos = len(latency_list) - 1 88 | pos_floor = min(math.floor(pos_float), max_pos) 89 | pos_ceil = min(math.ceil(pos_float), max_pos) 90 | latency_list = 
sorted(latency_list) 91 | return latency_list[pos_ceil] if pos_float - pos_floor > 0.5 else latency_list[pos_floor] 92 | 93 | 94 | class UNetWrap(nn.Module): 95 | def __init__(self, unet): 96 | super().__init__() 97 | self.unet = unet 98 | 99 | def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None): 100 | out_tuple = self.unet(sample, timestep, encoder_hidden_states, return_dict=False) 101 | return out_tuple 102 | 103 | class NeuronUNet(nn.Module): 104 | def __init__(self, unetwrap): 105 | super().__init__() 106 | self.unetwrap = unetwrap 107 | self.config = unetwrap.unet.config 108 | self.in_channels = unetwrap.unet.in_channels 109 | self.device = unetwrap.unet.device 110 | 111 | def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None, return_dict=False): 112 | sample = self.unetwrap(sample, timestep.to(dtype=DTYPE).expand((sample.shape[0],)), encoder_hidden_states)[0] 113 | return UNet2DConditionOutput(sample=sample) 114 | 115 | class NeuronTextEncoder(nn.Module): 116 | def __init__(self, text_encoder): 117 | super().__init__() 118 | self.neuron_text_encoder = text_encoder 119 | self.config = text_encoder.config 120 | self.dtype = text_encoder.dtype 121 | self.device = text_encoder.device 122 | 123 | def forward(self, emb, attention_mask = None): 124 | return [self.neuron_text_encoder(emb)['last_hidden_state']] 125 | 126 | def decode_latents(self, latents): 127 | latents = latents.to(torch.float) 128 | latents = 1 / self.vae.config.scaling_factor * latents 129 | image = self.vae.decode(latents).sample 130 | image = (image / 2 + 0.5).clamp(0, 1) 131 | image = image.cpu().permute(0, 2, 3, 1).float().numpy() 132 | return image 133 | 134 | #StableDiffusionPipeline.decode_latents = decode_latents 135 | DiffusionPipeline.decode_latents = decode_latents 136 | 137 | # --- Load all compiled models and benchmark pipeline --- 138 | COMPILER_WORKDIR_ROOT = model_dir 139 | #model_id = "stabilityai/stable-diffusion-2-1-base" 140 | text_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder/model.pt') 141 | decoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder/model.pt') 142 | unet_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'unet/model.pt') 143 | post_quant_conv_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv/model.pt') 144 | 145 | #pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=DTYPE) 146 | pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=DTYPE) 147 | if device=='cuda': 148 | pipe = pipe.to("cuda") 149 | 150 | pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) 151 | 152 | if device=='xla': 153 | # Load the compiled UNet onto two neuron cores. 154 | pipe.unet = NeuronUNet(UNetWrap(pipe.unet)) 155 | device_ids = [0,1] 156 | pipe.unet.unetwrap = torch_neuronx.DataParallel(torch.jit.load(unet_filename), device_ids, set_dynamic_batching=False) 157 | 158 | class NeuronTypeConversionWrapper(nn.Module): 159 | def __init__(self, network): 160 | super().__init__() 161 | self.network = network 162 | 163 | def forward(self, x): 164 | return self.network(x.float()) 165 | 166 | if device=='xla': 167 | # Load other compiled models onto a single neuron core. 
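# Note on the wrapper defined above: NeuronTypeConversionWrapper casts incoming tensors
# to float32 before invoking the compiled graph, because the VAE decoder and
# post_quant_conv were traced with float32 example inputs (see sd2_512_compile.py)
# while the rest of the pipeline runs in bfloat16. These smaller sub-models each fit
# on a single NeuronCore, unlike the UNet above.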
168 | pipe.text_encoder = NeuronTextEncoder(pipe.text_encoder) 169 | pipe.text_encoder.neuron_text_encoder = torch.jit.load(text_encoder_filename) 170 | pipe.vae.decoder = NeuronTypeConversionWrapper(torch.jit.load(decoder_filename)) 171 | pipe.vae.post_quant_conv = NeuronTypeConversionWrapper(torch.jit.load(post_quant_conv_filename)) 172 | 173 | prompt = "a photo of an astronaut riding a horse on mars" 174 | n_runs = 20 175 | #benchmark(n_runs, "stable_diffusion_512", pipe, prompt) 176 | 177 | def text2img(PROMPT): 178 | start_time = time.time() 179 | image = pipe(PROMPT).images[0] 180 | total_time = time.time()-start_time 181 | r1 = random.randint(0,99999) 182 | imgname="image"+str(r1)+".png" 183 | image.save(imgname) 184 | image = mpimg.imread(imgname) 185 | return image, str(total_time) 186 | 187 | def prompt_paint(input_image, source_prompt, result_prompt): 188 | processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") 189 | model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined") 190 | prompts = source_prompt.split(sep=',') 191 | inputs = processor(text=prompts, images=[input_image] * len(prompts), padding="max_length", return_tensors="pt") 192 | with torch.no_grad(): 193 | outputs = model(**inputs) 194 | #DEBUG need to check 195 | input_image.convert('RGB').resize((512, 512)).save("/init_image.png", "PNG") 196 | #DEBUG need to check 197 | preds = outputs.logits.unsqueeze(0) 198 | filename = f"/mask.png" 199 | plt.imsave(filename,torch.sigmoid(preds[0][0])) 200 | maskimage=Image.open(filename) 201 | 202 | with gr.Blocks() as app: 203 | gr.Markdown("# stable-diffusion-2-inpainting") 204 | with gr.Tab("Prompt basic"): 205 | with gr.Row(): 206 | input_image = gr.Image(label = 'Upload your input image', type = 'pil') 207 | source_prompt = gr.Textbox(label="What is in the input image you want to change? 
PLEASE add comma at the end") 208 | result_prompt = gr.Textbox(label="Replace it with?") 209 | image_output = gr.Image() 210 | image_button = gr.Button("Generate") 211 | image_button.click(prompt_paint, inputs=[input_image, source_prompt, result_prompt], outputs=image_output) 212 | 213 | app.launch(share = True,server_name="0.0.0.0",debug = True,server_port=8000) 214 | #app = FastAPI() 215 | #io = gr.Interface(fn=text2img,inputs=["text"], 216 | # outputs = [gr.Image(height=512, width=512), "text"], 217 | # title = 'Stable Diffusion 2.1 in AWS EC2 ' + device + ' instance') 218 | #@app.get("/") 219 | #def read_main(): 220 | # return {"message": "This is Stable Diffusion 2.1 in AWS EC2 " + device + "instance; try /load/{n_runs} or /serve"} 221 | #@app.get("/load/{n_runs}") 222 | #def load(n_runs: int): 223 | # prompt = "a photo of an astronaut riding a horse on mars" 224 | # #n_runs = 20 225 | # report=benchmark(n_runs, "stable_diffusion_512", pipe, prompt) 226 | # return {"message": "benchmark report:"+report} 227 | #app = gr.mount_gradio_app(app, io, path="/serve") 228 | -------------------------------------------------------------------------------- /app/sd2_512_benchmark.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["NEURON_FUSE_SOFTMAX"] = "1" 3 | model_id=os.environ['MODEL_ID'] 4 | device=os.environ["DEVICE"] 5 | model_dir=os.environ['COMPILER_WORKDIR_ROOT'] 6 | import gradio as gr 7 | import random 8 | from matplotlib import image as mpimg 9 | from matplotlib import pyplot as plt 10 | import torch 11 | import torch.nn as nn 12 | if device=='xla': 13 | import torch_neuronx 14 | 15 | from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler,EulerAncestralDiscreteScheduler 16 | from diffusers.models.unet_2d_condition import UNet2DConditionOutput 17 | 18 | import time 19 | import math 20 | 21 | # Define datatype 22 | DTYPE = torch.bfloat16 23 | 24 | # Specialized benchmarking class for stable diffusion. 25 | # We cannot use any of the pre-existing benchmarking utilities to benchmark E2E stable diffusion performance, 26 | # because the top-level StableDiffusionPipeline cannot be serialized into a single Torchscript object. 27 | # All of the pre-existing benchmarking utilities (in neuronperf or torch_neuronx) require the model to be a 28 | # traced Torchscript. 
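# The LatencyCollector defined below records one wall-clock duration per pipeline call
# and reports index-based percentiles. Worked example (illustrative values): with
# latencies [2.0, 3.0, 4.0, 5.0] seconds, percentile(50) computes pos_float = 4*50/100 = 2.0,
# so pos_floor = pos_ceil = 2 and it returns sorted(latencies)[2] = 4.0 s, printed as 4000.0 ms.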
29 | def benchmark(n_runs, test_name, model, model_inputs): 30 | if not isinstance(model_inputs, tuple): 31 | model_inputs=model_inputs 32 | 33 | warmup_run = model(**model_inputs) 34 | 35 | latency_collector = LatencyCollector() 36 | # can't use register_forward_pre_hook or register_forward_hook because StableDiffusionPipeline is not a torch.nn.Module 37 | 38 | for _ in range(n_runs): 39 | latency_collector.pre_hook() 40 | res = model(**model_inputs) 41 | latency_collector.hook() 42 | 43 | p0_latency_ms = latency_collector.percentile(0) * 1000 44 | p50_latency_ms = latency_collector.percentile(50) * 1000 45 | p90_latency_ms = latency_collector.percentile(90) * 1000 46 | p95_latency_ms = latency_collector.percentile(95) * 1000 47 | p99_latency_ms = latency_collector.percentile(99) * 1000 48 | p100_latency_ms = latency_collector.percentile(100) * 1000 49 | 50 | report_dict = dict() 51 | report_dict["Latency P0"] = f'{p0_latency_ms:.1f}' 52 | report_dict["Latency P50"]=f'{p50_latency_ms:.1f}' 53 | report_dict["Latency P90"]=f'{p90_latency_ms:.1f}' 54 | report_dict["Latency P95"]=f'{p95_latency_ms:.1f}' 55 | report_dict["Latency P99"]=f'{p99_latency_ms:.1f}' 56 | report_dict["Latency P100"]=f'{p100_latency_ms:.1f}' 57 | 58 | report = f'RESULT FOR {test_name}:' 59 | for key, value in report_dict.items(): 60 | report += f' {key}={value}' 61 | print(report) 62 | 63 | class LatencyCollector: 64 | def __init__(self): 65 | self.start = None 66 | self.latency_list = [] 67 | 68 | def pre_hook(self, *args): 69 | self.start = time.time() 70 | 71 | def hook(self, *args): 72 | self.latency_list.append(time.time() - self.start) 73 | 74 | def percentile(self, percent): 75 | latency_list = self.latency_list 76 | pos_float = len(latency_list) * percent / 100 77 | max_pos = len(latency_list) - 1 78 | pos_floor = min(math.floor(pos_float), max_pos) 79 | pos_ceil = min(math.ceil(pos_float), max_pos) 80 | latency_list = sorted(latency_list) 81 | return latency_list[pos_ceil] if pos_float - pos_floor > 0.5 else latency_list[pos_floor] 82 | 83 | 84 | class UNetWrap(nn.Module): 85 | def __init__(self, unet): 86 | super().__init__() 87 | self.unet = unet 88 | 89 | def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None): 90 | out_tuple = self.unet(sample, timestep, encoder_hidden_states, return_dict=False) 91 | return out_tuple 92 | 93 | class NeuronUNet(nn.Module): 94 | def __init__(self, unetwrap): 95 | super().__init__() 96 | self.unetwrap = unetwrap 97 | self.config = unetwrap.unet.config 98 | self.in_channels = unetwrap.unet.in_channels 99 | self.device = unetwrap.unet.device 100 | 101 | def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None, return_dict=False): 102 | sample = self.unetwrap(sample, timestep.to(dtype=DTYPE).expand((sample.shape[0],)), encoder_hidden_states)[0] 103 | return UNet2DConditionOutput(sample=sample) 104 | 105 | class NeuronTextEncoder(nn.Module): 106 | def __init__(self, text_encoder): 107 | super().__init__() 108 | self.neuron_text_encoder = text_encoder 109 | self.config = text_encoder.config 110 | self.dtype = text_encoder.dtype 111 | self.device = text_encoder.device 112 | 113 | def forward(self, emb, attention_mask = None): 114 | return [self.neuron_text_encoder(emb)['last_hidden_state']] 115 | 116 | def decode_latents(self, latents): 117 | latents = latents.to(torch.float) 118 | latents = 1 / self.vae.config.scaling_factor * latents 119 | image = self.vae.decode(latents).sample 120 | image = (image / 2 + 0.5).clamp(0, 
1) 121 | image = image.cpu().permute(0, 2, 3, 1).float().numpy() 122 | return image 123 | 124 | StableDiffusionPipeline.decode_latents = decode_latents 125 | 126 | # --- Load all compiled models and benchmark pipeline --- 127 | COMPILER_WORKDIR_ROOT = model_dir 128 | #model_id = "stabilityai/stable-diffusion-2-1-base" 129 | text_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder/model.pt') 130 | decoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder/model.pt') 131 | unet_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'unet/model.pt') 132 | post_quant_conv_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv/model.pt') 133 | 134 | pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=DTYPE) 135 | 136 | if device=='cuda': 137 | pipe = pipe.to("cuda") 138 | pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) 139 | pipe.unet.to(memory_format=torch.channels_last) 140 | pipe.vae.to(memory_format=torch.channels_last) 141 | pipe.unet = torch.compile(pipe.unet, fullgraph=True, mode="max-autotune") 142 | 143 | pipe.text_encoder = torch.compile( 144 | pipe.text_encoder, 145 | fullgraph=True, 146 | mode="max-autotune", 147 | ) 148 | pipe.vae.decoder = torch.compile( 149 | pipe.vae.decoder, 150 | fullgraph=True, 151 | mode="max-autotune", 152 | ) 153 | pipe.vae.post_quant_conv = torch.compile( 154 | pipe.vae.post_quant_conv, 155 | fullgraph=True, 156 | mode="max-autotune-no-cudagraphs", 157 | ) 158 | 159 | if device=='xla': 160 | pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) 161 | # Load the compiled UNet onto two neuron cores. 162 | pipe.unet = NeuronUNet(UNetWrap(pipe.unet)) 163 | device_ids = [0,1] 164 | pipe.unet.unetwrap = torch_neuronx.DataParallel(torch.jit.load(unet_filename), device_ids, set_dynamic_batching=False) 165 | 166 | class NeuronTypeConversionWrapper(nn.Module): 167 | def __init__(self, network): 168 | super().__init__() 169 | self.network = network 170 | 171 | def forward(self, x): 172 | return self.network(x.float()) 173 | 174 | if device=='xla': 175 | # Load other compiled models onto a single neuron core. 
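# Once these sub-models are loaded, the benchmark further down runs n_runs = 20
# generations of 20 inference steps at 512x512 and prints the P0-P100 latencies
# in milliseconds for the selected device.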
176 | pipe.text_encoder = NeuronTextEncoder(pipe.text_encoder) 177 | pipe.text_encoder.neuron_text_encoder = torch.jit.load(text_encoder_filename) 178 | pipe.vae.decoder = NeuronTypeConversionWrapper(torch.jit.load(decoder_filename)) 179 | pipe.vae.post_quant_conv = NeuronTypeConversionWrapper(torch.jit.load(post_quant_conv_filename)) 180 | 181 | prompt = "a photo of an astronaut riding a horse on mars" 182 | n_runs = 20 183 | num_inference_steps = 20 184 | height = 512 185 | width = 512 186 | model_args={'prompt': prompt, 'height': height, 'width': width, 'num_inference_steps': num_inference_steps,} 187 | benchmark(n_runs, "stable_diffusion_512", pipe, model_args) 188 | 189 | def text2img(PROMPT): 190 | start_time = time.time() 191 | image = pipe(PROMPT).images[0] 192 | total_time = time.time()-start_time 193 | r1 = random.randint(0,99999) 194 | imgname="image"+str(r1)+".png" 195 | image.save(imgname) 196 | image = mpimg.imread(imgname) 197 | return image, str(total_time) 198 | 199 | #app = gr.Interface(fn=text2img,inputs=["text"], 200 | # outputs = [gr.Image(height=512, width=512), "text"], 201 | # title = 'Stable Diffusion 2.1 in AWS EC2 ' + device + ' instance') 202 | #app.queue() 203 | #app.launch(share = True,server_name="0.0.0.0",debug = False) 204 | -------------------------------------------------------------------------------- /app/sd2_512_compile.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["NEURON_FUSE_SOFTMAX"] = "1" 3 | model_id=os.environ['MODEL_ID'] 4 | device=os.environ["DEVICE"] 5 | model_dir=os.environ['COMPILER_WORKDIR_ROOT'] 6 | from pathlib import Path 7 | 8 | import torch 9 | import torch.nn as nn 10 | if device=='xla': 11 | import torch_neuronx 12 | 13 | import copy 14 | from diffusers import StableDiffusionPipeline,DPMSolverMultistepScheduler,EulerAncestralDiscreteScheduler 15 | from diffusers.models.unet_2d_condition import UNet2DConditionOutput 16 | # Compatibility for diffusers<0.18.0 17 | from packaging import version 18 | import diffusers 19 | diffusers_version = version.parse(diffusers.__version__) 20 | use_new_diffusers = diffusers_version >= version.parse('0.18.0') 21 | if use_new_diffusers: 22 | from diffusers.models.attention_processor import Attention 23 | else: 24 | from diffusers.models.cross_attention import CrossAttention 25 | 26 | # Define datatype 27 | DTYPE = torch.bfloat16 28 | 29 | # Have to do this double wrapper trick to compile the unet, because 30 | # of the special UNet2DConditionOutput output type. 
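# Sketch of how the two wrappers below fit together (descriptive only, nothing extra runs
# here): torch_neuronx.trace() is handed UNetWrap, whose forward returns a plain tuple
# instead of the UNet2DConditionOutput dataclass, and at inference time NeuronUNet wraps
# the traced module and rebuilds UNet2DConditionOutput(sample=...) so the diffusers
# scheduler still receives the return type it expects.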
31 | class UNetWrap(nn.Module): 32 | def __init__(self, unet): 33 | super().__init__() 34 | self.unet = unet 35 | 36 | def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None): 37 | out_tuple = self.unet(sample, timestep, encoder_hidden_states, return_dict=False) 38 | return out_tuple 39 | 40 | class NeuronUNet(nn.Module): 41 | def __init__(self, unetwrap): 42 | super().__init__() 43 | self.unetwrap = unetwrap 44 | self.config = unetwrap.unet.config 45 | self.in_channels = unetwrap.unet.in_channels 46 | self.device = unetwrap.unet.device 47 | 48 | def forward(self, sample, timestep, encoder_hidden_states, cross_attention_kwargs=None): 49 | sample = self.unetwrap(sample, timestep.to(dtype=DTYPE).expand((sample.shape[0],)), encoder_hidden_states)[0] 50 | return UNet2DConditionOutput(sample=sample) 51 | 52 | class NeuronTextEncoder(nn.Module): 53 | def __init__(self, text_encoder): 54 | super().__init__() 55 | self.neuron_text_encoder = text_encoder 56 | self.config = text_encoder.config 57 | self.dtype = text_encoder.dtype 58 | self.device = text_encoder.device 59 | 60 | def forward(self, emb, attention_mask = None): 61 | return [self.neuron_text_encoder(emb)['last_hidden_state']] 62 | 63 | 64 | # Optimized attention 65 | def get_attention_scores(self, query, key, attn_mask): 66 | dtype = query.dtype 67 | 68 | if self.upcast_attention: 69 | query = query.float() 70 | key = key.float() 71 | 72 | # Check for square matmuls 73 | if(query.size() == key.size()): 74 | attention_scores = custom_badbmm( 75 | key, 76 | query.transpose(-1, -2) 77 | ) 78 | 79 | if self.upcast_softmax: 80 | attention_scores = attention_scores.float() 81 | 82 | attention_probs = attention_scores.softmax(dim=1).permute(0,2,1) 83 | attention_probs = attention_probs.to(dtype) 84 | 85 | else: 86 | attention_scores = custom_badbmm( 87 | query, 88 | key.transpose(-1, -2) 89 | ) 90 | 91 | if self.upcast_softmax: 92 | attention_scores = attention_scores.float() 93 | 94 | attention_probs = attention_scores.softmax(dim=-1) 95 | attention_probs = attention_probs.to(dtype) 96 | 97 | return attention_probs 98 | 99 | # In the original badbmm the bias is all zeros, so only apply scale 100 | def custom_badbmm(a, b): 101 | bmm = torch.bmm(a, b) 102 | scaled = bmm * 0.125 103 | return scaled 104 | 105 | 106 | # For saving compiler artifacts 107 | COMPILER_WORKDIR_ROOT = 'sd2_compile_dir_512' 108 | 109 | # Model ID for SD version pipeline 110 | #model_id = "stabilityai/stable-diffusion-2-1-base" 111 | 112 | pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=DTYPE) 113 | if device=='cuda': 114 | pipe = pipe.to("cuda") 115 | pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) 116 | pipe.unet.to(memory_format=torch.channels_last) 117 | pipe.vae.to(memory_format=torch.channels_last) 118 | pipe.unet = torch.compile(pipe.unet, fullgraph=True, mode="max-autotune") 119 | unet_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'unet/model.pt') 120 | torch.save(pipe.unet.state_dict(), unet_filename) 121 | 122 | pipe.text_encoder = torch.compile( 123 | pipe.text_encoder, 124 | fullgraph=True, 125 | mode="max-autotune", 126 | ) 127 | text_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder/model.pt') 128 | torch.save(pipe.text_encoder.state_dict(), text_encoder_filename) 129 | 130 | pipe.vae.decoder = torch.compile( 131 | pipe.vae.decoder, 132 | fullgraph=True, 133 | mode="max-autotune", 134 | ) 135 | decoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 
'vae_decoder/model.pt') 136 | torch.save(pipe.vae.decoder.state_dict(), decoder_filename) 137 | 138 | pipe.vae.post_quant_conv = torch.compile( 139 | pipe.vae.post_quant_conv, 140 | fullgraph=True, 141 | mode="max-autotune-no-cudagraphs", 142 | ) 143 | post_quant_conv_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv/model.pt') 144 | torch.save(pipe.vae.post_quant_conv.state_dict(), post_quant_conv_filename) 145 | 146 | # --- Compile UNet and save --- 147 | # Replace original cross-attention module with custom cross-attention module for better performance 148 | if use_new_diffusers: 149 | Attention.get_attention_scores = get_attention_scores 150 | else: 151 | CrossAttention.get_attention_scores = get_attention_scores 152 | 153 | if device=='xla': 154 | # Apply double wrapper to deal with custom return type 155 | pipe.unet = NeuronUNet(UNetWrap(pipe.unet)) 156 | 157 | # Only keep the model being compiled in RAM to minimze memory pressure 158 | unet = copy.deepcopy(pipe.unet.unetwrap) 159 | del pipe 160 | 161 | # Compile unet - FP32 162 | sample_1b = torch.randn([1, 4, 64, 64], dtype=DTYPE) 163 | timestep_1b = torch.tensor(999, dtype=DTYPE).expand((1,)) 164 | encoder_hidden_states_1b = torch.randn([1, 77, 1024], dtype=DTYPE) 165 | example_inputs = sample_1b, timestep_1b, encoder_hidden_states_1b 166 | 167 | if device=='xla': 168 | unet_neuron = torch_neuronx.trace( 169 | unet, 170 | example_inputs, 171 | compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'unet'), 172 | compiler_args=["--model-type=unet-inference", "--enable-fast-loading-neuron-binaries"] 173 | ) 174 | 175 | # Enable asynchronous and lazy loading to speed up model load 176 | torch_neuronx.async_load(unet_neuron) 177 | torch_neuronx.lazy_load(unet_neuron) 178 | 179 | # save compiled unet 180 | unet_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'unet/model.pt') 181 | torch.jit.save(unet_neuron, unet_filename) 182 | 183 | # delete unused objects 184 | del unet 185 | del unet_neuron 186 | 187 | # --- Compile CLIP text encoder and save --- 188 | 189 | # Only keep the model being compiled in RAM to minimze memory pressure 190 | pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=DTYPE) 191 | text_encoder = copy.deepcopy(pipe.text_encoder) 192 | del pipe 193 | 194 | if device=='xla': 195 | # Apply the wrapper to deal with custom return type 196 | text_encoder = NeuronTextEncoder(text_encoder) 197 | 198 | # Compile text encoder 199 | # This is used for indexing a lookup table in torch.nn.Embedding, 200 | # so using random numbers may give errors (out of range). 
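# The example input below is a single prompt already tokenized by the CLIP tokenizer:
# 49406 is the start-of-text id, 49407 the end-of-text id, and the sequence is zero-padded
# to the tokenizer's fixed length of 77. Because tracing freezes input shapes, 77 tokens
# is the only sequence length the compiled text encoder will accept at inference time.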
201 | emb = torch.tensor([[49406, 18376, 525, 7496, 49407, 0, 0, 0, 0, 0, 202 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 203 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 204 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 205 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 206 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 207 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 208 | 0, 0, 0, 0, 0, 0, 0]]) 209 | 210 | if device=='xla': 211 | text_encoder_neuron = torch_neuronx.trace( 212 | text_encoder.neuron_text_encoder, 213 | emb, 214 | compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder'), 215 | compiler_args=["--enable-fast-loading-neuron-binaries"] 216 | ) 217 | 218 | # Enable asynchronous loading to speed up model load 219 | torch_neuronx.async_load(text_encoder_neuron) 220 | 221 | # Save the compiled text encoder 222 | text_encoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'text_encoder/model.pt') 223 | torch.jit.save(text_encoder_neuron, text_encoder_filename) 224 | 225 | # delete unused objects 226 | del text_encoder 227 | del text_encoder_neuron 228 | 229 | # --- Compile VAE decoder and save --- 230 | 231 | # Only keep the model being compiled in RAM to minimze memory pressure 232 | pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float32) 233 | decoder = copy.deepcopy(pipe.vae.decoder) 234 | del pipe 235 | 236 | # Compile vae decoder 237 | decoder_in = torch.randn([1, 4, 64, 64], dtype=torch.float32) 238 | if device=='xla': 239 | decoder_neuron = torch_neuronx.trace( 240 | decoder, 241 | decoder_in, 242 | compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder'), 243 | compiler_args=["--enable-fast-loading-neuron-binaries"] 244 | ) 245 | 246 | # Enable asynchronous loading to speed up model load 247 | torch_neuronx.async_load(decoder_neuron) 248 | 249 | # Save the compiled vae decoder 250 | decoder_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_decoder/model.pt') 251 | torch.jit.save(decoder_neuron, decoder_filename) 252 | 253 | # delete unused objects 254 | del decoder 255 | del decoder_neuron 256 | 257 | # --- Compile VAE post_quant_conv and save --- 258 | 259 | # Only keep the model being compiled in RAM to minimze memory pressure 260 | pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float32) 261 | post_quant_conv = copy.deepcopy(pipe.vae.post_quant_conv) 262 | del pipe 263 | 264 | # # Compile vae post_quant_conv 265 | post_quant_conv_in = torch.randn([1, 4, 64, 64], dtype=torch.float32) 266 | if device=='xla': 267 | post_quant_conv_neuron = torch_neuronx.trace( 268 | post_quant_conv, 269 | post_quant_conv_in, 270 | compiler_workdir=os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv'), 271 | ) 272 | 273 | # Enable asynchronous loading to speed up model load 274 | torch_neuronx.async_load(post_quant_conv_neuron) 275 | 276 | # # Save the compiled vae post_quant_conv 277 | post_quant_conv_filename = os.path.join(COMPILER_WORKDIR_ROOT, 'vae_post_quant_conv/model.pt') 278 | torch.jit.save(post_quant_conv_neuron, post_quant_conv_filename) 279 | 280 | # delete unused objects 281 | del post_quant_conv 282 | del post_quant_conv_neuron 283 | 284 | -------------------------------------------------------------------------------- /app/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | STAGE=$1 4 | 5 | token=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") 6 | if [ -z "$token" ]; then 7 | echo "No token for IMDS_v2 - check /api/token url" 8 | exit 9 | fi 10 | 
instance_type=$(curl -s -H "X-aws-ec2-metadata-token: $token" 169.254.169.254/latest/meta-data/instance-type) 11 | if [ -z "$instance_type" ]; then 12 | echo "cant find the instance type. cant continue" 13 | exit 14 | fi 15 | echo "instance_type="$instance_type 16 | 17 | if [[ $instance_type == "inf"* ]]; then 18 | echo "export PATH=/opt/aws/neuron/bin:\$PATH" >> /root/.bashrc 19 | echo "export TERM=screen" >> /root/.bashrc 20 | echo "export DEVICE=xla" >> /root/.bashrc 21 | fi 22 | if [[ $instance_type == "g"* ]]; then 23 | echo "export DEVICE=cuda" >> /root/.bashrc 24 | fi 25 | . /root/.bashrc 26 | 27 | if [[ $STAGE == "compile" ]]; then 28 | /compile-model.sh 29 | elif [[ $STAGE == "run" ]]; then 30 | /run-model.sh 31 | elif [[ $STAGE == "run1" ]]; then 32 | /run1-model.sh 33 | else 34 | echo "stage " $STAGE" is not supported" 35 | exit 36 | fi 37 | -------------------------------------------------------------------------------- /appsimulator_sa.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: ClusterRole 3 | apiVersion: rbac.authorization.k8s.io/v1 4 | metadata: 5 | name: appsimulator 6 | namespace: default 7 | rules: 8 | - apiGroups: 9 | - "" 10 | resources: 11 | - "*" 12 | verbs: 13 | - "*" 14 | - apiGroups: 15 | - rbac.authorization.k8s.io 16 | - extensions 17 | - apps 18 | resources: 19 | - "*" 20 | verbs: 21 | - "*" 22 | - apiGroups: 23 | - apiextensions.k8s.io 24 | resources: 25 | - customresourcedefinitions 26 | - pods 27 | - deployments 28 | verbs: 29 | - get 30 | - list 31 | - watch 32 | - create 33 | - delete 34 | --- 35 | apiVersion: v1 36 | kind: ServiceAccount 37 | metadata: 38 | name: appsimulator 39 | namespace: default 40 | --- 41 | kind: ClusterRoleBinding 42 | apiVersion: rbac.authorization.k8s.io/v1 43 | metadata: 44 | name: appsimulator 45 | namespace: default 46 | subjects: 47 | - kind: ServiceAccount 48 | name: appsimulator 49 | namespace: default 50 | roleRef: 51 | kind: ClusterRole 52 | name: appsimulator 53 | apiGroup: rbac.authorization.k8s.io 54 | -------------------------------------------------------------------------------- /infer-in-region-g5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/edge_diffusion_on_eks/05e4dfefe30cc1965f7d9771e99513570f62d867/infer-in-region-g5.png -------------------------------------------------------------------------------- /infer-in-region.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/edge_diffusion_on_eks/05e4dfefe30cc1965f7d9771e99513570f62d867/infer-in-region.png -------------------------------------------------------------------------------- /infra-build/README.md: -------------------------------------------------------------------------------- 1 | * Exec the following: 2 | ```bash 3 | npm uninstall -g aws-cdk 4 | npm install -g aws-cdk 5 | ``` 6 | * Export the following variables 7 | ```bash 8 | export AWS_ACCOUNT_ID=$(aws sts get-caller-identity --output text --query Account) 9 | export AWS_REGION=us-west-2 10 | export CF_STACK=yahavb-cdk-k8s 11 | export CLUSTER=test5 12 | ``` 13 | -------------------------------------------------------------------------------- /infra-build/deploy-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm -rf cdk.* package* node_modules/ 3 | npm install -g aws-cdk 4 | npm install aws-cdk-lib@2.115.0 5 | npm 
i @aws-quickstart/eks-blueprints@1.13.1 6 | . ~/.bash_profile 7 | cdk bootstrap aws://$AWS_ACCOUNT_ID/$AWS_REGION 8 | npm install 9 | cdk deploy --app "npx ts-node --prefer-ts-exts ./eks-cluster.ts" --all 10 | -------------------------------------------------------------------------------- /infra-build/eks-cluster-stack.ts: -------------------------------------------------------------------------------- 1 | import * as cdk from 'aws-cdk-lib'; 2 | import * as blueprints from '@aws-quickstart/eks-blueprints'; 3 | import {GlobalResources } from "@aws-quickstart/eks-blueprints"; 4 | import {VpcResourceProvider } from "./vpc-resource-provider"; 5 | import {EndpointAccess,MachineImageType} from 'aws-cdk-lib/aws-eks'; 6 | import {SubnetFilter,SubnetType} from "aws-cdk-lib/aws-ec2"; 7 | import {AccountRootPrincipal} from "aws-cdk-lib/aws-iam"; 8 | 9 | const version = 'auto'; 10 | let cluster_name = process.env.CLUSTER as string; 11 | 12 | export class EksClusterStack extends cdk.Stack { 13 | constructor(scope: cdk.App, id: string, props?: cdk.StackProps) { 14 | super(scope, id, props); 15 | 16 | const account = this.account; 17 | const region = this.region; 18 | 19 | const clusterVPC = new VpcResourceProvider(); 20 | 21 | const addOns: Array = [ 22 | new blueprints.addons.MetricsServerAddOn(), 23 | new blueprints.addons.AwsLoadBalancerControllerAddOn(), 24 | new blueprints.addons.VpcCniAddOn(), 25 | new blueprints.addons.CoreDnsAddOn(), 26 | new blueprints.addons.GpuOperatorAddon() 27 | ]; 28 | 29 | const nodesProvider = new blueprints.GenericClusterProvider( 30 | { 31 | clusterName: `${cluster_name}`, 32 | vpcSubnets: [{ availabilityZones: ['us-west-2a','us-west-2b','us-west-2c','us-west-2d'] }], 33 | endpointAccess: EndpointAccess.PUBLIC, 34 | autoscalingNodeGroups: [ 35 | { 36 | id: "core", 37 | autoScalingGroupName: "core", 38 | allowAllOutbound: true, 39 | desiredSize: 3, 40 | minSize: 1, 41 | maxSize: 3, 42 | machineImageType: MachineImageType.AMAZON_LINUX_2, 43 | nodeGroupSubnets: {subnetType: SubnetType.PUBLIC ,subnetFilters: [SubnetFilter.availabilityZones(['us-west-2a', 'us-west-2b', 'us-west-2c', 'us-west-2d'])]}, 44 | }, 45 | { 46 | id: "edge", 47 | autoScalingGroupName: "edge", 48 | allowAllOutbound: true, 49 | desiredSize: 3, 50 | minSize: 1, 51 | maxSize: 3, 52 | machineImageType: MachineImageType.AMAZON_LINUX_2, 53 | nodeGroupSubnets: {subnetType: SubnetType.PUBLIC ,subnetFilters: [SubnetFilter.availabilityZones(['us-west-2-lax-1a'])]}, 54 | }, 55 | ] 56 | } 57 | ) 58 | 59 | const stack = blueprints.EksBlueprint.builder() 60 | .resourceProvider(GlobalResources.Vpc, clusterVPC) 61 | .clusterProvider(nodesProvider) 62 | .account(account) 63 | .region(region) 64 | .version(version) 65 | .addOns(...addOns) 66 | .teams() 67 | .useDefaultSecretEncryption(false) //false to turn secret encryption off (demo cases) 68 | .build(this, cluster_name); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /infra-build/eks-cluster.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import 'source-map-support/register'; 3 | import * as cdk from 'aws-cdk-lib'; 4 | import { EksClusterStack } from './eks-cluster-stack'; 5 | 6 | const app = new cdk.App(); 7 | let stack = process.env.CF_STACK as string; 8 | 9 | new EksClusterStack(app,stack,{ 10 | env: {account: process.env.CDK_DEFAULT_ACCOUNT,region: process.env.CDK_DEFAULT_REGION }, 11 | }); 12 | 
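// Usage sketch (variable names as exported in infra-build/README.md): set CF_STACK to the
// CloudFormation stack name and CLUSTER to the EKS cluster name, then run deploy-cluster.sh,
// which effectively executes `cdk deploy --app "npx ts-node --prefer-ts-exts ./eks-cluster.ts" --all`.
// CDK_DEFAULT_ACCOUNT and CDK_DEFAULT_REGION are resolved by the CDK CLI from the active
// AWS credentials, so no account or region is hard-coded here.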
-------------------------------------------------------------------------------- /infra-build/vpc-resource-provider.ts: -------------------------------------------------------------------------------- 1 | import {Stack,StackProps, App} from 'aws-cdk-lib'; 2 | import * as ec2 from 'aws-cdk-lib/aws-ec2'; 3 | import {ResourceContext, ResourceProvider} from "@aws-quickstart/eks-blueprints"; 4 | import {IVpc, SubnetType} from "aws-cdk-lib/aws-ec2"; 5 | 6 | export class VpcResourceProvider implements ResourceProvider { 7 | provide(context: ResourceContext): IVpc { 8 | return new ec2.Vpc(context.scope, 'vpc', { 9 | ipAddresses: ec2.IpAddresses.cidr('10.0.0.0/16'), 10 | natGateways: 0, 11 | availabilityZones: ['us-west-2a','us-west-2b','us-west-2c','us-west-2d','us-west-2-lax-1a'], 12 | subnetConfiguration: [ 13 | { 14 | cidrMask: 26, 15 | name: 'public', 16 | subnetType: SubnetType.PUBLIC, 17 | mapPublicIpOnLaunch: true 18 | }, 19 | { 20 | cidrMask: 26, 21 | name: 'private', 22 | subnetType: SubnetType.PRIVATE_WITH_EGRESS 23 | }, 24 | ] 25 | }); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /model-ci-build/README.md: -------------------------------------------------------------------------------- 1 | 2 | * Fork https://github.com/aws-samples/edge_diffusion_on_eks/ and populate the `GITHUB_USER`. 3 | * Check the latest [DLC](https://github.com/aws/deep-learning-containers/blob/master/available_images.md) for `BASE_IMAGE_AMD_XLA_TAG` and `BASE_IMAGE_AMD_CUD_TAG` values. 4 | * Export the following variables 5 | ```bash 6 | export AWS_ACCOUNT_ID=$(aws sts get-caller-identity --output text --query Account) 7 | export AWS_REGION=us-west-2 8 | export BASE_IMAGE_AMD_XLA_TAG=1.13.1-neuronx-py310-sdk2.17.0-ubuntu20.04 9 | export BASE_IMAGE_AMD_CUD_TAG=2.0.1-gpu-py310-cu118-ubuntu20.04-ec2 10 | export BASE_IMAGE_ARM_CUD_TAG=pytorch-inference-graviton-2.1.0-cpu-py310-ubuntu20.04-ec2 11 | export IMAGE_AMD_XLA_TAG=amd64-neuron 12 | export IMAGE_AMD_CUD_TAG=amd64-cuda 13 | export IMAGE_ARM_CUD_TAG=arm64-cuda 14 | export BASE_REPO=stablediffusion 15 | export BASE_TAG=multiarch-ubuntu 16 | export BASE_ARM_TAG=arm64 17 | export BASE_AMD_TAG=amd64 18 | export GITHUB_BRANCH=master 19 | export GITHUB_USER=yahavb 20 | export GITHUB_REPO=edge_diffusion_on_eks 21 | export MODEL_DIR=sd2_compile_dir 22 | ``` 23 | 24 | ```bash 25 | cd ci-build 26 | ./deploy-pipeline.sh 27 | ``` 28 | -------------------------------------------------------------------------------- /model-ci-build/deploy-pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | npm install aws-cdk-lib 4 | . 
~/.bash_profile 5 | cdk bootstrap aws://$AWS_ACCOUNT_ID/$AWS_REGION 6 | npm install 7 | cdk deploy --app "npx ts-node --prefer-ts-exts ./pipeline.ts" --parameters BASEIMAGEAMDXLATAG=$BASE_IMAGE_AMD_XLA_TAG --parameters BASEIMAGEAMDCUDTAG=$BASE_IMAGE_AMD_CUD_TAG --parameters BASEIMAGEARMCUDTAG=$BASE_IMAGE_ARM_CUD_TAG --parameters BASEREPO=$BASE_REPO --parameters IMAGEAMDXLATAG=$IMAGE_AMD_XLA_TAG --parameters IMAGEAMDCUDTAG=$IMAGE_AMD_CUD_TAG --parameters IMAGEARMCUDTAG=$IMAGE_ARM_CUD_TAG --parameters GITHUBREPO=$GITHUB_REPO --parameters GITHUBUSER=$GITHUB_USER --parameters GITHUBBRANCH=$GITHUB_BRANCH --parameters GITHUBOAUTHTOKEN=$GITHUB_OAUTH_TOKEN 8 | -------------------------------------------------------------------------------- /model-ci-build/pipeline-stack.ts: -------------------------------------------------------------------------------- 1 | import { Stack, StackProps,CfnParameter,SecretValue} from 'aws-cdk-lib'; 2 | import { Construct } from 'constructs' 3 | import * as codecommit from 'aws-cdk-lib/aws-codecommit'; 4 | import * as ecr from 'aws-cdk-lib/aws-ecr'; 5 | import * as codebuild from 'aws-cdk-lib/aws-codebuild'; 6 | import * as codepipeline from 'aws-cdk-lib/aws-codepipeline'; 7 | import * as codepipeline_actions from 'aws-cdk-lib/aws-codepipeline-actions'; 8 | import * as iam from "aws-cdk-lib/aws-iam"; 9 | import * as secretsmanager from 'aws-cdk-lib/aws-secretsmanager'; 10 | import * as cdk from 'aws-cdk-lib/core'; 11 | import * as cfn from 'aws-cdk-lib/aws-cloudformation'; 12 | 13 | export class PipelineStack extends Stack { 14 | constructor(scope: Construct, id: string, props?: StackProps) { 15 | super(scope, id, props); 16 | const BASE_REPO = new CfnParameter(this,"BASEREPO",{type:"String"}); 17 | const BASE_IMAGE_AMD_XLA_TAG = new CfnParameter(this,"BASEIMAGEAMDXLATAG",{type:"String"}); 18 | const BASE_IMAGE_AMD_CUD_TAG = new CfnParameter(this,"BASEIMAGEAMDCUDTAG",{type:"String"}); 19 | const BASE_IMAGE_ARM_CUD_TAG = new CfnParameter(this,"BASEIMAGEARMCUDTAG",{type:"String"}); 20 | const IMAGE_AMD_XLA_TAG = new CfnParameter(this,"IMAGEAMDXLATAG",{type:"String"}); 21 | const IMAGE_AMD_CUD_TAG = new CfnParameter(this,"IMAGEAMDCUDTAG",{type:"String"}); 22 | const IMAGE_ARM_CUD_TAG = new CfnParameter(this,"IMAGEARMCUDTAG",{type:"String"}); 23 | const GITHUB_OAUTH_TOKEN = new CfnParameter(this,"GITHUBOAUTHTOKEN",{type:"String"}); 24 | const GITHUB_USER = new CfnParameter(this,"GITHUBUSER",{type:"String"}); 25 | const GITHUB_REPO = new CfnParameter(this,"GITHUBREPO",{type:"String"}); 26 | const GITHUB_BRANCH = new CfnParameter(this,"GITHUBBRANCH",{type:"String"}); 27 | /* uncomment when you test the stack and dont want to manually delete the ecr registry 28 | const base_registry = new ecr.Repository(this,`base_repo`,{ 29 | repositoryName:BASE_REPO.valueAsString, 30 | imageScanOnPush: true 31 | }); 32 | */ 33 | const base_registry = ecr.Repository.fromRepositoryName(this,`base_repo`,BASE_REPO.valueAsString) 34 | 35 | //create a roleARN for codebuild 36 | const buildRole = new iam.Role(this, 'BaseCodeBuildDeployRole',{ 37 | roleName: "BaseCodeBuildDeployRole", 38 | assumedBy: new iam.ServicePrincipal('codebuild.amazonaws.com'), 39 | }); 40 | 41 | buildRole.addToPolicy(new iam.PolicyStatement({ 42 | resources: ['*'], 43 | actions: ['ssm:*','s3:*'], 44 | })); 45 | 46 | const githubSecret = new secretsmanager.Secret(this, 'githubSecret', { 47 | secretObjectValue: { 48 | token: SecretValue.unsafePlainText(GITHUB_OAUTH_TOKEN.valueAsString) 49 | }, 50 | }); 51 | const 
githubOAuthToken = SecretValue.secretsManager(githubSecret.secretArn,{jsonField:'token'}); 52 | new cdk.CfnOutput(this, 'githubOAuthTokenRuntimeOutput1', { 53 | //value: SecretValue.secretsManager("githubtoken",{jsonField: "token"}).toString() 54 | value: githubSecret.secretValueFromJson('token').toString() 55 | }); 56 | new cdk.CfnOutput(this, 'githubOAuthTokenRuntimeOutput2', { 57 | value: SecretValue.secretsManager(githubSecret.secretArn,{jsonField: "token"}).toString() 58 | }); 59 | 60 | const base_image_arm_cud_build = new codebuild.Project(this, `ImageCudArmBuild`, { 61 | environment: {privileged:true,buildImage: codebuild.LinuxBuildImage.AMAZON_LINUX_2_ARM_2}, 62 | cache: codebuild.Cache.local(codebuild.LocalCacheMode.DOCKER_LAYER, codebuild.LocalCacheMode.CUSTOM), 63 | role: buildRole, 64 | buildSpec: codebuild.BuildSpec.fromObject( 65 | { 66 | version: "0.2", 67 | env: { 68 | 'exported-variables': [ 69 | 'AWS_ACCOUNT_ID','AWS_REGION','BASE_REPO','IMAGE_ARM_CUD_TAG','BASE_IMAGE_ARM_CUD_TAG' 70 | ], 71 | }, 72 | phases: { 73 | build: { 74 | commands: [ 75 | `export AWS_ACCOUNT_ID="${this.account}"`, 76 | `export AWS_REGION="${this.region}"`, 77 | `export BASE_REPO="${BASE_REPO.valueAsString}"`, 78 | `export IMAGE_TAG="${IMAGE_ARM_CUD_TAG.valueAsString}"`, 79 | `export BASE_IMAGE_TAG="${BASE_IMAGE_ARM_CUD_TAG.valueAsString}"`, 80 | `cd app`, 81 | `chmod +x ./build.sh && ./build.sh` 82 | ], 83 | } 84 | }, 85 | artifacts: { 86 | files: ['imageDetail.json'] 87 | }, 88 | } 89 | ), 90 | }); 91 | 92 | const base_image_amd_cud_build = new codebuild.Project(this, `ImageCudAmdBuild`, { 93 | environment: {privileged:true,buildImage: codebuild.LinuxBuildImage.AMAZON_LINUX_2_3}, 94 | cache: codebuild.Cache.local(codebuild.LocalCacheMode.DOCKER_LAYER, codebuild.LocalCacheMode.CUSTOM), 95 | role: buildRole, 96 | buildSpec: codebuild.BuildSpec.fromObject( 97 | { 98 | version: "0.2", 99 | env: { 100 | 'exported-variables': [ 101 | 'AWS_ACCOUNT_ID','AWS_REGION','BASE_REPO','IMAGE_AMD_CUD_TAG','BASE_IMAGE_AMD_CUD_TAG' 102 | ], 103 | }, 104 | phases: { 105 | build: { 106 | commands: [ 107 | `export AWS_ACCOUNT_ID="${this.account}"`, 108 | `export AWS_REGION="${this.region}"`, 109 | `export BASE_REPO="${BASE_REPO.valueAsString}"`, 110 | `export IMAGE_TAG="${IMAGE_AMD_CUD_TAG.valueAsString}"`, 111 | `export BASE_IMAGE_TAG="${BASE_IMAGE_AMD_CUD_TAG.valueAsString}"`, 112 | `cd app`, 113 | `chmod +x ./build.sh && ./build.sh` 114 | ], 115 | } 116 | }, 117 | artifacts: { 118 | files: ['imageDetail.json'] 119 | }, 120 | } 121 | ), 122 | }); 123 | 124 | const base_image_amd_xla_build = new codebuild.Project(this, `ImageXlaAmdBuild`, { 125 | environment: {privileged:true,buildImage: codebuild.LinuxBuildImage.AMAZON_LINUX_2_3}, 126 | cache: codebuild.Cache.local(codebuild.LocalCacheMode.DOCKER_LAYER, codebuild.LocalCacheMode.CUSTOM), 127 | role: buildRole, 128 | buildSpec: codebuild.BuildSpec.fromObject( 129 | { 130 | version: "0.2", 131 | env: { 132 | 'exported-variables': [ 133 | 'AWS_ACCOUNT_ID','AWS_REGION','BASE_REPO','IMAGE_AMD_XLA_TAG','BASE_IMAGE_AMD_XLA_TAG' 134 | ], 135 | }, 136 | phases: { 137 | build: { 138 | commands: [ 139 | `export AWS_ACCOUNT_ID="${this.account}"`, 140 | `export AWS_REGION="${this.region}"`, 141 | `export BASE_REPO="${BASE_REPO.valueAsString}"`, 142 | `export IMAGE_TAG="${IMAGE_AMD_XLA_TAG.valueAsString}"`, 143 | `export BASE_IMAGE_TAG="${BASE_IMAGE_AMD_XLA_TAG.valueAsString}"`, 144 | `cd app`, 145 | `chmod +x ./build.sh && ./build.sh` 146 | ], 147 | } 148 | }, 149 | artifacts: { 
150 | files: ['imageDetail.json'] 151 | }, 152 | } 153 | ), 154 | }); 155 | 156 | const assets_image_xla_amd_build = new codebuild.Project(this, `AssetsImageXlaAmdBuild`, { 157 | environment: {privileged:true,buildImage: codebuild.LinuxBuildImage.AMAZON_LINUX_2_3}, 158 | cache: codebuild.Cache.local(codebuild.LocalCacheMode.DOCKER_LAYER, codebuild.LocalCacheMode.CUSTOM), 159 | role: buildRole, 160 | buildSpec: codebuild.BuildSpec.fromObject( 161 | { 162 | version: "0.2", 163 | env: { 164 | 'exported-variables': [ 165 | 'AWS_ACCOUNT_ID','AWS_REGION','BASE_REPO','IMAGE_AMD_XLA_TAG','BASE_IMAGE_AMD_XLA_TAG' 166 | ], 167 | }, 168 | phases: { 169 | build: { 170 | commands: [ 171 | `export AWS_ACCOUNT_ID="${this.account}"`, 172 | `export AWS_REGION="${this.region}"`, 173 | `export BASE_REPO="${BASE_REPO.valueAsString}"`, 174 | `export IMAGE_TAG="${IMAGE_AMD_XLA_TAG.valueAsString}"`, 175 | `export BASE_IMAGE_TAG="${BASE_IMAGE_AMD_XLA_TAG.valueAsString}"`, 176 | `cd app`, 177 | `chmod +x ./build-assets.sh && ./build-assets.sh` 178 | ], 179 | } 180 | }, 181 | artifacts: { 182 | files: ['imageDetail.json'] 183 | }, 184 | } 185 | ), 186 | }); 187 | 188 | const assets_image_cud_arm_build = new codebuild.Project(this, `AssetsImageCudArmBuild`, { 189 | environment: {privileged:true,buildImage: codebuild.LinuxBuildImage.AMAZON_LINUX_2_ARM_2}, 190 | cache: codebuild.Cache.local(codebuild.LocalCacheMode.DOCKER_LAYER, codebuild.LocalCacheMode.CUSTOM), 191 | role: buildRole, 192 | buildSpec: codebuild.BuildSpec.fromObject( 193 | { 194 | version: "0.2", 195 | env: { 196 | 'exported-variables': [ 197 | 'AWS_ACCOUNT_ID','AWS_REGION','BASE_REPO','IMAGE_ARM_CUD_TAG','BASE_IMAGE_ARM_CUD_TAG' 198 | ], 199 | }, 200 | phases: { 201 | build: { 202 | commands: [ 203 | `export AWS_ACCOUNT_ID="${this.account}"`, 204 | `export AWS_REGION="${this.region}"`, 205 | `export BASE_REPO="${BASE_REPO.valueAsString}"`, 206 | `export IMAGE_TAG="${IMAGE_ARM_CUD_TAG.valueAsString}"`, 207 | `export BASE_IMAGE_TAG="${BASE_IMAGE_ARM_CUD_TAG.valueAsString}"`, 208 | `cd app`, 209 | `chmod +x ./build-assets.sh && ./build-assets.sh` 210 | ], 211 | } 212 | }, 213 | artifacts: { 214 | files: ['imageDetail.json'] 215 | }, 216 | } 217 | ), 218 | }); 219 | 220 | const assets_image_cud_amd_build = new codebuild.Project(this, `AssetsImageCudAmdBuild`, { 221 | environment: {privileged:true,buildImage: codebuild.LinuxBuildImage.AMAZON_LINUX_2_3}, 222 | cache: codebuild.Cache.local(codebuild.LocalCacheMode.DOCKER_LAYER, codebuild.LocalCacheMode.CUSTOM), 223 | role: buildRole, 224 | buildSpec: codebuild.BuildSpec.fromObject( 225 | { 226 | version: "0.2", 227 | env: { 228 | 'exported-variables': [ 229 | 'AWS_ACCOUNT_ID','AWS_REGION','BASE_REPO','IMAGE_AMD_CUD_TAG','BASE_IMAGE_AMD_CUD_TAG' 230 | ], 231 | }, 232 | phases: { 233 | build: { 234 | commands: [ 235 | `export AWS_ACCOUNT_ID="${this.account}"`, 236 | `export AWS_REGION="${this.region}"`, 237 | `export BASE_REPO="${BASE_REPO.valueAsString}"`, 238 | `export IMAGE_TAG="${IMAGE_AMD_CUD_TAG.valueAsString}"`, 239 | `export BASE_IMAGE_TAG="${BASE_IMAGE_AMD_CUD_TAG.valueAsString}"`, 240 | `cd app`, 241 | `chmod +x ./build-assets.sh && ./build-assets.sh` 242 | ], 243 | } 244 | }, 245 | artifacts: { 246 | files: ['imageDetail.json'] 247 | }, 248 | } 249 | ), 250 | }); 251 | 252 | //we allow the buildProject principal to push images to ecr 253 | base_registry.grantPullPush(assets_image_cud_amd_build.grantPrincipal); 254 | base_registry.grantPullPush(assets_image_xla_amd_build.grantPrincipal); 255 | 
base_registry.grantPullPush(base_image_amd_xla_build.grantPrincipal); 256 | base_registry.grantPullPush(base_image_amd_cud_build.grantPrincipal); 257 | base_registry.grantPullPush(base_image_arm_cud_build.grantPrincipal); 258 | 259 | // here we define our pipeline and put together the assembly line 260 | const sourceOutput = new codepipeline.Artifact(); 261 | const basebuildpipeline = new codepipeline.Pipeline(this,`BuildBasePipeline`); 262 | basebuildpipeline.addStage({ 263 | stageName: 'Source', 264 | actions: [ 265 | new codepipeline_actions.GitHubSourceAction({ 266 | actionName: 'GitHub_Source', 267 | owner: GITHUB_USER.valueAsString, 268 | repo: GITHUB_REPO.valueAsString, 269 | branch: GITHUB_BRANCH.valueAsString, 270 | output: sourceOutput, 271 | oauthToken: SecretValue.secretsManager("githubtoken",{jsonField: "token"}), 272 | trigger: codepipeline_actions.GitHubTrigger.WEBHOOK, 273 | //oauthToken: SecretValue.unsafePlainText(GITHUB_OAUTH_TOKEN.valueAsString) 274 | }) 275 | ] 276 | }); 277 | 278 | basebuildpipeline.addStage({ 279 | stageName: 'ImageBuild', 280 | actions: [ 281 | new codepipeline_actions.CodeBuildAction({ 282 | actionName: 'AssetsImageXlaAmdBuild', 283 | input: sourceOutput, 284 | runOrder: 1, 285 | project: assets_image_xla_amd_build 286 | }), 287 | new codepipeline_actions.CodeBuildAction({ 288 | actionName: 'AssetsImageCudAmdBuild', 289 | input: sourceOutput, 290 | runOrder: 1, 291 | project: assets_image_cud_amd_build 292 | }), 293 | /*new codepipeline_actions.CodeBuildAction({ 294 | actionName: 'AssetsImageCudArmBuild', 295 | input: sourceOutput, 296 | runOrder: 1, 297 | project: assets_image_cud_arm_build 298 | }),*/ 299 | new codepipeline_actions.CodeBuildAction({ 300 | actionName: 'BaseImageAmdXlaBuild', 301 | input: sourceOutput, 302 | runOrder: 2, 303 | project: base_image_amd_xla_build 304 | }), 305 | /*new codepipeline_actions.CodeBuildAction({ 306 | actionName: 'BaseImageArmCudBuild', 307 | input: sourceOutput, 308 | runOrder: 2, 309 | project: base_image_arm_cud_build 310 | }),*/ 311 | new codepipeline_actions.CodeBuildAction({ 312 | actionName: 'BaseImageAmdCudBuild', 313 | input: sourceOutput, 314 | runOrder: 2, 315 | project: base_image_amd_cud_build 316 | }) 317 | ] 318 | }); 319 | } 320 | } 321 | -------------------------------------------------------------------------------- /model-ci-build/pipeline.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import 'source-map-support/register'; 3 | import * as cdk from 'aws-cdk-lib'; 4 | import { PipelineStack } from './pipeline-stack'; 5 | 6 | const app = new cdk.App(); 7 | new PipelineStack(app, 'PipelineStack', { 8 | env: { account: process.env.AWS_ACCOUNT_ID, region: process.env.AWS_REGION}, 9 | }); 10 | -------------------------------------------------------------------------------- /neuron-top.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/edge_diffusion_on_eks/05e4dfefe30cc1965f7d9771e99513570f62d867/neuron-top.png -------------------------------------------------------------------------------- /sd2-512-cuda-compile-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: sd21-gpu-compile 5 | spec: 6 | template: 7 | spec: 8 | restartPolicy: OnFailure 9 | nodeSelector: 10 | karpenter.sh/provisioner-name: amd-nvidia 11 | serviceAccountName: appsimulator 12 | 
13 |       - name: dshm
14 |         emptyDir:
15 |           medium: Memory
16 |       containers:
17 |       - name: app
18 |         image: 133573815653.dkr.ecr.us-west-2.amazonaws.com/stablediffusion:amd64-cuda
19 |         imagePullPolicy: Always
20 |         volumeMounts:
21 |         - mountPath: /dev/shm
22 |           name: dshm
23 |         command: ["/start.sh"]
24 |         args: ["compile"]
25 |         resources:
26 |           limits:
27 |             nvidia.com/gpu: 1
28 |         env:
29 |         - name: DEVICE
30 |           value: "cuda"
31 |         - name: BUCKET
32 |           value: "sdinfer"
33 |         - name: MODEL_ID
34 |           value: "stabilityai/stable-diffusion-2-1-base"
35 |         - name: COMPILER_WORKDIR_ROOT
36 |           value: "sd2_compile_dir_512"
37 |         - name: MODEL_FILE
38 |           value: "stable-diffusion-2-1-base"
39 |         - name: POD_NAME
40 |           valueFrom:
41 |             fieldRef:
42 |               fieldPath: metadata.name
43 | 
--------------------------------------------------------------------------------
/sd2-512-cuda-serve-deploy.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 |   labels:
5 |     app: sd21-g5-serve
6 |   name: sd21-g5-serve
7 | spec:
8 |   selector:
9 |     matchLabels:
10 |       app: sd21-g5-serve
11 |   template:
12 |     metadata:
13 |       labels:
14 |         app: sd21-g5-serve
15 |     spec:
16 |       nodeSelector:
17 |         karpenter.sh/provisioner-name: amd-nvidia
18 |       serviceAccountName: appsimulator
19 |       initContainers:
20 |       - name: pull-model-tar
21 |         image: 133573815653.dkr.ecr.us-west-2.amazonaws.com/stablediffusion:amd64-cuda-assets
22 |         imagePullPolicy: Always
23 |         volumeMounts:
24 |         - name: workdir
25 |           mountPath: /model
26 |         command:
27 |         - /bin/bash
28 |         - "-exc"
29 |         - |
30 |           set -x
31 |           aws s3 cp s3://sdinfer/stable-diffusion-2-1-base_cuda.tar.gz /model/model.tar.gz
32 |           cd /model
33 |           tar -xzf /model/model.tar.gz
34 |           ls /model
35 |       containers:
36 |       - name: app
37 |         image: 133573815653.dkr.ecr.us-west-2.amazonaws.com/stablediffusion:amd64-cuda
38 |         imagePullPolicy: Always
39 |         volumeMounts:
40 |         - mountPath: /dev/shm
41 |           name: dshm
42 |         - mountPath: /app
43 |           name: workdir
44 |         command: ["/start.sh"]
45 |         args: ["run"]
46 |         resources:
47 |           limits:
48 |             nvidia.com/gpu: 1
49 |         lifecycle:
50 |           postStart:
51 |             exec:
52 |               command: ["/create_node_port_svc.sh"]
53 |           preStop:
54 |             exec:
55 |               command: ["/bin/sh","-c","kubectl delete svc `kubectl get svc|grep $POD_NAME | awk '{print $1}'`"]
56 |         env:
57 |         - name: DEVICE
58 |           value: "cuda"
59 |         - name: BUCKET
60 |           value: "sdinfer"
61 |         - name: MODEL_ID
62 |           value: "stabilityai/stable-diffusion-2-1-base"
63 |         - name: COMPILER_WORKDIR_ROOT
64 |           value: "sd2_compile_dir_512"
65 |         - name: MODEL_FILE
66 |           value: "stable-diffusion-2-1-base"
67 |         - name: POD_NAME
68 |           valueFrom:
69 |             fieldRef:
70 |               fieldPath: metadata.name
71 |       volumes:
72 |       - name: workdir
73 |         emptyDir: {}
74 |       - name: dshm
75 |         emptyDir:
76 |           medium: Memory
77 | 
--------------------------------------------------------------------------------
/sd2-512-xla-compile-job.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: batch/v1
2 | kind: Job
3 | metadata:
4 |   name: sd21-inf2-compile
5 | spec:
6 |   template:
7 |     spec:
8 |       restartPolicy: OnFailure
9 |       nodeSelector:
10 |         karpenter.sh/provisioner-name: amd-neuron
11 |       serviceAccountName: appsimulator
12 |       volumes:
13 |       - name: dshm
14 |         emptyDir:
15 |           medium: Memory
16 |       containers:
17 |       - name: app
18 |         image: 133573815653.dkr.ecr.us-west-2.amazonaws.com/stablediffusion:amd64-neuron
19 |         imagePullPolicy: Always
20 |         volumeMounts:
21 |         - mountPath: /dev/shm
22 |           name: dshm
23 |         command: ["/start.sh"]
24 |         args: ["compile"]
25 |         resources:
26 |           limits:
27 |             aws.amazon.com/neuron: 2
28 |         env:
29 |         - name: DEVICE
30 |           value: "xla"
31 |         - name: BUCKET
32 |           value: "sdinfer"
33 |         - name: MODEL_ID
34 |           value: "stabilityai/stable-diffusion-2-1-base"
35 |         - name: COMPILER_WORKDIR_ROOT
36 |           value: "sd2_compile_dir_512"
37 |         - name: MODEL_FILE
38 |           value: "stable-diffusion-2-1-base"
39 |         - name: POD_NAME
40 |           valueFrom:
41 |             fieldRef:
42 |               fieldPath: metadata.name
43 | 
--------------------------------------------------------------------------------
/sd2-512-xla-serve-deploy.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 |   labels:
5 |     app: sd21-inf2-serve
6 |   name: sd21-inf2-serve
7 | spec:
8 |   selector:
9 |     matchLabels:
10 |       app: sd21-inf2-serve
11 |   template:
12 |     metadata:
13 |       labels:
14 |         app: sd21-inf2-serve
15 |     spec:
16 |       nodeSelector:
17 |         karpenter.sh/provisioner-name: amd-neuron
18 |       serviceAccountName: appsimulator
19 |       schedulerName: my-scheduler
20 |       initContainers:
21 |       - name: pull-model-tar
22 |         image: 133573815653.dkr.ecr.us-west-2.amazonaws.com/stablediffusion:amd64-neuron-assets
23 |         imagePullPolicy: Always
24 |         volumeMounts:
25 |         - name: workdir
26 |           mountPath: /model
27 |         command:
28 |         - /bin/bash
29 |         - "-exc"
30 |         - |
31 |           set -x
32 |           aws s3 cp s3://sdinfer/stable-diffusion-2-1-base.tar.gz /model/model.tar.gz
33 |           cd /model
34 |           tar -xzf /model/model.tar.gz
35 |           ls -l
36 |       containers:
37 |       - name: app
38 |         image: 133573815653.dkr.ecr.us-west-2.amazonaws.com/stablediffusion:amd64-neuron
39 |         imagePullPolicy: Always
40 |         volumeMounts:
41 |         - mountPath: /dev/shm
42 |           name: dshm
43 |         - mountPath: /app
44 |           name: workdir
45 |         command: ["/start.sh"]
46 |         args: ["run"]
47 |         resources:
48 |           limits:
49 |             aws.amazon.com/neuron: 1
50 |         lifecycle:
51 |           postStart:
52 |             exec:
53 |               command: ["/create_node_port_svc.sh"]
54 |           preStop:
55 |             exec:
56 |               command: ["/bin/sh","-c","kubectl delete svc `kubectl get svc|grep $POD_NAME | awk '{print $1}'`"]
57 |         env:
58 |         - name: AWS_NEURON_VISIBLE_DEVICES
59 |           value: "2-3"
60 |         - name: DEVICE
61 |           value: "xla"
62 |         - name: BUCKET
63 |           value: "sdinfer"
64 |         - name: MODEL_ID
65 |           value: "stabilityai/stable-diffusion-2-1-base"
66 |         - name: COMPILER_WORKDIR_ROOT
67 |           value: "/app/sd2_compile_dir_512"
68 |         - name: MODEL_FILE
69 |           value: "stable-diffusion-2-1-base"
70 |         - name: POD_NAME
71 |           valueFrom:
72 |             fieldRef:
73 |               fieldPath: metadata.name
74 |       volumes:
75 |       - name: workdir
76 |         emptyDir: {}
77 |       - name: dshm
78 |         emptyDir:
79 |           medium: Memory
80 | 
--------------------------------------------------------------------------------