├── models ├── multimodal │ └── text_to_image │ │ ├── sdxl_turbo │ │ ├── onnx_generation │ │ │ ├── __init__.py │ │ │ └── onnx_gen_utils.py │ │ ├── requirements.txt │ │ ├── utils.py │ │ ├── run_config_gen.sh │ │ ├── run_config_inference.sh │ │ ├── model.py │ │ ├── server.py │ │ ├── patches │ │ │ └── attention_patch.patch │ │ └── README.md │ │ ├── sdxl_deepcache │ │ ├── onnx_generation │ │ │ ├── __init__.py │ │ │ └── onnx_gen_utils.py │ │ ├── requirements.txt │ │ ├── utils.py │ │ ├── README.md │ │ ├── run_config_deep.sh │ │ ├── run_config_shallow.sh │ │ ├── patches │ │ │ ├── attention_patch.patch │ │ │ └── deepcache_unet.patch │ │ └── run_config_inference.sh │ │ ├── stable-diffusion-3.5-medium │ │ ├── onnx_generation │ │ │ ├── __init__.py │ │ │ └── onnx_gen_utils.py │ │ ├── requirements.txt │ │ ├── utils.py │ │ ├── README.md │ │ ├── run_config_inference.sh │ │ ├── run_config_gen.sh │ │ ├── patches │ │ │ ├── transformer_patch.patch │ │ │ └── attention_patch.patch │ │ └── model.py │ │ ├── DeciDiffusion-v2-0 │ │ └── README.md │ │ ├── stable-diffusion-v1-5 │ │ └── README.md │ │ └── stable-diffusion-xl-base-1.0 │ │ ├── requirements.txt │ │ ├── fix_vae_decoder_onnx.py │ │ ├── onnx_gen_utils.py │ │ ├── attention_patch.patch │ │ └── compile_models.sh ├── vision │ ├── classification │ │ └── requirements.txt │ └── detection │ │ ├── requirements.txt │ │ ├── lut_yolo_models.csv │ │ └── README.md ├── language_processing │ ├── decoder │ │ ├── MptForCausalLM │ │ │ └── README.md │ │ ├── DeciCoder-6b │ │ │ ├── requirements.txt │ │ │ ├── init.sh │ │ │ ├── specializations_template.json │ │ │ ├── compileModel.sh │ │ │ └── README.md │ │ ├── LlamaForCausalLM │ │ │ └── README.md │ │ ├── README.md │ │ ├── GPTBigCodeForCausalLM │ │ │ └── README.md │ │ └── CodeGen-With-Speculative-Decoding │ │ │ └── README.md │ └── encoder │ │ ├── requirements.txt │ │ ├── model.py │ │ └── server.py └── speech │ └── whisper │ ├── requirements.txt │ ├── audio.py │ ├── README.md │ ├── generateModel.py │ └── runModel.py ├── images └── Cloud_AI_100.png ├── tutorials ├── NLP │ ├── Model-Onboarding-Beginner │ │ ├── bert-base-cased-config.yaml │ │ ├── distilbert-base-cased-distilled-squad-config.yaml │ │ ├── Images │ │ │ └── Workflow.jpg │ │ └── requirements.txt │ ├── Profiler-Intermediate │ │ ├── images │ │ │ ├── Latency.png │ │ │ ├── opstats_decoder.png │ │ │ ├── opstats_example.png │ │ │ └── operator_details.png │ │ └── requirements.txt │ └── Performance-Tuning-Beginner │ │ ├── Images │ │ └── Latency.jpg │ │ ├── requirements.txt │ │ ├── bert_base_dopt.json │ │ └── bert_base_dopt_min_latency.json ├── open-webui │ ├── open_webui_screen_1.png │ ├── open_webui_screen_2.png │ ├── serve.sh │ ├── open_webui.sh │ ├── vllm_container.sh │ └── README.md ├── Playground │ ├── images │ │ └── qualcomm_cloud_ai_playground.png │ └── README.md ├── Computer-Vision │ ├── Perfomance-Tuning-Beginner │ │ ├── Images │ │ │ └── Latency.jpg │ │ ├── requirements.txt │ │ ├── resnet_base_dopt_min_latency.json │ │ └── resnet_base_dopt_throughput.json │ └── DETR │ │ └── README.md ├── efficient_transformers │ └── README.md └── README.md ├── samples ├── python │ ├── qaic_features │ │ ├── resnet_config.yaml │ │ ├── benchmarking_eg.py │ │ ├── metrics_eg.py │ │ ├── profiling_eg.py │ │ └── README.md │ ├── vit_qaic │ │ ├── vit_config.yaml │ │ └── example.py │ ├── requirements.txt │ ├── README.md │ ├── aws_ai100_benchmarking │ │ ├── yolo_models │ │ │ ├── lut_yolo_models.csv │ │ │ └── README.md │ │ ├── cv_classifiers │ │ │ └── run_cv_classifiers.sh │ │ └── 
parse_latency_and_throughput.py │ └── common_utils.py └── cpp │ └── cpp_qpc_inference │ ├── CMakeLists.txt │ └── Readme.md ├── utils ├── multi-device │ ├── enable_mdp.json │ └── README.md ├── qaic-bench │ ├── config │ │ ├── config_tiny_llama.json │ │ ├── config_llama_3_1_8b.json │ │ └── config_dl2q.json │ └── README.md └── README.md ├── CONTRIBUTING.md ├── LICENSE └── CODE-OF-CONDUCT.md /models/multimodal/text_to_image/sdxl_turbo/onnx_generation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/onnx_generation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/onnx_generation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /images/Cloud_AI_100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/images/Cloud_AI_100.png -------------------------------------------------------------------------------- /tutorials/NLP/Model-Onboarding-Beginner/bert-base-cased-config.yaml: -------------------------------------------------------------------------------- 1 | # Inference Parameters 2 | num_activations: 2 3 | set_size: 1 4 | -------------------------------------------------------------------------------- /tutorials/open-webui/open_webui_screen_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/open-webui/open_webui_screen_1.png -------------------------------------------------------------------------------- /tutorials/open-webui/open_webui_screen_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/open-webui/open_webui_screen_2.png -------------------------------------------------------------------------------- /tutorials/NLP/Model-Onboarding-Beginner/distilbert-base-cased-distilled-squad-config.yaml: -------------------------------------------------------------------------------- 1 | # Inference Parameters 2 | num_activations: 2 3 | set_size: 10 -------------------------------------------------------------------------------- /models/multimodal/text_to_image/DeciDiffusion-v2-0/README.md: -------------------------------------------------------------------------------- 1 | ## DeciDiffusion 2.0 2 | 3 | This model is deprecated. sdxl_turbo is the recommended alternative. 
4 | -------------------------------------------------------------------------------- /tutorials/NLP/Profiler-Intermediate/images/Latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Profiler-Intermediate/images/Latency.png -------------------------------------------------------------------------------- /tutorials/NLP/Model-Onboarding-Beginner/Images/Workflow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Model-Onboarding-Beginner/Images/Workflow.jpg -------------------------------------------------------------------------------- /tutorials/NLP/Performance-Tuning-Beginner/Images/Latency.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Performance-Tuning-Beginner/Images/Latency.jpg -------------------------------------------------------------------------------- /tutorials/Playground/images/qualcomm_cloud_ai_playground.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/Playground/images/qualcomm_cloud_ai_playground.png -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-v1-5/README.md: -------------------------------------------------------------------------------- 1 | ## Stable Diffusion v1-5 2 | 3 | This model is deprecated. sdxl_turbo is the recommended alternative. 4 | 5 | -------------------------------------------------------------------------------- /tutorials/NLP/Profiler-Intermediate/images/opstats_decoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Profiler-Intermediate/images/opstats_decoder.png -------------------------------------------------------------------------------- /tutorials/NLP/Profiler-Intermediate/images/opstats_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Profiler-Intermediate/images/opstats_example.png -------------------------------------------------------------------------------- /tutorials/NLP/Profiler-Intermediate/requirements.txt: -------------------------------------------------------------------------------- 1 | onnx==1.12.0 2 | optimum 3 | numpy==1.23.4 4 | onnxruntime 5 | torch===1.11.0 6 | pillow==8.3.2 7 | opencv-python 8 | paramiko -------------------------------------------------------------------------------- /tutorials/NLP/Profiler-Intermediate/images/operator_details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Profiler-Intermediate/images/operator_details.png -------------------------------------------------------------------------------- /tutorials/NLP/Performance-Tuning-Beginner/requirements.txt: -------------------------------------------------------------------------------- 1 | onnx==1.12.0 2 | optimum 3 | numpy==1.23.4 4 | onnxruntime 5 | torch===1.11.0 6 | pillow==8.3.2 7 | opencv-python 8 | paramiko -------------------------------------------------------------------------------- 
/samples/python/qaic_features/resnet_config.yaml: -------------------------------------------------------------------------------- 1 | aic_num_cores: 4 2 | num_activations: 1 3 | convert_to_fp16: true 4 | onnx_define_symbol: 5 | batch: 2 6 | # output_dir: './resnet_qpc' 7 | -------------------------------------------------------------------------------- /tutorials/Computer-Vision/Perfomance-Tuning-Beginner/Images/Latency.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/Computer-Vision/Perfomance-Tuning-Beginner/Images/Latency.jpg -------------------------------------------------------------------------------- /tutorials/Computer-Vision/Perfomance-Tuning-Beginner/requirements.txt: -------------------------------------------------------------------------------- 1 | onnx==1.12.0 2 | optimum 3 | numpy==1.23.4 4 | onnxruntime 5 | torch==1.13.0 6 | pillow==8.3.2 7 | opencv-python 8 | paramiko 9 | jsonschema -------------------------------------------------------------------------------- /tutorials/NLP/Model-Onboarding-Beginner/requirements.txt: -------------------------------------------------------------------------------- 1 | onnx==1.12.0 2 | optimum 3 | numpy==1.23.4 4 | onnxruntime 5 | torch===1.11.0 6 | pillow==8.3.2 7 | onnxsim 8 | /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl -------------------------------------------------------------------------------- /models/vision/classification/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | torch==2.3.1+cpu 3 | torchvision==0.18.1+cpu 4 | onnx==1.14.0 5 | onnxruntime==1.19.0 6 | transformers==4.41.2 7 | pandas==2.1.4 8 | -------------------------------------------------------------------------------- /utils/multi-device/enable_mdp.json: -------------------------------------------------------------------------------- 1 | { 2 | "request": [ 3 | { 4 | "qid": -1, 5 | "dev_config": { 6 | "update_multi_device_partition_config_request": { 7 | "enable": true 8 | } 9 | } 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /models/language_processing/decoder/MptForCausalLM/README.md: -------------------------------------------------------------------------------- 1 | # MptForCausalLM 2 | 3 | Head over to the [Qualcomm Efficient Transformers library](https://github.com/quic/efficient-transformers) for optimized LLM deployment on Cloud AI 100 Accelerators. -------------------------------------------------------------------------------- /models/language_processing/decoder/DeciCoder-6b/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | torch==2.1.2 3 | onnx==1.15.0 4 | onnxruntime==1.16.3 5 | onnxsim==0.4.35 6 | tiktoken==0.5.2 7 | protobuf==3.20.2 8 | numpy==1.26.4 -------------------------------------------------------------------------------- /models/language_processing/decoder/LlamaForCausalLM/README.md: -------------------------------------------------------------------------------- 1 | # LlamaForCausalLM 2 | 3 | Head over to the [Qualcomm Efficient Transformers library](https://github.com/quic/efficient-transformers) for optimized LLM deployment on Cloud AI 100 Accelerators. 
-------------------------------------------------------------------------------- /models/language_processing/decoder/README.md: -------------------------------------------------------------------------------- 1 | # efficient-transformers package for LLMs 2 | 3 | Head over to the [Qualcomm Efficient Transformers library](https://github.com/quic/efficient-transformers) for optimized LLM deployment on Cloud AI 100 Accelerators. -------------------------------------------------------------------------------- /models/language_processing/decoder/GPTBigCodeForCausalLM/README.md: -------------------------------------------------------------------------------- 1 | # GPTBigCodeForCausalLM 2 | 3 | Head over to the [Qualcomm Efficient Transformers library](https://github.com/quic/efficient-transformers) for optimized LLM deployment on Cloud AI 100 Accelerators. -------------------------------------------------------------------------------- /models/language_processing/decoder/CodeGen-With-Speculative-Decoding/README.md: -------------------------------------------------------------------------------- 1 | # Speculative decoding - CodeGen 2 | 3 | Head over to the [Qualcomm Efficient Transformers library](https://github.com/quic/efficient-transformers) for optimized LLM deployment on Cloud AI 100 Accelerators. -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | networkx==3.0 3 | torch==2.3.1 4 | torchvision 5 | torchaudio 6 | onnx==1.12.0 7 | onnxruntime 8 | accelerate 9 | transformers==4.42 10 | huggingface-hub==0.25.2 -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | networkx==3.0 3 | torch==2.3.1 4 | torchvision 5 | torchaudio 6 | onnx==1.12.0 7 | onnxruntime 8 | accelerate 9 | transformers==4.42 10 | huggingface-hub==0.25.2 -------------------------------------------------------------------------------- /models/language_processing/decoder/DeciCoder-6b/init.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | MODEL_REPO="Deci" 5 | MODEL_NAME="DeciCoder-6b" 6 | BS=1 7 | PL=256 8 | CL=2048 9 | CORES=14 10 | MX="-mxfp6-matmul" -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-xl-base-1.0/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | networkx==3.0 3 | torch==2.3.1 4 | torchvision 5 | torchaudio 6 | onnx==1.12.0 7 | onnxruntime 8 | accelerate 9 | transformers==4.42 10 | huggingface-hub==0.25.2 -------------------------------------------------------------------------------- /models/language_processing/decoder/DeciCoder-6b/specializations_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "specializations": [ 3 | { 4 | "batch_size": "BS", 5 | "seq_len": "PL", 6 | "ctx_len": "CL" 7 | }, 8 | { 9 | "batch_size": "BS", 10 | "seq_len": "1", 11 | "ctx_len": "CL" 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /samples/python/vit_qaic/vit_config.yaml: -------------------------------------------------------------------------------- 1 | # compile parameters 2 | aic_num_cores: 4 3 | convert_to_fp16: true 4 | mos: 1 5 | ols: 2 6 | multicast-weights: true 7 | onnx_define_symbol: 8 | batch_size: 1 9 | stats-batchsize: 1 10 | compile-only: true 11 | 12 | # inference parameters 13 | num_activations: 3 14 | set_size: 4 -------------------------------------------------------------------------------- /models/vision/detection/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | torch==2.3.1+cpu 3 | torchvision==0.18.1+cpu 4 | onnx==1.19.1 5 | onnxruntime==1.19.0 6 | onnxscript 7 | transformers==4.41.2 8 | pandas==2.1.4 9 | opencv-python-headless 10 | opencv-contrib-python-headless 11 | ultralytics 12 | seaborn 13 | onnx-graphsurgeon 14 | -------------------------------------------------------------------------------- /samples/python/requirements.txt: -------------------------------------------------------------------------------- 1 | altgraph==0.17.2 2 | attrs==21.4.0 3 | grpcio==1.44.0 4 | iniconfig==1.1.1 5 | nose==1.3.7 6 | numpy==1.22.4 7 | packaging==21.3 8 | pluggy==1.0.0 9 | protobuf==3.20.0 10 | py==1.11.0 11 | pyinstaller==4.9 12 | pyinstaller-hooks-contrib==2022.2 13 | pyparsing==3.0.7 14 | pytest==6.2.5 15 | pyudev==0.23.2 16 | PyYAML==6.0 17 | six==1.16.0 18 | toml==0.10.2 19 | yapf==0.32.0 20 | -------------------------------------------------------------------------------- /models/speech/whisper/requirements.txt: -------------------------------------------------------------------------------- 1 | /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl 2 | numpy==1.23.5 3 | datasets==2.7.1 4 | transformers==4.24.0 5 | torch==1.12.1 6 | onnx==1.12.0 7 | fsspec==2022.11.0 8 | multiprocess==0.70.14 9 | huggingface-hub==0.11.0 10 | librosa==0.9.2 11 | soundfile==0.11.0 12 | whisper @ git+https://github.com/openai/whisper.git@ec1b34bb90dc2822ce4ebac23970b84dbb03ec6c 13 | pyarrow==20.0.0 14 | -------------------------------------------------------------------------------- /tutorials/open-webui/serve.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) Qualcomm Technologies, 
Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause-Clear 5 | 6 | model=hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4 7 | 8 | /opt/vllm-env/bin/python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model ${model} --max-model-len 4096 --max-num-seq 1 --max-seq_len-to-capture 128 --device qaic --device-group 0,1,2,3 9 | -------------------------------------------------------------------------------- /models/language_processing/encoder/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | networkx==3.1 3 | torch==2.3.1 4 | fsspec==2024.2.0 5 | wheel==0.42.0 6 | sentence-transformers==2.6.1 7 | onnx==1.18.0 8 | onnxruntime==1.22 9 | transformers==4.40.2 10 | optimum==1.19.1 11 | protobuf==5.26.1 12 | urllib3==1.26.6 13 | /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl 14 | 15 | # For inference serving 16 | fastapi 17 | uvicorn 18 | -------------------------------------------------------------------------------- /tutorials/open-webui/open_webui.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | image=ghcr.io/open-webui/open-webui:main 5 | 6 | docker run \ 7 | -d \ 8 | --network host \ 9 | -e OPENAI_API_KEY=test-key \ 10 | -e OPENAI_API_BASE_URL="http://localhost:8000/v1" \ 11 | -v open-webui:/app/backend/data \ 12 | --name open-webui \ 13 | --restart always \ 14 | ${image} 15 | -------------------------------------------------------------------------------- /utils/qaic-bench/config/config_tiny_llama.json: -------------------------------------------------------------------------------- 1 | { 2 | "vllm_root": "/opt/qti-aic/integrations/vllm", 3 | 4 | "models": [ 5 | { 6 | "name": "TinyLlama-1.1B-Chat-v1.0", 7 | "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", 8 | "configs": [ 9 | { 10 | "batch_size": 1, 11 | "devices": 1, 12 | "prompt_len": 1024, 13 | "generation_len": 1024 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /utils/qaic-bench/config/config_llama_3_1_8b.json: -------------------------------------------------------------------------------- 1 | { 2 | "vllm_root": "/opt/qti-aic/integrations/vllm", 3 | 4 | "models": [ 5 | { 6 | "name": "Meta-Llama-3.1-8B-Instruct", 7 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 8 | "configs": [ 9 | { 10 | "batch_size": 1, 11 | "devices": 4, 12 | "prompt_len": 4096, 13 | "generation_len": 4096 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /tutorials/Playground/README.md: -------------------------------------------------------------------------------- 1 | # Cloud AI Playground Notebook setup 2 | 3 | ## Python Setup 4 | ``` 5 | # Setup venv 6 | python3.10 -m venv imagine_env 7 | source imagine_env/bin/activate 8 | pip3 install pip -U 9 | 10 | # Install Qualcomm Imagine Python library 11 | pip3 install python-imagine-sdk 12 | 13 | # Install dependencies 14 | pip3 install Pillow 15 | pip3 install notebook 16 | pip3 install pandas 17 | pip3 install openai 18 | ``` 19 | 20 | ## Launch Notebook 21 | ``` 22 | jupyter notebook --no-browser --ip 0.0.0.0 --port 8080 23 | ``` 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- 
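Before pointing Open WebUI at the vLLM server started by `tutorials/open-webui/serve.sh` above, it can help to sanity-check the OpenAI-compatible endpoint directly. The sketch below is illustrative only: it assumes the server from `serve.sh` is reachable on `localhost:8000`, that the `openai` Python package is installed, and that the model name matches the one passed to `vllm.entrypoints.openai.api_server`; the placeholder API key mirrors `open_webui.sh` and is typically sufficient unless the server is configured to require a real one.

```
# Quick check of the OpenAI-compatible vLLM endpoint started by serve.sh.
# Assumptions: server on localhost:8000, `pip install openai`, and the model
# name below matching the one used in serve.sh.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="test-key")

response = client.chat.completions.create(
    model="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=32,
)
print(response.choices[0].message.content)
```

If this returns a completion, Open WebUI (started via `open_webui.sh` with `OPENAI_API_BASE_URL=http://localhost:8000/v1`) should be able to list and chat with the same model.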
/utils/qaic-bench/config/config_dl2q.json: -------------------------------------------------------------------------------- 1 | { 2 | "vllm_root": "/opt/qti-aic/integrations/vllm", 3 | 4 | "models": [ 5 | { 6 | "name": "Meta-Llama-3.1-8B-Instruct-AWQ-INT4", 7 | "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", 8 | "configs": [ 9 | { 10 | "batch_size": 1, 11 | "devices": 1, 12 | "cores": 14, 13 | "prompt_len": 4096, 14 | "generation_len": 4096 15 | } 16 | ] 17 | } 18 | ] 19 | } 20 | -------------------------------------------------------------------------------- /tutorials/efficient_transformers/README.md: -------------------------------------------------------------------------------- 1 | ## Installation steps 2 | 3 | ### Create python virtual environment and activate it 4 | ``` 5 | python3.10 -m venv qeff_env 6 | source qeff_env/bin/activate 7 | pip install --upgrade pip 8 | ``` 9 | 10 | ### Clone and install the efficient transformers repo 11 | ``` 12 | pip install git+https://github.com/quic/efficient-transformers@release/v1.20.0 13 | ``` 14 | 15 | ### After installation of efficient transformers library, install jupyter notebook 16 | ``` 17 | pip install notebook 18 | ``` 19 | 20 | ### Launch Notebook 21 | ``` 22 | jupyter notebook --no-browser --allow-root 23 | ``` 24 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | accelerate==0.31.0 3 | certifi==2024.6.2 4 | charset-normalizer==3.3.2 5 | coloredlogs==15.0.1 6 | filelock==3.13.1 7 | flatbuffers==24.3.25 8 | fsspec==2024.2.0 9 | huggingface-hub==0.23.4 10 | humanfriendly==10.0 11 | idna==3.7 12 | importlib-metadata==7.2.1 13 | Jinja2==3.1.3 14 | MarkupSafe==2.1.5 15 | mpmath==1.3.0 16 | networkx==3.1 17 | numpy==1.24.1 18 | onnx==1.12.0 19 | onnxruntime==1.16.3 20 | packaging==24.1 21 | pillow==10.2.0 22 | protobuf==3.20.1 23 | psutil==6.0.0 24 | PyYAML==6.0.1 25 | regex==2024.5.15 26 | requests==2.32.3 27 | safetensors==0.4.3 28 | sympy==1.12 29 | tokenizers==0.19.1 30 | torch==2.4.1 31 | tqdm==4.66.4 32 | transformers==4.41.2 33 | typing-extensions==4.9.0 34 | urllib3==2.2.2 35 | zipp==3.19.2 36 | onnxsim==0.4.36 37 | sentencepiece==0.2.0 38 | -------------------------------------------------------------------------------- /tutorials/open-webui/vllm_container.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | image=ghcr.io/quic/cloud_ai_inference_ubuntu22:1.19.8.0 5 | qpc_path=/path/to/qpc 6 | 7 | chmod +x serve.sh 8 | 9 | docker run -dit \ 10 | --workdir /model \ 11 | --name qaic-vllm \ 12 | --network host \ 13 | --mount type=bind,source=${PWD}/serve.sh,target=/model/serve.sh \ 14 | --mount type=bind,source=${qpc_path},target=/model/qpc \ 15 | -v qaic-vllm:/model/data \ 16 | --env VLLM_QAIC_MAX_CPU_THREADS=8 \ 17 | --env VLLM_QAIC_QPC_PATH=/model/qpc \ 18 | --env HF_HOME=/model/data/huggingface \ 19 | --env QEFF_HOME=/model/data/qeff_models \ 20 | --device=/dev/accel/accel0 \ 21 | --device=/dev/accel/accel1 \ 22 | --device=/dev/accel/accel2 \ 23 | --device=/dev/accel/accel3 \ 24 | --entrypoint=/model/serve.sh \ 25 | ${image} 26 | -------------------------------------------------------------------------------- /tutorials/Computer-Vision/DETR/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | --- 3 | 4 | Download the DETR-ResNet50 model, prepare for the Qualcomm AIC100, compile the model, run the model on a generated random sample along with input image, and obtain the output. 5 | 6 | 7 | ## Source of the model 8 | --- 9 | 10 | This model is an implementation of DETR-ResNet50 found at (https://github.com/facebookresearch/detr). 11 | 12 | 13 | ## Virtual environment 14 | --- 15 | For a quick environment setup: 16 | 17 | ```commandline 18 | python3.8 -m venv cv_workflow_env 19 | source cv_workflow_env/bin/activate 20 | pip install --upgrade pip 21 | 22 | ``` 23 | 24 | ## Framework and version 25 | --- 26 | ```commandline 27 | pip install torch==2.4.1+cpu torchvision==0.19.1+cpu --index-url https://download.pytorch.org/whl/cpu 28 | pip install numpy==1.24.4 onnx==1.17.0 pillow==10.4.0 requests==2.32.3 notebook==7.3.3 matplotlib==3.7.5 scipy==1.10.1 29 | 30 | ``` 31 | 32 | -------------------------------------------------------------------------------- /models/vision/detection/lut_yolo_models.csv: -------------------------------------------------------------------------------- 1 | MODEL_NAME,TASK,BATCH_SIZE,IMAGE_SIZE,CORES,INSTANCES,OLS,MOS,SET_SIZE,EXTRA,PRECISION,OBJECTIVE 2 | yolov5s,object-detection,1,640,7,2,2,,1,,fp16,best-latency 3 | yolov5s,object-detection,1,640,3,4,2,,1,,fp16,balanced 4 | yolov5s,object-detection,1,640,2,7,2,,2,,fp16,best-throughput 5 | yolov5m,object-detection,1,640,12,1,1,,1,,fp16,best-latency 6 | yolov5m,object-detection,1,640,12,1,1,,2,,fp16,balanced 7 | yolov5m,object-detection,1,640,2,7,2,,2,,fp16,best-throughput 8 | yolov5l,object-detection,1,640,12,1,1,,1,,fp16,best-latency 9 | yolov5l,object-detection,1,640,4,3,2,,1,,fp16,balanced 10 | yolov5l,object-detection,1,640,2,7,4,,2,,fp16,best-throughput 11 | yolov5x,object-detection,1,640,12,1,1,,1,,fp16,best-latency 12 | yolov5x,object-detection,1,640,7,2,2,,2,,fp16,balanced 13 | yolov5x,object-detection,1,640,2,7,4,,1,,fp16,best-throughput 14 | yolov7-e6e,object-detection,1,640,12,1,1,,1, -multicast-weights,fp16,best-latency 15 | yolov7-e6e,object-detection,1,640,4,3,2, ,1, -multicast-weights,fp16,balanced 16 | yolov7-e6e,object-detection,1,640,6,2,2, ,2, -multicast-weights,fp16,best-throughput 17 | -------------------------------------------------------------------------------- /tutorials/NLP/Performance-Tuning-Beginner/bert_base_dopt.json: -------------------------------------------------------------------------------- 1 | { 2 | "max_func_eval": 200, 3 | "objective": "maximize_inf_rate", 4 | 
"params": { 5 | "cores": { 6 | "min": 1, 7 | "max": 14 8 | }, 9 | "mos": { 10 | "min": 1, 11 | "max": 8 12 | }, 13 | "ols": { 14 | "min": 1, 15 | "max": 8 16 | }, 17 | "bs": { 18 | "min": 1, 19 | "max": 16 20 | }, 21 | "instances": { 22 | "min": 1, 23 | "max": 14 24 | } 25 | }, 26 | "initial_values": [ 27 | { 28 | "cores": 1, 29 | "mos": 1, 30 | "ols": 1, 31 | "bs": 1, 32 | "instances": 14 33 | }, 34 | { 35 | "cores": 2, 36 | "mos": 1, 37 | "ols": 1, 38 | "bs": 1, 39 | "instances": 7 40 | }, 41 | { 42 | "cores": 4, 43 | "mos": 1, 44 | "ols": 1, 45 | "bs": 1, 46 | "instances": 3 47 | }, 48 | { 49 | "cores": 7, 50 | "mos": 1, 51 | "ols": 1, 52 | "bs": 1, 53 | "instances": 2 54 | }, 55 | { 56 | "cores": 14, 57 | "mos": 1, 58 | "ols": 1, 59 | "bs": 1, 60 | "instances": 1 61 | } 62 | ] 63 | } -------------------------------------------------------------------------------- /tutorials/NLP/Performance-Tuning-Beginner/bert_base_dopt_min_latency.json: -------------------------------------------------------------------------------- 1 | { 2 | "max_func_eval": 200, 3 | "objective": "minimize_latency", 4 | "params": { 5 | "cores": { 6 | "min": 1, 7 | "max": 14 8 | }, 9 | "mos": { 10 | "min": 1, 11 | "max": 8 12 | }, 13 | "ols": { 14 | "min": 1, 15 | "max": 8 16 | }, 17 | "bs": { 18 | "min": 1, 19 | "max": 1 20 | }, 21 | "instances": { 22 | "min": 1, 23 | "max": 14 24 | } 25 | }, 26 | "initial_values": [ 27 | { 28 | "cores": 1, 29 | "mos": 1, 30 | "ols": 1, 31 | "bs": 1, 32 | "instances": 1 33 | }, 34 | { 35 | "cores": 2, 36 | "mos": 1, 37 | "ols": 1, 38 | "bs": 1, 39 | "instances": 1 40 | }, 41 | { 42 | "cores": 4, 43 | "mos": 1, 44 | "ols": 1, 45 | "bs": 1, 46 | "instances": 1 47 | }, 48 | { 49 | "cores": 7, 50 | "mos": 1, 51 | "ols": 1, 52 | "bs": 1, 53 | "instances": 1 54 | }, 55 | { 56 | "cores": 14, 57 | "mos": 1, 58 | "ols": 1, 59 | "bs": 1, 60 | "instances": 1 61 | } 62 | ] 63 | } 64 | -------------------------------------------------------------------------------- /samples/cpp/cpp_qpc_inference/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause-Clear 5 | # 6 | # ============================================================================== 7 | 8 | project(simple-bert-inference-example) 9 | cmake_minimum_required (VERSION 3.17.2) 10 | set(CMAKE_CXX_STANDARD 17) 11 | 12 | find_package(Threads REQUIRED) 13 | 14 | add_executable(simple-bert-inference-example main.cpp) 15 | 16 | target_include_directories(simple-bert-inference-example 17 | PRIVATE 18 | "/opt/qti-aic/dev/inc" 19 | ) 20 | 21 | 22 | set_target_properties(simple-bert-inference-example 23 | PROPERTIES 24 | LINK_FLAGS "-Wl,--no-as-needed" 25 | ) 26 | 27 | target_compile_options(simple-bert-inference-example 28 | PRIVATE 29 | -fstack-protector-all 30 | -Werror 31 | -Wall 32 | -Wextra 33 | -Wno-sign-compare 34 | -Wno-unused-parameter 35 | -Wno-missing-field-initializers 36 | ) 37 | 38 | target_link_libraries(simple-bert-inference-example 39 | PRIVATE 40 | Threads::Threads 41 | dl 42 | ) -------------------------------------------------------------------------------- /tutorials/Computer-Vision/Perfomance-Tuning-Beginner/resnet_base_dopt_min_latency.json: -------------------------------------------------------------------------------- 1 | { 2 | "search-mode": "optimized", 3 | "objective": "min-latency", 4 | "search-parameters": { 5 | "cores": { 6 | "min": 1, 7 | "max": 14 8 | }, 9 | "mos": { 10 | "min": 1, 11 | "max": 8 12 | }, 13 | "ols": { 14 | "min": 1, 15 | "max": 8 16 | }, 17 | "batch-size": { 18 | "min": 1, 19 | "max": 16 20 | }, 21 | "instances": { 22 | "min": 1, 23 | "max": 14 24 | } 25 | }, 26 | "initial-values": [ 27 | { 28 | "cores": 1, 29 | "mos": 1, 30 | "ols": 1, 31 | "batch-size": 1, 32 | "instances": 1 33 | }, 34 | { 35 | "cores": 2, 36 | "mos": 1, 37 | "ols": 1, 38 | "batch-size": 1, 39 | "instances": 1 40 | }, 41 | { 42 | "cores": 4, 43 | "mos": 1, 44 | "ols": 1, 45 | "batch-size": 1, 46 | "instances": 1 47 | }, 48 | { 49 | "cores": 7, 50 | "mos": 1, 51 | "ols": 1, 52 | "batch-size": 1, 53 | "instances": 1 54 | }, 55 | { 56 | "cores": 14, 57 | "mos": 1, 58 | "ols": 1, 59 | "batch-size": 1, 60 | "instances": 1 61 | } 62 | ] 63 | } 64 | -------------------------------------------------------------------------------- /samples/python/README.md: -------------------------------------------------------------------------------- 1 | # This folder consists 2 | 3 | 1. `vit_qaic` and `resnet_qaic` folder contains example showing an end-to-end workflow for running inference on QAIC100 using the python APIs. 4 | 2. `qaic_features` folder consists of examples to show how to perform benchmarking, profiling and measuring metrics for inferences made on the device. 5 | 6 | # Installation 7 | 8 | Steps to install `qaic` API: 9 | 10 | ``` 11 | pip install /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl 12 | pip install -r requirements.txt 13 | ``` 14 | 15 | 16 | ## Structure of end to end workflow 17 | 18 | Examples follow this pattern: 19 | 20 | 1. Get the model from open source. (HuggingFace for example) 21 | 2. Convert the model to onnx using onnx library. 22 | 3. Call generate_bin function converts onnx to qpc (binary for the device). 23 | a. Currently it is compiled for default arguments, can be replaced with best performance compile arguments) #FIXME 24 | 4. Creating `qaic.session` with appropriate input and output names. 25 | 5. Provide sample prepossessing steps. Build input_dict for the session. 26 | 6. Call session.run() to perform inference. 27 | 7. Provide sample postprocessing steps. reshape output from the session. 
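Put together, steps 3-7 above look roughly like the sketch below, using the same `qaic` calls that appear in `qaic_features/benchmarking_eg.py` and `models/language_processing/encoder/model.py`. It is a minimal illustration rather than a drop-in script: the ONNX path, the config YAML, and the `data`/`output` tensor names are placeholders that depend on your model.

```
# Minimal sketch of the workflow above (file names and tensor names are assumed).
import numpy as np
import qaic

# Passing an ONNX model plus a compile config (e.g. in the style of
# vit_config.yaml) compiles it to a QPC; pass a QPC path instead to skip compilation.
session = qaic.Session('model.onnx', options_path='config.yaml')

# Build an input dict that matches the model's expected shape and dtype.
shape, dtype = session.model_input_shape_dict['data']
input_dict = {'data': np.random.randn(*shape).astype(dtype)}

# Run inference on the Cloud AI 100 device.
outputs = session.run(input_dict)

# Reshape the raw output buffer before post-processing.
out_shape, out_dtype = session.model_output_shape_dict['output']
result = np.frombuffer(outputs['output'], dtype=out_dtype).reshape(out_shape)
print(result.shape)
```

For complete, runnable versions of this flow, see `vit_qaic/example.py` and the scripts under `qaic_features/`.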
28 | 29 | ## To run the example 30 | 31 | ``` 32 | python example.py 33 | ``` 34 | -------------------------------------------------------------------------------- /utils/README.md: -------------------------------------------------------------------------------- 1 | # Basic Commands/Utilities for Cloud AI 100 devices 2 | 3 | ## Create `qaic` group to avoid `sudo` to read card/device status 4 | 5 | ``` 6 | sudo usermod -aG qaic $USER 7 | newgrp qaic 8 | bash 9 | ``` 10 | 11 | ## Check device health 12 | Monitor the health of all AI 100 devices (SoCs) using the `qaic-util` utility. 13 | 14 | ``` 15 | /opt/qti-aic/tools/qaic-util -q | grep -e Status -e QID 16 | ``` 17 | 18 | ## Monitoring of AI 100 devices (SoCs) 19 | Continuously monitor the health, telemetry (temperature, power etc) and resources (compute, DRAM etc) of the AI 100 devices (SoCs) using the `qaic-util` utility. 20 | 21 | ``` 22 | /opt/qti-aic/tools/qaic-util -t 1 23 | ``` 24 | 25 | ## Reset AI 100 devices (SoCs) 26 | To reset **all** AI 100 devices (SoCs), run 27 | ``` 28 | sudo /opt/qti-aic/tools/qaic-util -s 29 | ``` 30 | 31 | To reset **individual** AI 100 devices (SoCs), run 32 | ``` 33 | sudo /opt/qti-aic/tools/qaic-util -s -p 34 | ``` 35 | where, 36 | - SSSS = 4 digits segment number 37 | - BB = 2 digits bus number 38 | - DD = 2 digits device number 39 | - F = 1 digit function number 40 | 41 | For example, 42 | ``` 43 | sudo /opt/qti-aic/tools/qaic-util -s -p 0000:83:00.0 44 | 45 | Resetting 0000:83:00.0: 46 | 0000:83:00.0 success 47 | ``` -------------------------------------------------------------------------------- /models/language_processing/decoder/DeciCoder-6b/compileModel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [ -z "$1" ]; then 6 | echo "Usage: $0 " 7 | exit 1 8 | fi 9 | 10 | model_name="$1" 11 | batch_size="$2" 12 | prompt_len="$3" 13 | ctx_len="$4" 14 | num_cores="$5" 15 | with_or_no_mx="$6" 16 | 17 | # Generate a new specializations.json 18 | sed -e "s/BS/${batch_size}/g" -e "s/PL/${prompt_len}/g" -e "s/CL/${ctx_len}/g" ./specializations_template.json > specializations.json 19 | 20 | # Create qpc directory - Delete exisiting path 21 | mkdir -p qpc 22 | rm -rf qpc/${model_name}-kv-${prompt_len}pl-${ctx_len}cl-${num_cores}c${with_or_no_mx} 23 | 24 | model_path="${model_name}-kv/generatedModels/${model_name}-kv_fp16_simplified.onnx" 25 | if [ ! -f "$model_path" ]; then 26 | model_path="${model_name}-kv/generatedModels/${model_name}-kv_fp16.onnx" 27 | fi 28 | 29 | /opt/qti-aic/exec/qaic-exec \ 30 | -m=$model_path \ 31 | -aic-hw \ 32 | -aic-hw-version=2.0 \ 33 | -network-specialization-config=specializations.json \ 34 | -retained-state \ 35 | -convert-to-fp16 \ 36 | -ols=1 \ 37 | -mos=${num_cores} \ 38 | -aic-num-cores=${num_cores} \ 39 | -custom-IO-list-file=${model_name}-kv/custom_io.yaml \ 40 | -compile-only \ 41 | -aic-binary-dir=qpc/${model_name}-kv-${prompt_len}pl-${ctx_len}cl-${num_cores}c${with_or_no_mx} \ 42 | ${with_or_no_mx} 43 | 44 | -------------------------------------------------------------------------------- /models/speech/whisper/audio.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | 6 | import os 7 | import numpy as np 8 | from datasets import load_dataset, Audio 9 | import soundfile as sf 10 | from pathlib import Path 11 | 12 | class AudioSample: 13 | def __init__(self): 14 | # load dummy dataset and read soundfiles 15 | self.ds = load_dataset( 16 | 'hf-internal-testing/librispeech_asr_dummy', 'clean', split='validation' 17 | ) 18 | 19 | def to_file(self, parent='.'): 20 | audio_sample = self.ds[0]['audio'] 21 | 22 | audio_array = audio_sample['array'] 23 | audio_fname = os.path.join(parent, Path(audio_sample['path']).name) 24 | sampling_rate = audio_sample["sampling_rate"] 25 | 26 | # Convert to float32 for compatibility with soundfile 27 | if audio_array.dtype != np.float32: 28 | audio_array = audio_array.astype(np.float32) 29 | 30 | try: 31 | sf.write(audio_fname, audio_array, sampling_rate, format='FLAC') 32 | except Exception as e: 33 | print('Error saving file: {}'.format(e)) 34 | 35 | return audio_fname -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing to PROJECT 2 | 3 | Hi there! 4 | We’re thrilled that you’d like to contribute to this project. 5 | Your help is essential for keeping this project great and for making it better. 6 | 7 | ## Branching Strategy 8 | 9 | In general, contributors should develop on branches based off of `master` and pull requests should be made against `master`. 10 | 11 | ## Submitting a pull request 12 | 13 | 1. Please read our [code of conduct](CODE-OF-CONDUCT.md) and [license](LICENSE). 14 | 1. Fork and clone the repository. 15 | 1. Create a new branch based on `master`: `git checkout -b master`. 16 | 1. Make your changes, add tests, and make sure the tests still pass. 17 | 1. Commit your changes using the [DCO](http://developercertificate.org/). You can attest to the DCO by commiting with the **-s** or **--signoff** options or manually adding the "Signed-off-by". 18 | 1. Push to your fork and submit a pull request from your branch to `master`. 19 | 1. Pat yourself on the back and wait for your pull request to be reviewed. 20 | 21 | Here are a few things you can do that will increase the likelihood of your pull request to be accepted: 22 | 23 | - Follow the existing style where possible. 24 | - Write tests. 25 | - Keep your change as focused as possible. 26 | If you want to make multiple independent changes, please consider submitting them as separate pull requests. 27 | - Write a [good commit message](http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html). 
28 | -------------------------------------------------------------------------------- /samples/python/aws_ai100_benchmarking/yolo_models/lut_yolo_models.csv: -------------------------------------------------------------------------------- 1 | MODEL_NAME,TASK,BATCH_SIZE,IMAGE_SIZE,CORES,INSTANCES,OLS,MOS,SET_SIZE,EXTRA,PRECISION,OBJECTIVE 2 | yolov4,object-detection,1,608,12,1,1,,1, -aic-pmu-recipe=KernelUtil -multicast-weights -aic-enable-depth-first,fp16,best-latency 3 | yolov4,object-detection,1,608,7,2,2,,2, -aic-pmu-recipe=KernelUtil -multicast-weights -aic-enable-depth-first,fp16,balanced 4 | yolov4,object-detection,1,608,1,14,1,,1, -aic-pmu-recipe=KernelUtil -multicast-weights -aic-enable-depth-first,fp16,best-throughput 5 | yolov5s,object-detection,1,640,7,2,2,,1,,fp16,best-latency 6 | yolov5s,object-detection,1,640,3,4,2,,1,,fp16,balanced 7 | yolov5s,object-detection,1,640,2,7,2,,2,,fp16,best-throughput 8 | yolov5m,object-detection,1,640,12,1,1,,1,,fp16,best-latency 9 | yolov5m,object-detection,1,640,12,1,1,,2,,fp16,balanced 10 | yolov5m,object-detection,1,640,2,7,2,,2,,fp16,best-throughput 11 | yolov5l,object-detection,1,640,12,1,1,,1,,fp16,best-latency 12 | yolov5l,object-detection,1,640,4,3,2,,1,,fp16,balanced 13 | yolov5l,object-detection,1,640,2,7,4,,2,,fp16,best-throughput 14 | yolov5x,object-detection,1,640,12,1,1,,1,,fp16,best-latency 15 | yolov5x,object-detection,1,640,7,2,2,,2,,fp16,balanced 16 | yolov5x,object-detection,1,640,2,7,4,,1,,fp16,best-throughput 17 | yolov7-e6e,object-detection,1,640,12,1,1,,1, -multicast-weights,fp16,best-latency 18 | yolov7-e6e,object-detection,1,640,4,3,2, ,1, -multicast-weights,fp16,balanced 19 | yolov7-e6e,object-detection,1,640,6,2,2, ,2, -multicast-weights,fp16,best-throughput 20 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | import os 5 | import onnx 6 | from onnx import numpy_helper 7 | 8 | 9 | # executes the command and writes it down in the command.txt. 
The first time, mode is 'w', then 'a' (append) 10 | def execute(cmd_elements, write_to_file, mode): 11 | cmd_str = ' '.join(str(x) for x in cmd_elements) 12 | redirect = f" 2>&1 | ts > {write_to_file}" 13 | cmd_str += redirect 14 | print(f"Executing: {cmd_str}") 15 | os.system(cmd_str) 16 | with open(write_to_file, mode) as file: 17 | file.write(cmd_str + "\n\n") 18 | 19 | 20 | def scale_conv(model, conv_name, scale_factor): 21 | cnodes = [x for x in model.graph.node if x.name == conv_name] 22 | assert len(cnodes) == 1, f"Node '{conv_name}' not found" 23 | x, w, b = cnodes[0].input 24 | wi, bi = "", "" 25 | for i, init in enumerate(model.graph.initializer): 26 | if init.name == w: 27 | wi = i 28 | elif init.name == b: 29 | bi = i 30 | if wi != "" and bi != "": 31 | break 32 | else: 33 | raise ValueError(f"Cannot find indices of weight: {w} and bias: {b}") 34 | ww = numpy_helper.to_array(model.graph.initializer[wi]) 35 | bb = numpy_helper.to_array(model.graph.initializer[bi]) 36 | model.graph.initializer[wi].raw_data = (ww / scale_factor).tobytes() 37 | model.graph.initializer[bi].raw_data = (bb / scale_factor).tobytes() 38 | -------------------------------------------------------------------------------- /tutorials/Computer-Vision/Perfomance-Tuning-Beginner/resnet_base_dopt_throughput.json: -------------------------------------------------------------------------------- 1 | { 2 | "search-mode": "optimized", 3 | "objective": "max-throughput", 4 | "search-parameters": { 5 | "cores": { 6 | "min": 1, 7 | "max": 14 8 | }, 9 | "mos": { 10 | "min": 1, 11 | "max": 8 12 | }, 13 | "ols": { 14 | "min": 1, 15 | "max": 8 16 | }, 17 | "batch-size": { 18 | "min": 1, 19 | "max": 16 20 | }, 21 | "instances": { 22 | "min": 1, 23 | "max": 14 24 | }, 25 | "set-size": { 26 | "min": 1, 27 | "max": 10 28 | } 29 | }, 30 | "initial-values": [ 31 | { 32 | "cores": 1, 33 | "mos": 1, 34 | "ols": 1, 35 | "batch-size": 1, 36 | "instances": 14, 37 | "set-size": 1 38 | 39 | }, 40 | { 41 | "cores": 2, 42 | "mos": 1, 43 | "ols": 1, 44 | "batch-size": 1, 45 | "instances": 7, 46 | "set-size": 1 47 | }, 48 | { 49 | "cores": 4, 50 | "mos": 1, 51 | "ols": 1, 52 | "batch-size": 1, 53 | "instances": 3, 54 | "set-size": 1 55 | }, 56 | { 57 | "cores": 7, 58 | "mos": 1, 59 | "ols": 1, 60 | "batch-size": 1, 61 | "instances": 2, 62 | "set-size": 1 63 | }, 64 | { 65 | "cores": 14, 66 | "mos": 1, 67 | "ols": 1, 68 | "batch-size": 1, 69 | "instances": 1, 70 | "set-size": 1 71 | } 72 | ] 73 | } -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/utils.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | 6 | import os 7 | import onnx 8 | from onnx import numpy_helper 9 | 10 | 11 | # executes the command and writes it down in the command.txt. 
The first time, mode is 'w', then 'a' (append) 12 | def execute(cmd_elements, write_to_file, mode): 13 | cmd_str = ' '.join(str(x) for x in cmd_elements) 14 | redirect = f" 2>&1 | ts > {write_to_file}" 15 | cmd_str += redirect 16 | print(f"Executing: {cmd_str}") 17 | os.system(cmd_str) 18 | with open(write_to_file, mode) as file: 19 | file.write(cmd_str + "\n\n") 20 | 21 | 22 | def scale_conv(model, conv_name, scale_factor): 23 | cnodes = [x for x in model.graph.node if x.name == conv_name] 24 | assert len(cnodes) == 1, f"Node '{conv_name}' not found" 25 | x, w, b = cnodes[0].input 26 | wi, bi = "", "" 27 | for i, init in enumerate(model.graph.initializer): 28 | if init.name == w: 29 | wi = i 30 | elif init.name == b: 31 | bi = i 32 | if wi != "" and bi != "": 33 | break 34 | else: 35 | raise ValueError(f"Cannot find indices of weight: {w} and bias: {b}") 36 | ww = numpy_helper.to_array(model.graph.initializer[wi]) 37 | bb = numpy_helper.to_array(model.graph.initializer[bi]) 38 | model.graph.initializer[wi].raw_data = (ww / scale_factor).tobytes() 39 | model.graph.initializer[bi].raw_data = (bb / scale_factor).tobytes() 40 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/utils.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | import os 6 | import onnx 7 | from onnx import numpy_helper 8 | 9 | 10 | # executes the command and writes it down in the command.txt. The first time, mode is 'w', then 'a' (append) 11 | def execute(cmd_elements, write_to_file, mode): 12 | cmd_str = ' '.join(str(x) for x in cmd_elements) 13 | redirect = f" 2>&1 | ts > {write_to_file}" 14 | cmd_str += redirect 15 | print(f"Executing: {cmd_str}") 16 | os.system(cmd_str) 17 | with open(write_to_file, mode) as file: 18 | file.write(cmd_str + "\n\n") 19 | 20 | 21 | def scale_conv(model, conv_name, scale_factor): 22 | cnodes = [x for x in model.graph.node if x.name == conv_name] 23 | assert len(cnodes) == 1, f"Node '{conv_name}' not found" 24 | x, w, b = cnodes[0].input 25 | wi, bi = "", "" 26 | for i, init in enumerate(model.graph.initializer): 27 | if init.name == w: 28 | wi = i 29 | elif init.name == b: 30 | bi = i 31 | if wi != "" and bi != "": 32 | break 33 | else: 34 | raise ValueError(f"Cannot find indices of weight: {w} and bias: {b}") 35 | ww = numpy_helper.to_array(model.graph.initializer[wi]) 36 | bb = numpy_helper.to_array(model.graph.initializer[bi]) 37 | model.graph.initializer[wi].raw_data = (ww / scale_factor).tobytes() 38 | model.graph.initializer[bi].raw_data = (bb / scale_factor).tobytes() 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted (subject to the limitations in the 5 | disclaimer below) provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above 11 | copyright notice, this list of conditions and the following 12 | disclaimer in the documentation and/or other materials provided 13 | with the distribution. 14 | 15 | * Neither the name of Qualcomm Technologies, Inc. nor the names of its 16 | contributors may be used to endorse or promote products derived 17 | from this software without specific prior written permission. 18 | 19 | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE 20 | GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT 21 | HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 22 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 23 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 24 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 27 | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 29 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 30 | OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 31 | IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | 33 | SPDX-License-Identifier: BSD-3-Clause-Clear 34 | -------------------------------------------------------------------------------- /tutorials/README.md: -------------------------------------------------------------------------------- 1 | Tutorials are Jupyter notebools designed to walk the developer through the Cloud AI inference workflow. The tutorials are split into 2 categories - CV and NLP. Overall, the inference workflow for CV and NLP models are very similar and have been presented for convenience. 2 | 3 | `Model-Onboarding` - This is one of the beginner notebooks. This goes through exporting and preparing the model, compiling the model using a CLI tool and executing inference using CLI tool / Python APIs. 4 | 5 | `Performance-Tuning` - This is another beginner notebook that walks the developer through the key parameters to optimize for best performance (latency and throughput) on Cloud AI platforms. Going through this notebook and 'Performance Tuning' section in the Quick start guide will equip developers with a intuitive understanding of how to use the key parameters to meet inference application KPIs (AI compute resource usage, throughput and latency). 6 | 7 | `Profiler` - This is a intermediate-level notebook that describes system and device level inference profiling capabilities. Developers can use the tools and techniques described in this tutorial to measure application/device level latency and identify system/device bottlenecks. 8 | 9 | 10 | ### Pre-requisites 11 | 1. Clone this repo 12 | 2. Create python3.8 venv and activate it. 13 | `python3.8 -m venv jn_env`
14 | `source jn_env/bin/activate`
15 | 3. Install qaic 16 | `pip install /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl` 17 | 4. Install Jupyter notebook 18 | `pip install notebook` 19 | `pip install urllib3==1.26.6` 20 | 5. Run the notebook 21 | `jupyter notebook --allow-root --ip 0.0.0.0 --no-browser`.
22 | You should see `http://ip-xx-yyy-zzz-aaa.us-west-2.compute.internal:8888/tree?token=<token>`.
23 | On the local machine, type `http://xx.yyy.zzz.aaa:8888/tree?token=` on a browser to run the tutorial notebooks. 24 | -------------------------------------------------------------------------------- /models/speech/whisper/README.md: -------------------------------------------------------------------------------- 1 | # Whisper 2 | 3 | [Whisper](https://github.com/openai/whisper) is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification. 4 | 5 | ## Environment and dependencies 6 | 7 | ```commandline 8 | python3.10 -m venv whisper_env 9 | source whisper_env/bin/activate 10 | pip3 install -r requirements.txt 11 | 12 | sudo apt-get update 13 | sudo apt-get install libsndfile1 ffmpeg 14 | ``` 15 | 16 | ## Model generation 17 | 18 | The following command generates encoder and decoder ONNX files in the `output_whisper` folder: 19 | ```commandline 20 | python3 generateModel.py --model-name base --output-dir output_whisper 21 | ``` 22 | 23 | **Note** Check here for additional model variants:
24 | https://github.com/openai/whisper#available-models-and-languages 25 | 26 | 27 | ## Model compilation 28 | 29 | AIC binaries folder 30 | 31 | ```commandline 32 | mkdir ./whisper_AIC 33 | ``` 34 | 35 | Whisper encoder 36 | 37 | ```commandline 38 | rm -rf ./whisper_AIC/whisper-encoder 39 | /opt/qti-aic/exec/qaic-exec -m=./output_whisper/encoder_model.onnx -aic-hw -aic-num-cores=12 -mos=2 -ols=1 -convert-to-fp16 -onnx-define-symbol=batch_size,1 -onnx-define-symbol=feature_size,80 -onnx-define-symbol=encoder_sequence_length,3000 -aic-binary-dir=./whisper_AIC/whisper-encoder -compile-only 40 | ``` 41 | 42 | Whisper decoder 43 | 44 | ```commandline 45 | rm -rf ./whisper_AIC/whisper-decoder 46 | /opt/qti-aic/exec/qaic-exec -m=./output_whisper/decoder_model.onnx -aic-hw -aic-num-cores=12 -mos=2 -ols=1 -convert-to-fp16 -onnx-define-symbol=batch_size,1 -onnx-define-symbol=encoder_sequence_length,1500 -onnx-define-symbol=decoder_sequence_length,150 -aic-binary-dir=./whisper_AIC/whisper-decoder -compile-only 47 | ``` 48 | 49 | ## Model execution 50 | 51 | ```commandline 52 | sudo ./whisper_env/bin/python3 runModel.py 53 | ``` 54 | -------------------------------------------------------------------------------- /samples/python/qaic_features/benchmarking_eg.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | SPDX-License-Identifier: BSD-3-Clause-Clear 4 | ''' 5 | 6 | import qaic 7 | import numpy as np 8 | import argparse 9 | 10 | # Establish arguments to accept 11 | def get_args(): 12 | 13 | parser = argparse.ArgumentParser() 14 | 15 | parser.add_argument( 16 | "--model-path", 17 | dest='model_path', 18 | default= 19 | '/opt/qti-aic/integrations/qaic_onnxrt/tests/resnet50/resnet50-v1-12-batch.onnx', 20 | help='Pass path to qpc of this model to avoid compilation') 21 | 22 | parser.add_argument( 23 | "--config-path", 24 | dest='config_path', 25 | default= 26 | './resnet_config.yaml', 27 | help='Pass path to qpc of this model to avoid compilation') 28 | 29 | parser.add_argument( 30 | "--input", 31 | dest='input_img', 32 | help= 33 | 'If image is not provided, random values will be generated as input. Input image should be 1*3*224*224 pixel in raw format' 34 | ) 35 | 36 | parser.add_argument( 37 | "--num_iters", 38 | dest='num_iters', 39 | default=1000, 40 | help='Enter number of inferences you want to run on the model') 41 | 42 | return parser.parse_args() 43 | 44 | def main(args): 45 | 46 | resnet_sess = qaic.Session( 47 | args.model_path, 48 | options_path=args.config_path) 49 | 50 | input_shape, input_type = resnet_sess.model_input_shape_dict['data'] 51 | 52 | # Read input 53 | 54 | if args.input_img is None: 55 | x = np.random.randn(*input_shape).astype(input_type) 56 | else: 57 | img = np.fromfile(args.input_img, dtype=input_type) 58 | x = np.resize(img, input_shape) 59 | 60 | # Run Benchmarking 61 | input_dict = {'data': x} 62 | 63 | inf_completed, inf_rate, inf_time, batch_size = resnet_sess.run_benchmark(num_inferences = args.num_iters, 64 | input_dict=input_dict) 65 | 66 | if __name__ == '__main__': 67 | args = get_args() 68 | main(args) 69 | -------------------------------------------------------------------------------- /models/language_processing/encoder/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | import os 5 | from transformers import AutoTokenizer 6 | import numpy as np 7 | import qaic 8 | 9 | class QAicEmbeddingModel(): 10 | def __init__(self, model_name='BAAI/bge-large-en-v1.5', qpc_path='./models/BAAI/bge-large-en-v1.5/compiled-bin-fp16-B1-C4-A3-OLS2-MOS1-best-throughput', device=0): 11 | self.tokenizer = AutoTokenizer.from_pretrained(model_name) 12 | self.aic_session = qaic.Session(model_path=os.path.join(qpc_path, 'programqpc.bin'), dev_id=device) 13 | self.name = model_name 14 | 15 | self.aic_session.setup() 16 | 17 | def generate(self, input): 18 | tokens = self.tokenizer(input, padding=True, return_tensors='np') 19 | 20 | input_data = {'input_ids': None, 21 | 'attention_mask': None} 22 | 23 | for k in input_data.keys(): 24 | input_shape, input_type = self.aic_session.model_input_shape_dict[k] 25 | 26 | rows, cols = tokens[k].shape 27 | input_data[k] = np.zeros(input_shape, dtype=input_type) 28 | input_data[k][:rows, :cols] = tokens[k] 29 | 30 | outputs = self.aic_session.run(input_data) 31 | 32 | output_shape, output_type = self.aic_session.model_output_shape_dict['token_embeddings'] 33 | token_embeddings = np.frombuffer(outputs['token_embeddings'], dtype=output_type).reshape(output_shape) 34 | token_embeddings = token_embeddings[:, 0] 35 | 36 | output_shape, output_type = self.aic_session.model_output_shape_dict['sentence_embedding'] 37 | sentence_embedding = np.frombuffer(outputs['sentence_embedding'], dtype=output_type).reshape(output_shape) 38 | 39 | return token_embeddings, sentence_embedding 40 | 41 | def main(): 42 | inputs_txt = 'your_text_here' 43 | model = QAicEmbeddingModel() 44 | token_embedding, sentence_embeddings = model.generate(inputs_txt) 45 | print('token_embedding {}'.format(token_embedding)) 46 | print('sentence_embeddings {}'.format(sentence_embeddings)) 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /samples/python/qaic_features/metrics_eg.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | SPDX-License-Identifier: BSD-3-Clause-Clear 4 | ''' 5 | 6 | import qaic 7 | import numpy as np 8 | import argparse 9 | 10 | # Establish arguments to accept 11 | def get_args(): 12 | 13 | parser = argparse.ArgumentParser() 14 | 15 | parser.add_argument( 16 | "--model-path", 17 | dest='model_path', 18 | default= 19 | '/opt/qti-aic/integrations/qaic_onnxrt/tests/resnet50/resnet50-v1-12-batch.onnx', 20 | help='Pass path to qpc of this model to avoid compilation') 21 | 22 | parser.add_argument( 23 | "--config-path", 24 | dest='config_path', 25 | default= 26 | './resnet_config.yaml', 27 | help='Pass path to qpc of this model to avoid compilation') 28 | 29 | parser.add_argument( 30 | "--input", 31 | dest='input_img', 32 | help= 33 | 'If image is not provided, random values will be generated as input. 
Input image should be 1*3*224*224 pixel in raw format' 34 | ) 35 | 36 | parser.add_argument( 37 | "--num_iters", 38 | dest='num_iters', 39 | default=1000, 40 | help='Enter number of inferences you want to run on the model') 41 | 42 | return parser.parse_args() 43 | 44 | def main(args): 45 | 46 | resnet_sess = qaic.Session( 47 | args.model_path, 48 | options_path=args.config_path, 49 | enable_metrics=True) 50 | 51 | input_shape, input_type = resnet_sess.model_input_shape_dict['data'] 52 | 53 | # Read input 54 | 55 | if args.input_img is None: 56 | x = np.random.randn(*input_shape).astype(input_type) 57 | else: 58 | img = np.fromfile(args.input_img, dtype=input_type) 59 | x = np.resize(img, input_shape) 60 | 61 | # Run inference 62 | input_dict = {'data': x} 63 | 64 | for _ in range(args.num_iters): 65 | resnet_sess.run(input_dict) 66 | 67 | print('\n\n\n\n-------------- Metrics --------------\n\n\n\n') 68 | resnet_sess.print_metrics() 69 | metrics = resnet_sess.get_metrics() 70 | 71 | if __name__ == '__main__': 72 | args = get_args() 73 | main(args) -------------------------------------------------------------------------------- /samples/python/qaic_features/profiling_eg.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | SPDX-License-Identifier: BSD-3-Clause-Clear 4 | ''' 5 | 6 | import qaic 7 | import numpy as np 8 | import argparse 9 | 10 | # Establish arguments to accept 11 | def get_args(): 12 | 13 | parser = argparse.ArgumentParser() 14 | 15 | parser.add_argument( 16 | "--model-path", 17 | dest='model_path', 18 | default= 19 | '/opt/qti-aic/integrations/qaic_onnxrt/tests/resnet50/resnet50-v1-12-batch.onnx', 20 | help='Pass path to qpc of this model to avoid compilation') 21 | 22 | parser.add_argument( 23 | "--config-path", 24 | dest='config_path', 25 | default= 26 | './resnet_config.yaml', 27 | help='Pass path to qpc of this model to avoid compilation') 28 | 29 | parser.add_argument( 30 | "--input", 31 | dest='input_img', 32 | help= 33 | 'If image is not provided, random values will be generated as input. 
Input image should be 1*3*224*224 pixel in raw format' 34 | ) 35 | 36 | parser.add_argument( 37 | "--num_iters", 38 | dest='num_iters', 39 | default=1000, 40 | help='Enter number of inferences you want to run on the model') 41 | 42 | return parser.parse_args() 43 | 44 | def main(args): 45 | 46 | resnet_sess = qaic.Session( 47 | args.model_path, 48 | options_path=args.config_path, 49 | enable_profiling=True) 50 | 51 | input_shape, input_type = resnet_sess.model_input_shape_dict['data'] 52 | 53 | # Read input 54 | 55 | if args.input_img is None: 56 | x = np.random.randn(*input_shape).astype(input_type) 57 | else: 58 | img = np.fromfile(args.input_img, dtype=input_type) 59 | x = np.resize(img, input_shape) 60 | 61 | # Run inference 62 | input_dict = {'data': x} 63 | 64 | for _ in range(args.num_iters): 65 | resnet_sess.run(input_dict) 66 | 67 | print('\n\n\n\n-------------- Metrics --------------\n\n\n\n') 68 | resnet_sess.print_metrics() 69 | print('\n\n\n\n-------------- Profile Data --------------\n\n\n\n') 70 | resnet_sess.print_profile_data(n=5) 71 | metrics = resnet_sess.get_metrics() 72 | 73 | if __name__ == '__main__': 74 | args = get_args() 75 | main(args) -------------------------------------------------------------------------------- /samples/python/common_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | SPDX-License-Identifier: BSD-3-Clause-Clear 4 | ''' 5 | 6 | import os 7 | import yaml 8 | import inspect 9 | 10 | def generate_bin(onnx_filename, yaml_filename): 11 | """ 12 | Generate compiled binary for QAIC 13 | 14 | Args: 15 | onnx_path : path to onnx file. 16 | yaml_path : path to yaml file which has compile time arguments. 17 | 18 | Returns: 19 | qpc_path : path to qpc (compiled binary) 20 | """ 21 | caller_path = inspect.stack()[1].filename #os.path.dirname(os.path.realpath 22 | onnx_path = os.path.join(os.path.dirname(caller_path), onnx_filename) 23 | yaml_path = os.path.join(os.path.dirname(caller_path), yaml_filename) 24 | 25 | filename, extension = os.path.splitext(onnx_filename) 26 | onnx_folder = os.path.dirname(onnx_path) 27 | qpc_bin = os.path.join(os.path.dirname(caller_path), filename+'_qpc') 28 | with open(yaml_path, "r") as file: 29 | yaml_data = yaml.load(file, Loader=yaml.FullLoader) 30 | 31 | if os.path.isdir(qpc_bin): 32 | print(f'INFO: Removing existing QPC {qpc_bin}') 33 | cmd = f'sudo rm -fr {qpc_bin}' 34 | os.system(cmd) 35 | print(f'INFO: Existing QPC {qpc_bin} is removed') 36 | 37 | # create the command string from the yaml arguments. 
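    # For example, a YAML config with (hypothetical) entries such as
    #   aic_num_cores: 4
    #   convert_to_fp16: true
    #   onnx_define_symbol:
    #     batch_size: 1
    # is turned by the loop below into flags like
    #   -aic-num-cores=4 -convert-to-fp16 -onnx-define-symbol=batch_size,1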
38 | cmd_list = [f'/opt/qti-aic/exec/qaic-exec -m={onnx_path} -aic-hw -aic-hw-version={2.0}'] 39 | 40 | # ignore the following arguments: 41 | ignore = ['num-activations', 'set-size'] 42 | replace_dict = {'aic_num_cores':'aic-num-cores'} 43 | 44 | for arg, value in yaml_data.items(): 45 | arg = arg.replace('_','-') 46 | if arg in ignore: 47 | continue 48 | if isinstance(value, bool): 49 | if value:# include the argument only if true; for example -convert-to-fp16 50 | cmd_list.append(f'-{arg}') 51 | elif isinstance(value, dict): 52 | for subarg, subval in value.items(): 53 | cmd_list.append(f'-{arg}={subarg},{subval}') 54 | else: 55 | cmd_list.append(f'-{arg}={value}') 56 | 57 | cmd_list.append(f'-aic-binary-dir={qpc_bin}') 58 | 59 | cmd = ' '.join(cmd_list) 60 | print(f'INFO: Running the compile cmd: {cmd}') 61 | os.system(cmd) 62 | 63 | return qpc_bin 64 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-xl-base-1.0/fix_vae_decoder_onnx.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | import onnx 6 | from onnx import numpy_helper 7 | 8 | def scale_conv(model, conv_name, scale_factor): 9 | cnodes = [x for x in model.graph.node if x.name == conv_name] 10 | assert len(cnodes) == 1, f"Node '{conv_name}' not found" 11 | x, w, b = cnodes[0].input 12 | wi, bi = "", "" 13 | for i, init in enumerate(model.graph.initializer): 14 | if init.name == w: 15 | wi = i 16 | elif init.name == b: 17 | bi = i 18 | if wi != "" and bi != "": 19 | break 20 | else: 21 | raise ValueError(f"Cannot find indices of weight: {w} and bias: {b}") 22 | ww = numpy_helper.to_array(model.graph.initializer[wi]) 23 | bb = numpy_helper.to_array(model.graph.initializer[bi]) 24 | model.graph.initializer[wi].raw_data = (ww / scale_factor).tobytes() 25 | model.graph.initializer[bi].raw_data = (bb / scale_factor).tobytes() 26 | 27 | 28 | def main(model_path, scaling_factor): 29 | model = onnx.load(model_path) 30 | scale_conv(model, "/decoder/up_blocks.2/upsamplers.0/conv/Conv", scaling_factor) 31 | scale_conv(model, "/decoder/up_blocks.3/resnets.0/conv2/Conv", scaling_factor) 32 | # scale_conv(model, "/decoder/up_blocks.3/resnets.0/conv_shortcut/Conv", scaling_factor) 33 | scale_conv(model, "/decoder/up_blocks.3/resnets.1/conv2/Conv", scaling_factor) 34 | scale_conv(model, "/decoder/up_blocks.3/resnets.2/conv2/Conv", scaling_factor) 35 | output_path = model_path[:-5] + f"_fixed_{scaling_factor}.onnx" 36 | onnx.save(model, output_path) 37 | 38 | 39 | if __name__ == "__main__": 40 | import argparse 41 | argp = argparse.ArgumentParser() 42 | argp.add_argument( 43 | "--model-path", 44 | default="stabilityai/stable-diffusion-xl-base-1.0/vae_decoder/model.onnx", 45 | help="Model path to fix", 46 | ) 47 | argp.add_argument("--scaling-factor", default=128, type=int, help="Scaling factor") 48 | args = argp.parse_args() 49 | main(**vars(args)) 50 | -------------------------------------------------------------------------------- /models/speech/whisper/generateModel.py: -------------------------------------------------------------------------------- 1 | 
#################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | 6 | import os 7 | import argparse 8 | import numpy as np 9 | import torch 10 | import whisper 11 | from audio import AudioSample 12 | 13 | def main(model_name: str, output_dir: str): 14 | cache_path = './cache' 15 | 16 | audio_sample = AudioSample() 17 | audio_path = audio_sample.to_file() 18 | 19 | audio = whisper.load_audio(audio_path) # Read audio from file 20 | audio_pad = whisper.pad_or_trim(audio) # Padding and trimming 21 | 22 | # make log-Mel spectrogram and move to the same device as the model 23 | input_features = whisper.log_mel_spectrogram(audio_pad) # convert to mel spectrogram 24 | input_features = torch.unsqueeze(input_features, 0) # add batch dimension 25 | 26 | model = whisper.load_model(model_name, download_root=cache_path) 27 | audio_features = model.encoder(input_features) 28 | decoder_input_ids = torch.tensor([[50258]]) 29 | 30 | if not os.path.exists(output_dir): 31 | os.makedirs(output_dir) 32 | 33 | # Encoder model 34 | torch.onnx.export( 35 | model.encoder, 36 | (input_features), 37 | os.path.join(output_dir, 'encoder_model.onnx'), 38 | input_names=['input_features'], 39 | output_names=['last_hidden_state'], 40 | dynamic_axes={ 41 | 'input_features': {0: 'batch_size', 1: 'feature_size', 2: 'encoder_sequence_length'}, 42 | 'last_hidden_state': {0: 'batch_size'} 43 | } 44 | ) 45 | 46 | # Decoder model 47 | torch.onnx.export( 48 | model.decoder, 49 | (decoder_input_ids, audio_features), 50 | os.path.join(output_dir, 'decoder_model.onnx'), 51 | input_names=['input_ids', 'encoder_hidden_states'], 52 | output_names=['logits'], 53 | dynamic_axes={ 54 | 'input_ids': {0: 'batch_size', 1: 'decoder_sequence_length'}, 55 | 'encoder_hidden_states': {0: 'batch_size', 1: 'encoder_sequence_length'}, 56 | 'logits': {0: 'batch_size', 1: 'decoder_sequence_length'} 57 | } 58 | ) 59 | 60 | if __name__ == '__main__': 61 | import argparse 62 | 63 | argp = argparse.ArgumentParser() 64 | argp.add_argument( 65 | '--model-name', 66 | required=True, 67 | help='Model name to generate', 68 | ) 69 | argp.add_argument( 70 | '--output-dir', 71 | required=False, 72 | help='Path to store generated ONNX files', 73 | default='./' 74 | ) 75 | args = argp.parse_args() 76 | main(**vars(args)) 77 | -------------------------------------------------------------------------------- /models/language_processing/decoder/DeciCoder-6b/README.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | [DeciCoder-6b](https://huggingface.co/Deci/DeciCoder-6b) is a decoder-only large language model (LLM) developed by [Deci Ai](https://deci.ai) for code generation tasks. The architectures of the model was developed by AutoNAC which is Deci Ai's proprietary Neural Architecture Search technology. The model has a context length of 2048 tokens and is trained on the Python, Java, Javascript, C++, C#, Go, and Rust subsets of [The-Stack](https://huggingface.co/datasets/bigcode/the-stack) dataset. 4 | 5 | # Running on AIC100 6 | 7 | ## Available Compute Resources 8 | The following cloud provider instances are equipped with AIC100 accelerators. 
9 | 10 | 11 | 12 | |Provider | [AWS DL2q Instance](https://aws.amazon.com/ec2/instance-types/dl2q/) | [Cirrascale Instance](https://cirrascale.com/solutions-qualcomm-cloud-ai100.php) | 13 | | --------------------- | --------------------- | -------------------------- | 14 | |Cloud-AI Accelerators | 8 Std (14 NSPs) SKUs | 1 to 8 Pro (16 NSPs) SKUs | 15 | |Supported Formats for [DeciCoder-6b](https://huggingface.co/Deci/DeciCoder-6b)| FP16 and [MX6](https://arxiv.org/abs/2302.08007) | FP16 and [MX6](https://arxiv.org/abs/2302.08007) | 16 | 17 | ## Source of the Model 18 | 19 | The model is downloaded from [HuggingFace](https://huggingface.co/Deci/DeciCoder-6b). 20 | 21 | ## Environment and Dependencies 22 | Create Python virtual environment and activate. 23 | 24 | ```commandline 25 | python3.10 -m venv llm_env 26 | source llm_env/bin/activate 27 | pip3 install -r requirements.txt 28 | ``` 29 | 30 | Install the dependencies. 31 | 32 | ```commandline 33 | git clone --branch v4.35.2 --depth 1 https://github.com/huggingface/transformers transformers-dev 34 | cd transformers-dev 35 | git apply ../Llama2_4.35.2.patch 36 | pip3 install . 37 | cd .. 38 | ``` 39 | 40 | ## Model and Hardware Parameters 41 | Customize the model repo/name and the compilation parameters in `init.sh`. Model will be compiled using MX6 compression. Let MX="" if you want to avoid MX6 compression. BS, PL and CL are Batchsize, Prompt Length and Context Length respectively. 42 | 43 | ```commandline 44 | source init.sh 45 | ``` 46 | 47 | ## Model Generation 48 | Generate the model into onnx format. 49 | 50 | ```commandline 51 | python generateModel.py --model-name ${MODEL_REPO}/${MODEL_NAME} --model-class LlamaForCausalLM 52 | ``` 53 | 54 | ## Model Compilation for AIC100 55 | Compile the onnx format into bin file. Modify BS, PL, CL, CORES, and MX if needed. 56 | 57 | ```commandline 58 | bash compileModel.sh $MODEL_NAME $BS $PL $CL $CORES $MX 59 | ``` 60 | 61 | ## Model Execution on AIC100 62 | Run the compiled model binary on AIC100. Modify DEVICE_ID if needed. Run `/opt/qti-aic/tools/qaic-util -q` to check available devices. 63 | 64 | ```commandline 65 | export PROMPT="insert your prompt here" 66 | export DEVICE_ID=0 67 | python runModel.py --model-name ${MODEL_REPO}/${MODEL_NAME} --qpc ./qpc/${MODEL_NAME}-kv-${PL}pl-${CL}cl-${CORES}c${MX} --device_id $DEVICE_ID --prompt "${PROMPT}" 68 | ``` 69 | 70 | ## References 71 | - [Shared Micro-exponents](https://arxiv.org/abs/2302.08007) 72 | 73 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/onnx_generation/onnx_gen_utils.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | from packaging import version 6 | import torch 7 | import onnx 8 | from onnx import external_data_helper, numpy_helper 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11") 13 | 14 | 15 | def fix_onnx_fp16( 16 | gen_models_path: str, 17 | ) -> str: 18 | finfo = np.finfo(np.float16) 19 | fp16_max = finfo.max 20 | fp16_min = finfo.min 21 | model = onnx.load(f"{gen_models_path}/model.onnx") 22 | fp16_fix = False 23 | for tensor in external_data_helper._get_all_tensors(model): 24 | nptensor = numpy_helper.to_array(tensor, gen_models_path) 25 | if nptensor.dtype == np.float32 and ( 26 | np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min) 27 | ): 28 | # print(f'tensor value : {nptensor} above {fp16_max} or below {fp16_min}') 29 | nptensor = np.clip(nptensor, fp16_min, fp16_max) 30 | new_tensor = numpy_helper.from_array(nptensor, tensor.name) 31 | tensor.CopyFrom(new_tensor) 32 | fp16_fix = True 33 | 34 | 35 | if fp16_fix: 36 | # Save FP16 model 37 | print("Found constants out of FP16 range, clipped to FP16 range") 38 | onnx.save(model, f=f"{gen_models_path}" / "model_fp16.onnx") 39 | print(f"Saving modified onnx file at {gen_models_path}/model_fp16.onnx") 40 | 41 | 42 | def onnx_export( 43 | model, 44 | model_args: tuple, 45 | output_path: Path, 46 | ordered_input_names, 47 | output_names, 48 | dynamic_axes, 49 | opset, 50 | use_external_data_format=False, 51 | ): 52 | output_path.parent.mkdir(parents=True, exist_ok=True) 53 | # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11, 54 | # so we check the torch version for backwards compatibility 55 | if is_torch_less_than_1_11: 56 | torch.onnx.export( 57 | model, 58 | model_args, 59 | f=output_path.as_posix(), 60 | input_names=ordered_input_names, 61 | output_names=output_names, 62 | dynamic_axes=dynamic_axes, 63 | do_constant_folding=True, 64 | use_external_data_format=use_external_data_format, 65 | enable_onnx_checker=True, 66 | opset_version=opset, 67 | ) 68 | else: 69 | torch.onnx.export( 70 | model, 71 | model_args, 72 | f=output_path.as_posix(), 73 | input_names=ordered_input_names, 74 | output_names=output_names, 75 | dynamic_axes=dynamic_axes, 76 | do_constant_folding=True, 77 | opset_version=opset, 78 | ) 79 | 80 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/run_config_gen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause-Clear 5 | 6 | # model configs 7 | MODEL_PATH="stabilityai/sdxl-turbo" 8 | PROMPT="\"photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece\"" 9 | VAE_TYPE="vae" 10 | IMAGE_SIZE=512 11 | BLOCK_SIZE=256 12 | BATCH_SIZE=1 13 | 14 | # onnx configs 15 | GENERATE_ONNX=true 16 | ONNX_TEXT_ENCODER=true 17 | ONNX_UNET=true 18 | ONNX_VAE=true 19 | 20 | # compile configs 21 | NUM_CORES=16 22 | VAE_MOS=2 23 | VAE_OLS=1 24 | UNET_MOS=2 25 | UNET_OLS=1 26 | COMPILE_TEXT_ENCODER=true 27 | COMPILE_UNET=true 28 | COMPILE_VAE=true 29 | 30 | # inference configs 31 | RUN_ONLY=false 32 | DEVICE=0 33 | NUM_STEPS=1 34 | WARMUP_ITERS=3 35 | REPEAT_ITERS=1 36 | 37 | # mode 38 | TOGETHER=false 39 | 40 | if [ ${GENERATE_ONNX} == true ] 41 | then 42 | GENERATE_ONNX_CMD="--generate-onnx" 43 | else 44 | GENERATE_ONNX_CMD="" 45 | fi 46 | 47 | if [ ${ONNX_TEXT_ENCODER} == true ] 48 | then 49 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder" 50 | else 51 | ONNX_TEXT_ENCODER_CMD="" 52 | fi 53 | 54 | if [ ${ONNX_UNET} == true ] 55 | then 56 | ONNX_UNET_CMD="--onnx-unet" 57 | else 58 | ONNX_UNET_CMD="" 59 | fi 60 | 61 | if [ ${ONNX_VAE} == true ] 62 | then 63 | ONNX_VAE_CMD="--onnx-vae" 64 | else 65 | ONNX_VAE_CMD="" 66 | fi 67 | 68 | if [ ${COMPILE_TEXT_ENCODER} == true ] 69 | then 70 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder" 71 | else 72 | COMPILE_TEXT_ENCODER_CMD="" 73 | fi 74 | 75 | if [ ${COMPILE_UNET} == true ] 76 | then 77 | COMPILE_UNET_CMD="--compile-unet" 78 | else 79 | COMPILE_UNET_CMD="" 80 | fi 81 | 82 | if [ ${COMPILE_VAE} == true ] 83 | then 84 | COMPILE_VAE_CMD="--compile-vae" 85 | else 86 | COMPILE_VAE_CMD="" 87 | fi 88 | 89 | if [ ${RUN_ONLY} == true ] 90 | then 91 | RUN_ONLY_CMD="--run-only" 92 | else 93 | RUN_ONLY_CMD="" 94 | fi 95 | 96 | if [ ${TOGETHER} == true ] 97 | then 98 | TOGETHER_CMD="--together" 99 | else 100 | TOGETHER_CMD="" 101 | fi 102 | 103 | export HF_HOME="cache" 104 | 105 | rm run.sh 106 | 107 | scripts="python main.py \ 108 | --model-path $MODEL_PATH \ 109 | --prompt $PROMPT \ 110 | --vae-type $VAE_TYPE \ 111 | --batch-size $BATCH_SIZE \ 112 | --image-size $IMAGE_SIZE \ 113 | --block-size $BLOCK_SIZE \ 114 | --num-cores $NUM_CORES \ 115 | --vae-mos $VAE_MOS \ 116 | --vae-ols $VAE_OLS \ 117 | --unet-mos $UNET_MOS \ 118 | --unet-ols $UNET_OLS \ 119 | --device $DEVICE \ 120 | --num-steps $NUM_STEPS \ 121 | --num-warmup-iters $WARMUP_ITERS \ 122 | --num-repeat-iters $REPEAT_ITERS \ 123 | $ONNX_TEXT_ENCODER_CMD \ 124 | $ONNX_UNET_CMD \ 125 | $ONNX_VAE_CMD \ 126 | $COMPILE_TEXT_ENCODER_CMD \ 127 | $COMPILE_UNET_CMD \ 128 | $COMPILE_VAE_CMD \ 129 | $GENERATE_ONNX_CMD \ 130 | $RUN_ONLY_CMD \ 131 | $TOGETHER_CMD" 132 | 133 | echo $scripts >> run.sh 134 | 135 | bash run.sh 136 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/run_config_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause-Clear 5 | 6 | PYTHON=$1 7 | echo $PYTHON 8 | 9 | # model configs 10 | MODEL_PATH="stabilityai/sdxl-turbo" 11 | PROMPT="\"photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece\"" 12 | VAE_TYPE="vae" 13 | IMAGE_SIZE=512 14 | BLOCK_SIZE=256 15 | BATCH_SIZE=1 16 | 17 | # onnx configs 18 | GENERATE_ONNX=false 19 | ONNX_TEXT_ENCODER=false 20 | ONNX_UNET=false 21 | ONNX_VAE=false 22 | 23 | # compile configs 24 | NUM_CORES=16 25 | VAE_MOS=2 26 | VAE_OLS=1 27 | UNET_MOS=2 28 | UNET_OLS=1 29 | COMPILE_TEXT_ENCODER=false 30 | COMPILE_UNET=false 31 | COMPILE_VAE=false 32 | 33 | # inference configs 34 | RUN_ONLY=true 35 | DEVICE=0 36 | NUM_STEPS=1 37 | WARMUP_ITERS=3 38 | REPEAT_ITERS=3 39 | 40 | # mode 41 | TOGETHER=false 42 | 43 | if [ ${GENERATE_ONNX} == true ] 44 | then 45 | GENERATE_ONNX_CMD="--generate-onnx" 46 | else 47 | GENERATE_ONNX_CMD="" 48 | fi 49 | 50 | if [ ${ONNX_TEXT_ENCODER} == true ] 51 | then 52 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder" 53 | else 54 | ONNX_TEXT_ENCODER_CMD="" 55 | fi 56 | 57 | if [ ${ONNX_UNET} == true ] 58 | then 59 | ONNX_UNET_CMD="--onnx-unet" 60 | else 61 | ONNX_UNET_CMD="" 62 | fi 63 | 64 | if [ ${ONNX_VAE} == true ] 65 | then 66 | ONNX_VAE_CMD="--onnx-vae" 67 | else 68 | ONNX_VAE_CMD="" 69 | fi 70 | 71 | if [ ${COMPILE_TEXT_ENCODER} == true ] 72 | then 73 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder" 74 | else 75 | COMPILE_TEXT_ENCODER_CMD="" 76 | fi 77 | 78 | if [ ${COMPILE_UNET} == true ] 79 | then 80 | COMPILE_UNET_CMD="--compile-unet" 81 | else 82 | COMPILE_UNET_CMD="" 83 | fi 84 | 85 | if [ ${COMPILE_VAE} == true ] 86 | then 87 | COMPILE_VAE_CMD="--compile-vae" 88 | else 89 | COMPILE_VAE_CMD="" 90 | fi 91 | 92 | if [ ${RUN_ONLY} == true ] 93 | then 94 | RUN_ONLY_CMD="--run-only" 95 | else 96 | RUN_ONLY_CMD="" 97 | fi 98 | 99 | if [ ${TOGETHER} == true ] 100 | then 101 | TOGETHER_CMD="--together" 102 | else 103 | TOGETHER_CMD="" 104 | fi 105 | 106 | export HF_HOME="cache" 107 | 108 | rm run.sh 109 | 110 | scripts="$PYTHON main.py \ 111 | --model-path $MODEL_PATH \ 112 | --prompt $PROMPT \ 113 | --vae-type $VAE_TYPE \ 114 | --batch-size $BATCH_SIZE \ 115 | --image-size $IMAGE_SIZE \ 116 | --block-size $BLOCK_SIZE \ 117 | --num-cores $NUM_CORES \ 118 | --vae-mos $VAE_MOS \ 119 | --vae-ols $VAE_OLS \ 120 | --unet-mos $UNET_MOS \ 121 | --unet-ols $UNET_OLS \ 122 | --device $DEVICE \ 123 | --num-steps $NUM_STEPS \ 124 | --num-warmup-iters $WARMUP_ITERS \ 125 | --num-repeat-iters $REPEAT_ITERS \ 126 | $ONNX_TEXT_ENCODER_CMD \ 127 | $ONNX_UNET_CMD \ 128 | $ONNX_VAE_CMD \ 129 | $COMPILE_TEXT_ENCODER_CMD \ 130 | $COMPILE_UNET_CMD \ 131 | $COMPILE_VAE_CMD \ 132 | $GENERATE_ONNX_CMD \ 133 | $RUN_ONLY_CMD \ 134 | $TOGETHER_CMD" 135 | 136 | echo $scripts >> run.sh 137 | 138 | bash run.sh 139 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/README.md: -------------------------------------------------------------------------------- 1 | ### Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 2 | ### SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | # Instructions to run SD3.5 on Cloud AI 100 5 | 6 | The instructions below are to run the [Stable Diffusion 3.5 model](stabilityai/stable-diffusion-3.5-medium) on Cloud AI 100. 
Compile time parameters may need to be adjusted for different cards and different SDKs. 7 | 8 | ## 1. Download model 9 | 10 | 1. Setup environment varialble 11 | ``` 12 | mkdir cache 13 | export HF_HOME=cache 14 | export HF_TOKEN= 15 | ``` 16 | 17 | 2. Follow [instructions on HuggingFace](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium) to gain access to model. 18 | 19 | ## 2. Generate onnx files and compile for binaries 20 | 21 | 1. Set up a virtual environment for ONNX generation and compilation 22 | ``` 23 | python3.10 -m venv env_onnx 24 | source ./env_onnx/bin/activate 25 | pip install -r requirements.txt 26 | ``` 27 | 28 | 2. Create a folder for caching HuggingFace model downloads 29 | ``` 30 | mkdir compile_logs 31 | mkdir qpc 32 | touch run.sh 33 | ``` 34 | 35 | 3. Install diffusers from source after patching for ONNX file generation 36 | ``` 37 | git clone --depth 1 --branch v0.31.0 https://github.com/huggingface/diffusers.git diffusers-onnx 38 | cd diffusers-onnx 39 | git apply --reject --whitespace=fix ../patches/attention_patch.patch 40 | pip install . 41 | cd .. 42 | ``` 43 | 44 | 4. Install transformers from source (for T5 text_encoder_3 only) 45 | ``` 46 | git clone -b v4.41.2 https://github.com/huggingface/transformers.git transformers-dev 47 | cd transformers-dev 48 | git apply --reject --whitespace=fix ../patches/transformer_patch.patch 49 | pip install . 50 | cd .. 51 | ``` 52 | 53 | 5. Generate ONNX files and model binaries 54 | ``` 55 | bash run_config_gen.sh 56 | ``` 57 | 58 | ## 3. Run the end-to-end SD3 inference 59 | 60 | 1. Set up a separate virtual environment 61 | ``` 62 | python3.10 -m venv env_pipeline 63 | source ./env_pipeline/bin/activate 64 | pip install -r requirements.txt 65 | pip install --force-reinstall /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl 66 | ``` 67 | 68 | 2. Re-install diffusers from source after patching the SD3 pipeline for inference 69 | ``` 70 | git clone --depth 1 --branch v0.31.0 https://github.com/huggingface/diffusers.git diffusers-pipeline 71 | cd diffusers-pipeline 72 | git apply --reject --whitespace=fix ../patches/pipeline_patch.patch 73 | pip install . 74 | cd .. 75 | ``` 76 | 77 | 3. Run the inference with 'sudo' flag if needed to access the AI 100 devices. 78 | ``` 79 | sudo bash run_config_inference.sh "" "" 80 | ``` 81 | 82 | ## 4. Python interface 83 | 84 | ``` 85 | source ./env_pipeline/bin/activate 86 | ``` 87 | 88 | ```python 89 | from model import QAICStableDiffusion3 90 | 91 | model = QAICStableDiffusion3() 92 | prompt = 'photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece' 93 | image = model.generate(prompt, guidance=7.0)[0] 94 | image.save('harbor.png') 95 | ``` 96 | 97 | -------------------------------------------------------------------------------- /samples/python/qaic_features/README.md: -------------------------------------------------------------------------------- 1 | # Python High-Level API (qaic) features 2 | 3 | qaic_features depicts examples on how one can use different features provided by qaic module along with running inferences. 
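
For reference, a basic inference flow (before enabling any of the features below) looks roughly like the following minimal sketch; the ONNX model path, the YAML config path, and the `data` input name are placeholders chosen to match the ResNet example used in this folder:

```python
import qaic
import numpy as np

# Compile/load the model on the AIC100 device using a YAML config
session = qaic.Session('model.onnx', options_path='config.yaml')
session.setup()

# Build a random input matching the model's expected shape/dtype and run one inference
input_shape, input_type = session.model_input_shape_dict['data']
x = np.random.randn(*input_shape).astype(input_type)
output = session.run({'data': x})
```

The metrics, profiling, and benchmarking examples below build on this same flow.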
4 | 5 | a) Metrics 6 | After running inferences on AIC100 chip, if you want to get statistics regarding inference times, you can use get_metrics method as follows: 7 | 8 | ```python 9 | #Create Session with enable_metrics = True 10 | session = qaic.Session( 11 | model_path, 12 | options_path=yaml_config_path, 13 | enable_metrics=True) 14 | 15 | #Create input dictionary 16 | input_dict = {'data': np.array()} 17 | 18 | #Run Inferences 19 | for i in range(100): 20 | session.run(input_dict) 21 | 22 | #Get Metrics 23 | session.print_metrics() 24 | metrics = session.get_metrics() 25 | ``` 26 | 27 | Sample output for session.print_metrics() 28 | 29 | ```bash 30 | Number of inferences utilized for calculation are 999 31 | Minimum latency observed 0.0009578340000000001 s 32 | Maximum latency observed 0.002209001 s 33 | Average latency / inference time observed is 0.0012380756316316324 s 34 | P25 / 25% of inferences observed latency less than 0.001095435 s 35 | P50 / 50% of inferences observed latency less than 0.0012522870000000001 s 36 | P75 / 75% of inferences observed latency less than 0.001299786 s 37 | P90 / 90% of inferences observed latency less than 0.002209001 s 38 | P99 / 99% of inferences observed latency less than 0.0016082370000000002 s 39 | Sum of all the inference times 1.2368375560000007 s 40 | Average latency / inference time observed is 0.0012380756316316324 s 41 | ``` 42 | 43 | 44 | 45 | 46 | b) Profiling 47 | To profile your inferences being performed on AIC100 chip and get inference time statistic metrics, you can use following methods: 48 | 49 | ```python 50 | #Create Session with enable_metrics = True 51 | session = qaic.Session( 52 | model_path, 53 | options_path=yaml_config_path, 54 | enable_profiling=True) 55 | 56 | #Create input dictionary 57 | input_dict = {'data': np.array()} 58 | 59 | #Run Inferences 60 | for i in range(100): 61 | session.run(input_dict) 62 | 63 | #Get Metrics 64 | session.print_metrics() 65 | metrics = session.get_metrics() 66 | session.print_profile_data(n=5) 67 | ``` 68 | 69 | Sample output for session.print_profile_data() 70 | 71 | ```bash 72 | | File-Line-Function | | num calls | | func time | | tot time | 73 | 74 | ('~', 0, "") 1 0.000149101 0.000149101 75 | 76 | ('~', 0, '') 1 2.38e-06 2.38e-06 77 | 78 | ('~', 0, '') 1 4.22e-06 4.22e-06 79 | ``` 80 | 81 | 82 | 83 | 84 | c) Benchmarking 85 | To run benchmarking for model inferences on AIC100 chip, following method can be used: 86 | 87 | ```python 88 | #Create Session with enable_metrics = True 89 | session = qaic.Session( 90 | model_path, 91 | options_path=yaml_config_path) 92 | 93 | #Create input dictionary 94 | input_dict = {'data': np.array()} 95 | 96 | # Run Benchmarking 97 | input_dict = {'data': x} 98 | 99 | inf_completed, inf_rate, inf_time, batch_size = session.run_benchmark(input_dict=input_dict) 100 | ``` 101 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/onnx_generation/onnx_gen_utils.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | from packaging import version 6 | import torch 7 | import onnx 8 | from onnx import external_data_helper, numpy_helper 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11") 13 | 14 | 15 | def fix_onnx_fp16( 16 | gen_models_path: str, 17 | model_base_name: str, 18 | ) -> str: 19 | finfo = np.finfo(np.float16) 20 | fp16_max = finfo.max 21 | fp16_min = finfo.min 22 | model = onnx.load(f"{gen_models_path}/{model_base_name}") 23 | fp16_fix = False 24 | for tensor in external_data_helper._get_all_tensors(model): 25 | nptensor = numpy_helper.to_array(tensor, gen_models_path) 26 | if nptensor.dtype == np.float32 and ( 27 | np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min) 28 | ): 29 | # print(f'tensor value : {nptensor} above {fp16_max} or below {fp16_min}') 30 | nptensor = np.clip(nptensor, fp16_min, fp16_max) 31 | new_tensor = numpy_helper.from_array(nptensor, tensor.name) 32 | tensor.CopyFrom(new_tensor) 33 | fp16_fix = True 34 | 35 | 36 | if fp16_fix: 37 | # Save FP16 model 38 | print("Found constants out of FP16 range, clipped to FP16 range") 39 | model_base_name += "_fix_outofrange_fp16" 40 | onnx.save(model, f=f"{gen_models_path}/{model_base_name}") 41 | print(f"Saving modified onnx file at {gen_models_path}/{model_base_name}") 42 | return model_base_name 43 | 44 | 45 | def onnx_export( 46 | model, 47 | model_args: tuple, 48 | output_path: Path, 49 | ordered_input_names, 50 | output_names, 51 | dynamic_axes, 52 | opset, 53 | use_external_data_format=False, 54 | ): 55 | output_path.parent.mkdir(parents=True, exist_ok=True) 56 | # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11, 57 | # so we check the torch version for backwards compatibility 58 | if is_torch_less_than_1_11: 59 | torch.onnx.export( 60 | model, 61 | model_args, 62 | f=output_path.as_posix(), 63 | input_names=ordered_input_names, 64 | output_names=output_names, 65 | dynamic_axes=dynamic_axes, 66 | do_constant_folding=True, 67 | use_external_data_format=use_external_data_format, 68 | enable_onnx_checker=True, 69 | opset_version=opset, 70 | ) 71 | else: 72 | torch.onnx.export( 73 | model, 74 | model_args, 75 | f=output_path.as_posix(), 76 | input_names=ordered_input_names, 77 | output_names=output_names, 78 | dynamic_axes=dynamic_axes, 79 | do_constant_folding=True, 80 | opset_version=opset, 81 | ) 82 | 83 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/onnx_generation/onnx_gen_utils.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | from packaging import version 6 | import torch 7 | import onnx 8 | from onnx import external_data_helper, numpy_helper 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11") 13 | 14 | 15 | def fix_onnx_fp16( 16 | gen_models_path: str, 17 | model_base_name: str, 18 | ) -> str: 19 | finfo = np.finfo(np.float16) 20 | fp16_max = finfo.max 21 | fp16_min = finfo.min 22 | model = onnx.load(f"{gen_models_path}/{model_base_name}") 23 | fp16_fix = False 24 | for tensor in external_data_helper._get_all_tensors(model): 25 | nptensor = numpy_helper.to_array(tensor, gen_models_path) 26 | if nptensor.dtype == np.float32 and ( 27 | np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min) 28 | ): 29 | # print(f'tensor value : {nptensor} above {fp16_max} or below {fp16_min}') 30 | nptensor = np.clip(nptensor, fp16_min, fp16_max) 31 | new_tensor = numpy_helper.from_array(nptensor, tensor.name) 32 | tensor.CopyFrom(new_tensor) 33 | fp16_fix = True 34 | 35 | 36 | if fp16_fix: 37 | # Save FP16 model 38 | print("Found constants out of FP16 range, clipped to FP16 range") 39 | model_base_name += "_fix_outofrange_fp16" 40 | onnx.save(model, f=f"{gen_models_path}/{model_base_name}") 41 | print(f"Saving modified onnx file at {gen_models_path}/{model_base_name}") 42 | return model_base_name 43 | 44 | 45 | def onnx_export( 46 | model, 47 | model_args: tuple, 48 | output_path: Path, 49 | ordered_input_names, 50 | output_names, 51 | dynamic_axes, 52 | opset, 53 | use_external_data_format=False, 54 | ): 55 | output_path.parent.mkdir(parents=True, exist_ok=True) 56 | # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11, 57 | # so we check the torch version for backwards compatibility 58 | if is_torch_less_than_1_11: 59 | torch.onnx.export( 60 | model, 61 | model_args, 62 | f=output_path.as_posix(), 63 | input_names=ordered_input_names, 64 | output_names=output_names, 65 | dynamic_axes=dynamic_axes, 66 | do_constant_folding=True, 67 | use_external_data_format=use_external_data_format, 68 | enable_onnx_checker=True, 69 | opset_version=opset, 70 | ) 71 | else: 72 | torch.onnx.export( 73 | model, 74 | model_args, 75 | f=output_path.as_posix(), 76 | input_names=ordered_input_names, 77 | output_names=output_names, 78 | dynamic_axes=dynamic_axes, 79 | do_constant_folding=True, 80 | opset_version=opset, 81 | ) 82 | 83 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-xl-base-1.0/onnx_gen_utils.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | from packaging import version 6 | import torch 7 | import onnx 8 | from onnx import external_data_helper, numpy_helper 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11") 13 | 14 | 15 | def fix_onnx_fp16( 16 | gen_models_path: str, 17 | model_base_name: str, 18 | ) -> str: 19 | finfo = np.finfo(np.float16) 20 | fp16_max = finfo.max 21 | fp16_min = finfo.min 22 | model = onnx.load(f"{gen_models_path}/{model_base_name}") 23 | fp16_fix = False 24 | for tensor in external_data_helper._get_all_tensors(model): 25 | nptensor = numpy_helper.to_array(tensor, gen_models_path) 26 | if nptensor.dtype == np.float32 and ( 27 | np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min) 28 | ): 29 | # print(f'tensor value : {nptensor} above {fp16_max} or below {fp16_min}') 30 | nptensor = np.clip(nptensor, fp16_min, fp16_max) 31 | new_tensor = numpy_helper.from_array(nptensor, tensor.name) 32 | tensor.CopyFrom(new_tensor) 33 | fp16_fix = True 34 | 35 | 36 | if fp16_fix: 37 | # Save FP16 model 38 | print("Found constants out of FP16 range, clipped to FP16 range") 39 | model_base_name += "_fix_outofrange_fp16" 40 | onnx.save(model, f=f"{gen_models_path}/{model_base_name}") 41 | print(f"Saving modified onnx file at {gen_models_path}/{model_base_name}") 42 | return model_base_name 43 | 44 | 45 | def onnx_export( 46 | model, 47 | model_args: tuple, 48 | output_path: Path, 49 | ordered_input_names, 50 | output_names, 51 | dynamic_axes, 52 | opset, 53 | use_external_data_format=False, 54 | ): 55 | output_path.parent.mkdir(parents=True, exist_ok=True) 56 | # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11, 57 | # so we check the torch version for backwards compatibility 58 | if is_torch_less_than_1_11: 59 | torch.onnx.export( 60 | model, 61 | model_args, 62 | f=output_path.as_posix(), 63 | input_names=ordered_input_names, 64 | output_names=output_names, 65 | dynamic_axes=dynamic_axes, 66 | do_constant_folding=True, 67 | use_external_data_format=use_external_data_format, 68 | enable_onnx_checker=True, 69 | opset_version=opset, 70 | ) 71 | else: 72 | torch.onnx.export( 73 | model, 74 | model_args, 75 | f=output_path.as_posix(), 76 | input_names=ordered_input_names, 77 | output_names=output_names, 78 | dynamic_axes=dynamic_axes, 79 | do_constant_folding=True, 80 | opset_version=opset, 81 | ) 82 | 83 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | import asyncio 5 | import os 6 | import torch 7 | 8 | from diffusers import AutoPipelineForText2Image 9 | 10 | class QAICStableDiffusion: 11 | def __init__(self, model_id = 'stabilityai/sdxl-turbo', device_id=0): 12 | text_encoder = './qpc/text_encoder_256b_512i_16c_1b/programqpc.bin' 13 | unet = './qpc/unet_256b_512i_16c_1b_2m_1o/programqpc.bin' 14 | text_encoder_2 = './qpc/text_encoder_2_256b_512i_16c_1b/programqpc.bin' 15 | sdxl_vae_decoder = './qpc/vae_decoder_256b_512i_vae_16c_1b_2m_1o/programqpc.bin' 16 | 17 | # check the QPCs 18 | unet_qpc = unet if unet.endswith('programqpc.bin') else os.path.join(unet,'programqpc.bin') 19 | assert os.path.isfile(unet_qpc), f"Could not find binary {unet_qpc = }!" 20 | vae_decoder_sdxl_qpc = sdxl_vae_decoder if sdxl_vae_decoder.endswith('programqpc.bin') else os.path.join(sdxl_vae_decoder,'programqpc.bin') 21 | assert os.path.isfile(vae_decoder_sdxl_qpc), f"Could not find binary {vae_decoder_sdxl_qpc = }!" 22 | text_encoder_qpc = text_encoder if text_encoder.endswith('programqpc.bin') else os.path.join(text_encoder,'programqpc.bin') 23 | assert os.path.isfile(text_encoder_qpc), f"Could not find binary {text_encoder_qpc = }!" 24 | text_encoder_2_qpc = text_encoder_2 if text_encoder_2.endswith('programqpc.bin') else os.path.join(text_encoder_2,'programqpc.bin') 25 | assert os.path.isfile(text_encoder_2_qpc), f"Could not find binary {text_encoder_2_qpc = }!" 26 | 27 | self.num_steps = 1 28 | self.vae_type = "vae" 29 | 30 | # load the latents 31 | self.latents = None 32 | 33 | # load the model pipeline 34 | self.pipe = AutoPipelineForText2Image.from_pretrained(model_id, torch_dtype=torch.float16, variant="fp16", 35 | device_id=device_id, 36 | unet_qpc=unet_qpc, 37 | vae_decoder_qpc=vae_decoder_sdxl_qpc, 38 | text_encoder_qpc=text_encoder_qpc, 39 | text_encoder_2_qpc=text_encoder_2_qpc) 40 | 41 | async def generate(self, prompt, n, image_size): 42 | height, width = image_size[0], image_size[1] 43 | images = self.pipe(prompt=prompt, 44 | num_inference_steps=self.num_steps, 45 | height=height, 46 | width=width, 47 | latents=self.latents, 48 | vae_type=self.vae_type, 49 | guidance_scale=0.0).images 50 | 51 | yield images[0] 52 | 53 | async def main(): 54 | model = QAICStableDiffusion() 55 | prompt = 'photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece' 56 | idx = 0 57 | async for image in model.generate(prompt, 1, (512, 512)): 58 | image.save('generated_image_{}.png'.format(idx)) 59 | idx += 1 60 | 61 | if __name__ == "__main__": 62 | asyncio.run(main()) 63 | 64 | -------------------------------------------------------------------------------- /samples/python/vit_qaic/example.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
3 | SPDX-License-Identifier: BSD-3-Clause-Clear 4 | ''' 5 | 6 | import sys 7 | sys.path.append("/opt/qti-aic/examples/apps/qaic-python-sdk") 8 | import qaic 9 | import numpy as np 10 | import torchvision 11 | import torch 12 | import pandas as pd 13 | import os 14 | sys.path.insert(1, os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir)) 15 | from common_utils import generate_bin 16 | from transformers import ViTImageProcessor, ViTForImageClassification 17 | from PIL import Image 18 | import requests 19 | import onnx 20 | from onnxsim import simplify 21 | 22 | image_size = 224 23 | 24 | model_name = f'vit-base-patch16-{image_size}' 25 | 26 | # Import the model 27 | model = ViTForImageClassification.from_pretrained(f'google/{model_name}') 28 | onnx_filename = f'{model_name}.onnx' 29 | 30 | # Export the PyTorch model to ONNX 31 | dummy_input = torch.randn(1, 3, image_size, image_size).type(torch.FloatTensor) 32 | torch.onnx.export(model, # PyTorch model 33 | dummy_input, # Input tensor 34 | onnx_filename, # Output file 35 | export_params=True, # Export the model parameters 36 | opset_version=11, # ONNX opset version 37 | do_constant_folding=True, # Fold constant values for optimization 38 | input_names=['image'], # Input tensor names 39 | output_names=['output'], # Output tensor names 40 | dynamic_axes={'image': {0: 'batch_size'}, # Dynamic axes 41 | 'output': {0: 'batch_size'}}) 42 | 43 | # apply onnxsim (optional) 44 | onnx_model = onnx.load(onnx_filename) 45 | onnx_model_simp, check = simplify(onnx_model) 46 | onnx.save(onnx_model_simp, onnx_filename) 47 | print("ONNX model saved at: ", onnx_filename) 48 | 49 | # Generate binary for QAIC by default the binary using a helper library. 50 | qpcPath = generate_bin(onnx_filename = onnx_filename , yaml_filename ='./vit_config.yaml') # return path to the folder containing compiled binary. 51 | 52 | # Compile and load the model 53 | vit_sess = qaic.Session(model_path= qpcPath+'/programqpc.bin', options_path='./vit_config.yaml') 54 | vit_sess.setup() 55 | input_shape, input_type = vit_sess.model_input_shape_dict['image'] 56 | output_shape, output_type = vit_sess.model_output_shape_dict['output'] 57 | 58 | processor = ViTImageProcessor.from_pretrained(f'google/{model_name}') 59 | 60 | # input sample 61 | url = 'http://images.cocodataset.org/val2017/000000039769.jpg' 62 | image = Image.open(requests.get(url, stream=True).raw) 63 | inputs = processor(images=image, return_tensors="pt") 64 | 65 | device = True 66 | if device: 67 | print("INFO: running inference on Qualcomm Cloud AI 100") 68 | input_data = inputs['pixel_values'].numpy().astype(input_type) 69 | input_dict = {'image': input_data} 70 | output = vit_sess.run(input_dict) 71 | logits = np.frombuffer(output['output'], dtype=output_type).reshape(output_shape) # dtype to be modified based on given model 72 | else: 73 | print("INFO: running inference on CPU") 74 | outputs = model(**inputs) 75 | logits = outputs.logits 76 | 77 | predicted_class_idx = logits.argmax(-1).item() 78 | print("Predicted class:", model.config.id2label[predicted_class_idx]) -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | from contextlib import asynccontextmanager 5 | from fastapi import FastAPI, HTTPException, Request 6 | from typing import Optional 7 | from pydantic import BaseModel 8 | import time 9 | import base64 10 | import time 11 | 12 | from io import BytesIO 13 | 14 | from model import QAICStableDiffusion 15 | 16 | @asynccontextmanager 17 | async def lifespan(app: FastAPI): 18 | # Code to run before the application starts 19 | print("Application startup") 20 | 21 | app.model = QAICStableDiffusion(device_id=args.device) 22 | 23 | yield 24 | # Code to run when the application shuts down 25 | print("Application shutdown") 26 | 27 | app = FastAPI(lifespan=lifespan) 28 | 29 | class ImageRequest(BaseModel): 30 | model: str 31 | prompt: str 32 | n: Optional[int] = 1 33 | size: Optional[str] = '512x512' 34 | response_format: Optional[str] = 'b64_json' 35 | 36 | @app.get("/v1/models") 37 | async def get_models(): 38 | try: 39 | response = { 40 | "object": "list", 41 | "data": [ 42 | { 43 | "id": "sdxl-turbo", 44 | "object": "model", 45 | "created": 1746296172, 46 | "owned_by": "system" 47 | } 48 | ], 49 | } 50 | 51 | return {"response": response} 52 | except Exception as e: 53 | raise HTTPException(status_code=500, detail=str(e)) 54 | 55 | @app.post("/v1/images/generations") 56 | async def generate_images(image_request: ImageRequest): 57 | print(image_request) 58 | utc_seconds = time.time() 59 | 60 | size = [int(dim) for dim in image_request.size.split('x')] 61 | 62 | try: 63 | async for image in app.model.generate(image_request.prompt, 64 | image_request.n, 65 | size): 66 | buffered = BytesIO() 67 | image.save(buffered, format='PNG') 68 | b64_json = base64.b64encode(buffered.getvalue()).decode() 69 | 70 | response = { 71 | "created": int(utc_seconds), 72 | "data": [ 73 | { 74 | "b64_json": b64_json 75 | } 76 | ] 77 | } 78 | 79 | return response 80 | 81 | except Exception as e: 82 | raise HTTPException(status_code=500, detail=str(e)) 83 | 84 | if __name__ == "__main__": 85 | import uvicorn 86 | import argparse 87 | 88 | parser = argparse.ArgumentParser(description="SDXL-Turbo REST endpoint") 89 | 90 | parser.add_argument( 91 | "--host", 92 | type=str, 93 | help="IP address", 94 | default="0.0.0.0" 95 | ) 96 | 97 | parser.add_argument( 98 | "--port", 99 | type=int, 100 | help="Port", 101 | default=8000 102 | ) 103 | 104 | parser.add_argument( 105 | "--device", 106 | type=int, 107 | help="Cloud AI device", 108 | default=0 109 | ) 110 | 111 | args = parser.parse_args() 112 | 113 | uvicorn.run(app, host=args.host, port=args.port) 114 | 115 | -------------------------------------------------------------------------------- /CODE-OF-CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team. All complaints will be reviewed 59 | and investigated and will result in a response that is deemed necessary and 60 | appropriate to the circumstances. The project team is obligated to maintain 61 | confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-xl-base-1.0/attention_patch.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py 2 | index 21eb3a32..1df1b09c 100644 3 | --- a/src/diffusers/models/attention_processor.py 4 | +++ b/src/diffusers/models/attention_processor.py 5 | @@ -200,10 +200,8 @@ class Attention(nn.Module): 6 | # We use the AttnProcessor2_0 by default when torch 2.x is used which uses 7 | # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention 8 | # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1 9 | - if processor is None: 10 | - processor = ( 11 | - AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor() 12 | - ) 13 | + # force to not use FlashAttention 14 | + processor = AttnProcessor() 15 | self.set_processor(processor) 16 | 17 | def set_use_memory_efficient_attention_xformers( 18 | @@ -588,7 +586,9 @@ class Attention(nn.Module): 19 | 20 | if attention_mask is None: 21 | baddbmm_input = torch.empty( 22 | - query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device 23 | + query.shape[0], query.shape[1], 24 | + key.shape[2], # key is already transposed 25 | + dtype=query.dtype, device=query.device 26 | ) 27 | beta = 0 28 | else: 29 | @@ -598,7 +598,7 @@ class Attention(nn.Module): 30 | attention_scores = torch.baddbmm( 31 | baddbmm_input, 32 | query, 33 | - key.transpose(-1, -2), 34 | + key, # key is already transposed 35 | beta=beta, 36 | alpha=self.scale, 37 | ) 38 | @@ -740,8 +740,26 @@ class AttnProcessor: 39 | key = attn.head_to_batch_dim(key) 40 | value = attn.head_to_batch_dim(value) 41 | 42 | - attention_probs = attn.get_attention_scores(query, key, attention_mask) 43 | - hidden_states = torch.bmm(attention_probs, value) 44 | + # pre-transpose the key 45 | + key = key.transpose(-1, -2) 46 | + if query.size(-2) != value.size(-2): # cross-attention, use regular attention 47 | + # QKV done in single block 48 | + attention_probs = attn.get_attention_scores(query, key, attention_mask) 49 | + hidden_states = torch.bmm(attention_probs, value) 50 | + else: # self-attention, use blocked attention 51 | + # QKV done with block-attention (a la FlashAttentionV2) 52 | + print(f"{query.shape = }, {key.shape = }, {value.shape = }") 53 | + query_block_size = 128 54 | + query_seq_len = query.size(-2) 55 | + num_blocks = (query_seq_len + query_block_size - 1) // query_block_size 56 | + for qidx in range(num_blocks): 57 | + query_block = query[:,qidx*query_block_size:(qidx+1)*query_block_size,:] 58 | + attention_probs = attn.get_attention_scores(query_block, key, attention_mask) 59 | + hidden_states_block = torch.bmm(attention_probs, value) 60 | + if qidx == 0: 61 | + hidden_states = hidden_states_block 62 | + else: 63 | + hidden_states = torch.cat((hidden_states, hidden_states_block), -2) 64 | hidden_states = attn.batch_to_head_dim(hidden_states) 65 | 66 | # linear proj 67 | 
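
The patch above replaces the single large self-attention matmul with per-query-block matmuls. As a standalone illustration (not part of the repository), here is a minimal PyTorch sketch of the same query-blocking scheme; `blocked_attention` and `block_size` are illustrative names, with `block_size=128` chosen to match the patch's `query_block_size = 128`. It produces the same result as unblocked attention because each query block still attends over the full key/value length.

```python
import torch

def blocked_attention(query, key_t, value, scale, block_size=128):
    """Attention with the query split into blocks, as in the patch above.

    query, value: (batch, seq, dim); key_t: (batch, dim, seq), i.e. the key pre-transposed.
    """
    num_blocks = (query.size(-2) + block_size - 1) // block_size
    outputs = []
    for i in range(num_blocks):
        q_blk = query[:, i * block_size:(i + 1) * block_size, :]
        probs = torch.softmax(torch.bmm(q_blk, key_t) * scale, dim=-1)
        outputs.append(torch.bmm(probs, value))
    return torch.cat(outputs, dim=-2)

# Sanity check: query blocking is mathematically identical to regular attention,
# since each query row's softmax still spans the full key length.
q, k, v = (torch.randn(2, 512, 64) for _ in range(3))
scale = 64 ** -0.5
ref = torch.bmm(torch.softmax(torch.bmm(q, k.transpose(-1, -2)) * scale, dim=-1), v)
out = blocked_attention(q, k.transpose(-1, -2), v, scale)
assert torch.allclose(ref, out, atol=1e-5)
```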
-------------------------------------------------------------------------------- /models/speech/whisper/runModel.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | 6 | import os 7 | from datasets import load_dataset 8 | from transformers import WhisperProcessor 9 | import whisper 10 | import numpy as np 11 | import torch 12 | from audio import AudioSample 13 | import qaic 14 | 15 | model_name = 'base' 16 | aic_path = './whisper_AIC' 17 | 18 | # Select an audio file and read it: 19 | audio_sample = AudioSample() 20 | audio_path = audio_sample.to_file() 21 | audio = whisper.load_audio(audio_path) # Read audio from file 22 | audio_pad = whisper.pad_or_trim(audio) # Padding and trimming 23 | # make log-Mel spectrogram and move to the same device as the model 24 | input_features = whisper.log_mel_spectrogram(audio_pad) # convert to mel spectrogram 25 | 26 | # Load the Whisper processor for parsing results 27 | processor = WhisperProcessor.from_pretrained('openai/whisper-{}'.format(model_name)) 28 | 29 | eot = 50257 # end of transcript token 30 | startoftranscript = 50258 # start of transcript token 31 | 32 | decoder_sequence_length=150 33 | 34 | def run_AIC(input_features, device_id=0): 35 | # Load both encoder and decoder models into Cloud AI accelerator memory 36 | # via oversubscription. 37 | # The number of NSP cores required is the maximum of the numbers of cores 38 | # for which encoder and decoder are compiled. 39 | # If encoder is compiled for 4 cores and decoder is compiled for 12 cores, 40 | # then the max usage is 12 cores. 41 | # Since encoder and decoder don't run at the same time, this allows us to 42 | # efficiently utilize the available cores. 
43 | 44 | encoder_sess = qaic.Session( 45 | model_path=os.path.join(aic_path, 'whisper-encoder', 'programqpc.bin'), 46 | num_activations=1, 47 | set_size=1, 48 | dev_id=device_id, 49 | oversubscription_name='group1') 50 | 51 | decoder_sess = qaic.Session( 52 | model_path=os.path.join(aic_path, 'whisper-decoder', 'programqpc.bin'), 53 | num_activations=1, 54 | set_size=1, 55 | dev_id=device_id, 56 | oversubscription_name='group1') 57 | 58 | encoder_inputs = { 59 | 'input_features': input_features.numpy().astype(np.float32).reshape(1,80,3000) 60 | } 61 | 62 | audio_features = encoder_sess.run(encoder_inputs)['last_hidden_state'] 63 | 64 | next_token = None 65 | tokens = [startoftranscript] 66 | decoder_input_ids = np.zeros((1, decoder_sequence_length), dtype=np.int64) 67 | decoder_input_ids[:,0] = startoftranscript 68 | 69 | for iter in range(decoder_sequence_length): 70 | if iter > 0: 71 | decoder_input_ids[:,iter] = next_token.item() 72 | 73 | decoder_inputs = { 74 | 'input_ids': decoder_input_ids, 75 | 'encoder_hidden_states': audio_features, 76 | } 77 | 78 | logits = decoder_sess.run(decoder_inputs)['logits'] 79 | logits = logits[:,iter,:] 80 | 81 | next_token = logits.argmax(axis=-1) 82 | tokens.append(next_token.item()) 83 | 84 | if next_token == eot: # stop at end-of-transcript token 85 | break 86 | 87 | transcription = processor.batch_decode(tokens, skip_special_tokens=False) 88 | print("result:", transcription) 89 | 90 | if __name__ == '__main__': 91 | run_AIC(input_features) 92 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/run_config_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #################################################################################################### 4 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
5 | # SPDX-License-Identifier: BSD-3-Clause-Clear 6 | #################################################################################################### 7 | 8 | # model configs 9 | MODEL_PATH="stabilityai/stable-diffusion-3.5-medium" 10 | PROMPT="\"$1\"" 11 | NEG_PROMPT="\"$2\"" 12 | GUIDANCE=7.0 13 | VAE_TYPE="vae" 14 | IMAGE_SIZE=1024 15 | BLOCK_SIZE=64 16 | BATCH_SIZE=1 17 | 18 | # onnx configs 19 | GENERATE_ONNX=false 20 | ONNX_TEXT_ENCODER=false 21 | ONNX_TRANSFORMER=false 22 | ONNX_VAE=false 23 | 24 | # compile configs 25 | NUM_CORES=16 26 | VAE_MOS=2 27 | VAE_OLS=1 28 | TRANSFORMER_MOS=1 29 | TRANSFORMER_OLS=2 30 | COMPILE_TEXT_ENCODER=false 31 | COMPILE_TRANSFORMER=false 32 | COMPILE_VAE=false 33 | 34 | # inference configs 35 | RUN_ONLY=true 36 | TEXT_ENCODER_3=false 37 | DEVICE=0 38 | DEVICE2=1 39 | NUM_STEPS=28 40 | WARMUP_ITERS=3 41 | REPEAT_ITERS=3 42 | 43 | # mode 44 | TOGETHER=false 45 | 46 | if [ ${GENERATE_ONNX} == true ] 47 | then 48 | GENERATE_ONNX_CMD="--generate-onnx" 49 | else 50 | GENERATE_ONNX_CMD="" 51 | fi 52 | 53 | if [ ${ONNX_TEXT_ENCODER} == true ] 54 | then 55 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder" 56 | else 57 | ONNX_TEXT_ENCODER_CMD="" 58 | fi 59 | 60 | if [ ${ONNX_TRANSFORMER} == true ] 61 | then 62 | ONNX_TRANSFORMER_CMD="--onnx-transformer" 63 | else 64 | ONNX_TRANSFORMER_CMD="" 65 | fi 66 | 67 | if [ ${ONNX_VAE} == true ] 68 | then 69 | ONNX_VAE_CMD="--onnx-vae" 70 | else 71 | ONNX_VAE_CMD="" 72 | fi 73 | 74 | if [ ${COMPILE_TEXT_ENCODER} == true ] 75 | then 76 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder" 77 | else 78 | COMPILE_TEXT_ENCODER_CMD="" 79 | fi 80 | 81 | if [ ${COMPILE_TRANSFORMER} == true ] 82 | then 83 | COMPILE_TRANSFORMER_CMD="--compile-transformer" 84 | else 85 | COMPILE_TRANSFORMER_CMD="" 86 | fi 87 | 88 | if [ ${COMPILE_VAE} == true ] 89 | then 90 | COMPILE_VAE_CMD="--compile-vae" 91 | else 92 | COMPILE_VAE_CMD="" 93 | fi 94 | 95 | if [ ${RUN_ONLY} == true ] 96 | then 97 | RUN_ONLY_CMD="--run-only" 98 | else 99 | RUN_ONLY_CMD="" 100 | fi 101 | 102 | if [ ${TEXT_ENCODER_3} == true ] 103 | then 104 | TEXT_ENCODER_3_CMD="--text-encoder-3" 105 | else 106 | TEXT_ENCODER_3_CMD="" 107 | fi 108 | 109 | if [ ${TOGETHER} == true ] 110 | then 111 | TOGETHER_CMD="--together" 112 | else 113 | TOGETHER_CMD="" 114 | fi 115 | 116 | export HF_HOME="cache" 117 | 118 | rm run.sh 119 | 120 | scripts="python main.py \ 121 | --model-path $MODEL_PATH \ 122 | --prompt $PROMPT \ 123 | --neg_prompt $NEG_PROMPT \ 124 | --guidance $GUIDANCE \ 125 | --vae-type $VAE_TYPE \ 126 | --batch-size $BATCH_SIZE \ 127 | --image-size $IMAGE_SIZE \ 128 | --block-size $BLOCK_SIZE \ 129 | --num-cores $NUM_CORES \ 130 | --vae-mos $VAE_MOS \ 131 | --vae-ols $VAE_OLS \ 132 | --transformer-mos $TRANSFORMER_MOS \ 133 | --transformer-ols $TRANSFORMER_OLS \ 134 | --device-id $DEVICE \ 135 | --device-id2 $DEVICE2 \ 136 | --num-steps $NUM_STEPS \ 137 | --num-warmup-iters $WARMUP_ITERS \ 138 | --num-repeat-iters $REPEAT_ITERS \ 139 | $ONNX_TEXT_ENCODER_CMD \ 140 | $ONNX_TRANSFORMER_CMD \ 141 | $ONNX_VAE_CMD \ 142 | $COMPILE_TEXT_ENCODER_CMD \ 143 | $COMPILE_TRANSFORMER_CMD \ 144 | $COMPILE_VAE_CMD \ 145 | $GENERATE_ONNX_CMD \ 146 | $RUN_ONLY_CMD \ 147 | $TEXT_ENCODER_3_CMD \ 148 | $TOGETHER_CMD" 149 | 150 | echo $scripts >> run.sh 151 | 152 | bash run.sh 153 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/README.md: 
-------------------------------------------------------------------------------- 1 | ### Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 2 | ### SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | # Instructions to run SDXL on Cloud AI 100 with DeepCache 5 | 6 | The instructions below are to run the [Stable Diffusion XL model](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) with [DeepCache](https://github.com/horseee/DeepCache) on Cloud AI 100. 7 | 8 | 9 | ## Pre-requisites 10 | 11 | Install the moreutils package for the `ts` timestamp tool: 12 | ``` 13 | sudo apt update 14 | sudo apt-get install moreutils 15 | ``` 16 | 17 | Install Git Large File System (LFS) support 18 | 19 | ``` 20 | sudo apt update 21 | sudo apt-get install git-lfs 22 | ``` 23 | 24 | ## 1. Generate onnx files and compile for binaries 25 | 26 | 1. Set up a virtual environment for ONNX generation and compilation 27 | ``` 28 | python3.10 -m venv env_onnx 29 | source ./env_onnx/bin/activate 30 | pip install -r requirements.txt 31 | ``` 32 | 33 | 2. Setup environments 34 | ``` 35 | mkdir cache 36 | mkdir qpc 37 | mkdir compile_logs 38 | ``` 39 | 40 | 3. Install diffusers from source after patching for ONNX file generation 41 | ``` 42 | git clone --depth 1 --branch v0.24.0 https://github.com/huggingface/diffusers.git diffusers_onnx 43 | cd diffusers_onnx 44 | git apply --reject --whitespace=fix ../patches/attention_patch.patch 45 | pip install . 46 | cd .. 47 | ``` 48 | 49 | 4. Install DeepCache for ONNX file generation (deep UNet) 50 | ``` 51 | git clone https://github.com/horseee/DeepCache.git 52 | cd DeepCache 53 | git apply --reject --whitespace=fix ../patches/deepcache_unet.patch 54 | pip install . 55 | cd .. 56 | ``` 57 | 58 | 5. Prepare VAE Decoder 59 | ``` 60 | export GIT_LFS_SKIP_SMUDGE=1 61 | git clone https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0 cache/stabilityai/stable-diffusion-xl-base-1.0 62 | cd cache/stabilityai/stable-diffusion-xl-base-1.0 63 | git lfs pull -I vae_decoder/model.onnx 64 | rm -rf .git/lfs # optional to save space 65 | cd ../../../ 66 | ``` 67 | 68 | 6. Generate ONNX files and compile for binaries 69 | ``` 70 | touch run.sh 71 | bash run_config_deep.sh 72 | ``` 73 | 74 | 7. Modify the UNet to be the shallow version 75 | ``` 76 | sed -i '963s/False/True/' env_onnx/lib/python3.10/site-packages/DeepCache/sdxl/unet_2d_condition.py 77 | ``` 78 | 79 | 8. Generate ONNX file and compile shallow UNet for DeepCache 80 | ``` 81 | bash run_config_shallow.sh 82 | ``` 83 | 84 | ## 2. Run the end-to-end SDXL inference 85 | 86 | 1. Set up a separate virtual environment for running SDXL 87 | ``` 88 | python3.10 -m venv env_pipeline 89 | source ./env_pipeline/bin/activate 90 | pip install -r requirements.txt 91 | pip install --force-reinstall /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl 92 | ``` 93 | 94 | 2. Re-install diffusers and DeepCache from source after patching the SDXL pipeline for inference 95 | ``` 96 | git clone --depth 1 --branch v0.24.0 https://github.com/huggingface/diffusers.git diffusers_pipeline 97 | cd diffusers_pipeline 98 | pip install . 99 | cd .. 100 | ``` 101 | 102 | 3. Install DeepCache and prepare the pipeline for inference 103 | ``` 104 | git clone https://github.com/horseee/DeepCache.git deepcache_pipeline 105 | cd deepcache_pipeline 106 | git apply --reject --whitespace=fix ../patches/deepcache_pipeline.patch 107 | pip install . 108 | cd .. 109 | ``` 110 | 111 | 4. 
Run the SDXL inference with 'sudo' flag if needed to access the AI100 devices. 112 | ``` 113 | sudo bash run_config_inference.sh $(which python3) 114 | ``` 115 | Note: ```CACHE_INTERVAL``` variable in ```run_config_inference.sh``` refers to the period of caching 116 | 117 | -------------------------------------------------------------------------------- /samples/python/aws_ai100_benchmarking/cv_classifiers/run_cv_classifiers.sh: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 | # @@-COPYRIGHT-START-@@ 3 | # 4 | # Copyright (c) 2023, Qualcomm Technologies, Inc. All Rights Reserved. 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are met: 7 | # 8 | # 1. Redistributions of source code must retain the above copyright notice, 9 | # this list of conditions and the following disclaimer. 10 | # 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # 3. Neither the name of the copyright holder nor the names of its contributors 16 | # may be used to endorse or promote products derived from this software 17 | # without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 23 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | # POSSIBILITY OF SUCH DAMAGE. 
30 | # 31 | # SPDX-License-Identifier: BSD-3-Clause 32 | # 33 | # @@-COPYRIGHT-END-@@ 34 | ############################################################################## 35 | 36 | #!/bin/bash 37 | 38 | mkdir -p ./resnet-152/ 39 | echo python run_cv_classifier.py -m resnet-152 -o best-latency "$@" 40 | python run_cv_classifier.py -m resnet-152 -o best-latency "$@" | tee -a ./resnet-152/best-latency.log 41 | echo python run_cv_classifier.py -m resnet-152 -o balanced "$@" 42 | python run_cv_classifier.py -m resnet-152 -o balanced "$@" | tee -a ./resnet-152/balanced.log 43 | echo python run_cv_classifier.py -m resnet-152 -o best-throughput "$@" 44 | python run_cv_classifier.py -m resnet-152 -o best-throughput "$@" | tee -a ./resnet-152/best-throughput.log 45 | 46 | mkdir -p ./resnet-50/ 47 | echo python run_cv_classifier.py -m resnet-50 -o best-latency "$@" 48 | python run_cv_classifier.py -m resnet-50 -o best-latency "$@" | tee -a ./resnet-50/best-latency.log 49 | echo python run_cv_classifier.py -m resnet-50 -o balanced "$@" 50 | python run_cv_classifier.py -m resnet-50 -o balanced "$@" | tee -a ./resnet-50/balanced.log 51 | echo python run_cv_classifier.py -m resnet-50 -o best-throughput "$@" 52 | python run_cv_classifier.py -m resnet-50 -o best-throughput "$@" | tee -a ./resnet-50/best-throughput.log 53 | 54 | mkdir -p ./vit-base-patch16-224/ 55 | echo python run_cv_classifier.py -m vit-base-patch16-224 -o best-latency "$@" 56 | python run_cv_classifier.py -m vit-base-patch16-224 -o best-latency "$@" | tee -a ./vit-base-patch16-224/best-latency.log 57 | echo python run_cv_classifier.py -m vit-base-patch16-224 -o balanced "$@" 58 | python run_cv_classifier.py -m vit-base-patch16-224 -o balanced "$@" | tee -a ./vit-base-patch16-224/balanced.log 59 | echo python run_cv_classifier.py -m vit-base-patch16-224 -o best-throughput "$@" 60 | python run_cv_classifier.py -m vit-base-patch16-224 -o best-throughput "$@" | tee -a ./vit-base-patch16-224/best-throughput.log 61 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/run_config_gen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #################################################################################################### 4 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
5 | # SPDX-License-Identifier: BSD-3-Clause-Clear 6 | #################################################################################################### 7 | 8 | # model configs 9 | MODEL_PATH="stabilityai/stable-diffusion-3.5-medium" 10 | VAE_TYPE="vae" 11 | IMAGE_SIZE=1024 12 | BLOCK_SIZE=64 13 | BATCH_SIZE=1 14 | 15 | # onnx configs 16 | GENERATE_ONNX=true 17 | ONNX_TEXT_ENCODER=true 18 | ONNX_TEXT_ENCODER_3=true 19 | ONNX_TRANSFORMER=true 20 | ONNX_VAE=true 21 | 22 | # compile configs 23 | NUM_CORES=16 24 | VAE_MOS=2 25 | VAE_OLS=1 26 | TRANSFORMER_MOS=1 27 | TRANSFORMER_OLS=2 28 | COMPILE_TEXT_ENCODER=true 29 | COMPILE_TEXT_ENCODER_3=false 30 | COMPILE_TRANSFORMER=true 31 | COMPILE_VAE=true 32 | 33 | # inference configs 34 | RUN_ONLY=false 35 | DEVICE=0 36 | DEVICE2=1 37 | NUM_STEPS=1 38 | WARMUP_ITERS=3 39 | REPEAT_ITERS=1 40 | 41 | # mode 42 | TOGETHER=false 43 | 44 | if [ ${GENERATE_ONNX} == true ] 45 | then 46 | GENERATE_ONNX_CMD="--generate-onnx" 47 | else 48 | GENERATE_ONNX_CMD="" 49 | fi 50 | 51 | if [ ${ONNX_TEXT_ENCODER} == true ] 52 | then 53 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder" 54 | else 55 | ONNX_TEXT_ENCODER_CMD="" 56 | fi 57 | 58 | if [ ${ONNX_TEXT_ENCODER_3} == true ] 59 | then 60 | ONNX_TEXT_ENCODER_3_CMD="--onnx-text-encoder-3" 61 | else 62 | ONNX_TEXT_ENCODER_3_CMD="" 63 | fi 64 | 65 | if [ ${ONNX_TRANSFORMER} == true ] 66 | then 67 | ONNX_TRANSFORMER_CMD="--onnx-transformer" 68 | else 69 | ONNX_TRANSFORMER_CMD="" 70 | fi 71 | 72 | if [ ${ONNX_VAE} == true ] 73 | then 74 | ONNX_VAE_CMD="--onnx-vae" 75 | else 76 | ONNX_VAE_CMD="" 77 | fi 78 | 79 | if [ ${COMPILE_TEXT_ENCODER} == true ] 80 | then 81 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder" 82 | else 83 | COMPILE_TEXT_ENCODER_CMD="" 84 | fi 85 | 86 | if [ ${COMPILE_TEXT_ENCODER_3} == true ] 87 | then 88 | COMPILE_TEXT_ENCODER_3_CMD="--compile-text-encoder-3" 89 | else 90 | COMPILE_TEXT_ENCODER_3_CMD="" 91 | fi 92 | 93 | if [ ${COMPILE_TRANSFORMER} == true ] 94 | then 95 | COMPILE_TRANSFORMER_CMD="--compile-transformer" 96 | else 97 | COMPILE_TRANSFORMER_CMD="" 98 | fi 99 | 100 | if [ ${COMPILE_VAE} == true ] 101 | then 102 | COMPILE_VAE_CMD="--compile-vae" 103 | else 104 | COMPILE_VAE_CMD="" 105 | fi 106 | 107 | if [ ${RUN_ONLY} == true ] 108 | then 109 | RUN_ONLY_CMD="--run-only" 110 | else 111 | RUN_ONLY_CMD="" 112 | fi 113 | 114 | if [ ${TOGETHER} == true ] 115 | then 116 | TOGETHER_CMD="--together" 117 | else 118 | TOGETHER_CMD="" 119 | fi 120 | 121 | export HF_HOME="cache" 122 | 123 | rm run.sh 124 | 125 | scripts="python main.py \ 126 | --model-path $MODEL_PATH \ 127 | --vae-type $VAE_TYPE \ 128 | --batch-size $BATCH_SIZE \ 129 | --image-size $IMAGE_SIZE \ 130 | --block-size $BLOCK_SIZE \ 131 | --num-cores $NUM_CORES \ 132 | --vae-mos $VAE_MOS \ 133 | --vae-ols $VAE_OLS \ 134 | --transformer-mos $TRANSFORMER_MOS \ 135 | --transformer-ols $TRANSFORMER_OLS \ 136 | --device-id $DEVICE \ 137 | --device-id2 $DEVICE2 \ 138 | --num-steps $NUM_STEPS \ 139 | --num-warmup-iters $WARMUP_ITERS \ 140 | --num-repeat-iters $REPEAT_ITERS \ 141 | $ONNX_TEXT_ENCODER_CMD \ 142 | $ONNX_TEXT_ENCODER_3_CMD \ 143 | $ONNX_TRANSFORMER_CMD \ 144 | $ONNX_VAE_CMD \ 145 | $COMPILE_TEXT_ENCODER_CMD \ 146 | $COMPILE_TEXT_ENCODER_3_CMD \ 147 | $COMPILE_TRANSFORMER_CMD \ 148 | $COMPILE_VAE_CMD \ 149 | $GENERATE_ONNX_CMD \ 150 | $RUN_ONLY_CMD \ 151 | $TOGETHER_CMD" 152 | 153 | echo $scripts >> run.sh 154 | 155 | bash run.sh 156 | -------------------------------------------------------------------------------- 
/models/multimodal/text_to_image/sdxl_deepcache/run_config_deep.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #################################################################################################### 4 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 5 | # SPDX-License-Identifier: BSD-3-Clause-Clear 6 | #################################################################################################### 7 | 8 | # model configs 9 | MODEL_PATH="stabilityai/stable-diffusion-xl-base-1.0" 10 | PROMPT="\"A cinematic shot of a baby racoon wearing an intricate italian priest robe.\"" 11 | VAE_TYPE="vae" 12 | UNET_TYPE="deep" 13 | IMAGE_SIZE=1024 14 | BLOCK_SIZE_DEEP=256 15 | BLOCK_SIZE_SHALLOW=128 16 | BATCH_SIZE=1 17 | PRECISION=fp16,fp16,fp16,fp16 18 | 19 | # onnx configs 20 | GENERATE_ONNX=true 21 | ONNX_TEXT_ENCODER=true 22 | ONNX_UNET=true 23 | ONNX_VAE=true 24 | 25 | # compile configs 26 | NUM_CORES=16 27 | VAE_MOS=2 28 | VAE_OLS=1 29 | UNET_MOS_DEEP=2 30 | UNET_OLS_DEEP=1 31 | UNET_MOS_SHALLOW=1 32 | UNET_OLS_SHALLOW=2 33 | COMPILE_TEXT_ENCODER=true 34 | COMPILE_UNET=true 35 | COMPILE_VAE=true 36 | 37 | # inference configs 38 | RUN_ONLY=false 39 | DEVICE=0 40 | DEVICE_2=1 41 | NUM_STEPS=20 42 | WARMUP_ITERS=3 43 | REPEAT_ITERS=3 44 | 45 | # mode 46 | TOGETHER=false 47 | 48 | if [ ${GENERATE_ONNX} == true ] 49 | then 50 | GENERATE_ONNX_CMD="--generate-onnx" 51 | else 52 | GENERATE_ONNX_CMD="" 53 | fi 54 | 55 | if [ ${ONNX_TEXT_ENCODER} == true ] 56 | then 57 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder" 58 | else 59 | ONNX_TEXT_ENCODER_CMD="" 60 | fi 61 | 62 | if [ ${ONNX_UNET} == true ] 63 | then 64 | ONNX_UNET_CMD="--onnx-unet" 65 | else 66 | ONNX_UNET_CMD="" 67 | fi 68 | 69 | if [ ${ONNX_VAE} == true ] 70 | then 71 | ONNX_VAE_CMD="--onnx-vae" 72 | else 73 | ONNX_VAE_CMD="" 74 | fi 75 | 76 | if [ ${COMPILE_TEXT_ENCODER} == true ] 77 | then 78 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder" 79 | else 80 | COMPILE_TEXT_ENCODER_CMD="" 81 | fi 82 | 83 | if [ ${COMPILE_UNET} == true ] 84 | then 85 | COMPILE_UNET_CMD="--compile-unet" 86 | else 87 | COMPILE_UNET_CMD="" 88 | fi 89 | 90 | if [ ${COMPILE_VAE} == true ] 91 | then 92 | COMPILE_VAE_CMD="--compile-vae" 93 | else 94 | COMPILE_VAE_CMD="" 95 | fi 96 | 97 | if [ ${RUN_ONLY} == true ] 98 | then 99 | RUN_ONLY_CMD="--run-only" 100 | else 101 | RUN_ONLY_CMD="" 102 | fi 103 | 104 | if [ ${TOGETHER} == true ] 105 | then 106 | TOGETHER_CMD="--together" 107 | else 108 | TOGETHER_CMD="" 109 | fi 110 | 111 | export HF_HOME="cache" 112 | sed -i 's/query_block_size = 128/query_block_size = 256/g' ./env_onnx/lib/python3.10/site-packages/diffusers/models/attention_processor.py 113 | 114 | rm run.sh 115 | 116 | scripts="python main.py \ 117 | --model-path $MODEL_PATH \ 118 | --prompt $PROMPT \ 119 | --unet-type $UNET_TYPE \ 120 | --vae-type $VAE_TYPE \ 121 | --batch-size $BATCH_SIZE \ 122 | --image-size $IMAGE_SIZE \ 123 | --block-size-deep $BLOCK_SIZE_DEEP \ 124 | --block-size-shallow $BLOCK_SIZE_SHALLOW \ 125 | --num-cores $NUM_CORES \ 126 | --vae-mos $VAE_MOS \ 127 | --vae-ols $VAE_OLS \ 128 | --unet-mos-deep $UNET_MOS_DEEP \ 129 | --unet-ols-deep $UNET_OLS_DEEP \ 130 | --unet-mos-shallow $UNET_MOS_SHALLOW \ 131 | --unet-ols-shallow $UNET_OLS_SHALLOW \ 132 | --device-id $DEVICE \ 133 | --device-id-2 $DEVICE_2 \ 134 | --num-steps $NUM_STEPS \ 135 | --num-warmup-iters $WARMUP_ITERS \ 136 | --num-repeat-iters $REPEAT_ITERS \ 137 | --precision 
$PRECISION \ 138 | $ONNX_TEXT_ENCODER_CMD \ 139 | $ONNX_UNET_CMD \ 140 | $ONNX_VAE_CMD \ 141 | $COMPILE_TEXT_ENCODER_CMD \ 142 | $COMPILE_UNET_CMD \ 143 | $COMPILE_VAE_CMD \ 144 | $GENERATE_ONNX_CMD \ 145 | $RUN_ONLY_CMD \ 146 | $TOGETHER_CMD" 147 | 148 | echo $scripts >> run.sh 149 | 150 | bash run.sh 151 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/run_config_shallow.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #################################################################################################### 4 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 5 | # SPDX-License-Identifier: BSD-3-Clause-Clear 6 | #################################################################################################### 7 | 8 | # model configs 9 | MODEL_PATH="stabilityai/stable-diffusion-xl-base-1.0" 10 | PROMPT="\"A cinematic shot of a baby racoon wearing an intricate italian priest robe.\"" 11 | VAE_TYPE="vae" 12 | UNET_TYPE="shallow" 13 | IMAGE_SIZE=1024 14 | BLOCK_SIZE_DEEP=256 15 | BLOCK_SIZE_SHALLOW=128 16 | BATCH_SIZE=1 17 | PRECISION=fp16,fp16,fp16,fp16 18 | 19 | # onnx configs 20 | GENERATE_ONNX=true 21 | ONNX_TEXT_ENCODER=false 22 | ONNX_UNET=true 23 | ONNX_VAE=false 24 | 25 | # compile configs 26 | NUM_CORES=16 27 | VAE_MOS=2 28 | VAE_OLS=1 29 | UNET_MOS_DEEP=2 30 | UNET_OLS_DEEP=1 31 | UNET_MOS_SHALLOW=1 32 | UNET_OLS_SHALLOW=2 33 | COMPILE_TEXT_ENCODER=false 34 | COMPILE_UNET=true 35 | COMPILE_VAE=false 36 | 37 | # inference configs 38 | RUN_ONLY=false 39 | DEVICE=0 40 | DEVICE_2=1 41 | NUM_STEPS=20 42 | WARMUP_ITERS=3 43 | REPEAT_ITERS=3 44 | 45 | # mode 46 | TOGETHER=false 47 | 48 | if [ ${GENERATE_ONNX} == true ] 49 | then 50 | GENERATE_ONNX_CMD="--generate-onnx" 51 | else 52 | GENERATE_ONNX_CMD="" 53 | fi 54 | 55 | if [ ${ONNX_TEXT_ENCODER} == true ] 56 | then 57 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder" 58 | else 59 | ONNX_TEXT_ENCODER_CMD="" 60 | fi 61 | 62 | if [ ${ONNX_UNET} == true ] 63 | then 64 | ONNX_UNET_CMD="--onnx-unet" 65 | else 66 | ONNX_UNET_CMD="" 67 | fi 68 | 69 | if [ ${ONNX_VAE} == true ] 70 | then 71 | ONNX_VAE_CMD="--onnx-vae" 72 | else 73 | ONNX_VAE_CMD="" 74 | fi 75 | 76 | if [ ${COMPILE_TEXT_ENCODER} == true ] 77 | then 78 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder" 79 | else 80 | COMPILE_TEXT_ENCODER_CMD="" 81 | fi 82 | 83 | if [ ${COMPILE_UNET} == true ] 84 | then 85 | COMPILE_UNET_CMD="--compile-unet" 86 | else 87 | COMPILE_UNET_CMD="" 88 | fi 89 | 90 | if [ ${COMPILE_VAE} == true ] 91 | then 92 | COMPILE_VAE_CMD="--compile-vae" 93 | else 94 | COMPILE_VAE_CMD="" 95 | fi 96 | 97 | if [ ${RUN_ONLY} == true ] 98 | then 99 | RUN_ONLY_CMD="--run-only" 100 | else 101 | RUN_ONLY_CMD="" 102 | fi 103 | 104 | if [ ${TOGETHER} == true ] 105 | then 106 | TOGETHER_CMD="--together" 107 | else 108 | TOGETHER_CMD="" 109 | fi 110 | 111 | export HF_HOME="cache" 112 | sed -i 's/query_block_size = 256/query_block_size = 128/g' ./env_onnx/lib/python3.10/site-packages/diffusers/models/attention_processor.py 113 | 114 | rm run.sh 115 | 116 | scripts="python main.py \ 117 | --model-path $MODEL_PATH \ 118 | --prompt $PROMPT \ 119 | --unet-type $UNET_TYPE \ 120 | --vae-type $VAE_TYPE \ 121 | --batch-size $BATCH_SIZE \ 122 | --image-size $IMAGE_SIZE \ 123 | --block-size-deep $BLOCK_SIZE_DEEP \ 124 | --block-size-shallow $BLOCK_SIZE_SHALLOW \ 125 | --num-cores $NUM_CORES \ 126 | --vae-mos 
$VAE_MOS \ 127 | --vae-ols $VAE_OLS \ 128 | --unet-mos-deep $UNET_MOS_DEEP \ 129 | --unet-ols-deep $UNET_OLS_DEEP \ 130 | --unet-mos-shallow $UNET_MOS_SHALLOW \ 131 | --unet-ols-shallow $UNET_OLS_SHALLOW \ 132 | --device-id $DEVICE \ 133 | --device-id-2 $DEVICE_2 \ 134 | --num-steps $NUM_STEPS \ 135 | --num-warmup-iters $WARMUP_ITERS \ 136 | --num-repeat-iters $REPEAT_ITERS \ 137 | --precision $PRECISION \ 138 | $ONNX_TEXT_ENCODER_CMD \ 139 | $ONNX_UNET_CMD \ 140 | $ONNX_VAE_CMD \ 141 | $COMPILE_TEXT_ENCODER_CMD \ 142 | $COMPILE_UNET_CMD \ 143 | $COMPILE_VAE_CMD \ 144 | $GENERATE_ONNX_CMD \ 145 | $RUN_ONLY_CMD \ 146 | $TOGETHER_CMD" 147 | 148 | echo $scripts >> run.sh 149 | 150 | bash run.sh 151 | -------------------------------------------------------------------------------- /models/language_processing/encoder/server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | from contextlib import asynccontextmanager 5 | from fastapi import FastAPI, HTTPException 6 | from typing import Optional, List, Union 7 | from pydantic import BaseModel 8 | import argparse 9 | 10 | from model import QAicEmbeddingModel 11 | 12 | @asynccontextmanager 13 | async def lifespan(app: FastAPI): 14 | # Code to run before the application starts 15 | print("Application startup") 16 | 17 | app.model = QAicEmbeddingModel(model_name=args.model_name, qpc_path=args.qpc_path, device=args.device) 18 | 19 | yield 20 | 21 | # Code to run when the application shuts down 22 | print("Application shutdown") 23 | 24 | app = FastAPI(lifespan=lifespan) 25 | 26 | @app.get("/v1/models") 27 | async def get_models(): 28 | #print('get_models') 29 | try: 30 | response = { 31 | "object": "list", 32 | "data": [ 33 | { 34 | "id": app.model.name, 35 | "object": "model", 36 | "created": 1746296172, 37 | "owned_by": "system", 38 | "max_model_len": 4096 39 | } 40 | ], 41 | } 42 | 43 | return response 44 | except Exception as e: 45 | print(str(e)) 46 | raise HTTPException(status_code=500, detail=str(e)) 47 | 48 | class EmbeddingsRequest(BaseModel): 49 | model: Optional[str] = "bge-large-en-v1.5" 50 | input: Union[str, List[str]] 51 | encoding_format: Optional[str] = 'float' 52 | user: Optional[str] = None 53 | 54 | @app.post("/v1/embeddings") 55 | async def embeddings(request: EmbeddingsRequest): 56 | try: 57 | response = {'object': 'list', 'data': []} 58 | 59 | inputs = request.input 60 | if isinstance(inputs, str): 61 | inputs = [inputs] 62 | 63 | for idx, input in enumerate(inputs): 64 | token_embedding, sentence_embeddings = app.model.generate(input) 65 | 66 | response['data'].append( 67 | { 68 | 'object': 'embedding', 69 | 'embedding': sentence_embeddings.reshape(-1).tolist(), 70 | 'index': idx 71 | } 72 | ) 73 | #print(response) 74 | return response 75 | except Exception as e: 76 | print(str(e)) 77 | raise HTTPException(status_code=500, detail=str(e)) 78 | 79 | if __name__ == "__main__": 80 | import uvicorn 81 | 82 | parser = argparse.ArgumentParser(description="Embedding model endpoint") 83 | 84 | parser.add_argument( 85 | "--host", 86 | type=str, 87 | help="IP address", 88 | default="0.0.0.0" 89 | ) 90 | 91 | parser.add_argument( 92 | "--port", 93 | type=int, 94 | help="Port", 95 | default=8000 96 | ) 97 | 98 | parser.add_argument( 99 | "--hf_token", 100 | type=str, 101 | help="Hugging Face auth token", 102 | default=None 103 | ) 104 | 105 | parser.add_argument( 106 | 
"--model_name", 107 | type=str, 108 | help="Hugging Face model path", 109 | default='BAAI/bge-large-en-v1.5' 110 | ) 111 | 112 | parser.add_argument( 113 | "--qpc_path", 114 | type=str, 115 | help="QPC model binary path", 116 | default='./models/BAAI/bge-large-en-v1.5/compiled-bin-fp16-B1-C4-A3-OLS2-MOS1-best-throughput' 117 | ) 118 | 119 | parser.add_argument( 120 | "--device", 121 | type=int, 122 | help="Cloud AI accelerator device ID", 123 | default=0 124 | ) 125 | 126 | args = parser.parse_args() 127 | 128 | uvicorn.run(app, host=args.host, port=args.port) 129 | 130 | 131 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/patches/attention_patch.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py 2 | index 21eb3a3..4f8d68c 100644 3 | --- a/src/diffusers/models/attention_processor.py 4 | +++ b/src/diffusers/models/attention_processor.py 5 | @@ -11,6 +11,10 @@ 6 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7 | # See the License for the specific language governing permissions and 8 | # limitations under the License. 9 | +# 10 | +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 11 | +# SPDX-License-Identifier: BSD-3-Clause-Clear 12 | +# Not a Contribution 13 | from importlib import import_module 14 | from typing import Callable, Optional, Union 15 | 16 | @@ -200,10 +204,8 @@ class Attention(nn.Module): 17 | # We use the AttnProcessor2_0 by default when torch 2.x is used which uses 18 | # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention 19 | # but only if it has the default `scale` argument. 
TODO remove scale_qk check when we move to torch 2.1 20 | - if processor is None: 21 | - processor = ( 22 | - AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor() 23 | - ) 24 | + # force to not use FlashAttention 25 | + processor = AttnProcessor() 26 | self.set_processor(processor) 27 | 28 | def set_use_memory_efficient_attention_xformers( 29 | @@ -588,7 +590,9 @@ class Attention(nn.Module): 30 | 31 | if attention_mask is None: 32 | baddbmm_input = torch.empty( 33 | - query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device 34 | + query.shape[0], query.shape[1], 35 | + key.shape[2], # key is already transposed 36 | + dtype=query.dtype, device=query.device 37 | ) 38 | beta = 0 39 | else: 40 | @@ -598,7 +602,7 @@ class Attention(nn.Module): 41 | attention_scores = torch.baddbmm( 42 | baddbmm_input, 43 | query, 44 | - key.transpose(-1, -2), 45 | + key, # key is already transposed 46 | beta=beta, 47 | alpha=self.scale, 48 | ) 49 | @@ -740,8 +744,26 @@ class AttnProcessor: 50 | key = attn.head_to_batch_dim(key) 51 | value = attn.head_to_batch_dim(value) 52 | 53 | - attention_probs = attn.get_attention_scores(query, key, attention_mask) 54 | - hidden_states = torch.bmm(attention_probs, value) 55 | + # pre-transpose the key 56 | + key = key.transpose(-1, -2) 57 | + if query.size(-2) != value.size(-2): # cross-attention, use regular attention 58 | + # QKV done in single block 59 | + attention_probs = attn.get_attention_scores(query, key, attention_mask) 60 | + hidden_states = torch.bmm(attention_probs, value) 61 | + else: # self-attention, use blocked attention 62 | + # QKV done with block-attention (a la FlashAttentionV2) 63 | + print(f"{query.shape = }, {key.shape = }, {value.shape = }") 64 | + query_block_size = 256 65 | + query_seq_len = query.size(-2) 66 | + num_blocks = (query_seq_len + query_block_size - 1) // query_block_size 67 | + for qidx in range(num_blocks): 68 | + query_block = query[:,qidx*query_block_size:(qidx+1)*query_block_size,:] 69 | + attention_probs = attn.get_attention_scores(query_block, key, attention_mask) 70 | + hidden_states_block = torch.bmm(attention_probs, value) 71 | + if qidx == 0: 72 | + hidden_states = hidden_states_block 73 | + else: 74 | + hidden_states = torch.cat((hidden_states, hidden_states_block), -2) 75 | hidden_states = attn.batch_to_head_dim(hidden_states) 76 | 77 | # linear proj 78 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/patches/attention_patch.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py 2 | index 21eb3a3..d43b51e 100644 3 | --- a/src/diffusers/models/attention_processor.py 4 | +++ b/src/diffusers/models/attention_processor.py 5 | @@ -11,6 +11,10 @@ 6 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7 | # See the License for the specific language governing permissions and 8 | # limitations under the License. 9 | +# 10 | +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
11 | +# SPDX-License-Identifier: BSD-3-Clause-Clear 12 | +# Not a Contribution 13 | from importlib import import_module 14 | from typing import Callable, Optional, Union 15 | 16 | @@ -200,10 +204,8 @@ class Attention(nn.Module): 17 | # We use the AttnProcessor2_0 by default when torch 2.x is used which uses 18 | # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention 19 | # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1 20 | - if processor is None: 21 | - processor = ( 22 | - AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor() 23 | - ) 24 | + # force to not use FlashAttention 25 | + processor = AttnProcessor() 26 | self.set_processor(processor) 27 | 28 | def set_use_memory_efficient_attention_xformers( 29 | @@ -588,7 +590,9 @@ class Attention(nn.Module): 30 | 31 | if attention_mask is None: 32 | baddbmm_input = torch.empty( 33 | - query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device 34 | + query.shape[0], query.shape[1], 35 | + key.shape[2], # key is already transposed 36 | + dtype=query.dtype, device=query.device 37 | ) 38 | beta = 0 39 | else: 40 | @@ -598,7 +602,7 @@ class Attention(nn.Module): 41 | attention_scores = torch.baddbmm( 42 | baddbmm_input, 43 | query, 44 | - key.transpose(-1, -2), 45 | + key, # key is already transposed 46 | beta=beta, 47 | alpha=self.scale, 48 | ) 49 | @@ -740,8 +744,26 @@ class AttnProcessor: 50 | key = attn.head_to_batch_dim(key) 51 | value = attn.head_to_batch_dim(value) 52 | 53 | - attention_probs = attn.get_attention_scores(query, key, attention_mask) 54 | - hidden_states = torch.bmm(attention_probs, value) 55 | + # pre-transpose the key 56 | + key = key.transpose(-1, -2) 57 | + if query.size(-2) != value.size(-2): # cross-attention, use regular attention 58 | + # QKV done in single block 59 | + attention_probs = attn.get_attention_scores(query, key, attention_mask) 60 | + hidden_states = torch.bmm(attention_probs, value) 61 | + else: # self-attention, use blocked attention 62 | + # QKV done with block-attention (a la FlashAttentionV2) 63 | + print(f"{query.shape = }, {key.shape = }, {value.shape = }") 64 | + query_block_size = 128 65 | + query_seq_len = query.size(-2) 66 | + num_blocks = (query_seq_len + query_block_size - 1) // query_block_size 67 | + for qidx in range(num_blocks): 68 | + query_block = query[:,qidx*query_block_size:(qidx+1)*query_block_size,:] 69 | + attention_probs = attn.get_attention_scores(query_block, key, attention_mask) 70 | + hidden_states_block = torch.bmm(attention_probs, value) 71 | + if qidx == 0: 72 | + hidden_states = hidden_states_block 73 | + else: 74 | + hidden_states = torch.cat((hidden_states, hidden_states_block), -2) 75 | hidden_states = attn.batch_to_head_dim(hidden_states) 76 | 77 | # linear proj 78 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/run_config_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #################################################################################################### 4 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
5 | # SPDX-License-Identifier: BSD-3-Clause-Clear 6 | #################################################################################################### 7 | 8 | 9 | PYTHON=$1 10 | echo $PYTHON 11 | 12 | # model configs 13 | MODEL_PATH="stabilityai/stable-diffusion-xl-base-1.0" 14 | PROMPT="\"A cinematic shot of a baby racoon wearing an intricate italian priest robe.\"" 15 | USE_LATENTS="\"\"" 16 | NEGATIVE_PROMPT="\"Normal quality, low quality, worst quality, low res, blurry.\"" 17 | VAE_TYPE="vae" 18 | UNET_TYPE="deep" 19 | IMAGE_SIZE=1024 20 | BLOCK_SIZE_DEEP=256 21 | BLOCK_SIZE_SHALLOW=128 22 | BATCH_SIZE=1 23 | PRECISION=fp16,fp16,fp16,fp16 24 | 25 | # onnx configs 26 | GENERATE_ONNX=false 27 | ONNX_TEXT_ENCODER=true 28 | ONNX_UNET=true 29 | ONNX_VAE=true 30 | 31 | # compile configs 32 | NUM_CORES=16 33 | VAE_MOS=2 34 | VAE_OLS=1 35 | UNET_MOS_DEEP=2 36 | UNET_OLS_DEEP=1 37 | UNET_MOS_SHALLOW=1 38 | UNET_OLS_SHALLOW=2 39 | COMPILE_TEXT_ENCODER=true 40 | COMPILE_UNET=true 41 | COMPILE_VAE=true 42 | 43 | # inference configs 44 | RUN_ONLY=true 45 | DEVICE=0 46 | DEVICE_2=1 47 | NUM_STEPS=20 48 | WARMUP_ITERS=3 49 | REPEAT_ITERS=3 50 | CACHE_INTERVAL=3 51 | 52 | # mode 53 | TOGETHER=false 54 | 55 | if [ ${GENERATE_ONNX} == true ] 56 | then 57 | GENERATE_ONNX_CMD="--generate-onnx" 58 | else 59 | GENERATE_ONNX_CMD="" 60 | fi 61 | 62 | if [ ${ONNX_TEXT_ENCODER} == true ] 63 | then 64 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder" 65 | else 66 | ONNX_TEXT_ENCODER_CMD="" 67 | fi 68 | 69 | if [ ${ONNX_UNET} == true ] 70 | then 71 | ONNX_UNET_CMD="--onnx-unet" 72 | else 73 | ONNX_UNET_CMD="" 74 | fi 75 | 76 | if [ ${ONNX_VAE} == true ] 77 | then 78 | ONNX_VAE_CMD="--onnx-vae" 79 | else 80 | ONNX_VAE_CMD="" 81 | fi 82 | 83 | if [ ${COMPILE_TEXT_ENCODER} == true ] 84 | then 85 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder" 86 | else 87 | COMPILE_TEXT_ENCODER_CMD="" 88 | fi 89 | 90 | if [ ${COMPILE_UNET} == true ] 91 | then 92 | COMPILE_UNET_CMD="--compile-unet" 93 | else 94 | COMPILE_UNET_CMD="" 95 | fi 96 | 97 | if [ ${COMPILE_VAE} == true ] 98 | then 99 | COMPILE_VAE_CMD="--compile-vae" 100 | else 101 | COMPILE_VAE_CMD="" 102 | fi 103 | 104 | if [ ${RUN_ONLY} == true ] 105 | then 106 | RUN_ONLY_CMD="--run-only" 107 | else 108 | RUN_ONLY_CMD="" 109 | fi 110 | 111 | if [ ${TOGETHER} == true ] 112 | then 113 | TOGETHER_CMD="--together" 114 | else 115 | TOGETHER_CMD="" 116 | fi 117 | 118 | export HF_HOME="cache" 119 | export TQDM_DISABLE=1 120 | 121 | rm run.sh 122 | 123 | scripts="$PYTHON main.py \ 124 | --model-path $MODEL_PATH \ 125 | --prompt $PROMPT \ 126 | --negative-prompt $NEGATIVE_PROMPT \ 127 | --use-latents $USE_LATENTS \ 128 | --unet-type $UNET_TYPE \ 129 | --vae-type $VAE_TYPE \ 130 | --batch-size $BATCH_SIZE \ 131 | --image-size $IMAGE_SIZE \ 132 | --block-size-deep $BLOCK_SIZE_DEEP \ 133 | --block-size-shallow $BLOCK_SIZE_SHALLOW \ 134 | --num-cores $NUM_CORES \ 135 | --vae-mos $VAE_MOS \ 136 | --vae-ols $VAE_OLS \ 137 | --unet-mos-deep $UNET_MOS_DEEP \ 138 | --unet-ols-deep $UNET_OLS_DEEP \ 139 | --unet-mos-shallow $UNET_MOS_SHALLOW \ 140 | --unet-ols-shallow $UNET_OLS_SHALLOW \ 141 | --device-id $DEVICE \ 142 | --device-id-2 $DEVICE_2 \ 143 | --precision $PRECISION \ 144 | --num-steps $NUM_STEPS \ 145 | --num-warmup-iters $WARMUP_ITERS \ 146 | --num-repeat-iters $REPEAT_ITERS \ 147 | --cache-interval $CACHE_INTERVAL \ 148 | $ONNX_TEXT_ENCODER_CMD \ 149 | $ONNX_UNET_CMD \ 150 | $ONNX_VAE_CMD \ 151 | $COMPILE_TEXT_ENCODER_CMD \ 152 | $COMPILE_UNET_CMD \ 153 | $COMPILE_VAE_CMD \ 
154 | $GENERATE_ONNX_CMD \ 155 | $RUN_ONLY_CMD \ 156 | $TOGETHER_CMD" 157 | 158 | echo $scripts >> run.sh 159 | 160 | bash run.sh 161 | -------------------------------------------------------------------------------- /samples/python/aws_ai100_benchmarking/yolo_models/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | --- 3 | 4 | Download the yolov4, yolov5, and yolov7 models, prepare for the Qualcomm AIC100, compile for high-thoughput, min-latency, or balanced throughput with fp16 precision, run the model on a generated random sample, and obtain the benchmarking results and output values. 5 | 6 | ## Source of the models 7 | --- 8 | The models are downloaded from (https://github.com/ultralytics/yolov5). This script has been tested for the following requested models: 9 | * yolov4 10 | * yolov5s 11 | * yolov5m 12 | * yolov5l 13 | * yolov5x 14 | * yolov7-e6e 15 | 16 | 17 | ## Virtual environment 18 | --- 19 | For a quick environment setup: 20 | 21 | ```commandline 22 | source /opt/qti-aic/dev/python/qaic-env/bin/activate 23 | ``` 24 | 25 | ## Framework and version 26 | --- 27 | ```commandline 28 | pip3 install torch==1.13.0 onnx==1.12.0 onnxruntime==1.15.0 torchvision==0.14.0 transformers==4.29.2 pandas==2.0.2 urllib3==1.26.6 29 | pip3 install ultralytics seaborn nvidia-pyindex onnx-graphsurgeon 30 | 31 | ``` 32 | ## Syntax 33 | --- 34 | Copy the run_yolo_model.py and the lut_yolo_models.csv to a working directory. Pick a MODEl_NAME from the list above, and type: 35 | 36 | ```commandline 37 | 38 | usage: run_yolo_model.py [-h] --model-name {yolov4,yolov5s,yolov5m,yolov5l,yolov5x,yolov7-e6e} 39 | [--objective {best-latency,best-throughput,balanced}] 40 | [--opset OPSET] 41 | [--batch-size BATCH_SIZE] 42 | [--image-size IMAGE_SIZE] 43 | [--cores {1,2,3,4,5,6,7,8,9,10,11,12,13,14}] 44 | [--instances {1,2,3,4,5,6,7,8,9,10,11,12,13,14}] 45 | [--ols {1,2,3,4,5,6,7,8,9,10,11,12,13,14}] 46 | [--mos MOS] 47 | [--set-size {1,2,3,4,5,6,7,8,9,10}] 48 | [--extra EXTRA] 49 | [--time TIME] 50 | [--device {0,1,2,3,4,5,6,7}] 51 | [--run-only] 52 | 53 | 54 | 55 | Download, Compile, and Run YOLO models on randomly generated inputs 56 | 57 | 58 | optional arguments: 59 | -h, --help show this help message and exit 60 | --model-name, -m {yolov4,yolov5s,yolov5m,yolov5l,yolov5x,yolov7-e6e} 61 | Model name to download. 62 | --objective, -o {best-latency,best-throughput,balanced} 63 | Running for best-latency, best-throughput, or balanced 64 | --opset OPSET ONNX opset. Default <12> 65 | --batch-size, -b BATCH_SIZE 66 | Sample input batch size. Default <1>. 67 | --image-size, -s IMAGE_SIZE 68 | Sample input image width/height. Default <640>. 69 | --cores, -c {1,2,3,4,5,6,7,8,9,10,11,12,13,14} 70 | Number of AIC100 cores to compile the model for. Default <2> 71 | --instances, -i {1,2,3,4,5,6,7,8,9,10,11,12,13,14} 72 | Number of model instances to run on AIC100. Default <7> 73 | --ols {1,2,3,4,5,6,7,8,9,10,11,12,13,14} 74 | Overlap split factor. Default <1> 75 | --mos MOS Maximum output channel split. Default <1> 76 | --set-size {1,2,3,4,5,6,7,8,9,10} 77 | Set size. Default <10> 78 | --extra EXTRA Extra compilation arguments. 79 | --time TIME Duration (in seconds) for which to submit inferences. Default <20> 80 | --device, -d {0,1,2,3,4,5,6,7} 81 | AIC100 device ID. 
Default <0> 82 | --run-only, -r Performs the inference only, without re-exporting and re-compiling the model 83 | 84 | 85 | ``` 86 | For example: 87 | ```commandline 88 | python run_yolo_model.py -m yolov5s -o best-throughput 89 | ``` 90 | or 91 | ```commandline 92 | python run_yolo_model.py -m yolov5m -o balanced 93 | ``` 94 | or 95 | 96 | ```commandline 97 | python run_yolo_model.py -m yolov5x -o best-throughput 98 | ``` 99 | 100 | The hardware configuration will be either associated to the corresponding row in the lut_yolo_models.csv or to defualt values if not specified by the user. If the MODEL_NAME is not included in the lut_yolo_models.csv, default values will be used. 101 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/README.md: -------------------------------------------------------------------------------- 1 | # Instructions to run SDXL-Turbo on Cloud AI 100 2 | 3 | The instructions below are to run the [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) model on Cloud AI 100. Compile time parameters may need to be adjusted for different cards and different SDKs. 4 | 5 | ## Pre-requisites 6 | 7 | Use the [SDK 1.19.8.0](https://github.com/quic/cloud-ai-containers/pkgs/container/cloud_ai_inference_ubuntu22/414822849?tag=1.19.8.0) container to compile the sdxl-turbo models. 8 | 9 | ``` 10 | sudo docker run \ 11 | -it \ 12 | --workdir /cloud-ai-sdk \ 13 | --entrypoint /bin/bash \ 14 | --network=host \ 15 | --mount type=bind,source=,target=/cloud-ai-sdk \ 16 | --device=/dev/accel/accel0 \ 17 | --device=/dev/accel/accel1 \ 18 | --device=/dev/accel/accel2 \ 19 | --device=/dev/accel/accel3 \ 20 | ghcr.io/quic/cloud_ai_inference_ubuntu22:1.19.8.0 21 | 22 | cd models/multimodal/text_to_image/sdxl_turbo 23 | ``` 24 | 25 | Install the moreutils package for the `ts` timestamp tool: 26 | ``` 27 | sudo apt update 28 | sudo apt-get install moreutils 29 | ``` 30 | 31 | Install Git Large File System (LFS) support 32 | 33 | ``` 34 | sudo apt update 35 | sudo apt-get install git-lfs 36 | ``` 37 | 38 | ## 1. Generate onnx files and compile for binaries 39 | 40 | 1. Set up a virtual environment for ONNX generation and compilation 41 | ``` 42 | python3.10 -m venv env_onnx 43 | source ./env_onnx/bin/activate 44 | pip install -r requirements.txt 45 | ``` 46 | 47 | 2. Create a folder for caching Hugging Face model downloads, and export the environment variable HF_HOME 48 | ``` 49 | mkdir cache 50 | mkdir compile_logs 51 | mkdir qpc 52 | touch run.sh 53 | export HF_HOME=${PWD}/cache 54 | ``` 55 | 56 | 3. Install diffusers from source after patching for ONNX file generation 57 | ``` 58 | git clone --depth 1 --branch v0.24.0 https://github.com/huggingface/diffusers.git diffusers-onnx 59 | cd diffusers-onnx 60 | git apply --reject --whitespace=fix ../patches/attention_patch.patch 61 | pip install . 62 | cd .. 63 | ``` 64 | 65 | 4. Prepare VAE Decoder 66 | ``` 67 | export GIT_LFS_SKIP_SMUDGE=1 68 | git clone https://huggingface.co/stabilityai/sdxl-turbo cache/stabilityai/sdxl_turbo 69 | cd cache/stabilityai/sdxl_turbo 70 | git lfs pull -I vae_decoder/model.onnx 71 | rm -rf .git/lfs # optional to save space 72 | cd ../../../ 73 | ``` 74 | 75 | 5. Generate ONNX files and compile for binaries 76 | ``` 77 | bash run_config_gen.sh 78 | ``` 79 | 80 | ## 2. Run the end-to-end SDXL-Turbo inference 81 | 82 | 1. 
Set up a separate virtual environment for running SDXL Turbo 83 | ``` 84 | python3.10 -m venv env_pipeline 85 | source ./env_pipeline/bin/activate 86 | pip install -r requirements.txt 87 | pip install --force-reinstall /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl 88 | ``` 89 | 90 | 2. Re-install diffusers from source after patching the SDXL Turbo pipeline for inference 91 | ``` 92 | git clone --depth 1 --branch v0.24.0 https://github.com/huggingface/diffusers.git diffusers-pipeline 93 | cd diffusers-pipeline 94 | git apply --reject --whitespace=fix ../patches/pipeline_patch_separate.patch 95 | pip install . 96 | cd .. 97 | ``` 98 | 99 | 4. Run the SDXL-Turbo inference with 'sudo' flag if needed to access the AI 100 devices. 100 | ``` 101 | sudo bash run_config_inference.sh $(which python3) 102 | ``` 103 | 104 | ## 3. Run an OpenAI-compatible REST endpoint 105 | 106 | ``` 107 | source ./env_pipeline/bin/activate 108 | python3 server.py 109 | ``` 110 | 111 | Test the endpoint: 112 | 113 | ``` 114 | curl http://localhost:8000/v1/images/generations \ 115 | -H 'Content-Type: application/json' \ 116 | -H 'Authorization: Bearer test-key' \ 117 | -d '{ 118 | "model": "sdxl-turbo", 119 | "prompt": "photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece", 120 | "n": 1, 121 | "size": "512x512", 122 | "response_format": "b64_json" 123 | }' 124 | ``` 125 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/patches/transformer_patch.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py 2 | index 81dff54f9..f27ebe7d3 100644 3 | --- a/src/transformers/models/t5/modeling_t5.py 4 | +++ b/src/transformers/models/t5/modeling_t5.py 5 | @@ -12,6 +12,10 @@ 6 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7 | # See the License for the specific language governing permissions and 8 | # limitations under the License. 9 | +# 10 | +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 11 | +# SPDX-License-Identifier: BSD-3-Clause-Clear 12 | +# Not a Contribution 13 | """ PyTorch T5 model.""" 14 | 15 | 16 | @@ -243,7 +247,8 @@ class T5LayerNorm(nn.Module): 17 | # w/o mean and there is no bias. 
Additionally we want to make sure that the accumulation for 18 | # half-precision inputs is done in fp32 19 | 20 | - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) 21 | + div_first = hidden_states * torch.rsqrt(torch.tensor(hidden_states.shape[-1], dtype=torch.float32)) 22 | + variance = div_first.pow(2).sum(-1, keepdim=True) 23 | hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) 24 | 25 | # convert into half-precision if necessary 26 | @@ -330,11 +335,12 @@ class T5LayerFF(nn.Module): 27 | 28 | self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) 29 | self.dropout = nn.Dropout(config.dropout_rate) 30 | + self.scaling_factor = nn.Parameter(torch.tensor(1.0)) 31 | 32 | def forward(self, hidden_states): 33 | forwarded_states = self.layer_norm(hidden_states) 34 | forwarded_states = self.DenseReluDense(forwarded_states) 35 | - hidden_states = hidden_states + self.dropout(forwarded_states) 36 | + hidden_states = hidden_states * self.scaling_factor + self.dropout(forwarded_states) 37 | return hidden_states 38 | 39 | 40 | @@ -538,7 +544,8 @@ class T5Attention(nn.Module): 41 | # if key and values are already calculated 42 | # we want only the last query position bias 43 | if past_key_value is not None: 44 | - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] 45 | + #position_bias = position_bias[:, :, -hidden_states.size(1) :, :] 46 | + position_bias = position_bias[:, :, -1:, :] 47 | 48 | if mask is not None: 49 | position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) 50 | @@ -579,6 +586,7 @@ class T5LayerSelfAttention(nn.Module): 51 | self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) 52 | self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) 53 | self.dropout = nn.Dropout(config.dropout_rate) 54 | + self.scaling_factor = nn.Parameter(torch.tensor(1.0)) 55 | 56 | def forward( 57 | self, 58 | @@ -600,7 +608,7 @@ class T5LayerSelfAttention(nn.Module): 59 | use_cache=use_cache, 60 | output_attentions=output_attentions, 61 | ) 62 | - hidden_states = hidden_states + self.dropout(attention_output[0]) 63 | + hidden_states = hidden_states * self.scaling_factor + self.dropout(attention_output[0]) 64 | outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them 65 | return outputs 66 | 67 | @@ -611,6 +619,7 @@ class T5LayerCrossAttention(nn.Module): 68 | self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False) 69 | self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) 70 | self.dropout = nn.Dropout(config.dropout_rate) 71 | + self.scaling_factor = nn.Parameter(torch.tensor(1.0)) 72 | 73 | def forward( 74 | self, 75 | @@ -636,7 +645,7 @@ class T5LayerCrossAttention(nn.Module): 76 | query_length=query_length, 77 | output_attentions=output_attentions, 78 | ) 79 | - layer_output = hidden_states + self.dropout(attention_output[0]) 80 | + layer_output = hidden_states * self.scaling_factor + self.dropout(attention_output[0]) 81 | outputs = (layer_output,) + attention_output[1:] # add attentions if we output them 82 | return outputs 83 | 84 | -------------------------------------------------------------------------------- /samples/cpp/cpp_qpc_inference/Readme.md: -------------------------------------------------------------------------------- 1 | # Simple CPP Example for Bert-base-cased model on AIC-100 2 | 3 | This project demonstrates using 
the Bert-base-cased model from Hugging Face with the C++ Qaic APIs. 4 | 5 | ## To build and use it 6 | ```bash 7 | mkdir build 8 | cd build 9 | cmake .. 10 | make 11 | ``` 12 | 13 | The Bert-base-cased model from Hugging Face is based on a vocabulary file 14 | (vocab.txt), which needs to be downloaded from the Hugging Face website. 15 | 16 | ## To use the example, the user needs to: 17 | - Download the Hugging Face bert-base-cased model (refer to the Jupyter notebooks for NLP models). 18 | - Replace the QPC path used in main.cpp with the actual QPC path. 19 | - Replace the names of the input/output buffers with the names used when compiling the BERT model into the QPC 20 | ``` 21 | for example: 22 | ("input_ids", "attention_mask") for input buffers 23 | ("logits") for output buffers 24 | ``` 25 | - Build using the build steps above. 26 | - Run the executable `simple-bert-inference-example`. 27 | 28 | ## The example has the following helper classes 29 | 30 | ### VocabularyHelper: 31 | This class parses vocab.txt and stores the index of every 32 | string token in the file. The index of the words in 33 | this file is used in the input and output fed to the model 34 | while running the inference. 35 | 36 | 37 | ### Tokenizer: 38 | This class is a very basic parser of the input sentence 39 | fed to the BERT model. It uses a space as the delimiter to parse 40 | the sentence and does not provide special handling for special 41 | characters and symbols in the sentence. 42 | Ideally, in C++ the user can use, for example, the 43 | SentencePiece library available at https://github.com/google/sentencepiece 44 | 45 | 46 | ### QBufferWrapper: 47 | This is a helper class to ensure that the memory allocated 48 | for QBuffers used in the Qaic APIs is automatically released. 49 | Helper functions are provided for this class: 50 | 51 | `createBuffer` : create the wrapper from a QBuffer class 52 | 53 | `qBufferToString` : create a string for printing with QBuffer data 54 | 55 | 56 | ### Helper functions to convert a few data structures to strings for printing: 57 | ```cpp 58 | [[nodiscard]] std::string to_string(const qaic::rt::BufferMapping& bufMap)
59 | [[nodiscard]] std::string to_string(const qaic::rt::BufferMappings& allBufferMappings)
60 | [[nodiscard]] std::string to_string(const std::vector & tokenVec)
61 | ``` 62 | 63 | ### Processing the input and output for inference: 64 | The input buffer for BERT inference in this example is an array of bytes 65 | representing the index of each sentence word (in the vocabulary file). 66 | 67 | For example: 68 | 69 | If the compiled QPC has a sequence length of 128 and the input type is int64_t, 70 | then the size of the input buffer must be
71 | 128 * 8
72 | 128 [max num tokens in input] * 8 [size of each index in vocabulary file]
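For illustration, sizing and populating these input buffers could look like the following minimal sketch (assuming the 128-token sequence length and int64_t indexes described above; the variable names and token index values are hypothetical and are not taken from main.cpp):

```cpp
#include <cstdint>
#include <cstddef>
#include <vector>

int main() {
    constexpr std::size_t kSeqLen = 128;             // sequence length the QPC was compiled for
    std::vector<int64_t> inputIds(kSeqLen, 0);       // 128 * 8 bytes, zero-initialized
    std::vector<int64_t> attentionMask(kSeqLen, 0);  // same size, zero-initialized

    // Hypothetical vocabulary indexes (looked up in vocab.txt) for a 10-word sentence.
    const std::vector<int64_t> tokenIndexes = {101, 2009, 2003, 103, 2200, 3376, 2154, 2651, 1012, 102};

    for (std::size_t i = 0; i < tokenIndexes.size() && i < kSeqLen; ++i) {
        inputIds[i] = tokenIndexes[i];  // the first 10 * 8 bytes carry the word indexes
        attentionMask[i] = 1;           // mark real tokens; padding positions stay 0
    }

    // inputIds.data() and attentionMask.data() each point at kSeqLen * sizeof(int64_t)
    // = 1024 bytes that can be copied into the QBuffers handed to the runtime.
    return 0;
}
```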
73 | 74 | If the input sentence has 10 words, then the first 10*8 bytes in the 75 | input buffer must be populated with the indexes of the sentence words 76 | in the vocabulary file. The rest of the bytes must be zero-initialized. 77 | 78 | The BERT model also takes attention_mask as an input. The attention_mask 79 | input buffer can be populated with 1 for the initial 10 words, and the rest of the 80 | bytes can be zero-initialized. 81 | 82 | The output buffer for BERT inference in this example is an array 83 | of logit values (one per symbol/word in the vocabulary) 84 | for each input token. 85 | 86 | For example: 87 | 88 | If the compiled QPC has a sequence length of 128 and the output type 89 | is float (4 bytes), then the size of the output QBuffer must be
90 | 128 * 4 * 289960
91 | 128 [max num tokens in input] * 4 [size of each logit value] * 289960 [vocabulary size]
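For illustration, locating the logits that belong to one token position inside this flat output buffer and taking the argmax (as done for the [MASK] token described below) could look like the following minimal sketch; the function and variable names are illustrative and not part of main.cpp:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Returns the vocabulary index with the highest logit for the token at position
// `tokenPos`, given a flat [sequence_length * vocab_size] float logit buffer.
std::size_t argmaxForToken(const std::vector<float>& logits, std::size_t tokenPos, std::size_t vocabSize) {
    const float* begin = logits.data() + tokenPos * vocabSize;  // e.g. 3 * 289960 floats into the buffer
    const float* end = begin + vocabSize;                       // one vocabulary-sized slice
    return static_cast<std::size_t>(std::max_element(begin, end) - begin);
}

int main() {
    constexpr std::size_t kSeqLen = 128;
    constexpr std::size_t kVocabSize = 289960;              // vocabulary size used in this example
    std::vector<float> logits(kSeqLen * kVocabSize, 0.0f);  // in practice, filled from the output QBuffer

    const std::size_t maskPos = 3;                          // position of the [MASK] token in the sentence
    const std::size_t predicted = argmaxForToken(logits, maskPos, kVocabSize);
    (void)predicted;                                        // look this index up in vocab.txt to get the word
    return 0;
}
```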
92 | 93 | To get the predicted output word, the logit values for the 94 | [MASK] token must be extracted from the output buffer. The index of the 95 | maximum logit value then gives the predicted output word. 96 | 97 | For example: 98 | 99 | If the [MASK] token is at the 3rd word index in the sentence, the corresponding 100 | logit values are located at the following byte positions in the output buffer:
101 | 289960*3*4 to 289960*4*4.
102 | These 289960 float values are the logits for 103 | each symbol/word in the vocabulary. 104 | We find the index of the maximum logit value to get the index of the predicted 105 | word. Then we look up that word in the vocabulary. 106 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | import os 5 | import torch 6 | 7 | from diffusers import StableDiffusion3Pipeline 8 | 9 | class QAICStableDiffusion3: 10 | def __init__(self, model_id = 'stabilityai/stable-diffusion-3.5-medium', device_id=0, device_id_2=1): 11 | sdxl_vae_decoder = './qpc/vae_decoder_64b_1024i_vae_16c_1b_2m_1o/programqpc.bin' 12 | text_encoder = './qpc/text_encoder_64b_1024i_16c_1b/programqpc.bin' 13 | transformer = './qpc/transformer_64b_1024i_16c_1b_1m_2o/programqpc.bin' 14 | text_encoder_2 = './qpc/text_encoder_2_64b_1024i_16c_1b/programqpc.bin' 15 | 16 | text_encoder_3 = None 17 | 18 | # check the QPCs 19 | transformer_qpc = transformer if transformer.endswith('programqpc.bin') else os.path.join(transformer,'programqpc.bin') 20 | assert os.path.isfile(transformer_qpc), f"Could not find binary {transformer_qpc = }!" 21 | vae_decoder_sdxl_qpc = sdxl_vae_decoder if sdxl_vae_decoder.endswith('programqpc.bin') else os.path.join(sdxl_vae_decoder,'programqpc.bin') 22 | assert os.path.isfile(vae_decoder_sdxl_qpc), f"Could not find binary {vae_decoder_sdxl_qpc = }!" 23 | text_encoder_qpc = text_encoder if text_encoder.endswith('programqpc.bin') else os.path.join(text_encoder,'programqpc.bin') 24 | assert os.path.isfile(text_encoder_qpc), f"Could not find binary {text_encoder_qpc = }!" 25 | text_encoder_2_qpc = text_encoder_2 if text_encoder_2.endswith('programqpc.bin') else os.path.join(text_encoder_2,'programqpc.bin') 26 | assert os.path.isfile(text_encoder_2_qpc), f"Could not find binary {text_encoder_2_qpc = }!" 27 | 28 | self.vae_type = "vae" 29 | 30 | # load the latents 31 | self.latents = None 32 | 33 | # load the model pipeline 34 | if text_encoder_3: 35 | text_encoder_3_qpc = text_encoder_3 if text_encoder_3.endswith('programqpc.bin') else os.path.join(text_encoder_3,'programqpc.bin') 36 | assert os.path.isfile(text_encoder_3_qpc), f"Could not find binary {text_encoder_3_qpc = }!" 
37 | pipe = StableDiffusion3Pipeline.from_pretrained( 38 | model_id, 39 | device_id=device_id, 40 | device_id2=device_id_2, 41 | transformer_qpc=transformer_qpc, 42 | vae_decoder_qpc=vae_decoder_sdxl_qpc, 43 | text_encoder_qpc=text_encoder_qpc, 44 | text_encoder_2_qpc=text_encoder_2_qpc, 45 | text_encoder_3_qpc=text_encoder_3_qpc, 46 | ) 47 | else: 48 | pipe = StableDiffusion3Pipeline.from_pretrained( 49 | model_id, 50 | device_id=device_id, 51 | device_id2=device_id_2, 52 | transformer_qpc=transformer_qpc, 53 | vae_decoder_qpc=vae_decoder_sdxl_qpc, 54 | text_encoder_qpc=text_encoder_qpc, 55 | text_encoder_2_qpc=text_encoder_2_qpc, 56 | text_encoder_3=None, 57 | tokenizer_3=None, 58 | ) 59 | 60 | self.pipe = pipe 61 | 62 | def generate(self, prompt, n=1, image_size=(1024,1024), num_steps=28, guidance=4.5): 63 | height, width = image_size[0], image_size[1] 64 | 65 | images = self.pipe(prompt=prompt, 66 | negative_prompt='', 67 | num_inference_steps=num_steps, 68 | height=height, 69 | width=width, 70 | latents=self.latents, 71 | vae_type=self.vae_type, 72 | guidance_scale=guidance).images 73 | 74 | return images 75 | 76 | def main(): 77 | model = QAICStableDiffusion3() 78 | prompt = 'photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece' 79 | image = model.generate(prompt, guidance=7.0)[0] 80 | image.save('harbor.png') 81 | 82 | if __name__ == "__main__": 83 | main() 84 | 85 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/patches/deepcache_unet.patch: -------------------------------------------------------------------------------- 1 | diff --git a/DeepCache/sdxl/unet_2d_condition.py b/DeepCache/sdxl/unet_2d_condition.py 2 | index 6c97199..f6865c6 100644 3 | --- a/DeepCache/sdxl/unet_2d_condition.py 4 | +++ b/DeepCache/sdxl/unet_2d_condition.py 5 | @@ -11,6 +11,10 @@ 6 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7 | # See the License for the specific language governing permissions and 8 | # limitations under the License. 9 | +# 10 | +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
11 | +# SPDX-License-Identifier: BSD-3-Clause-Clear 12 | +# Not a Contribution 13 | from dataclasses import dataclass 14 | from typing import Any, Dict, List, Optional, Tuple, Union 15 | 16 | @@ -591,6 +595,8 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) 17 | self.position_net = PositionNet( 18 | positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type 19 | ) 20 | + self.cache_layer_id = 0 21 | + self.cache_block_id = 0 22 | 23 | @property 24 | def attn_processors(self) -> Dict[str, AttentionProcessor]: 25 | @@ -741,6 +747,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) 26 | sample: torch.FloatTensor, 27 | timestep: Union[torch.Tensor, float, int], 28 | encoder_hidden_states: torch.Tensor, 29 | + replicate_prv_feature: Optional[List[torch.Tensor]], 30 | class_labels: Optional[torch.Tensor] = None, 31 | timestep_cond: Optional[torch.Tensor] = None, 32 | attention_mask: Optional[torch.Tensor] = None, 33 | @@ -749,10 +756,6 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) 34 | down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, 35 | mid_block_additional_residual: Optional[torch.Tensor] = None, 36 | encoder_attention_mask: Optional[torch.Tensor] = None, 37 | - quick_replicate: bool = False, 38 | - replicate_prv_feature: Optional[List[torch.Tensor]] = None, 39 | - cache_layer_id: Optional[int] = None, 40 | - cache_block_id: Optional[int] = None, 41 | return_dict: bool = True, 42 | ) -> Union[UNet2DConditionOutput, Tuple]: 43 | r""" 44 | @@ -954,8 +957,11 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) 45 | is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None 46 | is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None 47 | 48 | + cache_layer_id = self.cache_layer_id 49 | + cache_block_id = self.cache_block_id 50 | down_block_res_samples = (sample,) 51 | - if quick_replicate and replicate_prv_feature is not None: 52 | + if False: 53 | + print("Using cache...") 54 | # Down 55 | for i, downsample_block in enumerate(self.down_blocks): 56 | if i > cache_layer_id: 57 | @@ -1037,9 +1043,10 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) 58 | scale=lora_scale, 59 | enter_block_number=cache_block_id if i == len(self.up_blocks) - 1 - cache_layer_id else None, 60 | ) 61 | - 62 | + 63 | prv_f = replicate_prv_feature 64 | else: 65 | + print("Initializing cache...") 66 | for i, downsample_block in enumerate(self.down_blocks): 67 | if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: 68 | # For t2i-adapter CrossAttnDownBlock2D 69 | @@ -1137,17 +1144,15 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) 70 | upsample_size=upsample_size, 71 | scale=lora_scale, 72 | ) 73 | - 74 | + 75 | #print(cache_layer_id, current_record_f is None, i == len(self.up_blocks) - cache_layer_id - 1) 76 | #print("Append prv_feature with shape:", sample.shape) 77 | if cache_layer_id is not None and current_record_f is not None and i == len(self.up_blocks) - cache_layer_id - 1: 78 | prv_f = current_record_f[-cache_block_id-1] 79 | - 80 | + 81 | # 6. 
post-process 82 | if self.conv_norm_out: 83 | sample = self.conv_norm_out(sample) 84 | sample = self.conv_act(sample) 85 | sample = self.conv_out(sample) 86 | - if not return_dict: 87 | - return (sample, prv_f,) 88 | - return UNet2DConditionOutput(sample=sample) 89 | + return (sample, prv_f,) 90 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-xl-base-1.0/compile_models.sh: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | #!/bin/bash 6 | 7 | BINARY_FOLDER="./qpc/" 8 | LOG_FOLDER="./compile_logs/" 9 | BATCH_SIZE=1 10 | BATCH_SIZE_2=$(expr 2 \* $BATCH_SIZE) 11 | SEQ_LEN=77 12 | LATENT_CHANNELS=4 13 | LATENT_HEIGHT=128 14 | LATENT_WIDTH=128 15 | NUM_CORES=16 16 | VAE_MOS=2 17 | VAE_OLS=1 18 | UNET_MOS_BS1=2 19 | UNET_OLS_BS1=1 20 | UNET_MOS_BS2=1 21 | UNET_OLS_BS2=2 22 | 23 | mkdir ${BINARY_FOLDER} 24 | mkdir ${LOG_FOLDER} 25 | 26 | ######################################################################################################################## 27 | 28 | # 1. Compile the text encoder - self-generated 29 | rm -rf ${BINARY_FOLDER}text_encoder 30 | /opt/qti-aic/exec/qaic-exec \ 31 | -aic-hw -aic-hw-version=2.0 -aic-perf-warnings -aic-perf-metrics \ 32 | -compile-only -convert-to-fp16 \ 33 | -m=./onnx_files/text_encoder/model.onnx \ 34 | -onnx-define-symbol=batch_size,${BATCH_SIZE} \ 35 | -stats-batchsize=${BATCH_SIZE} \ 36 | -onnx-define-symbol=sequence_length,${SEQ_LEN} \ 37 | -aic-num-cores=${NUM_CORES} \ 38 | -aic-binary-dir=${BINARY_FOLDER}text_encoder \ 39 | 2>&1 | ts > ${LOG_FOLDER}text_encoder.log & 40 | 41 | ######################################################################################################################## 42 | 43 | # 2. Compile the text encoder 2 - self-generated 44 | rm -rf ${BINARY_FOLDER}text_encoder_2 45 | /opt/qti-aic/exec/qaic-exec \ 46 | -aic-hw -aic-hw-version=2.0 -aic-perf-warnings -aic-perf-metrics \ 47 | -compile-only -convert-to-fp16 \ 48 | -m=./onnx_files/text_encoder_2/model.onnx \ 49 | -onnx-define-symbol=batch_size,${BATCH_SIZE} \ 50 | -stats-batchsize=${BATCH_SIZE} \ 51 | -onnx-define-symbol=sequence_length,${SEQ_LEN} \ 52 | -aic-num-cores=${NUM_CORES} \ 53 | -aic-binary-dir=${BINARY_FOLDER}text_encoder_2 \ 54 | 2>&1 | ts > ${LOG_FOLDER}text_encoder_2.log & 55 | 56 | ######################################################################################################################## 57 | 58 | # 3a. 
Compile the UNet with batchsize=1, blocksize=256 59 | rm -rf ${BINARY_FOLDER}unet-bs${BATCH_SIZE} 60 | /opt/qti-aic/exec/qaic-exec \ 61 | -aic-hw -aic-hw-version=2.0 -aic-perf-warnings -aic-perf-metrics \ 62 | -compile-only -convert-to-fp16 \ 63 | -mos=${UNET_MOS_BS1} -ols=${UNET_OLS_BS1} \ 64 | -m=./onnx_files/unet_bs1/unet/model.onnx \ 65 | -onnx-define-symbol=batch_size,${BATCH_SIZE} \ 66 | -stats-batchsize=${BATCH_SIZE} \ 67 | -onnx-define-symbol=sequence_length,${SEQ_LEN} \ 68 | -onnx-define-symbol=steps,1 \ 69 | -onnx-define-symbol=num_channels,${LATENT_CHANNELS} \ 70 | -onnx-define-symbol=height,${LATENT_HEIGHT} \ 71 | -onnx-define-symbol=width,${LATENT_WIDTH} \ 72 | -aic-num-cores=${NUM_CORES} \ 73 | -aic-binary-dir=${BINARY_FOLDER}unet-bs${BATCH_SIZE} \ 74 | 2>&1 | ts > ${LOG_FOLDER}unet-bs${BATCH_SIZE}.log & 75 | 76 | 77 | # 3b. Compile the UNet with batchsize=2, blocksize=128 78 | rm -rf ${BINARY_FOLDER}unet-bs${BATCH_SIZE_2} 79 | /opt/qti-aic/exec/qaic-exec \ 80 | -aic-hw -aic-hw-version=2.0 -aic-perf-warnings -aic-perf-metrics \ 81 | -compile-only -convert-to-fp16 \ 82 | -mos=${UNET_MOS_BS2} -ols=${UNET_OLS_BS2} \ 83 | -m=./onnx_files/unet_bs2/unet/model.onnx \ 84 | -onnx-define-symbol=batch_size,${BATCH_SIZE_2} \ 85 | -stats-batchsize=${BATCH_SIZE_2} \ 86 | -onnx-define-symbol=sequence_length,${SEQ_LEN} \ 87 | -onnx-define-symbol=steps,1 \ 88 | -onnx-define-symbol=num_channels,${LATENT_CHANNELS} \ 89 | -onnx-define-symbol=height,${LATENT_HEIGHT} \ 90 | -onnx-define-symbol=width,${LATENT_WIDTH} \ 91 | -aic-num-cores=${NUM_CORES} \ 92 | -aic-binary-dir=${BINARY_FOLDER}unet-bs${BATCH_SIZE_2} \ 93 | 2>&1 | ts > ${LOG_FOLDER}unet-bs${BATCH_SIZE_2}.log & 94 | 95 | 96 | ######################################################################################################################## 97 | 98 | # 4. Compile the VAE Decoder 99 | rm -rf ${BINARY_FOLDER}vae_decoder 100 | /opt/qti-aic/exec/qaic-exec \ 101 | -aic-hw -aic-hw-version=2.0 -aic-perf-warnings -aic-perf-metrics \ 102 | -compile-only -convert-to-fp16 \ 103 | -mos=${VAE_MOS} -ols=${VAE_OLS} \ 104 | -m=./onnx_files/vae_decoder/model_fixed_128.onnx \ 105 | -onnx-define-symbol=batch_size,${BATCH_SIZE} \ 106 | -stats-batchsize=${BATCH_SIZE} \ 107 | -onnx-define-symbol=num_channels_latent,${LATENT_CHANNELS} \ 108 | -onnx-define-symbol=height_latent,${LATENT_HEIGHT} \ 109 | -onnx-define-symbol=width_latent,${LATENT_WIDTH} \ 110 | -aic-num-cores=${NUM_CORES} \ 111 | -aic-enable-depth-first -aic-depth-first-mem=32 \ 112 | -aic-binary-dir=${BINARY_FOLDER}vae_decoder \ 113 | 2>&1 | ts > ${LOG_FOLDER}vae_decoder.log & 114 | 115 | ######################################################################################################################## 116 | 117 | echo Waiting for qaic-exec processes to finish ... 118 | wait 119 | 120 | -------------------------------------------------------------------------------- /samples/python/aws_ai100_benchmarking/parse_latency_and_throughput.py: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 | # @@-COPYRIGHT-START-@@ 3 | # 4 | # Copyright (c) 2023, Qualcomm Technologies, Inc. All Rights Reserved. 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are met: 7 | # 8 | # 1. 
Redistributions of source code must retain the above copyright notice, 9 | # this list of conditions and the following disclaimer. 10 | # 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # 3. Neither the name of the copyright holder nor the names of its contributors 16 | # may be used to endorse or promote products derived from this software 17 | # without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 23 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | # POSSIBILITY OF SUCH DAMAGE. 30 | # 31 | # SPDX-License-Identifier: BSD-3-Clause 32 | # 33 | # @@-COPYRIGHT-END-@@ 34 | ############################################################################## 35 | 36 | 37 | import os 38 | import sys 39 | from glob import glob 40 | import pandas as pd 41 | import numpy as np 42 | 43 | 44 | def get_metric(series, method): 45 | ''' 46 | This functions computes the average or percentile for a pandas.Series object 47 | ''' 48 | if method == 'mean' or method == 'avg': 49 | return series.mean() 50 | elif method.endswith('pct'): 51 | prctile = int(method.replace('pct', ''))/100 52 | return series.quantile(prctile) 53 | return None 54 | 55 | 56 | def get_latency(latency_logs, latency_method): 57 | ''' 58 | This function computes the latency from the profiling latency 59 | text files, using the latency_method specified 60 | ''' 61 | df = pd.concat([pd.read_csv(filename, skiprows=4) 62 | for filename in latency_logs]) 63 | col = df.columns[-3] # Execution Total Time in microseconds 64 | latency_ms = get_metric(df[col], latency_method)/1000.0 65 | return latency_ms 66 | 67 | 68 | if __name__ == "__main__": 69 | if len(sys.argv) < 3: 70 | print("Syntax: python parse_latency_and_throughput.py ") 71 | print("where is 'mean', 'avg', or 'Kpct', where K is a number between 0 to 100") 72 | print(" should include full path to the model folder where 'outputFiles' and log files are located") 73 | sys.exit() 74 | 75 | latency_method = sys.argv[1] 76 | if (latency_method not in ['mean', 'avg']) and (not latency_method.endswith('pct')): 77 | raise ValueError(f"Methods supported are mean/avg or pct, received {latency_method}") 78 | model_names = sys.argv[2:] 79 | print(model_names) 80 | 81 | # parse the logs and print the latency and throughput 82 | for config in ['best-throughput', 'balanced', 'best-latency']: 83 | 84 | print("******************************************************************") 85 | print(f"*** Latency: {config} configurations **************************") 86 | print("******************************************************************") 87 | for model in model_names: 88 | config_folders = 
glob(f"{model}/outputFiles/fp16*{config}") 89 | print(f"{model}: Found {len(config_folders)} {config} configurations") 90 | if len(config_folders) == 0: 91 | continue 92 | latency_logs = glob(f"{config_folders[0]}/*latency.txt") 93 | print(f"Model: {model}: Latency ({latency_method}) = {get_latency(latency_logs, latency_method):.3f} ms") 94 | 95 | print("******************************************************************") 96 | print(f"*** Throughput: {config} configurations *************************") 97 | print("******************************************************************") 98 | for model in model_names: 99 | log_file = f"{model}/{config}.log" 100 | if not os.path.exists(log_file): 101 | print("Model: {model}: {log_file} does not exist") 102 | continue 103 | with open(log_file, 'r') as fid: 104 | throughput = np.double([line.split()[-1] 105 | for line in fid.read().splitlines() 106 | if 'Inf/Sec' in line][-1]) 107 | print(f"Model: {model}: Throughput = {throughput:.3f} inf/sec") 108 | print("******************************************************************") 109 | -------------------------------------------------------------------------------- /models/vision/detection/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | --- 3 | 4 | Download the yolov5, and yolov7 models, prepare for the Qualcomm AIC100, compile for high-thoughput, min-latency, or balanced throughput with fp16 precision, run the model on a generated random sample, and obtain the benchmarking results and output values. 5 | 6 | ## Source of the models 7 | --- 8 | The models are downloaded from (https://github.com/ultralytics/yolov5). This script has been tested for the following requested models: 9 | * yolov5s 10 | * yolov5m 11 | * yolov5l 12 | * yolov5x 13 | * yolov7-e6e 14 | * yolov8m 15 | 16 | ## Virtual environment 17 | --- 18 | For a quick environment setup: 19 | 20 | ```commandline 21 | python3.10 -m venv det_env 22 | source det_env/bin/activate 23 | 24 | ``` 25 | 26 | ## Framework and version 27 | --- 28 | ```commandline 29 | pip3 install -r requirements.txt 30 | 31 | ``` 32 | ## Syntax 33 | --- 34 | Copy the run_yolo_model.py and the lut_yolo_models.csv to a working directory. Pick a MODEL_NAME from the list above, and type: 35 | 36 | ```commandline 37 | 38 | usage: run_yolo_model.py [-h] --model-name {yolov5s,yolov5m,yolov5l,yolov5x,yolov7-e6e,yolov8m} 39 | [--objective {best-latency,best-throughput,balanced}] 40 | [--opset OPSET] 41 | [--batch-size BATCH_SIZE] 42 | [--image-size IMAGE_SIZE] 43 | [--cores {1,2,3,4,5,6,7,8,9,10,11,12,13,14}] 44 | [--instances {1,2,3,4,5,6,7,8,9,10,11,12,13,14}] 45 | [--ols {1,2,3,4,5,6,7,8,9,10,11,12,13,14}] 46 | [--mos {1,2,3,4,5,6,7,8,9,10,11,12,13,14}] 47 | [--set-size {1,2,3,4,5,6,7,8,9,10}] 48 | [--extra EXTRA] 49 | [--time TIME] 50 | [--device {0,1,2,3,4,5,6,7}] 51 | [--run-only] 52 | 53 | 54 | 55 | Download, Compile, and Run YOLO models on randomly generated inputs. 56 | 57 | 58 | optional arguments: 59 | -h, --help show this help message and exit 60 | --model-name, -m {yolov5s,yolov5m,yolov5l,yolov5x,yolov7-e6e,yolov8m} 61 | Model name to download. 62 | --objective, -o {best-latency,best-throughput,balanced} 63 | Running for best-latency, best-throughput, or balanced 64 | --opset OPSET ONNX opset. Default <12> 65 | --batch-size, -b BATCH_SIZE 66 | Sample input batch size. Default <1>. 67 | --image-size, -s IMAGE_SIZE 68 | Sample input image width/height. Default <640>. 
69 | --cores, -c {1,2,3,4,5,6,7,8,9,10,11,12,13,14} 70 | Number of AIC100 cores to compile the model for. Default <2> 71 | --instances, -i {1,2,3,4,5,6,7,8,9,10,11,12,13,14} 72 | Number of model instances to run on AIC100. Default <7> 73 | --ols {1,2,3,4,5,6,7,8,9,10,11,12,13,14} 74 | Overlap split factor. Default <1> 75 | --mos {1,2,3,4,5,6,7,8,9,10,11,12,13,14} 76 | Maximum output channel split. Default 77 | --set-size {1,2,3,4,5,6,7,8,9,10} 78 | Set size. Default <10> 79 | --extra EXTRA Extra compilation arguments. 80 | --time TIME Duration (in seconds) for which to submit inferences. Default <20> 81 | --device, -d {0,1,2,3,4,5,6,7} 82 | AIC100 device ID. Default <0> 83 | --run-only, -r Performs the inference only, without re-exporting and re-compiling the model 84 | --include-nms Run the model preparator tool to optimize the graph, and to add the Post Processing to supported models. Details on model preparator tool here- https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Inference-Workflow/Export-the-model/Prepare-the-model/ 85 | 86 | 87 | ``` 88 | Examples: 89 | ```commandline 90 | python run_yolo_model.py -m yolov5s -o best-throughput 91 | ``` 92 | ```commandline 93 | python run_yolo_model.py -m yolov5m -o balanced 94 | ``` 95 | ```commandline 96 | python run_yolo_model.py -m yolov5x -o best-throughput 97 | ``` 98 | 99 | The hardware configuration will be either associated to the corresponding row in the lut_yolo_models.csv or to defualt values if not specified by the user. If the MODEL_NAME is not included in the lut_yolo_models.csv, default values will be used. 100 | 101 | After download, compile, and run is complete, the working directory of the selected model looks as follows. 102 | # Working directory structure 103 | ``` 104 | |── model # Contains the onnx file of the picked model 105 | | └── model.onnx # The onnx file of the picked model 106 | |── inputFiles # Contains the (randomly generated) input files of the compiled model 107 | │ └── input_img*.raw # Randomly generated input files of the compiled model 108 | |── outputFiles # Contains the corresponding output to input, as well as the hardware profiling for latency 109 | │ └── fp16* 110 | │ └── output-act*.bin # Corresponding output to the randomly generated input_img*.raw 111 | │ └── aic-profil*.bin # The hardware profiling for round trip latency between host and device for each inference 112 | ├── compiled-bin* # Compilation path 113 | │ └── programqpc.bin # For the selected objective, the model.onnx is compiled into programqpc.bin 114 | ├── list*.txt # A list that contains path to the inputs. Can be used as input to qaic-runner 115 | ├── commands*.txt # Includes necessary compilation and running scripts to reproduce the results manually. 116 | 117 | ``` 118 | To manually resproduce the results, navigate to the working directory, select the qaic compile/run commands from the command*.txt and run them in the terminal. 119 | -------------------------------------------------------------------------------- /utils/multi-device/README.md: -------------------------------------------------------------------------------- 1 | # Multi Device 2 | 3 | This guide provides setup instructions for multi-device enablement. PCIe peer-to-peer P2P communication must be enabled to allow efficient tensor slicing across multiple Cloud AI devices (SoCs and Cards). 
4 | 5 | Refer to [Model Sharding](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Features/model_sharding/) for more information on recommended PCIe topologies for tensor slicing (P2P). 6 | 7 | ## Pre-requisites 8 | 9 | - Server with Platform and APPS SDK versions >= 1.17 installed. 10 | - PCIe switch for inter-card P2P communication 11 | - python3 -m pip install pyudev 12 | 13 | ## Setup instructions 14 | 15 | Platform SDK 1.18 and later offers an option (`--setup_mdp all`) to enable P2P for the multi-device partitioning (tensor slicing) feature during installation. 16 | 17 | Example: 18 | 19 | ``` 20 | cd /x86_64/deb 21 | sudo bash install.sh --setup_mdp all 22 | ``` 23 | 24 | > [!IMPORTANT] 25 | > If P2P is enabled via the Platform SDK installer then skip to the [Testing P2P](#testing-p2p) section. 26 | > 27 | > The remaining steps in this section show manual steps for enabling P2P. 28 | 29 | ### Disable PCIe ACS for P2P communication between cards 30 | 31 | 1. Run `QAicChangeAcs.py` without any flags to display a hierarchical view of PCI bridges and AI 100 devices. 32 | 33 | ``` 34 | $ python3 QAicChangeAcs.py 35 | Found the following AIC100 devices: 36 | Root 37 | ----0000:30:01.1 <-- Host system PCIe switch, script will disable ACS here 38 | --------0000:31:00.0 <-- Ultra AI 100 onboard PCIe switch, script will disable ACS here 39 | ------------0000:32:03.0 40 | ----------------0000:36:00.0 [Qualcomm AIC100] 41 | ------------0000:32:02.0 42 | ----------------0000:35:00.0 [Qualcomm AIC100] 43 | ------------0000:32:00.0 44 | ----------------0000:38:00.0 [Qualcomm AIC100] 45 | ------------0000:32:01.0 46 | ----------------0000:39:00.0 [Qualcomm AIC100] 47 | --------0000:21:00.0 <-- Ultra AI 100 onboard PCIe switch, script will disable ACS here 48 | ------------0000:22:00.0 49 | ----------------0000:23:00.0 [Qualcomm AIC100] 50 | ------------0000:22:02.0 51 | ----------------0000:25:00.0 [Qualcomm AIC100] 52 | ------------0000:22:01.0 53 | ----------------0000:27:00.0 [Qualcomm AIC100] 54 | ------------0000:22:03.0 55 | ----------------0000:28:00.0 [Qualcomm AIC100] 56 | ``` 57 | 58 | 2. Run `QAicChangeAcs.py all` to disable ACS on all the downstream ports (on the PCIe switch) that connect to AI 100 devices, as well as the PCIe switch downstream ports that connect to the PCIe switch onboard the AI 100 cards. This command will enable P2P between the AI 100 devices (SoCs) on the same card as well as card to card. 59 | 60 | 3. Users can optionally disable ACS selectively by running `QAicChangeAcs.py SSSS:BB:DD.F`, where 61 | - SSSS = 4-digit segment number 62 | - BB = 2-digit bus number 63 | - DD = 2-digit device number 64 | - F = 1-digit function number 65 | 66 | of the nearest common ancestor PCI bridge under which ACS needs to be disabled. 67 | 68 | Examples: 69 | 70 | `$ python3 QAicChangeAcs.py 0000:31:00.0` will disable ACS on the first set of AI 100 devices (0000:36:00.0, 0000:35:00.0, 0000:38:00.0 and 0000:39:00.0).
71 | `$ python3 QAicChangeAcs.py 0000:30:01.1` will disable ACS across both the AI 100 Ultra cards as well as the 4 devices in each AI 100 card
72 | 73 | 4. The above steps need to be repeated after every server power cycle. 74 | 75 | 76 | ### Enable multi-device partitioning (MDP) 77 | 78 | This step is required every time a new version of the Platform SDK is installed. 79 | 80 | First, check that the Qaic Monitor service is running: 81 | ``` 82 | sudo systemctl status qmonitor-proxy 83 | ``` 84 | 85 | If it is not active (running), start it with: 86 | ``` 87 | sudo systemd-run --unit=qmonitor-proxy /opt/qti-aic/tools/qaic-monitor-grpc-server 88 | ``` 89 | 90 | Next, enable MDP across all Cloud AI devices in the server: 91 | ``` 92 | sudo /opt/qti-aic/tools/qaic-monitor-json -i enable_mdp.json 93 | ``` 94 | 95 | Reset the Cloud AI devices for the changes to take effect: 96 | ``` 97 | sudo /opt/qti-aic/tools/qaic-util -s 98 | ``` 99 | 100 | ## Testing P2P 101 | 102 | The Qaic Kernel driver requires a longer response timeout for P2P workloads. Use the following command to increase the timeout: 103 | ``` 104 | sudo sh -c 'echo 2600 > /sys/module/qaic/parameters/control_resp_timeout_s' 105 | ``` 106 | 107 | Synthetic P2P workloads are available in `/opt/qti-aic/test-data/aic100/v2/qaic-compute-networks/factory-workload-bin`. 108 | 109 | ### Multi-SoC Accelerators (Ultra) P2P tests 110 | 111 | ``` 112 | # P2P between 2 SoCs with QID 0 and 1 on the same card 113 | sudo /opt/qti-aic/exec/qaic-runner -t /opt/qti-aic/test-data/aic100/v2/qaic-compute-networks/factory-workload-bin/2c-p2p-bw -n 10 -a 1 -l -D 0:1 114 | 115 | # P2P between 2 SoCs with QID 0 and 4 on different cards. Choose cards that are on the same PCIe switch. 116 | sudo /opt/qti-aic/exec/qaic-runner -t /opt/qti-aic/test-data/aic100/v2/qaic-compute-networks/factory-workload-bin/2c-p2p-bw -n 10 -a 1 -l -D 0:4 117 | ``` 118 | 119 | ### Single-SoC Accelerators (Standard/Pro) P2P tests 120 | 121 | ``` 122 | # P2P between 2 SoCs with QID 0 and 1 on different cards. Choose cards that are on the same PCIe switch. 123 | sudo /opt/qti-aic/exec/qaic-runner -t /opt/qti-aic/test-data/aic100/v2/qaic-compute-networks/factory-workload-bin/2c-p2p-bw -n 10 -a 1 -l -D 0:1 124 | ``` 125 | 126 | ### Troubleshooting 127 | If a `Failed to access P2P device` error occurs, check the following: 128 | 1. Re-check the enablement instructions above 129 | 2. Review the PCIe topology from the QAicChangeAcs.py script to make sure that a host PCIe switch is present 130 | -------------------------------------------------------------------------------- /utils/qaic-bench/README.md: -------------------------------------------------------------------------------- 1 | # qaic-bench 2 | 3 | Benchmarking script for Cloud AI Inference accelerators. 4 | 5 | ## Installation for x86_64 6 | 7 | Download the Cloud AI Docker image: 8 | 9 | ``` 10 | docker pull ghcr.io/quic/cloud_ai_inference_ubuntu22:1.20.4.0 11 | ``` 12 | 13 | Start the container. This example maps 4 Cloud AI 100 Ultra Accelerators. Each accelerator has 4 SoC devices. 14 | 15 | Note: For QPC generation, choose a `/cache` location with 1TB or more of free space to hold model weights, ONNX files, and QPC model binaries. 16 | 17 | Note: Run `docker container rm qaic-bench` to clean up after exiting the container. 
18 | 19 | ``` 20 | cd utils/qaic-bench 21 | 22 | docker run -it \ 23 | --workdir /app \ 24 | --name qaic-bench \ 25 | --network host \ 26 | --mount type=bind,source=${PWD},target=/app \ 27 | --mount type=bind,source=${HOME}/.cache,target=/cache \ 28 | --env HF_HOME='/cache/huggingface' \ 29 | --env QEFF_HOME='/cache/qeff_models' \ 30 | --env XDG_CACHE_HOME='/cache' \ 31 | --device=/dev/accel/accel0 \ 32 | --device=/dev/accel/accel1 \ 33 | --device=/dev/accel/accel2 \ 34 | --device=/dev/accel/accel3 \ 35 | --device=/dev/accel/accel4 \ 36 | --device=/dev/accel/accel5 \ 37 | --device=/dev/accel/accel6 \ 38 | --device=/dev/accel/accel7 \ 39 | --device=/dev/accel/accel8 \ 41 | --device=/dev/accel/accel9 \ 42 | --device=/dev/accel/accel10 \ 43 | --device=/dev/accel/accel11 \ 44 | --device=/dev/accel/accel12 \ 45 | --device=/dev/accel/accel13 \ 46 | --device=/dev/accel/accel14 \ 47 | --device=/dev/accel/accel15 \ 48 | ghcr.io/quic/cloud_ai_inference_ubuntu22:1.20.4.0 49 | ``` 50 | 51 | Activate vLLM environment:
52 | 53 | ``` 54 | source /opt/vllm-env/bin/activate 55 | ``` 56 | 57 | ## Installation for AArch64 58 | 59 | Follow instructions [here](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/vLLM/vLLM/index.html#installing-from-source) to setup the vLLM environment for Cloud AI from source for AArch64. 60 | 61 | Activate vLLM environment:
62 | 63 | ``` 64 | source qaic-vllm-venv/bin/activate 65 | ``` 66 | 67 | ## KV-Heads Replication 68 | 69 | Download KV-Heads Replication script from Efficient Transformers. This is needed to efficiently tensor-slice large models across 16 SoCs. 70 | 71 | ``` 72 | wget https://github.com/quic/efficient-transformers/raw/refs/heads/release/v1.19.3_fp8_update/scripts/replicate_kv_head/replicate_kv_heads.py 73 | ``` 74 | 75 | ## Multi-Device Operation 76 | 77 | To run models across multiple AI 100 devices, make sure tensor slicing is enabled with: 78 | 79 | ``` 80 | sudo /opt/qti-aic/tools/qaic-util -a 81 | ``` 82 | 83 | The control response timeout must also be extended: 84 | 85 | ``` 86 | sudo sh -c 'echo 2600 > /sys/module/qaic/parameters/control_resp_timeout_s' 87 | ``` 88 | 89 | More details at: https://github.com/quic/cloud-ai-sdk/tree/1.20/utils/multi-device 90 | 91 | ## Hugging Face Access Token 92 | 93 | Some models on Hugging Face are access protected. Add your access token with the `--hf_token` script argument or set the `HF_TOKEN` environment variable. Learn more about Authentication here: https://huggingface.co/docs/huggingface_hub/en/quick-start#authentication. 94 | 95 | ## Usage 96 | 97 | Example: 98 | 99 | ``` 100 | python3 qaic_bench.py config/config_llama_3_1_8b.json 101 | ``` 102 | 103 | Details: 104 | 105 | ``` 106 | usage: qaic_bench.py [-h] [--devices DEVICES] [--compile-only] config 107 | 108 | positional arguments: 109 | config JSON file with model configurations 110 | 111 | options: 112 | -h, --help show this help message and exit 113 | --devices DEVICES List of comma separated device IDs to use for inferencing 114 | --compile-only Generate QPCs and skip benchmarking 115 | --hf_token Hugging Face access token 116 | ``` 117 | 118 | ## Configuration 119 | 120 | ### Example 121 | 122 | ``` 123 | { 124 | "vllm_root": "/opt/qti-aic/integrations/vllm", 125 | 126 | "models": [ 127 | { 128 | "name": "Meta-Llama-3.1-8B-Instruct", 129 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 130 | "configs": [ 131 | { 132 | "batch_size": 1, 133 | "devices": 4, 134 | "prompt_len": 4096, 135 | "generation_len": 4096 136 | } 137 | ] 138 | } 139 | ] 140 | } 141 | ``` 142 | 143 | ### JSON Reference 144 | 145 | | Property | Description | 146 | | -------------- | ----------------------------------------- | 147 | | vllm_root | Path to full vLLM installation | 148 | | models | List of models to benchmark | 149 | 150 | ### Model Properties 151 | 152 | | Property | Description | 153 | | -------------- | ----------------------------------------- | 154 | | name | Model friendly name | 155 | | model | Hugging Face model path | 156 | | configs | List of model configurations to benchmark | 157 | 158 | ### Config Properties 159 | 160 | | Property | Description | 161 | | ---------------- | ----------------------------------------- | 162 | | batch_size | Model batch size. | 163 | | devices | Number of Cloud AI SoCs for tensor-sliced execution. Set to 1 for single-SoC execution. | 164 | | cores (optional) | Number of AI Cores for compilation. Default 16. | 165 | | prompt_len | Prompt input length | 166 | | generation_len | Max number of output tokens to generate | 167 | | qpc (optional) | Path to pre-generated QPC binary. If not specified, QPC will be generated. 
| 168 | -------------------------------------------------------------------------------- /tutorials/open-webui/README.md: -------------------------------------------------------------------------------- 1 | # Connecting Cloud AI models to Open WebUI 2 | 3 | [Open WebUI](https://github.com/open-webui/open-webui) is a self-hosted web interface for AI use-cases like Chat, Image Generation and RAG. 4 | By starting OpenAI-compatible endpoints with vLLM, we can connect Open WebUI to AI models running on Qualcomm Cloud AI accelerators. 5 | 6 |

7 | ![Open WebUI Chat](open_webui_screen_1.png) 8 |

9 | 10 | ## Pre-requisites 11 | 12 | * Cloud AI Platform and Apps SDKs [Installation](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/Cloud-AI-SDK/Cloud-AI-SDK/index.html) 13 | * Cloud AI 100 Ultra accelerator card 14 | * Python 3.10 15 | * Docker 16 | 17 | To run language models on multiple SoCs, make sure [tensor slicing](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Features/model_sharding/index.html) is enabled and disable ACS: 18 | 19 | ``` 20 | sudo /opt/qti-aic/tools/qaic-util -a 21 | ``` 22 | 23 | Increase the response timeout: 24 | ``` 25 | sudo sh -c "echo 2600 > /sys/module/qaic/parameters/control_resp_timeout_s" 26 | ``` 27 | 28 | Preface all docker commands with `sudo`, or add yourself to the docker group: 29 | ``` 30 | sudo usermod -aG docker $USER 31 | ``` 32 | 33 | Launch a new shell or `newgrp docker` to apply the changes. 34 | 35 | ## Prepare the model 36 | 37 | Use [Efficient Transformers](https://github.com/quic/efficient-transformers) to prepare popular models like Llama-3.3-70B-Instruct, Qwen2.5-Coder and Phi4, or download pre-generated model binaries at http://qualcom-qpc-models.s3-website-us-east-1.amazonaws.com/QPC/. Note the location of the 'programqpc.bin' files as you'll need these to start vLLM. Efficient-transformers stores model binaries in [~/.cache/qeff_cache](https://quic.github.io/efficient-transformers/source/quick_start.html#transformed-models-and-qpc-storage) by default. 38 | 39 | ## Cloud AI Inference Container 40 | 41 | [Cloud AI Inference containers](https://github.com/quic/cloud-ai-containers/pkgs/container/cloud_ai_inference_ubuntu22) include everything needed to compile and serve models with vLLM on Cloud AI accelerators. 42 | 43 | Download the Docker image: 44 | 45 | ``` 46 | docker pull ghcr.io/quic/cloud_ai_inference_ubuntu22:1.20.4.0 47 | ``` 48 | 49 | ## Start vLLM endpoint 50 | 51 | Prepare a script to launch vLLM with the pre-generated model binary inside the container. 52 | 53 | Customize the Hugging face model name (`--model`), context length (`--max-model-len`), prompt length (`max-seq_len-to-capture`) and full batch size (`max-num-seq`) to match the QPC from the 'Prepare the Model' step above. 54 | 55 | ``` 56 | $ cat < serve.sh 57 | #!/bin/bash 58 | /opt/vllm-env/bin/python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4 --max-model-len 4096 --max-num-seq 1 --max-seq_len-to-capture 128 --device qaic --device-group 0,1,2,3 59 | EOF 60 | 61 | # Script must have execute permission 62 | $ chmod +x serve.sh 63 | ``` 64 | 65 | Note: Change `/path/to/qpc` to the QPC location from the 'Prepare the model' step above. 66 | If your system has multiple Ultra cards, you can change the `--device` arguments to map a different card. 67 | This example creates a `qaic-vllm` Docker volume to hold persistent data (namely the tokenizer weights downloaded from Hugging face). 
68 | 69 | ``` 70 | docker run -dit \ 71 | --workdir /model \ 72 | --name qaic-vllm \ 73 | --network host \ 74 | --mount type=bind,source=${PWD}/serve.sh,target=/model/serve.sh \ 75 | --mount type=bind,source=/path/to/qpc,target=/model/qpc \ 76 | -v qaic-vllm:/model/data \ 77 | --env VLLM_QAIC_MAX_CPU_THREADS=8 \ 78 | --env VLLM_QAIC_QPC_PATH=/model/qpc \ 79 | --env HF_HOME=/model/data/huggingface \ 80 | --env QEFF_HOME=/model/data/qeff_models \ 81 | --device=/dev/accel/accel0 \ 82 | --device=/dev/accel/accel1 \ 83 | --device=/dev/accel/accel2 \ 84 | --device=/dev/accel/accel3 \ 85 | --entrypoint=/model/serve.sh \ 86 | ghcr.io/quic/cloud_ai_inference_ubuntu22:1.20.4.0 87 | ``` 88 | 89 | ## Test the endpoint 90 | 91 | ``` 92 | curl http://localhost:8000/v1/chat/completions \ 93 | -H "Content-Type: application/json" \ 94 | -H "Authorization: Bearer test-key" \ 95 | -d '{ 96 | "model": "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4", 97 | "messages": [ 98 | { 99 | "role": "system", 100 | "content": "You are a helpful AI assistant." 101 | }, 102 | { 103 | "role": "user", 104 | "content": "Hello!" 105 | } 106 | ] 107 | }' 108 | ``` 109 | 110 | ## Start Open WebUI 111 | 112 | Download Open WebUI Docker image: 113 | 114 | ``` 115 | docker pull ghcr.io/open-webui/open-webui:main 116 | ``` 117 | 118 | Refer to [setup instructions](https://docs.openwebui.com/getting-started/quick-start/#quick-start-with-docker-) for more details. 119 | 120 | Run the Open WebUI container: 121 | 122 | ``` 123 | docker run \ 124 | -d \ 125 | --network host \ 126 | -e OPENAI_API_KEY=test-key \ 127 | -e OPENAI_API_BASE_URL="http://localhost:8000/v1" \ 128 | -v open-webui:/app/backend/data \ 129 | --name open-webui \ 130 | --restart always \ 131 | ghcr.io/open-webui/open-webui:main 132 | ``` 133 | 134 | In web browser, open http://:8080 135 | 136 | Setup: 137 | * For first time startup, create a default user. This user will have admin access. 138 | * Click Profile icon in upper right and open Admin Panel -> Settings -> Connections. 139 | * Click Configure icon for Manage OpenAI API Connections. 140 | * Make sure URL is http://localhost:8000/v1. Key can be any value 141 | * Click Verify Connection icon to test the connection. 142 | * You should see a "Server Connection Verified" pop-up 143 | * If it fails, double-check that the server.py script is running 144 | * Back on the Open WebUI home page, select the model name from the 'Prepare the model' step above. 145 | 146 | Open WebUI Setup 147 | 148 | You can now use the Chat interface in Open WebUI. -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/patches/attention_patch.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py 2 | index e2ab160..6036c3a 100644 3 | --- a/src/diffusers/models/attention_processor.py 4 | +++ b/src/diffusers/models/attention_processor.py 5 | @@ -11,6 +11,10 @@ 6 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7 | # See the License for the specific language governing permissions and 8 | # limitations under the License. 9 | +# 10 | +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
11 | +# SPDX-License-Identifier: BSD-3-Clause-Clear 12 | +# Not a Contribution 13 | import inspect 14 | import math 15 | from typing import Callable, List, Optional, Tuple, Union 16 | @@ -258,9 +262,7 @@ class Attention(nn.Module): 17 | # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention 18 | # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1 19 | if processor is None: 20 | - processor = ( 21 | - AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor() 22 | - ) 23 | + processor = AttnProcessor() 24 | self.set_processor(processor) 25 | 26 | def set_use_npu_flash_attention(self, use_npu_flash_attention: bool) -> None: 27 | @@ -560,7 +562,7 @@ class Attention(nn.Module): 28 | 29 | if attention_mask is None: 30 | baddbmm_input = torch.empty( 31 | - query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device 32 | + query.shape[0], query.shape[1], key.shape[2], dtype=query.dtype, device=query.device 33 | ) 34 | beta = 0 35 | else: 36 | @@ -570,7 +572,7 @@ class Attention(nn.Module): 37 | attention_scores = torch.baddbmm( 38 | baddbmm_input, 39 | query, 40 | - key.transpose(-1, -2), 41 | + key, 42 | beta=beta, 43 | alpha=self.scale, 44 | ) 45 | @@ -764,8 +766,25 @@ class AttnProcessor: 46 | key = attn.head_to_batch_dim(key) 47 | value = attn.head_to_batch_dim(value) 48 | 49 | - attention_probs = attn.get_attention_scores(query, key, attention_mask) 50 | - hidden_states = torch.bmm(attention_probs, value) 51 | + key = key.transpose(-1, -2) 52 | + if query.size(-2) != value.size(-2): # cross-attention, use regular attention 53 | + # QKV done in single block 54 | + attention_probs = attn.get_attention_scores(query, key, attention_mask) 55 | + hidden_states = torch.bmm(attention_probs, value) 56 | + else: # self-attention, use blocked attention 57 | + # QKV done with block-attention (a la FlashAttentionV2) 58 | + print(f"{query.shape = }, {key.shape = }, {value.shape = }") 59 | + query_block_size = 64 60 | + query_seq_len = query.size(-2) 61 | + num_blocks = (query_seq_len + query_block_size - 1) // query_block_size 62 | + for qidx in range(num_blocks): 63 | + query_block = query[:,qidx*query_block_size:(qidx+1)*query_block_size,:] 64 | + attention_probs = attn.get_attention_scores(query_block, key, attention_mask) 65 | + hidden_states_block = torch.bmm(attention_probs, value) 66 | + if qidx == 0: 67 | + hidden_states = hidden_states_block 68 | + else: 69 | + hidden_states = torch.cat((hidden_states, hidden_states_block), -2) 70 | hidden_states = attn.batch_to_head_dim(hidden_states) 71 | 72 | # linear proj 73 | @@ -1075,15 +1094,31 @@ class JointAttnProcessor2_0: 74 | key = torch.cat([key, encoder_hidden_states_key_proj], dim=1) 75 | value = torch.cat([value, encoder_hidden_states_value_proj], dim=1) 76 | 77 | - inner_dim = key.shape[-1] 78 | - head_dim = inner_dim // attn.heads 79 | - query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 80 | - key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 81 | - value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 82 | + query = attn.head_to_batch_dim(query) 83 | + key = attn.head_to_batch_dim(key) 84 | + value = attn.head_to_batch_dim(value) 85 | 86 | - hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False) 87 | - hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * 
head_dim) 88 | - hidden_states = hidden_states.to(query.dtype) 89 | + # pre-transpose the key 90 | + key = key.transpose(-1, -2) 91 | + if query.size(-2) != value.size(-2): # cross-attention, use regular attention 92 | + # QKV done in single block 93 | + attention_probs = attn.get_attention_scores(query, key, attention_mask) 94 | + hidden_states = torch.bmm(attention_probs, value) 95 | + else: # self-attention, use blocked attention 96 | + # QKV done with block-attention (a la FlashAttentionV2) 97 | + print(f"{query.shape = }, {key.shape = }, {value.shape = }") 98 | + query_block_size = 64 99 | + query_seq_len = query.size(-2) 100 | + num_blocks = (query_seq_len + query_block_size - 1) // query_block_size 101 | + for qidx in range(num_blocks): 102 | + query_block = query[:,qidx*query_block_size:(qidx+1)*query_block_size,:] 103 | + attention_probs = attn.get_attention_scores(query_block, key, attention_mask) 104 | + hidden_states_block = torch.bmm(attention_probs, value) 105 | + if qidx == 0: 106 | + hidden_states = hidden_states_block 107 | + else: 108 | + hidden_states = torch.cat((hidden_states, hidden_states_block), -2) 109 | + hidden_states = attn.batch_to_head_dim(hidden_states) 110 | 111 | # Split the attention outputs. 112 | hidden_states, encoder_hidden_states = ( 113 | --------------------------------------------------------------------------------