├── models
│   ├── multimodal
│   │   └── text_to_image
│   │       ├── sdxl_turbo
│   │       │   ├── onnx_generation
│   │       │   │   ├── __init__.py
│   │       │   │   └── onnx_gen_utils.py
│   │       │   ├── requirements.txt
│   │       │   ├── utils.py
│   │       │   ├── run_config_gen.sh
│   │       │   ├── run_config_inference.sh
│   │       │   ├── model.py
│   │       │   ├── server.py
│   │       │   ├── patches
│   │       │   │   └── attention_patch.patch
│   │       │   └── README.md
│   │       ├── sdxl_deepcache
│   │       │   ├── onnx_generation
│   │       │   │   ├── __init__.py
│   │       │   │   └── onnx_gen_utils.py
│   │       │   ├── requirements.txt
│   │       │   ├── utils.py
│   │       │   ├── README.md
│   │       │   ├── run_config_deep.sh
│   │       │   ├── run_config_shallow.sh
│   │       │   ├── patches
│   │       │   │   ├── attention_patch.patch
│   │       │   │   └── deepcache_unet.patch
│   │       │   └── run_config_inference.sh
│   │       ├── stable-diffusion-3.5-medium
│   │       │   ├── onnx_generation
│   │       │   │   ├── __init__.py
│   │       │   │   └── onnx_gen_utils.py
│   │       │   ├── requirements.txt
│   │       │   ├── utils.py
│   │       │   ├── README.md
│   │       │   ├── run_config_inference.sh
│   │       │   ├── run_config_gen.sh
│   │       │   ├── patches
│   │       │   │   ├── transformer_patch.patch
│   │       │   │   └── attention_patch.patch
│   │       │   └── model.py
│   │       ├── DeciDiffusion-v2-0
│   │       │   └── README.md
│   │       ├── stable-diffusion-v1-5
│   │       │   └── README.md
│   │       └── stable-diffusion-xl-base-1.0
│   │           ├── requirements.txt
│   │           ├── fix_vae_decoder_onnx.py
│   │           ├── onnx_gen_utils.py
│   │           ├── attention_patch.patch
│   │           └── compile_models.sh
│   ├── vision
│   │   ├── classification
│   │   │   └── requirements.txt
│   │   └── detection
│   │       ├── requirements.txt
│   │       ├── lut_yolo_models.csv
│   │       └── README.md
│   ├── language_processing
│   │   ├── decoder
│   │   │   ├── MptForCausalLM
│   │   │   │   └── README.md
│   │   │   ├── DeciCoder-6b
│   │   │   │   ├── requirements.txt
│   │   │   │   ├── init.sh
│   │   │   │   ├── specializations_template.json
│   │   │   │   ├── compileModel.sh
│   │   │   │   └── README.md
│   │   │   ├── LlamaForCausalLM
│   │   │   │   └── README.md
│   │   │   ├── README.md
│   │   │   ├── GPTBigCodeForCausalLM
│   │   │   │   └── README.md
│   │   │   └── CodeGen-With-Speculative-Decoding
│   │   │       └── README.md
│   │   └── encoder
│   │       ├── requirements.txt
│   │       ├── model.py
│   │       └── server.py
│   └── speech
│       └── whisper
│           ├── requirements.txt
│           ├── audio.py
│           ├── README.md
│           ├── generateModel.py
│           └── runModel.py
├── images
│   └── Cloud_AI_100.png
├── tutorials
│   ├── NLP
│   │   ├── Model-Onboarding-Beginner
│   │   │   ├── bert-base-cased-config.yaml
│   │   │   ├── distilbert-base-cased-distilled-squad-config.yaml
│   │   │   ├── Images
│   │   │   │   └── Workflow.jpg
│   │   │   └── requirements.txt
│   │   ├── Profiler-Intermediate
│   │   │   ├── images
│   │   │   │   ├── Latency.png
│   │   │   │   ├── opstats_decoder.png
│   │   │   │   ├── opstats_example.png
│   │   │   │   └── operator_details.png
│   │   │   └── requirements.txt
│   │   └── Performance-Tuning-Beginner
│   │       ├── Images
│   │       │   └── Latency.jpg
│   │       ├── requirements.txt
│   │       ├── bert_base_dopt.json
│   │       └── bert_base_dopt_min_latency.json
│   ├── open-webui
│   │   ├── open_webui_screen_1.png
│   │   ├── open_webui_screen_2.png
│   │   ├── serve.sh
│   │   ├── open_webui.sh
│   │   ├── vllm_container.sh
│   │   └── README.md
│   ├── Playground
│   │   ├── images
│   │   │   └── qualcomm_cloud_ai_playground.png
│   │   └── README.md
│   ├── Computer-Vision
│   │   ├── Perfomance-Tuning-Beginner
│   │   │   ├── Images
│   │   │   │   └── Latency.jpg
│   │   │   ├── requirements.txt
│   │   │   ├── resnet_base_dopt_min_latency.json
│   │   │   └── resnet_base_dopt_throughput.json
│   │   └── DETR
│   │       └── README.md
│   ├── efficient_transformers
│   │   └── README.md
│   └── README.md
├── samples
│   ├── python
│   │   ├── qaic_features
│   │   │   ├── resnet_config.yaml
│   │   │   ├── benchmarking_eg.py
│   │   │   ├── metrics_eg.py
│   │   │   ├── profiling_eg.py
│   │   │   └── README.md
│   │   ├── vit_qaic
│   │   │   ├── vit_config.yaml
│   │   │   └── example.py
│   │   ├── requirements.txt
│   │   ├── README.md
│   │   ├── aws_ai100_benchmarking
│   │   │   ├── yolo_models
│   │   │   │   ├── lut_yolo_models.csv
│   │   │   │   └── README.md
│   │   │   ├── cv_classifiers
│   │   │   │   └── run_cv_classifiers.sh
│   │   │   └── parse_latency_and_throughput.py
│   │   └── common_utils.py
│   └── cpp
│       └── cpp_qpc_inference
│           ├── CMakeLists.txt
│           └── Readme.md
├── utils
│   ├── multi-device
│   │   ├── enable_mdp.json
│   │   └── README.md
│   ├── qaic-bench
│   │   ├── config
│   │   │   ├── config_tiny_llama.json
│   │   │   ├── config_llama_3_1_8b.json
│   │   │   └── config_dl2q.json
│   │   └── README.md
│   └── README.md
├── CONTRIBUTING.md
├── LICENSE
└── CODE-OF-CONDUCT.md
/models/multimodal/text_to_image/sdxl_turbo/onnx_generation/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_deepcache/onnx_generation/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/stable-diffusion-3.5-medium/onnx_generation/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/images/Cloud_AI_100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/images/Cloud_AI_100.png
--------------------------------------------------------------------------------
/tutorials/NLP/Model-Onboarding-Beginner/bert-base-cased-config.yaml:
--------------------------------------------------------------------------------
1 | # Inference Parameters
2 | num_activations: 2
3 | set_size: 1
4 |
--------------------------------------------------------------------------------
/tutorials/open-webui/open_webui_screen_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/open-webui/open_webui_screen_1.png
--------------------------------------------------------------------------------
/tutorials/open-webui/open_webui_screen_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/open-webui/open_webui_screen_2.png
--------------------------------------------------------------------------------
/tutorials/NLP/Model-Onboarding-Beginner/distilbert-base-cased-distilled-squad-config.yaml:
--------------------------------------------------------------------------------
1 | # Inference Parameters
2 | num_activations: 2
3 | set_size: 10
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/DeciDiffusion-v2-0/README.md:
--------------------------------------------------------------------------------
1 | ## DeciDiffusion 2.0
2 |
3 | This model is deprecated. sdxl_turbo is the recommended alternative.
4 |
--------------------------------------------------------------------------------
/tutorials/NLP/Profiler-Intermediate/images/Latency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Profiler-Intermediate/images/Latency.png
--------------------------------------------------------------------------------
/tutorials/NLP/Model-Onboarding-Beginner/Images/Workflow.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Model-Onboarding-Beginner/Images/Workflow.jpg
--------------------------------------------------------------------------------
/tutorials/NLP/Performance-Tuning-Beginner/Images/Latency.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Performance-Tuning-Beginner/Images/Latency.jpg
--------------------------------------------------------------------------------
/tutorials/Playground/images/qualcomm_cloud_ai_playground.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/Playground/images/qualcomm_cloud_ai_playground.png
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/stable-diffusion-v1-5/README.md:
--------------------------------------------------------------------------------
1 | ## Stable Diffusion v1-5
2 |
3 | This model is deprecated. sdxl_turbo is the recommended alternative.
4 |
5 |
--------------------------------------------------------------------------------
/tutorials/NLP/Profiler-Intermediate/images/opstats_decoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Profiler-Intermediate/images/opstats_decoder.png
--------------------------------------------------------------------------------
/tutorials/NLP/Profiler-Intermediate/images/opstats_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Profiler-Intermediate/images/opstats_example.png
--------------------------------------------------------------------------------
/tutorials/NLP/Profiler-Intermediate/requirements.txt:
--------------------------------------------------------------------------------
1 | onnx==1.12.0
2 | optimum
3 | numpy==1.23.4
4 | onnxruntime
5 | torch===1.11.0
6 | pillow==8.3.2
7 | opencv-python
8 | paramiko
--------------------------------------------------------------------------------
/tutorials/NLP/Profiler-Intermediate/images/operator_details.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Profiler-Intermediate/images/operator_details.png
--------------------------------------------------------------------------------
/tutorials/NLP/Performance-Tuning-Beginner/requirements.txt:
--------------------------------------------------------------------------------
1 | onnx==1.12.0
2 | optimum
3 | numpy==1.23.4
4 | onnxruntime
5 | torch===1.11.0
6 | pillow==8.3.2
7 | opencv-python
8 | paramiko
--------------------------------------------------------------------------------
/samples/python/qaic_features/resnet_config.yaml:
--------------------------------------------------------------------------------
1 | aic_num_cores: 4
2 | num_activations: 1
3 | convert_to_fp16: true
4 | onnx_define_symbol:
5 | batch: 2
6 | # output_dir: './resnet_qpc'
7 |
--------------------------------------------------------------------------------
/tutorials/Computer-Vision/Perfomance-Tuning-Beginner/Images/Latency.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/Computer-Vision/Perfomance-Tuning-Beginner/Images/Latency.jpg
--------------------------------------------------------------------------------
/tutorials/Computer-Vision/Perfomance-Tuning-Beginner/requirements.txt:
--------------------------------------------------------------------------------
1 | onnx==1.12.0
2 | optimum
3 | numpy==1.23.4
4 | onnxruntime
5 | torch==1.13.0
6 | pillow==8.3.2
7 | opencv-python
8 | paramiko
9 | jsonschema
--------------------------------------------------------------------------------
/tutorials/NLP/Model-Onboarding-Beginner/requirements.txt:
--------------------------------------------------------------------------------
1 | onnx==1.12.0
2 | optimum
3 | numpy==1.23.4
4 | onnxruntime
5 | torch===1.11.0
6 | pillow==8.3.2
7 | onnxsim
8 | /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl
--------------------------------------------------------------------------------
/models/vision/classification/requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url https://download.pytorch.org/whl/cpu
2 | torch==2.3.1+cpu
3 | torchvision==0.18.1+cpu
4 | onnx==1.14.0
5 | onnxruntime==1.19.0
6 | transformers==4.41.2
7 | pandas==2.1.4
8 |
--------------------------------------------------------------------------------
/utils/multi-device/enable_mdp.json:
--------------------------------------------------------------------------------
1 | {
2 | "request": [
3 | {
4 | "qid": -1,
5 | "dev_config": {
6 | "update_multi_device_partition_config_request": {
7 | "enable": true
8 | }
9 | }
10 | }
11 | ]
12 | }
13 |
--------------------------------------------------------------------------------
/models/language_processing/decoder/MptForCausalLM/README.md:
--------------------------------------------------------------------------------
1 | # MptForCausalLM
2 |
3 | Head over to the [Qualcomm Efficient Transformers library](https://github.com/quic/efficient-transformers) for optimized LLM deployment on Cloud AI 100 Accelerators.
--------------------------------------------------------------------------------
/models/language_processing/decoder/DeciCoder-6b/requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url https://download.pytorch.org/whl/cpu
2 | torch==2.1.2
3 | onnx==1.15.0
4 | onnxruntime==1.16.3
5 | onnxsim==0.4.35
6 | tiktoken==0.5.2
7 | protobuf==3.20.2
8 | numpy==1.26.4
--------------------------------------------------------------------------------
/models/language_processing/decoder/LlamaForCausalLM/README.md:
--------------------------------------------------------------------------------
1 | # LlamaForCausalLM
2 |
3 | Head over to the [Qualcomm Efficient Transformers library](https://github.com/quic/efficient-transformers) for optimized LLM deployment on Cloud AI 100 Accelerators.
--------------------------------------------------------------------------------
/models/language_processing/decoder/README.md:
--------------------------------------------------------------------------------
1 | # efficient-transformers package for LLMs
2 |
3 | Head over to the [Qualcomm Efficient Transformers library](https://github.com/quic/efficient-transformers) for optimized LLM deployment on Cloud AI 100 Accelerators.
--------------------------------------------------------------------------------
/models/language_processing/decoder/GPTBigCodeForCausalLM/README.md:
--------------------------------------------------------------------------------
1 | # GPTBigCodeForCausalLM
2 |
3 | Head over to the [Qualcomm Efficient Transformers library](https://github.com/quic/efficient-transformers) for optimized LLM deployment on Cloud AI 100 Accelerators.
--------------------------------------------------------------------------------
/models/language_processing/decoder/CodeGen-With-Speculative-Decoding/README.md:
--------------------------------------------------------------------------------
1 | # Speculative decoding - CodeGen
2 |
3 | Head over to the [Qualcomm Efficient Transformers library](https://github.com/quic/efficient-transformers) for optimized LLM deployment on Cloud AI 100 Accelerators.
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_turbo/requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url https://download.pytorch.org/whl/cpu
2 | networkx==3.0
3 | torch==2.3.1
4 | torchvision
5 | torchaudio
6 | onnx==1.12.0
7 | onnxruntime
8 | accelerate
9 | transformers==4.42
10 | huggingface-hub==0.25.2
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_deepcache/requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url https://download.pytorch.org/whl/cpu
2 | networkx==3.0
3 | torch==2.3.1
4 | torchvision
5 | torchaudio
6 | onnx==1.12.0
7 | onnxruntime
8 | accelerate
9 | transformers==4.42
10 | huggingface-hub==0.25.2
--------------------------------------------------------------------------------
/models/language_processing/decoder/DeciCoder-6b/init.sh:
--------------------------------------------------------------------------------
1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear
3 |
4 | MODEL_REPO="Deci"
5 | MODEL_NAME="DeciCoder-6b"
6 | BS=1
7 | PL=256
8 | CL=2048
9 | CORES=14
10 | MX="-mxfp6-matmul"
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/stable-diffusion-xl-base-1.0/requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url https://download.pytorch.org/whl/cpu
2 | networkx==3.0
3 | torch==2.3.1
4 | torchvision
5 | torchaudio
6 | onnx==1.12.0
7 | onnxruntime
8 | accelerate
9 | transformers==4.42
10 | huggingface-hub==0.25.2
--------------------------------------------------------------------------------
/models/language_processing/decoder/DeciCoder-6b/specializations_template.json:
--------------------------------------------------------------------------------
1 | {
2 | "specializations": [
3 | {
4 | "batch_size": "BS",
5 | "seq_len": "PL",
6 | "ctx_len": "CL"
7 | },
8 | {
9 | "batch_size": "BS",
10 | "seq_len": "1",
11 | "ctx_len": "CL"
12 | }
13 | ]
14 | }
15 |
--------------------------------------------------------------------------------
/samples/python/vit_qaic/vit_config.yaml:
--------------------------------------------------------------------------------
1 | # compile parameters
2 | aic_num_cores: 4
3 | convert_to_fp16: true
4 | mos: 1
5 | ols: 2
6 | multicast-weights: true
7 | onnx_define_symbol:
8 | batch_size: 1
9 | stats-batchsize: 1
10 | compile-only: true
11 |
12 | # inference parameters
13 | num_activations: 3
14 | set_size: 4
--------------------------------------------------------------------------------
/models/vision/detection/requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url https://download.pytorch.org/whl/cpu
2 | torch==2.3.1+cpu
3 | torchvision==0.18.1+cpu
4 | onnx==1.19.1
5 | onnxruntime==1.19.0
6 | onnxscript
7 | transformers==4.41.2
8 | pandas==2.1.4
9 | opencv-python-headless
10 | opencv-contrib-python-headless
11 | ultralytics
12 | seaborn
13 | onnx-graphsurgeon
14 |
--------------------------------------------------------------------------------
/samples/python/requirements.txt:
--------------------------------------------------------------------------------
1 | altgraph==0.17.2
2 | attrs==21.4.0
3 | grpcio==1.44.0
4 | iniconfig==1.1.1
5 | nose==1.3.7
6 | numpy==1.22.4
7 | packaging==21.3
8 | pluggy==1.0.0
9 | protobuf==3.20.0
10 | py==1.11.0
11 | pyinstaller==4.9
12 | pyinstaller-hooks-contrib==2022.2
13 | pyparsing==3.0.7
14 | pytest==6.2.5
15 | pyudev==0.23.2
16 | PyYAML==6.0
17 | six==1.16.0
18 | toml==0.10.2
19 | yapf==0.32.0
20 |
--------------------------------------------------------------------------------
/models/speech/whisper/requirements.txt:
--------------------------------------------------------------------------------
1 | /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl
2 | numpy==1.23.5
3 | datasets==2.7.1
4 | transformers==4.24.0
5 | torch==1.12.1
6 | onnx==1.12.0
7 | fsspec==2022.11.0
8 | multiprocess==0.70.14
9 | huggingface-hub==0.11.0
10 | librosa==0.9.2
11 | soundfile==0.11.0
12 | whisper @ git+https://github.com/openai/whisper.git@ec1b34bb90dc2822ce4ebac23970b84dbb03ec6c
13 | pyarrow==20.0.0
14 |
--------------------------------------------------------------------------------
/tutorials/open-webui/serve.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause-Clear
5 |
6 | model=hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
7 |
8 | /opt/vllm-env/bin/python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model ${model} --max-model-len 4096 --max-num-seq 1 --max-seq_len-to-capture 128 --device qaic --device-group 0,1,2,3
9 |
--------------------------------------------------------------------------------
/models/language_processing/encoder/requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url https://download.pytorch.org/whl/cpu
2 | networkx==3.1
3 | torch==2.3.1
4 | fsspec==2024.2.0
5 | wheel==0.42.0
6 | sentence-transformers==2.6.1
7 | onnx==1.18.0
8 | onnxruntime==1.22
9 | transformers==4.40.2
10 | optimum==1.19.1
11 | protobuf==5.26.1
12 | urllib3==1.26.6
13 | /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl
14 |
15 | # For inference serving
16 | fastapi
17 | uvicorn
18 |
--------------------------------------------------------------------------------
/tutorials/open-webui/open_webui.sh:
--------------------------------------------------------------------------------
1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear
3 |
4 | image=ghcr.io/open-webui/open-webui:main
5 |
6 | docker run \
7 | -d \
8 | --network host \
9 | -e OPENAI_API_KEY=test-key \
10 | -e OPENAI_API_BASE_URL="http://localhost:8000/v1" \
11 | -v open-webui:/app/backend/data \
12 | --name open-webui \
13 | --restart always \
14 | ${image}
15 |
--------------------------------------------------------------------------------
/utils/qaic-bench/config/config_tiny_llama.json:
--------------------------------------------------------------------------------
1 | {
2 | "vllm_root": "/opt/qti-aic/integrations/vllm",
3 |
4 | "models": [
5 | {
6 | "name": "TinyLlama-1.1B-Chat-v1.0",
7 | "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
8 | "configs": [
9 | {
10 | "batch_size": 1,
11 | "devices": 1,
12 | "prompt_len": 1024,
13 | "generation_len": 1024
14 | }
15 | ]
16 | }
17 | ]
18 | }
19 |
--------------------------------------------------------------------------------
/utils/qaic-bench/config/config_llama_3_1_8b.json:
--------------------------------------------------------------------------------
1 | {
2 | "vllm_root": "/opt/qti-aic/integrations/vllm",
3 |
4 | "models": [
5 | {
6 | "name": "Meta-Llama-3.1-8B-Instruct",
7 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
8 | "configs": [
9 | {
10 | "batch_size": 1,
11 | "devices": 4,
12 | "prompt_len": 4096,
13 | "generation_len": 4096
14 | }
15 | ]
16 | }
17 | ]
18 | }
19 |
--------------------------------------------------------------------------------
/tutorials/Playground/README.md:
--------------------------------------------------------------------------------
1 | # Cloud AI Playground Notebook setup
2 |
3 | ## Python Setup
4 | ```
5 | # Setup venv
6 | python3.10 -m venv imagine_env
7 | source imagine_env/bin/activate
8 | pip3 install pip -U
9 |
10 | # Install Qualcomm Imagine Python library
11 | pip3 install python-imagine-sdk
12 |
13 | # Install dependencies
14 | pip3 install Pillow
15 | pip3 install notebook
16 | pip3 install pandas
17 | pip3 install openai
18 | ```
19 |
20 | ## Launch Notebook
21 | ```
22 | jupyter notebook --no-browser --ip 0.0.0.0 --port 8080
23 | ```
24 |
25 |
26 |
27 |
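## Example notebook cell

Since the `openai` client is installed above, an OpenAI-compatible Playground endpoint can be exercised from a notebook cell along the lines of the sketch below. The base URL, API key, and model name are placeholders - substitute the values provided with your Playground account.

```
# Minimal sketch; base_url, api_key, and model are placeholders.
from openai import OpenAI

client = OpenAI(
    base_url="https://<playground-endpoint>/v1",   # placeholder endpoint
    api_key="<your-playground-api-key>",           # placeholder key
)

response = client.chat.completions.create(
    model="<model-name>",                          # placeholder model id
    messages=[{"role": "user", "content": "Hello from the Cloud AI Playground!"}],
    max_tokens=64,
)
print(response.choices[0].message.content)
```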
--------------------------------------------------------------------------------
/utils/qaic-bench/config/config_dl2q.json:
--------------------------------------------------------------------------------
1 | {
2 | "vllm_root": "/opt/qti-aic/integrations/vllm",
3 |
4 | "models": [
5 | {
6 | "name": "Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
7 | "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
8 | "configs": [
9 | {
10 | "batch_size": 1,
11 | "devices": 1,
12 | "cores": 14,
13 | "prompt_len": 4096,
14 | "generation_len": 4096
15 | }
16 | ]
17 | }
18 | ]
19 | }
20 |
--------------------------------------------------------------------------------
/tutorials/efficient_transformers/README.md:
--------------------------------------------------------------------------------
1 | ## Installation steps
2 |
3 | ### Create python virtual environment and activate it
4 | ```
5 | python3.10 -m venv qeff_env
6 | source qeff_env/bin/activate
7 | pip install --upgrade pip
8 | ```
9 |
10 | ### Install the efficient-transformers library from GitHub
11 | ```
12 | pip install git+https://github.com/quic/efficient-transformers@release/v1.20.0
13 | ```
14 |
15 | ### After installing the efficient-transformers library, install Jupyter Notebook
16 | ```
17 | pip install notebook
18 | ```
19 |
20 | ### Launch Notebook
21 | ```
22 | jupyter notebook --no-browser --allow-root
23 | ```
24 |
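### Example usage inside the notebook

A minimal sketch based on the efficient-transformers quick start; the model name is only an example and API details may vary between releases.

```
# Minimal sketch based on the efficient-transformers quick start;
# the model name is an example and API details may vary by release.
from transformers import AutoTokenizer
from QEfficient import QEFFAutoModelForCausalLM

model_name = "gpt2"                                   # example model
tokenizer = AutoTokenizer.from_pretrained(model_name)

qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
qeff_model.compile(num_cores=14)                      # compile for Cloud AI 100
qeff_model.generate(prompts=["Hello world"], tokenizer=tokenizer)
```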
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/stable-diffusion-3.5-medium/requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url https://download.pytorch.org/whl/cpu
2 | accelerate==0.31.0
3 | certifi==2024.6.2
4 | charset-normalizer==3.3.2
5 | coloredlogs==15.0.1
6 | filelock==3.13.1
7 | flatbuffers==24.3.25
8 | fsspec==2024.2.0
9 | huggingface-hub==0.23.4
10 | humanfriendly==10.0
11 | idna==3.7
12 | importlib-metadata==7.2.1
13 | Jinja2==3.1.3
14 | MarkupSafe==2.1.5
15 | mpmath==1.3.0
16 | networkx==3.1
17 | numpy==1.24.1
18 | onnx==1.12.0
19 | onnxruntime==1.16.3
20 | packaging==24.1
21 | pillow==10.2.0
22 | protobuf==3.20.1
23 | psutil==6.0.0
24 | PyYAML==6.0.1
25 | regex==2024.5.15
26 | requests==2.32.3
27 | safetensors==0.4.3
28 | sympy==1.12
29 | tokenizers==0.19.1
30 | torch==2.4.1
31 | tqdm==4.66.4
32 | transformers==4.41.2
33 | typing-extensions==4.9.0
34 | urllib3==2.2.2
35 | zipp==3.19.2
36 | onnxsim==0.4.36
37 | sentencepiece==0.2.0
38 |
--------------------------------------------------------------------------------
/tutorials/open-webui/vllm_container.sh:
--------------------------------------------------------------------------------
1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear
3 |
4 | image=ghcr.io/quic/cloud_ai_inference_ubuntu22:1.19.8.0
5 | qpc_path=/path/to/qpc
6 |
7 | chmod +x serve.sh
8 |
9 | docker run -dit \
10 | --workdir /model \
11 | --name qaic-vllm \
12 | --network host \
13 | --mount type=bind,source=${PWD}/serve.sh,target=/model/serve.sh \
14 | --mount type=bind,source=${qpc_path},target=/model/qpc \
15 | -v qaic-vllm:/model/data \
16 | --env VLLM_QAIC_MAX_CPU_THREADS=8 \
17 | --env VLLM_QAIC_QPC_PATH=/model/qpc \
18 | --env HF_HOME=/model/data/huggingface \
19 | --env QEFF_HOME=/model/data/qeff_models \
20 | --device=/dev/accel/accel0 \
21 | --device=/dev/accel/accel1 \
22 | --device=/dev/accel/accel2 \
23 | --device=/dev/accel/accel3 \
24 | --entrypoint=/model/serve.sh \
25 | ${image}
26 |
--------------------------------------------------------------------------------
/tutorials/Computer-Vision/DETR/README.md:
--------------------------------------------------------------------------------
1 | ## Description
2 | ---
3 |
4 | Download the DETR-ResNet50 model, prepare it for the Qualcomm Cloud AI 100 (AIC100), compile it, run it on a randomly generated sample as well as an input image, and obtain the output.
5 |
6 |
7 | ## Source of the model
8 | ---
9 |
10 | This model is an implementation of DETR-ResNet50 found at https://github.com/facebookresearch/detr.
11 |
12 |
13 | ## Virtual environment
14 | ---
15 | For a quick environment setup:
16 |
17 | ```commandline
18 | python3.8 -m venv cv_workflow_env
19 | source cv_workflow_env/bin/activate
20 | pip install --upgrade pip
21 |
22 | ```
23 |
24 | ## Framework and version
25 | ---
26 | ```commandline
27 | pip install torch==2.4.1+cpu torchvision==0.19.1+cpu --index-url https://download.pytorch.org/whl/cpu
28 | pip install numpy==1.24.4 onnx==1.17.0 pillow==10.4.0 requests==2.32.3 notebook==7.3.3 matplotlib==3.7.5 scipy==1.10.1
29 |
30 | ```
31 |
32 |
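## Export sketch
---

A minimal sketch of the first step (fetching DETR-ResNet50 via torch.hub and exporting it to ONNX) might look like the following; the input resolution, output names, and file name are illustrative.

```
# Sketch of the export step only: fetch DETR-ResNet50 via torch.hub and
# export it to ONNX. Input size, output names, and file name are illustrative.
import torch

model = torch.hub.load("facebookresearch/detr", "detr_resnet50", pretrained=True)
model.eval()

dummy_input = torch.randn(1, 3, 800, 800)   # example input resolution
torch.onnx.export(
    model,
    dummy_input,
    "detr_resnet50.onnx",
    input_names=["image"],
    output_names=["pred_logits", "pred_boxes"],
    opset_version=17,
)
```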
--------------------------------------------------------------------------------
/models/vision/detection/lut_yolo_models.csv:
--------------------------------------------------------------------------------
1 | MODEL_NAME,TASK,BATCH_SIZE,IMAGE_SIZE,CORES,INSTANCES,OLS,MOS,SET_SIZE,EXTRA,PRECISION,OBJECTIVE
2 | yolov5s,object-detection,1,640,7,2,2,,1,,fp16,best-latency
3 | yolov5s,object-detection,1,640,3,4,2,,1,,fp16,balanced
4 | yolov5s,object-detection,1,640,2,7,2,,2,,fp16,best-throughput
5 | yolov5m,object-detection,1,640,12,1,1,,1,,fp16,best-latency
6 | yolov5m,object-detection,1,640,12,1,1,,2,,fp16,balanced
7 | yolov5m,object-detection,1,640,2,7,2,,2,,fp16,best-throughput
8 | yolov5l,object-detection,1,640,12,1,1,,1,,fp16,best-latency
9 | yolov5l,object-detection,1,640,4,3,2,,1,,fp16,balanced
10 | yolov5l,object-detection,1,640,2,7,4,,2,,fp16,best-throughput
11 | yolov5x,object-detection,1,640,12,1,1,,1,,fp16,best-latency
12 | yolov5x,object-detection,1,640,7,2,2,,2,,fp16,balanced
13 | yolov5x,object-detection,1,640,2,7,4,,1,,fp16,best-throughput
14 | yolov7-e6e,object-detection,1,640,12,1,1,,1, -multicast-weights,fp16,best-latency
15 | yolov7-e6e,object-detection,1,640,4,3,2, ,1, -multicast-weights,fp16,balanced
16 | yolov7-e6e,object-detection,1,640,6,2,2, ,2, -multicast-weights,fp16,best-throughput
17 |
--------------------------------------------------------------------------------
/tutorials/NLP/Performance-Tuning-Beginner/bert_base_dopt.json:
--------------------------------------------------------------------------------
1 | {
2 | "max_func_eval": 200,
3 | "objective": "maximize_inf_rate",
4 | "params": {
5 | "cores": {
6 | "min": 1,
7 | "max": 14
8 | },
9 | "mos": {
10 | "min": 1,
11 | "max": 8
12 | },
13 | "ols": {
14 | "min": 1,
15 | "max": 8
16 | },
17 | "bs": {
18 | "min": 1,
19 | "max": 16
20 | },
21 | "instances": {
22 | "min": 1,
23 | "max": 14
24 | }
25 | },
26 | "initial_values": [
27 | {
28 | "cores": 1,
29 | "mos": 1,
30 | "ols": 1,
31 | "bs": 1,
32 | "instances": 14
33 | },
34 | {
35 | "cores": 2,
36 | "mos": 1,
37 | "ols": 1,
38 | "bs": 1,
39 | "instances": 7
40 | },
41 | {
42 | "cores": 4,
43 | "mos": 1,
44 | "ols": 1,
45 | "bs": 1,
46 | "instances": 3
47 | },
48 | {
49 | "cores": 7,
50 | "mos": 1,
51 | "ols": 1,
52 | "bs": 1,
53 | "instances": 2
54 | },
55 | {
56 | "cores": 14,
57 | "mos": 1,
58 | "ols": 1,
59 | "bs": 1,
60 | "instances": 1
61 | }
62 | ]
63 | }
--------------------------------------------------------------------------------
/tutorials/NLP/Performance-Tuning-Beginner/bert_base_dopt_min_latency.json:
--------------------------------------------------------------------------------
1 | {
2 | "max_func_eval": 200,
3 | "objective": "minimize_latency",
4 | "params": {
5 | "cores": {
6 | "min": 1,
7 | "max": 14
8 | },
9 | "mos": {
10 | "min": 1,
11 | "max": 8
12 | },
13 | "ols": {
14 | "min": 1,
15 | "max": 8
16 | },
17 | "bs": {
18 | "min": 1,
19 | "max": 1
20 | },
21 | "instances": {
22 | "min": 1,
23 | "max": 14
24 | }
25 | },
26 | "initial_values": [
27 | {
28 | "cores": 1,
29 | "mos": 1,
30 | "ols": 1,
31 | "bs": 1,
32 | "instances": 1
33 | },
34 | {
35 | "cores": 2,
36 | "mos": 1,
37 | "ols": 1,
38 | "bs": 1,
39 | "instances": 1
40 | },
41 | {
42 | "cores": 4,
43 | "mos": 1,
44 | "ols": 1,
45 | "bs": 1,
46 | "instances": 1
47 | },
48 | {
49 | "cores": 7,
50 | "mos": 1,
51 | "ols": 1,
52 | "bs": 1,
53 | "instances": 1
54 | },
55 | {
56 | "cores": 14,
57 | "mos": 1,
58 | "ols": 1,
59 | "bs": 1,
60 | "instances": 1
61 | }
62 | ]
63 | }
64 |
--------------------------------------------------------------------------------
/samples/cpp/cpp_qpc_inference/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # ==============================================================================
2 | #
3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause-Clear
5 | #
6 | # ==============================================================================
7 |
8 | project(simple-bert-inference-example)
9 | cmake_minimum_required (VERSION 3.17.2)
10 | set(CMAKE_CXX_STANDARD 17)
11 |
12 | find_package(Threads REQUIRED)
13 |
14 | add_executable(simple-bert-inference-example main.cpp)
15 |
16 | target_include_directories(simple-bert-inference-example
17 | PRIVATE
18 | "/opt/qti-aic/dev/inc"
19 | )
20 |
21 |
22 | set_target_properties(simple-bert-inference-example
23 | PROPERTIES
24 | LINK_FLAGS "-Wl,--no-as-needed"
25 | )
26 |
27 | target_compile_options(simple-bert-inference-example
28 | PRIVATE
29 | -fstack-protector-all
30 | -Werror
31 | -Wall
32 | -Wextra
33 | -Wno-sign-compare
34 | -Wno-unused-parameter
35 | -Wno-missing-field-initializers
36 | )
37 |
38 | target_link_libraries(simple-bert-inference-example
39 | PRIVATE
40 | Threads::Threads
41 | dl
42 | )
--------------------------------------------------------------------------------
/tutorials/Computer-Vision/Perfomance-Tuning-Beginner/resnet_base_dopt_min_latency.json:
--------------------------------------------------------------------------------
1 | {
2 | "search-mode": "optimized",
3 | "objective": "min-latency",
4 | "search-parameters": {
5 | "cores": {
6 | "min": 1,
7 | "max": 14
8 | },
9 | "mos": {
10 | "min": 1,
11 | "max": 8
12 | },
13 | "ols": {
14 | "min": 1,
15 | "max": 8
16 | },
17 | "batch-size": {
18 | "min": 1,
19 | "max": 16
20 | },
21 | "instances": {
22 | "min": 1,
23 | "max": 14
24 | }
25 | },
26 | "initial-values": [
27 | {
28 | "cores": 1,
29 | "mos": 1,
30 | "ols": 1,
31 | "batch-size": 1,
32 | "instances": 1
33 | },
34 | {
35 | "cores": 2,
36 | "mos": 1,
37 | "ols": 1,
38 | "batch-size": 1,
39 | "instances": 1
40 | },
41 | {
42 | "cores": 4,
43 | "mos": 1,
44 | "ols": 1,
45 | "batch-size": 1,
46 | "instances": 1
47 | },
48 | {
49 | "cores": 7,
50 | "mos": 1,
51 | "ols": 1,
52 | "batch-size": 1,
53 | "instances": 1
54 | },
55 | {
56 | "cores": 14,
57 | "mos": 1,
58 | "ols": 1,
59 | "batch-size": 1,
60 | "instances": 1
61 | }
62 | ]
63 | }
64 |
--------------------------------------------------------------------------------
/samples/python/README.md:
--------------------------------------------------------------------------------
1 | # Contents of this folder
2 | 
3 | 1. The `vit_qaic` and `resnet_qaic` folders contain examples showing an end-to-end workflow for running inference on Cloud AI 100 using the Python APIs.
4 | 2. The `qaic_features` folder contains examples showing how to perform benchmarking, profiling, and metrics measurement for inferences run on the device.
5 |
6 | # Installation
7 |
8 | Steps to install `qaic` API:
9 |
10 | ```
11 | pip install /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl
12 | pip install -r requirements.txt
13 | ```
14 |
15 |
16 | ## Structure of the end-to-end workflow
17 | 
18 | The examples follow this pattern:
19 | 
20 | 1. Get the model from an open-source hub (Hugging Face, for example).
21 | 2. Convert the model to ONNX using the onnx library.
22 | 3. Call the generate_bin function to convert the ONNX model to a QPC (the binary for the device).
23 |    a. Currently it compiles with default arguments; these can be replaced with the best-performance compile arguments. #FIXME
24 | 4. Create a `qaic.Session` with the appropriate input and output names.
25 | 5. Perform sample preprocessing steps and build the input_dict for the session.
26 | 6. Call session.run() to perform inference.
27 | 7. Perform sample postprocessing steps and reshape the output from the session.
28 |
29 | ## To run the example
30 |
31 | ```
32 | python example.py
33 | ```
34 |
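## Minimal workflow sketch

Putting the steps above together, a minimal sketch might look like the following. The QPC path and the tensor names (`data`, `output`) are illustrative placeholders; the `qaic` calls mirror the ones used in the examples in this folder.

```
# Minimal end-to-end sketch; the QPC path and the tensor names are illustrative.
import numpy as np
import qaic

# Create a session from a compiled QPC (an ONNX path plus a config YAML also works).
session = qaic.Session('path/to/qpc/programqpc.bin')
session.setup()

# Build the input dict with the shape/dtype the session expects for each input.
shape, dtype = session.model_input_shape_dict['data']      # 'data' is an example input name
input_dict = {'data': np.random.randn(*shape).astype(dtype)}

# Run inference.
outputs = session.run(input_dict)

# Post-process: reshape the raw output buffer to the model's output shape.
out_shape, out_dtype = session.model_output_shape_dict['output']   # example output name
result = np.frombuffer(outputs['output'], dtype=out_dtype).reshape(out_shape)
print(result.shape)
```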
--------------------------------------------------------------------------------
/utils/README.md:
--------------------------------------------------------------------------------
1 | # Basic Commands/Utilities for Cloud AI 100 devices
2 |
3 | ## Add your user to the `qaic` group to read card/device status without `sudo`
4 |
5 | ```
6 | sudo usermod -aG qaic $USER
7 | newgrp qaic
8 | bash
9 | ```
10 |
11 | ## Check device health
12 | Monitor the health of all AI 100 devices (SoCs) using the `qaic-util` utility.
13 |
14 | ```
15 | /opt/qti-aic/tools/qaic-util -q | grep -e Status -e QID
16 | ```
17 |
18 | ## Monitoring of AI 100 devices (SoCs)
19 | Continuously monitor the health, telemetry (temperature, power etc) and resources (compute, DRAM etc) of the AI 100 devices (SoCs) using the `qaic-util` utility.
20 |
21 | ```
22 | /opt/qti-aic/tools/qaic-util -t 1
23 | ```
24 |
25 | ## Reset AI 100 devices (SoCs)
26 | To reset **all** AI 100 devices (SoCs), run
27 | ```
28 | sudo /opt/qti-aic/tools/qaic-util -s
29 | ```
30 |
31 | To reset **individual** AI 100 devices (SoCs), run
32 | ```
33 | sudo /opt/qti-aic/tools/qaic-util -s -p <SSSS:BB:DD.F>
34 | ```
35 | where,
36 | - SSSS = 4 digits segment number
37 | - BB = 2 digits bus number
38 | - DD = 2 digits device number
39 | - F = 1 digit function number
40 |
41 | For example,
42 | ```
43 | sudo /opt/qti-aic/tools/qaic-util -s -p 0000:83:00.0
44 |
45 | Resetting 0000:83:00.0:
46 | 0000:83:00.0 success
47 | ```
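## Scripted health check

The same query can also be driven from a script. Below is a minimal sketch that wraps the `qaic-util -q` call above, assuming healthy devices report `Ready` in their Status field.

```
# Minimal sketch: count devices reporting Ready, assuming the Status lines in
# the output of qaic-util -q contain 'Ready' for healthy devices.
import subprocess

out = subprocess.run(
    ["/opt/qti-aic/tools/qaic-util", "-q"],
    capture_output=True, text=True, check=True,
).stdout

status_lines = [line for line in out.splitlines() if "Status" in line]
ready = sum("Ready" in line for line in status_lines)
print(f"{ready}/{len(status_lines)} devices report Ready")
```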
--------------------------------------------------------------------------------
/models/language_processing/decoder/DeciCoder-6b/compileModel.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | if [ -z "$1" ]; then
6 | echo "Usage: $0 <model_name> <batch_size> <prompt_len> <ctx_len> <num_cores> <with_or_no_mx>"
7 | exit 1
8 | fi
9 |
10 | model_name="$1"
11 | batch_size="$2"
12 | prompt_len="$3"
13 | ctx_len="$4"
14 | num_cores="$5"
15 | with_or_no_mx="$6"
16 |
17 | # Generate a new specializations.json
18 | sed -e "s/BS/${batch_size}/g" -e "s/PL/${prompt_len}/g" -e "s/CL/${ctx_len}/g" ./specializations_template.json > specializations.json
19 |
20 | # Create qpc directory - Delete existing path
21 | mkdir -p qpc
22 | rm -rf qpc/${model_name}-kv-${prompt_len}pl-${ctx_len}cl-${num_cores}c${with_or_no_mx}
23 |
24 | model_path="${model_name}-kv/generatedModels/${model_name}-kv_fp16_simplified.onnx"
25 | if [ ! -f "$model_path" ]; then
26 | model_path="${model_name}-kv/generatedModels/${model_name}-kv_fp16.onnx"
27 | fi
28 |
29 | /opt/qti-aic/exec/qaic-exec \
30 | -m=$model_path \
31 | -aic-hw \
32 | -aic-hw-version=2.0 \
33 | -network-specialization-config=specializations.json \
34 | -retained-state \
35 | -convert-to-fp16 \
36 | -ols=1 \
37 | -mos=${num_cores} \
38 | -aic-num-cores=${num_cores} \
39 | -custom-IO-list-file=${model_name}-kv/custom_io.yaml \
40 | -compile-only \
41 | -aic-binary-dir=qpc/${model_name}-kv-${prompt_len}pl-${ctx_len}cl-${num_cores}c${with_or_no_mx} \
42 | ${with_or_no_mx}
43 |
44 |
--------------------------------------------------------------------------------
/models/speech/whisper/audio.py:
--------------------------------------------------------------------------------
1 | ####################################################################################################
2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear
4 | ####################################################################################################
5 |
6 | import os
7 | import numpy as np
8 | from datasets import load_dataset, Audio
9 | import soundfile as sf
10 | from pathlib import Path
11 |
12 | class AudioSample:
13 | def __init__(self):
14 | # load dummy dataset and read soundfiles
15 | self.ds = load_dataset(
16 | 'hf-internal-testing/librispeech_asr_dummy', 'clean', split='validation'
17 | )
18 |
19 | def to_file(self, parent='.'):
20 | audio_sample = self.ds[0]['audio']
21 |
22 | audio_array = audio_sample['array']
23 | audio_fname = os.path.join(parent, Path(audio_sample['path']).name)
24 | sampling_rate = audio_sample["sampling_rate"]
25 |
26 | # Convert to float32 for compatibility with soundfile
27 | if audio_array.dtype != np.float32:
28 | audio_array = audio_array.astype(np.float32)
29 |
30 | try:
31 | sf.write(audio_fname, audio_array, sampling_rate, format='FLAC')
32 | except Exception as e:
33 | print('Error saving file: {}'.format(e))
34 |
35 | return audio_fname
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | ## Contributing to PROJECT
2 |
3 | Hi there!
4 | We’re thrilled that you’d like to contribute to this project.
5 | Your help is essential for keeping this project great and for making it better.
6 |
7 | ## Branching Strategy
8 |
9 | In general, contributors should develop on branches based off of `master` and pull requests should be made against `master`.
10 |
11 | ## Submitting a pull request
12 |
13 | 1. Please read our [code of conduct](CODE-OF-CONDUCT.md) and [license](LICENSE).
14 | 1. Fork and clone the repository.
15 | 1. Create a new branch based on `master`: `git checkout -b <branch-name> master`.
16 | 1. Make your changes, add tests, and make sure the tests still pass.
17 | 1. Commit your changes using the [DCO](http://developercertificate.org/). You can attest to the DCO by committing with the **-s** or **--signoff** option (e.g. `git commit -s`) or by manually adding the "Signed-off-by" line.
18 | 1. Push to your fork and submit a pull request from your branch to `master`.
19 | 1. Pat yourself on the back and wait for your pull request to be reviewed.
20 |
21 | Here are a few things you can do that will increase the likelihood of your pull request being accepted:
22 |
23 | - Follow the existing style where possible.
24 | - Write tests.
25 | - Keep your change as focused as possible.
26 | If you want to make multiple independent changes, please consider submitting them as separate pull requests.
27 | - Write a [good commit message](http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html).
28 |
--------------------------------------------------------------------------------
/samples/python/aws_ai100_benchmarking/yolo_models/lut_yolo_models.csv:
--------------------------------------------------------------------------------
1 | MODEL_NAME,TASK,BATCH_SIZE,IMAGE_SIZE,CORES,INSTANCES,OLS,MOS,SET_SIZE,EXTRA,PRECISION,OBJECTIVE
2 | yolov4,object-detection,1,608,12,1,1,,1, -aic-pmu-recipe=KernelUtil -multicast-weights -aic-enable-depth-first,fp16,best-latency
3 | yolov4,object-detection,1,608,7,2,2,,2, -aic-pmu-recipe=KernelUtil -multicast-weights -aic-enable-depth-first,fp16,balanced
4 | yolov4,object-detection,1,608,1,14,1,,1, -aic-pmu-recipe=KernelUtil -multicast-weights -aic-enable-depth-first,fp16,best-throughput
5 | yolov5s,object-detection,1,640,7,2,2,,1,,fp16,best-latency
6 | yolov5s,object-detection,1,640,3,4,2,,1,,fp16,balanced
7 | yolov5s,object-detection,1,640,2,7,2,,2,,fp16,best-throughput
8 | yolov5m,object-detection,1,640,12,1,1,,1,,fp16,best-latency
9 | yolov5m,object-detection,1,640,12,1,1,,2,,fp16,balanced
10 | yolov5m,object-detection,1,640,2,7,2,,2,,fp16,best-throughput
11 | yolov5l,object-detection,1,640,12,1,1,,1,,fp16,best-latency
12 | yolov5l,object-detection,1,640,4,3,2,,1,,fp16,balanced
13 | yolov5l,object-detection,1,640,2,7,4,,2,,fp16,best-throughput
14 | yolov5x,object-detection,1,640,12,1,1,,1,,fp16,best-latency
15 | yolov5x,object-detection,1,640,7,2,2,,2,,fp16,balanced
16 | yolov5x,object-detection,1,640,2,7,4,,1,,fp16,best-throughput
17 | yolov7-e6e,object-detection,1,640,12,1,1,,1, -multicast-weights,fp16,best-latency
18 | yolov7-e6e,object-detection,1,640,4,3,2, ,1, -multicast-weights,fp16,balanced
19 | yolov7-e6e,object-detection,1,640,6,2,2, ,2, -multicast-weights,fp16,best-throughput
20 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_turbo/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear
3 |
4 | import os
5 | import onnx
6 | from onnx import numpy_helper
7 |
8 |
9 | # executes the command and writes it down in the command.txt. The first time, mode is 'w', then 'a' (append)
10 | def execute(cmd_elements, write_to_file, mode):
11 | cmd_str = ' '.join(str(x) for x in cmd_elements)
12 | redirect = f" 2>&1 | ts > {write_to_file}"
13 | cmd_str += redirect
14 | print(f"Executing: {cmd_str}")
15 | os.system(cmd_str)
16 | with open(write_to_file, mode) as file:
17 | file.write(cmd_str + "\n\n")
18 |
19 |
20 | def scale_conv(model, conv_name, scale_factor):
21 | cnodes = [x for x in model.graph.node if x.name == conv_name]
22 | assert len(cnodes) == 1, f"Node '{conv_name}' not found"
23 | x, w, b = cnodes[0].input
24 | wi, bi = "", ""
25 | for i, init in enumerate(model.graph.initializer):
26 | if init.name == w:
27 | wi = i
28 | elif init.name == b:
29 | bi = i
30 | if wi != "" and bi != "":
31 | break
32 | else:
33 | raise ValueError(f"Cannot find indices of weight: {w} and bias: {b}")
34 | ww = numpy_helper.to_array(model.graph.initializer[wi])
35 | bb = numpy_helper.to_array(model.graph.initializer[bi])
36 | model.graph.initializer[wi].raw_data = (ww / scale_factor).tobytes()
37 | model.graph.initializer[bi].raw_data = (bb / scale_factor).tobytes()
38 |
--------------------------------------------------------------------------------
/tutorials/Computer-Vision/Perfomance-Tuning-Beginner/resnet_base_dopt_throughput.json:
--------------------------------------------------------------------------------
1 | {
2 | "search-mode": "optimized",
3 | "objective": "max-throughput",
4 | "search-parameters": {
5 | "cores": {
6 | "min": 1,
7 | "max": 14
8 | },
9 | "mos": {
10 | "min": 1,
11 | "max": 8
12 | },
13 | "ols": {
14 | "min": 1,
15 | "max": 8
16 | },
17 | "batch-size": {
18 | "min": 1,
19 | "max": 16
20 | },
21 | "instances": {
22 | "min": 1,
23 | "max": 14
24 | },
25 | "set-size": {
26 | "min": 1,
27 | "max": 10
28 | }
29 | },
30 | "initial-values": [
31 | {
32 | "cores": 1,
33 | "mos": 1,
34 | "ols": 1,
35 | "batch-size": 1,
36 | "instances": 14,
37 | "set-size": 1
38 |
39 | },
40 | {
41 | "cores": 2,
42 | "mos": 1,
43 | "ols": 1,
44 | "batch-size": 1,
45 | "instances": 7,
46 | "set-size": 1
47 | },
48 | {
49 | "cores": 4,
50 | "mos": 1,
51 | "ols": 1,
52 | "batch-size": 1,
53 | "instances": 3,
54 | "set-size": 1
55 | },
56 | {
57 | "cores": 7,
58 | "mos": 1,
59 | "ols": 1,
60 | "batch-size": 1,
61 | "instances": 2,
62 | "set-size": 1
63 | },
64 | {
65 | "cores": 14,
66 | "mos": 1,
67 | "ols": 1,
68 | "batch-size": 1,
69 | "instances": 1,
70 | "set-size": 1
71 | }
72 | ]
73 | }
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_deepcache/utils.py:
--------------------------------------------------------------------------------
1 | ####################################################################################################
2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear
4 | ####################################################################################################
5 |
6 | import os
7 | import onnx
8 | from onnx import numpy_helper
9 |
10 |
11 | # executes the command and writes it down in the command.txt. The first time, mode is 'w', then 'a' (append)
12 | def execute(cmd_elements, write_to_file, mode):
13 | cmd_str = ' '.join(str(x) for x in cmd_elements)
14 | redirect = f" 2>&1 | ts > {write_to_file}"
15 | cmd_str += redirect
16 | print(f"Executing: {cmd_str}")
17 | os.system(cmd_str)
18 | with open(write_to_file, mode) as file:
19 | file.write(cmd_str + "\n\n")
20 |
21 |
22 | def scale_conv(model, conv_name, scale_factor):
23 | cnodes = [x for x in model.graph.node if x.name == conv_name]
24 | assert len(cnodes) == 1, f"Node '{conv_name}' not found"
25 | x, w, b = cnodes[0].input
26 | wi, bi = "", ""
27 | for i, init in enumerate(model.graph.initializer):
28 | if init.name == w:
29 | wi = i
30 | elif init.name == b:
31 | bi = i
32 | if wi != "" and bi != "":
33 | break
34 | else:
35 | raise ValueError(f"Cannot find indices of weight: {w} and bias: {b}")
36 | ww = numpy_helper.to_array(model.graph.initializer[wi])
37 | bb = numpy_helper.to_array(model.graph.initializer[bi])
38 | model.graph.initializer[wi].raw_data = (ww / scale_factor).tobytes()
39 | model.graph.initializer[bi].raw_data = (bb / scale_factor).tobytes()
40 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/stable-diffusion-3.5-medium/utils.py:
--------------------------------------------------------------------------------
1 | ####################################################################################################
2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear
4 | ####################################################################################################
5 | import os
6 | import onnx
7 | from onnx import numpy_helper
8 |
9 |
10 | # executes the command and writes it down in the command.txt. The first time, mode is 'w', then 'a' (append)
11 | def execute(cmd_elements, write_to_file, mode):
12 | cmd_str = ' '.join(str(x) for x in cmd_elements)
13 | redirect = f" 2>&1 | ts > {write_to_file}"
14 | cmd_str += redirect
15 | print(f"Executing: {cmd_str}")
16 | os.system(cmd_str)
17 | with open(write_to_file, mode) as file:
18 | file.write(cmd_str + "\n\n")
19 |
20 |
21 | def scale_conv(model, conv_name, scale_factor):
22 | cnodes = [x for x in model.graph.node if x.name == conv_name]
23 | assert len(cnodes) == 1, f"Node '{conv_name}' not found"
24 | x, w, b = cnodes[0].input
25 | wi, bi = "", ""
26 | for i, init in enumerate(model.graph.initializer):
27 | if init.name == w:
28 | wi = i
29 | elif init.name == b:
30 | bi = i
31 | if wi != "" and bi != "":
32 | break
33 | else:
34 | raise ValueError(f"Cannot find indices of weight: {w} and bias: {b}")
35 | ww = numpy_helper.to_array(model.graph.initializer[wi])
36 | bb = numpy_helper.to_array(model.graph.initializer[bi])
37 | model.graph.initializer[wi].raw_data = (ww / scale_factor).tobytes()
38 | model.graph.initializer[bi].raw_data = (bb / scale_factor).tobytes()
39 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
2 |
3 | Redistribution and use in source and binary forms, with or without
4 | modification, are permitted (subject to the limitations in the
5 | disclaimer below) provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright
8 | notice, this list of conditions and the following disclaimer.
9 |
10 | * Redistributions in binary form must reproduce the above
11 | copyright notice, this list of conditions and the following
12 | disclaimer in the documentation and/or other materials provided
13 | with the distribution.
14 |
15 | * Neither the name of Qualcomm Technologies, Inc. nor the names of its
16 | contributors may be used to endorse or promote products derived
17 | from this software without specific prior written permission.
18 |
19 | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
20 | GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
21 | HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
22 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
23 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
27 | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
29 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
30 | OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
31 | IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 |
33 | SPDX-License-Identifier: BSD-3-Clause-Clear
34 |
--------------------------------------------------------------------------------
/tutorials/README.md:
--------------------------------------------------------------------------------
1 | Tutorials are Jupyter notebooks designed to walk the developer through the Cloud AI inference workflow. The tutorials are split into 2 categories - CV and NLP. Overall, the inference workflows for CV and NLP models are very similar, and both are presented for convenience.
2 | 
3 | `Model-Onboarding` - This is one of the beginner notebooks. It goes through exporting and preparing the model, compiling the model using a CLI tool, and executing inference using the CLI tool / Python APIs.
4 | 
5 | `Performance-Tuning` - This is another beginner notebook that walks the developer through the key parameters to optimize for best performance (latency and throughput) on Cloud AI platforms. Going through this notebook and the 'Performance Tuning' section in the Quick Start Guide will equip developers with an intuitive understanding of how to use the key parameters to meet inference application KPIs (AI compute resource usage, throughput and latency).
6 | 
7 | `Profiler` - This is an intermediate-level notebook that describes system- and device-level inference profiling capabilities. Developers can use the tools and techniques described in this tutorial to measure application/device-level latency and identify system/device bottlenecks.
8 |
9 |
10 | ### Pre-requisites
11 | 1. Clone this repo
12 | 2. Create python3.8 venv and activate it.
13 | `python3.8 -m venv jn_env`
14 | `source jn_env/bin/activate`
15 | 3. Install qaic
16 | `pip install /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl`
17 | 4. Install Jupyter notebook
18 | `pip install notebook`
19 | `pip install urllib3==1.26.6`
20 | 5. Run the notebook
21 | `jupyter notebook --allow-root --ip 0.0.0.0 --no-browser`.
22 | You should see `http://ip-xx-yyy-zzz-aaa.us-west-2.compute.internal:8888/tree?token=`.
23 | On the local machine, open `http://xx.yyy.zzz.aaa:8888/tree?token=` in a browser to run the tutorial notebooks.
24 |
--------------------------------------------------------------------------------
/models/speech/whisper/README.md:
--------------------------------------------------------------------------------
1 | # Whisper
2 |
3 | [Whisper](https://github.com/openai/whisper) is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.
4 |
5 | ## Environment and dependencies
6 |
7 | ```commandline
8 | python3.10 -m venv whisper_env
9 | source whisper_env/bin/activate
10 | pip3 install -r requirements.txt
11 |
12 | sudo apt-get update
13 | sudo apt-get install libsndfile1 ffmpeg
14 | ```
15 |
16 | ## Model generation
17 |
18 | The following command generates encoder and decoder ONNX files in the `output_whisper` folder:
19 | ```commandline
20 | python3 generateModel.py --model-name base --output-dir output_whisper
21 | ```
22 |
23 | **Note:** Check here for additional model variants:
24 | https://github.com/openai/whisper#available-models-and-languages
25 |
26 |
27 | ## Model compilation
28 |
29 | Create a folder for the AIC binaries:
30 |
31 | ```commandline
32 | mkdir ./whisper_AIC
33 | ```
34 |
35 | Compile the Whisper encoder:
36 |
37 | ```commandline
38 | rm -rf ./whisper_AIC/whisper-encoder
39 | /opt/qti-aic/exec/qaic-exec -m=./output_whisper/encoder_model.onnx -aic-hw -aic-num-cores=12 -mos=2 -ols=1 -convert-to-fp16 -onnx-define-symbol=batch_size,1 -onnx-define-symbol=feature_size,80 -onnx-define-symbol=encoder_sequence_length,3000 -aic-binary-dir=./whisper_AIC/whisper-encoder -compile-only
40 | ```
41 |
42 | Compile the Whisper decoder:
43 |
44 | ```commandline
45 | rm -rf ./whisper_AIC/whisper-decoder
46 | /opt/qti-aic/exec/qaic-exec -m=./output_whisper/decoder_model.onnx -aic-hw -aic-num-cores=12 -mos=2 -ols=1 -convert-to-fp16 -onnx-define-symbol=batch_size,1 -onnx-define-symbol=encoder_sequence_length,1500 -onnx-define-symbol=decoder_sequence_length,150 -aic-binary-dir=./whisper_AIC/whisper-decoder -compile-only
47 | ```
48 |
49 | ## Model execution
50 |
51 | ```commandline
52 | sudo ./whisper_env/bin/python3 runModel.py
53 | ```
54 |
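**Note:** `runModel.py` handles loading and running both compiled binaries. For reference, a minimal sketch of loading one compiled QPC with the `qaic` Python API (the same pattern used elsewhere in this repository) is shown below; inspect the reported input names before building an input dictionary.

```
# Minimal sketch: load the compiled encoder QPC and list its expected inputs.
# runModel.py contains the actual pre/post-processing.
import os
import qaic

qpc_dir = './whisper_AIC/whisper-encoder'
session = qaic.Session(model_path=os.path.join(qpc_dir, 'programqpc.bin'))
session.setup()

for name, (shape, dtype) in session.model_input_shape_dict.items():
    print(name, shape, dtype)
```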
--------------------------------------------------------------------------------
/samples/python/qaic_features/benchmarking_eg.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
3 | SPDX-License-Identifier: BSD-3-Clause-Clear
4 | '''
5 |
6 | import qaic
7 | import numpy as np
8 | import argparse
9 |
10 | # Establish arguments to accept
11 | def get_args():
12 |
13 | parser = argparse.ArgumentParser()
14 |
15 | parser.add_argument(
16 | "--model-path",
17 | dest='model_path',
18 | default=
19 | '/opt/qti-aic/integrations/qaic_onnxrt/tests/resnet50/resnet50-v1-12-batch.onnx',
20 | help='Pass path to qpc of this model to avoid compilation')
21 |
22 | parser.add_argument(
23 | "--config-path",
24 | dest='config_path',
25 | default=
26 | './resnet_config.yaml',
27 |         help='Path to the YAML config with compile/runtime options')
28 |
29 |     parser.add_argument(
30 |         "--input",
31 |         dest='input_img',
32 |         help='If no image is provided, random values are used as input. '
33 |              'The input image should be a raw 1x3x224x224 tensor.'
34 |     )
35 | 
36 |     parser.add_argument(
37 |         "--num_iters",
38 |         dest='num_iters',
39 |         type=int, default=1000,
40 |         help='Number of inferences to run on the model')
41 |
42 | return parser.parse_args()
43 |
44 | def main(args):
45 |
46 | resnet_sess = qaic.Session(
47 | args.model_path,
48 | options_path=args.config_path)
49 |
50 | input_shape, input_type = resnet_sess.model_input_shape_dict['data']
51 |
52 | # Read input
53 |
54 | if args.input_img is None:
55 | x = np.random.randn(*input_shape).astype(input_type)
56 | else:
57 | img = np.fromfile(args.input_img, dtype=input_type)
58 | x = np.resize(img, input_shape)
59 |
60 | # Run Benchmarking
61 | input_dict = {'data': x}
62 |
63 |     inf_completed, inf_rate, inf_time, batch_size = resnet_sess.run_benchmark(num_inferences=args.num_iters,
64 |                                                                                input_dict=input_dict)
65 |     print(f'inferences completed: {inf_completed}, inference rate: {inf_rate}, total time: {inf_time}, batch size: {batch_size}')
66 | if __name__ == '__main__':
67 | args = get_args()
68 | main(args)
69 |
--------------------------------------------------------------------------------
/models/language_processing/encoder/model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear
3 |
4 | import os
5 | from transformers import AutoTokenizer
6 | import numpy as np
7 | import qaic
8 |
9 | class QAicEmbeddingModel():
10 | def __init__(self, model_name='BAAI/bge-large-en-v1.5', qpc_path='./models/BAAI/bge-large-en-v1.5/compiled-bin-fp16-B1-C4-A3-OLS2-MOS1-best-throughput', device=0):
11 | self.tokenizer = AutoTokenizer.from_pretrained(model_name)
12 | self.aic_session = qaic.Session(model_path=os.path.join(qpc_path, 'programqpc.bin'), dev_id=device)
13 | self.name = model_name
14 |
15 | self.aic_session.setup()
16 |
17 | def generate(self, input):
18 | tokens = self.tokenizer(input, padding=True, return_tensors='np')
19 |
20 | input_data = {'input_ids': None,
21 | 'attention_mask': None}
22 |
23 | for k in input_data.keys():
24 | input_shape, input_type = self.aic_session.model_input_shape_dict[k]
25 |
26 | rows, cols = tokens[k].shape
27 | input_data[k] = np.zeros(input_shape, dtype=input_type)
28 | input_data[k][:rows, :cols] = tokens[k]
29 |
30 | outputs = self.aic_session.run(input_data)
31 |
32 | output_shape, output_type = self.aic_session.model_output_shape_dict['token_embeddings']
33 | token_embeddings = np.frombuffer(outputs['token_embeddings'], dtype=output_type).reshape(output_shape)
34 | token_embeddings = token_embeddings[:, 0]
35 |
36 | output_shape, output_type = self.aic_session.model_output_shape_dict['sentence_embedding']
37 | sentence_embedding = np.frombuffer(outputs['sentence_embedding'], dtype=output_type).reshape(output_shape)
38 |
39 | return token_embeddings, sentence_embedding
40 |
41 | def main():
42 | inputs_txt = 'your_text_here'
43 | model = QAicEmbeddingModel()
44 |     token_embeddings, sentence_embedding = model.generate(inputs_txt)
45 |     print('token_embeddings {}'.format(token_embeddings))
46 |     print('sentence_embedding {}'.format(sentence_embedding))
47 |
48 | if __name__ == "__main__":
49 | main()
50 |
--------------------------------------------------------------------------------
/samples/python/qaic_features/metrics_eg.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
3 | SPDX-License-Identifier: BSD-3-Clause-Clear
4 | '''
5 |
6 | import qaic
7 | import numpy as np
8 | import argparse
9 |
10 | # Establish arguments to accept
11 | def get_args():
12 |
13 | parser = argparse.ArgumentParser()
14 |
15 | parser.add_argument(
16 | "--model-path",
17 | dest='model_path',
18 | default=
19 | '/opt/qti-aic/integrations/qaic_onnxrt/tests/resnet50/resnet50-v1-12-batch.onnx',
20 | help='Pass path to qpc of this model to avoid compilation')
21 |
22 | parser.add_argument(
23 | "--config-path",
24 | dest='config_path',
25 | default=
26 | './resnet_config.yaml',
27 |         help='Path to the YAML config with compile/runtime options')
28 |
29 |     parser.add_argument(
30 |         "--input",
31 |         dest='input_img',
32 |         help='If no image is provided, random values are used as input. '
33 |              'The input image should be a raw 1x3x224x224 tensor.'
34 |     )
35 | 
36 |     parser.add_argument(
37 |         "--num_iters",
38 |         dest='num_iters',
39 |         type=int, default=1000,
40 |         help='Number of inferences to run on the model')
41 |
42 | return parser.parse_args()
43 |
44 | def main(args):
45 |
46 | resnet_sess = qaic.Session(
47 | args.model_path,
48 | options_path=args.config_path,
49 | enable_metrics=True)
50 |
51 | input_shape, input_type = resnet_sess.model_input_shape_dict['data']
52 |
53 | # Read input
54 |
55 | if args.input_img is None:
56 | x = np.random.randn(*input_shape).astype(input_type)
57 | else:
58 | img = np.fromfile(args.input_img, dtype=input_type)
59 | x = np.resize(img, input_shape)
60 |
61 | # Run inference
62 | input_dict = {'data': x}
63 |
64 | for _ in range(args.num_iters):
65 | resnet_sess.run(input_dict)
66 |
67 | print('\n\n\n\n-------------- Metrics --------------\n\n\n\n')
68 | resnet_sess.print_metrics()
69 | metrics = resnet_sess.get_metrics()
70 |
71 | if __name__ == '__main__':
72 | args = get_args()
73 | main(args)
--------------------------------------------------------------------------------
/samples/python/qaic_features/profiling_eg.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
3 | SPDX-License-Identifier: BSD-3-Clause-Clear
4 | '''
5 |
6 | import qaic
7 | import numpy as np
8 | import argparse
9 |
10 | # Establish arguments to accept
11 | def get_args():
12 |
13 | parser = argparse.ArgumentParser()
14 |
15 | parser.add_argument(
16 | "--model-path",
17 | dest='model_path',
18 | default=
19 | '/opt/qti-aic/integrations/qaic_onnxrt/tests/resnet50/resnet50-v1-12-batch.onnx',
20 | help='Pass path to qpc of this model to avoid compilation')
21 |
22 | parser.add_argument(
23 | "--config-path",
24 | dest='config_path',
25 | default=
26 | './resnet_config.yaml',
27 |         help='Path to the YAML config with compile/runtime options')
28 |
29 |     parser.add_argument(
30 |         "--input",
31 |         dest='input_img',
32 |         help='If no image is provided, random values are used as input. '
33 |              'The input image should be a raw 1x3x224x224 tensor.'
34 |     )
35 | 
36 |     parser.add_argument(
37 |         "--num_iters",
38 |         dest='num_iters',
39 |         type=int, default=1000,
40 |         help='Number of inferences to run on the model')
41 |
42 | return parser.parse_args()
43 |
44 | def main(args):
45 |
46 | resnet_sess = qaic.Session(
47 | args.model_path,
48 | options_path=args.config_path,
49 | enable_profiling=True)
50 |
51 | input_shape, input_type = resnet_sess.model_input_shape_dict['data']
52 |
53 | # Read input
54 |
55 | if args.input_img is None:
56 | x = np.random.randn(*input_shape).astype(input_type)
57 | else:
58 | img = np.fromfile(args.input_img, dtype=input_type)
59 | x = np.resize(img, input_shape)
60 |
61 | # Run inference
62 | input_dict = {'data': x}
63 |
64 | for _ in range(args.num_iters):
65 | resnet_sess.run(input_dict)
66 |
67 | print('\n\n\n\n-------------- Metrics --------------\n\n\n\n')
68 | resnet_sess.print_metrics()
69 | print('\n\n\n\n-------------- Profile Data --------------\n\n\n\n')
70 | resnet_sess.print_profile_data(n=5)
71 | metrics = resnet_sess.get_metrics()
72 |
73 | if __name__ == '__main__':
74 | args = get_args()
75 | main(args)
--------------------------------------------------------------------------------
/samples/python/common_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
3 | SPDX-License-Identifier: BSD-3-Clause-Clear
4 | '''
5 |
6 | import os
7 | import yaml
8 | import inspect
9 |
10 | def generate_bin(onnx_filename, yaml_filename):
11 | """
12 | Generate compiled binary for QAIC
13 |
14 |     Args:
15 |         onnx_filename : ONNX file name, relative to the calling script's directory.
16 |         yaml_filename : YAML file (relative to the caller) with the compile-time arguments.
17 | 
18 |     Returns:
19 |         qpc_bin : path to the folder containing the compiled binary (QPC).
20 |     """
21 |     caller_path = inspect.stack()[1].filename  # path of the script that called generate_bin
22 | onnx_path = os.path.join(os.path.dirname(caller_path), onnx_filename)
23 | yaml_path = os.path.join(os.path.dirname(caller_path), yaml_filename)
24 |
25 | filename, extension = os.path.splitext(onnx_filename)
26 | onnx_folder = os.path.dirname(onnx_path)
27 | qpc_bin = os.path.join(os.path.dirname(caller_path), filename+'_qpc')
28 | with open(yaml_path, "r") as file:
29 | yaml_data = yaml.load(file, Loader=yaml.FullLoader)
30 |
31 | if os.path.isdir(qpc_bin):
32 | print(f'INFO: Removing existing QPC {qpc_bin}')
33 | cmd = f'sudo rm -fr {qpc_bin}'
34 | os.system(cmd)
35 | print(f'INFO: Existing QPC {qpc_bin} is removed')
36 |
37 | # create the command string from the yaml arguments.
38 |     cmd_list = [f'/opt/qti-aic/exec/qaic-exec -m={onnx_path} -aic-hw -aic-hw-version=2.0']
39 |
40 |     # ignore the following arguments:
41 |     ignore = ['num-activations', 'set-size']
42 |     # underscores in YAML keys are converted to hyphens below (e.g. aic_num_cores -> aic-num-cores)
43 |
44 | for arg, value in yaml_data.items():
45 | arg = arg.replace('_','-')
46 | if arg in ignore:
47 | continue
48 | if isinstance(value, bool):
49 | if value:# include the argument only if true; for example -convert-to-fp16
50 | cmd_list.append(f'-{arg}')
51 | elif isinstance(value, dict):
52 | for subarg, subval in value.items():
53 | cmd_list.append(f'-{arg}={subarg},{subval}')
54 | else:
55 | cmd_list.append(f'-{arg}={value}')
56 |
57 | cmd_list.append(f'-aic-binary-dir={qpc_bin}')
58 |
59 | cmd = ' '.join(cmd_list)
60 | print(f'INFO: Running the compile cmd: {cmd}')
61 | os.system(cmd)
62 |
63 | return qpc_bin
64 |
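A minimal usage sketch for `generate_bin` (mirroring the call in `samples/python/vit_qaic/example.py`; the YAML keys in the comment are illustrative examples of the key-to-flag mapping performed above, not a copy of any shipped config):

```python
# Hypothetical config.yaml and the qaic-exec flags it would produce:
#   aic_num_cores: 4        ->  -aic-num-cores=4
#   convert_to_fp16: true   ->  -convert-to-fp16
#   onnx_define_symbol:
#     batch_size: 1         ->  -onnx-define-symbol=batch_size,1
from common_utils import generate_bin  # assumes the calling script sits next to common_utils.py

qpc_dir = generate_bin(onnx_filename='model.onnx', yaml_filename='config.yaml')
# qpc_dir points at <caller_dir>/model_qpc, which contains programqpc.bin
```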
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/stable-diffusion-xl-base-1.0/fix_vae_decoder_onnx.py:
--------------------------------------------------------------------------------
1 | ####################################################################################################
2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear
4 | ####################################################################################################
5 | import onnx
6 | from onnx import numpy_helper
7 | # Divide a Conv node's weight and bias by scale_factor (helps keep downstream activations within FP16 range).
8 | def scale_conv(model, conv_name, scale_factor):
9 | cnodes = [x for x in model.graph.node if x.name == conv_name]
10 | assert len(cnodes) == 1, f"Node '{conv_name}' not found"
11 | x, w, b = cnodes[0].input
12 | wi, bi = "", ""
13 | for i, init in enumerate(model.graph.initializer):
14 | if init.name == w:
15 | wi = i
16 | elif init.name == b:
17 | bi = i
18 | if wi != "" and bi != "":
19 | break
20 | else:
21 | raise ValueError(f"Cannot find indices of weight: {w} and bias: {b}")
22 | ww = numpy_helper.to_array(model.graph.initializer[wi])
23 | bb = numpy_helper.to_array(model.graph.initializer[bi])
24 | model.graph.initializer[wi].raw_data = (ww / scale_factor).tobytes()
25 | model.graph.initializer[bi].raw_data = (bb / scale_factor).tobytes()
26 |
27 |
28 | def main(model_path, scaling_factor):
29 | model = onnx.load(model_path)
30 | scale_conv(model, "/decoder/up_blocks.2/upsamplers.0/conv/Conv", scaling_factor)
31 | scale_conv(model, "/decoder/up_blocks.3/resnets.0/conv2/Conv", scaling_factor)
32 | # scale_conv(model, "/decoder/up_blocks.3/resnets.0/conv_shortcut/Conv", scaling_factor)
33 | scale_conv(model, "/decoder/up_blocks.3/resnets.1/conv2/Conv", scaling_factor)
34 | scale_conv(model, "/decoder/up_blocks.3/resnets.2/conv2/Conv", scaling_factor)
35 | output_path = model_path[:-5] + f"_fixed_{scaling_factor}.onnx"
36 | onnx.save(model, output_path)
37 |
38 |
39 | if __name__ == "__main__":
40 | import argparse
41 | argp = argparse.ArgumentParser()
42 | argp.add_argument(
43 | "--model-path",
44 | default="stabilityai/stable-diffusion-xl-base-1.0/vae_decoder/model.onnx",
45 | help="Model path to fix",
46 | )
47 | argp.add_argument("--scaling-factor", default=128, type=int, help="Scaling factor")
48 | args = argp.parse_args()
49 | main(**vars(args))
50 |
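Invocation sketch, equivalent to running the script with its default arguments (the model path is whatever VAE decoder ONNX was exported earlier in this flow):

```python
from fix_vae_decoder_onnx import main

# Writes a patched copy next to the input, e.g. .../vae_decoder/model_fixed_128.onnx
main(model_path='stabilityai/stable-diffusion-xl-base-1.0/vae_decoder/model.onnx', scaling_factor=128)
```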
--------------------------------------------------------------------------------
/models/speech/whisper/generateModel.py:
--------------------------------------------------------------------------------
1 | ####################################################################################################
2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear
4 | ####################################################################################################
5 |
6 | import os
7 | import argparse
8 | import numpy as np
9 | import torch
10 | import whisper
11 | from audio import AudioSample
12 |
13 | def main(model_name: str, output_dir: str):
14 | cache_path = './cache'
15 |
16 | audio_sample = AudioSample()
17 | audio_path = audio_sample.to_file()
18 |
19 | audio = whisper.load_audio(audio_path) # Read audio from file
20 | audio_pad = whisper.pad_or_trim(audio) # Padding and trimming
21 |
22 | # make log-Mel spectrogram and move to the same device as the model
23 | input_features = whisper.log_mel_spectrogram(audio_pad) # convert to mel spectrogram
24 | input_features = torch.unsqueeze(input_features, 0) # add batch dimension
25 |
26 | model = whisper.load_model(model_name, download_root=cache_path)
27 | audio_features = model.encoder(input_features)
28 |     decoder_input_ids = torch.tensor([[50258]])  # <|startoftranscript|> token id for multilingual Whisper models
29 |
30 | if not os.path.exists(output_dir):
31 | os.makedirs(output_dir)
32 |
33 | # Encoder model
34 | torch.onnx.export(
35 | model.encoder,
36 | (input_features),
37 | os.path.join(output_dir, 'encoder_model.onnx'),
38 | input_names=['input_features'],
39 | output_names=['last_hidden_state'],
40 | dynamic_axes={
41 | 'input_features': {0: 'batch_size', 1: 'feature_size', 2: 'encoder_sequence_length'},
42 | 'last_hidden_state': {0: 'batch_size'}
43 | }
44 | )
45 |
46 | # Decoder model
47 | torch.onnx.export(
48 | model.decoder,
49 | (decoder_input_ids, audio_features),
50 | os.path.join(output_dir, 'decoder_model.onnx'),
51 | input_names=['input_ids', 'encoder_hidden_states'],
52 | output_names=['logits'],
53 | dynamic_axes={
54 | 'input_ids': {0: 'batch_size', 1: 'decoder_sequence_length'},
55 | 'encoder_hidden_states': {0: 'batch_size', 1: 'encoder_sequence_length'},
56 | 'logits': {0: 'batch_size', 1: 'decoder_sequence_length'}
57 | }
58 | )
59 |
60 | if __name__ == '__main__':
61 | import argparse
62 |
63 | argp = argparse.ArgumentParser()
64 | argp.add_argument(
65 | '--model-name',
66 | required=True,
67 | help='Model name to generate',
68 | )
69 | argp.add_argument(
70 | '--output-dir',
71 | required=False,
72 | help='Path to store generated ONNX files',
73 | default='./'
74 | )
75 | args = argp.parse_args()
76 | main(**vars(args))
77 |
--------------------------------------------------------------------------------
/models/language_processing/decoder/DeciCoder-6b/README.md:
--------------------------------------------------------------------------------
1 | # Description
2 |
3 | [DeciCoder-6b](https://huggingface.co/Deci/DeciCoder-6b) is a decoder-only large language model (LLM) developed by [Deci AI](https://deci.ai) for code-generation tasks. The model's architecture was produced by AutoNAC, Deci AI's proprietary Neural Architecture Search technology. The model has a context length of 2048 tokens and is trained on the Python, Java, JavaScript, C++, C#, Go, and Rust subsets of the [The-Stack](https://huggingface.co/datasets/bigcode/the-stack) dataset.
4 |
5 | # Running on AIC100
6 |
7 | ## Available Compute Resources
8 | The following cloud provider instances are equipped with AIC100 accelerators.
9 |
10 |
11 |
12 | |Provider | [AWS DL2q Instance](https://aws.amazon.com/ec2/instance-types/dl2q/) | [Cirrascale Instance](https://cirrascale.com/solutions-qualcomm-cloud-ai100.php) |
13 | | --------------------- | --------------------- | -------------------------- |
14 | |Cloud-AI Accelerators | 8 Std (14 NSPs) SKUs | 1 to 8 Pro (16 NSPs) SKUs |
15 | |Supported Formats for [DeciCoder-6b](https://huggingface.co/Deci/DeciCoder-6b)| FP16 and [MX6](https://arxiv.org/abs/2302.08007) | FP16 and [MX6](https://arxiv.org/abs/2302.08007) |
16 |
17 | ## Source of the Model
18 |
19 | The model is downloaded from [HuggingFace](https://huggingface.co/Deci/DeciCoder-6b).
20 |
21 | ## Environment and Dependencies
22 | Create a Python virtual environment and activate it.
23 |
24 | ```commandline
25 | python3.10 -m venv llm_env
26 | source llm_env/bin/activate
27 | pip3 install -r requirements.txt
28 | ```
29 |
30 | Install the patched transformers dependency.
31 |
32 | ```commandline
33 | git clone --branch v4.35.2 --depth 1 https://github.com/huggingface/transformers transformers-dev
34 | cd transformers-dev
35 | git apply ../Llama2_4.35.2.patch
36 | pip3 install .
37 | cd ..
38 | ```
39 |
40 | ## Model and Hardware Parameters
41 | Customize the model repo/name and the compilation parameters in `init.sh`. By default the model is compiled with MX6 compression; set MX="" to disable it. BS, PL, and CL are the batch size, prompt length, and context length, respectively.
42 |
43 | ```commandline
44 | source init.sh
45 | ```
46 |
47 | ## Model Generation
48 | Export the model to ONNX format.
49 |
50 | ```commandline
51 | python generateModel.py --model-name ${MODEL_REPO}/${MODEL_NAME} --model-class LlamaForCausalLM
52 | ```
53 |
54 | ## Model Compilation for AIC100
55 | Compile the ONNX model into a QPC binary. Modify BS, PL, CL, CORES, and MX if needed.
56 |
57 | ```commandline
58 | bash compileModel.sh $MODEL_NAME $BS $PL $CL $CORES $MX
59 | ```
60 |
61 | ## Model Execution on AIC100
62 | Run the compiled model binary on AIC100. Modify DEVICE_ID if needed. Run `/opt/qti-aic/tools/qaic-util -q` to check available devices.
63 |
64 | ```commandline
65 | export PROMPT="insert your prompt here"
66 | export DEVICE_ID=0
67 | python runModel.py --model-name ${MODEL_REPO}/${MODEL_NAME} --qpc ./qpc/${MODEL_NAME}-kv-${PL}pl-${CL}cl-${CORES}c${MX} --device_id $DEVICE_ID --prompt "${PROMPT}"
68 | ```
69 |
70 | ## References
71 | - [Shared Micro-exponents](https://arxiv.org/abs/2302.08007)
72 |
73 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/stable-diffusion-3.5-medium/onnx_generation/onnx_gen_utils.py:
--------------------------------------------------------------------------------
1 | ####################################################################################################
2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear
4 | ####################################################################################################
5 | from packaging import version
6 | import torch
7 | import onnx
8 | from onnx import external_data_helper, numpy_helper
9 | import numpy as np
10 | from pathlib import Path
11 |
12 | is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11")
13 |
14 |
15 | def fix_onnx_fp16(
16 | gen_models_path: str,
17 | ) -> str:
18 | finfo = np.finfo(np.float16)
19 | fp16_max = finfo.max
20 | fp16_min = finfo.min
21 | model = onnx.load(f"{gen_models_path}/model.onnx")
22 | fp16_fix = False
23 | for tensor in external_data_helper._get_all_tensors(model):
24 | nptensor = numpy_helper.to_array(tensor, gen_models_path)
25 | if nptensor.dtype == np.float32 and (
26 | np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min)
27 | ):
28 | # print(f'tensor value : {nptensor} above {fp16_max} or below {fp16_min}')
29 | nptensor = np.clip(nptensor, fp16_min, fp16_max)
30 | new_tensor = numpy_helper.from_array(nptensor, tensor.name)
31 | tensor.CopyFrom(new_tensor)
32 | fp16_fix = True
33 |
34 |
35 | if fp16_fix:
36 | # Save FP16 model
37 | print("Found constants out of FP16 range, clipped to FP16 range")
38 |         onnx.save(model, f=f"{gen_models_path}/model_fp16.onnx")
39 | print(f"Saving modified onnx file at {gen_models_path}/model_fp16.onnx")
40 |
41 |
42 | def onnx_export(
43 | model,
44 | model_args: tuple,
45 | output_path: Path,
46 | ordered_input_names,
47 | output_names,
48 | dynamic_axes,
49 | opset,
50 | use_external_data_format=False,
51 | ):
52 | output_path.parent.mkdir(parents=True, exist_ok=True)
53 | # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11,
54 | # so we check the torch version for backwards compatibility
55 | if is_torch_less_than_1_11:
56 | torch.onnx.export(
57 | model,
58 | model_args,
59 | f=output_path.as_posix(),
60 | input_names=ordered_input_names,
61 | output_names=output_names,
62 | dynamic_axes=dynamic_axes,
63 | do_constant_folding=True,
64 | use_external_data_format=use_external_data_format,
65 | enable_onnx_checker=True,
66 | opset_version=opset,
67 | )
68 | else:
69 | torch.onnx.export(
70 | model,
71 | model_args,
72 | f=output_path.as_posix(),
73 | input_names=ordered_input_names,
74 | output_names=output_names,
75 | dynamic_axes=dynamic_axes,
76 | do_constant_folding=True,
77 | opset_version=opset,
78 | )
79 |
80 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_turbo/run_config_gen.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause-Clear
5 |
6 | # model configs
7 | MODEL_PATH="stabilityai/sdxl-turbo"
8 | PROMPT="\"photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece\""
9 | VAE_TYPE="vae"
10 | IMAGE_SIZE=512
11 | BLOCK_SIZE=256
12 | BATCH_SIZE=1
13 |
14 | # onnx configs
15 | GENERATE_ONNX=true
16 | ONNX_TEXT_ENCODER=true
17 | ONNX_UNET=true
18 | ONNX_VAE=true
19 |
20 | # compile configs
21 | NUM_CORES=16
22 | VAE_MOS=2
23 | VAE_OLS=1
24 | UNET_MOS=2
25 | UNET_OLS=1
26 | COMPILE_TEXT_ENCODER=true
27 | COMPILE_UNET=true
28 | COMPILE_VAE=true
29 |
30 | # inference configs
31 | RUN_ONLY=false
32 | DEVICE=0
33 | NUM_STEPS=1
34 | WARMUP_ITERS=3
35 | REPEAT_ITERS=1
36 |
37 | # mode
38 | TOGETHER=false
39 |
40 | if [ ${GENERATE_ONNX} == true ]
41 | then
42 | GENERATE_ONNX_CMD="--generate-onnx"
43 | else
44 | GENERATE_ONNX_CMD=""
45 | fi
46 |
47 | if [ ${ONNX_TEXT_ENCODER} == true ]
48 | then
49 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder"
50 | else
51 | ONNX_TEXT_ENCODER_CMD=""
52 | fi
53 |
54 | if [ ${ONNX_UNET} == true ]
55 | then
56 | ONNX_UNET_CMD="--onnx-unet"
57 | else
58 | ONNX_UNET_CMD=""
59 | fi
60 |
61 | if [ ${ONNX_VAE} == true ]
62 | then
63 | ONNX_VAE_CMD="--onnx-vae"
64 | else
65 | ONNX_VAE_CMD=""
66 | fi
67 |
68 | if [ ${COMPILE_TEXT_ENCODER} == true ]
69 | then
70 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder"
71 | else
72 | COMPILE_TEXT_ENCODER_CMD=""
73 | fi
74 |
75 | if [ ${COMPILE_UNET} == true ]
76 | then
77 | COMPILE_UNET_CMD="--compile-unet"
78 | else
79 | COMPILE_UNET_CMD=""
80 | fi
81 |
82 | if [ ${COMPILE_VAE} == true ]
83 | then
84 | COMPILE_VAE_CMD="--compile-vae"
85 | else
86 | COMPILE_VAE_CMD=""
87 | fi
88 |
89 | if [ ${RUN_ONLY} == true ]
90 | then
91 | RUN_ONLY_CMD="--run-only"
92 | else
93 | RUN_ONLY_CMD=""
94 | fi
95 |
96 | if [ ${TOGETHER} == true ]
97 | then
98 | TOGETHER_CMD="--together"
99 | else
100 | TOGETHER_CMD=""
101 | fi
102 |
103 | export HF_HOME="cache"
104 |
105 | rm -f run.sh
106 |
107 | scripts="python main.py \
108 | --model-path $MODEL_PATH \
109 | --prompt $PROMPT \
110 | --vae-type $VAE_TYPE \
111 | --batch-size $BATCH_SIZE \
112 | --image-size $IMAGE_SIZE \
113 | --block-size $BLOCK_SIZE \
114 | --num-cores $NUM_CORES \
115 | --vae-mos $VAE_MOS \
116 | --vae-ols $VAE_OLS \
117 | --unet-mos $UNET_MOS \
118 | --unet-ols $UNET_OLS \
119 | --device $DEVICE \
120 | --num-steps $NUM_STEPS \
121 | --num-warmup-iters $WARMUP_ITERS \
122 | --num-repeat-iters $REPEAT_ITERS \
123 | $ONNX_TEXT_ENCODER_CMD \
124 | $ONNX_UNET_CMD \
125 | $ONNX_VAE_CMD \
126 | $COMPILE_TEXT_ENCODER_CMD \
127 | $COMPILE_UNET_CMD \
128 | $COMPILE_VAE_CMD \
129 | $GENERATE_ONNX_CMD \
130 | $RUN_ONLY_CMD \
131 | $TOGETHER_CMD"
132 |
133 | echo $scripts >> run.sh
134 |
135 | bash run.sh
136 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_turbo/run_config_inference.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause-Clear
5 |
6 | PYTHON=${1:-python3}  # python interpreter to use; defaults to python3 when no argument is given
7 | echo $PYTHON
8 |
9 | # model configs
10 | MODEL_PATH="stabilityai/sdxl-turbo"
11 | PROMPT="\"photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece\""
12 | VAE_TYPE="vae"
13 | IMAGE_SIZE=512
14 | BLOCK_SIZE=256
15 | BATCH_SIZE=1
16 |
17 | # onnx configs
18 | GENERATE_ONNX=false
19 | ONNX_TEXT_ENCODER=false
20 | ONNX_UNET=false
21 | ONNX_VAE=false
22 |
23 | # compile configs
24 | NUM_CORES=16
25 | VAE_MOS=2
26 | VAE_OLS=1
27 | UNET_MOS=2
28 | UNET_OLS=1
29 | COMPILE_TEXT_ENCODER=false
30 | COMPILE_UNET=false
31 | COMPILE_VAE=false
32 |
33 | # inference configs
34 | RUN_ONLY=true
35 | DEVICE=0
36 | NUM_STEPS=1
37 | WARMUP_ITERS=3
38 | REPEAT_ITERS=3
39 |
40 | # mode
41 | TOGETHER=false
42 |
43 | if [ ${GENERATE_ONNX} == true ]
44 | then
45 | GENERATE_ONNX_CMD="--generate-onnx"
46 | else
47 | GENERATE_ONNX_CMD=""
48 | fi
49 |
50 | if [ ${ONNX_TEXT_ENCODER} == true ]
51 | then
52 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder"
53 | else
54 | ONNX_TEXT_ENCODER_CMD=""
55 | fi
56 |
57 | if [ ${ONNX_UNET} == true ]
58 | then
59 | ONNX_UNET_CMD="--onnx-unet"
60 | else
61 | ONNX_UNET_CMD=""
62 | fi
63 |
64 | if [ ${ONNX_VAE} == true ]
65 | then
66 | ONNX_VAE_CMD="--onnx-vae"
67 | else
68 | ONNX_VAE_CMD=""
69 | fi
70 |
71 | if [ ${COMPILE_TEXT_ENCODER} == true ]
72 | then
73 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder"
74 | else
75 | COMPILE_TEXT_ENCODER_CMD=""
76 | fi
77 |
78 | if [ ${COMPILE_UNET} == true ]
79 | then
80 | COMPILE_UNET_CMD="--compile-unet"
81 | else
82 | COMPILE_UNET_CMD=""
83 | fi
84 |
85 | if [ ${COMPILE_VAE} == true ]
86 | then
87 | COMPILE_VAE_CMD="--compile-vae"
88 | else
89 | COMPILE_VAE_CMD=""
90 | fi
91 |
92 | if [ ${RUN_ONLY} == true ]
93 | then
94 | RUN_ONLY_CMD="--run-only"
95 | else
96 | RUN_ONLY_CMD=""
97 | fi
98 |
99 | if [ ${TOGETHER} == true ]
100 | then
101 | TOGETHER_CMD="--together"
102 | else
103 | TOGETHER_CMD=""
104 | fi
105 |
106 | export HF_HOME="cache"
107 |
108 | rm -f run.sh
109 |
110 | scripts="$PYTHON main.py \
111 | --model-path $MODEL_PATH \
112 | --prompt $PROMPT \
113 | --vae-type $VAE_TYPE \
114 | --batch-size $BATCH_SIZE \
115 | --image-size $IMAGE_SIZE \
116 | --block-size $BLOCK_SIZE \
117 | --num-cores $NUM_CORES \
118 | --vae-mos $VAE_MOS \
119 | --vae-ols $VAE_OLS \
120 | --unet-mos $UNET_MOS \
121 | --unet-ols $UNET_OLS \
122 | --device $DEVICE \
123 | --num-steps $NUM_STEPS \
124 | --num-warmup-iters $WARMUP_ITERS \
125 | --num-repeat-iters $REPEAT_ITERS \
126 | $ONNX_TEXT_ENCODER_CMD \
127 | $ONNX_UNET_CMD \
128 | $ONNX_VAE_CMD \
129 | $COMPILE_TEXT_ENCODER_CMD \
130 | $COMPILE_UNET_CMD \
131 | $COMPILE_VAE_CMD \
132 | $GENERATE_ONNX_CMD \
133 | $RUN_ONLY_CMD \
134 | $TOGETHER_CMD"
135 |
136 | echo $scripts >> run.sh
137 |
138 | bash run.sh
139 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/stable-diffusion-3.5-medium/README.md:
--------------------------------------------------------------------------------
1 | ### Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
2 | ### SPDX-License-Identifier: BSD-3-Clause-Clear
3 |
4 | # Instructions to run SD3.5 on Cloud AI 100
5 |
6 | The instructions below are to run the [Stable Diffusion 3.5 model](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium) on Cloud AI 100. Compile-time parameters may need to be adjusted for different cards and SDK versions.
7 |
8 | ## 1. Download model
9 |
10 | 1. Set up environment variables
11 | ```
12 | mkdir cache
13 | export HF_HOME=cache
14 | export HF_TOKEN=
15 | ```
16 |
17 | 2. Follow [instructions on HuggingFace](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium) to gain access to model.
18 |
19 | ## 2. Generate ONNX files and compile binaries
20 |
21 | 1. Set up a virtual environment for ONNX generation and compilation
22 | ```
23 | python3.10 -m venv env_onnx
24 | source ./env_onnx/bin/activate
25 | pip install -r requirements.txt
26 | ```
27 |
28 | 2. Create working folders for compile logs and compiled binaries (QPCs), and an empty run script
29 | ```
30 | mkdir compile_logs
31 | mkdir qpc
32 | touch run.sh
33 | ```
34 |
35 | 3. Install diffusers from source after patching for ONNX file generation
36 | ```
37 | git clone --depth 1 --branch v0.31.0 https://github.com/huggingface/diffusers.git diffusers-onnx
38 | cd diffusers-onnx
39 | git apply --reject --whitespace=fix ../patches/attention_patch.patch
40 | pip install .
41 | cd ..
42 | ```
43 |
44 | 4. Install transformers from source (for T5 text_encoder_3 only)
45 | ```
46 | git clone -b v4.41.2 https://github.com/huggingface/transformers.git transformers-dev
47 | cd transformers-dev
48 | git apply --reject --whitespace=fix ../patches/transformer_patch.patch
49 | pip install .
50 | cd ..
51 | ```
52 |
53 | 5. Generate ONNX files and model binaries
54 | ```
55 | bash run_config_gen.sh
56 | ```
57 |
58 | ## 3. Run the end-to-end SD3 inference
59 |
60 | 1. Set up a separate virtual environment
61 | ```
62 | python3.10 -m venv env_pipeline
63 | source ./env_pipeline/bin/activate
64 | pip install -r requirements.txt
65 | pip install --force-reinstall /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl
66 | ```
67 |
68 | 2. Re-install diffusers from source after patching the SD3 pipeline for inference
69 | ```
70 | git clone --depth 1 --branch v0.31.0 https://github.com/huggingface/diffusers.git diffusers-pipeline
71 | cd diffusers-pipeline
72 | git apply --reject --whitespace=fix ../patches/pipeline_patch.patch
73 | pip install .
74 | cd ..
75 | ```
76 |
77 | 3. Run the inference, using 'sudo' if needed to access the AI 100 devices.
78 | ```
79 | sudo bash run_config_inference.sh "" ""
80 | ```
81 |
82 | ## 4. Python interface
83 |
84 | ```
85 | source ./env_pipeline/bin/activate
86 | ```
87 |
88 | ```python
89 | from model import QAICStableDiffusion3
90 |
91 | model = QAICStableDiffusion3()
92 | prompt = 'photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece'
93 | image = model.generate(prompt, guidance=7.0)[0]
94 | image.save('harbor.png')
95 | ```
96 |
97 |
--------------------------------------------------------------------------------
/samples/python/qaic_features/README.md:
--------------------------------------------------------------------------------
1 | # Python High-Level API (qaic) features
2 |
3 | qaic_features contains examples showing how to use the different features provided by the qaic module while running inferences.
4 |
5 | a) Metrics
6 | After running inferences on the AIC100 chip, you can use the get_metrics method to obtain inference-time statistics as follows:
7 |
8 | ```python
9 | #Create Session with enable_metrics = True
10 | session = qaic.Session(
11 | model_path,
12 | options_path=yaml_config_path,
13 | enable_metrics=True)
14 |
15 | #Create input dictionary; x is a numpy array matching the model's input shape and dtype
16 | input_dict = {'data': x}
17 |
18 | #Run Inferences
19 | for i in range(100):
20 | session.run(input_dict)
21 |
22 | #Get Metrics
23 | session.print_metrics()
24 | metrics = session.get_metrics()
25 | ```
26 |
27 | Sample output for session.print_metrics()
28 |
29 | ```bash
30 | Number of inferences utilized for calculation are 999
31 | Minimum latency observed 0.0009578340000000001 s
32 | Maximum latency observed 0.002209001 s
33 | Average latency / inference time observed is 0.0012380756316316324 s
34 | P25 / 25% of inferences observed latency less than 0.001095435 s
35 | P50 / 50% of inferences observed latency less than 0.0012522870000000001 s
36 | P75 / 75% of inferences observed latency less than 0.001299786 s
37 | P90 / 90% of inferences observed latency less than 0.002209001 s
38 | P99 / 99% of inferences observed latency less than 0.0016082370000000002 s
39 | Sum of all the inference times 1.2368375560000007 s
40 | Average latency / inference time observed is 0.0012380756316316324 s
41 | ```
42 |
43 |
44 |
45 |
46 | b) Profiling
47 | To profile the inferences performed on the AIC100 chip and get inference-time statistics, you can use the following methods:
48 |
49 | ```python
50 | #Create Session with enable_profiling = True
51 | session = qaic.Session(
52 | model_path,
53 | options_path=yaml_config_path,
54 | enable_profiling=True)
55 |
56 | #Create input dictionary; x is a numpy array matching the model's input shape and dtype
57 | input_dict = {'data': x}
58 |
59 | #Run Inferences
60 | for i in range(100):
61 | session.run(input_dict)
62 |
63 | #Get Metrics
64 | session.print_metrics()
65 | metrics = session.get_metrics()
66 | session.print_profile_data(n=5)
67 | ```
68 |
69 | Sample output for session.print_profile_data()
70 |
71 | ```bash
72 | | File-Line-Function | | num calls | | func time | | tot time |
73 |
74 | ('~', 0, "") 1 0.000149101 0.000149101
75 |
76 | ('~', 0, '') 1 2.38e-06 2.38e-06
77 |
78 | ('~', 0, '') 1 4.22e-06 4.22e-06
79 | ```
80 |
81 |
82 |
83 |
84 | c) Benchmarking
85 | To benchmark model inferences on the AIC100 chip, the following method can be used:
86 |
87 | ```python
88 | #Create Session
89 | session = qaic.Session(
90 | model_path,
91 | options_path=yaml_config_path)
92 |
93 | #Create input dictionary
94 | # x: numpy array matching the model's input shape and dtype
95 | input_dict = {'data': x}
96 | 
97 | # Run Benchmarking
98 | inf_completed, inf_rate, inf_time, batch_size = session.run_benchmark(input_dict=input_dict)
99 | 
100 | ```
101 |
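The number of inferences can also be set explicitly, as done in `benchmarking_eg.py` in this folder:

```python
inf_completed, inf_rate, inf_time, batch_size = session.run_benchmark(num_inferences=1000,
                                                                      input_dict=input_dict)
```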
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_turbo/onnx_generation/onnx_gen_utils.py:
--------------------------------------------------------------------------------
1 | ####################################################################################################
2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear
4 | ####################################################################################################
5 | from packaging import version
6 | import torch
7 | import onnx
8 | from onnx import external_data_helper, numpy_helper
9 | import numpy as np
10 | from pathlib import Path
11 |
12 | is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11")
13 |
14 |
15 | def fix_onnx_fp16(
16 | gen_models_path: str,
17 | model_base_name: str,
18 | ) -> str:
19 | finfo = np.finfo(np.float16)
20 | fp16_max = finfo.max
21 | fp16_min = finfo.min
22 | model = onnx.load(f"{gen_models_path}/{model_base_name}")
23 | fp16_fix = False
24 | for tensor in external_data_helper._get_all_tensors(model):
25 | nptensor = numpy_helper.to_array(tensor, gen_models_path)
26 | if nptensor.dtype == np.float32 and (
27 | np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min)
28 | ):
29 | # print(f'tensor value : {nptensor} above {fp16_max} or below {fp16_min}')
30 | nptensor = np.clip(nptensor, fp16_min, fp16_max)
31 | new_tensor = numpy_helper.from_array(nptensor, tensor.name)
32 | tensor.CopyFrom(new_tensor)
33 | fp16_fix = True
34 |
35 |
36 | if fp16_fix:
37 | # Save FP16 model
38 | print("Found constants out of FP16 range, clipped to FP16 range")
39 | model_base_name += "_fix_outofrange_fp16"
40 | onnx.save(model, f=f"{gen_models_path}/{model_base_name}")
41 | print(f"Saving modified onnx file at {gen_models_path}/{model_base_name}")
42 | return model_base_name
43 |
44 |
45 | def onnx_export(
46 | model,
47 | model_args: tuple,
48 | output_path: Path,
49 | ordered_input_names,
50 | output_names,
51 | dynamic_axes,
52 | opset,
53 | use_external_data_format=False,
54 | ):
55 | output_path.parent.mkdir(parents=True, exist_ok=True)
56 | # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11,
57 | # so we check the torch version for backwards compatibility
58 | if is_torch_less_than_1_11:
59 | torch.onnx.export(
60 | model,
61 | model_args,
62 | f=output_path.as_posix(),
63 | input_names=ordered_input_names,
64 | output_names=output_names,
65 | dynamic_axes=dynamic_axes,
66 | do_constant_folding=True,
67 | use_external_data_format=use_external_data_format,
68 | enable_onnx_checker=True,
69 | opset_version=opset,
70 | )
71 | else:
72 | torch.onnx.export(
73 | model,
74 | model_args,
75 | f=output_path.as_posix(),
76 | input_names=ordered_input_names,
77 | output_names=output_names,
78 | dynamic_axes=dynamic_axes,
79 | do_constant_folding=True,
80 | opset_version=opset,
81 | )
82 |
83 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_deepcache/onnx_generation/onnx_gen_utils.py:
--------------------------------------------------------------------------------
1 | ####################################################################################################
2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear
4 | ####################################################################################################
5 | from packaging import version
6 | import torch
7 | import onnx
8 | from onnx import external_data_helper, numpy_helper
9 | import numpy as np
10 | from pathlib import Path
11 |
12 | is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11")
13 |
14 |
15 | def fix_onnx_fp16(
16 | gen_models_path: str,
17 | model_base_name: str,
18 | ) -> str:
19 | finfo = np.finfo(np.float16)
20 | fp16_max = finfo.max
21 | fp16_min = finfo.min
22 | model = onnx.load(f"{gen_models_path}/{model_base_name}")
23 | fp16_fix = False
24 | for tensor in external_data_helper._get_all_tensors(model):
25 | nptensor = numpy_helper.to_array(tensor, gen_models_path)
26 | if nptensor.dtype == np.float32 and (
27 | np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min)
28 | ):
29 | # print(f'tensor value : {nptensor} above {fp16_max} or below {fp16_min}')
30 | nptensor = np.clip(nptensor, fp16_min, fp16_max)
31 | new_tensor = numpy_helper.from_array(nptensor, tensor.name)
32 | tensor.CopyFrom(new_tensor)
33 | fp16_fix = True
34 |
35 |
36 | if fp16_fix:
37 | # Save FP16 model
38 | print("Found constants out of FP16 range, clipped to FP16 range")
39 | model_base_name += "_fix_outofrange_fp16"
40 | onnx.save(model, f=f"{gen_models_path}/{model_base_name}")
41 | print(f"Saving modified onnx file at {gen_models_path}/{model_base_name}")
42 | return model_base_name
43 |
44 |
45 | def onnx_export(
46 | model,
47 | model_args: tuple,
48 | output_path: Path,
49 | ordered_input_names,
50 | output_names,
51 | dynamic_axes,
52 | opset,
53 | use_external_data_format=False,
54 | ):
55 | output_path.parent.mkdir(parents=True, exist_ok=True)
56 | # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11,
57 | # so we check the torch version for backwards compatibility
58 | if is_torch_less_than_1_11:
59 | torch.onnx.export(
60 | model,
61 | model_args,
62 | f=output_path.as_posix(),
63 | input_names=ordered_input_names,
64 | output_names=output_names,
65 | dynamic_axes=dynamic_axes,
66 | do_constant_folding=True,
67 | use_external_data_format=use_external_data_format,
68 | enable_onnx_checker=True,
69 | opset_version=opset,
70 | )
71 | else:
72 | torch.onnx.export(
73 | model,
74 | model_args,
75 | f=output_path.as_posix(),
76 | input_names=ordered_input_names,
77 | output_names=output_names,
78 | dynamic_axes=dynamic_axes,
79 | do_constant_folding=True,
80 | opset_version=opset,
81 | )
82 |
83 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/stable-diffusion-xl-base-1.0/onnx_gen_utils.py:
--------------------------------------------------------------------------------
1 | ####################################################################################################
2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear
4 | ####################################################################################################
5 | from packaging import version
6 | import torch
7 | import onnx
8 | from onnx import external_data_helper, numpy_helper
9 | import numpy as np
10 | from pathlib import Path
11 |
12 | is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11")
13 |
14 |
15 | def fix_onnx_fp16(
16 | gen_models_path: str,
17 | model_base_name: str,
18 | ) -> str:
19 | finfo = np.finfo(np.float16)
20 | fp16_max = finfo.max
21 | fp16_min = finfo.min
22 | model = onnx.load(f"{gen_models_path}/{model_base_name}")
23 | fp16_fix = False
24 | for tensor in external_data_helper._get_all_tensors(model):
25 | nptensor = numpy_helper.to_array(tensor, gen_models_path)
26 | if nptensor.dtype == np.float32 and (
27 | np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min)
28 | ):
29 | # print(f'tensor value : {nptensor} above {fp16_max} or below {fp16_min}')
30 | nptensor = np.clip(nptensor, fp16_min, fp16_max)
31 | new_tensor = numpy_helper.from_array(nptensor, tensor.name)
32 | tensor.CopyFrom(new_tensor)
33 | fp16_fix = True
34 |
35 |
36 | if fp16_fix:
37 | # Save FP16 model
38 | print("Found constants out of FP16 range, clipped to FP16 range")
39 | model_base_name += "_fix_outofrange_fp16"
40 | onnx.save(model, f=f"{gen_models_path}/{model_base_name}")
41 | print(f"Saving modified onnx file at {gen_models_path}/{model_base_name}")
42 | return model_base_name
43 |
44 |
45 | def onnx_export(
46 | model,
47 | model_args: tuple,
48 | output_path: Path,
49 | ordered_input_names,
50 | output_names,
51 | dynamic_axes,
52 | opset,
53 | use_external_data_format=False,
54 | ):
55 | output_path.parent.mkdir(parents=True, exist_ok=True)
56 | # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11,
57 | # so we check the torch version for backwards compatibility
58 | if is_torch_less_than_1_11:
59 | torch.onnx.export(
60 | model,
61 | model_args,
62 | f=output_path.as_posix(),
63 | input_names=ordered_input_names,
64 | output_names=output_names,
65 | dynamic_axes=dynamic_axes,
66 | do_constant_folding=True,
67 | use_external_data_format=use_external_data_format,
68 | enable_onnx_checker=True,
69 | opset_version=opset,
70 | )
71 | else:
72 | torch.onnx.export(
73 | model,
74 | model_args,
75 | f=output_path.as_posix(),
76 | input_names=ordered_input_names,
77 | output_names=output_names,
78 | dynamic_axes=dynamic_axes,
79 | do_constant_folding=True,
80 | opset_version=opset,
81 | )
82 |
83 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_turbo/model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear
3 |
4 | import asyncio
5 | import os
6 | import torch
7 |
8 | from diffusers import AutoPipelineForText2Image
9 |
10 | class QAICStableDiffusion:
11 | def __init__(self, model_id = 'stabilityai/sdxl-turbo', device_id=0):
12 | text_encoder = './qpc/text_encoder_256b_512i_16c_1b/programqpc.bin'
13 | unet = './qpc/unet_256b_512i_16c_1b_2m_1o/programqpc.bin'
14 | text_encoder_2 = './qpc/text_encoder_2_256b_512i_16c_1b/programqpc.bin'
15 | sdxl_vae_decoder = './qpc/vae_decoder_256b_512i_vae_16c_1b_2m_1o/programqpc.bin'
16 |
17 | # check the QPCs
18 | unet_qpc = unet if unet.endswith('programqpc.bin') else os.path.join(unet,'programqpc.bin')
19 | assert os.path.isfile(unet_qpc), f"Could not find binary {unet_qpc = }!"
20 | vae_decoder_sdxl_qpc = sdxl_vae_decoder if sdxl_vae_decoder.endswith('programqpc.bin') else os.path.join(sdxl_vae_decoder,'programqpc.bin')
21 | assert os.path.isfile(vae_decoder_sdxl_qpc), f"Could not find binary {vae_decoder_sdxl_qpc = }!"
22 | text_encoder_qpc = text_encoder if text_encoder.endswith('programqpc.bin') else os.path.join(text_encoder,'programqpc.bin')
23 | assert os.path.isfile(text_encoder_qpc), f"Could not find binary {text_encoder_qpc = }!"
24 | text_encoder_2_qpc = text_encoder_2 if text_encoder_2.endswith('programqpc.bin') else os.path.join(text_encoder_2,'programqpc.bin')
25 | assert os.path.isfile(text_encoder_2_qpc), f"Could not find binary {text_encoder_2_qpc = }!"
26 |
27 | self.num_steps = 1
28 | self.vae_type = "vae"
29 |
30 | # load the latents
31 | self.latents = None
32 |
33 | # load the model pipeline
34 | self.pipe = AutoPipelineForText2Image.from_pretrained(model_id, torch_dtype=torch.float16, variant="fp16",
35 | device_id=device_id,
36 | unet_qpc=unet_qpc,
37 | vae_decoder_qpc=vae_decoder_sdxl_qpc,
38 | text_encoder_qpc=text_encoder_qpc,
39 | text_encoder_2_qpc=text_encoder_2_qpc)
40 |
41 | async def generate(self, prompt, n, image_size):
42 | height, width = image_size[0], image_size[1]
43 | images = self.pipe(prompt=prompt,
44 | num_inference_steps=self.num_steps,
45 | height=height,
46 | width=width,
47 | latents=self.latents,
48 | vae_type=self.vae_type,
49 | guidance_scale=0.0).images
50 |
51 | yield images[0]
52 |
53 | async def main():
54 | model = QAICStableDiffusion()
55 | prompt = 'photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece'
56 | idx = 0
57 | async for image in model.generate(prompt, 1, (512, 512)):
58 | image.save('generated_image_{}.png'.format(idx))
59 | idx += 1
60 |
61 | if __name__ == "__main__":
62 | asyncio.run(main())
63 |
64 |
--------------------------------------------------------------------------------
/samples/python/vit_qaic/example.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
3 | SPDX-License-Identifier: BSD-3-Clause-Clear
4 | '''
5 |
6 | import sys
7 | sys.path.append("/opt/qti-aic/examples/apps/qaic-python-sdk")
8 | import qaic
9 | import numpy as np
10 | import torchvision
11 | import torch
12 | import pandas as pd
13 | import os
14 | sys.path.insert(1, os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir))
15 | from common_utils import generate_bin
16 | from transformers import ViTImageProcessor, ViTForImageClassification
17 | from PIL import Image
18 | import requests
19 | import onnx
20 | from onnxsim import simplify
21 |
22 | image_size = 224
23 |
24 | model_name = f'vit-base-patch16-{image_size}'
25 |
26 | # Import the model
27 | model = ViTForImageClassification.from_pretrained(f'google/{model_name}')
28 | onnx_filename = f'{model_name}.onnx'
29 |
30 | # Export the PyTorch model to ONNX
31 | dummy_input = torch.randn(1, 3, image_size, image_size).type(torch.FloatTensor)
32 | torch.onnx.export(model, # PyTorch model
33 | dummy_input, # Input tensor
34 | onnx_filename, # Output file
35 | export_params=True, # Export the model parameters
36 | opset_version=11, # ONNX opset version
37 | do_constant_folding=True, # Fold constant values for optimization
38 | input_names=['image'], # Input tensor names
39 | output_names=['output'], # Output tensor names
40 | dynamic_axes={'image': {0: 'batch_size'}, # Dynamic axes
41 | 'output': {0: 'batch_size'}})
42 |
43 | # apply onnxsim (optional)
44 | onnx_model = onnx.load(onnx_filename)
45 | onnx_model_simp, check = simplify(onnx_model)
46 | onnx.save(onnx_model_simp, onnx_filename)
47 | print("ONNX model saved at: ", onnx_filename)
48 |
49 | # Generate the compiled QAIC binary (QPC) using the shared helper library.
50 | qpcPath = generate_bin(onnx_filename = onnx_filename , yaml_filename ='./vit_config.yaml') # return path to the folder containing compiled binary.
51 |
52 | # Load the compiled model into a session on the device
53 | vit_sess = qaic.Session(model_path= qpcPath+'/programqpc.bin', options_path='./vit_config.yaml')
54 | vit_sess.setup()
55 | input_shape, input_type = vit_sess.model_input_shape_dict['image']
56 | output_shape, output_type = vit_sess.model_output_shape_dict['output']
57 |
58 | processor = ViTImageProcessor.from_pretrained(f'google/{model_name}')
59 |
60 | # input sample
61 | url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
62 | image = Image.open(requests.get(url, stream=True).raw)
63 | inputs = processor(images=image, return_tensors="pt")
64 |
65 | device = True  # set to False to run the reference model on CPU instead of Cloud AI 100
66 | if device:
67 | print("INFO: running inference on Qualcomm Cloud AI 100")
68 | input_data = inputs['pixel_values'].numpy().astype(input_type)
69 | input_dict = {'image': input_data}
70 | output = vit_sess.run(input_dict)
71 | logits = np.frombuffer(output['output'], dtype=output_type).reshape(output_shape) # dtype to be modified based on given model
72 | else:
73 | print("INFO: running inference on CPU")
74 | outputs = model(**inputs)
75 | logits = outputs.logits
76 |
77 | predicted_class_idx = logits.argmax(-1).item()
78 | print("Predicted class:", model.config.id2label[predicted_class_idx])
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_turbo/server.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear
3 |
4 | from contextlib import asynccontextmanager
5 | from fastapi import FastAPI, HTTPException, Request
6 | from typing import Optional
7 | from pydantic import BaseModel
8 | import time
9 | import base64
10 | 
11 |
12 | from io import BytesIO
13 |
14 | from model import QAICStableDiffusion
15 |
16 | @asynccontextmanager
17 | async def lifespan(app: FastAPI):
18 | # Code to run before the application starts
19 | print("Application startup")
20 |
21 | app.model = QAICStableDiffusion(device_id=args.device)
22 |
23 | yield
24 | # Code to run when the application shuts down
25 | print("Application shutdown")
26 |
27 | app = FastAPI(lifespan=lifespan)
28 |
29 | class ImageRequest(BaseModel):
30 | model: str
31 | prompt: str
32 | n: Optional[int] = 1
33 | size: Optional[str] = '512x512'
34 | response_format: Optional[str] = 'b64_json'
35 |
36 | @app.get("/v1/models")
37 | async def get_models():
38 | try:
39 | response = {
40 | "object": "list",
41 | "data": [
42 | {
43 | "id": "sdxl-turbo",
44 | "object": "model",
45 | "created": 1746296172,
46 | "owned_by": "system"
47 | }
48 | ],
49 | }
50 |
51 | return {"response": response}
52 | except Exception as e:
53 | raise HTTPException(status_code=500, detail=str(e))
54 |
55 | @app.post("/v1/images/generations")
56 | async def generate_images(image_request: ImageRequest):
57 | print(image_request)
58 | utc_seconds = time.time()
59 |
60 | size = [int(dim) for dim in image_request.size.split('x')]
61 |
62 | try:
63 | async for image in app.model.generate(image_request.prompt,
64 | image_request.n,
65 | size):
66 | buffered = BytesIO()
67 | image.save(buffered, format='PNG')
68 | b64_json = base64.b64encode(buffered.getvalue()).decode()
69 |
70 | response = {
71 | "created": int(utc_seconds),
72 | "data": [
73 | {
74 | "b64_json": b64_json
75 | }
76 | ]
77 | }
78 |
79 | return response
80 |
81 | except Exception as e:
82 | raise HTTPException(status_code=500, detail=str(e))
83 |
84 | if __name__ == "__main__":
85 | import uvicorn
86 | import argparse
87 |
88 | parser = argparse.ArgumentParser(description="SDXL-Turbo REST endpoint")
89 |
90 | parser.add_argument(
91 | "--host",
92 | type=str,
93 | help="IP address",
94 | default="0.0.0.0"
95 | )
96 |
97 | parser.add_argument(
98 | "--port",
99 | type=int,
100 | help="Port",
101 | default=8000
102 | )
103 |
104 | parser.add_argument(
105 | "--device",
106 | type=int,
107 | help="Cloud AI device",
108 | default=0
109 | )
110 |
111 | args = parser.parse_args()
112 |
113 | uvicorn.run(app, host=args.host, port=args.port)
114 |
115 |
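A minimal client sketch for the endpoint above (assumes the server is running locally on the default port; the request and response fields mirror the `ImageRequest` model and the response dictionary built in `generate_images`):

```python
import base64
import requests

# Request one 512x512 image and decode the base64 PNG returned by the server
resp = requests.post(
    "http://localhost:8000/v1/images/generations",
    json={"model": "sdxl-turbo", "prompt": "a lighthouse at dawn", "n": 1, "size": "512x512"},
)
b64_png = resp.json()["data"][0]["b64_json"]
with open("generated.png", "wb") as f:
    f.write(base64.b64decode(b64_png))
```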
--------------------------------------------------------------------------------
/CODE-OF-CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, gender identity and expression, level of experience,
9 | nationality, personal appearance, race, religion, or sexual identity and
10 | orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team. All complaints will be reviewed
59 | and investigated and will result in a response that is deemed necessary and
60 | appropriate to the circumstances. The project team is obligated to maintain
61 | confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/stable-diffusion-xl-base-1.0/attention_patch.patch:
--------------------------------------------------------------------------------
1 | diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
2 | index 21eb3a32..1df1b09c 100644
3 | --- a/src/diffusers/models/attention_processor.py
4 | +++ b/src/diffusers/models/attention_processor.py
5 | @@ -200,10 +200,8 @@ class Attention(nn.Module):
6 | # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
7 | # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
8 | # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
9 | - if processor is None:
10 | - processor = (
11 | - AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
12 | - )
13 | + # force to not use FlashAttention
14 | + processor = AttnProcessor()
15 | self.set_processor(processor)
16 |
17 | def set_use_memory_efficient_attention_xformers(
18 | @@ -588,7 +586,9 @@ class Attention(nn.Module):
19 |
20 | if attention_mask is None:
21 | baddbmm_input = torch.empty(
22 | - query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
23 | + query.shape[0], query.shape[1],
24 | + key.shape[2], # key is already transposed
25 | + dtype=query.dtype, device=query.device
26 | )
27 | beta = 0
28 | else:
29 | @@ -598,7 +598,7 @@ class Attention(nn.Module):
30 | attention_scores = torch.baddbmm(
31 | baddbmm_input,
32 | query,
33 | - key.transpose(-1, -2),
34 | + key, # key is already transposed
35 | beta=beta,
36 | alpha=self.scale,
37 | )
38 | @@ -740,8 +740,26 @@ class AttnProcessor:
39 | key = attn.head_to_batch_dim(key)
40 | value = attn.head_to_batch_dim(value)
41 |
42 | - attention_probs = attn.get_attention_scores(query, key, attention_mask)
43 | - hidden_states = torch.bmm(attention_probs, value)
44 | + # pre-transpose the key
45 | + key = key.transpose(-1, -2)
46 | + if query.size(-2) != value.size(-2): # cross-attention, use regular attention
47 | + # QKV done in single block
48 | + attention_probs = attn.get_attention_scores(query, key, attention_mask)
49 | + hidden_states = torch.bmm(attention_probs, value)
50 | + else: # self-attention, use blocked attention
51 | + # QKV done with block-attention (a la FlashAttentionV2)
52 | + print(f"{query.shape = }, {key.shape = }, {value.shape = }")
53 | + query_block_size = 128
54 | + query_seq_len = query.size(-2)
55 | + num_blocks = (query_seq_len + query_block_size - 1) // query_block_size
56 | + for qidx in range(num_blocks):
57 | + query_block = query[:,qidx*query_block_size:(qidx+1)*query_block_size,:]
58 | + attention_probs = attn.get_attention_scores(query_block, key, attention_mask)
59 | + hidden_states_block = torch.bmm(attention_probs, value)
60 | + if qidx == 0:
61 | + hidden_states = hidden_states_block
62 | + else:
63 | + hidden_states = torch.cat((hidden_states, hidden_states_block), -2)
64 | hidden_states = attn.batch_to_head_dim(hidden_states)
65 |
66 | # linear proj
67 |
--------------------------------------------------------------------------------
/models/speech/whisper/runModel.py:
--------------------------------------------------------------------------------
1 | ####################################################################################################
2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear
4 | ####################################################################################################
5 |
6 | import os
7 | from datasets import load_dataset
8 | from transformers import WhisperProcessor
9 | import whisper
10 | import numpy as np
11 | import torch
12 | from audio import AudioSample
13 | import qaic
14 |
15 | model_name = 'base'
16 | aic_path = './whisper_AIC'
17 |
18 | # Select an audio file and read it:
19 | audio_sample = AudioSample()
20 | audio_path = audio_sample.to_file()
21 | audio = whisper.load_audio(audio_path) # Read audio from file
22 | audio_pad = whisper.pad_or_trim(audio) # Padding and trimming
23 | # make log-Mel spectrogram and move to the same device as the model
24 | input_features = whisper.log_mel_spectrogram(audio_pad) # convert to mel spectrogram
25 |
26 | # Load the Whisper processor for parsing results
27 | processor = WhisperProcessor.from_pretrained('openai/whisper-{}'.format(model_name))
28 |
29 | eot = 50257 # end of transcript token
30 | startoftranscript = 50258 # start of transcript token
31 |
32 | decoder_sequence_length=150
33 |
34 | def run_AIC(input_features, device_id=0):
35 | # Load both encoder and decoder models into Cloud AI accelerator memory
36 | # via oversubscription.
37 | # The number of NSP cores required is the maximum of the numbers of cores
38 | # for which encoder and decoder are compiled.
39 | # If encoder is compiled for 4 cores and decoder is compiled for 12 cores,
40 | # then the max usage is 12 cores.
41 | # Since encoder and decoder don't run at the same time, this allows us to
42 | # efficiently utilize the available cores.
43 |
44 | encoder_sess = qaic.Session(
45 | model_path=os.path.join(aic_path, 'whisper-encoder', 'programqpc.bin'),
46 | num_activations=1,
47 | set_size=1,
48 | dev_id=device_id,
49 | oversubscription_name='group1')
50 |
51 | decoder_sess = qaic.Session(
52 | model_path=os.path.join(aic_path, 'whisper-decoder', 'programqpc.bin'),
53 | num_activations=1,
54 | set_size=1,
55 | dev_id=device_id,
56 | oversubscription_name='group1')
57 |
58 | encoder_inputs = {
59 | 'input_features': input_features.numpy().astype(np.float32).reshape(1,80,3000)
60 | }
61 |
62 | audio_features = encoder_sess.run(encoder_inputs)['last_hidden_state']
63 |
64 | next_token = None
65 | tokens = [startoftranscript]
66 | decoder_input_ids = np.zeros((1, decoder_sequence_length), dtype=np.int64)
67 | decoder_input_ids[:,0] = startoftranscript
68 |
69 |     for step in range(decoder_sequence_length):
70 |         if step > 0:
71 |             decoder_input_ids[:,step] = next_token.item()
72 |
73 | decoder_inputs = {
74 | 'input_ids': decoder_input_ids,
75 | 'encoder_hidden_states': audio_features,
76 | }
77 |
78 | logits = decoder_sess.run(decoder_inputs)['logits']
79 |         logits = logits[:,step,:]  # keep only the logits at the current decode position
80 |
81 | next_token = logits.argmax(axis=-1)
82 | tokens.append(next_token.item())
83 |
84 | if next_token == eot: # stop at end-of-transcript token
85 | break
86 |
87 | transcription = processor.batch_decode(tokens, skip_special_tokens=False)
88 | print("result:", transcription)
89 |
90 | if __name__ == '__main__':
91 | run_AIC(input_features)
92 |
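93 | # Usage note (a minimal sketch): this script expects the compiled QPC binaries at
94 | #   ./whisper_AIC/whisper-encoder/programqpc.bin
95 | #   ./whisper_AIC/whisper-decoder/programqpc.bin
96 | # (see aic_path above). With those in place, it can be run directly:
97 | #   python runModel.py
98 | # The encoder runs once per audio clip; the decoder is then invoked autoregressively
99 | # for up to decoder_sequence_length steps or until the end-of-transcript token is produced.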
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/stable-diffusion-3.5-medium/run_config_inference.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ####################################################################################################
4 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
5 | # SPDX-License-Identifier: BSD-3-Clause-Clear
6 | ####################################################################################################
7 |
8 | # model configs
9 | MODEL_PATH="stabilityai/stable-diffusion-3.5-medium"
10 | PROMPT="\"$1\""        # text prompt, passed as the first positional argument
11 | NEG_PROMPT="\"$2\""    # negative prompt, passed as the second positional argument
12 | GUIDANCE=7.0
13 | VAE_TYPE="vae"
14 | IMAGE_SIZE=1024
15 | BLOCK_SIZE=64
16 | BATCH_SIZE=1
17 |
18 | # onnx configs
19 | GENERATE_ONNX=false
20 | ONNX_TEXT_ENCODER=false
21 | ONNX_TRANSFORMER=false
22 | ONNX_VAE=false
23 |
24 | # compile configs
25 | NUM_CORES=16
26 | VAE_MOS=2
27 | VAE_OLS=1
28 | TRANSFORMER_MOS=1
29 | TRANSFORMER_OLS=2
30 | COMPILE_TEXT_ENCODER=false
31 | COMPILE_TRANSFORMER=false
32 | COMPILE_VAE=false
33 |
34 | # inference configs
35 | RUN_ONLY=true
36 | TEXT_ENCODER_3=false
37 | DEVICE=0
38 | DEVICE2=1
39 | NUM_STEPS=28
40 | WARMUP_ITERS=3
41 | REPEAT_ITERS=3
42 |
43 | # mode
44 | TOGETHER=false
45 |
46 | if [ ${GENERATE_ONNX} == true ]
47 | then
48 | GENERATE_ONNX_CMD="--generate-onnx"
49 | else
50 | GENERATE_ONNX_CMD=""
51 | fi
52 |
53 | if [ ${ONNX_TEXT_ENCODER} == true ]
54 | then
55 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder"
56 | else
57 | ONNX_TEXT_ENCODER_CMD=""
58 | fi
59 |
60 | if [ ${ONNX_TRANSFORMER} == true ]
61 | then
62 | ONNX_TRANSFORMER_CMD="--onnx-transformer"
63 | else
64 | ONNX_TRANSFORMER_CMD=""
65 | fi
66 |
67 | if [ ${ONNX_VAE} == true ]
68 | then
69 | ONNX_VAE_CMD="--onnx-vae"
70 | else
71 | ONNX_VAE_CMD=""
72 | fi
73 |
74 | if [ ${COMPILE_TEXT_ENCODER} == true ]
75 | then
76 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder"
77 | else
78 | COMPILE_TEXT_ENCODER_CMD=""
79 | fi
80 |
81 | if [ ${COMPILE_TRANSFORMER} == true ]
82 | then
83 | COMPILE_TRANSFORMER_CMD="--compile-transformer"
84 | else
85 | COMPILE_TRANSFORMER_CMD=""
86 | fi
87 |
88 | if [ ${COMPILE_VAE} == true ]
89 | then
90 | COMPILE_VAE_CMD="--compile-vae"
91 | else
92 | COMPILE_VAE_CMD=""
93 | fi
94 |
95 | if [ ${RUN_ONLY} == true ]
96 | then
97 | RUN_ONLY_CMD="--run-only"
98 | else
99 | RUN_ONLY_CMD=""
100 | fi
101 |
102 | if [ ${TEXT_ENCODER_3} == true ]
103 | then
104 | TEXT_ENCODER_3_CMD="--text-encoder-3"
105 | else
106 | TEXT_ENCODER_3_CMD=""
107 | fi
108 |
109 | if [ ${TOGETHER} == true ]
110 | then
111 | TOGETHER_CMD="--together"
112 | else
113 | TOGETHER_CMD=""
114 | fi
115 |
116 | export HF_HOME="cache"
117 |
118 | rm -f run.sh
119 |
120 | scripts="python main.py \
121 | --model-path $MODEL_PATH \
122 | --prompt $PROMPT \
123 | --neg_prompt $NEG_PROMPT \
124 | --guidance $GUIDANCE \
125 | --vae-type $VAE_TYPE \
126 | --batch-size $BATCH_SIZE \
127 | --image-size $IMAGE_SIZE \
128 | --block-size $BLOCK_SIZE \
129 | --num-cores $NUM_CORES \
130 | --vae-mos $VAE_MOS \
131 | --vae-ols $VAE_OLS \
132 | --transformer-mos $TRANSFORMER_MOS \
133 | --transformer-ols $TRANSFORMER_OLS \
134 | --device-id $DEVICE \
135 | --device-id2 $DEVICE2 \
136 | --num-steps $NUM_STEPS \
137 | --num-warmup-iters $WARMUP_ITERS \
138 | --num-repeat-iters $REPEAT_ITERS \
139 | $ONNX_TEXT_ENCODER_CMD \
140 | $ONNX_TRANSFORMER_CMD \
141 | $ONNX_VAE_CMD \
142 | $COMPILE_TEXT_ENCODER_CMD \
143 | $COMPILE_TRANSFORMER_CMD \
144 | $COMPILE_VAE_CMD \
145 | $GENERATE_ONNX_CMD \
146 | $RUN_ONLY_CMD \
147 | $TEXT_ENCODER_3_CMD \
148 | $TOGETHER_CMD"
149 |
150 | echo $scripts >> run.sh
151 |
152 | bash run.sh
153 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_deepcache/README.md:
--------------------------------------------------------------------------------
1 | ### Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
2 | ### SPDX-License-Identifier: BSD-3-Clause-Clear
3 |
4 | # Instructions to run SDXL on Cloud AI 100 with DeepCache
5 |
6 | The instructions below are to run the [Stable Diffusion XL model](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) with [DeepCache](https://github.com/horseee/DeepCache) on Cloud AI 100.
7 |
8 |
9 | ## Pre-requisites
10 |
11 | Install the moreutils package for the `ts` timestamp tool:
12 | ```
13 | sudo apt update
14 | sudo apt-get install moreutils
15 | ```
16 |
17 | Install Git Large File Storage (LFS) support
18 |
19 | ```
20 | sudo apt update
21 | sudo apt-get install git-lfs
22 | ```
23 |
24 | ## 1. Generate ONNX files and compile them into binaries
25 |
26 | 1. Set up a virtual environment for ONNX generation and compilation
27 | ```
28 | python3.10 -m venv env_onnx
29 | source ./env_onnx/bin/activate
30 | pip install -r requirements.txt
31 | ```
32 |
33 | 2. Set up working directories
34 | ```
35 | mkdir cache
36 | mkdir qpc
37 | mkdir compile_logs
38 | ```
39 |
40 | 3. Install diffusers from source after patching for ONNX file generation
41 | ```
42 | git clone --depth 1 --branch v0.24.0 https://github.com/huggingface/diffusers.git diffusers_onnx
43 | cd diffusers_onnx
44 | git apply --reject --whitespace=fix ../patches/attention_patch.patch
45 | pip install .
46 | cd ..
47 | ```
48 |
49 | 4. Install DeepCache for ONNX file generation (deep UNet)
50 | ```
51 | git clone https://github.com/horseee/DeepCache.git
52 | cd DeepCache
53 | git apply --reject --whitespace=fix ../patches/deepcache_unet.patch
54 | pip install .
55 | cd ..
56 | ```
57 |
58 | 5. Prepare VAE Decoder
59 | ```
60 | export GIT_LFS_SKIP_SMUDGE=1
61 | git clone https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0 cache/stabilityai/stable-diffusion-xl-base-1.0
62 | cd cache/stabilityai/stable-diffusion-xl-base-1.0
63 | git lfs pull -I vae_decoder/model.onnx
64 | rm -rf .git/lfs # optional to save space
65 | cd ../../../
66 | ```
67 |
68 | 6. Generate ONNX files and compile them into binaries
69 | ```
70 | touch run.sh
71 | bash run_config_deep.sh
72 | ```
73 |
74 | 7. Modify the UNet to be the shallow version
75 | ```
76 | sed -i '963s/False/True/' env_onnx/lib/python3.10/site-packages/DeepCache/sdxl/unet_2d_condition.py
77 | ```
78 |
79 | 8. Generate ONNX file and compile shallow UNet for DeepCache
80 | ```
81 | bash run_config_shallow.sh
82 | ```
83 |
84 | ## 2. Run the end-to-end SDXL inference
85 |
86 | 1. Set up a separate virtual environment for running SDXL
87 | ```
88 | python3.10 -m venv env_pipeline
89 | source ./env_pipeline/bin/activate
90 | pip install -r requirements.txt
91 | pip install --force-reinstall /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl
92 | ```
93 |
94 | 2. Re-install diffusers from source for the inference pipeline
95 | ```
96 | git clone --depth 1 --branch v0.24.0 https://github.com/huggingface/diffusers.git diffusers_pipeline
97 | cd diffusers_pipeline
98 | pip install .
99 | cd ..
100 | ```
101 |
102 | 3. Install DeepCache and prepare the pipeline for inference
103 | ```
104 | git clone https://github.com/horseee/DeepCache.git deepcache_pipeline
105 | cd deepcache_pipeline
106 | git apply --reject --whitespace=fix ../patches/deepcache_pipeline.patch
107 | pip install .
108 | cd ..
109 | ```
110 |
111 | 4. Run the SDXL inference, using `sudo` if needed to access the AI 100 devices.
112 | ```
113 | sudo bash run_config_inference.sh $(which python3)
114 | ```
115 | Note: the ```CACHE_INTERVAL``` variable in ```run_config_inference.sh``` sets the caching period, i.e. how many denoising steps reuse the cached deep-UNet features (running only the shallow UNet) before the deep UNet is executed again.
116 |
117 |
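118 | For example, to experiment with a longer caching period without editing the script by hand, the default can be patched in place before running (a minimal sketch; `CACHE_INTERVAL=3` is the value set in `run_config_inference.sh`):
119 | ```
120 | sed -i 's/^CACHE_INTERVAL=3/CACHE_INTERVAL=5/' run_config_inference.sh
121 | sudo bash run_config_inference.sh $(which python3)
122 | ```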
--------------------------------------------------------------------------------
/samples/python/aws_ai100_benchmarking/cv_classifiers/run_cv_classifiers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | ##############################################################################
4 | # @@-COPYRIGHT-START-@@
5 | #
6 | # Copyright (c) 2023, Qualcomm Technologies, Inc. All Rights Reserved.
7 | # Redistribution and use in source and binary forms, with or without
8 | # modification, are permitted provided that the following conditions are met:
9 | #
10 | # 1. Redistributions of source code must retain the above copyright notice,
11 | #    this list of conditions and the following disclaimer.
12 | #
13 | # 2. Redistributions in binary form must reproduce the above copyright notice,
14 | #    this list of conditions and the following disclaimer in the documentation
15 | #    and/or other materials provided with the distribution.
16 | #
17 | # 3. Neither the name of the copyright holder nor the names of its contributors
18 | #    may be used to endorse or promote products derived from this software
19 | #    without specific prior written permission.
20 | #
21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 | # POSSIBILITY OF SUCH DAMAGE.
32 | #
33 | # SPDX-License-Identifier: BSD-3-Clause
34 | #
35 | # @@-COPYRIGHT-END-@@
36 | ##############################################################################
37 | 
38 | mkdir -p ./resnet-152/
39 | echo python run_cv_classifier.py -m resnet-152 -o best-latency "$@"
40 | python run_cv_classifier.py -m resnet-152 -o best-latency "$@" | tee -a ./resnet-152/best-latency.log
41 | echo python run_cv_classifier.py -m resnet-152 -o balanced "$@"
42 | python run_cv_classifier.py -m resnet-152 -o balanced "$@" | tee -a ./resnet-152/balanced.log
43 | echo python run_cv_classifier.py -m resnet-152 -o best-throughput "$@"
44 | python run_cv_classifier.py -m resnet-152 -o best-throughput "$@" | tee -a ./resnet-152/best-throughput.log
45 |
46 | mkdir -p ./resnet-50/
47 | echo python run_cv_classifier.py -m resnet-50 -o best-latency "$@"
48 | python run_cv_classifier.py -m resnet-50 -o best-latency "$@" | tee -a ./resnet-50/best-latency.log
49 | echo python run_cv_classifier.py -m resnet-50 -o balanced "$@"
50 | python run_cv_classifier.py -m resnet-50 -o balanced "$@" | tee -a ./resnet-50/balanced.log
51 | echo python run_cv_classifier.py -m resnet-50 -o best-throughput "$@"
52 | python run_cv_classifier.py -m resnet-50 -o best-throughput "$@" | tee -a ./resnet-50/best-throughput.log
53 |
54 | mkdir -p ./vit-base-patch16-224/
55 | echo python run_cv_classifier.py -m vit-base-patch16-224 -o best-latency "$@"
56 | python run_cv_classifier.py -m vit-base-patch16-224 -o best-latency "$@" | tee -a ./vit-base-patch16-224/best-latency.log
57 | echo python run_cv_classifier.py -m vit-base-patch16-224 -o balanced "$@"
58 | python run_cv_classifier.py -m vit-base-patch16-224 -o balanced "$@" | tee -a ./vit-base-patch16-224/balanced.log
59 | echo python run_cv_classifier.py -m vit-base-patch16-224 -o best-throughput "$@"
60 | python run_cv_classifier.py -m vit-base-patch16-224 -o best-throughput "$@" | tee -a ./vit-base-patch16-224/best-throughput.log
61 |
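62 | # Note: any extra command-line arguments are forwarded unchanged to run_cv_classifier.py
63 | # through "$@" above, e.g. (illustrative only; the accepted flags are defined by run_cv_classifier.py):
64 | #   bash run_cv_classifiers.sh --device 0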
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/stable-diffusion-3.5-medium/run_config_gen.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ####################################################################################################
4 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
5 | # SPDX-License-Identifier: BSD-3-Clause-Clear
6 | ####################################################################################################
7 |
8 | # model configs
9 | MODEL_PATH="stabilityai/stable-diffusion-3.5-medium"
10 | VAE_TYPE="vae"
11 | IMAGE_SIZE=1024
12 | BLOCK_SIZE=64
13 | BATCH_SIZE=1
14 |
15 | # onnx configs
16 | GENERATE_ONNX=true
17 | ONNX_TEXT_ENCODER=true
18 | ONNX_TEXT_ENCODER_3=true
19 | ONNX_TRANSFORMER=true
20 | ONNX_VAE=true
21 |
22 | # compile configs
23 | NUM_CORES=16
24 | VAE_MOS=2
25 | VAE_OLS=1
26 | TRANSFORMER_MOS=1
27 | TRANSFORMER_OLS=2
28 | COMPILE_TEXT_ENCODER=true
29 | COMPILE_TEXT_ENCODER_3=false
30 | COMPILE_TRANSFORMER=true
31 | COMPILE_VAE=true
32 |
33 | # inference configs
34 | RUN_ONLY=false
35 | DEVICE=0
36 | DEVICE2=1
37 | NUM_STEPS=1
38 | WARMUP_ITERS=3
39 | REPEAT_ITERS=1
40 |
41 | # mode
42 | TOGETHER=false
43 |
44 | if [ ${GENERATE_ONNX} == true ]
45 | then
46 | GENERATE_ONNX_CMD="--generate-onnx"
47 | else
48 | GENERATE_ONNX_CMD=""
49 | fi
50 |
51 | if [ ${ONNX_TEXT_ENCODER} == true ]
52 | then
53 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder"
54 | else
55 | ONNX_TEXT_ENCODER_CMD=""
56 | fi
57 |
58 | if [ ${ONNX_TEXT_ENCODER_3} == true ]
59 | then
60 | ONNX_TEXT_ENCODER_3_CMD="--onnx-text-encoder-3"
61 | else
62 | ONNX_TEXT_ENCODER_3_CMD=""
63 | fi
64 |
65 | if [ ${ONNX_TRANSFORMER} == true ]
66 | then
67 | ONNX_TRANSFORMER_CMD="--onnx-transformer"
68 | else
69 | ONNX_TRANSFORMER_CMD=""
70 | fi
71 |
72 | if [ ${ONNX_VAE} == true ]
73 | then
74 | ONNX_VAE_CMD="--onnx-vae"
75 | else
76 | ONNX_VAE_CMD=""
77 | fi
78 |
79 | if [ ${COMPILE_TEXT_ENCODER} == true ]
80 | then
81 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder"
82 | else
83 | COMPILE_TEXT_ENCODER_CMD=""
84 | fi
85 |
86 | if [ ${COMPILE_TEXT_ENCODER_3} == true ]
87 | then
88 | COMPILE_TEXT_ENCODER_3_CMD="--compile-text-encoder-3"
89 | else
90 | COMPILE_TEXT_ENCODER_3_CMD=""
91 | fi
92 |
93 | if [ ${COMPILE_TRANSFORMER} == true ]
94 | then
95 | COMPILE_TRANSFORMER_CMD="--compile-transformer"
96 | else
97 | COMPILE_TRANSFORMER_CMD=""
98 | fi
99 |
100 | if [ ${COMPILE_VAE} == true ]
101 | then
102 | COMPILE_VAE_CMD="--compile-vae"
103 | else
104 | COMPILE_VAE_CMD=""
105 | fi
106 |
107 | if [ ${RUN_ONLY} == true ]
108 | then
109 | RUN_ONLY_CMD="--run-only"
110 | else
111 | RUN_ONLY_CMD=""
112 | fi
113 |
114 | if [ ${TOGETHER} == true ]
115 | then
116 | TOGETHER_CMD="--together"
117 | else
118 | TOGETHER_CMD=""
119 | fi
120 |
121 | export HF_HOME="cache"
122 |
123 | rm -f run.sh
124 |
125 | scripts="python main.py \
126 | --model-path $MODEL_PATH \
127 | --vae-type $VAE_TYPE \
128 | --batch-size $BATCH_SIZE \
129 | --image-size $IMAGE_SIZE \
130 | --block-size $BLOCK_SIZE \
131 | --num-cores $NUM_CORES \
132 | --vae-mos $VAE_MOS \
133 | --vae-ols $VAE_OLS \
134 | --transformer-mos $TRANSFORMER_MOS \
135 | --transformer-ols $TRANSFORMER_OLS \
136 | --device-id $DEVICE \
137 | --device-id2 $DEVICE2 \
138 | --num-steps $NUM_STEPS \
139 | --num-warmup-iters $WARMUP_ITERS \
140 | --num-repeat-iters $REPEAT_ITERS \
141 | $ONNX_TEXT_ENCODER_CMD \
142 | $ONNX_TEXT_ENCODER_3_CMD \
143 | $ONNX_TRANSFORMER_CMD \
144 | $ONNX_VAE_CMD \
145 | $COMPILE_TEXT_ENCODER_CMD \
146 | $COMPILE_TEXT_ENCODER_3_CMD \
147 | $COMPILE_TRANSFORMER_CMD \
148 | $COMPILE_VAE_CMD \
149 | $GENERATE_ONNX_CMD \
150 | $RUN_ONLY_CMD \
151 | $TOGETHER_CMD"
152 |
153 | echo $scripts >> run.sh
154 |
155 | bash run.sh
156 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_deepcache/run_config_deep.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ####################################################################################################
4 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
5 | # SPDX-License-Identifier: BSD-3-Clause-Clear
6 | ####################################################################################################
7 |
8 | # model configs
9 | MODEL_PATH="stabilityai/stable-diffusion-xl-base-1.0"
10 | PROMPT="\"A cinematic shot of a baby racoon wearing an intricate italian priest robe.\""
11 | VAE_TYPE="vae"
12 | UNET_TYPE="deep"
13 | IMAGE_SIZE=1024
14 | BLOCK_SIZE_DEEP=256
15 | BLOCK_SIZE_SHALLOW=128
16 | BATCH_SIZE=1
17 | PRECISION=fp16,fp16,fp16,fp16
18 |
19 | # onnx configs
20 | GENERATE_ONNX=true
21 | ONNX_TEXT_ENCODER=true
22 | ONNX_UNET=true
23 | ONNX_VAE=true
24 |
25 | # compile configs
26 | NUM_CORES=16
27 | VAE_MOS=2
28 | VAE_OLS=1
29 | UNET_MOS_DEEP=2
30 | UNET_OLS_DEEP=1
31 | UNET_MOS_SHALLOW=1
32 | UNET_OLS_SHALLOW=2
33 | COMPILE_TEXT_ENCODER=true
34 | COMPILE_UNET=true
35 | COMPILE_VAE=true
36 |
37 | # inference configs
38 | RUN_ONLY=false
39 | DEVICE=0
40 | DEVICE_2=1
41 | NUM_STEPS=20
42 | WARMUP_ITERS=3
43 | REPEAT_ITERS=3
44 |
45 | # mode
46 | TOGETHER=false
47 |
48 | if [ ${GENERATE_ONNX} == true ]
49 | then
50 | GENERATE_ONNX_CMD="--generate-onnx"
51 | else
52 | GENERATE_ONNX_CMD=""
53 | fi
54 |
55 | if [ ${ONNX_TEXT_ENCODER} == true ]
56 | then
57 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder"
58 | else
59 | ONNX_TEXT_ENCODER_CMD=""
60 | fi
61 |
62 | if [ ${ONNX_UNET} == true ]
63 | then
64 | ONNX_UNET_CMD="--onnx-unet"
65 | else
66 | ONNX_UNET_CMD=""
67 | fi
68 |
69 | if [ ${ONNX_VAE} == true ]
70 | then
71 | ONNX_VAE_CMD="--onnx-vae"
72 | else
73 | ONNX_VAE_CMD=""
74 | fi
75 |
76 | if [ ${COMPILE_TEXT_ENCODER} == true ]
77 | then
78 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder"
79 | else
80 | COMPILE_TEXT_ENCODER_CMD=""
81 | fi
82 |
83 | if [ ${COMPILE_UNET} == true ]
84 | then
85 | COMPILE_UNET_CMD="--compile-unet"
86 | else
87 | COMPILE_UNET_CMD=""
88 | fi
89 |
90 | if [ ${COMPILE_VAE} == true ]
91 | then
92 | COMPILE_VAE_CMD="--compile-vae"
93 | else
94 | COMPILE_VAE_CMD=""
95 | fi
96 |
97 | if [ ${RUN_ONLY} == true ]
98 | then
99 | RUN_ONLY_CMD="--run-only"
100 | else
101 | RUN_ONLY_CMD=""
102 | fi
103 |
104 | if [ ${TOGETHER} == true ]
105 | then
106 | TOGETHER_CMD="--together"
107 | else
108 | TOGETHER_CMD=""
109 | fi
110 |
111 | export HF_HOME="cache"
112 | sed -i 's/query_block_size = 128/query_block_size = 256/g' ./env_onnx/lib/python3.10/site-packages/diffusers/models/attention_processor.py
113 |
114 | rm -f run.sh
115 |
116 | scripts="python main.py \
117 | --model-path $MODEL_PATH \
118 | --prompt $PROMPT \
119 | --unet-type $UNET_TYPE \
120 | --vae-type $VAE_TYPE \
121 | --batch-size $BATCH_SIZE \
122 | --image-size $IMAGE_SIZE \
123 | --block-size-deep $BLOCK_SIZE_DEEP \
124 | --block-size-shallow $BLOCK_SIZE_SHALLOW \
125 | --num-cores $NUM_CORES \
126 | --vae-mos $VAE_MOS \
127 | --vae-ols $VAE_OLS \
128 | --unet-mos-deep $UNET_MOS_DEEP \
129 | --unet-ols-deep $UNET_OLS_DEEP \
130 | --unet-mos-shallow $UNET_MOS_SHALLOW \
131 | --unet-ols-shallow $UNET_OLS_SHALLOW \
132 | --device-id $DEVICE \
133 | --device-id-2 $DEVICE_2 \
134 | --num-steps $NUM_STEPS \
135 | --num-warmup-iters $WARMUP_ITERS \
136 | --num-repeat-iters $REPEAT_ITERS \
137 | --precision $PRECISION \
138 | $ONNX_TEXT_ENCODER_CMD \
139 | $ONNX_UNET_CMD \
140 | $ONNX_VAE_CMD \
141 | $COMPILE_TEXT_ENCODER_CMD \
142 | $COMPILE_UNET_CMD \
143 | $COMPILE_VAE_CMD \
144 | $GENERATE_ONNX_CMD \
145 | $RUN_ONLY_CMD \
146 | $TOGETHER_CMD"
147 |
148 | echo $scripts >> run.sh
149 |
150 | bash run.sh
151 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_deepcache/run_config_shallow.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ####################################################################################################
4 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
5 | # SPDX-License-Identifier: BSD-3-Clause-Clear
6 | ####################################################################################################
7 |
8 | # model configs
9 | MODEL_PATH="stabilityai/stable-diffusion-xl-base-1.0"
10 | PROMPT="\"A cinematic shot of a baby racoon wearing an intricate italian priest robe.\""
11 | VAE_TYPE="vae"
12 | UNET_TYPE="shallow"
13 | IMAGE_SIZE=1024
14 | BLOCK_SIZE_DEEP=256
15 | BLOCK_SIZE_SHALLOW=128
16 | BATCH_SIZE=1
17 | PRECISION=fp16,fp16,fp16,fp16
18 |
19 | # onnx configs
20 | GENERATE_ONNX=true
21 | ONNX_TEXT_ENCODER=false
22 | ONNX_UNET=true
23 | ONNX_VAE=false
24 |
25 | # compile configs
26 | NUM_CORES=16
27 | VAE_MOS=2
28 | VAE_OLS=1
29 | UNET_MOS_DEEP=2
30 | UNET_OLS_DEEP=1
31 | UNET_MOS_SHALLOW=1
32 | UNET_OLS_SHALLOW=2
33 | COMPILE_TEXT_ENCODER=false
34 | COMPILE_UNET=true
35 | COMPILE_VAE=false
36 |
37 | # inference configs
38 | RUN_ONLY=false
39 | DEVICE=0
40 | DEVICE_2=1
41 | NUM_STEPS=20
42 | WARMUP_ITERS=3
43 | REPEAT_ITERS=3
44 |
45 | # mode
46 | TOGETHER=false
47 |
48 | if [ ${GENERATE_ONNX} == true ]
49 | then
50 | GENERATE_ONNX_CMD="--generate-onnx"
51 | else
52 | GENERATE_ONNX_CMD=""
53 | fi
54 |
55 | if [ ${ONNX_TEXT_ENCODER} == true ]
56 | then
57 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder"
58 | else
59 | ONNX_TEXT_ENCODER_CMD=""
60 | fi
61 |
62 | if [ ${ONNX_UNET} == true ]
63 | then
64 | ONNX_UNET_CMD="--onnx-unet"
65 | else
66 | ONNX_UNET_CMD=""
67 | fi
68 |
69 | if [ ${ONNX_VAE} == true ]
70 | then
71 | ONNX_VAE_CMD="--onnx-vae"
72 | else
73 | ONNX_VAE_CMD=""
74 | fi
75 |
76 | if [ ${COMPILE_TEXT_ENCODER} == true ]
77 | then
78 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder"
79 | else
80 | COMPILE_TEXT_ENCODER_CMD=""
81 | fi
82 |
83 | if [ ${COMPILE_UNET} == true ]
84 | then
85 | COMPILE_UNET_CMD="--compile-unet"
86 | else
87 | COMPILE_UNET_CMD=""
88 | fi
89 |
90 | if [ ${COMPILE_VAE} == true ]
91 | then
92 | COMPILE_VAE_CMD="--compile-vae"
93 | else
94 | COMPILE_VAE_CMD=""
95 | fi
96 |
97 | if [ ${RUN_ONLY} == true ]
98 | then
99 | RUN_ONLY_CMD="--run-only"
100 | else
101 | RUN_ONLY_CMD=""
102 | fi
103 |
104 | if [ ${TOGETHER} == true ]
105 | then
106 | TOGETHER_CMD="--together"
107 | else
108 | TOGETHER_CMD=""
109 | fi
110 |
111 | export HF_HOME="cache"
112 | sed -i 's/query_block_size = 256/query_block_size = 128/g' ./env_onnx/lib/python3.10/site-packages/diffusers/models/attention_processor.py
113 |
114 | rm -f run.sh
115 |
116 | scripts="python main.py \
117 | --model-path $MODEL_PATH \
118 | --prompt $PROMPT \
119 | --unet-type $UNET_TYPE \
120 | --vae-type $VAE_TYPE \
121 | --batch-size $BATCH_SIZE \
122 | --image-size $IMAGE_SIZE \
123 | --block-size-deep $BLOCK_SIZE_DEEP \
124 | --block-size-shallow $BLOCK_SIZE_SHALLOW \
125 | --num-cores $NUM_CORES \
126 | --vae-mos $VAE_MOS \
127 | --vae-ols $VAE_OLS \
128 | --unet-mos-deep $UNET_MOS_DEEP \
129 | --unet-ols-deep $UNET_OLS_DEEP \
130 | --unet-mos-shallow $UNET_MOS_SHALLOW \
131 | --unet-ols-shallow $UNET_OLS_SHALLOW \
132 | --device-id $DEVICE \
133 | --device-id-2 $DEVICE_2 \
134 | --num-steps $NUM_STEPS \
135 | --num-warmup-iters $WARMUP_ITERS \
136 | --num-repeat-iters $REPEAT_ITERS \
137 | --precision $PRECISION \
138 | $ONNX_TEXT_ENCODER_CMD \
139 | $ONNX_UNET_CMD \
140 | $ONNX_VAE_CMD \
141 | $COMPILE_TEXT_ENCODER_CMD \
142 | $COMPILE_UNET_CMD \
143 | $COMPILE_VAE_CMD \
144 | $GENERATE_ONNX_CMD \
145 | $RUN_ONLY_CMD \
146 | $TOGETHER_CMD"
147 |
148 | echo $scripts >> run.sh
149 |
150 | bash run.sh
151 |
--------------------------------------------------------------------------------
/models/language_processing/encoder/server.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear
3 |
4 | from contextlib import asynccontextmanager
5 | from fastapi import FastAPI, HTTPException
6 | from typing import Optional, List, Union
7 | from pydantic import BaseModel
8 | import argparse
9 |
10 | from model import QAicEmbeddingModel
11 |
12 | @asynccontextmanager
13 | async def lifespan(app: FastAPI):
14 | # Code to run before the application starts
15 | print("Application startup")
16 |
17 | app.model = QAicEmbeddingModel(model_name=args.model_name, qpc_path=args.qpc_path, device=args.device)
18 |
19 | yield
20 |
21 | # Code to run when the application shuts down
22 | print("Application shutdown")
23 |
24 | app = FastAPI(lifespan=lifespan)
25 |
26 | @app.get("/v1/models")
27 | async def get_models():
28 | #print('get_models')
29 | try:
30 | response = {
31 | "object": "list",
32 | "data": [
33 | {
34 | "id": app.model.name,
35 | "object": "model",
36 | "created": 1746296172,
37 | "owned_by": "system",
38 | "max_model_len": 4096
39 | }
40 | ],
41 | }
42 |
43 | return response
44 | except Exception as e:
45 | print(str(e))
46 | raise HTTPException(status_code=500, detail=str(e))
47 |
48 | class EmbeddingsRequest(BaseModel):
49 | model: Optional[str] = "bge-large-en-v1.5"
50 | input: Union[str, List[str]]
51 | encoding_format: Optional[str] = 'float'
52 | user: Optional[str] = None
53 |
54 | @app.post("/v1/embeddings")
55 | async def embeddings(request: EmbeddingsRequest):
56 | try:
57 | response = {'object': 'list', 'data': []}
58 |
59 | inputs = request.input
60 | if isinstance(inputs, str):
61 | inputs = [inputs]
62 |
63 |         for idx, text in enumerate(inputs):
64 |             token_embedding, sentence_embeddings = app.model.generate(text)
65 |
66 | response['data'].append(
67 | {
68 | 'object': 'embedding',
69 | 'embedding': sentence_embeddings.reshape(-1).tolist(),
70 | 'index': idx
71 | }
72 | )
73 | #print(response)
74 | return response
75 | except Exception as e:
76 | print(str(e))
77 | raise HTTPException(status_code=500, detail=str(e))
78 |
79 | if __name__ == "__main__":
80 | import uvicorn
81 |
82 | parser = argparse.ArgumentParser(description="Embedding model endpoint")
83 |
84 | parser.add_argument(
85 | "--host",
86 | type=str,
87 | help="IP address",
88 | default="0.0.0.0"
89 | )
90 |
91 | parser.add_argument(
92 | "--port",
93 | type=int,
94 | help="Port",
95 | default=8000
96 | )
97 |
98 | parser.add_argument(
99 | "--hf_token",
100 | type=str,
101 | help="Hugging Face auth token",
102 | default=None
103 | )
104 |
105 | parser.add_argument(
106 | "--model_name",
107 | type=str,
108 | help="Hugging Face model path",
109 | default='BAAI/bge-large-en-v1.5'
110 | )
111 |
112 | parser.add_argument(
113 | "--qpc_path",
114 | type=str,
115 | help="QPC model binary path",
116 | default='./models/BAAI/bge-large-en-v1.5/compiled-bin-fp16-B1-C4-A3-OLS2-MOS1-best-throughput'
117 | )
118 |
119 | parser.add_argument(
120 | "--device",
121 | type=int,
122 | help="Cloud AI accelerator device ID",
123 | default=0
124 | )
125 |
126 | args = parser.parse_args()
127 |
128 | uvicorn.run(app, host=args.host, port=args.port)
129 |
130 |
131 |
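132 | # Example request (a minimal sketch, assuming the server is running with the defaults
133 | # above, i.e. host 0.0.0.0 and port 8000):
134 | #
135 | #   curl http://localhost:8000/v1/embeddings \
136 | #     -H 'Content-Type: application/json' \
137 | #     -d '{"model": "bge-large-en-v1.5", "input": "Qualcomm Cloud AI 100 embedding test"}'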
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_turbo/patches/attention_patch.patch:
--------------------------------------------------------------------------------
1 | diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
2 | index 21eb3a3..4f8d68c 100644
3 | --- a/src/diffusers/models/attention_processor.py
4 | +++ b/src/diffusers/models/attention_processor.py
5 | @@ -11,6 +11,10 @@
6 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7 | # See the License for the specific language governing permissions and
8 | # limitations under the License.
9 | +#
10 | +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
11 | +# SPDX-License-Identifier: BSD-3-Clause-Clear
12 | +# Not a Contribution
13 | from importlib import import_module
14 | from typing import Callable, Optional, Union
15 |
16 | @@ -200,10 +204,8 @@ class Attention(nn.Module):
17 | # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
18 | # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
19 | # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
20 | - if processor is None:
21 | - processor = (
22 | - AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
23 | - )
24 | + # force to not use FlashAttention
25 | + processor = AttnProcessor()
26 | self.set_processor(processor)
27 |
28 | def set_use_memory_efficient_attention_xformers(
29 | @@ -588,7 +590,9 @@ class Attention(nn.Module):
30 |
31 | if attention_mask is None:
32 | baddbmm_input = torch.empty(
33 | - query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
34 | + query.shape[0], query.shape[1],
35 | + key.shape[2], # key is already transposed
36 | + dtype=query.dtype, device=query.device
37 | )
38 | beta = 0
39 | else:
40 | @@ -598,7 +602,7 @@ class Attention(nn.Module):
41 | attention_scores = torch.baddbmm(
42 | baddbmm_input,
43 | query,
44 | - key.transpose(-1, -2),
45 | + key, # key is already transposed
46 | beta=beta,
47 | alpha=self.scale,
48 | )
49 | @@ -740,8 +744,26 @@ class AttnProcessor:
50 | key = attn.head_to_batch_dim(key)
51 | value = attn.head_to_batch_dim(value)
52 |
53 | - attention_probs = attn.get_attention_scores(query, key, attention_mask)
54 | - hidden_states = torch.bmm(attention_probs, value)
55 | + # pre-transpose the key
56 | + key = key.transpose(-1, -2)
57 | + if query.size(-2) != value.size(-2): # cross-attention, use regular attention
58 | + # QKV done in single block
59 | + attention_probs = attn.get_attention_scores(query, key, attention_mask)
60 | + hidden_states = torch.bmm(attention_probs, value)
61 | + else: # self-attention, use blocked attention
62 | + # QKV done with block-attention (a la FlashAttentionV2)
63 | + print(f"{query.shape = }, {key.shape = }, {value.shape = }")
64 | + query_block_size = 256
65 | + query_seq_len = query.size(-2)
66 | + num_blocks = (query_seq_len + query_block_size - 1) // query_block_size
67 | + for qidx in range(num_blocks):
68 | + query_block = query[:,qidx*query_block_size:(qidx+1)*query_block_size,:]
69 | + attention_probs = attn.get_attention_scores(query_block, key, attention_mask)
70 | + hidden_states_block = torch.bmm(attention_probs, value)
71 | + if qidx == 0:
72 | + hidden_states = hidden_states_block
73 | + else:
74 | + hidden_states = torch.cat((hidden_states, hidden_states_block), -2)
75 | hidden_states = attn.batch_to_head_dim(hidden_states)
76 |
77 | # linear proj
78 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_deepcache/patches/attention_patch.patch:
--------------------------------------------------------------------------------
1 | diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
2 | index 21eb3a3..d43b51e 100644
3 | --- a/src/diffusers/models/attention_processor.py
4 | +++ b/src/diffusers/models/attention_processor.py
5 | @@ -11,6 +11,10 @@
6 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7 | # See the License for the specific language governing permissions and
8 | # limitations under the License.
9 | +#
10 | +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
11 | +# SPDX-License-Identifier: BSD-3-Clause-Clear
12 | +# Not a Contribution
13 | from importlib import import_module
14 | from typing import Callable, Optional, Union
15 |
16 | @@ -200,10 +204,8 @@ class Attention(nn.Module):
17 | # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
18 | # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
19 | # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
20 | - if processor is None:
21 | - processor = (
22 | - AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
23 | - )
24 | + # force to not use FlashAttention
25 | + processor = AttnProcessor()
26 | self.set_processor(processor)
27 |
28 | def set_use_memory_efficient_attention_xformers(
29 | @@ -588,7 +590,9 @@ class Attention(nn.Module):
30 |
31 | if attention_mask is None:
32 | baddbmm_input = torch.empty(
33 | - query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
34 | + query.shape[0], query.shape[1],
35 | + key.shape[2], # key is already transposed
36 | + dtype=query.dtype, device=query.device
37 | )
38 | beta = 0
39 | else:
40 | @@ -598,7 +602,7 @@ class Attention(nn.Module):
41 | attention_scores = torch.baddbmm(
42 | baddbmm_input,
43 | query,
44 | - key.transpose(-1, -2),
45 | + key, # key is already transposed
46 | beta=beta,
47 | alpha=self.scale,
48 | )
49 | @@ -740,8 +744,26 @@ class AttnProcessor:
50 | key = attn.head_to_batch_dim(key)
51 | value = attn.head_to_batch_dim(value)
52 |
53 | - attention_probs = attn.get_attention_scores(query, key, attention_mask)
54 | - hidden_states = torch.bmm(attention_probs, value)
55 | + # pre-transpose the key
56 | + key = key.transpose(-1, -2)
57 | + if query.size(-2) != value.size(-2): # cross-attention, use regular attention
58 | + # QKV done in single block
59 | + attention_probs = attn.get_attention_scores(query, key, attention_mask)
60 | + hidden_states = torch.bmm(attention_probs, value)
61 | + else: # self-attention, use blocked attention
62 | + # QKV done with block-attention (a la FlashAttentionV2)
63 | + print(f"{query.shape = }, {key.shape = }, {value.shape = }")
64 | + query_block_size = 128
65 | + query_seq_len = query.size(-2)
66 | + num_blocks = (query_seq_len + query_block_size - 1) // query_block_size
67 | + for qidx in range(num_blocks):
68 | + query_block = query[:,qidx*query_block_size:(qidx+1)*query_block_size,:]
69 | + attention_probs = attn.get_attention_scores(query_block, key, attention_mask)
70 | + hidden_states_block = torch.bmm(attention_probs, value)
71 | + if qidx == 0:
72 | + hidden_states = hidden_states_block
73 | + else:
74 | + hidden_states = torch.cat((hidden_states, hidden_states_block), -2)
75 | hidden_states = attn.batch_to_head_dim(hidden_states)
76 |
77 | # linear proj
78 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_deepcache/run_config_inference.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ####################################################################################################
4 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
5 | # SPDX-License-Identifier: BSD-3-Clause-Clear
6 | ####################################################################################################
7 |
8 |
9 | PYTHON=$1
10 | echo $PYTHON
11 |
12 | # model configs
13 | MODEL_PATH="stabilityai/stable-diffusion-xl-base-1.0"
14 | PROMPT="\"A cinematic shot of a baby racoon wearing an intricate italian priest robe.\""
15 | USE_LATENTS="\"\""
16 | NEGATIVE_PROMPT="\"Normal quality, low quality, worst quality, low res, blurry.\""
17 | VAE_TYPE="vae"
18 | UNET_TYPE="deep"
19 | IMAGE_SIZE=1024
20 | BLOCK_SIZE_DEEP=256
21 | BLOCK_SIZE_SHALLOW=128
22 | BATCH_SIZE=1
23 | PRECISION=fp16,fp16,fp16,fp16
24 |
25 | # onnx configs
26 | GENERATE_ONNX=false
27 | ONNX_TEXT_ENCODER=true
28 | ONNX_UNET=true
29 | ONNX_VAE=true
30 |
31 | # compile configs
32 | NUM_CORES=16
33 | VAE_MOS=2
34 | VAE_OLS=1
35 | UNET_MOS_DEEP=2
36 | UNET_OLS_DEEP=1
37 | UNET_MOS_SHALLOW=1
38 | UNET_OLS_SHALLOW=2
39 | COMPILE_TEXT_ENCODER=true
40 | COMPILE_UNET=true
41 | COMPILE_VAE=true
42 |
43 | # inference configs
44 | RUN_ONLY=true
45 | DEVICE=0
46 | DEVICE_2=1
47 | NUM_STEPS=20
48 | WARMUP_ITERS=3
49 | REPEAT_ITERS=3
50 | CACHE_INTERVAL=3
51 |
52 | # mode
53 | TOGETHER=false
54 |
55 | if [ ${GENERATE_ONNX} == true ]
56 | then
57 | GENERATE_ONNX_CMD="--generate-onnx"
58 | else
59 | GENERATE_ONNX_CMD=""
60 | fi
61 |
62 | if [ ${ONNX_TEXT_ENCODER} == true ]
63 | then
64 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder"
65 | else
66 | ONNX_TEXT_ENCODER_CMD=""
67 | fi
68 |
69 | if [ ${ONNX_UNET} == true ]
70 | then
71 | ONNX_UNET_CMD="--onnx-unet"
72 | else
73 | ONNX_UNET_CMD=""
74 | fi
75 |
76 | if [ ${ONNX_VAE} == true ]
77 | then
78 | ONNX_VAE_CMD="--onnx-vae"
79 | else
80 | ONNX_VAE_CMD=""
81 | fi
82 |
83 | if [ ${COMPILE_TEXT_ENCODER} == true ]
84 | then
85 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder"
86 | else
87 | COMPILE_TEXT_ENCODER_CMD=""
88 | fi
89 |
90 | if [ ${COMPILE_UNET} == true ]
91 | then
92 | COMPILE_UNET_CMD="--compile-unet"
93 | else
94 | COMPILE_UNET_CMD=""
95 | fi
96 |
97 | if [ ${COMPILE_VAE} == true ]
98 | then
99 | COMPILE_VAE_CMD="--compile-vae"
100 | else
101 | COMPILE_VAE_CMD=""
102 | fi
103 |
104 | if [ ${RUN_ONLY} == true ]
105 | then
106 | RUN_ONLY_CMD="--run-only"
107 | else
108 | RUN_ONLY_CMD=""
109 | fi
110 |
111 | if [ ${TOGETHER} == true ]
112 | then
113 | TOGETHER_CMD="--together"
114 | else
115 | TOGETHER_CMD=""
116 | fi
117 |
118 | export HF_HOME="cache"
119 | export TQDM_DISABLE=1
120 |
121 | rm -f run.sh
122 |
123 | scripts="$PYTHON main.py \
124 | --model-path $MODEL_PATH \
125 | --prompt $PROMPT \
126 | --negative-prompt $NEGATIVE_PROMPT \
127 | --use-latents $USE_LATENTS \
128 | --unet-type $UNET_TYPE \
129 | --vae-type $VAE_TYPE \
130 | --batch-size $BATCH_SIZE \
131 | --image-size $IMAGE_SIZE \
132 | --block-size-deep $BLOCK_SIZE_DEEP \
133 | --block-size-shallow $BLOCK_SIZE_SHALLOW \
134 | --num-cores $NUM_CORES \
135 | --vae-mos $VAE_MOS \
136 | --vae-ols $VAE_OLS \
137 | --unet-mos-deep $UNET_MOS_DEEP \
138 | --unet-ols-deep $UNET_OLS_DEEP \
139 | --unet-mos-shallow $UNET_MOS_SHALLOW \
140 | --unet-ols-shallow $UNET_OLS_SHALLOW \
141 | --device-id $DEVICE \
142 | --device-id-2 $DEVICE_2 \
143 | --precision $PRECISION \
144 | --num-steps $NUM_STEPS \
145 | --num-warmup-iters $WARMUP_ITERS \
146 | --num-repeat-iters $REPEAT_ITERS \
147 | --cache-interval $CACHE_INTERVAL \
148 | $ONNX_TEXT_ENCODER_CMD \
149 | $ONNX_UNET_CMD \
150 | $ONNX_VAE_CMD \
151 | $COMPILE_TEXT_ENCODER_CMD \
152 | $COMPILE_UNET_CMD \
153 | $COMPILE_VAE_CMD \
154 | $GENERATE_ONNX_CMD \
155 | $RUN_ONLY_CMD \
156 | $TOGETHER_CMD"
157 |
158 | echo $scripts >> run.sh
159 |
160 | bash run.sh
161 |
--------------------------------------------------------------------------------
/samples/python/aws_ai100_benchmarking/yolo_models/README.md:
--------------------------------------------------------------------------------
1 | ## Description
2 | ---
3 |
4 | Download the yolov4, yolov5, and yolov7 models, prepare them for the Qualcomm AIC100, compile for the best-throughput, best-latency, or balanced objective with fp16 precision, run each model on a randomly generated sample, and obtain the benchmarking results and output values.
5 |
6 | ## Source of the models
7 | ---
8 | The models are downloaded from https://github.com/ultralytics/yolov5. This script has been tested with the following models:
9 | * yolov4
10 | * yolov5s
11 | * yolov5m
12 | * yolov5l
13 | * yolov5x
14 | * yolov7-e6e
15 |
16 |
17 | ## Virtual environment
18 | ---
19 | For a quick environment setup:
20 |
21 | ```commandline
22 | source /opt/qti-aic/dev/python/qaic-env/bin/activate
23 | ```
24 |
25 | ## Framework and version
26 | ---
27 | ```commandline
28 | pip3 install torch==1.13.0 onnx==1.12.0 onnxruntime==1.15.0 torchvision==0.14.0 transformers==4.29.2 pandas==2.0.2 urllib3==1.26.6
29 | pip3 install ultralytics seaborn nvidia-pyindex onnx-graphsurgeon
30 |
31 | ```
32 | ## Syntax
33 | ---
34 | Copy run_yolo_model.py and lut_yolo_models.csv to a working directory. Pick a MODEL_NAME from the list above, and type:
35 |
36 | ```commandline
37 |
38 | usage: run_yolo_model.py [-h] --model-name {yolov4,yolov5s,yolov5m,yolov5l,yolov5x,yolov7-e6e}
39 | [--objective {best-latency,best-throughput,balanced}]
40 | [--opset OPSET]
41 | [--batch-size BATCH_SIZE]
42 | [--image-size IMAGE_SIZE]
43 | [--cores {1,2,3,4,5,6,7,8,9,10,11,12,13,14}]
44 | [--instances {1,2,3,4,5,6,7,8,9,10,11,12,13,14}]
45 | [--ols {1,2,3,4,5,6,7,8,9,10,11,12,13,14}]
46 | [--mos MOS]
47 | [--set-size {1,2,3,4,5,6,7,8,9,10}]
48 | [--extra EXTRA]
49 | [--time TIME]
50 | [--device {0,1,2,3,4,5,6,7}]
51 | [--run-only]
52 |
53 |
54 |
55 | Download, Compile, and Run YOLO models on randomly generated inputs
56 |
57 |
58 | optional arguments:
59 | -h, --help show this help message and exit
60 | --model-name, -m {yolov4,yolov5s,yolov5m,yolov5l,yolov5x,yolov7-e6e}
61 | Model name to download.
62 | --objective, -o {best-latency,best-throughput,balanced}
63 | Running for best-latency, best-throughput, or balanced
64 | --opset OPSET ONNX opset. Default <12>
65 | --batch-size, -b BATCH_SIZE
66 | Sample input batch size. Default <1>.
67 | --image-size, -s IMAGE_SIZE
68 | Sample input image width/height. Default <640>.
69 | --cores, -c {1,2,3,4,5,6,7,8,9,10,11,12,13,14}
70 | Number of AIC100 cores to compile the model for. Default <2>
71 | --instances, -i {1,2,3,4,5,6,7,8,9,10,11,12,13,14}
72 | Number of model instances to run on AIC100. Default <7>
73 | --ols {1,2,3,4,5,6,7,8,9,10,11,12,13,14}
74 | Overlap split factor. Default <1>
75 | --mos MOS Maximum output channel split. Default <1>
76 | --set-size {1,2,3,4,5,6,7,8,9,10}
77 | Set size. Default <10>
78 | --extra EXTRA Extra compilation arguments.
79 | --time TIME Duration (in seconds) for which to submit inferences. Default <20>
80 | --device, -d {0,1,2,3,4,5,6,7}
81 | AIC100 device ID. Default <0>
82 | --run-only, -r Performs the inference only, without re-exporting and re-compiling the model
83 |
84 |
85 | ```
86 | For example:
87 | ```commandline
88 | python run_yolo_model.py -m yolov5s -o best-throughput
89 | ```
90 | or
91 | ```commandline
92 | python run_yolo_model.py -m yolov5m -o balanced
93 | ```
94 | or
95 |
96 | ```commandline
97 | python run_yolo_model.py -m yolov5x -o best-throughput
98 | ```
99 |
100 | Unless specified by the user, the hardware configuration is taken from the corresponding row of lut_yolo_models.csv; if the MODEL_NAME has no entry in lut_yolo_models.csv, default values are used.
101 |
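102 | The hardware settings can also be given explicitly on the command line, overriding the LUT. For example, to compile yolov5s for 4 cores with 2 instances on device 1 (an illustrative combination rather than a tuned recommendation):
103 | 
104 | ```commandline
105 | python run_yolo_model.py -m yolov5s -o balanced -c 4 -i 2 -d 1
106 | ```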
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_turbo/README.md:
--------------------------------------------------------------------------------
1 | # Instructions to run SDXL-Turbo on Cloud AI 100
2 |
3 | The instructions below are to run the [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) model on Cloud AI 100. Compile time parameters may need to be adjusted for different cards and different SDKs.
4 |
5 | ## Pre-requisites
6 |
7 | Use the [SDK 1.19.8.0](https://github.com/quic/cloud-ai-containers/pkgs/container/cloud_ai_inference_ubuntu22/414822849?tag=1.19.8.0) container to compile the sdxl-turbo models.
8 |
9 | ```
10 | sudo docker run \
11 | -it \
12 | --workdir /cloud-ai-sdk \
13 | --entrypoint /bin/bash \
14 | --network=host \
15 | --mount type=bind,source=,target=/cloud-ai-sdk \
16 | --device=/dev/accel/accel0 \
17 | --device=/dev/accel/accel1 \
18 | --device=/dev/accel/accel2 \
19 | --device=/dev/accel/accel3 \
20 | ghcr.io/quic/cloud_ai_inference_ubuntu22:1.19.8.0
21 |
22 | cd models/multimodal/text_to_image/sdxl_turbo
23 | ```
24 |
25 | Install the moreutils package for the `ts` timestamp tool:
26 | ```
27 | sudo apt update
28 | sudo apt-get install moreutils
29 | ```
30 |
31 | Install Git Large File Storage (LFS) support
32 |
33 | ```
34 | sudo apt update
35 | sudo apt-get install git-lfs
36 | ```
37 |
38 | ## 1. Generate ONNX files and compile them into binaries
39 |
40 | 1. Set up a virtual environment for ONNX generation and compilation
41 | ```
42 | python3.10 -m venv env_onnx
43 | source ./env_onnx/bin/activate
44 | pip install -r requirements.txt
45 | ```
46 |
47 | 2. Create a folder for caching Hugging Face model downloads, and export the environment variable HF_HOME
48 | ```
49 | mkdir cache
50 | mkdir compile_logs
51 | mkdir qpc
52 | touch run.sh
53 | export HF_HOME=${PWD}/cache
54 | ```
55 |
56 | 3. Install diffusers from source after patching for ONNX file generation
57 | ```
58 | git clone --depth 1 --branch v0.24.0 https://github.com/huggingface/diffusers.git diffusers-onnx
59 | cd diffusers-onnx
60 | git apply --reject --whitespace=fix ../patches/attention_patch.patch
61 | pip install .
62 | cd ..
63 | ```
64 |
65 | 4. Prepare VAE Decoder
66 | ```
67 | export GIT_LFS_SKIP_SMUDGE=1
68 | git clone https://huggingface.co/stabilityai/sdxl-turbo cache/stabilityai/sdxl_turbo
69 | cd cache/stabilityai/sdxl_turbo
70 | git lfs pull -I vae_decoder/model.onnx
71 | rm -rf .git/lfs # optional to save space
72 | cd ../../../
73 | ```
74 |
75 | 5. Generate ONNX files and compile them into binaries
76 | ```
77 | bash run_config_gen.sh
78 | ```
79 |
80 | ## 2. Run the end-to-end SDXL-Turbo inference
81 |
82 | 1. Set up a separate virtual environment for running SDXL Turbo
83 | ```
84 | python3.10 -m venv env_pipeline
85 | source ./env_pipeline/bin/activate
86 | pip install -r requirements.txt
87 | pip install --force-reinstall /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl
88 | ```
89 |
90 | 2. Re-install diffusers from source after patching the SDXL Turbo pipeline for inference
91 | ```
92 | git clone --depth 1 --branch v0.24.0 https://github.com/huggingface/diffusers.git diffusers-pipeline
93 | cd diffusers-pipeline
94 | git apply --reject --whitespace=fix ../patches/pipeline_patch_separate.patch
95 | pip install .
96 | cd ..
97 | ```
98 |
99 | 3. Run the SDXL-Turbo inference, using `sudo` if needed to access the AI 100 devices.
100 | ```
101 | sudo bash run_config_inference.sh $(which python3)
102 | ```
103 |
104 | ## 3. Run an OpenAI-compatible REST endpoint
105 |
106 | ```
107 | source ./env_pipeline/bin/activate
108 | python3 server.py
109 | ```
110 |
111 | Test the endpoint:
112 |
113 | ```
114 | curl http://localhost:8000/v1/images/generations \
115 | -H 'Content-Type: application/json' \
116 | -H 'Authorization: Bearer test-key' \
117 | -d '{
118 | "model": "sdxl-turbo",
119 | "prompt": "photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece",
120 | "n": 1,
121 | "size": "512x512",
122 | "response_format": "b64_json"
123 | }'
124 | ```
125 |
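126 | The request above asks for a base64-encoded image (`"response_format": "b64_json"`). Assuming the endpoint follows the usual OpenAI images response shape with a `data[0].b64_json` field (an assumption, not verified against this server), the curl output can be saved with `-o response.json` and decoded to a PNG:
127 | 
128 | ```
129 | python3 -c "import json, base64; open('out.png', 'wb').write(base64.b64decode(json.load(open('response.json'))['data'][0]['b64_json']))"
130 | ```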
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/stable-diffusion-3.5-medium/patches/transformer_patch.patch:
--------------------------------------------------------------------------------
1 | diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py
2 | index 81dff54f9..f27ebe7d3 100644
3 | --- a/src/transformers/models/t5/modeling_t5.py
4 | +++ b/src/transformers/models/t5/modeling_t5.py
5 | @@ -12,6 +12,10 @@
6 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7 | # See the License for the specific language governing permissions and
8 | # limitations under the License.
9 | +#
10 | +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
11 | +# SPDX-License-Identifier: BSD-3-Clause-Clear
12 | +# Not a Contribution
13 | """ PyTorch T5 model."""
14 |
15 |
16 | @@ -243,7 +247,8 @@ class T5LayerNorm(nn.Module):
17 | # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
18 | # half-precision inputs is done in fp32
19 |
20 | - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
21 | + div_first = hidden_states * torch.rsqrt(torch.tensor(hidden_states.shape[-1], dtype=torch.float32))
22 | + variance = div_first.pow(2).sum(-1, keepdim=True)
23 | hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
24 |
25 | # convert into half-precision if necessary
26 | @@ -330,11 +335,12 @@ class T5LayerFF(nn.Module):
27 |
28 | self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
29 | self.dropout = nn.Dropout(config.dropout_rate)
30 | + self.scaling_factor = nn.Parameter(torch.tensor(1.0))
31 |
32 | def forward(self, hidden_states):
33 | forwarded_states = self.layer_norm(hidden_states)
34 | forwarded_states = self.DenseReluDense(forwarded_states)
35 | - hidden_states = hidden_states + self.dropout(forwarded_states)
36 | + hidden_states = hidden_states * self.scaling_factor + self.dropout(forwarded_states)
37 | return hidden_states
38 |
39 |
40 | @@ -538,7 +544,8 @@ class T5Attention(nn.Module):
41 | # if key and values are already calculated
42 | # we want only the last query position bias
43 | if past_key_value is not None:
44 | - position_bias = position_bias[:, :, -hidden_states.size(1) :, :]
45 | + #position_bias = position_bias[:, :, -hidden_states.size(1) :, :]
46 | + position_bias = position_bias[:, :, -1:, :]
47 |
48 | if mask is not None:
49 | position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length)
50 | @@ -579,6 +586,7 @@ class T5LayerSelfAttention(nn.Module):
51 | self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
52 | self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
53 | self.dropout = nn.Dropout(config.dropout_rate)
54 | + self.scaling_factor = nn.Parameter(torch.tensor(1.0))
55 |
56 | def forward(
57 | self,
58 | @@ -600,7 +608,7 @@ class T5LayerSelfAttention(nn.Module):
59 | use_cache=use_cache,
60 | output_attentions=output_attentions,
61 | )
62 | - hidden_states = hidden_states + self.dropout(attention_output[0])
63 | + hidden_states = hidden_states * self.scaling_factor + self.dropout(attention_output[0])
64 | outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them
65 | return outputs
66 |
67 | @@ -611,6 +619,7 @@ class T5LayerCrossAttention(nn.Module):
68 | self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False)
69 | self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
70 | self.dropout = nn.Dropout(config.dropout_rate)
71 | + self.scaling_factor = nn.Parameter(torch.tensor(1.0))
72 |
73 | def forward(
74 | self,
75 | @@ -636,7 +645,7 @@ class T5LayerCrossAttention(nn.Module):
76 | query_length=query_length,
77 | output_attentions=output_attentions,
78 | )
79 | - layer_output = hidden_states + self.dropout(attention_output[0])
80 | + layer_output = hidden_states * self.scaling_factor + self.dropout(attention_output[0])
81 | outputs = (layer_output,) + attention_output[1:] # add attentions if we output them
82 | return outputs
83 |
84 |
--------------------------------------------------------------------------------
/samples/cpp/cpp_qpc_inference/Readme.md:
--------------------------------------------------------------------------------
1 | # Simple CPP Example for Bert-base-cased model on AIC-100
2 |
3 | This project demonstrates running the Hugging Face bert-base-cased model using the C++ Qaic APIs.
4 |
5 | ## To build and use it.
6 | ```bash
7 | mkdir build
8 | cd build
9 | cmake ..
10 | make
11 | ```
12 |
13 | The bert-base-cased model from Hugging Face relies on a vocabulary file
14 | (vocab.txt), which needs to be downloaded from the Hugging Face website.
15 |
16 | ## To use the example, the user needs to:
17 | - Download the Hugging Face bert-base-cased model (refer to the Jupyter notebooks for NLP models).
18 | - Replace the QPC path used in main.cpp with the actual QPC path.
19 | - Replace the names of the input/output buffers with those used when compiling the BERT model into the QPC
20 | ```
21 | for example:
22 | ("input_ids", "attention_mask") for input buffers
23 | ("logits") for output buffers
24 | ```
25 | - Build using the build steps above.
26 | - Run the executable `simple-bert-inference-example`.
27 |
28 | ## The example has the following helper classes.
29 |
30 | ### VocabularyHelper :
31 | This class parses the vocab.txt, and stores the index of every
32 | string token in the vocab.txt file. The index of the words in
33 | this file is used in the input and output fed to the model
34 | while running the inference.
35 |
36 |
37 | ### Tokenizer :
38 | This class is a very basic and trivial parser for the input sentence
39 | fed to the BERT model. It uses a space as the delimiter to parse
40 | the sentence. It does not provide special handling for special
41 | characters and symbols used in the sentence.
42 | Ideally, in C++ the user can use, for example, the
43 | SentencePiece library available at https://github.com/google/sentencepiece
44 |
45 |
46 | ### QBufferWrapper:
47 | This is a helper class to ensure that the memory allocated
48 | for QBuffers used in Qaic APIs is automatically released.
49 | Helper functions are provided for this class
50 |
51 | `createBuffer` : create the wrapper from a QBuffer class
52 |
53 | `qBufferToString` : create a string for printing with QBuffer data
54 |
55 |
56 | ### Helper functions to convert a few data structures to strings for printing:
57 | ```cpp
58 | [[nodiscard]] std::string to_string(const qaic::rt::BufferMapping& bufMap)
59 | [[nodiscard]] std::string to_string(const qaic::rt::BufferMappings& allBufferMappings)
60 | [[nodiscard]] std::string to_string(const std::vector & tokenVec)
61 | ```
62 |
63 | ### Processing the input and output for inference:
64 | The input buffer for BERT inference in this example is an array of bytes
65 | representing the indexes of each sentence word (in the vocabulary file).
66 |
67 | For example:
68 |
69 | If the compiled QPC has sequence length = 128 and the input type is int64_t,
70 | then the size of the input buffer must be
71 | 128 * 8
72 | 128 [max num tokens in input] * 8 [size in bytes of each vocabulary index]
73 |
74 | If the input sentence has 10 words, then the first 10*8 bytes in the
75 | input buffer must be populated with the indexes of the sentence words
76 | in the vocabulary file. The rest of the bytes must be zero-initialized.
77 |
78 | The BERT model uses attention_mask as an input. The attention_mask
79 | input buffer can be populated with 1 for the initial 10 words, and the rest of the bytes
80 | can be zero-initialized.
81 |
82 | The output buffer for bert inference in this example is an array
83 | of logit values (corresponding to each symbol/word in the vocabulary)
84 | for each input token.
85 |
86 | For example:
87 |
88 | If the compiled QPC has sequence length = 128 and the output format
89 | is float (4 bytes), then the size of the output QBuffer must be
90 | 128 * 4 * 289960
91 | 128 [max num tokens in input] * 4 [size of each logit value] * 289960 [vocabulary size]
92 |
93 | To get the predicted output word, the logit values for the
94 | [MASK] token must be extracted from the output buffer. Then the index of the
95 | maximum logit value can be used to identify the predicted word.
96 |
97 | For example:
98 |
99 | If the [MASK] token is at word index 3 in the sentence, then the corresponding
100 | logit values are present in the output buffer at
101 | byte positions 289960*3*4 to 289960*4*4.
102 | These 289960 float values are the logits for
103 | each symbol/word in the vocabulary.
104 | We find the index of the maximum logit value to get the index of the predicted
105 | word. Then we look up that word in the vocabulary.
106 |
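107 | To make the indexing above concrete, the following is a small NumPy sketch of the same arithmetic (a hypothetical illustration with a zero-filled placeholder buffer; the C++ code in main.cpp performs the equivalent pointer arithmetic on the output QBuffer):
108 |
109 | ```python
110 | import numpy as np
111 |
112 | SEQ_LEN = 128        # sequence length the QPC was compiled with
113 | VOCAB_SIZE = 289960  # vocabulary size used in the examples above
114 | mask_index = 3       # word index of the [MASK] token in the input sentence
115 |
116 | # placeholder for the raw output QBuffer: SEQ_LEN * VOCAB_SIZE float32 logits
117 | output_bytes = bytes(SEQ_LEN * VOCAB_SIZE * 4)
118 |
119 | # view the buffer as a (SEQ_LEN, VOCAB_SIZE) matrix of logits
120 | logits = np.frombuffer(output_bytes, dtype=np.float32).reshape(SEQ_LEN, VOCAB_SIZE)
121 |
122 | # logits for the [MASK] position, i.e. bytes VOCAB_SIZE*mask_index*4 .. VOCAB_SIZE*(mask_index+1)*4
123 | mask_logits = logits[mask_index]
124 |
125 | # index of the maximum logit = index of the predicted word in vocab.txt
126 | predicted_vocab_index = int(np.argmax(mask_logits))
127 | print(predicted_vocab_index)
128 | ```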
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/stable-diffusion-3.5-medium/model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear
3 |
4 | import os
5 | import torch
6 |
7 | from diffusers import StableDiffusion3Pipeline
8 |
9 | class QAICStableDiffusion3:
10 | def __init__(self, model_id = 'stabilityai/stable-diffusion-3.5-medium', device_id=0, device_id_2=1):
11 | sdxl_vae_decoder = './qpc/vae_decoder_64b_1024i_vae_16c_1b_2m_1o/programqpc.bin'
12 | text_encoder = './qpc/text_encoder_64b_1024i_16c_1b/programqpc.bin'
13 | transformer = './qpc/transformer_64b_1024i_16c_1b_1m_2o/programqpc.bin'
14 | text_encoder_2 = './qpc/text_encoder_2_64b_1024i_16c_1b/programqpc.bin'
15 |
16 | text_encoder_3 = None
17 |
18 | # check the QPCs
19 | transformer_qpc = transformer if transformer.endswith('programqpc.bin') else os.path.join(transformer,'programqpc.bin')
20 | assert os.path.isfile(transformer_qpc), f"Could not find binary {transformer_qpc = }!"
21 | vae_decoder_sdxl_qpc = sdxl_vae_decoder if sdxl_vae_decoder.endswith('programqpc.bin') else os.path.join(sdxl_vae_decoder,'programqpc.bin')
22 | assert os.path.isfile(vae_decoder_sdxl_qpc), f"Could not find binary {vae_decoder_sdxl_qpc = }!"
23 | text_encoder_qpc = text_encoder if text_encoder.endswith('programqpc.bin') else os.path.join(text_encoder,'programqpc.bin')
24 | assert os.path.isfile(text_encoder_qpc), f"Could not find binary {text_encoder_qpc = }!"
25 | text_encoder_2_qpc = text_encoder_2 if text_encoder_2.endswith('programqpc.bin') else os.path.join(text_encoder_2,'programqpc.bin')
26 | assert os.path.isfile(text_encoder_2_qpc), f"Could not find binary {text_encoder_2_qpc = }!"
27 |
28 | self.vae_type = "vae"
29 |
30 | # load the latents
31 | self.latents = None
32 |
33 | # load the model pipeline
34 | if text_encoder_3:
35 | text_encoder_3_qpc = text_encoder_3 if text_encoder_3.endswith('programqpc.bin') else os.path.join(text_encoder_3,'programqpc.bin')
36 | assert os.path.isfile(text_encoder_3_qpc), f"Could not find binary {text_encoder_3_qpc = }!"
37 | pipe = StableDiffusion3Pipeline.from_pretrained(
38 | model_id,
39 | device_id=device_id,
40 | device_id2=device_id_2,
41 | transformer_qpc=transformer_qpc,
42 | vae_decoder_qpc=vae_decoder_sdxl_qpc,
43 | text_encoder_qpc=text_encoder_qpc,
44 | text_encoder_2_qpc=text_encoder_2_qpc,
45 | text_encoder_3_qpc=text_encoder_3_qpc,
46 | )
47 | else:
48 | pipe = StableDiffusion3Pipeline.from_pretrained(
49 | model_id,
50 | device_id=device_id,
51 | device_id2=device_id_2,
52 | transformer_qpc=transformer_qpc,
53 | vae_decoder_qpc=vae_decoder_sdxl_qpc,
54 | text_encoder_qpc=text_encoder_qpc,
55 | text_encoder_2_qpc=text_encoder_2_qpc,
56 | text_encoder_3=None,
57 | tokenizer_3=None,
58 | )
59 |
60 | self.pipe = pipe
61 |
62 | def generate(self, prompt, n=1, image_size=(1024,1024), num_steps=28, guidance=4.5):
63 | height, width = image_size[0], image_size[1]
64 |
65 | images = self.pipe(prompt=prompt,
66 | negative_prompt='',
67 | num_inference_steps=num_steps,
68 | height=height,
69 | width=width,
70 | latents=self.latents,
71 | vae_type=self.vae_type,
72 | guidance_scale=guidance).images
73 |
74 | return images
75 |
76 | def main():
77 | model = QAICStableDiffusion3()
78 | prompt = 'photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece'
79 | image = model.generate(prompt, guidance=7.0)[0]
80 | image.save('harbor.png')
81 |
82 | if __name__ == "__main__":
83 | main()
84 |
85 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/sdxl_deepcache/patches/deepcache_unet.patch:
--------------------------------------------------------------------------------
1 | diff --git a/DeepCache/sdxl/unet_2d_condition.py b/DeepCache/sdxl/unet_2d_condition.py
2 | index 6c97199..f6865c6 100644
3 | --- a/DeepCache/sdxl/unet_2d_condition.py
4 | +++ b/DeepCache/sdxl/unet_2d_condition.py
5 | @@ -11,6 +11,10 @@
6 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7 | # See the License for the specific language governing permissions and
8 | # limitations under the License.
9 | +#
10 | +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
11 | +# SPDX-License-Identifier: BSD-3-Clause-Clear
12 | +# Not a Contribution
13 | from dataclasses import dataclass
14 | from typing import Any, Dict, List, Optional, Tuple, Union
15 |
16 | @@ -591,6 +595,8 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
17 | self.position_net = PositionNet(
18 | positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
19 | )
20 | + self.cache_layer_id = 0
21 | + self.cache_block_id = 0
22 |
23 | @property
24 | def attn_processors(self) -> Dict[str, AttentionProcessor]:
25 | @@ -741,6 +747,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
26 | sample: torch.FloatTensor,
27 | timestep: Union[torch.Tensor, float, int],
28 | encoder_hidden_states: torch.Tensor,
29 | + replicate_prv_feature: Optional[List[torch.Tensor]],
30 | class_labels: Optional[torch.Tensor] = None,
31 | timestep_cond: Optional[torch.Tensor] = None,
32 | attention_mask: Optional[torch.Tensor] = None,
33 | @@ -749,10 +756,6 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
34 | down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
35 | mid_block_additional_residual: Optional[torch.Tensor] = None,
36 | encoder_attention_mask: Optional[torch.Tensor] = None,
37 | - quick_replicate: bool = False,
38 | - replicate_prv_feature: Optional[List[torch.Tensor]] = None,
39 | - cache_layer_id: Optional[int] = None,
40 | - cache_block_id: Optional[int] = None,
41 | return_dict: bool = True,
42 | ) -> Union[UNet2DConditionOutput, Tuple]:
43 | r"""
44 | @@ -954,8 +957,11 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
45 | is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
46 | is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None
47 |
48 | + cache_layer_id = self.cache_layer_id
49 | + cache_block_id = self.cache_block_id
50 | down_block_res_samples = (sample,)
51 | - if quick_replicate and replicate_prv_feature is not None:
52 | + if False:
53 | + print("Using cache...")
54 | # Down
55 | for i, downsample_block in enumerate(self.down_blocks):
56 | if i > cache_layer_id:
57 | @@ -1037,9 +1043,10 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
58 | scale=lora_scale,
59 | enter_block_number=cache_block_id if i == len(self.up_blocks) - 1 - cache_layer_id else None,
60 | )
61 | -
62 | +
63 | prv_f = replicate_prv_feature
64 | else:
65 | + print("Initializing cache...")
66 | for i, downsample_block in enumerate(self.down_blocks):
67 | if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
68 | # For t2i-adapter CrossAttnDownBlock2D
69 | @@ -1137,17 +1144,15 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
70 | upsample_size=upsample_size,
71 | scale=lora_scale,
72 | )
73 | -
74 | +
75 | #print(cache_layer_id, current_record_f is None, i == len(self.up_blocks) - cache_layer_id - 1)
76 | #print("Append prv_feature with shape:", sample.shape)
77 | if cache_layer_id is not None and current_record_f is not None and i == len(self.up_blocks) - cache_layer_id - 1:
78 | prv_f = current_record_f[-cache_block_id-1]
79 | -
80 | +
81 | # 6. post-process
82 | if self.conv_norm_out:
83 | sample = self.conv_norm_out(sample)
84 | sample = self.conv_act(sample)
85 | sample = self.conv_out(sample)
86 | - if not return_dict:
87 | - return (sample, prv_f,)
88 | - return UNet2DConditionOutput(sample=sample)
89 | + return (sample, prv_f,)
90 |
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/stable-diffusion-xl-base-1.0/compile_models.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ####################################################################################################
3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause-Clear
5 | ####################################################################################################
6 |
7 | BINARY_FOLDER="./qpc/"
8 | LOG_FOLDER="./compile_logs/"
9 | BATCH_SIZE=1
10 | BATCH_SIZE_2=$(expr 2 \* $BATCH_SIZE)
11 | SEQ_LEN=77
12 | LATENT_CHANNELS=4
13 | LATENT_HEIGHT=128
14 | LATENT_WIDTH=128
15 | NUM_CORES=16
16 | VAE_MOS=2
17 | VAE_OLS=1
18 | UNET_MOS_BS1=2
19 | UNET_OLS_BS1=1
20 | UNET_MOS_BS2=1
21 | UNET_OLS_BS2=2
22 |
23 | mkdir ${BINARY_FOLDER}
24 | mkdir ${LOG_FOLDER}
25 |
26 | ########################################################################################################################
27 |
28 | # 1. Compile the text encoder - self-generated
29 | rm -rf ${BINARY_FOLDER}text_encoder
30 | /opt/qti-aic/exec/qaic-exec \
31 | -aic-hw -aic-hw-version=2.0 -aic-perf-warnings -aic-perf-metrics \
32 | -compile-only -convert-to-fp16 \
33 | -m=./onnx_files/text_encoder/model.onnx \
34 | -onnx-define-symbol=batch_size,${BATCH_SIZE} \
35 | -stats-batchsize=${BATCH_SIZE} \
36 | -onnx-define-symbol=sequence_length,${SEQ_LEN} \
37 | -aic-num-cores=${NUM_CORES} \
38 | -aic-binary-dir=${BINARY_FOLDER}text_encoder \
39 | 2>&1 | ts > ${LOG_FOLDER}text_encoder.log &
40 |
41 | ########################################################################################################################
42 |
43 | # 2. Compile the text encoder 2 - self-generated
44 | rm -rf ${BINARY_FOLDER}text_encoder_2
45 | /opt/qti-aic/exec/qaic-exec \
46 | -aic-hw -aic-hw-version=2.0 -aic-perf-warnings -aic-perf-metrics \
47 | -compile-only -convert-to-fp16 \
48 | -m=./onnx_files/text_encoder_2/model.onnx \
49 | -onnx-define-symbol=batch_size,${BATCH_SIZE} \
50 | -stats-batchsize=${BATCH_SIZE} \
51 | -onnx-define-symbol=sequence_length,${SEQ_LEN} \
52 | -aic-num-cores=${NUM_CORES} \
53 | -aic-binary-dir=${BINARY_FOLDER}text_encoder_2 \
54 | 2>&1 | ts > ${LOG_FOLDER}text_encoder_2.log &
55 |
56 | ########################################################################################################################
57 |
58 | # 3a. Compile the UNet with batchsize=1, blocksize=256
59 | rm -rf ${BINARY_FOLDER}unet-bs${BATCH_SIZE}
60 | /opt/qti-aic/exec/qaic-exec \
61 | -aic-hw -aic-hw-version=2.0 -aic-perf-warnings -aic-perf-metrics \
62 | -compile-only -convert-to-fp16 \
63 | -mos=${UNET_MOS_BS1} -ols=${UNET_OLS_BS1} \
64 | -m=./onnx_files/unet_bs1/unet/model.onnx \
65 | -onnx-define-symbol=batch_size,${BATCH_SIZE} \
66 | -stats-batchsize=${BATCH_SIZE} \
67 | -onnx-define-symbol=sequence_length,${SEQ_LEN} \
68 | -onnx-define-symbol=steps,1 \
69 | -onnx-define-symbol=num_channels,${LATENT_CHANNELS} \
70 | -onnx-define-symbol=height,${LATENT_HEIGHT} \
71 | -onnx-define-symbol=width,${LATENT_WIDTH} \
72 | -aic-num-cores=${NUM_CORES} \
73 | -aic-binary-dir=${BINARY_FOLDER}unet-bs${BATCH_SIZE} \
74 | 2>&1 | ts > ${LOG_FOLDER}unet-bs${BATCH_SIZE}.log &
75 |
76 |
77 | # 3b. Compile the UNet with batchsize=2, blocksize=128
78 | rm -rf ${BINARY_FOLDER}unet-bs${BATCH_SIZE_2}
79 | /opt/qti-aic/exec/qaic-exec \
80 | -aic-hw -aic-hw-version=2.0 -aic-perf-warnings -aic-perf-metrics \
81 | -compile-only -convert-to-fp16 \
82 | -mos=${UNET_MOS_BS2} -ols=${UNET_OLS_BS2} \
83 | -m=./onnx_files/unet_bs2/unet/model.onnx \
84 | -onnx-define-symbol=batch_size,${BATCH_SIZE_2} \
85 | -stats-batchsize=${BATCH_SIZE_2} \
86 | -onnx-define-symbol=sequence_length,${SEQ_LEN} \
87 | -onnx-define-symbol=steps,1 \
88 | -onnx-define-symbol=num_channels,${LATENT_CHANNELS} \
89 | -onnx-define-symbol=height,${LATENT_HEIGHT} \
90 | -onnx-define-symbol=width,${LATENT_WIDTH} \
91 | -aic-num-cores=${NUM_CORES} \
92 | -aic-binary-dir=${BINARY_FOLDER}unet-bs${BATCH_SIZE_2} \
93 | 2>&1 | ts > ${LOG_FOLDER}unet-bs${BATCH_SIZE_2}.log &
94 |
95 |
96 | ########################################################################################################################
97 |
98 | # 4. Compile the VAE Decoder
99 | rm -rf ${BINARY_FOLDER}vae_decoder
100 | /opt/qti-aic/exec/qaic-exec \
101 | -aic-hw -aic-hw-version=2.0 -aic-perf-warnings -aic-perf-metrics \
102 | -compile-only -convert-to-fp16 \
103 | -mos=${VAE_MOS} -ols=${VAE_OLS} \
104 | -m=./onnx_files/vae_decoder/model_fixed_128.onnx \
105 | -onnx-define-symbol=batch_size,${BATCH_SIZE} \
106 | -stats-batchsize=${BATCH_SIZE} \
107 | -onnx-define-symbol=num_channels_latent,${LATENT_CHANNELS} \
108 | -onnx-define-symbol=height_latent,${LATENT_HEIGHT} \
109 | -onnx-define-symbol=width_latent,${LATENT_WIDTH} \
110 | -aic-num-cores=${NUM_CORES} \
111 | -aic-enable-depth-first -aic-depth-first-mem=32 \
112 | -aic-binary-dir=${BINARY_FOLDER}vae_decoder \
113 | 2>&1 | ts > ${LOG_FOLDER}vae_decoder.log &
114 |
115 | ########################################################################################################################
116 |
117 | echo Waiting for qaic-exec processes to finish ...
118 | wait
119 |
120 |
--------------------------------------------------------------------------------
/samples/python/aws_ai100_benchmarking/parse_latency_and_throughput.py:
--------------------------------------------------------------------------------
1 | ##############################################################################
2 | # @@-COPYRIGHT-START-@@
3 | #
4 | # Copyright (c) 2023, Qualcomm Technologies, Inc. All Rights Reserved.
5 | # Redistribution and use in source and binary forms, with or without
6 | # modification, are permitted provided that the following conditions are met:
7 | #
8 | # 1. Redistributions of source code must retain the above copyright notice,
9 | # this list of conditions and the following disclaimer.
10 | #
11 | # 2. Redistributions in binary form must reproduce the above copyright notice,
12 | # this list of conditions and the following disclaimer in the documentation
13 | # and/or other materials provided with the distribution.
14 | #
15 | # 3. Neither the name of the copyright holder nor the names of its contributors
16 | # may be used to endorse or promote products derived from this software
17 | # without specific prior written permission.
18 | #
19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 | # POSSIBILITY OF SUCH DAMAGE.
30 | #
31 | # SPDX-License-Identifier: BSD-3-Clause
32 | #
33 | # @@-COPYRIGHT-END-@@
34 | ##############################################################################
35 |
36 |
37 | import os
38 | import sys
39 | from glob import glob
40 | import pandas as pd
41 | import numpy as np
42 |
43 |
44 | def get_metric(series, method):
45 | '''
46 | This function computes the average or percentile for a pandas.Series object
47 | '''
48 | if method == 'mean' or method == 'avg':
49 | return series.mean()
50 | elif method.endswith('pct'):
51 | prctile = int(method.replace('pct', ''))/100
52 | return series.quantile(prctile)
53 | return None
54 |
55 |
56 | def get_latency(latency_logs, latency_method):
57 | '''
58 | This function computes the latency from the profiling latency
59 | text files, using the latency_method specified
60 | '''
61 | df = pd.concat([pd.read_csv(filename, skiprows=4)
62 | for filename in latency_logs])
63 | col = df.columns[-3] # Execution Total Time in microseconds
64 | latency_ms = get_metric(df[col], latency_method)/1000.0
65 | return latency_ms
66 |
67 |
68 | if __name__ == "__main__":
69 | if len(sys.argv) < 3:
70 | print("Syntax: python parse_latency_and_throughput.py <latency_method> <model_name> [<model_name> ...]")
71 | print("where <latency_method> is 'mean', 'avg', or 'Kpct', where K is a number between 0 and 100")
72 | print("<model_name> should include the full path to the model folder where 'outputFiles' and log files are located")
73 | sys.exit()
74 |
75 | latency_method = sys.argv[1]
76 | if (latency_method not in ['mean', 'avg']) and (not latency_method.endswith('pct')):
77 | raise ValueError(f"Methods supported are mean/avg or pct, received {latency_method}")
78 | model_names = sys.argv[2:]
79 | print(model_names)
80 |
81 | # parse the logs and print the latency and throughput
82 | for config in ['best-throughput', 'balanced', 'best-latency']:
83 |
84 | print("******************************************************************")
85 | print(f"*** Latency: {config} configurations **************************")
86 | print("******************************************************************")
87 | for model in model_names:
88 | config_folders = glob(f"{model}/outputFiles/fp16*{config}")
89 | print(f"{model}: Found {len(config_folders)} {config} configurations")
90 | if len(config_folders) == 0:
91 | continue
92 | latency_logs = glob(f"{config_folders[0]}/*latency.txt")
93 | print(f"Model: {model}: Latency ({latency_method}) = {get_latency(latency_logs, latency_method):.3f} ms")
94 |
95 | print("******************************************************************")
96 | print(f"*** Throughput: {config} configurations *************************")
97 | print("******************************************************************")
98 | for model in model_names:
99 | log_file = f"{model}/{config}.log"
100 | if not os.path.exists(log_file):
101 | print(f"Model: {model}: {log_file} does not exist")
102 | continue
103 | with open(log_file, 'r') as fid:
104 | throughput = np.double([line.split()[-1]
105 | for line in fid.read().splitlines()
106 | if 'Inf/Sec' in line][-1])
107 | print(f"Model: {model}: Throughput = {throughput:.3f} inf/sec")
108 | print("******************************************************************")
109 |
--------------------------------------------------------------------------------
/models/vision/detection/README.md:
--------------------------------------------------------------------------------
1 | ## Description
2 | ---
3 |
4 | Download the yolov5, yolov7, and yolov8 models, prepare them for the Qualcomm AIC100, compile for high-throughput, min-latency, or balanced throughput with fp16 precision, run the model on a randomly generated sample, and obtain the benchmarking results and output values.
5 |
6 | ## Source of the models
7 | ---
8 | The models are downloaded from https://github.com/ultralytics/yolov5. This script has been tested with the following models:
9 | * yolov5s
10 | * yolov5m
11 | * yolov5l
12 | * yolov5x
13 | * yolov7-e6e
14 | * yolov8m
15 |
16 | ## Virtual environment
17 | ---
18 | For a quick environment setup:
19 |
20 | ```commandline
21 | python3.10 -m venv det_env
22 | source det_env/bin/activate
23 |
24 | ```
25 |
26 | ## Framework and version
27 | ---
28 | ```commandline
29 | pip3 install -r requirements.txt
30 |
31 | ```
32 | ## Syntax
33 | ---
34 | Copy the run_yolo_model.py and the lut_yolo_models.csv to a working directory. Pick a MODEL_NAME from the list above, and type:
35 |
36 | ```commandline
37 |
38 | usage: run_yolo_model.py [-h] --model-name {yolov5s,yolov5m,yolov5l,yolov5x,yolov7-e6e,yolov8m}
39 | [--objective {best-latency,best-throughput,balanced}]
40 | [--opset OPSET]
41 | [--batch-size BATCH_SIZE]
42 | [--image-size IMAGE_SIZE]
43 | [--cores {1,2,3,4,5,6,7,8,9,10,11,12,13,14}]
44 | [--instances {1,2,3,4,5,6,7,8,9,10,11,12,13,14}]
45 | [--ols {1,2,3,4,5,6,7,8,9,10,11,12,13,14}]
46 | [--mos {1,2,3,4,5,6,7,8,9,10,11,12,13,14}]
47 | [--set-size {1,2,3,4,5,6,7,8,9,10}]
48 | [--extra EXTRA]
49 | [--time TIME]
50 | [--device {0,1,2,3,4,5,6,7}]
51 | [--run-only]
52 |
53 |
54 |
55 | Download, Compile, and Run YOLO models on randomly generated inputs.
56 |
57 |
58 | optional arguments:
59 | -h, --help show this help message and exit
60 | --model-name, -m {yolov5s,yolov5m,yolov5l,yolov5x,yolov7-e6e,yolov8m}
61 | Model name to download.
62 | --objective, -o {best-latency,best-throughput,balanced}
63 | Running for best-latency, best-throughput, or balanced
64 | --opset OPSET ONNX opset. Default <12>
65 | --batch-size, -b BATCH_SIZE
66 | Sample input batch size. Default <1>.
67 | --image-size, -s IMAGE_SIZE
68 | Sample input image width/height. Default <640>.
69 | --cores, -c {1,2,3,4,5,6,7,8,9,10,11,12,13,14}
70 | Number of AIC100 cores to compile the model for. Default <2>
71 | --instances, -i {1,2,3,4,5,6,7,8,9,10,11,12,13,14}
72 | Number of model instances to run on AIC100. Default <7>
73 | --ols {1,2,3,4,5,6,7,8,9,10,11,12,13,14}
74 | Overlap split factor. Default <1>
75 | --mos {1,2,3,4,5,6,7,8,9,10,11,12,13,14}
76 | Maximum output channel split. Default
77 | --set-size {1,2,3,4,5,6,7,8,9,10}
78 | Set size. Default <10>
79 | --extra EXTRA Extra compilation arguments.
80 | --time TIME Duration (in seconds) for which to submit inferences. Default <20>
81 | --device, -d {0,1,2,3,4,5,6,7}
82 | AIC100 device ID. Default <0>
83 | --run-only, -r Performs the inference only, without re-exporting and re-compiling the model
84 | --include-nms Run the model preparator tool to optimize the graph, and to add the Post Processing to supported models. Details on model preparator tool here- https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Inference-Workflow/Export-the-model/Prepare-the-model/
85 |
86 |
87 | ```
88 | Examples:
89 | ```commandline
90 | python run_yolo_model.py -m yolov5s -o best-throughput
91 | ```
92 | ```commandline
93 | python run_yolo_model.py -m yolov5m -o balanced
94 | ```
95 | ```commandline
96 | python run_yolo_model.py -m yolov5x -o best-throughput
97 | ```
98 |
99 | The hardware configuration will either be taken from the corresponding row in lut_yolo_models.csv or set to default values if not specified by the user. If the MODEL_NAME is not included in lut_yolo_models.csv, default values will be used.
100 |
101 | After the download, compile, and run steps are complete, the working directory of the selected model looks as follows.
102 | # Working directory structure
103 | ```
104 | |── model # Contains the onnx file of the picked model
105 | | └── model.onnx # The onnx file of the picked model
106 | |── inputFiles # Contains the (randomly generated) input files of the compiled model
107 | │ └── input_img*.raw # Randomly generated input files of the compiled model
108 | |── outputFiles # Contains the corresponding output to input, as well as the hardware profiling for latency
109 | │ └── fp16*
110 | │ └── output-act*.bin # Corresponding output to the randomly generated input_img*.raw
111 | │ └── aic-profil*.bin # The hardware profiling for round trip latency between host and device for each inference
112 | ├── compiled-bin* # Compilation path
113 | │ └── programqpc.bin # For the selected objective, the model.onnx is compiled into programqpc.bin
114 | ├── list*.txt # A list that contains path to the inputs. Can be used as input to qaic-runner
115 | ├── commands*.txt # Includes necessary compilation and running scripts to reproduce the results manually.
116 |
117 | ```
118 | To manually reproduce the results, navigate to the working directory, select the qaic compile/run commands from the commands*.txt file, and run them in the terminal.
119 |
--------------------------------------------------------------------------------
/utils/multi-device/README.md:
--------------------------------------------------------------------------------
1 | # Multi Device
2 |
3 | This guide provides setup instructions for multi-device enablement. PCIe peer-to-peer (P2P) communication must be enabled to allow efficient tensor slicing across multiple Cloud AI devices (SoCs and cards).
4 |
5 | Refer to [Model Sharding](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Features/model_sharding/) for more information on recommended PCIe topologies for tensor slicing (P2P).
6 |
7 | ## Pre-requisites
8 |
9 | - Server with Platform and APPS SDK versions >= 1.17 installed.
10 | - PCIe switch for inter-card P2P communication
11 | - python3 -m pip install pyudev
12 |
13 | ## Setup instructions
14 |
15 | Platform SDK 1.18 and later offers an option (`--setup_mdp all`) to enable P2P for the multi-device partitioning tensor slicing feature during installation.
16 |
17 | Example:
18 |
19 | ```
20 | cd /x86_64/deb
21 | sudo bash install.sh --setup_mdp all
22 | ```
23 |
24 | > [!IMPORTANT]
25 | > If P2P is enabled via the Platform SDK installer then skip to the [Testing P2P](#testing-p2p) section.
26 | >
27 | > The remaining steps in this section show manual steps for enabling P2P.
28 |
29 | ### Disable PCIe ACS for P2P communication between cards
30 |
31 | 1. Run `QAicChangeAcs.py` without any flags to display a hierarchical view of PCI bridges and AI 100 devices.
32 |
33 | ```
34 | $ python3 QAicChangeAcs.py
35 | Found the following AIC100 devices:
36 | Root
37 | ----0000:30:01.1 <-- Host system PCIe switch, script will disable ACS here
38 | --------0000:31:00.0 <-- Ultra AI 100 onboard PCIe switch, script will disable ACS here
39 | ------------0000:32:03.0
40 | ----------------0000:36:00.0 [Qualcomm AIC100]
41 | ------------0000:32:02.0
42 | ----------------0000:35:00.0 [Qualcomm AIC100]
43 | ------------0000:32:00.0
44 | ----------------0000:38:00.0 [Qualcomm AIC100]
45 | ------------0000:32:01.0
46 | ----------------0000:39:00.0 [Qualcomm AIC100]
47 | --------0000:21:00.0 <-- Ultra AI 100 onboard PCIe switch, script will disable ACS here
48 | ------------0000:22:00.0
49 | ----------------0000:23:00.0 [Qualcomm AIC100]
50 | ------------0000:22:02.0
51 | ----------------0000:25:00.0 [Qualcomm AIC100]
52 | ------------0000:22:01.0
53 | ----------------0000:27:00.0 [Qualcomm AIC100]
54 | ------------0000:22:03.0
55 | ----------------0000:28:00.0 [Qualcomm AIC100]
56 | ```
57 |
58 | 2. Run `QAicChangeAcs.py all` to disable ACS on all the downstream ports (on the PCIe switch) that connect to AI 100 devices as well as PCIe switch downstream ports that connect to the PCIe switch onboard the AI 100 cards. This command will enable P2P between the AI 100 devices (SoCs) on the same card as well as card to card.
59 |
60 | 3. Users can optionally disable ACS selectively by running `QAicChangeAcs.py <SSSS:BB:DD.F>`, where
61 | - SSSS = 4 digits segment number
62 | - BB = 2 digits bus number
63 | - DD = 2 digits device number
64 | - F = 1 digit function number
65 |
66 | of the nearest common ancestor PCI bridge under which ACS needs to be disabled.
67 |
68 | Examples:
69 |
70 | `$ python3 QAicChangeAcs.py 0000:31:00.0` will disable ACS on the first set of AI 100 devices (0000:36:00.0, 0000:35:00.0, 0000:38:00.0 and 0000:39:00.0).
71 | `$ python3 QAicChangeAcs.py 0000:30:01.1` will disable ACS across both the AI 100 Ultra cards as well as the 4 devices in each AI 100 card
72 |
73 | 4. The above steps need to be repeated after every server power cycle.
74 |
75 |
76 | ### Enable multi-device partitioning (MDP)
77 |
78 | This step is required every time a new version of the Platform SDK is installed.
79 |
80 | First, check that the Qaic Monitor service is running
81 | ```
82 | sudo systemctl status qmonitor-proxy
83 | ```
84 |
85 | If it is not active (running), start it with:
86 | ```
87 | sudo systemd-run --unit=qmonitor-proxy /opt/qti-aic/tools/qaic-monitor-grpc-server
88 | ```
89 |
90 | Next, enable MDP across all Cloud AI devices in the server.
91 | ```
92 | sudo /opt/qti-aic/tools/qaic-monitor-json -i enable_mdp.json
93 | ```
94 |
95 | Reset Cloud AI devices for changes to take effect:
96 | ```
97 | sudo /opt/qti-aic/tools/qaic-util -s
98 | ```
99 |
100 | ## Testing P2P
101 |
102 | The Qaic Kernel driver requires a longer response timeout for P2P workloads. Use the following command to increase the timeout:
103 | ```
104 | sudo sh -c 'echo 2600 > /sys/module/qaic/parameters/control_resp_timeout_s'
105 | ```
106 |
107 | Synthetic P2P workloads are available in `/opt/qti-aic/test-data/aic100/v2/qaic-compute-networks/factory-workload-bin`.
108 |
109 | ### Multi-SoC Accelerators (Ultra) P2P tests
110 |
111 | ```
112 | # P2P between 2 SoCs with QID 0 and 1 on the same card
113 | sudo /opt/qti-aic/exec/qaic-runner -t /opt/qti-aic/test-data/aic100/v2/qaic-compute-networks/factory-workload-bin/2c-p2p-bw -n 10 -a 1 -l -D 0:1
114 |
115 | # P2P between 2 SoCs with QID 0 and 4 on different cards. Choose cards that are on the same PCIe switch.
116 | sudo /opt/qti-aic/exec/qaic-runner -t /opt/qti-aic/test-data/aic100/v2/qaic-compute-networks/factory-workload-bin/2c-p2p-bw -n 10 -a 1 -l -D 0:4
117 | ```
118 |
119 | ### Single-SoC Accelerators (Standard/Pro) P2P tests
120 |
121 | ```
122 | # P2P between 2 SoCs with QID 0 and 1 on different cards. Choose cards that are on the same PCIe switch.
123 | sudo /opt/qti-aic/exec/qaic-runner -t /opt/qti-aic/test-data/aic100/v2/qaic-compute-networks/factory-workload-bin/2c-p2p-bw -n 10 -a 1 -l -D 0:1
124 | ```
125 |
126 | ### Troubleshooting
127 | If a `Failed to access P2P device` error occurs, check the following:
128 | 1. Re-check enablement instructions above
129 | 2. Review the PCIe topology from the QAicChangeAcs.py script to make sure that a host PCIe switch is present
130 |
--------------------------------------------------------------------------------
/utils/qaic-bench/README.md:
--------------------------------------------------------------------------------
1 | # qaic-bench
2 |
3 | Benchmarking script for Cloud AI Inference accelerators.
4 |
5 | ## Installation for x86_64
6 |
7 | Download Cloud AI Docker Image:
8 |
9 | ```
10 | docker pull ghcr.io/quic/cloud_ai_inference_ubuntu22:1.20.4.0
11 | ```
12 |
13 | Start the container. This example maps 4 Cloud AI 100 Ultra accelerators. Each accelerator has 4 SoC devices.
14 |
15 | Note: For QPC generation, choose a `/cache` location with 1TB or more of free space to hold model weights, ONNX files, and QPC model binaries.
16 |
17 | Note: Run `docker container rm qaic-bench` to clean up after exiting the container.
18 |
19 | ```
20 | cd utils/qaic-bench
21 |
22 | docker run -it \
23 | --workdir /app \
24 | --name qaic-bench \
25 | --network host \
26 | --mount type=bind,source=${PWD},target=/app \
27 | --mount type=bind,source=${HOME}/.cache,target=/cache \
28 | --env HF_HOME='/cache/huggingface' \
29 | --env QEFF_HOME='/cache/qeff_models' \
30 | --env XDG_CACHE_HOME='/cache' \
31 | --device=/dev/accel/accel0 \
32 | --device=/dev/accel/accel1 \
33 | --device=/dev/accel/accel2 \
34 | --device=/dev/accel/accel3 \
35 | --device=/dev/accel/accel4 \
36 | --device=/dev/accel/accel5 \
37 | --device=/dev/accel/accel6 \
38 | --device=/dev/accel/accel7 \
39 | --device=/dev/accel/accel8 \
41 | --device=/dev/accel/accel9 \
42 | --device=/dev/accel/accel10 \
43 | --device=/dev/accel/accel11 \
44 | --device=/dev/accel/accel12 \
45 | --device=/dev/accel/accel13 \
46 | --device=/dev/accel/accel14 \
47 | --device=/dev/accel/accel15 \
48 | ghcr.io/quic/cloud_ai_inference_ubuntu22:1.20.4.0
49 | ```
50 |
51 | Activate vLLM environment:
52 |
53 | ```
54 | source /opt/vllm-env/bin/activate
55 | ```
56 |
57 | ## Installation for AArch64
58 |
59 | Follow the instructions [here](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/vLLM/vLLM/index.html#installing-from-source) to set up the vLLM environment for Cloud AI from source for AArch64.
60 |
61 | Activate vLLM environment:
62 |
63 | ```
64 | source qaic-vllm-venv/bin/activate
65 | ```
66 |
67 | ## KV-Heads Replication
68 |
69 | Download KV-Heads Replication script from Efficient Transformers. This is needed to efficiently tensor-slice large models across 16 SoCs.
70 |
71 | ```
72 | wget https://github.com/quic/efficient-transformers/raw/refs/heads/release/v1.19.3_fp8_update/scripts/replicate_kv_head/replicate_kv_heads.py
73 | ```
74 |
75 | ## Multi-Device Operation
76 |
77 | To run models across multiple AI 100 devices, make sure tensor slicing is enabled with:
78 |
79 | ```
80 | sudo /opt/qti-aic/tools/qaic-util -a
81 | ```
82 |
83 | The control response timeout must also be extended:
84 |
85 | ```
86 | sudo sh -c 'echo 2600 > /sys/module/qaic/parameters/control_resp_timeout_s'
87 | ```
88 |
89 | More details at: https://github.com/quic/cloud-ai-sdk/tree/1.20/utils/multi-device
90 |
91 | ## Hugging Face Access Token
92 |
93 | Some models on Hugging Face are access protected. Add your access token with the `--hf_token` script argument or set the `HF_TOKEN` environment variable. Learn more about Authentication here: https://huggingface.co/docs/huggingface_hub/en/quick-start#authentication.
94 |
95 | ## Usage
96 |
97 | Example:
98 |
99 | ```
100 | python3 qaic_bench.py config/config_llama_3_1_8b.json
101 | ```
102 |
103 | Details:
104 |
105 | ```
106 | usage: qaic_bench.py [-h] [--devices DEVICES] [--compile-only] config
107 |
108 | positional arguments:
109 | config JSON file with model configurations
110 |
111 | options:
112 | -h, --help show this help message and exit
113 | --devices DEVICES List of comma separated device IDs to use for inferencing
114 | --compile-only Generate QPCs and skip benchmarking
115 | --hf_token Hugging Face access token
116 | ```
117 |
118 | ## Configuration
119 |
120 | ### Example
121 |
122 | ```
123 | {
124 | "vllm_root": "/opt/qti-aic/integrations/vllm",
125 |
126 | "models": [
127 | {
128 | "name": "Meta-Llama-3.1-8B-Instruct",
129 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
130 | "configs": [
131 | {
132 | "batch_size": 1,
133 | "devices": 4,
134 | "prompt_len": 4096,
135 | "generation_len": 4096
136 | }
137 | ]
138 | }
139 | ]
140 | }
141 | ```
142 |
143 | ### JSON Reference
144 |
145 | | Property | Description |
146 | | -------------- | ----------------------------------------- |
147 | | vllm_root | Path to full vLLM installation |
148 | | models | List of models to benchmark |
149 |
150 | ### Model Properties
151 |
152 | | Property | Description |
153 | | -------------- | ----------------------------------------- |
154 | | name | Model friendly name |
155 | | model | Hugging Face model path |
156 | | configs | List of model configurations to benchmark |
157 |
158 | ### Config Properties
159 |
160 | | Property | Description |
161 | | ---------------- | ----------------------------------------- |
162 | | batch_size | Model batch size. |
163 | | devices | Number of Cloud AI SoCs for tensor-sliced execution. Set to 1 for single-SoC execution. |
164 | | cores (optional) | Number of AI Cores for compilation. Default 16. |
165 | | prompt_len | Prompt input length |
166 | | generation_len | Max number of output tokens to generate |
167 | | qpc (optional) | Path to pre-generated QPC binary. If not specified, QPC will be generated. |
168 |
--------------------------------------------------------------------------------
/tutorials/open-webui/README.md:
--------------------------------------------------------------------------------
1 | # Connecting Cloud AI models to Open WebUI
2 |
3 | [Open WebUI](https://github.com/open-webui/open-webui) is a self-hosted web interface for AI use-cases like Chat, Image Generation and RAG.
4 | By starting OpenAI-compatible endpoints with vLLM, we can connect Open WebUI to AI models running on Qualcomm Cloud AI accelerators.
5 |
6 |
7 |
8 |
9 |
10 | ## Pre-requisites
11 |
12 | * Cloud AI Platform and Apps SDKs [Installation](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/Cloud-AI-SDK/Cloud-AI-SDK/index.html)
13 | * Cloud AI 100 Ultra accelerator card
14 | * Python 3.10
15 | * Docker
16 |
17 | To run language models on multiple SoCs, make sure [tensor slicing](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Features/model_sharding/index.html) is enabled and disable ACS:
18 |
19 | ```
20 | sudo /opt/qti-aic/tools/qaic-util -a
21 | ```
22 |
23 | Increase the response timeout:
24 | ```
25 | sudo sh -c "echo 2600 > /sys/module/qaic/parameters/control_resp_timeout_s"
26 | ```
27 |
28 | Preface all docker commands with `sudo`, or add yourself to the docker group:
29 | ```
30 | sudo usermod -aG docker $USER
31 | ```
32 |
33 | Launch a new shell or `newgrp docker` to apply the changes.
34 |
35 | ## Prepare the model
36 |
37 | Use [Efficient Transformers](https://github.com/quic/efficient-transformers) to prepare popular models like Llama-3.3-70B-Instruct, Qwen2.5-Coder and Phi4, or download pre-generated model binaries at http://qualcom-qpc-models.s3-website-us-east-1.amazonaws.com/QPC/. Note the location of the 'programqpc.bin' files as you'll need these to start vLLM. Efficient-transformers stores model binaries in [~/.cache/qeff_cache](https://quic.github.io/efficient-transformers/source/quick_start.html#transformed-models-and-qpc-storage) by default.
38 |
39 | ## Cloud AI Inference Container
40 |
41 | [Cloud AI Inference containers](https://github.com/quic/cloud-ai-containers/pkgs/container/cloud_ai_inference_ubuntu22) include everything needed to compile and serve models with vLLM on Cloud AI accelerators.
42 |
43 | Download the Docker image:
44 |
45 | ```
46 | docker pull ghcr.io/quic/cloud_ai_inference_ubuntu22:1.20.4.0
47 | ```
48 |
49 | ## Start vLLM endpoint
50 |
51 | Prepare a script to launch vLLM with the pre-generated model binary inside the container.
52 |
53 | Customize the Hugging Face model name (`--model`), context length (`--max-model-len`), prompt length (`--max-seq_len-to-capture`), and full batch size (`--max-num-seq`) to match the QPC from the 'Prepare the model' step above.
54 |
55 | ```
56 | $ cat << EOF > serve.sh
57 | #!/bin/bash
58 | /opt/vllm-env/bin/python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4 --max-model-len 4096 --max-num-seq 1 --max-seq_len-to-capture 128 --device qaic --device-group 0,1,2,3
59 | EOF
60 |
61 | # Script must have execute permission
62 | $ chmod +x serve.sh
63 | ```
64 |
65 | Note: Change `/path/to/qpc` to the QPC location from the 'Prepare the model' step above.
66 | If your system has multiple Ultra cards, you can change the `--device` arguments to map a different card.
67 | This example creates a `qaic-vllm` Docker volume to hold persistent data (namely the tokenizer files downloaded from Hugging Face).
68 |
69 | ```
70 | docker run -dit \
71 | --workdir /model \
72 | --name qaic-vllm \
73 | --network host \
74 | --mount type=bind,source=${PWD}/serve.sh,target=/model/serve.sh \
75 | --mount type=bind,source=/path/to/qpc,target=/model/qpc \
76 | -v qaic-vllm:/model/data \
77 | --env VLLM_QAIC_MAX_CPU_THREADS=8 \
78 | --env VLLM_QAIC_QPC_PATH=/model/qpc \
79 | --env HF_HOME=/model/data/huggingface \
80 | --env QEFF_HOME=/model/data/qeff_models \
81 | --device=/dev/accel/accel0 \
82 | --device=/dev/accel/accel1 \
83 | --device=/dev/accel/accel2 \
84 | --device=/dev/accel/accel3 \
85 | --entrypoint=/model/serve.sh \
86 | ghcr.io/quic/cloud_ai_inference_ubuntu22:1.20.4.0
87 | ```
88 |
89 | ## Test the endpoint
90 |
91 | ```
92 | curl http://localhost:8000/v1/chat/completions \
93 | -H "Content-Type: application/json" \
94 | -H "Authorization: Bearer test-key" \
95 | -d '{
96 | "model": "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
97 | "messages": [
98 | {
99 | "role": "system",
100 | "content": "You are a helpful AI assistant."
101 | },
102 | {
103 | "role": "user",
104 | "content": "Hello!"
105 | }
106 | ]
107 | }'
108 | ```
109 |
110 | ## Start Open WebUI
111 |
112 | Download Open WebUI Docker image:
113 |
114 | ```
115 | docker pull ghcr.io/open-webui/open-webui:main
116 | ```
117 |
118 | Refer to [setup instructions](https://docs.openwebui.com/getting-started/quick-start/#quick-start-with-docker-) for more details.
119 |
120 | Run the Open WebUI container:
121 |
122 | ```
123 | docker run \
124 | -d \
125 | --network host \
126 | -e OPENAI_API_KEY=test-key \
127 | -e OPENAI_API_BASE_URL="http://localhost:8000/v1" \
128 | -v open-webui:/app/backend/data \
129 | --name open-webui \
130 | --restart always \
131 | ghcr.io/open-webui/open-webui:main
132 | ```
133 |
134 | In a web browser, open http://<server-ip>:8080
135 |
136 | Setup:
137 | * For first time startup, create a default user. This user will have admin access.
138 | * Click Profile icon in upper right and open Admin Panel -> Settings -> Connections.
139 | * Click Configure icon for Manage OpenAI API Connections.
140 | * Make sure URL is http://localhost:8000/v1. Key can be any value
141 | * Click Verify Connection icon to test the connection.
142 | * You should see a "Server Connection Verified" pop-up
143 | * If it fails, double-check that the vLLM endpoint container (serve.sh) is running
144 | * Back on the Open WebUI home page, select the model name from the 'Prepare the model' step above.
145 |
146 |
147 |
148 | You can now use the Chat interface in Open WebUI.
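149 |
150 | As an alternative to the curl test above, the same endpoint can also be called from Python with the OpenAI client (a minimal sketch, assuming `pip install openai` and the same model name used in serve.sh):
151 |
152 | ```python
153 | from openai import OpenAI
154 |
155 | # point the client at the local vLLM endpoint started above
156 | client = OpenAI(base_url="http://localhost:8000/v1", api_key="test-key")
157 |
158 | response = client.chat.completions.create(
159 |     model="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
160 |     messages=[
161 |         {"role": "system", "content": "You are a helpful AI assistant."},
162 |         {"role": "user", "content": "Hello!"},
163 |     ],
164 | )
165 | print(response.choices[0].message.content)
166 | ```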
--------------------------------------------------------------------------------
/models/multimodal/text_to_image/stable-diffusion-3.5-medium/patches/attention_patch.patch:
--------------------------------------------------------------------------------
1 | diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
2 | index e2ab160..6036c3a 100644
3 | --- a/src/diffusers/models/attention_processor.py
4 | +++ b/src/diffusers/models/attention_processor.py
5 | @@ -11,6 +11,10 @@
6 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7 | # See the License for the specific language governing permissions and
8 | # limitations under the License.
9 | +#
10 | +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
11 | +# SPDX-License-Identifier: BSD-3-Clause-Clear
12 | +# Not a Contribution
13 | import inspect
14 | import math
15 | from typing import Callable, List, Optional, Tuple, Union
16 | @@ -258,9 +262,7 @@ class Attention(nn.Module):
17 | # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
18 | # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
19 | if processor is None:
20 | - processor = (
21 | - AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
22 | - )
23 | + processor = AttnProcessor()
24 | self.set_processor(processor)
25 |
26 | def set_use_npu_flash_attention(self, use_npu_flash_attention: bool) -> None:
27 | @@ -560,7 +562,7 @@ class Attention(nn.Module):
28 |
29 | if attention_mask is None:
30 | baddbmm_input = torch.empty(
31 | - query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
32 | + query.shape[0], query.shape[1], key.shape[2], dtype=query.dtype, device=query.device
33 | )
34 | beta = 0
35 | else:
36 | @@ -570,7 +572,7 @@ class Attention(nn.Module):
37 | attention_scores = torch.baddbmm(
38 | baddbmm_input,
39 | query,
40 | - key.transpose(-1, -2),
41 | + key,
42 | beta=beta,
43 | alpha=self.scale,
44 | )
45 | @@ -764,8 +766,25 @@ class AttnProcessor:
46 | key = attn.head_to_batch_dim(key)
47 | value = attn.head_to_batch_dim(value)
48 |
49 | - attention_probs = attn.get_attention_scores(query, key, attention_mask)
50 | - hidden_states = torch.bmm(attention_probs, value)
51 | + key = key.transpose(-1, -2)
52 | + if query.size(-2) != value.size(-2): # cross-attention, use regular attention
53 | + # QKV done in single block
54 | + attention_probs = attn.get_attention_scores(query, key, attention_mask)
55 | + hidden_states = torch.bmm(attention_probs, value)
56 | + else: # self-attention, use blocked attention
57 | + # QKV done with block-attention (a la FlashAttentionV2)
58 | + print(f"{query.shape = }, {key.shape = }, {value.shape = }")
59 | + query_block_size = 64
60 | + query_seq_len = query.size(-2)
61 | + num_blocks = (query_seq_len + query_block_size - 1) // query_block_size
62 | + for qidx in range(num_blocks):
63 | + query_block = query[:,qidx*query_block_size:(qidx+1)*query_block_size,:]
64 | + attention_probs = attn.get_attention_scores(query_block, key, attention_mask)
65 | + hidden_states_block = torch.bmm(attention_probs, value)
66 | + if qidx == 0:
67 | + hidden_states = hidden_states_block
68 | + else:
69 | + hidden_states = torch.cat((hidden_states, hidden_states_block), -2)
70 | hidden_states = attn.batch_to_head_dim(hidden_states)
71 |
72 | # linear proj
73 | @@ -1075,15 +1094,31 @@ class JointAttnProcessor2_0:
74 | key = torch.cat([key, encoder_hidden_states_key_proj], dim=1)
75 | value = torch.cat([value, encoder_hidden_states_value_proj], dim=1)
76 |
77 | - inner_dim = key.shape[-1]
78 | - head_dim = inner_dim // attn.heads
79 | - query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
80 | - key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
81 | - value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
82 | + query = attn.head_to_batch_dim(query)
83 | + key = attn.head_to_batch_dim(key)
84 | + value = attn.head_to_batch_dim(value)
85 |
86 | - hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
87 | - hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
88 | - hidden_states = hidden_states.to(query.dtype)
89 | + # pre-transpose the key
90 | + key = key.transpose(-1, -2)
91 | + if query.size(-2) != value.size(-2): # cross-attention, use regular attention
92 | + # QKV done in single block
93 | + attention_probs = attn.get_attention_scores(query, key, attention_mask)
94 | + hidden_states = torch.bmm(attention_probs, value)
95 | + else: # self-attention, use blocked attention
96 | + # QKV done with block-attention (a la FlashAttentionV2)
97 | + print(f"{query.shape = }, {key.shape = }, {value.shape = }")
98 | + query_block_size = 64
99 | + query_seq_len = query.size(-2)
100 | + num_blocks = (query_seq_len + query_block_size - 1) // query_block_size
101 | + for qidx in range(num_blocks):
102 | + query_block = query[:,qidx*query_block_size:(qidx+1)*query_block_size,:]
103 | + attention_probs = attn.get_attention_scores(query_block, key, attention_mask)
104 | + hidden_states_block = torch.bmm(attention_probs, value)
105 | + if qidx == 0:
106 | + hidden_states = hidden_states_block
107 | + else:
108 | + hidden_states = torch.cat((hidden_states, hidden_states_block), -2)
109 | + hidden_states = attn.batch_to_head_dim(hidden_states)
110 |
111 | # Split the attention outputs.
112 | hidden_states, encoder_hidden_states = (
113 |
--------------------------------------------------------------------------------