├── models ├── multimodal │ └── text_to_image │ │ ├── sdxl_turbo │ │ ├── onnx_generation │ │ │ ├── __init__.py │ │ │ └── onnx_gen_utils.py │ │ ├── requirements.txt │ │ ├── utils.py │ │ ├── run_config_gen.sh │ │ ├── run_config_inference.sh │ │ ├── model.py │ │ ├── server.py │ │ ├── patches │ │ │ └── attention_patch.patch │ │ └── README.md │ │ ├── sdxl_deepcache │ │ ├── onnx_generation │ │ │ ├── __init__.py │ │ │ └── onnx_gen_utils.py │ │ ├── requirements.txt │ │ ├── utils.py │ │ ├── README.md │ │ ├── run_config_deep.sh │ │ ├── run_config_shallow.sh │ │ ├── patches │ │ │ ├── attention_patch.patch │ │ │ └── deepcache_unet.patch │ │ └── run_config_inference.sh │ │ ├── stable-diffusion-3.5-medium │ │ ├── onnx_generation │ │ │ ├── __init__.py │ │ │ └── onnx_gen_utils.py │ │ ├── requirements.txt │ │ ├── utils.py │ │ ├── README.md │ │ ├── run_config_inference.sh │ │ ├── run_config_gen.sh │ │ ├── patches │ │ │ ├── transformer_patch.patch │ │ │ └── attention_patch.patch │ │ └── model.py │ │ ├── DeciDiffusion-v2-0 │ │ └── README.md │ │ ├── stable-diffusion-v1-5 │ │ └── README.md │ │ └── stable-diffusion-xl-base-1.0 │ │ ├── requirements.txt │ │ ├── fix_vae_decoder_onnx.py │ │ ├── onnx_gen_utils.py │ │ ├── attention_patch.patch │ │ └── compile_models.sh ├── vision │ ├── classification │ │ └── requirements.txt │ └── detection │ │ ├── requirements.txt │ │ ├── lut_yolo_models.csv │ │ └── README.md ├── language_processing │ ├── decoder │ │ ├── MptForCausalLM │ │ │ └── README.md │ │ ├── DeciCoder-6b │ │ │ ├── requirements.txt │ │ │ ├── init.sh │ │ │ ├── specializations_template.json │ │ │ ├── compileModel.sh │ │ │ └── README.md │ │ ├── LlamaForCausalLM │ │ │ └── README.md │ │ ├── README.md │ │ ├── GPTBigCodeForCausalLM │ │ │ └── README.md │ │ └── CodeGen-With-Speculative-Decoding │ │ │ └── README.md │ └── encoder │ │ ├── requirements.txt │ │ ├── model.py │ │ └── server.py └── speech │ └── whisper │ ├── requirements.txt │ ├── audio.py │ ├── README.md │ ├── generateModel.py │ └── runModel.py ├── images └── Cloud_AI_100.png ├── tutorials ├── NLP │ ├── Model-Onboarding-Beginner │ │ ├── bert-base-cased-config.yaml │ │ ├── distilbert-base-cased-distilled-squad-config.yaml │ │ ├── Images │ │ │ └── Workflow.jpg │ │ └── requirements.txt │ ├── Profiler-Intermediate │ │ ├── images │ │ │ ├── Latency.png │ │ │ ├── opstats_decoder.png │ │ │ ├── opstats_example.png │ │ │ └── operator_details.png │ │ └── requirements.txt │ └── Performance-Tuning-Beginner │ │ ├── Images │ │ └── Latency.jpg │ │ ├── requirements.txt │ │ ├── bert_base_dopt.json │ │ └── bert_base_dopt_min_latency.json ├── open-webui │ ├── open_webui_screen_1.png │ ├── open_webui_screen_2.png │ ├── serve.sh │ ├── open_webui.sh │ ├── vllm_container.sh │ └── README.md ├── Playground │ ├── images │ │ └── qualcomm_cloud_ai_playground.png │ └── README.md ├── Computer-Vision │ ├── Perfomance-Tuning-Beginner │ │ ├── Images │ │ │ └── Latency.jpg │ │ ├── requirements.txt │ │ ├── resnet_base_dopt_min_latency.json │ │ └── resnet_base_dopt_throughput.json │ └── DETR │ │ └── README.md ├── efficient_transformers │ └── README.md └── README.md ├── samples ├── python │ ├── qaic_features │ │ ├── resnet_config.yaml │ │ ├── benchmarking_eg.py │ │ ├── metrics_eg.py │ │ ├── profiling_eg.py │ │ └── README.md │ ├── vit_qaic │ │ ├── vit_config.yaml │ │ └── example.py │ ├── requirements.txt │ ├── README.md │ ├── aws_ai100_benchmarking │ │ ├── yolo_models │ │ │ ├── lut_yolo_models.csv │ │ │ └── README.md │ │ ├── cv_classifiers │ │ │ └── run_cv_classifiers.sh │ │ └── 
parse_latency_and_throughput.py │ └── common_utils.py └── cpp │ └── cpp_qpc_inference │ ├── CMakeLists.txt │ └── Readme.md ├── utils ├── multi-device │ ├── enable_mdp.json │ └── README.md ├── qaic-bench │ ├── config │ │ ├── config_tiny_llama.json │ │ ├── config_llama_3_1_8b.json │ │ └── config_dl2q.json │ └── README.md └── README.md ├── CONTRIBUTING.md ├── LICENSE └── CODE-OF-CONDUCT.md /models/multimodal/text_to_image/sdxl_turbo/onnx_generation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/onnx_generation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/onnx_generation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /images/Cloud_AI_100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/images/Cloud_AI_100.png -------------------------------------------------------------------------------- /tutorials/NLP/Model-Onboarding-Beginner/bert-base-cased-config.yaml: -------------------------------------------------------------------------------- 1 | # Inference Parameters 2 | num_activations: 2 3 | set_size: 1 4 | -------------------------------------------------------------------------------- /tutorials/open-webui/open_webui_screen_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/open-webui/open_webui_screen_1.png -------------------------------------------------------------------------------- /tutorials/open-webui/open_webui_screen_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/open-webui/open_webui_screen_2.png -------------------------------------------------------------------------------- /tutorials/NLP/Model-Onboarding-Beginner/distilbert-base-cased-distilled-squad-config.yaml: -------------------------------------------------------------------------------- 1 | # Inference Parameters 2 | num_activations: 2 3 | set_size: 10 -------------------------------------------------------------------------------- /models/multimodal/text_to_image/DeciDiffusion-v2-0/README.md: -------------------------------------------------------------------------------- 1 | ## DeciDiffusion 2.0 2 | 3 | This model is deprecated. sdxl_turbo is the recommended alternative. 
4 | -------------------------------------------------------------------------------- /tutorials/NLP/Profiler-Intermediate/images/Latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Profiler-Intermediate/images/Latency.png -------------------------------------------------------------------------------- /tutorials/NLP/Model-Onboarding-Beginner/Images/Workflow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Model-Onboarding-Beginner/Images/Workflow.jpg -------------------------------------------------------------------------------- /tutorials/NLP/Performance-Tuning-Beginner/Images/Latency.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Performance-Tuning-Beginner/Images/Latency.jpg -------------------------------------------------------------------------------- /tutorials/Playground/images/qualcomm_cloud_ai_playground.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/Playground/images/qualcomm_cloud_ai_playground.png -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-v1-5/README.md: -------------------------------------------------------------------------------- 1 | ## Stable Diffusion v1-5 2 | 3 | This model is deprecated. sdxl_turbo is the recommended alternative. 4 | 5 | -------------------------------------------------------------------------------- /tutorials/NLP/Profiler-Intermediate/images/opstats_decoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Profiler-Intermediate/images/opstats_decoder.png -------------------------------------------------------------------------------- /tutorials/NLP/Profiler-Intermediate/images/opstats_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Profiler-Intermediate/images/opstats_example.png -------------------------------------------------------------------------------- /tutorials/NLP/Profiler-Intermediate/requirements.txt: -------------------------------------------------------------------------------- 1 | onnx==1.12.0 2 | optimum 3 | numpy==1.23.4 4 | onnxruntime 5 | torch===1.11.0 6 | pillow==8.3.2 7 | opencv-python 8 | paramiko -------------------------------------------------------------------------------- /tutorials/NLP/Profiler-Intermediate/images/operator_details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/NLP/Profiler-Intermediate/images/operator_details.png -------------------------------------------------------------------------------- /tutorials/NLP/Performance-Tuning-Beginner/requirements.txt: -------------------------------------------------------------------------------- 1 | onnx==1.12.0 2 | optimum 3 | numpy==1.23.4 4 | onnxruntime 5 | torch===1.11.0 6 | pillow==8.3.2 7 | opencv-python 8 | paramiko -------------------------------------------------------------------------------- 
/samples/python/qaic_features/resnet_config.yaml: -------------------------------------------------------------------------------- 1 | aic_num_cores: 4 2 | num_activations: 1 3 | convert_to_fp16: true 4 | onnx_define_symbol: 5 | batch: 2 6 | # output_dir: './resnet_qpc' 7 | -------------------------------------------------------------------------------- /tutorials/Computer-Vision/Perfomance-Tuning-Beginner/Images/Latency.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/cloud-ai-sdk/HEAD/tutorials/Computer-Vision/Perfomance-Tuning-Beginner/Images/Latency.jpg -------------------------------------------------------------------------------- /tutorials/Computer-Vision/Perfomance-Tuning-Beginner/requirements.txt: -------------------------------------------------------------------------------- 1 | onnx==1.12.0 2 | optimum 3 | numpy==1.23.4 4 | onnxruntime 5 | torch==1.13.0 6 | pillow==8.3.2 7 | opencv-python 8 | paramiko 9 | jsonschema -------------------------------------------------------------------------------- /tutorials/NLP/Model-Onboarding-Beginner/requirements.txt: -------------------------------------------------------------------------------- 1 | onnx==1.12.0 2 | optimum 3 | numpy==1.23.4 4 | onnxruntime 5 | torch===1.11.0 6 | pillow==8.3.2 7 | onnxsim 8 | /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl -------------------------------------------------------------------------------- /models/vision/classification/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | torch==2.3.1+cpu 3 | torchvision==0.18.1+cpu 4 | onnx==1.14.0 5 | onnxruntime==1.19.0 6 | transformers==4.41.2 7 | pandas==2.1.4 8 | -------------------------------------------------------------------------------- /utils/multi-device/enable_mdp.json: -------------------------------------------------------------------------------- 1 | { 2 | "request": [ 3 | { 4 | "qid": -1, 5 | "dev_config": { 6 | "update_multi_device_partition_config_request": { 7 | "enable": true 8 | } 9 | } 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /models/language_processing/decoder/MptForCausalLM/README.md: -------------------------------------------------------------------------------- 1 | # MptForCausalLM 2 | 3 | Head over to the [Qualcomm Efficient Transformers library](https://github.com/quic/efficient-transformers) for optimized LLM deployment on Cloud AI 100 Accelerators. -------------------------------------------------------------------------------- /models/language_processing/decoder/DeciCoder-6b/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | torch==2.1.2 3 | onnx==1.15.0 4 | onnxruntime==1.16.3 5 | onnxsim==0.4.35 6 | tiktoken==0.5.2 7 | protobuf==3.20.2 8 | numpy==1.26.4 -------------------------------------------------------------------------------- /models/language_processing/decoder/LlamaForCausalLM/README.md: -------------------------------------------------------------------------------- 1 | # LlamaForCausalLM 2 | 3 | Head over to the [Qualcomm Efficient Transformers library](https://github.com/quic/efficient-transformers) for optimized LLM deployment on Cloud AI 100 Accelerators. 
-------------------------------------------------------------------------------- /models/language_processing/decoder/README.md: -------------------------------------------------------------------------------- 1 | # efficient-transformers package for LLMs 2 | 3 | Head over to the [Qualcomm Efficient Transformers library](https://github.com/quic/efficient-transformers) for optimized LLM deployment on Cloud AI 100 Accelerators. -------------------------------------------------------------------------------- /models/language_processing/decoder/GPTBigCodeForCausalLM/README.md: -------------------------------------------------------------------------------- 1 | # GPTBigCodeForCausalLM 2 | 3 | Head over to the [Qualcomm Efficient Transformers library](https://github.com/quic/efficient-transformers) for optimized LLM deployment on Cloud AI 100 Accelerators. -------------------------------------------------------------------------------- /models/language_processing/decoder/CodeGen-With-Speculative-Decoding/README.md: -------------------------------------------------------------------------------- 1 | # Speculative decoding - CodeGen 2 | 3 | Head over to the [Qualcomm Efficient Transformers library](https://github.com/quic/efficient-transformers) for optimized LLM deployment on Cloud AI 100 Accelerators. -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | networkx==3.0 3 | torch==2.3.1 4 | torchvision 5 | torchaudio 6 | onnx==1.12.0 7 | onnxruntime 8 | accelerate 9 | transformers==4.42 10 | huggingface-hub==0.25.2 -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | networkx==3.0 3 | torch==2.3.1 4 | torchvision 5 | torchaudio 6 | onnx==1.12.0 7 | onnxruntime 8 | accelerate 9 | transformers==4.42 10 | huggingface-hub==0.25.2 -------------------------------------------------------------------------------- /models/language_processing/decoder/DeciCoder-6b/init.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | MODEL_REPO="Deci" 5 | MODEL_NAME="DeciCoder-6b" 6 | BS=1 7 | PL=256 8 | CL=2048 9 | CORES=14 10 | MX="-mxfp6-matmul" -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-xl-base-1.0/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | networkx==3.0 3 | torch==2.3.1 4 | torchvision 5 | torchaudio 6 | onnx==1.12.0 7 | onnxruntime 8 | accelerate 9 | transformers==4.42 10 | huggingface-hub==0.25.2 -------------------------------------------------------------------------------- /models/language_processing/decoder/DeciCoder-6b/specializations_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "specializations": [ 3 | { 4 | "batch_size": "BS", 5 | "seq_len": "PL", 6 | "ctx_len": "CL" 7 | }, 8 | { 9 | "batch_size": "BS", 10 | "seq_len": "1", 11 | "ctx_len": "CL" 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /samples/python/vit_qaic/vit_config.yaml: -------------------------------------------------------------------------------- 1 | # compile parameters 2 | aic_num_cores: 4 3 | convert_to_fp16: true 4 | mos: 1 5 | ols: 2 6 | multicast-weights: true 7 | onnx_define_symbol: 8 | batch_size: 1 9 | stats-batchsize: 1 10 | compile-only: true 11 | 12 | # inference parameters 13 | num_activations: 3 14 | set_size: 4 -------------------------------------------------------------------------------- /models/vision/detection/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | torch==2.3.1+cpu 3 | torchvision==0.18.1+cpu 4 | onnx==1.19.1 5 | onnxruntime==1.19.0 6 | onnxscript 7 | transformers==4.41.2 8 | pandas==2.1.4 9 | opencv-python-headless 10 | opencv-contrib-python-headless 11 | ultralytics 12 | seaborn 13 | onnx-graphsurgeon 14 | -------------------------------------------------------------------------------- /samples/python/requirements.txt: -------------------------------------------------------------------------------- 1 | altgraph==0.17.2 2 | attrs==21.4.0 3 | grpcio==1.44.0 4 | iniconfig==1.1.1 5 | nose==1.3.7 6 | numpy==1.22.4 7 | packaging==21.3 8 | pluggy==1.0.0 9 | protobuf==3.20.0 10 | py==1.11.0 11 | pyinstaller==4.9 12 | pyinstaller-hooks-contrib==2022.2 13 | pyparsing==3.0.7 14 | pytest==6.2.5 15 | pyudev==0.23.2 16 | PyYAML==6.0 17 | six==1.16.0 18 | toml==0.10.2 19 | yapf==0.32.0 20 | -------------------------------------------------------------------------------- /models/speech/whisper/requirements.txt: -------------------------------------------------------------------------------- 1 | /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl 2 | numpy==1.23.5 3 | datasets==2.7.1 4 | transformers==4.24.0 5 | torch==1.12.1 6 | onnx==1.12.0 7 | fsspec==2022.11.0 8 | multiprocess==0.70.14 9 | huggingface-hub==0.11.0 10 | librosa==0.9.2 11 | soundfile==0.11.0 12 | whisper @ git+https://github.com/openai/whisper.git@ec1b34bb90dc2822ce4ebac23970b84dbb03ec6c 13 | pyarrow==20.0.0 14 | -------------------------------------------------------------------------------- /tutorials/open-webui/serve.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) Qualcomm Technologies, 
Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause-Clear 5 | 6 | model=hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4 7 | 8 | /opt/vllm-env/bin/python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model ${model} --max-model-len 4096 --max-num-seq 1 --max-seq_len-to-capture 128 --device qaic --device-group 0,1,2,3 9 | -------------------------------------------------------------------------------- /models/language_processing/encoder/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | networkx==3.1 3 | torch==2.3.1 4 | fsspec==2024.2.0 5 | wheel==0.42.0 6 | sentence-transformers==2.6.1 7 | onnx==1.18.0 8 | onnxruntime==1.22 9 | transformers==4.40.2 10 | optimum==1.19.1 11 | protobuf==5.26.1 12 | urllib3==1.26.6 13 | /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl 14 | 15 | # For inference serving 16 | fastapi 17 | uvicorn 18 | -------------------------------------------------------------------------------- /tutorials/open-webui/open_webui.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | image=ghcr.io/open-webui/open-webui:main 5 | 6 | docker run \ 7 | -d \ 8 | --network host \ 9 | -e OPENAI_API_KEY=test-key \ 10 | -e OPENAI_API_BASE_URL="http://localhost:8000/v1" \ 11 | -v open-webui:/app/backend/data \ 12 | --name open-webui \ 13 | --restart always \ 14 | ${image} 15 | -------------------------------------------------------------------------------- /utils/qaic-bench/config/config_tiny_llama.json: -------------------------------------------------------------------------------- 1 | { 2 | "vllm_root": "/opt/qti-aic/integrations/vllm", 3 | 4 | "models": [ 5 | { 6 | "name": "TinyLlama-1.1B-Chat-v1.0", 7 | "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", 8 | "configs": [ 9 | { 10 | "batch_size": 1, 11 | "devices": 1, 12 | "prompt_len": 1024, 13 | "generation_len": 1024 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /utils/qaic-bench/config/config_llama_3_1_8b.json: -------------------------------------------------------------------------------- 1 | { 2 | "vllm_root": "/opt/qti-aic/integrations/vllm", 3 | 4 | "models": [ 5 | { 6 | "name": "Meta-Llama-3.1-8B-Instruct", 7 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 8 | "configs": [ 9 | { 10 | "batch_size": 1, 11 | "devices": 4, 12 | "prompt_len": 4096, 13 | "generation_len": 4096 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /tutorials/Playground/README.md: -------------------------------------------------------------------------------- 1 | # Cloud AI Playground Notebook setup 2 | 3 | ## Python Setup 4 | ``` 5 | # Setup venv 6 | python3.10 -m venv imagine_env 7 | source imagine_env/bin/activate 8 | pip3 install pip -U 9 | 10 | # Install Qualcomm Imagine Python library 11 | pip3 install python-imagine-sdk 12 | 13 | # Install dependencies 14 | pip3 install Pillow 15 | pip3 install notebook 16 | pip3 install pandas 17 | pip3 install openai 18 | ``` 19 | 20 | ## Launch Notebook 21 | ``` 22 | jupyter notebook --no-browser --ip 0.0.0.0 --port 8080 23 | ``` 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- 
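Before pointing Open WebUI at the vLLM server started by `tutorials/open-webui/serve.sh` above, it can help to sanity-check the OpenAI-compatible endpoint directly. The sketch below is illustrative only: it assumes the server from `serve.sh` is reachable on `localhost:8000`, that the `openai` Python package is installed, and that the model name matches the one passed to `vllm.entrypoints.openai.api_server`; the placeholder API key mirrors `open_webui.sh` and is typically sufficient unless the server is configured to require a real one.

```
# Quick check of the OpenAI-compatible vLLM endpoint started by serve.sh.
# Assumptions: server on localhost:8000, `pip install openai`, and the model
# name below matching the one used in serve.sh.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="test-key")

response = client.chat.completions.create(
    model="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=32,
)
print(response.choices[0].message.content)
```

If this returns a completion, Open WebUI (started via `open_webui.sh` with `OPENAI_API_BASE_URL=http://localhost:8000/v1`) should be able to list and chat with the same model.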
/utils/qaic-bench/config/config_dl2q.json: -------------------------------------------------------------------------------- 1 | { 2 | "vllm_root": "/opt/qti-aic/integrations/vllm", 3 | 4 | "models": [ 5 | { 6 | "name": "Meta-Llama-3.1-8B-Instruct-AWQ-INT4", 7 | "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", 8 | "configs": [ 9 | { 10 | "batch_size": 1, 11 | "devices": 1, 12 | "cores": 14, 13 | "prompt_len": 4096, 14 | "generation_len": 4096 15 | } 16 | ] 17 | } 18 | ] 19 | } 20 | -------------------------------------------------------------------------------- /tutorials/efficient_transformers/README.md: -------------------------------------------------------------------------------- 1 | ## Installation steps 2 | 3 | ### Create python virtual environment and activate it 4 | ``` 5 | python3.10 -m venv qeff_env 6 | source qeff_env/bin/activate 7 | pip install --upgrade pip 8 | ``` 9 | 10 | ### Clone and install the efficient transformers repo 11 | ``` 12 | pip install git+https://github.com/quic/efficient-transformers@release/v1.20.0 13 | ``` 14 | 15 | ### After installation of efficient transformers library, install jupyter notebook 16 | ``` 17 | pip install notebook 18 | ``` 19 | 20 | ### Launch Notebook 21 | ``` 22 | jupyter notebook --no-browser --allow-root 23 | ``` 24 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | accelerate==0.31.0 3 | certifi==2024.6.2 4 | charset-normalizer==3.3.2 5 | coloredlogs==15.0.1 6 | filelock==3.13.1 7 | flatbuffers==24.3.25 8 | fsspec==2024.2.0 9 | huggingface-hub==0.23.4 10 | humanfriendly==10.0 11 | idna==3.7 12 | importlib-metadata==7.2.1 13 | Jinja2==3.1.3 14 | MarkupSafe==2.1.5 15 | mpmath==1.3.0 16 | networkx==3.1 17 | numpy==1.24.1 18 | onnx==1.12.0 19 | onnxruntime==1.16.3 20 | packaging==24.1 21 | pillow==10.2.0 22 | protobuf==3.20.1 23 | psutil==6.0.0 24 | PyYAML==6.0.1 25 | regex==2024.5.15 26 | requests==2.32.3 27 | safetensors==0.4.3 28 | sympy==1.12 29 | tokenizers==0.19.1 30 | torch==2.4.1 31 | tqdm==4.66.4 32 | transformers==4.41.2 33 | typing-extensions==4.9.0 34 | urllib3==2.2.2 35 | zipp==3.19.2 36 | onnxsim==0.4.36 37 | sentencepiece==0.2.0 38 | -------------------------------------------------------------------------------- /tutorials/open-webui/vllm_container.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | image=ghcr.io/quic/cloud_ai_inference_ubuntu22:1.19.8.0 5 | qpc_path=/path/to/qpc 6 | 7 | chmod +x serve.sh 8 | 9 | docker run -dit \ 10 | --workdir /model \ 11 | --name qaic-vllm \ 12 | --network host \ 13 | --mount type=bind,source=${PWD}/serve.sh,target=/model/serve.sh \ 14 | --mount type=bind,source=${qpc_path},target=/model/qpc \ 15 | -v qaic-vllm:/model/data \ 16 | --env VLLM_QAIC_MAX_CPU_THREADS=8 \ 17 | --env VLLM_QAIC_QPC_PATH=/model/qpc \ 18 | --env HF_HOME=/model/data/huggingface \ 19 | --env QEFF_HOME=/model/data/qeff_models \ 20 | --device=/dev/accel/accel0 \ 21 | --device=/dev/accel/accel1 \ 22 | --device=/dev/accel/accel2 \ 23 | --device=/dev/accel/accel3 \ 24 | --entrypoint=/model/serve.sh \ 25 | ${image} 26 | -------------------------------------------------------------------------------- /tutorials/Computer-Vision/DETR/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | --- 3 | 4 | Download the DETR-ResNet50 model, prepare for the Qualcomm AIC100, compile the model, run the model on a generated random sample along with input image, and obtain the output. 5 | 6 | 7 | ## Source of the model 8 | --- 9 | 10 | This model is an implementation of DETR-ResNet50 found at (https://github.com/facebookresearch/detr). 11 | 12 | 13 | ## Virtual environment 14 | --- 15 | For a quick environment setup: 16 | 17 | ```commandline 18 | python3.8 -m venv cv_workflow_env 19 | source cv_workflow_env/bin/activate 20 | pip install --upgrade pip 21 | 22 | ``` 23 | 24 | ## Framework and version 25 | --- 26 | ```commandline 27 | pip install torch==2.4.1+cpu torchvision==0.19.1+cpu --index-url https://download.pytorch.org/whl/cpu 28 | pip install numpy==1.24.4 onnx==1.17.0 pillow==10.4.0 requests==2.32.3 notebook==7.3.3 matplotlib==3.7.5 scipy==1.10.1 29 | 30 | ``` 31 | 32 | -------------------------------------------------------------------------------- /models/vision/detection/lut_yolo_models.csv: -------------------------------------------------------------------------------- 1 | MODEL_NAME,TASK,BATCH_SIZE,IMAGE_SIZE,CORES,INSTANCES,OLS,MOS,SET_SIZE,EXTRA,PRECISION,OBJECTIVE 2 | yolov5s,object-detection,1,640,7,2,2,,1,,fp16,best-latency 3 | yolov5s,object-detection,1,640,3,4,2,,1,,fp16,balanced 4 | yolov5s,object-detection,1,640,2,7,2,,2,,fp16,best-throughput 5 | yolov5m,object-detection,1,640,12,1,1,,1,,fp16,best-latency 6 | yolov5m,object-detection,1,640,12,1,1,,2,,fp16,balanced 7 | yolov5m,object-detection,1,640,2,7,2,,2,,fp16,best-throughput 8 | yolov5l,object-detection,1,640,12,1,1,,1,,fp16,best-latency 9 | yolov5l,object-detection,1,640,4,3,2,,1,,fp16,balanced 10 | yolov5l,object-detection,1,640,2,7,4,,2,,fp16,best-throughput 11 | yolov5x,object-detection,1,640,12,1,1,,1,,fp16,best-latency 12 | yolov5x,object-detection,1,640,7,2,2,,2,,fp16,balanced 13 | yolov5x,object-detection,1,640,2,7,4,,1,,fp16,best-throughput 14 | yolov7-e6e,object-detection,1,640,12,1,1,,1, -multicast-weights,fp16,best-latency 15 | yolov7-e6e,object-detection,1,640,4,3,2, ,1, -multicast-weights,fp16,balanced 16 | yolov7-e6e,object-detection,1,640,6,2,2, ,2, -multicast-weights,fp16,best-throughput 17 | -------------------------------------------------------------------------------- /tutorials/NLP/Performance-Tuning-Beginner/bert_base_dopt.json: -------------------------------------------------------------------------------- 1 | { 2 | "max_func_eval": 200, 3 | "objective": "maximize_inf_rate", 4 | 
"params": { 5 | "cores": { 6 | "min": 1, 7 | "max": 14 8 | }, 9 | "mos": { 10 | "min": 1, 11 | "max": 8 12 | }, 13 | "ols": { 14 | "min": 1, 15 | "max": 8 16 | }, 17 | "bs": { 18 | "min": 1, 19 | "max": 16 20 | }, 21 | "instances": { 22 | "min": 1, 23 | "max": 14 24 | } 25 | }, 26 | "initial_values": [ 27 | { 28 | "cores": 1, 29 | "mos": 1, 30 | "ols": 1, 31 | "bs": 1, 32 | "instances": 14 33 | }, 34 | { 35 | "cores": 2, 36 | "mos": 1, 37 | "ols": 1, 38 | "bs": 1, 39 | "instances": 7 40 | }, 41 | { 42 | "cores": 4, 43 | "mos": 1, 44 | "ols": 1, 45 | "bs": 1, 46 | "instances": 3 47 | }, 48 | { 49 | "cores": 7, 50 | "mos": 1, 51 | "ols": 1, 52 | "bs": 1, 53 | "instances": 2 54 | }, 55 | { 56 | "cores": 14, 57 | "mos": 1, 58 | "ols": 1, 59 | "bs": 1, 60 | "instances": 1 61 | } 62 | ] 63 | } -------------------------------------------------------------------------------- /tutorials/NLP/Performance-Tuning-Beginner/bert_base_dopt_min_latency.json: -------------------------------------------------------------------------------- 1 | { 2 | "max_func_eval": 200, 3 | "objective": "minimize_latency", 4 | "params": { 5 | "cores": { 6 | "min": 1, 7 | "max": 14 8 | }, 9 | "mos": { 10 | "min": 1, 11 | "max": 8 12 | }, 13 | "ols": { 14 | "min": 1, 15 | "max": 8 16 | }, 17 | "bs": { 18 | "min": 1, 19 | "max": 1 20 | }, 21 | "instances": { 22 | "min": 1, 23 | "max": 14 24 | } 25 | }, 26 | "initial_values": [ 27 | { 28 | "cores": 1, 29 | "mos": 1, 30 | "ols": 1, 31 | "bs": 1, 32 | "instances": 1 33 | }, 34 | { 35 | "cores": 2, 36 | "mos": 1, 37 | "ols": 1, 38 | "bs": 1, 39 | "instances": 1 40 | }, 41 | { 42 | "cores": 4, 43 | "mos": 1, 44 | "ols": 1, 45 | "bs": 1, 46 | "instances": 1 47 | }, 48 | { 49 | "cores": 7, 50 | "mos": 1, 51 | "ols": 1, 52 | "bs": 1, 53 | "instances": 1 54 | }, 55 | { 56 | "cores": 14, 57 | "mos": 1, 58 | "ols": 1, 59 | "bs": 1, 60 | "instances": 1 61 | } 62 | ] 63 | } 64 | -------------------------------------------------------------------------------- /samples/cpp/cpp_qpc_inference/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause-Clear 5 | # 6 | # ============================================================================== 7 | 8 | project(simple-bert-inference-example) 9 | cmake_minimum_required (VERSION 3.17.2) 10 | set(CMAKE_CXX_STANDARD 17) 11 | 12 | find_package(Threads REQUIRED) 13 | 14 | add_executable(simple-bert-inference-example main.cpp) 15 | 16 | target_include_directories(simple-bert-inference-example 17 | PRIVATE 18 | "/opt/qti-aic/dev/inc" 19 | ) 20 | 21 | 22 | set_target_properties(simple-bert-inference-example 23 | PROPERTIES 24 | LINK_FLAGS "-Wl,--no-as-needed" 25 | ) 26 | 27 | target_compile_options(simple-bert-inference-example 28 | PRIVATE 29 | -fstack-protector-all 30 | -Werror 31 | -Wall 32 | -Wextra 33 | -Wno-sign-compare 34 | -Wno-unused-parameter 35 | -Wno-missing-field-initializers 36 | ) 37 | 38 | target_link_libraries(simple-bert-inference-example 39 | PRIVATE 40 | Threads::Threads 41 | dl 42 | ) -------------------------------------------------------------------------------- /tutorials/Computer-Vision/Perfomance-Tuning-Beginner/resnet_base_dopt_min_latency.json: -------------------------------------------------------------------------------- 1 | { 2 | "search-mode": "optimized", 3 | "objective": "min-latency", 4 | "search-parameters": { 5 | "cores": { 6 | "min": 1, 7 | "max": 14 8 | }, 9 | "mos": { 10 | "min": 1, 11 | "max": 8 12 | }, 13 | "ols": { 14 | "min": 1, 15 | "max": 8 16 | }, 17 | "batch-size": { 18 | "min": 1, 19 | "max": 16 20 | }, 21 | "instances": { 22 | "min": 1, 23 | "max": 14 24 | } 25 | }, 26 | "initial-values": [ 27 | { 28 | "cores": 1, 29 | "mos": 1, 30 | "ols": 1, 31 | "batch-size": 1, 32 | "instances": 1 33 | }, 34 | { 35 | "cores": 2, 36 | "mos": 1, 37 | "ols": 1, 38 | "batch-size": 1, 39 | "instances": 1 40 | }, 41 | { 42 | "cores": 4, 43 | "mos": 1, 44 | "ols": 1, 45 | "batch-size": 1, 46 | "instances": 1 47 | }, 48 | { 49 | "cores": 7, 50 | "mos": 1, 51 | "ols": 1, 52 | "batch-size": 1, 53 | "instances": 1 54 | }, 55 | { 56 | "cores": 14, 57 | "mos": 1, 58 | "ols": 1, 59 | "batch-size": 1, 60 | "instances": 1 61 | } 62 | ] 63 | } 64 | -------------------------------------------------------------------------------- /samples/python/README.md: -------------------------------------------------------------------------------- 1 | # This folder consists 2 | 3 | 1. `vit_qaic` and `resnet_qaic` folder contains example showing an end-to-end workflow for running inference on QAIC100 using the python APIs. 4 | 2. `qaic_features` folder consists of examples to show how to perform benchmarking, profiling and measuring metrics for inferences made on the device. 5 | 6 | # Installation 7 | 8 | Steps to install `qaic` API: 9 | 10 | ``` 11 | pip install /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl 12 | pip install -r requirements.txt 13 | ``` 14 | 15 | 16 | ## Structure of end to end workflow 17 | 18 | Examples follow this pattern: 19 | 20 | 1. Get the model from open source. (HuggingFace for example) 21 | 2. Convert the model to onnx using onnx library. 22 | 3. Call generate_bin function converts onnx to qpc (binary for the device). 23 | a. Currently it is compiled for default arguments, can be replaced with best performance compile arguments) #FIXME 24 | 4. Creating `qaic.session` with appropriate input and output names. 25 | 5. Provide sample prepossessing steps. Build input_dict for the session. 26 | 6. Call session.run() to perform inference. 27 | 7. Provide sample postprocessing steps. reshape output from the session. 
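Put together, steps 3-7 above look roughly like the sketch below, using the same `qaic` calls that appear in `qaic_features/benchmarking_eg.py` and `models/language_processing/encoder/model.py`. It is a minimal illustration rather than a drop-in script: the ONNX path, the config YAML, and the `data`/`output` tensor names are placeholders that depend on your model.

```
# Minimal sketch of the workflow above (file names and tensor names are assumed).
import numpy as np
import qaic

# Passing an ONNX model plus a compile config (e.g. in the style of
# vit_config.yaml) compiles it to a QPC; pass a QPC path instead to skip compilation.
session = qaic.Session('model.onnx', options_path='config.yaml')

# Build an input dict that matches the model's expected shape and dtype.
shape, dtype = session.model_input_shape_dict['data']
input_dict = {'data': np.random.randn(*shape).astype(dtype)}

# Run inference on the Cloud AI 100 device.
outputs = session.run(input_dict)

# Reshape the raw output buffer before post-processing.
out_shape, out_dtype = session.model_output_shape_dict['output']
result = np.frombuffer(outputs['output'], dtype=out_dtype).reshape(out_shape)
print(result.shape)
```

For complete, runnable versions of this flow, see `vit_qaic/example.py` and the scripts under `qaic_features/`.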
28 | 29 | ## To run the example 30 | 31 | ``` 32 | python example.py 33 | ``` 34 | -------------------------------------------------------------------------------- /utils/README.md: -------------------------------------------------------------------------------- 1 | # Basic Commands/Utilities for Cloud AI 100 devices 2 | 3 | ## Create `qaic` group to avoid `sudo` to read card/device status 4 | 5 | ``` 6 | sudo usermod -aG qaic $USER 7 | newgrp qaic 8 | bash 9 | ``` 10 | 11 | ## Check device health 12 | Monitor the health of all AI 100 devices (SoCs) using the `qaic-util` utility. 13 | 14 | ``` 15 | /opt/qti-aic/tools/qaic-util -q | grep -e Status -e QID 16 | ``` 17 | 18 | ## Monitoring of AI 100 devices (SoCs) 19 | Continuously monitor the health, telemetry (temperature, power etc) and resources (compute, DRAM etc) of the AI 100 devices (SoCs) using the `qaic-util` utility. 20 | 21 | ``` 22 | /opt/qti-aic/tools/qaic-util -t 1 23 | ``` 24 | 25 | ## Reset AI 100 devices (SoCs) 26 | To reset **all** AI 100 devices (SoCs), run 27 | ``` 28 | sudo /opt/qti-aic/tools/qaic-util -s 29 | ``` 30 | 31 | To reset **individual** AI 100 devices (SoCs), run 32 | ``` 33 | sudo /opt/qti-aic/tools/qaic-util -s -p 34 | ``` 35 | where, 36 | - SSSS = 4 digits segment number 37 | - BB = 2 digits bus number 38 | - DD = 2 digits device number 39 | - F = 1 digit function number 40 | 41 | For example, 42 | ``` 43 | sudo /opt/qti-aic/tools/qaic-util -s -p 0000:83:00.0 44 | 45 | Resetting 0000:83:00.0: 46 | 0000:83:00.0 success 47 | ``` -------------------------------------------------------------------------------- /models/language_processing/decoder/DeciCoder-6b/compileModel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [ -z "$1" ]; then 6 | echo "Usage: $0 " 7 | exit 1 8 | fi 9 | 10 | model_name="$1" 11 | batch_size="$2" 12 | prompt_len="$3" 13 | ctx_len="$4" 14 | num_cores="$5" 15 | with_or_no_mx="$6" 16 | 17 | # Generate a new specializations.json 18 | sed -e "s/BS/${batch_size}/g" -e "s/PL/${prompt_len}/g" -e "s/CL/${ctx_len}/g" ./specializations_template.json > specializations.json 19 | 20 | # Create qpc directory - Delete exisiting path 21 | mkdir -p qpc 22 | rm -rf qpc/${model_name}-kv-${prompt_len}pl-${ctx_len}cl-${num_cores}c${with_or_no_mx} 23 | 24 | model_path="${model_name}-kv/generatedModels/${model_name}-kv_fp16_simplified.onnx" 25 | if [ ! -f "$model_path" ]; then 26 | model_path="${model_name}-kv/generatedModels/${model_name}-kv_fp16.onnx" 27 | fi 28 | 29 | /opt/qti-aic/exec/qaic-exec \ 30 | -m=$model_path \ 31 | -aic-hw \ 32 | -aic-hw-version=2.0 \ 33 | -network-specialization-config=specializations.json \ 34 | -retained-state \ 35 | -convert-to-fp16 \ 36 | -ols=1 \ 37 | -mos=${num_cores} \ 38 | -aic-num-cores=${num_cores} \ 39 | -custom-IO-list-file=${model_name}-kv/custom_io.yaml \ 40 | -compile-only \ 41 | -aic-binary-dir=qpc/${model_name}-kv-${prompt_len}pl-${ctx_len}cl-${num_cores}c${with_or_no_mx} \ 42 | ${with_or_no_mx} 43 | 44 | -------------------------------------------------------------------------------- /models/speech/whisper/audio.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | 6 | import os 7 | import numpy as np 8 | from datasets import load_dataset, Audio 9 | import soundfile as sf 10 | from pathlib import Path 11 | 12 | class AudioSample: 13 | def __init__(self): 14 | # load dummy dataset and read soundfiles 15 | self.ds = load_dataset( 16 | 'hf-internal-testing/librispeech_asr_dummy', 'clean', split='validation' 17 | ) 18 | 19 | def to_file(self, parent='.'): 20 | audio_sample = self.ds[0]['audio'] 21 | 22 | audio_array = audio_sample['array'] 23 | audio_fname = os.path.join(parent, Path(audio_sample['path']).name) 24 | sampling_rate = audio_sample["sampling_rate"] 25 | 26 | # Convert to float32 for compatibility with soundfile 27 | if audio_array.dtype != np.float32: 28 | audio_array = audio_array.astype(np.float32) 29 | 30 | try: 31 | sf.write(audio_fname, audio_array, sampling_rate, format='FLAC') 32 | except Exception as e: 33 | print('Error saving file: {}'.format(e)) 34 | 35 | return audio_fname -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing to PROJECT 2 | 3 | Hi there! 4 | We’re thrilled that you’d like to contribute to this project. 5 | Your help is essential for keeping this project great and for making it better. 6 | 7 | ## Branching Strategy 8 | 9 | In general, contributors should develop on branches based off of `master` and pull requests should be made against `master`. 10 | 11 | ## Submitting a pull request 12 | 13 | 1. Please read our [code of conduct](CODE-OF-CONDUCT.md) and [license](LICENSE). 14 | 1. Fork and clone the repository. 15 | 1. Create a new branch based on `master`: `git checkout -b master`. 16 | 1. Make your changes, add tests, and make sure the tests still pass. 17 | 1. Commit your changes using the [DCO](http://developercertificate.org/). You can attest to the DCO by commiting with the **-s** or **--signoff** options or manually adding the "Signed-off-by". 18 | 1. Push to your fork and submit a pull request from your branch to `master`. 19 | 1. Pat yourself on the back and wait for your pull request to be reviewed. 20 | 21 | Here are a few things you can do that will increase the likelihood of your pull request to be accepted: 22 | 23 | - Follow the existing style where possible. 24 | - Write tests. 25 | - Keep your change as focused as possible. 26 | If you want to make multiple independent changes, please consider submitting them as separate pull requests. 27 | - Write a [good commit message](http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html). 
28 | -------------------------------------------------------------------------------- /samples/python/aws_ai100_benchmarking/yolo_models/lut_yolo_models.csv: -------------------------------------------------------------------------------- 1 | MODEL_NAME,TASK,BATCH_SIZE,IMAGE_SIZE,CORES,INSTANCES,OLS,MOS,SET_SIZE,EXTRA,PRECISION,OBJECTIVE 2 | yolov4,object-detection,1,608,12,1,1,,1, -aic-pmu-recipe=KernelUtil -multicast-weights -aic-enable-depth-first,fp16,best-latency 3 | yolov4,object-detection,1,608,7,2,2,,2, -aic-pmu-recipe=KernelUtil -multicast-weights -aic-enable-depth-first,fp16,balanced 4 | yolov4,object-detection,1,608,1,14,1,,1, -aic-pmu-recipe=KernelUtil -multicast-weights -aic-enable-depth-first,fp16,best-throughput 5 | yolov5s,object-detection,1,640,7,2,2,,1,,fp16,best-latency 6 | yolov5s,object-detection,1,640,3,4,2,,1,,fp16,balanced 7 | yolov5s,object-detection,1,640,2,7,2,,2,,fp16,best-throughput 8 | yolov5m,object-detection,1,640,12,1,1,,1,,fp16,best-latency 9 | yolov5m,object-detection,1,640,12,1,1,,2,,fp16,balanced 10 | yolov5m,object-detection,1,640,2,7,2,,2,,fp16,best-throughput 11 | yolov5l,object-detection,1,640,12,1,1,,1,,fp16,best-latency 12 | yolov5l,object-detection,1,640,4,3,2,,1,,fp16,balanced 13 | yolov5l,object-detection,1,640,2,7,4,,2,,fp16,best-throughput 14 | yolov5x,object-detection,1,640,12,1,1,,1,,fp16,best-latency 15 | yolov5x,object-detection,1,640,7,2,2,,2,,fp16,balanced 16 | yolov5x,object-detection,1,640,2,7,4,,1,,fp16,best-throughput 17 | yolov7-e6e,object-detection,1,640,12,1,1,,1, -multicast-weights,fp16,best-latency 18 | yolov7-e6e,object-detection,1,640,4,3,2, ,1, -multicast-weights,fp16,balanced 19 | yolov7-e6e,object-detection,1,640,6,2,2, ,2, -multicast-weights,fp16,best-throughput 20 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | import os 5 | import onnx 6 | from onnx import numpy_helper 7 | 8 | 9 | # executes the command and writes it down in the command.txt. 
The first time, mode is 'w', then 'a' (append) 10 | def execute(cmd_elements, write_to_file, mode): 11 | cmd_str = ' '.join(str(x) for x in cmd_elements) 12 | redirect = f" 2>&1 | ts > {write_to_file}" 13 | cmd_str += redirect 14 | print(f"Executing: {cmd_str}") 15 | os.system(cmd_str) 16 | with open(write_to_file, mode) as file: 17 | file.write(cmd_str + "\n\n") 18 | 19 | 20 | def scale_conv(model, conv_name, scale_factor): 21 | cnodes = [x for x in model.graph.node if x.name == conv_name] 22 | assert len(cnodes) == 1, f"Node '{conv_name}' not found" 23 | x, w, b = cnodes[0].input 24 | wi, bi = "", "" 25 | for i, init in enumerate(model.graph.initializer): 26 | if init.name == w: 27 | wi = i 28 | elif init.name == b: 29 | bi = i 30 | if wi != "" and bi != "": 31 | break 32 | else: 33 | raise ValueError(f"Cannot find indices of weight: {w} and bias: {b}") 34 | ww = numpy_helper.to_array(model.graph.initializer[wi]) 35 | bb = numpy_helper.to_array(model.graph.initializer[bi]) 36 | model.graph.initializer[wi].raw_data = (ww / scale_factor).tobytes() 37 | model.graph.initializer[bi].raw_data = (bb / scale_factor).tobytes() 38 | -------------------------------------------------------------------------------- /tutorials/Computer-Vision/Perfomance-Tuning-Beginner/resnet_base_dopt_throughput.json: -------------------------------------------------------------------------------- 1 | { 2 | "search-mode": "optimized", 3 | "objective": "max-throughput", 4 | "search-parameters": { 5 | "cores": { 6 | "min": 1, 7 | "max": 14 8 | }, 9 | "mos": { 10 | "min": 1, 11 | "max": 8 12 | }, 13 | "ols": { 14 | "min": 1, 15 | "max": 8 16 | }, 17 | "batch-size": { 18 | "min": 1, 19 | "max": 16 20 | }, 21 | "instances": { 22 | "min": 1, 23 | "max": 14 24 | }, 25 | "set-size": { 26 | "min": 1, 27 | "max": 10 28 | } 29 | }, 30 | "initial-values": [ 31 | { 32 | "cores": 1, 33 | "mos": 1, 34 | "ols": 1, 35 | "batch-size": 1, 36 | "instances": 14, 37 | "set-size": 1 38 | 39 | }, 40 | { 41 | "cores": 2, 42 | "mos": 1, 43 | "ols": 1, 44 | "batch-size": 1, 45 | "instances": 7, 46 | "set-size": 1 47 | }, 48 | { 49 | "cores": 4, 50 | "mos": 1, 51 | "ols": 1, 52 | "batch-size": 1, 53 | "instances": 3, 54 | "set-size": 1 55 | }, 56 | { 57 | "cores": 7, 58 | "mos": 1, 59 | "ols": 1, 60 | "batch-size": 1, 61 | "instances": 2, 62 | "set-size": 1 63 | }, 64 | { 65 | "cores": 14, 66 | "mos": 1, 67 | "ols": 1, 68 | "batch-size": 1, 69 | "instances": 1, 70 | "set-size": 1 71 | } 72 | ] 73 | } -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/utils.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | 6 | import os 7 | import onnx 8 | from onnx import numpy_helper 9 | 10 | 11 | # executes the command and writes it down in the command.txt. 
The first time, mode is 'w', then 'a' (append) 12 | def execute(cmd_elements, write_to_file, mode): 13 | cmd_str = ' '.join(str(x) for x in cmd_elements) 14 | redirect = f" 2>&1 | ts > {write_to_file}" 15 | cmd_str += redirect 16 | print(f"Executing: {cmd_str}") 17 | os.system(cmd_str) 18 | with open(write_to_file, mode) as file: 19 | file.write(cmd_str + "\n\n") 20 | 21 | 22 | def scale_conv(model, conv_name, scale_factor): 23 | cnodes = [x for x in model.graph.node if x.name == conv_name] 24 | assert len(cnodes) == 1, f"Node '{conv_name}' not found" 25 | x, w, b = cnodes[0].input 26 | wi, bi = "", "" 27 | for i, init in enumerate(model.graph.initializer): 28 | if init.name == w: 29 | wi = i 30 | elif init.name == b: 31 | bi = i 32 | if wi != "" and bi != "": 33 | break 34 | else: 35 | raise ValueError(f"Cannot find indices of weight: {w} and bias: {b}") 36 | ww = numpy_helper.to_array(model.graph.initializer[wi]) 37 | bb = numpy_helper.to_array(model.graph.initializer[bi]) 38 | model.graph.initializer[wi].raw_data = (ww / scale_factor).tobytes() 39 | model.graph.initializer[bi].raw_data = (bb / scale_factor).tobytes() 40 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/utils.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | import os 6 | import onnx 7 | from onnx import numpy_helper 8 | 9 | 10 | # executes the command and writes it down in the command.txt. The first time, mode is 'w', then 'a' (append) 11 | def execute(cmd_elements, write_to_file, mode): 12 | cmd_str = ' '.join(str(x) for x in cmd_elements) 13 | redirect = f" 2>&1 | ts > {write_to_file}" 14 | cmd_str += redirect 15 | print(f"Executing: {cmd_str}") 16 | os.system(cmd_str) 17 | with open(write_to_file, mode) as file: 18 | file.write(cmd_str + "\n\n") 19 | 20 | 21 | def scale_conv(model, conv_name, scale_factor): 22 | cnodes = [x for x in model.graph.node if x.name == conv_name] 23 | assert len(cnodes) == 1, f"Node '{conv_name}' not found" 24 | x, w, b = cnodes[0].input 25 | wi, bi = "", "" 26 | for i, init in enumerate(model.graph.initializer): 27 | if init.name == w: 28 | wi = i 29 | elif init.name == b: 30 | bi = i 31 | if wi != "" and bi != "": 32 | break 33 | else: 34 | raise ValueError(f"Cannot find indices of weight: {w} and bias: {b}") 35 | ww = numpy_helper.to_array(model.graph.initializer[wi]) 36 | bb = numpy_helper.to_array(model.graph.initializer[bi]) 37 | model.graph.initializer[wi].raw_data = (ww / scale_factor).tobytes() 38 | model.graph.initializer[bi].raw_data = (bb / scale_factor).tobytes() 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted (subject to the limitations in the 5 | disclaimer below) provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above 11 | copyright notice, this list of conditions and the following 12 | disclaimer in the documentation and/or other materials provided 13 | with the distribution. 14 | 15 | * Neither the name of Qualcomm Technologies, Inc. nor the names of its 16 | contributors may be used to endorse or promote products derived 17 | from this software without specific prior written permission. 18 | 19 | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE 20 | GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT 21 | HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 22 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 23 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 24 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 27 | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 29 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 30 | OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 31 | IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | 33 | SPDX-License-Identifier: BSD-3-Clause-Clear 34 | -------------------------------------------------------------------------------- /tutorials/README.md: -------------------------------------------------------------------------------- 1 | Tutorials are Jupyter notebools designed to walk the developer through the Cloud AI inference workflow. The tutorials are split into 2 categories - CV and NLP. Overall, the inference workflow for CV and NLP models are very similar and have been presented for convenience. 2 | 3 | `Model-Onboarding` - This is one of the beginner notebooks. This goes through exporting and preparing the model, compiling the model using a CLI tool and executing inference using CLI tool / Python APIs. 4 | 5 | `Performance-Tuning` - This is another beginner notebook that walks the developer through the key parameters to optimize for best performance (latency and throughput) on Cloud AI platforms. Going through this notebook and 'Performance Tuning' section in the Quick start guide will equip developers with a intuitive understanding of how to use the key parameters to meet inference application KPIs (AI compute resource usage, throughput and latency). 6 | 7 | `Profiler` - This is a intermediate-level notebook that describes system and device level inference profiling capabilities. Developers can use the tools and techniques described in this tutorial to measure application/device level latency and identify system/device bottlenecks. 8 | 9 | 10 | ### Pre-requisites 11 | 1. Clone this repo 12 | 2. Create python3.8 venv and activate it. 13 | `python3.8 -m venv jn_env`
14 | `source jn_env/bin/activate`
15 | 3. Install qaic 16 | `pip install /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl` 17 | 4. Install Jupyter notebook 18 | `pip install notebook` 19 | `pip install urllib3==1.26.6` 20 | 5. Run the notebook 21 | `jupyter notebook --allow-root --ip 0.0.0.0 --no-browser`.
22 | You should see `http://ip-xx-yyy-zzz-aaa.us-west-2.compute.internal:8888/tree?token=<token>`.
23 | On the local machine, type `http://xx.yyy.zzz.aaa:8888/tree?token=` on a browser to run the tutorial notebooks. 24 | -------------------------------------------------------------------------------- /models/speech/whisper/README.md: -------------------------------------------------------------------------------- 1 | # Whisper 2 | 3 | [Whisper](https://github.com/openai/whisper) is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification. 4 | 5 | ## Environment and dependencies 6 | 7 | ```commandline 8 | python3.10 -m venv whisper_env 9 | source whisper_env/bin/activate 10 | pip3 install -r requirements.txt 11 | 12 | sudo apt-get update 13 | sudo apt-get install libsndfile1 ffmpeg 14 | ``` 15 | 16 | ## Model generation 17 | 18 | The following command generates encoder and decoder ONNX files in the `output_whisper` folder: 19 | ```commandline 20 | python3 generateModel.py --model-name base --output-dir output_whisper 21 | ``` 22 | 23 | **Note** Check here for additional model variants:
24 | https://github.com/openai/whisper#available-models-and-languages 25 | 26 | 27 | ## Model compilation 28 | 29 | AIC binaries folder 30 | 31 | ```commandline 32 | mkdir ./whisper_AIC 33 | ``` 34 | 35 | Whisper encoder 36 | 37 | ```commandline 38 | rm -rf ./whisper_AIC/whisper-encoder 39 | /opt/qti-aic/exec/qaic-exec -m=./output_whisper/encoder_model.onnx -aic-hw -aic-num-cores=12 -mos=2 -ols=1 -convert-to-fp16 -onnx-define-symbol=batch_size,1 -onnx-define-symbol=feature_size,80 -onnx-define-symbol=encoder_sequence_length,3000 -aic-binary-dir=./whisper_AIC/whisper-encoder -compile-only 40 | ``` 41 | 42 | Whisper decoder 43 | 44 | ```commandline 45 | rm -rf ./whisper_AIC/whisper-decoder 46 | /opt/qti-aic/exec/qaic-exec -m=./output_whisper/decoder_model.onnx -aic-hw -aic-num-cores=12 -mos=2 -ols=1 -convert-to-fp16 -onnx-define-symbol=batch_size,1 -onnx-define-symbol=encoder_sequence_length,1500 -onnx-define-symbol=decoder_sequence_length,150 -aic-binary-dir=./whisper_AIC/whisper-decoder -compile-only 47 | ``` 48 | 49 | ## Model execution 50 | 51 | ```commandline 52 | sudo ./whisper_env/bin/python3 runModel.py 53 | ``` 54 | -------------------------------------------------------------------------------- /samples/python/qaic_features/benchmarking_eg.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | SPDX-License-Identifier: BSD-3-Clause-Clear 4 | ''' 5 | 6 | import qaic 7 | import numpy as np 8 | import argparse 9 | 10 | # Establish arguments to accept 11 | def get_args(): 12 | 13 | parser = argparse.ArgumentParser() 14 | 15 | parser.add_argument( 16 | "--model-path", 17 | dest='model_path', 18 | default= 19 | '/opt/qti-aic/integrations/qaic_onnxrt/tests/resnet50/resnet50-v1-12-batch.onnx', 20 | help='Pass path to qpc of this model to avoid compilation') 21 | 22 | parser.add_argument( 23 | "--config-path", 24 | dest='config_path', 25 | default= 26 | './resnet_config.yaml', 27 | help='Pass path to qpc of this model to avoid compilation') 28 | 29 | parser.add_argument( 30 | "--input", 31 | dest='input_img', 32 | help= 33 | 'If image is not provided, random values will be generated as input. Input image should be 1*3*224*224 pixel in raw format' 34 | ) 35 | 36 | parser.add_argument( 37 | "--num_iters", 38 | dest='num_iters', 39 | default=1000, 40 | help='Enter number of inferences you want to run on the model') 41 | 42 | return parser.parse_args() 43 | 44 | def main(args): 45 | 46 | resnet_sess = qaic.Session( 47 | args.model_path, 48 | options_path=args.config_path) 49 | 50 | input_shape, input_type = resnet_sess.model_input_shape_dict['data'] 51 | 52 | # Read input 53 | 54 | if args.input_img is None: 55 | x = np.random.randn(*input_shape).astype(input_type) 56 | else: 57 | img = np.fromfile(args.input_img, dtype=input_type) 58 | x = np.resize(img, input_shape) 59 | 60 | # Run Benchmarking 61 | input_dict = {'data': x} 62 | 63 | inf_completed, inf_rate, inf_time, batch_size = resnet_sess.run_benchmark(num_inferences = args.num_iters, 64 | input_dict=input_dict) 65 | 66 | if __name__ == '__main__': 67 | args = get_args() 68 | main(args) 69 | -------------------------------------------------------------------------------- /models/language_processing/encoder/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | import os 5 | from transformers import AutoTokenizer 6 | import numpy as np 7 | import qaic 8 | 9 | class QAicEmbeddingModel(): 10 | def __init__(self, model_name='BAAI/bge-large-en-v1.5', qpc_path='./models/BAAI/bge-large-en-v1.5/compiled-bin-fp16-B1-C4-A3-OLS2-MOS1-best-throughput', device=0): 11 | self.tokenizer = AutoTokenizer.from_pretrained(model_name) 12 | self.aic_session = qaic.Session(model_path=os.path.join(qpc_path, 'programqpc.bin'), dev_id=device) 13 | self.name = model_name 14 | 15 | self.aic_session.setup() 16 | 17 | def generate(self, input): 18 | tokens = self.tokenizer(input, padding=True, return_tensors='np') 19 | 20 | input_data = {'input_ids': None, 21 | 'attention_mask': None} 22 | 23 | for k in input_data.keys(): 24 | input_shape, input_type = self.aic_session.model_input_shape_dict[k] 25 | 26 | rows, cols = tokens[k].shape 27 | input_data[k] = np.zeros(input_shape, dtype=input_type) 28 | input_data[k][:rows, :cols] = tokens[k] 29 | 30 | outputs = self.aic_session.run(input_data) 31 | 32 | output_shape, output_type = self.aic_session.model_output_shape_dict['token_embeddings'] 33 | token_embeddings = np.frombuffer(outputs['token_embeddings'], dtype=output_type).reshape(output_shape) 34 | token_embeddings = token_embeddings[:, 0] 35 | 36 | output_shape, output_type = self.aic_session.model_output_shape_dict['sentence_embedding'] 37 | sentence_embedding = np.frombuffer(outputs['sentence_embedding'], dtype=output_type).reshape(output_shape) 38 | 39 | return token_embeddings, sentence_embedding 40 | 41 | def main(): 42 | inputs_txt = 'your_text_here' 43 | model = QAicEmbeddingModel() 44 | token_embedding, sentence_embeddings = model.generate(inputs_txt) 45 | print('token_embedding {}'.format(token_embedding)) 46 | print('sentence_embeddings {}'.format(sentence_embeddings)) 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /samples/python/qaic_features/metrics_eg.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | SPDX-License-Identifier: BSD-3-Clause-Clear 4 | ''' 5 | 6 | import qaic 7 | import numpy as np 8 | import argparse 9 | 10 | # Establish arguments to accept 11 | def get_args(): 12 | 13 | parser = argparse.ArgumentParser() 14 | 15 | parser.add_argument( 16 | "--model-path", 17 | dest='model_path', 18 | default= 19 | '/opt/qti-aic/integrations/qaic_onnxrt/tests/resnet50/resnet50-v1-12-batch.onnx', 20 | help='Pass path to qpc of this model to avoid compilation') 21 | 22 | parser.add_argument( 23 | "--config-path", 24 | dest='config_path', 25 | default= 26 | './resnet_config.yaml', 27 | help='Pass path to qpc of this model to avoid compilation') 28 | 29 | parser.add_argument( 30 | "--input", 31 | dest='input_img', 32 | help= 33 | 'If image is not provided, random values will be generated as input. 
Input image should be 1*3*224*224 pixel in raw format' 34 | ) 35 | 36 | parser.add_argument( 37 | "--num_iters", 38 | dest='num_iters', 39 | default=1000, 40 | help='Enter number of inferences you want to run on the model') 41 | 42 | return parser.parse_args() 43 | 44 | def main(args): 45 | 46 | resnet_sess = qaic.Session( 47 | args.model_path, 48 | options_path=args.config_path, 49 | enable_metrics=True) 50 | 51 | input_shape, input_type = resnet_sess.model_input_shape_dict['data'] 52 | 53 | # Read input 54 | 55 | if args.input_img is None: 56 | x = np.random.randn(*input_shape).astype(input_type) 57 | else: 58 | img = np.fromfile(args.input_img, dtype=input_type) 59 | x = np.resize(img, input_shape) 60 | 61 | # Run inference 62 | input_dict = {'data': x} 63 | 64 | for _ in range(args.num_iters): 65 | resnet_sess.run(input_dict) 66 | 67 | print('\n\n\n\n-------------- Metrics --------------\n\n\n\n') 68 | resnet_sess.print_metrics() 69 | metrics = resnet_sess.get_metrics() 70 | 71 | if __name__ == '__main__': 72 | args = get_args() 73 | main(args) -------------------------------------------------------------------------------- /samples/python/qaic_features/profiling_eg.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | SPDX-License-Identifier: BSD-3-Clause-Clear 4 | ''' 5 | 6 | import qaic 7 | import numpy as np 8 | import argparse 9 | 10 | # Establish arguments to accept 11 | def get_args(): 12 | 13 | parser = argparse.ArgumentParser() 14 | 15 | parser.add_argument( 16 | "--model-path", 17 | dest='model_path', 18 | default= 19 | '/opt/qti-aic/integrations/qaic_onnxrt/tests/resnet50/resnet50-v1-12-batch.onnx', 20 | help='Pass path to qpc of this model to avoid compilation') 21 | 22 | parser.add_argument( 23 | "--config-path", 24 | dest='config_path', 25 | default= 26 | './resnet_config.yaml', 27 | help='Pass path to qpc of this model to avoid compilation') 28 | 29 | parser.add_argument( 30 | "--input", 31 | dest='input_img', 32 | help= 33 | 'If image is not provided, random values will be generated as input. 
Input image should be 1*3*224*224 pixel in raw format' 34 | ) 35 | 36 | parser.add_argument( 37 | "--num_iters", 38 | dest='num_iters', 39 | default=1000, 40 | help='Enter number of inferences you want to run on the model') 41 | 42 | return parser.parse_args() 43 | 44 | def main(args): 45 | 46 | resnet_sess = qaic.Session( 47 | args.model_path, 48 | options_path=args.config_path, 49 | enable_profiling=True) 50 | 51 | input_shape, input_type = resnet_sess.model_input_shape_dict['data'] 52 | 53 | # Read input 54 | 55 | if args.input_img is None: 56 | x = np.random.randn(*input_shape).astype(input_type) 57 | else: 58 | img = np.fromfile(args.input_img, dtype=input_type) 59 | x = np.resize(img, input_shape) 60 | 61 | # Run inference 62 | input_dict = {'data': x} 63 | 64 | for _ in range(args.num_iters): 65 | resnet_sess.run(input_dict) 66 | 67 | print('\n\n\n\n-------------- Metrics --------------\n\n\n\n') 68 | resnet_sess.print_metrics() 69 | print('\n\n\n\n-------------- Profile Data --------------\n\n\n\n') 70 | resnet_sess.print_profile_data(n=5) 71 | metrics = resnet_sess.get_metrics() 72 | 73 | if __name__ == '__main__': 74 | args = get_args() 75 | main(args) -------------------------------------------------------------------------------- /samples/python/common_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | SPDX-License-Identifier: BSD-3-Clause-Clear 4 | ''' 5 | 6 | import os 7 | import yaml 8 | import inspect 9 | 10 | def generate_bin(onnx_filename, yaml_filename): 11 | """ 12 | Generate compiled binary for QAIC 13 | 14 | Args: 15 | onnx_path : path to onnx file. 16 | yaml_path : path to yaml file which has compile time arguments. 17 | 18 | Returns: 19 | qpc_path : path to qpc (compiled binary) 20 | """ 21 | caller_path = inspect.stack()[1].filename #os.path.dirname(os.path.realpath 22 | onnx_path = os.path.join(os.path.dirname(caller_path), onnx_filename) 23 | yaml_path = os.path.join(os.path.dirname(caller_path), yaml_filename) 24 | 25 | filename, extension = os.path.splitext(onnx_filename) 26 | onnx_folder = os.path.dirname(onnx_path) 27 | qpc_bin = os.path.join(os.path.dirname(caller_path), filename+'_qpc') 28 | with open(yaml_path, "r") as file: 29 | yaml_data = yaml.load(file, Loader=yaml.FullLoader) 30 | 31 | if os.path.isdir(qpc_bin): 32 | print(f'INFO: Removing existing QPC {qpc_bin}') 33 | cmd = f'sudo rm -fr {qpc_bin}' 34 | os.system(cmd) 35 | print(f'INFO: Existing QPC {qpc_bin} is removed') 36 | 37 | # create the command string from the yaml arguments. 
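    # For example, a YAML config with (hypothetical) entries such as
    #   aic_num_cores: 4
    #   convert_to_fp16: true
    #   onnx_define_symbol:
    #     batch_size: 1
    # is turned by the loop below into flags like
    #   -aic-num-cores=4 -convert-to-fp16 -onnx-define-symbol=batch_size,1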
38 | cmd_list = [f'/opt/qti-aic/exec/qaic-exec -m={onnx_path} -aic-hw -aic-hw-version={2.0}'] 39 | 40 | # ignore the following arguments: 41 | ignore = ['num-activations', 'set-size'] 42 | replace_dict = {'aic_num_cores':'aic-num-cores'} 43 | 44 | for arg, value in yaml_data.items(): 45 | arg = arg.replace('_','-') 46 | if arg in ignore: 47 | continue 48 | if isinstance(value, bool): 49 | if value:# include the argument only if true; for example -convert-to-fp16 50 | cmd_list.append(f'-{arg}') 51 | elif isinstance(value, dict): 52 | for subarg, subval in value.items(): 53 | cmd_list.append(f'-{arg}={subarg},{subval}') 54 | else: 55 | cmd_list.append(f'-{arg}={value}') 56 | 57 | cmd_list.append(f'-aic-binary-dir={qpc_bin}') 58 | 59 | cmd = ' '.join(cmd_list) 60 | print(f'INFO: Running the compile cmd: {cmd}') 61 | os.system(cmd) 62 | 63 | return qpc_bin 64 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-xl-base-1.0/fix_vae_decoder_onnx.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | import onnx 6 | from onnx import numpy_helper 7 | 8 | def scale_conv(model, conv_name, scale_factor): 9 | cnodes = [x for x in model.graph.node if x.name == conv_name] 10 | assert len(cnodes) == 1, f"Node '{conv_name}' not found" 11 | x, w, b = cnodes[0].input 12 | wi, bi = "", "" 13 | for i, init in enumerate(model.graph.initializer): 14 | if init.name == w: 15 | wi = i 16 | elif init.name == b: 17 | bi = i 18 | if wi != "" and bi != "": 19 | break 20 | else: 21 | raise ValueError(f"Cannot find indices of weight: {w} and bias: {b}") 22 | ww = numpy_helper.to_array(model.graph.initializer[wi]) 23 | bb = numpy_helper.to_array(model.graph.initializer[bi]) 24 | model.graph.initializer[wi].raw_data = (ww / scale_factor).tobytes() 25 | model.graph.initializer[bi].raw_data = (bb / scale_factor).tobytes() 26 | 27 | 28 | def main(model_path, scaling_factor): 29 | model = onnx.load(model_path) 30 | scale_conv(model, "/decoder/up_blocks.2/upsamplers.0/conv/Conv", scaling_factor) 31 | scale_conv(model, "/decoder/up_blocks.3/resnets.0/conv2/Conv", scaling_factor) 32 | # scale_conv(model, "/decoder/up_blocks.3/resnets.0/conv_shortcut/Conv", scaling_factor) 33 | scale_conv(model, "/decoder/up_blocks.3/resnets.1/conv2/Conv", scaling_factor) 34 | scale_conv(model, "/decoder/up_blocks.3/resnets.2/conv2/Conv", scaling_factor) 35 | output_path = model_path[:-5] + f"_fixed_{scaling_factor}.onnx" 36 | onnx.save(model, output_path) 37 | 38 | 39 | if __name__ == "__main__": 40 | import argparse 41 | argp = argparse.ArgumentParser() 42 | argp.add_argument( 43 | "--model-path", 44 | default="stabilityai/stable-diffusion-xl-base-1.0/vae_decoder/model.onnx", 45 | help="Model path to fix", 46 | ) 47 | argp.add_argument("--scaling-factor", default=128, type=int, help="Scaling factor") 48 | args = argp.parse_args() 49 | main(**vars(args)) 50 | -------------------------------------------------------------------------------- /models/speech/whisper/generateModel.py: -------------------------------------------------------------------------------- 1 | 
#################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | 6 | import os 7 | import argparse 8 | import numpy as np 9 | import torch 10 | import whisper 11 | from audio import AudioSample 12 | 13 | def main(model_name: str, output_dir: str): 14 | cache_path = './cache' 15 | 16 | audio_sample = AudioSample() 17 | audio_path = audio_sample.to_file() 18 | 19 | audio = whisper.load_audio(audio_path) # Read audio from file 20 | audio_pad = whisper.pad_or_trim(audio) # Padding and trimming 21 | 22 | # make log-Mel spectrogram and move to the same device as the model 23 | input_features = whisper.log_mel_spectrogram(audio_pad) # convert to mel spectrogram 24 | input_features = torch.unsqueeze(input_features, 0) # add batch dimension 25 | 26 | model = whisper.load_model(model_name, download_root=cache_path) 27 | audio_features = model.encoder(input_features) 28 | decoder_input_ids = torch.tensor([[50258]]) 29 | 30 | if not os.path.exists(output_dir): 31 | os.makedirs(output_dir) 32 | 33 | # Encoder model 34 | torch.onnx.export( 35 | model.encoder, 36 | (input_features), 37 | os.path.join(output_dir, 'encoder_model.onnx'), 38 | input_names=['input_features'], 39 | output_names=['last_hidden_state'], 40 | dynamic_axes={ 41 | 'input_features': {0: 'batch_size', 1: 'feature_size', 2: 'encoder_sequence_length'}, 42 | 'last_hidden_state': {0: 'batch_size'} 43 | } 44 | ) 45 | 46 | # Decoder model 47 | torch.onnx.export( 48 | model.decoder, 49 | (decoder_input_ids, audio_features), 50 | os.path.join(output_dir, 'decoder_model.onnx'), 51 | input_names=['input_ids', 'encoder_hidden_states'], 52 | output_names=['logits'], 53 | dynamic_axes={ 54 | 'input_ids': {0: 'batch_size', 1: 'decoder_sequence_length'}, 55 | 'encoder_hidden_states': {0: 'batch_size', 1: 'encoder_sequence_length'}, 56 | 'logits': {0: 'batch_size', 1: 'decoder_sequence_length'} 57 | } 58 | ) 59 | 60 | if __name__ == '__main__': 61 | import argparse 62 | 63 | argp = argparse.ArgumentParser() 64 | argp.add_argument( 65 | '--model-name', 66 | required=True, 67 | help='Model name to generate', 68 | ) 69 | argp.add_argument( 70 | '--output-dir', 71 | required=False, 72 | help='Path to store generated ONNX files', 73 | default='./' 74 | ) 75 | args = argp.parse_args() 76 | main(**vars(args)) 77 | -------------------------------------------------------------------------------- /models/language_processing/decoder/DeciCoder-6b/README.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | [DeciCoder-6b](https://huggingface.co/Deci/DeciCoder-6b) is a decoder-only large language model (LLM) developed by [Deci Ai](https://deci.ai) for code generation tasks. The architectures of the model was developed by AutoNAC which is Deci Ai's proprietary Neural Architecture Search technology. The model has a context length of 2048 tokens and is trained on the Python, Java, Javascript, C++, C#, Go, and Rust subsets of [The-Stack](https://huggingface.co/datasets/bigcode/the-stack) dataset. 4 | 5 | # Running on AIC100 6 | 7 | ## Available Compute Resources 8 | The following cloud provider instances are equipped with AIC100 accelerators. 
9 | 10 | 11 | 12 | |Provider | [AWS DL2q Instance](https://aws.amazon.com/ec2/instance-types/dl2q/) | [Cirrascale Instance](https://cirrascale.com/solutions-qualcomm-cloud-ai100.php) | 13 | | --------------------- | --------------------- | -------------------------- | 14 | |Cloud-AI Accelerators | 8 Std (14 NSPs) SKUs | 1 to 8 Pro (16 NSPs) SKUs | 15 | |Supported Formats for [DeciCoder-6b](https://huggingface.co/Deci/DeciCoder-6b)| FP16 and [MX6](https://arxiv.org/abs/2302.08007) | FP16 and [MX6](https://arxiv.org/abs/2302.08007) | 16 | 17 | ## Source of the Model 18 | 19 | The model is downloaded from [HuggingFace](https://huggingface.co/Deci/DeciCoder-6b). 20 | 21 | ## Environment and Dependencies 22 | Create Python virtual environment and activate. 23 | 24 | ```commandline 25 | python3.10 -m venv llm_env 26 | source llm_env/bin/activate 27 | pip3 install -r requirements.txt 28 | ``` 29 | 30 | Install the dependencies. 31 | 32 | ```commandline 33 | git clone --branch v4.35.2 --depth 1 https://github.com/huggingface/transformers transformers-dev 34 | cd transformers-dev 35 | git apply ../Llama2_4.35.2.patch 36 | pip3 install . 37 | cd .. 38 | ``` 39 | 40 | ## Model and Hardware Parameters 41 | Customize the model repo/name and the compilation parameters in `init.sh`. Model will be compiled using MX6 compression. Let MX="" if you want to avoid MX6 compression. BS, PL and CL are Batchsize, Prompt Length and Context Length respectively. 42 | 43 | ```commandline 44 | source init.sh 45 | ``` 46 | 47 | ## Model Generation 48 | Generate the model into onnx format. 49 | 50 | ```commandline 51 | python generateModel.py --model-name ${MODEL_REPO}/${MODEL_NAME} --model-class LlamaForCausalLM 52 | ``` 53 | 54 | ## Model Compilation for AIC100 55 | Compile the onnx format into bin file. Modify BS, PL, CL, CORES, and MX if needed. 56 | 57 | ```commandline 58 | bash compileModel.sh $MODEL_NAME $BS $PL $CL $CORES $MX 59 | ``` 60 | 61 | ## Model Execution on AIC100 62 | Run the compiled model binary on AIC100. Modify DEVICE_ID if needed. Run `/opt/qti-aic/tools/qaic-util -q` to check available devices. 63 | 64 | ```commandline 65 | export PROMPT="insert your prompt here" 66 | export DEVICE_ID=0 67 | python runModel.py --model-name ${MODEL_REPO}/${MODEL_NAME} --qpc ./qpc/${MODEL_NAME}-kv-${PL}pl-${CL}cl-${CORES}c${MX} --device_id $DEVICE_ID --prompt "${PROMPT}" 68 | ``` 69 | 70 | ## References 71 | - [Shared Micro-exponents](https://arxiv.org/abs/2302.08007) 72 | 73 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/onnx_generation/onnx_gen_utils.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | from packaging import version 6 | import torch 7 | import onnx 8 | from onnx import external_data_helper, numpy_helper 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11") 13 | 14 | 15 | def fix_onnx_fp16( 16 | gen_models_path: str, 17 | ) -> str: 18 | finfo = np.finfo(np.float16) 19 | fp16_max = finfo.max 20 | fp16_min = finfo.min 21 | model = onnx.load(f"{gen_models_path}/model.onnx") 22 | fp16_fix = False 23 | for tensor in external_data_helper._get_all_tensors(model): 24 | nptensor = numpy_helper.to_array(tensor, gen_models_path) 25 | if nptensor.dtype == np.float32 and ( 26 | np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min) 27 | ): 28 | # print(f'tensor value : {nptensor} above {fp16_max} or below {fp16_min}') 29 | nptensor = np.clip(nptensor, fp16_min, fp16_max) 30 | new_tensor = numpy_helper.from_array(nptensor, tensor.name) 31 | tensor.CopyFrom(new_tensor) 32 | fp16_fix = True 33 | 34 | 35 | if fp16_fix: 36 | # Save FP16 model 37 | print("Found constants out of FP16 range, clipped to FP16 range") 38 | onnx.save(model, f=f"{gen_models_path}" / "model_fp16.onnx") 39 | print(f"Saving modified onnx file at {gen_models_path}/model_fp16.onnx") 40 | 41 | 42 | def onnx_export( 43 | model, 44 | model_args: tuple, 45 | output_path: Path, 46 | ordered_input_names, 47 | output_names, 48 | dynamic_axes, 49 | opset, 50 | use_external_data_format=False, 51 | ): 52 | output_path.parent.mkdir(parents=True, exist_ok=True) 53 | # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11, 54 | # so we check the torch version for backwards compatibility 55 | if is_torch_less_than_1_11: 56 | torch.onnx.export( 57 | model, 58 | model_args, 59 | f=output_path.as_posix(), 60 | input_names=ordered_input_names, 61 | output_names=output_names, 62 | dynamic_axes=dynamic_axes, 63 | do_constant_folding=True, 64 | use_external_data_format=use_external_data_format, 65 | enable_onnx_checker=True, 66 | opset_version=opset, 67 | ) 68 | else: 69 | torch.onnx.export( 70 | model, 71 | model_args, 72 | f=output_path.as_posix(), 73 | input_names=ordered_input_names, 74 | output_names=output_names, 75 | dynamic_axes=dynamic_axes, 76 | do_constant_folding=True, 77 | opset_version=opset, 78 | ) 79 | 80 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/run_config_gen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause-Clear 5 | 6 | # model configs 7 | MODEL_PATH="stabilityai/sdxl-turbo" 8 | PROMPT="\"photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece\"" 9 | VAE_TYPE="vae" 10 | IMAGE_SIZE=512 11 | BLOCK_SIZE=256 12 | BATCH_SIZE=1 13 | 14 | # onnx configs 15 | GENERATE_ONNX=true 16 | ONNX_TEXT_ENCODER=true 17 | ONNX_UNET=true 18 | ONNX_VAE=true 19 | 20 | # compile configs 21 | NUM_CORES=16 22 | VAE_MOS=2 23 | VAE_OLS=1 24 | UNET_MOS=2 25 | UNET_OLS=1 26 | COMPILE_TEXT_ENCODER=true 27 | COMPILE_UNET=true 28 | COMPILE_VAE=true 29 | 30 | # inference configs 31 | RUN_ONLY=false 32 | DEVICE=0 33 | NUM_STEPS=1 34 | WARMUP_ITERS=3 35 | REPEAT_ITERS=1 36 | 37 | # mode 38 | TOGETHER=false 39 | 40 | if [ ${GENERATE_ONNX} == true ] 41 | then 42 | GENERATE_ONNX_CMD="--generate-onnx" 43 | else 44 | GENERATE_ONNX_CMD="" 45 | fi 46 | 47 | if [ ${ONNX_TEXT_ENCODER} == true ] 48 | then 49 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder" 50 | else 51 | ONNX_TEXT_ENCODER_CMD="" 52 | fi 53 | 54 | if [ ${ONNX_UNET} == true ] 55 | then 56 | ONNX_UNET_CMD="--onnx-unet" 57 | else 58 | ONNX_UNET_CMD="" 59 | fi 60 | 61 | if [ ${ONNX_VAE} == true ] 62 | then 63 | ONNX_VAE_CMD="--onnx-vae" 64 | else 65 | ONNX_VAE_CMD="" 66 | fi 67 | 68 | if [ ${COMPILE_TEXT_ENCODER} == true ] 69 | then 70 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder" 71 | else 72 | COMPILE_TEXT_ENCODER_CMD="" 73 | fi 74 | 75 | if [ ${COMPILE_UNET} == true ] 76 | then 77 | COMPILE_UNET_CMD="--compile-unet" 78 | else 79 | COMPILE_UNET_CMD="" 80 | fi 81 | 82 | if [ ${COMPILE_VAE} == true ] 83 | then 84 | COMPILE_VAE_CMD="--compile-vae" 85 | else 86 | COMPILE_VAE_CMD="" 87 | fi 88 | 89 | if [ ${RUN_ONLY} == true ] 90 | then 91 | RUN_ONLY_CMD="--run-only" 92 | else 93 | RUN_ONLY_CMD="" 94 | fi 95 | 96 | if [ ${TOGETHER} == true ] 97 | then 98 | TOGETHER_CMD="--together" 99 | else 100 | TOGETHER_CMD="" 101 | fi 102 | 103 | export HF_HOME="cache" 104 | 105 | rm run.sh 106 | 107 | scripts="python main.py \ 108 | --model-path $MODEL_PATH \ 109 | --prompt $PROMPT \ 110 | --vae-type $VAE_TYPE \ 111 | --batch-size $BATCH_SIZE \ 112 | --image-size $IMAGE_SIZE \ 113 | --block-size $BLOCK_SIZE \ 114 | --num-cores $NUM_CORES \ 115 | --vae-mos $VAE_MOS \ 116 | --vae-ols $VAE_OLS \ 117 | --unet-mos $UNET_MOS \ 118 | --unet-ols $UNET_OLS \ 119 | --device $DEVICE \ 120 | --num-steps $NUM_STEPS \ 121 | --num-warmup-iters $WARMUP_ITERS \ 122 | --num-repeat-iters $REPEAT_ITERS \ 123 | $ONNX_TEXT_ENCODER_CMD \ 124 | $ONNX_UNET_CMD \ 125 | $ONNX_VAE_CMD \ 126 | $COMPILE_TEXT_ENCODER_CMD \ 127 | $COMPILE_UNET_CMD \ 128 | $COMPILE_VAE_CMD \ 129 | $GENERATE_ONNX_CMD \ 130 | $RUN_ONLY_CMD \ 131 | $TOGETHER_CMD" 132 | 133 | echo $scripts >> run.sh 134 | 135 | bash run.sh 136 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/run_config_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause-Clear 5 | 6 | PYTHON=$1 7 | echo $PYTHON 8 | 9 | # model configs 10 | MODEL_PATH="stabilityai/sdxl-turbo" 11 | PROMPT="\"photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece\"" 12 | VAE_TYPE="vae" 13 | IMAGE_SIZE=512 14 | BLOCK_SIZE=256 15 | BATCH_SIZE=1 16 | 17 | # onnx configs 18 | GENERATE_ONNX=false 19 | ONNX_TEXT_ENCODER=false 20 | ONNX_UNET=false 21 | ONNX_VAE=false 22 | 23 | # compile configs 24 | NUM_CORES=16 25 | VAE_MOS=2 26 | VAE_OLS=1 27 | UNET_MOS=2 28 | UNET_OLS=1 29 | COMPILE_TEXT_ENCODER=false 30 | COMPILE_UNET=false 31 | COMPILE_VAE=false 32 | 33 | # inference configs 34 | RUN_ONLY=true 35 | DEVICE=0 36 | NUM_STEPS=1 37 | WARMUP_ITERS=3 38 | REPEAT_ITERS=3 39 | 40 | # mode 41 | TOGETHER=false 42 | 43 | if [ ${GENERATE_ONNX} == true ] 44 | then 45 | GENERATE_ONNX_CMD="--generate-onnx" 46 | else 47 | GENERATE_ONNX_CMD="" 48 | fi 49 | 50 | if [ ${ONNX_TEXT_ENCODER} == true ] 51 | then 52 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder" 53 | else 54 | ONNX_TEXT_ENCODER_CMD="" 55 | fi 56 | 57 | if [ ${ONNX_UNET} == true ] 58 | then 59 | ONNX_UNET_CMD="--onnx-unet" 60 | else 61 | ONNX_UNET_CMD="" 62 | fi 63 | 64 | if [ ${ONNX_VAE} == true ] 65 | then 66 | ONNX_VAE_CMD="--onnx-vae" 67 | else 68 | ONNX_VAE_CMD="" 69 | fi 70 | 71 | if [ ${COMPILE_TEXT_ENCODER} == true ] 72 | then 73 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder" 74 | else 75 | COMPILE_TEXT_ENCODER_CMD="" 76 | fi 77 | 78 | if [ ${COMPILE_UNET} == true ] 79 | then 80 | COMPILE_UNET_CMD="--compile-unet" 81 | else 82 | COMPILE_UNET_CMD="" 83 | fi 84 | 85 | if [ ${COMPILE_VAE} == true ] 86 | then 87 | COMPILE_VAE_CMD="--compile-vae" 88 | else 89 | COMPILE_VAE_CMD="" 90 | fi 91 | 92 | if [ ${RUN_ONLY} == true ] 93 | then 94 | RUN_ONLY_CMD="--run-only" 95 | else 96 | RUN_ONLY_CMD="" 97 | fi 98 | 99 | if [ ${TOGETHER} == true ] 100 | then 101 | TOGETHER_CMD="--together" 102 | else 103 | TOGETHER_CMD="" 104 | fi 105 | 106 | export HF_HOME="cache" 107 | 108 | rm run.sh 109 | 110 | scripts="$PYTHON main.py \ 111 | --model-path $MODEL_PATH \ 112 | --prompt $PROMPT \ 113 | --vae-type $VAE_TYPE \ 114 | --batch-size $BATCH_SIZE \ 115 | --image-size $IMAGE_SIZE \ 116 | --block-size $BLOCK_SIZE \ 117 | --num-cores $NUM_CORES \ 118 | --vae-mos $VAE_MOS \ 119 | --vae-ols $VAE_OLS \ 120 | --unet-mos $UNET_MOS \ 121 | --unet-ols $UNET_OLS \ 122 | --device $DEVICE \ 123 | --num-steps $NUM_STEPS \ 124 | --num-warmup-iters $WARMUP_ITERS \ 125 | --num-repeat-iters $REPEAT_ITERS \ 126 | $ONNX_TEXT_ENCODER_CMD \ 127 | $ONNX_UNET_CMD \ 128 | $ONNX_VAE_CMD \ 129 | $COMPILE_TEXT_ENCODER_CMD \ 130 | $COMPILE_UNET_CMD \ 131 | $COMPILE_VAE_CMD \ 132 | $GENERATE_ONNX_CMD \ 133 | $RUN_ONLY_CMD \ 134 | $TOGETHER_CMD" 135 | 136 | echo $scripts >> run.sh 137 | 138 | bash run.sh 139 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/README.md: -------------------------------------------------------------------------------- 1 | ### Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 2 | ### SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | # Instructions to run SD3.5 on Cloud AI 100 5 | 6 | The instructions below are to run the [Stable Diffusion 3.5 model](stabilityai/stable-diffusion-3.5-medium) on Cloud AI 100. 
Compile time parameters may need to be adjusted for different cards and different SDKs. 7 | 8 | ## 1. Download model 9 | 10 | 1. Setup environment varialble 11 | ``` 12 | mkdir cache 13 | export HF_HOME=cache 14 | export HF_TOKEN= 15 | ``` 16 | 17 | 2. Follow [instructions on HuggingFace](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium) to gain access to model. 18 | 19 | ## 2. Generate onnx files and compile for binaries 20 | 21 | 1. Set up a virtual environment for ONNX generation and compilation 22 | ``` 23 | python3.10 -m venv env_onnx 24 | source ./env_onnx/bin/activate 25 | pip install -r requirements.txt 26 | ``` 27 | 28 | 2. Create a folder for caching HuggingFace model downloads 29 | ``` 30 | mkdir compile_logs 31 | mkdir qpc 32 | touch run.sh 33 | ``` 34 | 35 | 3. Install diffusers from source after patching for ONNX file generation 36 | ``` 37 | git clone --depth 1 --branch v0.31.0 https://github.com/huggingface/diffusers.git diffusers-onnx 38 | cd diffusers-onnx 39 | git apply --reject --whitespace=fix ../patches/attention_patch.patch 40 | pip install . 41 | cd .. 42 | ``` 43 | 44 | 4. Install transformers from source (for T5 text_encoder_3 only) 45 | ``` 46 | git clone -b v4.41.2 https://github.com/huggingface/transformers.git transformers-dev 47 | cd transformers-dev 48 | git apply --reject --whitespace=fix ../patches/transformer_patch.patch 49 | pip install . 50 | cd .. 51 | ``` 52 | 53 | 5. Generate ONNX files and model binaries 54 | ``` 55 | bash run_config_gen.sh 56 | ``` 57 | 58 | ## 3. Run the end-to-end SD3 inference 59 | 60 | 1. Set up a separate virtual environment 61 | ``` 62 | python3.10 -m venv env_pipeline 63 | source ./env_pipeline/bin/activate 64 | pip install -r requirements.txt 65 | pip install --force-reinstall /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl 66 | ``` 67 | 68 | 2. Re-install diffusers from source after patching the SD3 pipeline for inference 69 | ``` 70 | git clone --depth 1 --branch v0.31.0 https://github.com/huggingface/diffusers.git diffusers-pipeline 71 | cd diffusers-pipeline 72 | git apply --reject --whitespace=fix ../patches/pipeline_patch.patch 73 | pip install . 74 | cd .. 75 | ``` 76 | 77 | 3. Run the inference with 'sudo' flag if needed to access the AI 100 devices. 78 | ``` 79 | sudo bash run_config_inference.sh "" "" 80 | ``` 81 | 82 | ## 4. Python interface 83 | 84 | ``` 85 | source ./env_pipeline/bin/activate 86 | ``` 87 | 88 | ```python 89 | from model import QAICStableDiffusion3 90 | 91 | model = QAICStableDiffusion3() 92 | prompt = 'photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece' 93 | image = model.generate(prompt, guidance=7.0)[0] 94 | image.save('harbor.png') 95 | ``` 96 | 97 | -------------------------------------------------------------------------------- /samples/python/qaic_features/README.md: -------------------------------------------------------------------------------- 1 | # Python High-Level API (qaic) features 2 | 3 | qaic_features depicts examples on how one can use different features provided by qaic module along with running inferences. 
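
For reference, a basic inference flow (before enabling any of the features below) looks roughly like the following minimal sketch; the ONNX model path, the YAML config path, and the `data` input name are placeholders chosen to match the ResNet example used in this folder:

```python
import qaic
import numpy as np

# Compile/load the model on the AIC100 device using a YAML config
session = qaic.Session('model.onnx', options_path='config.yaml')
session.setup()

# Build a random input matching the model's expected shape/dtype and run one inference
input_shape, input_type = session.model_input_shape_dict['data']
x = np.random.randn(*input_shape).astype(input_type)
output = session.run({'data': x})
```

The metrics, profiling, and benchmarking examples below build on this same flow.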
4 | 5 | a) Metrics 6 | After running inferences on AIC100 chip, if you want to get statistics regarding inference times, you can use get_metrics method as follows: 7 | 8 | ```python 9 | #Create Session with enable_metrics = True 10 | session = qaic.Session( 11 | model_path, 12 | options_path=yaml_config_path, 13 | enable_metrics=True) 14 | 15 | #Create input dictionary 16 | input_dict = {'data': np.array()} 17 | 18 | #Run Inferences 19 | for i in range(100): 20 | session.run(input_dict) 21 | 22 | #Get Metrics 23 | session.print_metrics() 24 | metrics = session.get_metrics() 25 | ``` 26 | 27 | Sample output for session.print_metrics() 28 | 29 | ```bash 30 | Number of inferences utilized for calculation are 999 31 | Minimum latency observed 0.0009578340000000001 s 32 | Maximum latency observed 0.002209001 s 33 | Average latency / inference time observed is 0.0012380756316316324 s 34 | P25 / 25% of inferences observed latency less than 0.001095435 s 35 | P50 / 50% of inferences observed latency less than 0.0012522870000000001 s 36 | P75 / 75% of inferences observed latency less than 0.001299786 s 37 | P90 / 90% of inferences observed latency less than 0.002209001 s 38 | P99 / 99% of inferences observed latency less than 0.0016082370000000002 s 39 | Sum of all the inference times 1.2368375560000007 s 40 | Average latency / inference time observed is 0.0012380756316316324 s 41 | ``` 42 | 43 | 44 | 45 | 46 | b) Profiling 47 | To profile your inferences being performed on AIC100 chip and get inference time statistic metrics, you can use following methods: 48 | 49 | ```python 50 | #Create Session with enable_metrics = True 51 | session = qaic.Session( 52 | model_path, 53 | options_path=yaml_config_path, 54 | enable_profiling=True) 55 | 56 | #Create input dictionary 57 | input_dict = {'data': np.array()} 58 | 59 | #Run Inferences 60 | for i in range(100): 61 | session.run(input_dict) 62 | 63 | #Get Metrics 64 | session.print_metrics() 65 | metrics = session.get_metrics() 66 | session.print_profile_data(n=5) 67 | ``` 68 | 69 | Sample output for session.print_profile_data() 70 | 71 | ```bash 72 | | File-Line-Function | | num calls | | func time | | tot time | 73 | 74 | ('~', 0, "") 1 0.000149101 0.000149101 75 | 76 | ('~', 0, '') 1 2.38e-06 2.38e-06 77 | 78 | ('~', 0, '') 1 4.22e-06 4.22e-06 79 | ``` 80 | 81 | 82 | 83 | 84 | c) Benchmarking 85 | To run benchmarking for model inferences on AIC100 chip, following method can be used: 86 | 87 | ```python 88 | #Create Session with enable_metrics = True 89 | session = qaic.Session( 90 | model_path, 91 | options_path=yaml_config_path) 92 | 93 | #Create input dictionary 94 | input_dict = {'data': np.array()} 95 | 96 | # Run Benchmarking 97 | input_dict = {'data': x} 98 | 99 | inf_completed, inf_rate, inf_time, batch_size = session.run_benchmark(input_dict=input_dict) 100 | ``` 101 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/onnx_generation/onnx_gen_utils.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | from packaging import version 6 | import torch 7 | import onnx 8 | from onnx import external_data_helper, numpy_helper 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11") 13 | 14 | 15 | def fix_onnx_fp16( 16 | gen_models_path: str, 17 | model_base_name: str, 18 | ) -> str: 19 | finfo = np.finfo(np.float16) 20 | fp16_max = finfo.max 21 | fp16_min = finfo.min 22 | model = onnx.load(f"{gen_models_path}/{model_base_name}") 23 | fp16_fix = False 24 | for tensor in external_data_helper._get_all_tensors(model): 25 | nptensor = numpy_helper.to_array(tensor, gen_models_path) 26 | if nptensor.dtype == np.float32 and ( 27 | np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min) 28 | ): 29 | # print(f'tensor value : {nptensor} above {fp16_max} or below {fp16_min}') 30 | nptensor = np.clip(nptensor, fp16_min, fp16_max) 31 | new_tensor = numpy_helper.from_array(nptensor, tensor.name) 32 | tensor.CopyFrom(new_tensor) 33 | fp16_fix = True 34 | 35 | 36 | if fp16_fix: 37 | # Save FP16 model 38 | print("Found constants out of FP16 range, clipped to FP16 range") 39 | model_base_name += "_fix_outofrange_fp16" 40 | onnx.save(model, f=f"{gen_models_path}/{model_base_name}") 41 | print(f"Saving modified onnx file at {gen_models_path}/{model_base_name}") 42 | return model_base_name 43 | 44 | 45 | def onnx_export( 46 | model, 47 | model_args: tuple, 48 | output_path: Path, 49 | ordered_input_names, 50 | output_names, 51 | dynamic_axes, 52 | opset, 53 | use_external_data_format=False, 54 | ): 55 | output_path.parent.mkdir(parents=True, exist_ok=True) 56 | # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11, 57 | # so we check the torch version for backwards compatibility 58 | if is_torch_less_than_1_11: 59 | torch.onnx.export( 60 | model, 61 | model_args, 62 | f=output_path.as_posix(), 63 | input_names=ordered_input_names, 64 | output_names=output_names, 65 | dynamic_axes=dynamic_axes, 66 | do_constant_folding=True, 67 | use_external_data_format=use_external_data_format, 68 | enable_onnx_checker=True, 69 | opset_version=opset, 70 | ) 71 | else: 72 | torch.onnx.export( 73 | model, 74 | model_args, 75 | f=output_path.as_posix(), 76 | input_names=ordered_input_names, 77 | output_names=output_names, 78 | dynamic_axes=dynamic_axes, 79 | do_constant_folding=True, 80 | opset_version=opset, 81 | ) 82 | 83 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/onnx_generation/onnx_gen_utils.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | from packaging import version 6 | import torch 7 | import onnx 8 | from onnx import external_data_helper, numpy_helper 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11") 13 | 14 | 15 | def fix_onnx_fp16( 16 | gen_models_path: str, 17 | model_base_name: str, 18 | ) -> str: 19 | finfo = np.finfo(np.float16) 20 | fp16_max = finfo.max 21 | fp16_min = finfo.min 22 | model = onnx.load(f"{gen_models_path}/{model_base_name}") 23 | fp16_fix = False 24 | for tensor in external_data_helper._get_all_tensors(model): 25 | nptensor = numpy_helper.to_array(tensor, gen_models_path) 26 | if nptensor.dtype == np.float32 and ( 27 | np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min) 28 | ): 29 | # print(f'tensor value : {nptensor} above {fp16_max} or below {fp16_min}') 30 | nptensor = np.clip(nptensor, fp16_min, fp16_max) 31 | new_tensor = numpy_helper.from_array(nptensor, tensor.name) 32 | tensor.CopyFrom(new_tensor) 33 | fp16_fix = True 34 | 35 | 36 | if fp16_fix: 37 | # Save FP16 model 38 | print("Found constants out of FP16 range, clipped to FP16 range") 39 | model_base_name += "_fix_outofrange_fp16" 40 | onnx.save(model, f=f"{gen_models_path}/{model_base_name}") 41 | print(f"Saving modified onnx file at {gen_models_path}/{model_base_name}") 42 | return model_base_name 43 | 44 | 45 | def onnx_export( 46 | model, 47 | model_args: tuple, 48 | output_path: Path, 49 | ordered_input_names, 50 | output_names, 51 | dynamic_axes, 52 | opset, 53 | use_external_data_format=False, 54 | ): 55 | output_path.parent.mkdir(parents=True, exist_ok=True) 56 | # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11, 57 | # so we check the torch version for backwards compatibility 58 | if is_torch_less_than_1_11: 59 | torch.onnx.export( 60 | model, 61 | model_args, 62 | f=output_path.as_posix(), 63 | input_names=ordered_input_names, 64 | output_names=output_names, 65 | dynamic_axes=dynamic_axes, 66 | do_constant_folding=True, 67 | use_external_data_format=use_external_data_format, 68 | enable_onnx_checker=True, 69 | opset_version=opset, 70 | ) 71 | else: 72 | torch.onnx.export( 73 | model, 74 | model_args, 75 | f=output_path.as_posix(), 76 | input_names=ordered_input_names, 77 | output_names=output_names, 78 | dynamic_axes=dynamic_axes, 79 | do_constant_folding=True, 80 | opset_version=opset, 81 | ) 82 | 83 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-xl-base-1.0/onnx_gen_utils.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | from packaging import version 6 | import torch 7 | import onnx 8 | from onnx import external_data_helper, numpy_helper 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11") 13 | 14 | 15 | def fix_onnx_fp16( 16 | gen_models_path: str, 17 | model_base_name: str, 18 | ) -> str: 19 | finfo = np.finfo(np.float16) 20 | fp16_max = finfo.max 21 | fp16_min = finfo.min 22 | model = onnx.load(f"{gen_models_path}/{model_base_name}") 23 | fp16_fix = False 24 | for tensor in external_data_helper._get_all_tensors(model): 25 | nptensor = numpy_helper.to_array(tensor, gen_models_path) 26 | if nptensor.dtype == np.float32 and ( 27 | np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min) 28 | ): 29 | # print(f'tensor value : {nptensor} above {fp16_max} or below {fp16_min}') 30 | nptensor = np.clip(nptensor, fp16_min, fp16_max) 31 | new_tensor = numpy_helper.from_array(nptensor, tensor.name) 32 | tensor.CopyFrom(new_tensor) 33 | fp16_fix = True 34 | 35 | 36 | if fp16_fix: 37 | # Save FP16 model 38 | print("Found constants out of FP16 range, clipped to FP16 range") 39 | model_base_name += "_fix_outofrange_fp16" 40 | onnx.save(model, f=f"{gen_models_path}/{model_base_name}") 41 | print(f"Saving modified onnx file at {gen_models_path}/{model_base_name}") 42 | return model_base_name 43 | 44 | 45 | def onnx_export( 46 | model, 47 | model_args: tuple, 48 | output_path: Path, 49 | ordered_input_names, 50 | output_names, 51 | dynamic_axes, 52 | opset, 53 | use_external_data_format=False, 54 | ): 55 | output_path.parent.mkdir(parents=True, exist_ok=True) 56 | # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11, 57 | # so we check the torch version for backwards compatibility 58 | if is_torch_less_than_1_11: 59 | torch.onnx.export( 60 | model, 61 | model_args, 62 | f=output_path.as_posix(), 63 | input_names=ordered_input_names, 64 | output_names=output_names, 65 | dynamic_axes=dynamic_axes, 66 | do_constant_folding=True, 67 | use_external_data_format=use_external_data_format, 68 | enable_onnx_checker=True, 69 | opset_version=opset, 70 | ) 71 | else: 72 | torch.onnx.export( 73 | model, 74 | model_args, 75 | f=output_path.as_posix(), 76 | input_names=ordered_input_names, 77 | output_names=output_names, 78 | dynamic_axes=dynamic_axes, 79 | do_constant_folding=True, 80 | opset_version=opset, 81 | ) 82 | 83 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | import asyncio 5 | import os 6 | import torch 7 | 8 | from diffusers import AutoPipelineForText2Image 9 | 10 | class QAICStableDiffusion: 11 | def __init__(self, model_id = 'stabilityai/sdxl-turbo', device_id=0): 12 | text_encoder = './qpc/text_encoder_256b_512i_16c_1b/programqpc.bin' 13 | unet = './qpc/unet_256b_512i_16c_1b_2m_1o/programqpc.bin' 14 | text_encoder_2 = './qpc/text_encoder_2_256b_512i_16c_1b/programqpc.bin' 15 | sdxl_vae_decoder = './qpc/vae_decoder_256b_512i_vae_16c_1b_2m_1o/programqpc.bin' 16 | 17 | # check the QPCs 18 | unet_qpc = unet if unet.endswith('programqpc.bin') else os.path.join(unet,'programqpc.bin') 19 | assert os.path.isfile(unet_qpc), f"Could not find binary {unet_qpc = }!" 20 | vae_decoder_sdxl_qpc = sdxl_vae_decoder if sdxl_vae_decoder.endswith('programqpc.bin') else os.path.join(sdxl_vae_decoder,'programqpc.bin') 21 | assert os.path.isfile(vae_decoder_sdxl_qpc), f"Could not find binary {vae_decoder_sdxl_qpc = }!" 22 | text_encoder_qpc = text_encoder if text_encoder.endswith('programqpc.bin') else os.path.join(text_encoder,'programqpc.bin') 23 | assert os.path.isfile(text_encoder_qpc), f"Could not find binary {text_encoder_qpc = }!" 24 | text_encoder_2_qpc = text_encoder_2 if text_encoder_2.endswith('programqpc.bin') else os.path.join(text_encoder_2,'programqpc.bin') 25 | assert os.path.isfile(text_encoder_2_qpc), f"Could not find binary {text_encoder_2_qpc = }!" 26 | 27 | self.num_steps = 1 28 | self.vae_type = "vae" 29 | 30 | # load the latents 31 | self.latents = None 32 | 33 | # load the model pipeline 34 | self.pipe = AutoPipelineForText2Image.from_pretrained(model_id, torch_dtype=torch.float16, variant="fp16", 35 | device_id=device_id, 36 | unet_qpc=unet_qpc, 37 | vae_decoder_qpc=vae_decoder_sdxl_qpc, 38 | text_encoder_qpc=text_encoder_qpc, 39 | text_encoder_2_qpc=text_encoder_2_qpc) 40 | 41 | async def generate(self, prompt, n, image_size): 42 | height, width = image_size[0], image_size[1] 43 | images = self.pipe(prompt=prompt, 44 | num_inference_steps=self.num_steps, 45 | height=height, 46 | width=width, 47 | latents=self.latents, 48 | vae_type=self.vae_type, 49 | guidance_scale=0.0).images 50 | 51 | yield images[0] 52 | 53 | async def main(): 54 | model = QAICStableDiffusion() 55 | prompt = 'photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece' 56 | idx = 0 57 | async for image in model.generate(prompt, 1, (512, 512)): 58 | image.save('generated_image_{}.png'.format(idx)) 59 | idx += 1 60 | 61 | if __name__ == "__main__": 62 | asyncio.run(main()) 63 | 64 | -------------------------------------------------------------------------------- /samples/python/vit_qaic/example.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
3 | SPDX-License-Identifier: BSD-3-Clause-Clear 4 | ''' 5 | 6 | import sys 7 | sys.path.append("/opt/qti-aic/examples/apps/qaic-python-sdk") 8 | import qaic 9 | import numpy as np 10 | import torchvision 11 | import torch 12 | import pandas as pd 13 | import os 14 | sys.path.insert(1, os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir)) 15 | from common_utils import generate_bin 16 | from transformers import ViTImageProcessor, ViTForImageClassification 17 | from PIL import Image 18 | import requests 19 | import onnx 20 | from onnxsim import simplify 21 | 22 | image_size = 224 23 | 24 | model_name = f'vit-base-patch16-{image_size}' 25 | 26 | # Import the model 27 | model = ViTForImageClassification.from_pretrained(f'google/{model_name}') 28 | onnx_filename = f'{model_name}.onnx' 29 | 30 | # Export the PyTorch model to ONNX 31 | dummy_input = torch.randn(1, 3, image_size, image_size).type(torch.FloatTensor) 32 | torch.onnx.export(model, # PyTorch model 33 | dummy_input, # Input tensor 34 | onnx_filename, # Output file 35 | export_params=True, # Export the model parameters 36 | opset_version=11, # ONNX opset version 37 | do_constant_folding=True, # Fold constant values for optimization 38 | input_names=['image'], # Input tensor names 39 | output_names=['output'], # Output tensor names 40 | dynamic_axes={'image': {0: 'batch_size'}, # Dynamic axes 41 | 'output': {0: 'batch_size'}}) 42 | 43 | # apply onnxsim (optional) 44 | onnx_model = onnx.load(onnx_filename) 45 | onnx_model_simp, check = simplify(onnx_model) 46 | onnx.save(onnx_model_simp, onnx_filename) 47 | print("ONNX model saved at: ", onnx_filename) 48 | 49 | # Generate binary for QAIC by default the binary using a helper library. 50 | qpcPath = generate_bin(onnx_filename = onnx_filename , yaml_filename ='./vit_config.yaml') # return path to the folder containing compiled binary. 51 | 52 | # Compile and load the model 53 | vit_sess = qaic.Session(model_path= qpcPath+'/programqpc.bin', options_path='./vit_config.yaml') 54 | vit_sess.setup() 55 | input_shape, input_type = vit_sess.model_input_shape_dict['image'] 56 | output_shape, output_type = vit_sess.model_output_shape_dict['output'] 57 | 58 | processor = ViTImageProcessor.from_pretrained(f'google/{model_name}') 59 | 60 | # input sample 61 | url = 'http://images.cocodataset.org/val2017/000000039769.jpg' 62 | image = Image.open(requests.get(url, stream=True).raw) 63 | inputs = processor(images=image, return_tensors="pt") 64 | 65 | device = True 66 | if device: 67 | print("INFO: running inference on Qualcomm Cloud AI 100") 68 | input_data = inputs['pixel_values'].numpy().astype(input_type) 69 | input_dict = {'image': input_data} 70 | output = vit_sess.run(input_dict) 71 | logits = np.frombuffer(output['output'], dtype=output_type).reshape(output_shape) # dtype to be modified based on given model 72 | else: 73 | print("INFO: running inference on CPU") 74 | outputs = model(**inputs) 75 | logits = outputs.logits 76 | 77 | predicted_class_idx = logits.argmax(-1).item() 78 | print("Predicted class:", model.config.id2label[predicted_class_idx]) -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | from contextlib import asynccontextmanager 5 | from fastapi import FastAPI, HTTPException, Request 6 | from typing import Optional 7 | from pydantic import BaseModel 8 | import time 9 | import base64 10 | import time 11 | 12 | from io import BytesIO 13 | 14 | from model import QAICStableDiffusion 15 | 16 | @asynccontextmanager 17 | async def lifespan(app: FastAPI): 18 | # Code to run before the application starts 19 | print("Application startup") 20 | 21 | app.model = QAICStableDiffusion(device_id=args.device) 22 | 23 | yield 24 | # Code to run when the application shuts down 25 | print("Application shutdown") 26 | 27 | app = FastAPI(lifespan=lifespan) 28 | 29 | class ImageRequest(BaseModel): 30 | model: str 31 | prompt: str 32 | n: Optional[int] = 1 33 | size: Optional[str] = '512x512' 34 | response_format: Optional[str] = 'b64_json' 35 | 36 | @app.get("/v1/models") 37 | async def get_models(): 38 | try: 39 | response = { 40 | "object": "list", 41 | "data": [ 42 | { 43 | "id": "sdxl-turbo", 44 | "object": "model", 45 | "created": 1746296172, 46 | "owned_by": "system" 47 | } 48 | ], 49 | } 50 | 51 | return {"response": response} 52 | except Exception as e: 53 | raise HTTPException(status_code=500, detail=str(e)) 54 | 55 | @app.post("/v1/images/generations") 56 | async def generate_images(image_request: ImageRequest): 57 | print(image_request) 58 | utc_seconds = time.time() 59 | 60 | size = [int(dim) for dim in image_request.size.split('x')] 61 | 62 | try: 63 | async for image in app.model.generate(image_request.prompt, 64 | image_request.n, 65 | size): 66 | buffered = BytesIO() 67 | image.save(buffered, format='PNG') 68 | b64_json = base64.b64encode(buffered.getvalue()).decode() 69 | 70 | response = { 71 | "created": int(utc_seconds), 72 | "data": [ 73 | { 74 | "b64_json": b64_json 75 | } 76 | ] 77 | } 78 | 79 | return response 80 | 81 | except Exception as e: 82 | raise HTTPException(status_code=500, detail=str(e)) 83 | 84 | if __name__ == "__main__": 85 | import uvicorn 86 | import argparse 87 | 88 | parser = argparse.ArgumentParser(description="SDXL-Turbo REST endpoint") 89 | 90 | parser.add_argument( 91 | "--host", 92 | type=str, 93 | help="IP address", 94 | default="0.0.0.0" 95 | ) 96 | 97 | parser.add_argument( 98 | "--port", 99 | type=int, 100 | help="Port", 101 | default=8000 102 | ) 103 | 104 | parser.add_argument( 105 | "--device", 106 | type=int, 107 | help="Cloud AI device", 108 | default=0 109 | ) 110 | 111 | args = parser.parse_args() 112 | 113 | uvicorn.run(app, host=args.host, port=args.port) 114 | 115 | -------------------------------------------------------------------------------- /CODE-OF-CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team. All complaints will be reviewed 59 | and investigated and will result in a response that is deemed necessary and 60 | appropriate to the circumstances. The project team is obligated to maintain 61 | confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-xl-base-1.0/attention_patch.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py 2 | index 21eb3a32..1df1b09c 100644 3 | --- a/src/diffusers/models/attention_processor.py 4 | +++ b/src/diffusers/models/attention_processor.py 5 | @@ -200,10 +200,8 @@ class Attention(nn.Module): 6 | # We use the AttnProcessor2_0 by default when torch 2.x is used which uses 7 | # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention 8 | # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1 9 | - if processor is None: 10 | - processor = ( 11 | - AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor() 12 | - ) 13 | + # force to not use FlashAttention 14 | + processor = AttnProcessor() 15 | self.set_processor(processor) 16 | 17 | def set_use_memory_efficient_attention_xformers( 18 | @@ -588,7 +586,9 @@ class Attention(nn.Module): 19 | 20 | if attention_mask is None: 21 | baddbmm_input = torch.empty( 22 | - query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device 23 | + query.shape[0], query.shape[1], 24 | + key.shape[2], # key is already transposed 25 | + dtype=query.dtype, device=query.device 26 | ) 27 | beta = 0 28 | else: 29 | @@ -598,7 +598,7 @@ class Attention(nn.Module): 30 | attention_scores = torch.baddbmm( 31 | baddbmm_input, 32 | query, 33 | - key.transpose(-1, -2), 34 | + key, # key is already transposed 35 | beta=beta, 36 | alpha=self.scale, 37 | ) 38 | @@ -740,8 +740,26 @@ class AttnProcessor: 39 | key = attn.head_to_batch_dim(key) 40 | value = attn.head_to_batch_dim(value) 41 | 42 | - attention_probs = attn.get_attention_scores(query, key, attention_mask) 43 | - hidden_states = torch.bmm(attention_probs, value) 44 | + # pre-transpose the key 45 | + key = key.transpose(-1, -2) 46 | + if query.size(-2) != value.size(-2): # cross-attention, use regular attention 47 | + # QKV done in single block 48 | + attention_probs = attn.get_attention_scores(query, key, attention_mask) 49 | + hidden_states = torch.bmm(attention_probs, value) 50 | + else: # self-attention, use blocked attention 51 | + # QKV done with block-attention (a la FlashAttentionV2) 52 | + print(f"{query.shape = }, {key.shape = }, {value.shape = }") 53 | + query_block_size = 128 54 | + query_seq_len = query.size(-2) 55 | + num_blocks = (query_seq_len + query_block_size - 1) // query_block_size 56 | + for qidx in range(num_blocks): 57 | + query_block = query[:,qidx*query_block_size:(qidx+1)*query_block_size,:] 58 | + attention_probs = attn.get_attention_scores(query_block, key, attention_mask) 59 | + hidden_states_block = torch.bmm(attention_probs, value) 60 | + if qidx == 0: 61 | + hidden_states = hidden_states_block 62 | + else: 63 | + hidden_states = torch.cat((hidden_states, hidden_states_block), -2) 64 | hidden_states = attn.batch_to_head_dim(hidden_states) 65 | 66 | # linear proj 67 | 
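
The patch above replaces the single large self-attention matmul with per-query-block matmuls. As a standalone illustration (not part of the repository), here is a minimal PyTorch sketch of the same query-blocking scheme; `blocked_attention` and `block_size` are illustrative names, with `block_size=128` chosen to match the patch's `query_block_size = 128`. It produces the same result as unblocked attention because each query block still attends over the full key/value length.

```python
import torch

def blocked_attention(query, key_t, value, scale, block_size=128):
    """Attention with the query split into blocks, as in the patch above.

    query, value: (batch, seq, dim); key_t: (batch, dim, seq), i.e. the key pre-transposed.
    """
    num_blocks = (query.size(-2) + block_size - 1) // block_size
    outputs = []
    for i in range(num_blocks):
        q_blk = query[:, i * block_size:(i + 1) * block_size, :]
        probs = torch.softmax(torch.bmm(q_blk, key_t) * scale, dim=-1)
        outputs.append(torch.bmm(probs, value))
    return torch.cat(outputs, dim=-2)

# Sanity check: query blocking is mathematically identical to regular attention,
# since each query row's softmax still spans the full key length.
q, k, v = (torch.randn(2, 512, 64) for _ in range(3))
scale = 64 ** -0.5
ref = torch.bmm(torch.softmax(torch.bmm(q, k.transpose(-1, -2)) * scale, dim=-1), v)
out = blocked_attention(q, k.transpose(-1, -2), v, scale)
assert torch.allclose(ref, out, atol=1e-5)
```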
-------------------------------------------------------------------------------- /models/speech/whisper/runModel.py: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | 6 | import os 7 | from datasets import load_dataset 8 | from transformers import WhisperProcessor 9 | import whisper 10 | import numpy as np 11 | import torch 12 | from audio import AudioSample 13 | import qaic 14 | 15 | model_name = 'base' 16 | aic_path = './whisper_AIC' 17 | 18 | # Select an audio file and read it: 19 | audio_sample = AudioSample() 20 | audio_path = audio_sample.to_file() 21 | audio = whisper.load_audio(audio_path) # Read audio from file 22 | audio_pad = whisper.pad_or_trim(audio) # Padding and trimming 23 | # make log-Mel spectrogram and move to the same device as the model 24 | input_features = whisper.log_mel_spectrogram(audio_pad) # convert to mel spectrogram 25 | 26 | # Load the Whisper processor for parsing results 27 | processor = WhisperProcessor.from_pretrained('openai/whisper-{}'.format(model_name)) 28 | 29 | eot = 50257 # end of transcript token 30 | startoftranscript = 50258 # start of transcript token 31 | 32 | decoder_sequence_length=150 33 | 34 | def run_AIC(input_features, device_id=0): 35 | # Load both encoder and decoder models into Cloud AI accelerator memory 36 | # via oversubscription. 37 | # The number of NSP cores required is the maximum of the numbers of cores 38 | # for which encoder and decoder are compiled. 39 | # If encoder is compiled for 4 cores and decoder is compiled for 12 cores, 40 | # then the max usage is 12 cores. 41 | # Since encoder and decoder don't run at the same time, this allows us to 42 | # efficiently utilize the available cores. 
43 | 44 | encoder_sess = qaic.Session( 45 | model_path=os.path.join(aic_path, 'whisper-encoder', 'programqpc.bin'), 46 | num_activations=1, 47 | set_size=1, 48 | dev_id=device_id, 49 | oversubscription_name='group1') 50 | 51 | decoder_sess = qaic.Session( 52 | model_path=os.path.join(aic_path, 'whisper-decoder', 'programqpc.bin'), 53 | num_activations=1, 54 | set_size=1, 55 | dev_id=device_id, 56 | oversubscription_name='group1') 57 | 58 | encoder_inputs = { 59 | 'input_features': input_features.numpy().astype(np.float32).reshape(1,80,3000) 60 | } 61 | 62 | audio_features = encoder_sess.run(encoder_inputs)['last_hidden_state'] 63 | 64 | next_token = None 65 | tokens = [startoftranscript] 66 | decoder_input_ids = np.zeros((1, decoder_sequence_length), dtype=np.int64) 67 | decoder_input_ids[:,0] = startoftranscript 68 | 69 | for iter in range(decoder_sequence_length): 70 | if iter > 0: 71 | decoder_input_ids[:,iter] = next_token.item() 72 | 73 | decoder_inputs = { 74 | 'input_ids': decoder_input_ids, 75 | 'encoder_hidden_states': audio_features, 76 | } 77 | 78 | logits = decoder_sess.run(decoder_inputs)['logits'] 79 | logits = logits[:,iter,:] 80 | 81 | next_token = logits.argmax(axis=-1) 82 | tokens.append(next_token.item()) 83 | 84 | if next_token == eot: # stop at end-of-transcript token 85 | break 86 | 87 | transcription = processor.batch_decode(tokens, skip_special_tokens=False) 88 | print("result:", transcription) 89 | 90 | if __name__ == '__main__': 91 | run_AIC(input_features) 92 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/run_config_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #################################################################################################### 4 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
5 | # SPDX-License-Identifier: BSD-3-Clause-Clear 6 | #################################################################################################### 7 | 8 | # model configs 9 | MODEL_PATH="stabilityai/stable-diffusion-3.5-medium" 10 | PROMPT="\"$1\"" 11 | NEG_PROMPT="\"$2\"" 12 | GUIDANCE=7.0 13 | VAE_TYPE="vae" 14 | IMAGE_SIZE=1024 15 | BLOCK_SIZE=64 16 | BATCH_SIZE=1 17 | 18 | # onnx configs 19 | GENERATE_ONNX=false 20 | ONNX_TEXT_ENCODER=false 21 | ONNX_TRANSFORMER=false 22 | ONNX_VAE=false 23 | 24 | # compile configs 25 | NUM_CORES=16 26 | VAE_MOS=2 27 | VAE_OLS=1 28 | TRANSFORMER_MOS=1 29 | TRANSFORMER_OLS=2 30 | COMPILE_TEXT_ENCODER=false 31 | COMPILE_TRANSFORMER=false 32 | COMPILE_VAE=false 33 | 34 | # inference configs 35 | RUN_ONLY=true 36 | TEXT_ENCODER_3=false 37 | DEVICE=0 38 | DEVICE2=1 39 | NUM_STEPS=28 40 | WARMUP_ITERS=3 41 | REPEAT_ITERS=3 42 | 43 | # mode 44 | TOGETHER=false 45 | 46 | if [ ${GENERATE_ONNX} == true ] 47 | then 48 | GENERATE_ONNX_CMD="--generate-onnx" 49 | else 50 | GENERATE_ONNX_CMD="" 51 | fi 52 | 53 | if [ ${ONNX_TEXT_ENCODER} == true ] 54 | then 55 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder" 56 | else 57 | ONNX_TEXT_ENCODER_CMD="" 58 | fi 59 | 60 | if [ ${ONNX_TRANSFORMER} == true ] 61 | then 62 | ONNX_TRANSFORMER_CMD="--onnx-transformer" 63 | else 64 | ONNX_TRANSFORMER_CMD="" 65 | fi 66 | 67 | if [ ${ONNX_VAE} == true ] 68 | then 69 | ONNX_VAE_CMD="--onnx-vae" 70 | else 71 | ONNX_VAE_CMD="" 72 | fi 73 | 74 | if [ ${COMPILE_TEXT_ENCODER} == true ] 75 | then 76 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder" 77 | else 78 | COMPILE_TEXT_ENCODER_CMD="" 79 | fi 80 | 81 | if [ ${COMPILE_TRANSFORMER} == true ] 82 | then 83 | COMPILE_TRANSFORMER_CMD="--compile-transformer" 84 | else 85 | COMPILE_TRANSFORMER_CMD="" 86 | fi 87 | 88 | if [ ${COMPILE_VAE} == true ] 89 | then 90 | COMPILE_VAE_CMD="--compile-vae" 91 | else 92 | COMPILE_VAE_CMD="" 93 | fi 94 | 95 | if [ ${RUN_ONLY} == true ] 96 | then 97 | RUN_ONLY_CMD="--run-only" 98 | else 99 | RUN_ONLY_CMD="" 100 | fi 101 | 102 | if [ ${TEXT_ENCODER_3} == true ] 103 | then 104 | TEXT_ENCODER_3_CMD="--text-encoder-3" 105 | else 106 | TEXT_ENCODER_3_CMD="" 107 | fi 108 | 109 | if [ ${TOGETHER} == true ] 110 | then 111 | TOGETHER_CMD="--together" 112 | else 113 | TOGETHER_CMD="" 114 | fi 115 | 116 | export HF_HOME="cache" 117 | 118 | rm run.sh 119 | 120 | scripts="python main.py \ 121 | --model-path $MODEL_PATH \ 122 | --prompt $PROMPT \ 123 | --neg_prompt $NEG_PROMPT \ 124 | --guidance $GUIDANCE \ 125 | --vae-type $VAE_TYPE \ 126 | --batch-size $BATCH_SIZE \ 127 | --image-size $IMAGE_SIZE \ 128 | --block-size $BLOCK_SIZE \ 129 | --num-cores $NUM_CORES \ 130 | --vae-mos $VAE_MOS \ 131 | --vae-ols $VAE_OLS \ 132 | --transformer-mos $TRANSFORMER_MOS \ 133 | --transformer-ols $TRANSFORMER_OLS \ 134 | --device-id $DEVICE \ 135 | --device-id2 $DEVICE2 \ 136 | --num-steps $NUM_STEPS \ 137 | --num-warmup-iters $WARMUP_ITERS \ 138 | --num-repeat-iters $REPEAT_ITERS \ 139 | $ONNX_TEXT_ENCODER_CMD \ 140 | $ONNX_TRANSFORMER_CMD \ 141 | $ONNX_VAE_CMD \ 142 | $COMPILE_TEXT_ENCODER_CMD \ 143 | $COMPILE_TRANSFORMER_CMD \ 144 | $COMPILE_VAE_CMD \ 145 | $GENERATE_ONNX_CMD \ 146 | $RUN_ONLY_CMD \ 147 | $TEXT_ENCODER_3_CMD \ 148 | $TOGETHER_CMD" 149 | 150 | echo $scripts >> run.sh 151 | 152 | bash run.sh 153 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/README.md: 
-------------------------------------------------------------------------------- 1 | ### Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 2 | ### SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | # Instructions to run SDXL on Cloud AI 100 with DeepCache 5 | 6 | The instructions below are to run the [Stable Diffusion XL model](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) with [DeepCache](https://github.com/horseee/DeepCache) on Cloud AI 100. 7 | 8 | 9 | ## Pre-requisites 10 | 11 | Install the moreutils package for the `ts` timestamp tool: 12 | ``` 13 | sudo apt update 14 | sudo apt-get install moreutils 15 | ``` 16 | 17 | Install Git Large File System (LFS) support 18 | 19 | ``` 20 | sudo apt update 21 | sudo apt-get install git-lfs 22 | ``` 23 | 24 | ## 1. Generate onnx files and compile for binaries 25 | 26 | 1. Set up a virtual environment for ONNX generation and compilation 27 | ``` 28 | python3.10 -m venv env_onnx 29 | source ./env_onnx/bin/activate 30 | pip install -r requirements.txt 31 | ``` 32 | 33 | 2. Setup environments 34 | ``` 35 | mkdir cache 36 | mkdir qpc 37 | mkdir compile_logs 38 | ``` 39 | 40 | 3. Install diffusers from source after patching for ONNX file generation 41 | ``` 42 | git clone --depth 1 --branch v0.24.0 https://github.com/huggingface/diffusers.git diffusers_onnx 43 | cd diffusers_onnx 44 | git apply --reject --whitespace=fix ../patches/attention_patch.patch 45 | pip install . 46 | cd .. 47 | ``` 48 | 49 | 4. Install DeepCache for ONNX file generation (deep UNet) 50 | ``` 51 | git clone https://github.com/horseee/DeepCache.git 52 | cd DeepCache 53 | git apply --reject --whitespace=fix ../patches/deepcache_unet.patch 54 | pip install . 55 | cd .. 56 | ``` 57 | 58 | 5. Prepare VAE Decoder 59 | ``` 60 | export GIT_LFS_SKIP_SMUDGE=1 61 | git clone https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0 cache/stabilityai/stable-diffusion-xl-base-1.0 62 | cd cache/stabilityai/stable-diffusion-xl-base-1.0 63 | git lfs pull -I vae_decoder/model.onnx 64 | rm -rf .git/lfs # optional to save space 65 | cd ../../../ 66 | ``` 67 | 68 | 6. Generate ONNX files and compile for binaries 69 | ``` 70 | touch run.sh 71 | bash run_config_deep.sh 72 | ``` 73 | 74 | 7. Modify the UNet to be the shallow version 75 | ``` 76 | sed -i '963s/False/True/' env_onnx/lib/python3.10/site-packages/DeepCache/sdxl/unet_2d_condition.py 77 | ``` 78 | 79 | 8. Generate ONNX file and compile shallow UNet for DeepCache 80 | ``` 81 | bash run_config_shallow.sh 82 | ``` 83 | 84 | ## 2. Run the end-to-end SDXL inference 85 | 86 | 1. Set up a separate virtual environment for running SDXL 87 | ``` 88 | python3.10 -m venv env_pipeline 89 | source ./env_pipeline/bin/activate 90 | pip install -r requirements.txt 91 | pip install --force-reinstall /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl 92 | ``` 93 | 94 | 2. Re-install diffusers and DeepCache from source after patching the SDXL pipeline for inference 95 | ``` 96 | git clone --depth 1 --branch v0.24.0 https://github.com/huggingface/diffusers.git diffusers_pipeline 97 | cd diffusers_pipeline 98 | pip install . 99 | cd .. 100 | ``` 101 | 102 | 3. Install DeepCache and prepare the pipeline for inference 103 | ``` 104 | git clone https://github.com/horseee/DeepCache.git deepcache_pipeline 105 | cd deepcache_pipeline 106 | git apply --reject --whitespace=fix ../patches/deepcache_pipeline.patch 107 | pip install . 108 | cd .. 109 | ``` 110 | 111 | 4. 
Run the SDXL inference with 'sudo' flag if needed to access the AI100 devices. 112 | ``` 113 | sudo bash run_config_inference.sh $(which python3) 114 | ``` 115 | Note: ```CACHE_INTERVAL``` variable in ```run_config_inference.sh``` refers to the period of caching 116 | 117 | -------------------------------------------------------------------------------- /samples/python/aws_ai100_benchmarking/cv_classifiers/run_cv_classifiers.sh: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 | # @@-COPYRIGHT-START-@@ 3 | # 4 | # Copyright (c) 2023, Qualcomm Technologies, Inc. All Rights Reserved. 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are met: 7 | # 8 | # 1. Redistributions of source code must retain the above copyright notice, 9 | # this list of conditions and the following disclaimer. 10 | # 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # 3. Neither the name of the copyright holder nor the names of its contributors 16 | # may be used to endorse or promote products derived from this software 17 | # without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 23 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | # POSSIBILITY OF SUCH DAMAGE. 
30 | # 31 | # SPDX-License-Identifier: BSD-3-Clause 32 | # 33 | # @@-COPYRIGHT-END-@@ 34 | ############################################################################## 35 | 36 | #!/bin/bash 37 | 38 | mkdir -p ./resnet-152/ 39 | echo python run_cv_classifier.py -m resnet-152 -o best-latency "$@" 40 | python run_cv_classifier.py -m resnet-152 -o best-latency "$@" | tee -a ./resnet-152/best-latency.log 41 | echo python run_cv_classifier.py -m resnet-152 -o balanced "$@" 42 | python run_cv_classifier.py -m resnet-152 -o balanced "$@" | tee -a ./resnet-152/balanced.log 43 | echo python run_cv_classifier.py -m resnet-152 -o best-throughput "$@" 44 | python run_cv_classifier.py -m resnet-152 -o best-throughput "$@" | tee -a ./resnet-152/best-throughput.log 45 | 46 | mkdir -p ./resnet-50/ 47 | echo python run_cv_classifier.py -m resnet-50 -o best-latency "$@" 48 | python run_cv_classifier.py -m resnet-50 -o best-latency "$@" | tee -a ./resnet-50/best-latency.log 49 | echo python run_cv_classifier.py -m resnet-50 -o balanced "$@" 50 | python run_cv_classifier.py -m resnet-50 -o balanced "$@" | tee -a ./resnet-50/balanced.log 51 | echo python run_cv_classifier.py -m resnet-50 -o best-throughput "$@" 52 | python run_cv_classifier.py -m resnet-50 -o best-throughput "$@" | tee -a ./resnet-50/best-throughput.log 53 | 54 | mkdir -p ./vit-base-patch16-224/ 55 | echo python run_cv_classifier.py -m vit-base-patch16-224 -o best-latency "$@" 56 | python run_cv_classifier.py -m vit-base-patch16-224 -o best-latency "$@" | tee -a ./vit-base-patch16-224/best-latency.log 57 | echo python run_cv_classifier.py -m vit-base-patch16-224 -o balanced "$@" 58 | python run_cv_classifier.py -m vit-base-patch16-224 -o balanced "$@" | tee -a ./vit-base-patch16-224/balanced.log 59 | echo python run_cv_classifier.py -m vit-base-patch16-224 -o best-throughput "$@" 60 | python run_cv_classifier.py -m vit-base-patch16-224 -o best-throughput "$@" | tee -a ./vit-base-patch16-224/best-throughput.log 61 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/run_config_gen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #################################################################################################### 4 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
5 | # SPDX-License-Identifier: BSD-3-Clause-Clear 6 | #################################################################################################### 7 | 8 | # model configs 9 | MODEL_PATH="stabilityai/stable-diffusion-3.5-medium" 10 | VAE_TYPE="vae" 11 | IMAGE_SIZE=1024 12 | BLOCK_SIZE=64 13 | BATCH_SIZE=1 14 | 15 | # onnx configs 16 | GENERATE_ONNX=true 17 | ONNX_TEXT_ENCODER=true 18 | ONNX_TEXT_ENCODER_3=true 19 | ONNX_TRANSFORMER=true 20 | ONNX_VAE=true 21 | 22 | # compile configs 23 | NUM_CORES=16 24 | VAE_MOS=2 25 | VAE_OLS=1 26 | TRANSFORMER_MOS=1 27 | TRANSFORMER_OLS=2 28 | COMPILE_TEXT_ENCODER=true 29 | COMPILE_TEXT_ENCODER_3=false 30 | COMPILE_TRANSFORMER=true 31 | COMPILE_VAE=true 32 | 33 | # inference configs 34 | RUN_ONLY=false 35 | DEVICE=0 36 | DEVICE2=1 37 | NUM_STEPS=1 38 | WARMUP_ITERS=3 39 | REPEAT_ITERS=1 40 | 41 | # mode 42 | TOGETHER=false 43 | 44 | if [ ${GENERATE_ONNX} == true ] 45 | then 46 | GENERATE_ONNX_CMD="--generate-onnx" 47 | else 48 | GENERATE_ONNX_CMD="" 49 | fi 50 | 51 | if [ ${ONNX_TEXT_ENCODER} == true ] 52 | then 53 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder" 54 | else 55 | ONNX_TEXT_ENCODER_CMD="" 56 | fi 57 | 58 | if [ ${ONNX_TEXT_ENCODER_3} == true ] 59 | then 60 | ONNX_TEXT_ENCODER_3_CMD="--onnx-text-encoder-3" 61 | else 62 | ONNX_TEXT_ENCODER_3_CMD="" 63 | fi 64 | 65 | if [ ${ONNX_TRANSFORMER} == true ] 66 | then 67 | ONNX_TRANSFORMER_CMD="--onnx-transformer" 68 | else 69 | ONNX_TRANSFORMER_CMD="" 70 | fi 71 | 72 | if [ ${ONNX_VAE} == true ] 73 | then 74 | ONNX_VAE_CMD="--onnx-vae" 75 | else 76 | ONNX_VAE_CMD="" 77 | fi 78 | 79 | if [ ${COMPILE_TEXT_ENCODER} == true ] 80 | then 81 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder" 82 | else 83 | COMPILE_TEXT_ENCODER_CMD="" 84 | fi 85 | 86 | if [ ${COMPILE_TEXT_ENCODER_3} == true ] 87 | then 88 | COMPILE_TEXT_ENCODER_3_CMD="--compile-text-encoder-3" 89 | else 90 | COMPILE_TEXT_ENCODER_3_CMD="" 91 | fi 92 | 93 | if [ ${COMPILE_TRANSFORMER} == true ] 94 | then 95 | COMPILE_TRANSFORMER_CMD="--compile-transformer" 96 | else 97 | COMPILE_TRANSFORMER_CMD="" 98 | fi 99 | 100 | if [ ${COMPILE_VAE} == true ] 101 | then 102 | COMPILE_VAE_CMD="--compile-vae" 103 | else 104 | COMPILE_VAE_CMD="" 105 | fi 106 | 107 | if [ ${RUN_ONLY} == true ] 108 | then 109 | RUN_ONLY_CMD="--run-only" 110 | else 111 | RUN_ONLY_CMD="" 112 | fi 113 | 114 | if [ ${TOGETHER} == true ] 115 | then 116 | TOGETHER_CMD="--together" 117 | else 118 | TOGETHER_CMD="" 119 | fi 120 | 121 | export HF_HOME="cache" 122 | 123 | rm run.sh 124 | 125 | scripts="python main.py \ 126 | --model-path $MODEL_PATH \ 127 | --vae-type $VAE_TYPE \ 128 | --batch-size $BATCH_SIZE \ 129 | --image-size $IMAGE_SIZE \ 130 | --block-size $BLOCK_SIZE \ 131 | --num-cores $NUM_CORES \ 132 | --vae-mos $VAE_MOS \ 133 | --vae-ols $VAE_OLS \ 134 | --transformer-mos $TRANSFORMER_MOS \ 135 | --transformer-ols $TRANSFORMER_OLS \ 136 | --device-id $DEVICE \ 137 | --device-id2 $DEVICE2 \ 138 | --num-steps $NUM_STEPS \ 139 | --num-warmup-iters $WARMUP_ITERS \ 140 | --num-repeat-iters $REPEAT_ITERS \ 141 | $ONNX_TEXT_ENCODER_CMD \ 142 | $ONNX_TEXT_ENCODER_3_CMD \ 143 | $ONNX_TRANSFORMER_CMD \ 144 | $ONNX_VAE_CMD \ 145 | $COMPILE_TEXT_ENCODER_CMD \ 146 | $COMPILE_TEXT_ENCODER_3_CMD \ 147 | $COMPILE_TRANSFORMER_CMD \ 148 | $COMPILE_VAE_CMD \ 149 | $GENERATE_ONNX_CMD \ 150 | $RUN_ONLY_CMD \ 151 | $TOGETHER_CMD" 152 | 153 | echo $scripts >> run.sh 154 | 155 | bash run.sh 156 | -------------------------------------------------------------------------------- 
/models/multimodal/text_to_image/sdxl_deepcache/run_config_deep.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #################################################################################################### 4 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 5 | # SPDX-License-Identifier: BSD-3-Clause-Clear 6 | #################################################################################################### 7 | 8 | # model configs 9 | MODEL_PATH="stabilityai/stable-diffusion-xl-base-1.0" 10 | PROMPT="\"A cinematic shot of a baby racoon wearing an intricate italian priest robe.\"" 11 | VAE_TYPE="vae" 12 | UNET_TYPE="deep" 13 | IMAGE_SIZE=1024 14 | BLOCK_SIZE_DEEP=256 15 | BLOCK_SIZE_SHALLOW=128 16 | BATCH_SIZE=1 17 | PRECISION=fp16,fp16,fp16,fp16 18 | 19 | # onnx configs 20 | GENERATE_ONNX=true 21 | ONNX_TEXT_ENCODER=true 22 | ONNX_UNET=true 23 | ONNX_VAE=true 24 | 25 | # compile configs 26 | NUM_CORES=16 27 | VAE_MOS=2 28 | VAE_OLS=1 29 | UNET_MOS_DEEP=2 30 | UNET_OLS_DEEP=1 31 | UNET_MOS_SHALLOW=1 32 | UNET_OLS_SHALLOW=2 33 | COMPILE_TEXT_ENCODER=true 34 | COMPILE_UNET=true 35 | COMPILE_VAE=true 36 | 37 | # inference configs 38 | RUN_ONLY=false 39 | DEVICE=0 40 | DEVICE_2=1 41 | NUM_STEPS=20 42 | WARMUP_ITERS=3 43 | REPEAT_ITERS=3 44 | 45 | # mode 46 | TOGETHER=false 47 | 48 | if [ ${GENERATE_ONNX} == true ] 49 | then 50 | GENERATE_ONNX_CMD="--generate-onnx" 51 | else 52 | GENERATE_ONNX_CMD="" 53 | fi 54 | 55 | if [ ${ONNX_TEXT_ENCODER} == true ] 56 | then 57 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder" 58 | else 59 | ONNX_TEXT_ENCODER_CMD="" 60 | fi 61 | 62 | if [ ${ONNX_UNET} == true ] 63 | then 64 | ONNX_UNET_CMD="--onnx-unet" 65 | else 66 | ONNX_UNET_CMD="" 67 | fi 68 | 69 | if [ ${ONNX_VAE} == true ] 70 | then 71 | ONNX_VAE_CMD="--onnx-vae" 72 | else 73 | ONNX_VAE_CMD="" 74 | fi 75 | 76 | if [ ${COMPILE_TEXT_ENCODER} == true ] 77 | then 78 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder" 79 | else 80 | COMPILE_TEXT_ENCODER_CMD="" 81 | fi 82 | 83 | if [ ${COMPILE_UNET} == true ] 84 | then 85 | COMPILE_UNET_CMD="--compile-unet" 86 | else 87 | COMPILE_UNET_CMD="" 88 | fi 89 | 90 | if [ ${COMPILE_VAE} == true ] 91 | then 92 | COMPILE_VAE_CMD="--compile-vae" 93 | else 94 | COMPILE_VAE_CMD="" 95 | fi 96 | 97 | if [ ${RUN_ONLY} == true ] 98 | then 99 | RUN_ONLY_CMD="--run-only" 100 | else 101 | RUN_ONLY_CMD="" 102 | fi 103 | 104 | if [ ${TOGETHER} == true ] 105 | then 106 | TOGETHER_CMD="--together" 107 | else 108 | TOGETHER_CMD="" 109 | fi 110 | 111 | export HF_HOME="cache" 112 | sed -i 's/query_block_size = 128/query_block_size = 256/g' ./env_onnx/lib/python3.10/site-packages/diffusers/models/attention_processor.py 113 | 114 | rm run.sh 115 | 116 | scripts="python main.py \ 117 | --model-path $MODEL_PATH \ 118 | --prompt $PROMPT \ 119 | --unet-type $UNET_TYPE \ 120 | --vae-type $VAE_TYPE \ 121 | --batch-size $BATCH_SIZE \ 122 | --image-size $IMAGE_SIZE \ 123 | --block-size-deep $BLOCK_SIZE_DEEP \ 124 | --block-size-shallow $BLOCK_SIZE_SHALLOW \ 125 | --num-cores $NUM_CORES \ 126 | --vae-mos $VAE_MOS \ 127 | --vae-ols $VAE_OLS \ 128 | --unet-mos-deep $UNET_MOS_DEEP \ 129 | --unet-ols-deep $UNET_OLS_DEEP \ 130 | --unet-mos-shallow $UNET_MOS_SHALLOW \ 131 | --unet-ols-shallow $UNET_OLS_SHALLOW \ 132 | --device-id $DEVICE \ 133 | --device-id-2 $DEVICE_2 \ 134 | --num-steps $NUM_STEPS \ 135 | --num-warmup-iters $WARMUP_ITERS \ 136 | --num-repeat-iters $REPEAT_ITERS \ 137 | --precision 
$PRECISION \ 138 | $ONNX_TEXT_ENCODER_CMD \ 139 | $ONNX_UNET_CMD \ 140 | $ONNX_VAE_CMD \ 141 | $COMPILE_TEXT_ENCODER_CMD \ 142 | $COMPILE_UNET_CMD \ 143 | $COMPILE_VAE_CMD \ 144 | $GENERATE_ONNX_CMD \ 145 | $RUN_ONLY_CMD \ 146 | $TOGETHER_CMD" 147 | 148 | echo $scripts >> run.sh 149 | 150 | bash run.sh 151 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/run_config_shallow.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #################################################################################################### 4 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 5 | # SPDX-License-Identifier: BSD-3-Clause-Clear 6 | #################################################################################################### 7 | 8 | # model configs 9 | MODEL_PATH="stabilityai/stable-diffusion-xl-base-1.0" 10 | PROMPT="\"A cinematic shot of a baby racoon wearing an intricate italian priest robe.\"" 11 | VAE_TYPE="vae" 12 | UNET_TYPE="shallow" 13 | IMAGE_SIZE=1024 14 | BLOCK_SIZE_DEEP=256 15 | BLOCK_SIZE_SHALLOW=128 16 | BATCH_SIZE=1 17 | PRECISION=fp16,fp16,fp16,fp16 18 | 19 | # onnx configs 20 | GENERATE_ONNX=true 21 | ONNX_TEXT_ENCODER=false 22 | ONNX_UNET=true 23 | ONNX_VAE=false 24 | 25 | # compile configs 26 | NUM_CORES=16 27 | VAE_MOS=2 28 | VAE_OLS=1 29 | UNET_MOS_DEEP=2 30 | UNET_OLS_DEEP=1 31 | UNET_MOS_SHALLOW=1 32 | UNET_OLS_SHALLOW=2 33 | COMPILE_TEXT_ENCODER=false 34 | COMPILE_UNET=true 35 | COMPILE_VAE=false 36 | 37 | # inference configs 38 | RUN_ONLY=false 39 | DEVICE=0 40 | DEVICE_2=1 41 | NUM_STEPS=20 42 | WARMUP_ITERS=3 43 | REPEAT_ITERS=3 44 | 45 | # mode 46 | TOGETHER=false 47 | 48 | if [ ${GENERATE_ONNX} == true ] 49 | then 50 | GENERATE_ONNX_CMD="--generate-onnx" 51 | else 52 | GENERATE_ONNX_CMD="" 53 | fi 54 | 55 | if [ ${ONNX_TEXT_ENCODER} == true ] 56 | then 57 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder" 58 | else 59 | ONNX_TEXT_ENCODER_CMD="" 60 | fi 61 | 62 | if [ ${ONNX_UNET} == true ] 63 | then 64 | ONNX_UNET_CMD="--onnx-unet" 65 | else 66 | ONNX_UNET_CMD="" 67 | fi 68 | 69 | if [ ${ONNX_VAE} == true ] 70 | then 71 | ONNX_VAE_CMD="--onnx-vae" 72 | else 73 | ONNX_VAE_CMD="" 74 | fi 75 | 76 | if [ ${COMPILE_TEXT_ENCODER} == true ] 77 | then 78 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder" 79 | else 80 | COMPILE_TEXT_ENCODER_CMD="" 81 | fi 82 | 83 | if [ ${COMPILE_UNET} == true ] 84 | then 85 | COMPILE_UNET_CMD="--compile-unet" 86 | else 87 | COMPILE_UNET_CMD="" 88 | fi 89 | 90 | if [ ${COMPILE_VAE} == true ] 91 | then 92 | COMPILE_VAE_CMD="--compile-vae" 93 | else 94 | COMPILE_VAE_CMD="" 95 | fi 96 | 97 | if [ ${RUN_ONLY} == true ] 98 | then 99 | RUN_ONLY_CMD="--run-only" 100 | else 101 | RUN_ONLY_CMD="" 102 | fi 103 | 104 | if [ ${TOGETHER} == true ] 105 | then 106 | TOGETHER_CMD="--together" 107 | else 108 | TOGETHER_CMD="" 109 | fi 110 | 111 | export HF_HOME="cache" 112 | sed -i 's/query_block_size = 256/query_block_size = 128/g' ./env_onnx/lib/python3.10/site-packages/diffusers/models/attention_processor.py 113 | 114 | rm run.sh 115 | 116 | scripts="python main.py \ 117 | --model-path $MODEL_PATH \ 118 | --prompt $PROMPT \ 119 | --unet-type $UNET_TYPE \ 120 | --vae-type $VAE_TYPE \ 121 | --batch-size $BATCH_SIZE \ 122 | --image-size $IMAGE_SIZE \ 123 | --block-size-deep $BLOCK_SIZE_DEEP \ 124 | --block-size-shallow $BLOCK_SIZE_SHALLOW \ 125 | --num-cores $NUM_CORES \ 126 | --vae-mos 
$VAE_MOS \ 127 | --vae-ols $VAE_OLS \ 128 | --unet-mos-deep $UNET_MOS_DEEP \ 129 | --unet-ols-deep $UNET_OLS_DEEP \ 130 | --unet-mos-shallow $UNET_MOS_SHALLOW \ 131 | --unet-ols-shallow $UNET_OLS_SHALLOW \ 132 | --device-id $DEVICE \ 133 | --device-id-2 $DEVICE_2 \ 134 | --num-steps $NUM_STEPS \ 135 | --num-warmup-iters $WARMUP_ITERS \ 136 | --num-repeat-iters $REPEAT_ITERS \ 137 | --precision $PRECISION \ 138 | $ONNX_TEXT_ENCODER_CMD \ 139 | $ONNX_UNET_CMD \ 140 | $ONNX_VAE_CMD \ 141 | $COMPILE_TEXT_ENCODER_CMD \ 142 | $COMPILE_UNET_CMD \ 143 | $COMPILE_VAE_CMD \ 144 | $GENERATE_ONNX_CMD \ 145 | $RUN_ONLY_CMD \ 146 | $TOGETHER_CMD" 147 | 148 | echo $scripts >> run.sh 149 | 150 | bash run.sh 151 | -------------------------------------------------------------------------------- /models/language_processing/encoder/server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | from contextlib import asynccontextmanager 5 | from fastapi import FastAPI, HTTPException 6 | from typing import Optional, List, Union 7 | from pydantic import BaseModel 8 | import argparse 9 | 10 | from model import QAicEmbeddingModel 11 | 12 | @asynccontextmanager 13 | async def lifespan(app: FastAPI): 14 | # Code to run before the application starts 15 | print("Application startup") 16 | 17 | app.model = QAicEmbeddingModel(model_name=args.model_name, qpc_path=args.qpc_path, device=args.device) 18 | 19 | yield 20 | 21 | # Code to run when the application shuts down 22 | print("Application shutdown") 23 | 24 | app = FastAPI(lifespan=lifespan) 25 | 26 | @app.get("/v1/models") 27 | async def get_models(): 28 | #print('get_models') 29 | try: 30 | response = { 31 | "object": "list", 32 | "data": [ 33 | { 34 | "id": app.model.name, 35 | "object": "model", 36 | "created": 1746296172, 37 | "owned_by": "system", 38 | "max_model_len": 4096 39 | } 40 | ], 41 | } 42 | 43 | return response 44 | except Exception as e: 45 | print(str(e)) 46 | raise HTTPException(status_code=500, detail=str(e)) 47 | 48 | class EmbeddingsRequest(BaseModel): 49 | model: Optional[str] = "bge-large-en-v1.5" 50 | input: Union[str, List[str]] 51 | encoding_format: Optional[str] = 'float' 52 | user: Optional[str] = None 53 | 54 | @app.post("/v1/embeddings") 55 | async def embeddings(request: EmbeddingsRequest): 56 | try: 57 | response = {'object': 'list', 'data': []} 58 | 59 | inputs = request.input 60 | if isinstance(inputs, str): 61 | inputs = [inputs] 62 | 63 | for idx, input in enumerate(inputs): 64 | token_embedding, sentence_embeddings = app.model.generate(input) 65 | 66 | response['data'].append( 67 | { 68 | 'object': 'embedding', 69 | 'embedding': sentence_embeddings.reshape(-1).tolist(), 70 | 'index': idx 71 | } 72 | ) 73 | #print(response) 74 | return response 75 | except Exception as e: 76 | print(str(e)) 77 | raise HTTPException(status_code=500, detail=str(e)) 78 | 79 | if __name__ == "__main__": 80 | import uvicorn 81 | 82 | parser = argparse.ArgumentParser(description="Embedding model endpoint") 83 | 84 | parser.add_argument( 85 | "--host", 86 | type=str, 87 | help="IP address", 88 | default="0.0.0.0" 89 | ) 90 | 91 | parser.add_argument( 92 | "--port", 93 | type=int, 94 | help="Port", 95 | default=8000 96 | ) 97 | 98 | parser.add_argument( 99 | "--hf_token", 100 | type=str, 101 | help="Hugging Face auth token", 102 | default=None 103 | ) 104 | 105 | parser.add_argument( 106 | 
"--model_name", 107 | type=str, 108 | help="Hugging Face model path", 109 | default='BAAI/bge-large-en-v1.5' 110 | ) 111 | 112 | parser.add_argument( 113 | "--qpc_path", 114 | type=str, 115 | help="QPC model binary path", 116 | default='./models/BAAI/bge-large-en-v1.5/compiled-bin-fp16-B1-C4-A3-OLS2-MOS1-best-throughput' 117 | ) 118 | 119 | parser.add_argument( 120 | "--device", 121 | type=int, 122 | help="Cloud AI accelerator device ID", 123 | default=0 124 | ) 125 | 126 | args = parser.parse_args() 127 | 128 | uvicorn.run(app, host=args.host, port=args.port) 129 | 130 | 131 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/patches/attention_patch.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py 2 | index 21eb3a3..4f8d68c 100644 3 | --- a/src/diffusers/models/attention_processor.py 4 | +++ b/src/diffusers/models/attention_processor.py 5 | @@ -11,6 +11,10 @@ 6 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7 | # See the License for the specific language governing permissions and 8 | # limitations under the License. 9 | +# 10 | +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 11 | +# SPDX-License-Identifier: BSD-3-Clause-Clear 12 | +# Not a Contribution 13 | from importlib import import_module 14 | from typing import Callable, Optional, Union 15 | 16 | @@ -200,10 +204,8 @@ class Attention(nn.Module): 17 | # We use the AttnProcessor2_0 by default when torch 2.x is used which uses 18 | # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention 19 | # but only if it has the default `scale` argument. 
TODO remove scale_qk check when we move to torch 2.1 20 | - if processor is None: 21 | - processor = ( 22 | - AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor() 23 | - ) 24 | + # force to not use FlashAttention 25 | + processor = AttnProcessor() 26 | self.set_processor(processor) 27 | 28 | def set_use_memory_efficient_attention_xformers( 29 | @@ -588,7 +590,9 @@ class Attention(nn.Module): 30 | 31 | if attention_mask is None: 32 | baddbmm_input = torch.empty( 33 | - query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device 34 | + query.shape[0], query.shape[1], 35 | + key.shape[2], # key is already transposed 36 | + dtype=query.dtype, device=query.device 37 | ) 38 | beta = 0 39 | else: 40 | @@ -598,7 +602,7 @@ class Attention(nn.Module): 41 | attention_scores = torch.baddbmm( 42 | baddbmm_input, 43 | query, 44 | - key.transpose(-1, -2), 45 | + key, # key is already transposed 46 | beta=beta, 47 | alpha=self.scale, 48 | ) 49 | @@ -740,8 +744,26 @@ class AttnProcessor: 50 | key = attn.head_to_batch_dim(key) 51 | value = attn.head_to_batch_dim(value) 52 | 53 | - attention_probs = attn.get_attention_scores(query, key, attention_mask) 54 | - hidden_states = torch.bmm(attention_probs, value) 55 | + # pre-transpose the key 56 | + key = key.transpose(-1, -2) 57 | + if query.size(-2) != value.size(-2): # cross-attention, use regular attention 58 | + # QKV done in single block 59 | + attention_probs = attn.get_attention_scores(query, key, attention_mask) 60 | + hidden_states = torch.bmm(attention_probs, value) 61 | + else: # self-attention, use blocked attention 62 | + # QKV done with block-attention (a la FlashAttentionV2) 63 | + print(f"{query.shape = }, {key.shape = }, {value.shape = }") 64 | + query_block_size = 256 65 | + query_seq_len = query.size(-2) 66 | + num_blocks = (query_seq_len + query_block_size - 1) // query_block_size 67 | + for qidx in range(num_blocks): 68 | + query_block = query[:,qidx*query_block_size:(qidx+1)*query_block_size,:] 69 | + attention_probs = attn.get_attention_scores(query_block, key, attention_mask) 70 | + hidden_states_block = torch.bmm(attention_probs, value) 71 | + if qidx == 0: 72 | + hidden_states = hidden_states_block 73 | + else: 74 | + hidden_states = torch.cat((hidden_states, hidden_states_block), -2) 75 | hidden_states = attn.batch_to_head_dim(hidden_states) 76 | 77 | # linear proj 78 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/patches/attention_patch.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py 2 | index 21eb3a3..d43b51e 100644 3 | --- a/src/diffusers/models/attention_processor.py 4 | +++ b/src/diffusers/models/attention_processor.py 5 | @@ -11,6 +11,10 @@ 6 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7 | # See the License for the specific language governing permissions and 8 | # limitations under the License. 9 | +# 10 | +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
11 | +# SPDX-License-Identifier: BSD-3-Clause-Clear 12 | +# Not a Contribution 13 | from importlib import import_module 14 | from typing import Callable, Optional, Union 15 | 16 | @@ -200,10 +204,8 @@ class Attention(nn.Module): 17 | # We use the AttnProcessor2_0 by default when torch 2.x is used which uses 18 | # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention 19 | # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1 20 | - if processor is None: 21 | - processor = ( 22 | - AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor() 23 | - ) 24 | + # force to not use FlashAttention 25 | + processor = AttnProcessor() 26 | self.set_processor(processor) 27 | 28 | def set_use_memory_efficient_attention_xformers( 29 | @@ -588,7 +590,9 @@ class Attention(nn.Module): 30 | 31 | if attention_mask is None: 32 | baddbmm_input = torch.empty( 33 | - query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device 34 | + query.shape[0], query.shape[1], 35 | + key.shape[2], # key is already transposed 36 | + dtype=query.dtype, device=query.device 37 | ) 38 | beta = 0 39 | else: 40 | @@ -598,7 +602,7 @@ class Attention(nn.Module): 41 | attention_scores = torch.baddbmm( 42 | baddbmm_input, 43 | query, 44 | - key.transpose(-1, -2), 45 | + key, # key is already transposed 46 | beta=beta, 47 | alpha=self.scale, 48 | ) 49 | @@ -740,8 +744,26 @@ class AttnProcessor: 50 | key = attn.head_to_batch_dim(key) 51 | value = attn.head_to_batch_dim(value) 52 | 53 | - attention_probs = attn.get_attention_scores(query, key, attention_mask) 54 | - hidden_states = torch.bmm(attention_probs, value) 55 | + # pre-transpose the key 56 | + key = key.transpose(-1, -2) 57 | + if query.size(-2) != value.size(-2): # cross-attention, use regular attention 58 | + # QKV done in single block 59 | + attention_probs = attn.get_attention_scores(query, key, attention_mask) 60 | + hidden_states = torch.bmm(attention_probs, value) 61 | + else: # self-attention, use blocked attention 62 | + # QKV done with block-attention (a la FlashAttentionV2) 63 | + print(f"{query.shape = }, {key.shape = }, {value.shape = }") 64 | + query_block_size = 128 65 | + query_seq_len = query.size(-2) 66 | + num_blocks = (query_seq_len + query_block_size - 1) // query_block_size 67 | + for qidx in range(num_blocks): 68 | + query_block = query[:,qidx*query_block_size:(qidx+1)*query_block_size,:] 69 | + attention_probs = attn.get_attention_scores(query_block, key, attention_mask) 70 | + hidden_states_block = torch.bmm(attention_probs, value) 71 | + if qidx == 0: 72 | + hidden_states = hidden_states_block 73 | + else: 74 | + hidden_states = torch.cat((hidden_states, hidden_states_block), -2) 75 | hidden_states = attn.batch_to_head_dim(hidden_states) 76 | 77 | # linear proj 78 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/run_config_inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #################################################################################################### 4 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
5 | # SPDX-License-Identifier: BSD-3-Clause-Clear 6 | #################################################################################################### 7 | 8 | 9 | PYTHON=$1 10 | echo $PYTHON 11 | 12 | # model configs 13 | MODEL_PATH="stabilityai/stable-diffusion-xl-base-1.0" 14 | PROMPT="\"A cinematic shot of a baby racoon wearing an intricate italian priest robe.\"" 15 | USE_LATENTS="\"\"" 16 | NEGATIVE_PROMPT="\"Normal quality, low quality, worst quality, low res, blurry.\"" 17 | VAE_TYPE="vae" 18 | UNET_TYPE="deep" 19 | IMAGE_SIZE=1024 20 | BLOCK_SIZE_DEEP=256 21 | BLOCK_SIZE_SHALLOW=128 22 | BATCH_SIZE=1 23 | PRECISION=fp16,fp16,fp16,fp16 24 | 25 | # onnx configs 26 | GENERATE_ONNX=false 27 | ONNX_TEXT_ENCODER=true 28 | ONNX_UNET=true 29 | ONNX_VAE=true 30 | 31 | # compile configs 32 | NUM_CORES=16 33 | VAE_MOS=2 34 | VAE_OLS=1 35 | UNET_MOS_DEEP=2 36 | UNET_OLS_DEEP=1 37 | UNET_MOS_SHALLOW=1 38 | UNET_OLS_SHALLOW=2 39 | COMPILE_TEXT_ENCODER=true 40 | COMPILE_UNET=true 41 | COMPILE_VAE=true 42 | 43 | # inference configs 44 | RUN_ONLY=true 45 | DEVICE=0 46 | DEVICE_2=1 47 | NUM_STEPS=20 48 | WARMUP_ITERS=3 49 | REPEAT_ITERS=3 50 | CACHE_INTERVAL=3 51 | 52 | # mode 53 | TOGETHER=false 54 | 55 | if [ ${GENERATE_ONNX} == true ] 56 | then 57 | GENERATE_ONNX_CMD="--generate-onnx" 58 | else 59 | GENERATE_ONNX_CMD="" 60 | fi 61 | 62 | if [ ${ONNX_TEXT_ENCODER} == true ] 63 | then 64 | ONNX_TEXT_ENCODER_CMD="--onnx-text-encoder" 65 | else 66 | ONNX_TEXT_ENCODER_CMD="" 67 | fi 68 | 69 | if [ ${ONNX_UNET} == true ] 70 | then 71 | ONNX_UNET_CMD="--onnx-unet" 72 | else 73 | ONNX_UNET_CMD="" 74 | fi 75 | 76 | if [ ${ONNX_VAE} == true ] 77 | then 78 | ONNX_VAE_CMD="--onnx-vae" 79 | else 80 | ONNX_VAE_CMD="" 81 | fi 82 | 83 | if [ ${COMPILE_TEXT_ENCODER} == true ] 84 | then 85 | COMPILE_TEXT_ENCODER_CMD="--compile-text-encoder" 86 | else 87 | COMPILE_TEXT_ENCODER_CMD="" 88 | fi 89 | 90 | if [ ${COMPILE_UNET} == true ] 91 | then 92 | COMPILE_UNET_CMD="--compile-unet" 93 | else 94 | COMPILE_UNET_CMD="" 95 | fi 96 | 97 | if [ ${COMPILE_VAE} == true ] 98 | then 99 | COMPILE_VAE_CMD="--compile-vae" 100 | else 101 | COMPILE_VAE_CMD="" 102 | fi 103 | 104 | if [ ${RUN_ONLY} == true ] 105 | then 106 | RUN_ONLY_CMD="--run-only" 107 | else 108 | RUN_ONLY_CMD="" 109 | fi 110 | 111 | if [ ${TOGETHER} == true ] 112 | then 113 | TOGETHER_CMD="--together" 114 | else 115 | TOGETHER_CMD="" 116 | fi 117 | 118 | export HF_HOME="cache" 119 | export TQDM_DISABLE=1 120 | 121 | rm run.sh 122 | 123 | scripts="$PYTHON main.py \ 124 | --model-path $MODEL_PATH \ 125 | --prompt $PROMPT \ 126 | --negative-prompt $NEGATIVE_PROMPT \ 127 | --use-latents $USE_LATENTS \ 128 | --unet-type $UNET_TYPE \ 129 | --vae-type $VAE_TYPE \ 130 | --batch-size $BATCH_SIZE \ 131 | --image-size $IMAGE_SIZE \ 132 | --block-size-deep $BLOCK_SIZE_DEEP \ 133 | --block-size-shallow $BLOCK_SIZE_SHALLOW \ 134 | --num-cores $NUM_CORES \ 135 | --vae-mos $VAE_MOS \ 136 | --vae-ols $VAE_OLS \ 137 | --unet-mos-deep $UNET_MOS_DEEP \ 138 | --unet-ols-deep $UNET_OLS_DEEP \ 139 | --unet-mos-shallow $UNET_MOS_SHALLOW \ 140 | --unet-ols-shallow $UNET_OLS_SHALLOW \ 141 | --device-id $DEVICE \ 142 | --device-id-2 $DEVICE_2 \ 143 | --precision $PRECISION \ 144 | --num-steps $NUM_STEPS \ 145 | --num-warmup-iters $WARMUP_ITERS \ 146 | --num-repeat-iters $REPEAT_ITERS \ 147 | --cache-interval $CACHE_INTERVAL \ 148 | $ONNX_TEXT_ENCODER_CMD \ 149 | $ONNX_UNET_CMD \ 150 | $ONNX_VAE_CMD \ 151 | $COMPILE_TEXT_ENCODER_CMD \ 152 | $COMPILE_UNET_CMD \ 153 | $COMPILE_VAE_CMD \ 
154 | $GENERATE_ONNX_CMD \ 155 | $RUN_ONLY_CMD \ 156 | $TOGETHER_CMD" 157 | 158 | echo $scripts >> run.sh 159 | 160 | bash run.sh 161 | -------------------------------------------------------------------------------- /samples/python/aws_ai100_benchmarking/yolo_models/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | --- 3 | 4 | Download the yolov4, yolov5, and yolov7 models, prepare for the Qualcomm AIC100, compile for high-thoughput, min-latency, or balanced throughput with fp16 precision, run the model on a generated random sample, and obtain the benchmarking results and output values. 5 | 6 | ## Source of the models 7 | --- 8 | The models are downloaded from (https://github.com/ultralytics/yolov5). This script has been tested for the following requested models: 9 | * yolov4 10 | * yolov5s 11 | * yolov5m 12 | * yolov5l 13 | * yolov5x 14 | * yolov7-e6e 15 | 16 | 17 | ## Virtual environment 18 | --- 19 | For a quick environment setup: 20 | 21 | ```commandline 22 | source /opt/qti-aic/dev/python/qaic-env/bin/activate 23 | ``` 24 | 25 | ## Framework and version 26 | --- 27 | ```commandline 28 | pip3 install torch==1.13.0 onnx==1.12.0 onnxruntime==1.15.0 torchvision==0.14.0 transformers==4.29.2 pandas==2.0.2 urllib3==1.26.6 29 | pip3 install ultralytics seaborn nvidia-pyindex onnx-graphsurgeon 30 | 31 | ``` 32 | ## Syntax 33 | --- 34 | Copy the run_yolo_model.py and the lut_yolo_models.csv to a working directory. Pick a MODEl_NAME from the list above, and type: 35 | 36 | ```commandline 37 | 38 | usage: run_yolo_model.py [-h] --model-name {yolov4,yolov5s,yolov5m,yolov5l,yolov5x,yolov7-e6e} 39 | [--objective {best-latency,best-throughput,balanced}] 40 | [--opset OPSET] 41 | [--batch-size BATCH_SIZE] 42 | [--image-size IMAGE_SIZE] 43 | [--cores {1,2,3,4,5,6,7,8,9,10,11,12,13,14}] 44 | [--instances {1,2,3,4,5,6,7,8,9,10,11,12,13,14}] 45 | [--ols {1,2,3,4,5,6,7,8,9,10,11,12,13,14}] 46 | [--mos MOS] 47 | [--set-size {1,2,3,4,5,6,7,8,9,10}] 48 | [--extra EXTRA] 49 | [--time TIME] 50 | [--device {0,1,2,3,4,5,6,7}] 51 | [--run-only] 52 | 53 | 54 | 55 | Download, Compile, and Run YOLO models on randomly generated inputs 56 | 57 | 58 | optional arguments: 59 | -h, --help show this help message and exit 60 | --model-name, -m {yolov4,yolov5s,yolov5m,yolov5l,yolov5x,yolov7-e6e} 61 | Model name to download. 62 | --objective, -o {best-latency,best-throughput,balanced} 63 | Running for best-latency, best-throughput, or balanced 64 | --opset OPSET ONNX opset. Default <12> 65 | --batch-size, -b BATCH_SIZE 66 | Sample input batch size. Default <1>. 67 | --image-size, -s IMAGE_SIZE 68 | Sample input image width/height. Default <640>. 69 | --cores, -c {1,2,3,4,5,6,7,8,9,10,11,12,13,14} 70 | Number of AIC100 cores to compile the model for. Default <2> 71 | --instances, -i {1,2,3,4,5,6,7,8,9,10,11,12,13,14} 72 | Number of model instances to run on AIC100. Default <7> 73 | --ols {1,2,3,4,5,6,7,8,9,10,11,12,13,14} 74 | Overlap split factor. Default <1> 75 | --mos MOS Maximum output channel split. Default <1> 76 | --set-size {1,2,3,4,5,6,7,8,9,10} 77 | Set size. Default <10> 78 | --extra EXTRA Extra compilation arguments. 79 | --time TIME Duration (in seconds) for which to submit inferences. Default <20> 80 | --device, -d {0,1,2,3,4,5,6,7} 81 | AIC100 device ID. 
Default <0> 82 | --run-only, -r Performs the inference only, without re-exporting and re-compiling the model 83 | 84 | 85 | ``` 86 | For example: 87 | ```commandline 88 | python run_yolo_model.py -m yolov5s -o best-throughput 89 | ``` 90 | or 91 | ```commandline 92 | python run_yolo_model.py -m yolov5m -o balanced 93 | ``` 94 | or 95 | 96 | ```commandline 97 | python run_yolo_model.py -m yolov5x -o best-throughput 98 | ``` 99 | 100 | The hardware configuration will be either associated to the corresponding row in the lut_yolo_models.csv or to defualt values if not specified by the user. If the MODEL_NAME is not included in the lut_yolo_models.csv, default values will be used. 101 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_turbo/README.md: -------------------------------------------------------------------------------- 1 | # Instructions to run SDXL-Turbo on Cloud AI 100 2 | 3 | The instructions below are to run the [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) model on Cloud AI 100. Compile time parameters may need to be adjusted for different cards and different SDKs. 4 | 5 | ## Pre-requisites 6 | 7 | Use the [SDK 1.19.8.0](https://github.com/quic/cloud-ai-containers/pkgs/container/cloud_ai_inference_ubuntu22/414822849?tag=1.19.8.0) container to compile the sdxl-turbo models. 8 | 9 | ``` 10 | sudo docker run \ 11 | -it \ 12 | --workdir /cloud-ai-sdk \ 13 | --entrypoint /bin/bash \ 14 | --network=host \ 15 | --mount type=bind,source=,target=/cloud-ai-sdk \ 16 | --device=/dev/accel/accel0 \ 17 | --device=/dev/accel/accel1 \ 18 | --device=/dev/accel/accel2 \ 19 | --device=/dev/accel/accel3 \ 20 | ghcr.io/quic/cloud_ai_inference_ubuntu22:1.19.8.0 21 | 22 | cd models/multimodal/text_to_image/sdxl_turbo 23 | ``` 24 | 25 | Install the moreutils package for the `ts` timestamp tool: 26 | ``` 27 | sudo apt update 28 | sudo apt-get install moreutils 29 | ``` 30 | 31 | Install Git Large File System (LFS) support 32 | 33 | ``` 34 | sudo apt update 35 | sudo apt-get install git-lfs 36 | ``` 37 | 38 | ## 1. Generate onnx files and compile for binaries 39 | 40 | 1. Set up a virtual environment for ONNX generation and compilation 41 | ``` 42 | python3.10 -m venv env_onnx 43 | source ./env_onnx/bin/activate 44 | pip install -r requirements.txt 45 | ``` 46 | 47 | 2. Create a folder for caching Hugging Face model downloads, and export the environment variable HF_HOME 48 | ``` 49 | mkdir cache 50 | mkdir compile_logs 51 | mkdir qpc 52 | touch run.sh 53 | export HF_HOME=${PWD}/cache 54 | ``` 55 | 56 | 3. Install diffusers from source after patching for ONNX file generation 57 | ``` 58 | git clone --depth 1 --branch v0.24.0 https://github.com/huggingface/diffusers.git diffusers-onnx 59 | cd diffusers-onnx 60 | git apply --reject --whitespace=fix ../patches/attention_patch.patch 61 | pip install . 62 | cd .. 63 | ``` 64 | 65 | 4. Prepare VAE Decoder 66 | ``` 67 | export GIT_LFS_SKIP_SMUDGE=1 68 | git clone https://huggingface.co/stabilityai/sdxl-turbo cache/stabilityai/sdxl_turbo 69 | cd cache/stabilityai/sdxl_turbo 70 | git lfs pull -I vae_decoder/model.onnx 71 | rm -rf .git/lfs # optional to save space 72 | cd ../../../ 73 | ``` 74 | 75 | 5. Generate ONNX files and compile for binaries 76 | ``` 77 | bash run_config_gen.sh 78 | ``` 79 | 80 | ## 2. Run the end-to-end SDXL-Turbo inference 81 | 82 | 1. 
Set up a separate virtual environment for running SDXL Turbo 83 | ``` 84 | python3.10 -m venv env_pipeline 85 | source ./env_pipeline/bin/activate 86 | pip install -r requirements.txt 87 | pip install --force-reinstall /opt/qti-aic/dev/lib/x86_64/qaic-0.0.1-py3-none-any.whl 88 | ``` 89 | 90 | 2. Re-install diffusers from source after patching the SDXL Turbo pipeline for inference 91 | ``` 92 | git clone --depth 1 --branch v0.24.0 https://github.com/huggingface/diffusers.git diffusers-pipeline 93 | cd diffusers-pipeline 94 | git apply --reject --whitespace=fix ../patches/pipeline_patch_separate.patch 95 | pip install . 96 | cd .. 97 | ``` 98 | 99 | 4. Run the SDXL-Turbo inference with 'sudo' flag if needed to access the AI 100 devices. 100 | ``` 101 | sudo bash run_config_inference.sh $(which python3) 102 | ``` 103 | 104 | ## 3. Run an OpenAI-compatible REST endpoint 105 | 106 | ``` 107 | source ./env_pipeline/bin/activate 108 | python3 server.py 109 | ``` 110 | 111 | Test the endpoint: 112 | 113 | ``` 114 | curl http://localhost:8000/v1/images/generations \ 115 | -H 'Content-Type: application/json' \ 116 | -H 'Authorization: Bearer test-key' \ 117 | -d '{ 118 | "model": "sdxl-turbo", 119 | "prompt": "photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece", 120 | "n": 1, 121 | "size": "512x512", 122 | "response_format": "b64_json" 123 | }' 124 | ``` 125 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/patches/transformer_patch.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py 2 | index 81dff54f9..f27ebe7d3 100644 3 | --- a/src/transformers/models/t5/modeling_t5.py 4 | +++ b/src/transformers/models/t5/modeling_t5.py 5 | @@ -12,6 +12,10 @@ 6 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7 | # See the License for the specific language governing permissions and 8 | # limitations under the License. 9 | +# 10 | +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 11 | +# SPDX-License-Identifier: BSD-3-Clause-Clear 12 | +# Not a Contribution 13 | """ PyTorch T5 model.""" 14 | 15 | 16 | @@ -243,7 +247,8 @@ class T5LayerNorm(nn.Module): 17 | # w/o mean and there is no bias. 
Additionally we want to make sure that the accumulation for 18 | # half-precision inputs is done in fp32 19 | 20 | - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) 21 | + div_first = hidden_states * torch.rsqrt(torch.tensor(hidden_states.shape[-1], dtype=torch.float32)) 22 | + variance = div_first.pow(2).sum(-1, keepdim=True) 23 | hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) 24 | 25 | # convert into half-precision if necessary 26 | @@ -330,11 +335,12 @@ class T5LayerFF(nn.Module): 27 | 28 | self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) 29 | self.dropout = nn.Dropout(config.dropout_rate) 30 | + self.scaling_factor = nn.Parameter(torch.tensor(1.0)) 31 | 32 | def forward(self, hidden_states): 33 | forwarded_states = self.layer_norm(hidden_states) 34 | forwarded_states = self.DenseReluDense(forwarded_states) 35 | - hidden_states = hidden_states + self.dropout(forwarded_states) 36 | + hidden_states = hidden_states * self.scaling_factor + self.dropout(forwarded_states) 37 | return hidden_states 38 | 39 | 40 | @@ -538,7 +544,8 @@ class T5Attention(nn.Module): 41 | # if key and values are already calculated 42 | # we want only the last query position bias 43 | if past_key_value is not None: 44 | - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] 45 | + #position_bias = position_bias[:, :, -hidden_states.size(1) :, :] 46 | + position_bias = position_bias[:, :, -1:, :] 47 | 48 | if mask is not None: 49 | position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) 50 | @@ -579,6 +586,7 @@ class T5LayerSelfAttention(nn.Module): 51 | self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) 52 | self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) 53 | self.dropout = nn.Dropout(config.dropout_rate) 54 | + self.scaling_factor = nn.Parameter(torch.tensor(1.0)) 55 | 56 | def forward( 57 | self, 58 | @@ -600,7 +608,7 @@ class T5LayerSelfAttention(nn.Module): 59 | use_cache=use_cache, 60 | output_attentions=output_attentions, 61 | ) 62 | - hidden_states = hidden_states + self.dropout(attention_output[0]) 63 | + hidden_states = hidden_states * self.scaling_factor + self.dropout(attention_output[0]) 64 | outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them 65 | return outputs 66 | 67 | @@ -611,6 +619,7 @@ class T5LayerCrossAttention(nn.Module): 68 | self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False) 69 | self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) 70 | self.dropout = nn.Dropout(config.dropout_rate) 71 | + self.scaling_factor = nn.Parameter(torch.tensor(1.0)) 72 | 73 | def forward( 74 | self, 75 | @@ -636,7 +645,7 @@ class T5LayerCrossAttention(nn.Module): 76 | query_length=query_length, 77 | output_attentions=output_attentions, 78 | ) 79 | - layer_output = hidden_states + self.dropout(attention_output[0]) 80 | + layer_output = hidden_states * self.scaling_factor + self.dropout(attention_output[0]) 81 | outputs = (layer_output,) + attention_output[1:] # add attentions if we output them 82 | return outputs 83 | 84 | -------------------------------------------------------------------------------- /samples/cpp/cpp_qpc_inference/Readme.md: -------------------------------------------------------------------------------- 1 | # Simple CPP Example for Bert-base-cased model on AIC-100 2 | 3 | This project demonstrates using 
the Bert-base-cased model from Hugging Face with the C++ Qaic APIs. 4 | 5 | ## To build and use it 6 | ```bash 7 | mkdir build 8 | cd build 9 | cmake .. 10 | make 11 | ``` 12 | 13 | The Bert-base-cased model from Hugging Face is based on a vocabulary file 14 | (vocab.txt), which needs to be downloaded from the Hugging Face website. 15 | 16 | ## To use the example, the user needs to: 17 | - Download the Hugging Face bert-base-cased model (refer to the Jupyter notebooks for NLP models). 18 | - Replace the QPC path used in main.cpp with the actual QPC path. 19 | - Replace the names of the input/output buffers with the names used when compiling the BERT model into the QPC 20 | ``` 21 | for example: 22 | ("input_ids", "attention_mask") for input buffers 23 | ("logits") for output buffers 24 | ``` 25 | - Build using the build steps above. 26 | - Run the executable `simple-bert-inference-example`. 27 | 28 | ## The example has the following helper classes 29 | 30 | ### VocabularyHelper: 31 | This class parses vocab.txt and stores the index of every 32 | string token in the file. The index of the words in 33 | this file is used in the input and output fed to the model 34 | while running the inference. 35 | 36 | 37 | ### Tokenizer: 38 | This class is a very basic parser of the input sentence 39 | fed to the BERT model. It uses a space as the delimiter to parse 40 | the sentence and does not provide special handling for special 41 | characters and symbols in the sentence. 42 | Ideally, in C++ the user can use, for example, the 43 | SentencePiece library available at https://github.com/google/sentencepiece 44 | 45 | 46 | ### QBufferWrapper: 47 | This is a helper class to ensure that the memory allocated 48 | for QBuffers used in the Qaic APIs is automatically released. 49 | Helper functions are provided for this class: 50 | 51 | `createBuffer` : create the wrapper from a QBuffer class 52 | 53 | `qBufferToString` : create a string for printing with QBuffer data 54 | 55 | 56 | ### Helper functions to convert a few data structures to strings for printing: 57 | ```cpp 58 | [[nodiscard]] std::string to_string(const qaic::rt::BufferMapping& bufMap)
59 | [[nodiscard]] std::string to_string(const qaic::rt::BufferMappings& allBufferMappings)
60 | [[nodiscard]] std::string to_string(const std::vector & tokenVec)
61 | ``` 62 | 63 | ### Processing the input and output for inference: 64 | The input buffer for BERT inference in this example is an array of bytes 65 | representing the index of each sentence word (in the vocabulary file). 66 | 67 | For example: 68 | 69 | If the compiled QPC has a sequence length of 128 and the input type is int64_t, 70 | then the size of the input buffer must be
71 | 128 * 8
72 | 128 [max num tokens in input] * 8 [size of each index in vocabulary file]
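For illustration, sizing and populating these input buffers could look like the following minimal sketch (assuming the 128-token sequence length and int64_t indexes described above; the variable names and token index values are hypothetical and are not taken from main.cpp):

```cpp
#include <cstdint>
#include <cstddef>
#include <vector>

int main() {
    constexpr std::size_t kSeqLen = 128;             // sequence length the QPC was compiled for
    std::vector<int64_t> inputIds(kSeqLen, 0);       // 128 * 8 bytes, zero-initialized
    std::vector<int64_t> attentionMask(kSeqLen, 0);  // same size, zero-initialized

    // Hypothetical vocabulary indexes (looked up in vocab.txt) for a 10-word sentence.
    const std::vector<int64_t> tokenIndexes = {101, 2009, 2003, 103, 2200, 3376, 2154, 2651, 1012, 102};

    for (std::size_t i = 0; i < tokenIndexes.size() && i < kSeqLen; ++i) {
        inputIds[i] = tokenIndexes[i];  // the first 10 * 8 bytes carry the word indexes
        attentionMask[i] = 1;           // mark real tokens; padding positions stay 0
    }

    // inputIds.data() and attentionMask.data() each point at kSeqLen * sizeof(int64_t)
    // = 1024 bytes that can be copied into the QBuffers handed to the runtime.
    return 0;
}
```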
73 | 74 | If the input sentence has 10 words, then the first 10*8 bytes in the 75 | input buffer must be populated with the indexes of the sentence words 76 | in the vocabulary file. The rest of the bytes must be zero-initialized. 77 | 78 | The BERT model also takes attention_mask as an input. The attention_mask 79 | input buffer can be populated with 1 for the initial 10 words, and the rest of the 80 | bytes can be zero-initialized. 81 | 82 | The output buffer for BERT inference in this example is an array 83 | of logit values (one per symbol/word in the vocabulary) 84 | for each input token. 85 | 86 | For example: 87 | 88 | If the compiled QPC has a sequence length of 128 and the output type 89 | is float (4 bytes), then the size of the output QBuffer must be
90 | 128 * 4 * 289960
91 | 128 [max num tokens in input] * 4 [size of each logit value] * 289960 [vocabulary size]
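For illustration, locating the logits that belong to one token position inside this flat output buffer and taking the argmax (as done for the [MASK] token described below) could look like the following minimal sketch; the function and variable names are illustrative and not part of main.cpp:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Returns the vocabulary index with the highest logit for the token at position
// `tokenPos`, given a flat [sequence_length * vocab_size] float logit buffer.
std::size_t argmaxForToken(const std::vector<float>& logits, std::size_t tokenPos, std::size_t vocabSize) {
    const float* begin = logits.data() + tokenPos * vocabSize;  // e.g. 3 * 289960 floats into the buffer
    const float* end = begin + vocabSize;                       // one vocabulary-sized slice
    return static_cast<std::size_t>(std::max_element(begin, end) - begin);
}

int main() {
    constexpr std::size_t kSeqLen = 128;
    constexpr std::size_t kVocabSize = 289960;              // vocabulary size used in this example
    std::vector<float> logits(kSeqLen * kVocabSize, 0.0f);  // in practice, filled from the output QBuffer

    const std::size_t maskPos = 3;                          // position of the [MASK] token in the sentence
    const std::size_t predicted = argmaxForToken(logits, maskPos, kVocabSize);
    (void)predicted;                                        // look this index up in vocab.txt to get the word
    return 0;
}
```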
92 | 93 | To get the predicted output word, the logit values for the 94 | [MASK] token must be extracted from the output buffer. The index of the 95 | maximum logit value then gives the predicted output word. 96 | 97 | For example: 98 | 99 | If the [MASK] token is at the 3rd word index in the sentence, the corresponding 100 | logit values are located at the following byte positions in the output buffer:
101 | 289960*3*4 to 289960*4*4.
102 | These 289960 float values are the logits for 103 | each symbol/word in the vocabulary. 104 | We find the index of the maximum logit value to get the index of the predicted 105 | word. Then we look up that word in the vocabulary. 106 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 2 | # SPDX-License-Identifier: BSD-3-Clause-Clear 3 | 4 | import os 5 | import torch 6 | 7 | from diffusers import StableDiffusion3Pipeline 8 | 9 | class QAICStableDiffusion3: 10 | def __init__(self, model_id = 'stabilityai/stable-diffusion-3.5-medium', device_id=0, device_id_2=1): 11 | sdxl_vae_decoder = './qpc/vae_decoder_64b_1024i_vae_16c_1b_2m_1o/programqpc.bin' 12 | text_encoder = './qpc/text_encoder_64b_1024i_16c_1b/programqpc.bin' 13 | transformer = './qpc/transformer_64b_1024i_16c_1b_1m_2o/programqpc.bin' 14 | text_encoder_2 = './qpc/text_encoder_2_64b_1024i_16c_1b/programqpc.bin' 15 | 16 | text_encoder_3 = None 17 | 18 | # check the QPCs 19 | transformer_qpc = transformer if transformer.endswith('programqpc.bin') else os.path.join(transformer,'programqpc.bin') 20 | assert os.path.isfile(transformer_qpc), f"Could not find binary {transformer_qpc = }!" 21 | vae_decoder_sdxl_qpc = sdxl_vae_decoder if sdxl_vae_decoder.endswith('programqpc.bin') else os.path.join(sdxl_vae_decoder,'programqpc.bin') 22 | assert os.path.isfile(vae_decoder_sdxl_qpc), f"Could not find binary {vae_decoder_sdxl_qpc = }!" 23 | text_encoder_qpc = text_encoder if text_encoder.endswith('programqpc.bin') else os.path.join(text_encoder,'programqpc.bin') 24 | assert os.path.isfile(text_encoder_qpc), f"Could not find binary {text_encoder_qpc = }!" 25 | text_encoder_2_qpc = text_encoder_2 if text_encoder_2.endswith('programqpc.bin') else os.path.join(text_encoder_2,'programqpc.bin') 26 | assert os.path.isfile(text_encoder_2_qpc), f"Could not find binary {text_encoder_2_qpc = }!" 27 | 28 | self.vae_type = "vae" 29 | 30 | # load the latents 31 | self.latents = None 32 | 33 | # load the model pipeline 34 | if text_encoder_3: 35 | text_encoder_3_qpc = text_encoder_3 if text_encoder_3.endswith('programqpc.bin') else os.path.join(text_encoder_3,'programqpc.bin') 36 | assert os.path.isfile(text_encoder_3_qpc), f"Could not find binary {text_encoder_3_qpc = }!" 
37 | pipe = StableDiffusion3Pipeline.from_pretrained( 38 | model_id, 39 | device_id=device_id, 40 | device_id2=device_id_2, 41 | transformer_qpc=transformer_qpc, 42 | vae_decoder_qpc=vae_decoder_sdxl_qpc, 43 | text_encoder_qpc=text_encoder_qpc, 44 | text_encoder_2_qpc=text_encoder_2_qpc, 45 | text_encoder_3_qpc=text_encoder_3_qpc, 46 | ) 47 | else: 48 | pipe = StableDiffusion3Pipeline.from_pretrained( 49 | model_id, 50 | device_id=device_id, 51 | device_id2=device_id_2, 52 | transformer_qpc=transformer_qpc, 53 | vae_decoder_qpc=vae_decoder_sdxl_qpc, 54 | text_encoder_qpc=text_encoder_qpc, 55 | text_encoder_2_qpc=text_encoder_2_qpc, 56 | text_encoder_3=None, 57 | tokenizer_3=None, 58 | ) 59 | 60 | self.pipe = pipe 61 | 62 | def generate(self, prompt, n=1, image_size=(1024,1024), num_steps=28, guidance=4.5): 63 | height, width = image_size[0], image_size[1] 64 | 65 | images = self.pipe(prompt=prompt, 66 | negative_prompt='', 67 | num_inference_steps=num_steps, 68 | height=height, 69 | width=width, 70 | latents=self.latents, 71 | vae_type=self.vae_type, 72 | guidance_scale=guidance).images 73 | 74 | return images 75 | 76 | def main(): 77 | model = QAICStableDiffusion3() 78 | prompt = 'photo of 8k ultra realistic harbour, port, boats, sunset, beautiful light, golden hour, full of colour, cinematic lighting, battered, trending on artstation, 4k, hyperrealistic, focused, extreme details, cinematic, masterpiece' 79 | image = model.generate(prompt, guidance=7.0)[0] 80 | image.save('harbor.png') 81 | 82 | if __name__ == "__main__": 83 | main() 84 | 85 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/sdxl_deepcache/patches/deepcache_unet.patch: -------------------------------------------------------------------------------- 1 | diff --git a/DeepCache/sdxl/unet_2d_condition.py b/DeepCache/sdxl/unet_2d_condition.py 2 | index 6c97199..f6865c6 100644 3 | --- a/DeepCache/sdxl/unet_2d_condition.py 4 | +++ b/DeepCache/sdxl/unet_2d_condition.py 5 | @@ -11,6 +11,10 @@ 6 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7 | # See the License for the specific language governing permissions and 8 | # limitations under the License. 9 | +# 10 | +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
11 | +# SPDX-License-Identifier: BSD-3-Clause-Clear 12 | +# Not a Contribution 13 | from dataclasses import dataclass 14 | from typing import Any, Dict, List, Optional, Tuple, Union 15 | 16 | @@ -591,6 +595,8 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) 17 | self.position_net = PositionNet( 18 | positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type 19 | ) 20 | + self.cache_layer_id = 0 21 | + self.cache_block_id = 0 22 | 23 | @property 24 | def attn_processors(self) -> Dict[str, AttentionProcessor]: 25 | @@ -741,6 +747,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) 26 | sample: torch.FloatTensor, 27 | timestep: Union[torch.Tensor, float, int], 28 | encoder_hidden_states: torch.Tensor, 29 | + replicate_prv_feature: Optional[List[torch.Tensor]], 30 | class_labels: Optional[torch.Tensor] = None, 31 | timestep_cond: Optional[torch.Tensor] = None, 32 | attention_mask: Optional[torch.Tensor] = None, 33 | @@ -749,10 +756,6 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) 34 | down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, 35 | mid_block_additional_residual: Optional[torch.Tensor] = None, 36 | encoder_attention_mask: Optional[torch.Tensor] = None, 37 | - quick_replicate: bool = False, 38 | - replicate_prv_feature: Optional[List[torch.Tensor]] = None, 39 | - cache_layer_id: Optional[int] = None, 40 | - cache_block_id: Optional[int] = None, 41 | return_dict: bool = True, 42 | ) -> Union[UNet2DConditionOutput, Tuple]: 43 | r""" 44 | @@ -954,8 +957,11 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) 45 | is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None 46 | is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None 47 | 48 | + cache_layer_id = self.cache_layer_id 49 | + cache_block_id = self.cache_block_id 50 | down_block_res_samples = (sample,) 51 | - if quick_replicate and replicate_prv_feature is not None: 52 | + if False: 53 | + print("Using cache...") 54 | # Down 55 | for i, downsample_block in enumerate(self.down_blocks): 56 | if i > cache_layer_id: 57 | @@ -1037,9 +1043,10 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) 58 | scale=lora_scale, 59 | enter_block_number=cache_block_id if i == len(self.up_blocks) - 1 - cache_layer_id else None, 60 | ) 61 | - 62 | + 63 | prv_f = replicate_prv_feature 64 | else: 65 | + print("Initializing cache...") 66 | for i, downsample_block in enumerate(self.down_blocks): 67 | if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: 68 | # For t2i-adapter CrossAttnDownBlock2D 69 | @@ -1137,17 +1144,15 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) 70 | upsample_size=upsample_size, 71 | scale=lora_scale, 72 | ) 73 | - 74 | + 75 | #print(cache_layer_id, current_record_f is None, i == len(self.up_blocks) - cache_layer_id - 1) 76 | #print("Append prv_feature with shape:", sample.shape) 77 | if cache_layer_id is not None and current_record_f is not None and i == len(self.up_blocks) - cache_layer_id - 1: 78 | prv_f = current_record_f[-cache_block_id-1] 79 | - 80 | + 81 | # 6. 
post-process 82 | if self.conv_norm_out: 83 | sample = self.conv_norm_out(sample) 84 | sample = self.conv_act(sample) 85 | sample = self.conv_out(sample) 86 | - if not return_dict: 87 | - return (sample, prv_f,) 88 | - return UNet2DConditionOutput(sample=sample) 89 | + return (sample, prv_f,) 90 | -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-xl-base-1.0/compile_models.sh: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 3 | # SPDX-License-Identifier: BSD-3-Clause-Clear 4 | #################################################################################################### 5 | #!/bin/bash 6 | 7 | BINARY_FOLDER="./qpc/" 8 | LOG_FOLDER="./compile_logs/" 9 | BATCH_SIZE=1 10 | BATCH_SIZE_2=$(expr 2 \* $BATCH_SIZE) 11 | SEQ_LEN=77 12 | LATENT_CHANNELS=4 13 | LATENT_HEIGHT=128 14 | LATENT_WIDTH=128 15 | NUM_CORES=16 16 | VAE_MOS=2 17 | VAE_OLS=1 18 | UNET_MOS_BS1=2 19 | UNET_OLS_BS1=1 20 | UNET_MOS_BS2=1 21 | UNET_OLS_BS2=2 22 | 23 | mkdir ${BINARY_FOLDER} 24 | mkdir ${LOG_FOLDER} 25 | 26 | ######################################################################################################################## 27 | 28 | # 1. Compile the text encoder - self-generated 29 | rm -rf ${BINARY_FOLDER}text_encoder 30 | /opt/qti-aic/exec/qaic-exec \ 31 | -aic-hw -aic-hw-version=2.0 -aic-perf-warnings -aic-perf-metrics \ 32 | -compile-only -convert-to-fp16 \ 33 | -m=./onnx_files/text_encoder/model.onnx \ 34 | -onnx-define-symbol=batch_size,${BATCH_SIZE} \ 35 | -stats-batchsize=${BATCH_SIZE} \ 36 | -onnx-define-symbol=sequence_length,${SEQ_LEN} \ 37 | -aic-num-cores=${NUM_CORES} \ 38 | -aic-binary-dir=${BINARY_FOLDER}text_encoder \ 39 | 2>&1 | ts > ${LOG_FOLDER}text_encoder.log & 40 | 41 | ######################################################################################################################## 42 | 43 | # 2. Compile the text encoder 2 - self-generated 44 | rm -rf ${BINARY_FOLDER}text_encoder_2 45 | /opt/qti-aic/exec/qaic-exec \ 46 | -aic-hw -aic-hw-version=2.0 -aic-perf-warnings -aic-perf-metrics \ 47 | -compile-only -convert-to-fp16 \ 48 | -m=./onnx_files/text_encoder_2/model.onnx \ 49 | -onnx-define-symbol=batch_size,${BATCH_SIZE} \ 50 | -stats-batchsize=${BATCH_SIZE} \ 51 | -onnx-define-symbol=sequence_length,${SEQ_LEN} \ 52 | -aic-num-cores=${NUM_CORES} \ 53 | -aic-binary-dir=${BINARY_FOLDER}text_encoder_2 \ 54 | 2>&1 | ts > ${LOG_FOLDER}text_encoder_2.log & 55 | 56 | ######################################################################################################################## 57 | 58 | # 3a. 
Compile the UNet with batchsize=1, blocksize=256 59 | rm -rf ${BINARY_FOLDER}unet-bs${BATCH_SIZE} 60 | /opt/qti-aic/exec/qaic-exec \ 61 | -aic-hw -aic-hw-version=2.0 -aic-perf-warnings -aic-perf-metrics \ 62 | -compile-only -convert-to-fp16 \ 63 | -mos=${UNET_MOS_BS1} -ols=${UNET_OLS_BS1} \ 64 | -m=./onnx_files/unet_bs1/unet/model.onnx \ 65 | -onnx-define-symbol=batch_size,${BATCH_SIZE} \ 66 | -stats-batchsize=${BATCH_SIZE} \ 67 | -onnx-define-symbol=sequence_length,${SEQ_LEN} \ 68 | -onnx-define-symbol=steps,1 \ 69 | -onnx-define-symbol=num_channels,${LATENT_CHANNELS} \ 70 | -onnx-define-symbol=height,${LATENT_HEIGHT} \ 71 | -onnx-define-symbol=width,${LATENT_WIDTH} \ 72 | -aic-num-cores=${NUM_CORES} \ 73 | -aic-binary-dir=${BINARY_FOLDER}unet-bs${BATCH_SIZE} \ 74 | 2>&1 | ts > ${LOG_FOLDER}unet-bs${BATCH_SIZE}.log & 75 | 76 | 77 | # 3b. Compile the UNet with batchsize=2, blocksize=128 78 | rm -rf ${BINARY_FOLDER}unet-bs${BATCH_SIZE_2} 79 | /opt/qti-aic/exec/qaic-exec \ 80 | -aic-hw -aic-hw-version=2.0 -aic-perf-warnings -aic-perf-metrics \ 81 | -compile-only -convert-to-fp16 \ 82 | -mos=${UNET_MOS_BS2} -ols=${UNET_OLS_BS2} \ 83 | -m=./onnx_files/unet_bs2/unet/model.onnx \ 84 | -onnx-define-symbol=batch_size,${BATCH_SIZE_2} \ 85 | -stats-batchsize=${BATCH_SIZE_2} \ 86 | -onnx-define-symbol=sequence_length,${SEQ_LEN} \ 87 | -onnx-define-symbol=steps,1 \ 88 | -onnx-define-symbol=num_channels,${LATENT_CHANNELS} \ 89 | -onnx-define-symbol=height,${LATENT_HEIGHT} \ 90 | -onnx-define-symbol=width,${LATENT_WIDTH} \ 91 | -aic-num-cores=${NUM_CORES} \ 92 | -aic-binary-dir=${BINARY_FOLDER}unet-bs${BATCH_SIZE_2} \ 93 | 2>&1 | ts > ${LOG_FOLDER}unet-bs${BATCH_SIZE_2}.log & 94 | 95 | 96 | ######################################################################################################################## 97 | 98 | # 4. Compile the VAE Decoder 99 | rm -rf ${BINARY_FOLDER}vae_decoder 100 | /opt/qti-aic/exec/qaic-exec \ 101 | -aic-hw -aic-hw-version=2.0 -aic-perf-warnings -aic-perf-metrics \ 102 | -compile-only -convert-to-fp16 \ 103 | -mos=${VAE_MOS} -ols=${VAE_OLS} \ 104 | -m=./onnx_files/vae_decoder/model_fixed_128.onnx \ 105 | -onnx-define-symbol=batch_size,${BATCH_SIZE} \ 106 | -stats-batchsize=${BATCH_SIZE} \ 107 | -onnx-define-symbol=num_channels_latent,${LATENT_CHANNELS} \ 108 | -onnx-define-symbol=height_latent,${LATENT_HEIGHT} \ 109 | -onnx-define-symbol=width_latent,${LATENT_WIDTH} \ 110 | -aic-num-cores=${NUM_CORES} \ 111 | -aic-enable-depth-first -aic-depth-first-mem=32 \ 112 | -aic-binary-dir=${BINARY_FOLDER}vae_decoder \ 113 | 2>&1 | ts > ${LOG_FOLDER}vae_decoder.log & 114 | 115 | ######################################################################################################################## 116 | 117 | echo Waiting for qaic-exec processes to finish ... 118 | wait 119 | 120 | -------------------------------------------------------------------------------- /samples/python/aws_ai100_benchmarking/parse_latency_and_throughput.py: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 | # @@-COPYRIGHT-START-@@ 3 | # 4 | # Copyright (c) 2023, Qualcomm Technologies, Inc. All Rights Reserved. 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are met: 7 | # 8 | # 1. 
Redistributions of source code must retain the above copyright notice, 9 | # this list of conditions and the following disclaimer. 10 | # 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # 3. Neither the name of the copyright holder nor the names of its contributors 16 | # may be used to endorse or promote products derived from this software 17 | # without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 23 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | # POSSIBILITY OF SUCH DAMAGE. 30 | # 31 | # SPDX-License-Identifier: BSD-3-Clause 32 | # 33 | # @@-COPYRIGHT-END-@@ 34 | ############################################################################## 35 | 36 | 37 | import os 38 | import sys 39 | from glob import glob 40 | import pandas as pd 41 | import numpy as np 42 | 43 | 44 | def get_metric(series, method): 45 | ''' 46 | This functions computes the average or percentile for a pandas.Series object 47 | ''' 48 | if method == 'mean' or method == 'avg': 49 | return series.mean() 50 | elif method.endswith('pct'): 51 | prctile = int(method.replace('pct', ''))/100 52 | return series.quantile(prctile) 53 | return None 54 | 55 | 56 | def get_latency(latency_logs, latency_method): 57 | ''' 58 | This function computes the latency from the profiling latency 59 | text files, using the latency_method specified 60 | ''' 61 | df = pd.concat([pd.read_csv(filename, skiprows=4) 62 | for filename in latency_logs]) 63 | col = df.columns[-3] # Execution Total Time in microseconds 64 | latency_ms = get_metric(df[col], latency_method)/1000.0 65 | return latency_ms 66 | 67 | 68 | if __name__ == "__main__": 69 | if len(sys.argv) < 3: 70 | print("Syntax: python parse_latency_and_throughput.py ") 71 | print("where is 'mean', 'avg', or 'Kpct', where K is a number between 0 to 100") 72 | print(" should include full path to the model folder where 'outputFiles' and log files are located") 73 | sys.exit() 74 | 75 | latency_method = sys.argv[1] 76 | if (latency_method not in ['mean', 'avg']) and (not latency_method.endswith('pct')): 77 | raise ValueError(f"Methods supported are mean/avg or pct, received {latency_method}") 78 | model_names = sys.argv[2:] 79 | print(model_names) 80 | 81 | # parse the logs and print the latency and throughput 82 | for config in ['best-throughput', 'balanced', 'best-latency']: 83 | 84 | print("******************************************************************") 85 | print(f"*** Latency: {config} configurations **************************") 86 | print("******************************************************************") 87 | for model in model_names: 88 | config_folders = 
glob(f"{model}/outputFiles/fp16*{config}") 89 | print(f"{model}: Found {len(config_folders)} {config} configurations") 90 | if len(config_folders) == 0: 91 | continue 92 | latency_logs = glob(f"{config_folders[0]}/*latency.txt") 93 | print(f"Model: {model}: Latency ({latency_method}) = {get_latency(latency_logs, latency_method):.3f} ms") 94 | 95 | print("******************************************************************") 96 | print(f"*** Throughput: {config} configurations *************************") 97 | print("******************************************************************") 98 | for model in model_names: 99 | log_file = f"{model}/{config}.log" 100 | if not os.path.exists(log_file): 101 | print("Model: {model}: {log_file} does not exist") 102 | continue 103 | with open(log_file, 'r') as fid: 104 | throughput = np.double([line.split()[-1] 105 | for line in fid.read().splitlines() 106 | if 'Inf/Sec' in line][-1]) 107 | print(f"Model: {model}: Throughput = {throughput:.3f} inf/sec") 108 | print("******************************************************************") 109 | -------------------------------------------------------------------------------- /models/vision/detection/README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | --- 3 | 4 | Download the yolov5, and yolov7 models, prepare for the Qualcomm AIC100, compile for high-thoughput, min-latency, or balanced throughput with fp16 precision, run the model on a generated random sample, and obtain the benchmarking results and output values. 5 | 6 | ## Source of the models 7 | --- 8 | The models are downloaded from (https://github.com/ultralytics/yolov5). This script has been tested for the following requested models: 9 | * yolov5s 10 | * yolov5m 11 | * yolov5l 12 | * yolov5x 13 | * yolov7-e6e 14 | * yolov8m 15 | 16 | ## Virtual environment 17 | --- 18 | For a quick environment setup: 19 | 20 | ```commandline 21 | python3.10 -m venv det_env 22 | source det_env/bin/activate 23 | 24 | ``` 25 | 26 | ## Framework and version 27 | --- 28 | ```commandline 29 | pip3 install -r requirements.txt 30 | 31 | ``` 32 | ## Syntax 33 | --- 34 | Copy the run_yolo_model.py and the lut_yolo_models.csv to a working directory. Pick a MODEL_NAME from the list above, and type: 35 | 36 | ```commandline 37 | 38 | usage: run_yolo_model.py [-h] --model-name {yolov5s,yolov5m,yolov5l,yolov5x,yolov7-e6e,yolov8m} 39 | [--objective {best-latency,best-throughput,balanced}] 40 | [--opset OPSET] 41 | [--batch-size BATCH_SIZE] 42 | [--image-size IMAGE_SIZE] 43 | [--cores {1,2,3,4,5,6,7,8,9,10,11,12,13,14}] 44 | [--instances {1,2,3,4,5,6,7,8,9,10,11,12,13,14}] 45 | [--ols {1,2,3,4,5,6,7,8,9,10,11,12,13,14}] 46 | [--mos {1,2,3,4,5,6,7,8,9,10,11,12,13,14}] 47 | [--set-size {1,2,3,4,5,6,7,8,9,10}] 48 | [--extra EXTRA] 49 | [--time TIME] 50 | [--device {0,1,2,3,4,5,6,7}] 51 | [--run-only] 52 | 53 | 54 | 55 | Download, Compile, and Run YOLO models on randomly generated inputs. 56 | 57 | 58 | optional arguments: 59 | -h, --help show this help message and exit 60 | --model-name, -m {yolov5s,yolov5m,yolov5l,yolov5x,yolov7-e6e,yolov8m} 61 | Model name to download. 62 | --objective, -o {best-latency,best-throughput,balanced} 63 | Running for best-latency, best-throughput, or balanced 64 | --opset OPSET ONNX opset. Default <12> 65 | --batch-size, -b BATCH_SIZE 66 | Sample input batch size. Default <1>. 67 | --image-size, -s IMAGE_SIZE 68 | Sample input image width/height. Default <640>. 
69 | --cores, -c {1,2,3,4,5,6,7,8,9,10,11,12,13,14} 70 | Number of AIC100 cores to compile the model for. Default <2> 71 | --instances, -i {1,2,3,4,5,6,7,8,9,10,11,12,13,14} 72 | Number of model instances to run on AIC100. Default <7> 73 | --ols {1,2,3,4,5,6,7,8,9,10,11,12,13,14} 74 | Overlap split factor. Default <1> 75 | --mos {1,2,3,4,5,6,7,8,9,10,11,12,13,14} 76 | Maximum output channel split. Default 77 | --set-size {1,2,3,4,5,6,7,8,9,10} 78 | Set size. Default <10> 79 | --extra EXTRA Extra compilation arguments. 80 | --time TIME Duration (in seconds) for which to submit inferences. Default <20> 81 | --device, -d {0,1,2,3,4,5,6,7} 82 | AIC100 device ID. Default <0> 83 | --run-only, -r Performs the inference only, without re-exporting and re-compiling the model 84 | --include-nms Run the model preparator tool to optimize the graph, and to add the Post Processing to supported models. Details on model preparator tool here- https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Inference-Workflow/Export-the-model/Prepare-the-model/ 85 | 86 | 87 | ``` 88 | Examples: 89 | ```commandline 90 | python run_yolo_model.py -m yolov5s -o best-throughput 91 | ``` 92 | ```commandline 93 | python run_yolo_model.py -m yolov5m -o balanced 94 | ``` 95 | ```commandline 96 | python run_yolo_model.py -m yolov5x -o best-throughput 97 | ``` 98 | 99 | The hardware configuration will be either associated to the corresponding row in the lut_yolo_models.csv or to defualt values if not specified by the user. If the MODEL_NAME is not included in the lut_yolo_models.csv, default values will be used. 100 | 101 | After download, compile, and run is complete, the working directory of the selected model looks as follows. 102 | # Working directory structure 103 | ``` 104 | |── model # Contains the onnx file of the picked model 105 | | └── model.onnx # The onnx file of the picked model 106 | |── inputFiles # Contains the (randomly generated) input files of the compiled model 107 | │ └── input_img*.raw # Randomly generated input files of the compiled model 108 | |── outputFiles # Contains the corresponding output to input, as well as the hardware profiling for latency 109 | │ └── fp16* 110 | │ └── output-act*.bin # Corresponding output to the randomly generated input_img*.raw 111 | │ └── aic-profil*.bin # The hardware profiling for round trip latency between host and device for each inference 112 | ├── compiled-bin* # Compilation path 113 | │ └── programqpc.bin # For the selected objective, the model.onnx is compiled into programqpc.bin 114 | ├── list*.txt # A list that contains path to the inputs. Can be used as input to qaic-runner 115 | ├── commands*.txt # Includes necessary compilation and running scripts to reproduce the results manually. 116 | 117 | ``` 118 | To manually resproduce the results, navigate to the working directory, select the qaic compile/run commands from the command*.txt and run them in the terminal. 119 | -------------------------------------------------------------------------------- /utils/multi-device/README.md: -------------------------------------------------------------------------------- 1 | # Multi Device 2 | 3 | This guide provides setup instructions for multi-device enablement. PCIe peer-to-peer P2P communication must be enabled to allow efficient tensor slicing across multiple Cloud AI devices (SoCs and Cards). 
4 | 5 | Refer to [Model Sharding](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Features/model_sharding/) for more information on recommended PCIe topologies for tensor slicing (P2P). 6 | 7 | ## Pre-requisites 8 | 9 | - Server with Platform and APPS SDK versions >= 1.17 installed. 10 | - PCIe switch for inter-card P2P communication 11 | - python3 -m pip install pyudev 12 | 13 | ## Setup instructions 14 | 15 | Platform SDK 1.18 and later offers an option (`--setup_mdp all`) to enable P2P for the multi-device partitioning (tensor slicing) feature during installation. 16 | 17 | Example: 18 | 19 | ``` 20 | cd /x86_64/deb 21 | sudo bash install.sh --setup_mdp all 22 | ``` 23 | 24 | > [!IMPORTANT] 25 | > If P2P is enabled via the Platform SDK installer then skip to the [Testing P2P](#testing-p2p) section. 26 | > 27 | > The remaining steps in this section show manual steps for enabling P2P. 28 | 29 | ### Disable PCIe ACS for P2P communication between cards 30 | 31 | 1. Run `QAicChangeAcs.py` without any flags to display a hierarchical view of PCI bridges and AI 100 devices. 32 | 33 | ``` 34 | $ python3 QAicChangeAcs.py 35 | Found the following AIC100 devices: 36 | Root 37 | ----0000:30:01.1 <-- Host system PCIe switch, script will disable ACS here 38 | --------0000:31:00.0 <-- Ultra AI 100 onboard PCIe switch, script will disable ACS here 39 | ------------0000:32:03.0 40 | ----------------0000:36:00.0 [Qualcomm AIC100] 41 | ------------0000:32:02.0 42 | ----------------0000:35:00.0 [Qualcomm AIC100] 43 | ------------0000:32:00.0 44 | ----------------0000:38:00.0 [Qualcomm AIC100] 45 | ------------0000:32:01.0 46 | ----------------0000:39:00.0 [Qualcomm AIC100] 47 | --------0000:21:00.0 <-- Ultra AI 100 onboard PCIe switch, script will disable ACS here 48 | ------------0000:22:00.0 49 | ----------------0000:23:00.0 [Qualcomm AIC100] 50 | ------------0000:22:02.0 51 | ----------------0000:25:00.0 [Qualcomm AIC100] 52 | ------------0000:22:01.0 53 | ----------------0000:27:00.0 [Qualcomm AIC100] 54 | ------------0000:22:03.0 55 | ----------------0000:28:00.0 [Qualcomm AIC100] 56 | ``` 57 | 58 | 2. Run `QAicChangeAcs.py all` to disable ACS on all the downstream ports (on the PCIe switch) that connect to AI 100 devices, as well as the PCIe switch downstream ports that connect to the PCIe switch onboard the AI 100 cards. This command will enable P2P between the AI 100 devices (SoCs) on the same card as well as card to card. 59 | 60 | 3. Users can optionally disable ACS selectively by running `QAicChangeAcs.py SSSS:BB:DD.F`, where 61 | - SSSS = 4-digit segment number 62 | - BB = 2-digit bus number 63 | - DD = 2-digit device number 64 | - F = 1-digit function number 65 | 66 | of the nearest common ancestor PCI bridge under which ACS needs to be disabled. 67 | 68 | Examples: 69 | 70 | `$ python3 QAicChangeAcs.py 0000:31:00.0` will disable ACS on the first set of AI 100 devices (0000:36:00.0, 0000:35:00.0, 0000:38:00.0 and 0000:39:00.0).
71 | `$ python3 QAicChangeAcs.py 0000:30:01.1` will disable ACS across both the AI 100 Ultra cards as well as the 4 devices in each AI 100 card
72 | 73 | 4. The above steps need to be repeated after every server power cycle. 74 | 75 | 76 | ### Enable multi-device partitioning (MDP) 77 | 78 | This step is required every time a new version of the Platform SDK is installed. 79 | 80 | First, check that the Qaic Monitor service is running: 81 | ``` 82 | sudo systemctl status qmonitor-proxy 83 | ``` 84 | 85 | If it is not active (running), start it with: 86 | ``` 87 | sudo systemd-run --unit=qmonitor-proxy /opt/qti-aic/tools/qaic-monitor-grpc-server 88 | ``` 89 | 90 | Next, enable MDP across all Cloud AI devices in the server: 91 | ``` 92 | sudo /opt/qti-aic/tools/qaic-monitor-json -i enable_mdp.json 93 | ``` 94 | 95 | Reset the Cloud AI devices for the changes to take effect: 96 | ``` 97 | sudo /opt/qti-aic/tools/qaic-util -s 98 | ``` 99 | 100 | ## Testing P2P 101 | 102 | The Qaic Kernel driver requires a longer response timeout for P2P workloads. Use the following command to increase the timeout: 103 | ``` 104 | sudo sh -c 'echo 2600 > /sys/module/qaic/parameters/control_resp_timeout_s' 105 | ``` 106 | 107 | Synthetic P2P workloads are available in `/opt/qti-aic/test-data/aic100/v2/qaic-compute-networks/factory-workload-bin`. 108 | 109 | ### Multi-SoC Accelerators (Ultra) P2P tests 110 | 111 | ``` 112 | # P2P between 2 SoCs with QID 0 and 1 on the same card 113 | sudo /opt/qti-aic/exec/qaic-runner -t /opt/qti-aic/test-data/aic100/v2/qaic-compute-networks/factory-workload-bin/2c-p2p-bw -n 10 -a 1 -l -D 0:1 114 | 115 | # P2P between 2 SoCs with QID 0 and 4 on different cards. Choose cards that are on the same PCIe switch. 116 | sudo /opt/qti-aic/exec/qaic-runner -t /opt/qti-aic/test-data/aic100/v2/qaic-compute-networks/factory-workload-bin/2c-p2p-bw -n 10 -a 1 -l -D 0:4 117 | ``` 118 | 119 | ### Single-SoC Accelerators (Standard/Pro) P2P tests 120 | 121 | ``` 122 | # P2P between 2 SoCs with QID 0 and 1 on different cards. Choose cards that are on the same PCIe switch. 123 | sudo /opt/qti-aic/exec/qaic-runner -t /opt/qti-aic/test-data/aic100/v2/qaic-compute-networks/factory-workload-bin/2c-p2p-bw -n 10 -a 1 -l -D 0:1 124 | ``` 125 | 126 | ### Troubleshooting 127 | If a `Failed to access P2P device` error occurs, check the following: 128 | 1. Re-check the enablement instructions above 129 | 2. Review the PCIe topology from the QAicChangeAcs.py script to make sure that a host PCIe switch is present 130 | -------------------------------------------------------------------------------- /utils/qaic-bench/README.md: -------------------------------------------------------------------------------- 1 | # qaic-bench 2 | 3 | Benchmarking script for Cloud AI Inference accelerators. 4 | 5 | ## Installation for x86_64 6 | 7 | Download the Cloud AI Docker image: 8 | 9 | ``` 10 | docker pull ghcr.io/quic/cloud_ai_inference_ubuntu22:1.20.4.0 11 | ``` 12 | 13 | Start the container. This example maps 4 Cloud AI 100 Ultra Accelerators. Each accelerator has 4 SoC devices. 14 | 15 | Note: For QPC generation, choose a `/cache` location with 1TB or more of free space to hold model weights, ONNX files, and QPC model binaries. 16 | 17 | Note: Run `docker container rm qaic-bench` to clean up after exiting the container. 
18 | 19 | ``` 20 | cd utils/qaic-bench 21 | 22 | docker run -it \ 23 | --workdir /app \ 24 | --name qaic-bench \ 25 | --network host \ 26 | --mount type=bind,source=${PWD},target=/app \ 27 | --mount type=bind,source=${HOME}/.cache,target=/cache \ 28 | --env HF_HOME='/cache/huggingface' \ 29 | --env QEFF_HOME='/cache/qeff_models' \ 30 | --env XDG_CACHE_HOME='/cache' \ 31 | --device=/dev/accel/accel0 \ 32 | --device=/dev/accel/accel1 \ 33 | --device=/dev/accel/accel2 \ 34 | --device=/dev/accel/accel3 \ 35 | --device=/dev/accel/accel4 \ 36 | --device=/dev/accel/accel5 \ 37 | --device=/dev/accel/accel6 \ 38 | --device=/dev/accel/accel7 \ 39 | --device=/dev/accel/accel8 \ 41 | --device=/dev/accel/accel9 \ 42 | --device=/dev/accel/accel10 \ 43 | --device=/dev/accel/accel11 \ 44 | --device=/dev/accel/accel12 \ 45 | --device=/dev/accel/accel13 \ 46 | --device=/dev/accel/accel14 \ 47 | --device=/dev/accel/accel15 \ 48 | ghcr.io/quic/cloud_ai_inference_ubuntu22:1.20.4.0 49 | ``` 50 | 51 | Activate vLLM environment:
52 | 53 | ``` 54 | source /opt/vllm-env/bin/activate 55 | ``` 56 | 57 | ## Installation for AArch64 58 | 59 | Follow instructions [here](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/vLLM/vLLM/index.html#installing-from-source) to setup the vLLM environment for Cloud AI from source for AArch64. 60 | 61 | Activate vLLM environment:
62 | 63 | ``` 64 | source qaic-vllm-venv/bin/activate 65 | ``` 66 | 67 | ## KV-Heads Replication 68 | 69 | Download KV-Heads Replication script from Efficient Transformers. This is needed to efficiently tensor-slice large models across 16 SoCs. 70 | 71 | ``` 72 | wget https://github.com/quic/efficient-transformers/raw/refs/heads/release/v1.19.3_fp8_update/scripts/replicate_kv_head/replicate_kv_heads.py 73 | ``` 74 | 75 | ## Multi-Device Operation 76 | 77 | To run models across multiple AI 100 devices, make sure tensor slicing is enabled with: 78 | 79 | ``` 80 | sudo /opt/qti-aic/tools/qaic-util -a 81 | ``` 82 | 83 | The control response timeout must also be extended: 84 | 85 | ``` 86 | sudo sh -c 'echo 2600 > /sys/module/qaic/parameters/control_resp_timeout_s' 87 | ``` 88 | 89 | More details at: https://github.com/quic/cloud-ai-sdk/tree/1.20/utils/multi-device 90 | 91 | ## Hugging Face Access Token 92 | 93 | Some models on Hugging Face are access protected. Add your access token with the `--hf_token` script argument or set the `HF_TOKEN` environment variable. Learn more about Authentication here: https://huggingface.co/docs/huggingface_hub/en/quick-start#authentication. 94 | 95 | ## Usage 96 | 97 | Example: 98 | 99 | ``` 100 | python3 qaic_bench.py config/config_llama_3_1_8b.json 101 | ``` 102 | 103 | Details: 104 | 105 | ``` 106 | usage: qaic_bench.py [-h] [--devices DEVICES] [--compile-only] config 107 | 108 | positional arguments: 109 | config JSON file with model configurations 110 | 111 | options: 112 | -h, --help show this help message and exit 113 | --devices DEVICES List of comma separated device IDs to use for inferencing 114 | --compile-only Generate QPCs and skip benchmarking 115 | --hf_token Hugging Face access token 116 | ``` 117 | 118 | ## Configuration 119 | 120 | ### Example 121 | 122 | ``` 123 | { 124 | "vllm_root": "/opt/qti-aic/integrations/vllm", 125 | 126 | "models": [ 127 | { 128 | "name": "Meta-Llama-3.1-8B-Instruct", 129 | "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", 130 | "configs": [ 131 | { 132 | "batch_size": 1, 133 | "devices": 4, 134 | "prompt_len": 4096, 135 | "generation_len": 4096 136 | } 137 | ] 138 | } 139 | ] 140 | } 141 | ``` 142 | 143 | ### JSON Reference 144 | 145 | | Property | Description | 146 | | -------------- | ----------------------------------------- | 147 | | vllm_root | Path to full vLLM installation | 148 | | models | List of models to benchmark | 149 | 150 | ### Model Properties 151 | 152 | | Property | Description | 153 | | -------------- | ----------------------------------------- | 154 | | name | Model friendly name | 155 | | model | Hugging Face model path | 156 | | configs | List of model configurations to benchmark | 157 | 158 | ### Config Properties 159 | 160 | | Property | Description | 161 | | ---------------- | ----------------------------------------- | 162 | | batch_size | Model batch size. | 163 | | devices | Number of Cloud AI SoCs for tensor-sliced execution. Set to 1 for single-SoC execution. | 164 | | cores (optional) | Number of AI Cores for compilation. Default 16. | 165 | | prompt_len | Prompt input length | 166 | | generation_len | Max number of output tokens to generate | 167 | | qpc (optional) | Path to pre-generated QPC binary. If not specified, QPC will be generated. 
| 168 | -------------------------------------------------------------------------------- /tutorials/open-webui/README.md: -------------------------------------------------------------------------------- 1 | # Connecting Cloud AI models to Open WebUI 2 | 3 | [Open WebUI](https://github.com/open-webui/open-webui) is a self-hosted web interface for AI use-cases like Chat, Image Generation and RAG. 4 | By starting OpenAI-compatible endpoints with vLLM, we can connect Open WebUI to AI models running on Qualcomm Cloud AI accelerators. 5 | 6 |

7 | ![Open WebUI Chat](open_webui_screen_1.png) 8 |

9 | 10 | ## Pre-requisites 11 | 12 | * Cloud AI Platform and Apps SDKs [Installation](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/Cloud-AI-SDK/Cloud-AI-SDK/index.html) 13 | * Cloud AI 100 Ultra accelerator card 14 | * Python 3.10 15 | * Docker 16 | 17 | To run language models on multiple SoCs, make sure [tensor slicing](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Features/model_sharding/index.html) is enabled and disable ACS: 18 | 19 | ``` 20 | sudo /opt/qti-aic/tools/qaic-util -a 21 | ``` 22 | 23 | Increase the response timeout: 24 | ``` 25 | sudo sh -c "echo 2600 > /sys/module/qaic/parameters/control_resp_timeout_s" 26 | ``` 27 | 28 | Preface all docker commands with `sudo`, or add yourself to the docker group: 29 | ``` 30 | sudo usermod -aG docker $USER 31 | ``` 32 | 33 | Launch a new shell or `newgrp docker` to apply the changes. 34 | 35 | ## Prepare the model 36 | 37 | Use [Efficient Transformers](https://github.com/quic/efficient-transformers) to prepare popular models like Llama-3.3-70B-Instruct, Qwen2.5-Coder and Phi4, or download pre-generated model binaries at http://qualcom-qpc-models.s3-website-us-east-1.amazonaws.com/QPC/. Note the location of the 'programqpc.bin' files as you'll need these to start vLLM. Efficient-transformers stores model binaries in [~/.cache/qeff_cache](https://quic.github.io/efficient-transformers/source/quick_start.html#transformed-models-and-qpc-storage) by default. 38 | 39 | ## Cloud AI Inference Container 40 | 41 | [Cloud AI Inference containers](https://github.com/quic/cloud-ai-containers/pkgs/container/cloud_ai_inference_ubuntu22) include everything needed to compile and serve models with vLLM on Cloud AI accelerators. 42 | 43 | Download the Docker image: 44 | 45 | ``` 46 | docker pull ghcr.io/quic/cloud_ai_inference_ubuntu22:1.20.4.0 47 | ``` 48 | 49 | ## Start vLLM endpoint 50 | 51 | Prepare a script to launch vLLM with the pre-generated model binary inside the container. 52 | 53 | Customize the Hugging face model name (`--model`), context length (`--max-model-len`), prompt length (`max-seq_len-to-capture`) and full batch size (`max-num-seq`) to match the QPC from the 'Prepare the Model' step above. 54 | 55 | ``` 56 | $ cat < serve.sh 57 | #!/bin/bash 58 | /opt/vllm-env/bin/python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4 --max-model-len 4096 --max-num-seq 1 --max-seq_len-to-capture 128 --device qaic --device-group 0,1,2,3 59 | EOF 60 | 61 | # Script must have execute permission 62 | $ chmod +x serve.sh 63 | ``` 64 | 65 | Note: Change `/path/to/qpc` to the QPC location from the 'Prepare the model' step above. 66 | If your system has multiple Ultra cards, you can change the `--device` arguments to map a different card. 67 | This example creates a `qaic-vllm` Docker volume to hold persistent data (namely the tokenizer weights downloaded from Hugging face). 
68 | 69 | ``` 70 | docker run -dit \ 71 | --workdir /model \ 72 | --name qaic-vllm \ 73 | --network host \ 74 | --mount type=bind,source=${PWD}/serve.sh,target=/model/serve.sh \ 75 | --mount type=bind,source=/path/to/qpc,target=/model/qpc \ 76 | -v qaic-vllm:/model/data \ 77 | --env VLLM_QAIC_MAX_CPU_THREADS=8 \ 78 | --env VLLM_QAIC_QPC_PATH=/model/qpc \ 79 | --env HF_HOME=/model/data/huggingface \ 80 | --env QEFF_HOME=/model/data/qeff_models \ 81 | --device=/dev/accel/accel0 \ 82 | --device=/dev/accel/accel1 \ 83 | --device=/dev/accel/accel2 \ 84 | --device=/dev/accel/accel3 \ 85 | --entrypoint=/model/serve.sh \ 86 | ghcr.io/quic/cloud_ai_inference_ubuntu22:1.20.4.0 87 | ``` 88 | 89 | ## Test the endpoint 90 | 91 | ``` 92 | curl http://localhost:8000/v1/chat/completions \ 93 | -H "Content-Type: application/json" \ 94 | -H "Authorization: Bearer test-key" \ 95 | -d '{ 96 | "model": "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4", 97 | "messages": [ 98 | { 99 | "role": "system", 100 | "content": "You are a helpful AI assistant." 101 | }, 102 | { 103 | "role": "user", 104 | "content": "Hello!" 105 | } 106 | ] 107 | }' 108 | ``` 109 | 110 | ## Start Open WebUI 111 | 112 | Download Open WebUI Docker image: 113 | 114 | ``` 115 | docker pull ghcr.io/open-webui/open-webui:main 116 | ``` 117 | 118 | Refer to [setup instructions](https://docs.openwebui.com/getting-started/quick-start/#quick-start-with-docker-) for more details. 119 | 120 | Run the Open WebUI container: 121 | 122 | ``` 123 | docker run \ 124 | -d \ 125 | --network host \ 126 | -e OPENAI_API_KEY=test-key \ 127 | -e OPENAI_API_BASE_URL="http://localhost:8000/v1" \ 128 | -v open-webui:/app/backend/data \ 129 | --name open-webui \ 130 | --restart always \ 131 | ghcr.io/open-webui/open-webui:main 132 | ``` 133 | 134 | In web browser, open http://:8080 135 | 136 | Setup: 137 | * For first time startup, create a default user. This user will have admin access. 138 | * Click Profile icon in upper right and open Admin Panel -> Settings -> Connections. 139 | * Click Configure icon for Manage OpenAI API Connections. 140 | * Make sure URL is http://localhost:8000/v1. Key can be any value 141 | * Click Verify Connection icon to test the connection. 142 | * You should see a "Server Connection Verified" pop-up 143 | * If it fails, double-check that the server.py script is running 144 | * Back on the Open WebUI home page, select the model name from the 'Prepare the model' step above. 145 | 146 | Open WebUI Setup 147 | 148 | You can now use the Chat interface in Open WebUI. -------------------------------------------------------------------------------- /models/multimodal/text_to_image/stable-diffusion-3.5-medium/patches/attention_patch.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py 2 | index e2ab160..6036c3a 100644 3 | --- a/src/diffusers/models/attention_processor.py 4 | +++ b/src/diffusers/models/attention_processor.py 5 | @@ -11,6 +11,10 @@ 6 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7 | # See the License for the specific language governing permissions and 8 | # limitations under the License. 9 | +# 10 | +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
11 | +# SPDX-License-Identifier: BSD-3-Clause-Clear 12 | +# Not a Contribution 13 | import inspect 14 | import math 15 | from typing import Callable, List, Optional, Tuple, Union 16 | @@ -258,9 +262,7 @@ class Attention(nn.Module): 17 | # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention 18 | # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1 19 | if processor is None: 20 | - processor = ( 21 | - AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor() 22 | - ) 23 | + processor = AttnProcessor() 24 | self.set_processor(processor) 25 | 26 | def set_use_npu_flash_attention(self, use_npu_flash_attention: bool) -> None: 27 | @@ -560,7 +562,7 @@ class Attention(nn.Module): 28 | 29 | if attention_mask is None: 30 | baddbmm_input = torch.empty( 31 | - query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device 32 | + query.shape[0], query.shape[1], key.shape[2], dtype=query.dtype, device=query.device 33 | ) 34 | beta = 0 35 | else: 36 | @@ -570,7 +572,7 @@ class Attention(nn.Module): 37 | attention_scores = torch.baddbmm( 38 | baddbmm_input, 39 | query, 40 | - key.transpose(-1, -2), 41 | + key, 42 | beta=beta, 43 | alpha=self.scale, 44 | ) 45 | @@ -764,8 +766,25 @@ class AttnProcessor: 46 | key = attn.head_to_batch_dim(key) 47 | value = attn.head_to_batch_dim(value) 48 | 49 | - attention_probs = attn.get_attention_scores(query, key, attention_mask) 50 | - hidden_states = torch.bmm(attention_probs, value) 51 | + key = key.transpose(-1, -2) 52 | + if query.size(-2) != value.size(-2): # cross-attention, use regular attention 53 | + # QKV done in single block 54 | + attention_probs = attn.get_attention_scores(query, key, attention_mask) 55 | + hidden_states = torch.bmm(attention_probs, value) 56 | + else: # self-attention, use blocked attention 57 | + # QKV done with block-attention (a la FlashAttentionV2) 58 | + print(f"{query.shape = }, {key.shape = }, {value.shape = }") 59 | + query_block_size = 64 60 | + query_seq_len = query.size(-2) 61 | + num_blocks = (query_seq_len + query_block_size - 1) // query_block_size 62 | + for qidx in range(num_blocks): 63 | + query_block = query[:,qidx*query_block_size:(qidx+1)*query_block_size,:] 64 | + attention_probs = attn.get_attention_scores(query_block, key, attention_mask) 65 | + hidden_states_block = torch.bmm(attention_probs, value) 66 | + if qidx == 0: 67 | + hidden_states = hidden_states_block 68 | + else: 69 | + hidden_states = torch.cat((hidden_states, hidden_states_block), -2) 70 | hidden_states = attn.batch_to_head_dim(hidden_states) 71 | 72 | # linear proj 73 | @@ -1075,15 +1094,31 @@ class JointAttnProcessor2_0: 74 | key = torch.cat([key, encoder_hidden_states_key_proj], dim=1) 75 | value = torch.cat([value, encoder_hidden_states_value_proj], dim=1) 76 | 77 | - inner_dim = key.shape[-1] 78 | - head_dim = inner_dim // attn.heads 79 | - query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 80 | - key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 81 | - value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) 82 | + query = attn.head_to_batch_dim(query) 83 | + key = attn.head_to_batch_dim(key) 84 | + value = attn.head_to_batch_dim(value) 85 | 86 | - hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False) 87 | - hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * 
head_dim) 88 | - hidden_states = hidden_states.to(query.dtype) 89 | + # pre-transpose the key 90 | + key = key.transpose(-1, -2) 91 | + if query.size(-2) != value.size(-2): # cross-attention, use regular attention 92 | + # QKV done in single block 93 | + attention_probs = attn.get_attention_scores(query, key, attention_mask) 94 | + hidden_states = torch.bmm(attention_probs, value) 95 | + else: # self-attention, use blocked attention 96 | + # QKV done with block-attention (a la FlashAttentionV2) 97 | + print(f"{query.shape = }, {key.shape = }, {value.shape = }") 98 | + query_block_size = 64 99 | + query_seq_len = query.size(-2) 100 | + num_blocks = (query_seq_len + query_block_size - 1) // query_block_size 101 | + for qidx in range(num_blocks): 102 | + query_block = query[:,qidx*query_block_size:(qidx+1)*query_block_size,:] 103 | + attention_probs = attn.get_attention_scores(query_block, key, attention_mask) 104 | + hidden_states_block = torch.bmm(attention_probs, value) 105 | + if qidx == 0: 106 | + hidden_states = hidden_states_block 107 | + else: 108 | + hidden_states = torch.cat((hidden_states, hidden_states_block), -2) 109 | + hidden_states = attn.batch_to_head_dim(hidden_states) 110 | 111 | # Split the attention outputs. 112 | hidden_states, encoder_hidden_states = ( 113 | --------------------------------------------------------------------------------