├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── lint-format.yml │ └── quic-organization-repolinter.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CODE-OF-CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── QEfficient ├── __init__.py ├── base │ ├── __init__.py │ ├── common.py │ ├── modeling_qeff.py │ ├── onnx_transforms.py │ └── pytorch_transforms.py ├── cloud │ ├── __init__.py │ ├── compile.py │ ├── execute.py │ ├── export.py │ ├── finetune.py │ └── infer.py ├── compile │ ├── __init__.py │ ├── compile_helper.py │ ├── qnn_compiler.py │ └── qnn_config.json ├── customop │ ├── __init__.py │ ├── ctx_scatter_gather.py │ ├── ctx_scatter_gather_cb.py │ ├── matmulnbits.py │ └── rms_norm.py ├── exporter │ ├── __init__.py │ ├── export_hf_to_cloud_ai_100.py │ └── export_utils.py ├── finetune │ ├── __init__.py │ ├── configs │ │ ├── __init__.py │ │ ├── dataset_config.py │ │ ├── peft_config.py │ │ └── training.py │ ├── data │ │ ├── __init__.py │ │ └── sampler.py │ ├── dataset │ │ ├── __init__.py │ │ ├── alpaca_dataset.py │ │ ├── custom_dataset.py │ │ ├── dataset_config.py │ │ ├── grammar_dataset.py │ │ ├── gsm8k_dataset.py │ │ ├── imdb_dataset.py │ │ └── samsum_dataset.py │ ├── eval.py │ └── utils │ │ ├── __init__.py │ │ ├── config_utils.py │ │ ├── dataset_utils.py │ │ ├── plot_metrics.py │ │ └── train_utils.py ├── generation │ ├── __init__.py │ ├── cloud_infer.py │ └── text_generation_inference.py ├── peft │ ├── __init__.py │ ├── auto.py │ ├── lora │ │ ├── __init__.py │ │ ├── auto.py │ │ ├── layers.py │ │ ├── lora_model.py │ │ └── pytorch_transforms.py │ ├── onnx_transforms.py │ ├── peft_model.py │ └── pytorch_transforms.py ├── transformers │ ├── __init__.py │ ├── cache_utils.py │ ├── modeling_attn_mask_utils.py │ ├── modeling_utils.py │ ├── models │ │ ├── __init__.py │ │ ├── codegen │ │ │ ├── __init__.py │ │ │ └── modeling_codegen.py │ │ ├── falcon │ │ │ ├── __init__.py │ │ │ └── modeling_falcon.py │ │ ├── gemma │ │ │ ├── __init__.py │ │ │ └── modeling_gemma.py │ │ ├── gemma2 │ │ │ ├── __init__.py │ │ │ └── modeling_gemma2.py │ │ ├── gpt2 │ │ │ ├── __init__.py │ │ │ └── modeling_gpt2.py │ │ ├── gpt_bigcode │ │ │ ├── __init__.py │ │ │ └── modeling_gpt_bigcode.py │ │ ├── gptj │ │ │ ├── __init__.py │ │ │ └── modeling_gptj.py │ │ ├── granite │ │ │ ├── __init__.py │ │ │ └── modeling_granite.py │ │ ├── granitemoe │ │ │ ├── __init__.py │ │ │ └── modeling_granitemoe.py │ │ ├── internvl │ │ │ ├── __init__.py │ │ │ └── modeling_internvl.py │ │ ├── llama │ │ │ ├── __init__.py │ │ │ └── modeling_llama.py │ │ ├── llama_swiftkv │ │ │ ├── __init__.py │ │ │ └── modeling_llama_swiftkv.py │ │ ├── llava │ │ │ ├── __init__.py │ │ │ └── modeling_llava.py │ │ ├── llava_next │ │ │ ├── __init__.py │ │ │ └── modeling_llava_next.py │ │ ├── mistral │ │ │ ├── __init__.py │ │ │ └── modeling_mistral.py │ │ ├── mixtral_moe │ │ │ ├── __init__.py │ │ │ └── modeling_mixtral.py │ │ ├── mllama │ │ │ ├── __init__.py │ │ │ └── modeling_mllama.py │ │ ├── modeling_auto.py │ │ ├── mpt │ │ │ ├── __init__.py │ │ │ └── modeling_mpt.py │ │ ├── phi │ │ │ ├── __init__.py │ │ │ └── modeling_phi.py │ │ ├── phi3 │ │ │ ├── __init__.py │ │ │ └── modeling_phi3.py │ │ ├── pytorch_transforms.py │ │ ├── qwen2 │ │ │ ├── __init__.py │ │ │ └── modeling_qwen2.py │ │ ├── starcoder2 │ │ │ ├── __init__.py │ │ │ └── modeling_starcoder2.py │ │ └── whisper │ │ │ ├── __init__.py │ │ │ └── modeling_whisper.py │ ├── post_processing.py │ ├── quantizers │ │ ├── __init__.py │ │ ├── 
auto.py │ │ ├── awq.py │ │ ├── gptq.py │ │ ├── quant_transforms.py │ │ ├── quantizer_awq.py │ │ ├── quantizer_compressed_tensors.py │ │ ├── quantizer_gptq.py │ │ └── quantizer_utils.py │ ├── spd │ │ ├── __init__.py │ │ ├── spd_transform_forward.py │ │ └── turbo.py │ └── transform.py └── utils │ ├── __init__.py │ ├── _utils.py │ ├── cache.py │ ├── checkpoint_utils.py │ ├── constants.py │ ├── device_utils.py │ ├── generate_inputs.py │ ├── generate_qnn_network_specialization_config.py │ ├── logging_utils.py │ ├── model_registery.py │ ├── run_utils.py │ ├── spd_utils.py │ └── test_utils.py ├── README.md ├── docs ├── README.md ├── _static │ └── my_theme.css ├── _templates │ └── versions.html ├── conf.py ├── image │ └── Cloud_AI_100.png ├── index.md ├── requirements.txt └── source │ ├── blogs.md │ ├── cli_api.md │ ├── finetune.md │ ├── image │ ├── Cloud_AI_100.png │ └── kv_cache_cloudai100.png │ ├── installation.md │ ├── introduction.md │ ├── python_api.md │ ├── quick_start.md │ ├── reference.md │ ├── upgrade.md │ └── validate.md ├── examples ├── __init__.py ├── basic_gguf_models.py ├── cpp_execution │ ├── CMakeLists.txt │ ├── InferenceSetIOBuffer.cpp │ ├── README.md │ └── text_inference_using_cpp.py ├── draft_spd_inference.py ├── embedding_model.py ├── granite_example │ ├── granite_vision_inference.py │ └── readme.md ├── image_text_to_text_inference.py ├── intern_example │ ├── internvl_inference.py │ └── readme.md ├── lora_models.py ├── multiprojs_spd_inference.py ├── peft_models.py ├── pld_spd_inference.py ├── prompts.txt └── speech_to_text │ ├── README.md │ └── run_whisper_speech_to_text.py ├── notebooks ├── QEfficientGPT2.ipynb ├── QEfficientMPT.ipynb └── __init__.py ├── pyproject.toml ├── scripts ├── Jenkinsfile ├── __init__.py ├── finetune │ ├── __init__.py │ └── run_ft_model.py ├── perplexity_computation │ ├── README.md │ ├── __init__.py │ └── calculate_perplexity.py ├── replicate_kv_head │ ├── README.md │ ├── __init__.py │ └── replicate_kv_heads.py └── specializations.json └── tests ├── README.md ├── __init__.py ├── base ├── test_modeling_qeff.py ├── test_onnx_transforms.py └── test_pytorch_transforms.py ├── cloud ├── conftest.py ├── high_level_testing.json ├── test_compile_and_execute.py ├── test_export.py ├── test_infer.py └── test_infer_vlm.py ├── finetune └── test_finetune.py ├── peft ├── lora │ └── test_lora_model.py ├── test_peft_model.py └── test_peft_onnx_transforms.py ├── text_generation └── test_text_generation.py ├── transformers ├── models │ ├── test_causal_lm_models.py │ ├── test_embedding_models.py │ ├── test_image_text_to_text_models.py │ ├── test_prefix_caching.py │ └── test_speech_seq2seq_models.py ├── spd │ ├── test_pld_inference.py │ └── test_spd_inference.py ├── test_causal_lm.py ├── test_speech_seq2seq.py └── test_transformer_pytorch_transforms.py ├── utils └── test_cache.py └── vllm └── test_qaic_output_consistency.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | # Default owners 9 | # review when someone opens a pull request and assign appropriate reviewer 10 | * @quic-rishinr @ochougul @quic-hemagnih @quic-amitraj 11 | pyproject.toml @carlstreeter-quic 12 | 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Command Used to run / script used 16 | 2. Error details 17 | 18 | **Expected behavior** 19 | A clear and concise description of what you expected to happen. 20 | 21 | **Screenshots** 22 | If applicable, add screenshots to help explain your problem. 23 | 24 | **Environment (please complete the following information):** 25 | - OS: [e.g. iOS] 26 | - Environment details with packages version etc. 27 | - Version/Branch/Commit ID [e.g. 22] 28 | 29 | **Additional context** 30 | Add any other context about the problem here. 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/lint-format.yml: -------------------------------------------------------------------------------- 1 | name: Lint & Format 2 | on: [pull_request] 3 | jobs: 4 | lint: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v4 8 | - run: pip3 install ruff 9 | - run: ruff check 10 | env: 11 | RUFF_OUTPUT_FORMAT: github 12 | format: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - run: pip3 install ruff 17 | - run: ruff format --check 18 | env: 19 | RUFF_OUTPUT_FORMAT: github 20 | -------------------------------------------------------------------------------- /.github/workflows/quic-organization-repolinter.yml: -------------------------------------------------------------------------------- 1 | name: QuIC Organization Repolinter 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | jobs: 10 | repolinter: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout Repo 14 | uses: actions/checkout@v2 15 | - name: Verify repolinter config file is present 16 | id: check_files 17 | uses: andstor/file-existence-action@v1 18 | with: 19 | files: "repolint.json" 20 | - name: Run Repolinter with local repolint.json 21 | if: steps.check_files.outputs.files_exists == 'true' 22 | uses: todogroup/repolinter-action@v1 23 | with: 24 | config_file: "repolint.json" 25 | - name: Run Repolinter with default ruleset 26 | if: steps.check_files.outputs.files_exists == 'false' 27 | uses: todogroup/repolinter-action@v1 28 | with: 29 | config_url: "https://raw.githubusercontent.com/quic/.github/main/repolint.json" 30 | 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | .Python 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | wheels/ 20 | share/python-wheels/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | MANIFEST 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Unit test / coverage reports 33 | htmlcov/ 34 | .tox/ 35 | .nox/ 36 | .coverage 37 | .coverage.* 38 | .cache 39 | nosetests.xml 40 | coverage.xml 41 | *.cover 42 | *.py,cover 43 | .hypothesis/ 44 | .pytest_cache/ 45 | cover/ 46 | 47 | # PyBuilder 48 | .pybuilder/ 49 | target/ 50 | 51 | # Jupyter Notebook 52 | .ipynb_checkpoints 53 | 54 | # IPython 55 | profile_default/ 56 | ipython_config.py 57 | 58 | # Environments 59 | .env 60 | .venv 61 | env/ 62 | venv/ 63 | ENV/ 64 | env.bak/ 65 | venv.bak/ 66 | 67 | # Spyder project settings 68 | .spyderproject 69 | .spyproject 70 | 71 | # Rope project settings 72 | .ropeproject 73 | 74 | # mypy 75 | .mypy_cache/ 76 | .dmypy.json 77 | dmypy.json 78 | 79 | # Pyre type checker 80 | .pyre/ 81 | 82 | # pytype static type analyzer 83 | .pytype/ 84 | 85 | # Cython debug symbols 86 | cython_debug/ 87 | 88 | # Local Files 89 | cache_dir 90 | qeff_models 91 | .vscode/* 92 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | # Ruff version. 4 | rev: v0.5.2 5 | hooks: 6 | # Run the linter. 7 | - id: ruff 8 | types_or: [ python, pyi, jupyter ] 9 | args: [ --fix ] 10 | # Run the formatter. 11 | - id: ruff-format 12 | types_or: [ python, pyi, jupyter ] 13 | -------------------------------------------------------------------------------- /CODE-OF-CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 
39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team. All complaints will be reviewed 59 | and investigated and will result in a response that is deemed necessary and 60 | appropriate to the circumstances. The project team is obligated to maintain 61 | confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing to PROJECT 2 | 3 | Hi there! 4 | We’re thrilled that you’d like to contribute to this project. 5 | Your help is essential for keeping this project great and for making it better. 6 | 7 | ## Branching Strategy 8 | 9 | In general, contributors should develop on branches based off of `main` and pull requests should be made against `main`. 10 | 11 | ## Submitting a pull request 12 | 13 | 1. Please read our [code of conduct](CODE-OF-CONDUCT.md) and [license](LICENSE). 14 | 1. Fork and clone the repository. 15 | 1. Create a new branch based on `main`: `git checkout -b main`. 16 | 1. Make your changes, add tests, and make sure the tests still pass. 17 | 1. Commit your changes using the [DCO](http://developercertificate.org/). You can attest to the DCO by commiting with the **-s** or **--signoff** options or manually adding the "Signed-off-by". 18 | 1. Push to your fork and submit a pull request from your branch to `main`. 19 | 1. Pat yourself on the back and wait for your pull request to be reviewed. 20 | 21 | Here are a few things you can do that will increase the likelihood of your pull request to be accepted: 22 | 23 | - Follow the existing style where possible. 24 | - Write tests. 25 | - Keep your change as focused as possible. 26 | If you want to make multiple independent changes, please consider submitting them as separate pull requests. 27 | - Write a [good commit message](http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html). 
28 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use Ubuntu 20.04 as the base image 2 | # Create a temp image that has build tools that we can use to build wheel 3 | # files for dependencies only available as source. 4 | FROM docker-registry.qualcomm.com/library/ubuntu:20.04 5 | 6 | # Update the package lists and install required packages 7 | RUN apt-get update && apt-get install -y \ 8 | git \ 9 | tmux \ 10 | python3.10 \ 11 | python3.10-venv \ 12 | python3-pip 13 | 14 | # pip recognizes this variable 15 | ENV PIP_CACHE_DIR=/var/cache/pip 16 | WORKDIR /app 17 | 18 | # Sample command to register and clone the repository 19 | # Clone the GitHub repository 20 | RUN git config --global user.email none@none.com && \ 21 | git config --global user.name none 22 | 23 | RUN mkdir -p /app/qefficient-library 24 | COPY . /app/qefficient-library 25 | 26 | # Create Virtual Env for the docker image 27 | RUN python3.10 -m venv /app/llm_env 28 | RUN . /app/llm_env/bin/activate 29 | WORKDIR /app/qefficient-library 30 | 31 | # Install the required Python packages 32 | 33 | RUN pip install torch==2.0.0+cpu --extra-index-url https://download.pytorch.org/whl/cpu --no-deps 34 | RUN pip install datasets==2.17.0 fsspec==2023.10.0 multidict==6.0.5 sentencepiece --no-deps 35 | 36 | RUN python3.10 -m pip install . 37 | WORKDIR /app/qefficient-library 38 | 39 | # Set the environment variable for the model card name and token ID 40 | ENV HF_HOME = "/app/qefficient-library/docs" 41 | ENV MODEL_NAME = "" 42 | ENV CACHE_DIR = "" 43 | ENV TOKEN_ID = "" 44 | 45 | # Print a success message 46 | CMD ["echo", "qefficient-transformers repository cloned and setup installed inside Docker image."] 47 | CMD ["echo", "Starting the Model Download and Export to Onnx Stage for QEff."] 48 | CMD python3.10 -m QEfficient.cloud.export --model-name "$MODEL_NAME" 49 | 50 | # Example usage: 51 | # docker build -t qefficient-library . 52 | 53 | # Minimum System Requirements Before running docker containers: 54 | # 1. Clear the tmp space. 55 | # 2. For smaller models, 32GiB RAM is sufficient, but larger LLMs we require good CPU/RAM (Context 7B model would require atleast 64GiB). 56 | # 3. The exact minimum system configuration are tough to decide, since its all function of model parameters. 57 | 58 | # docker run -e MODEL_NAME=gpt2 -e TOKEN_ID= qefficient-library -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following 11 | disclaimer in the documentation and/or other materials provided 12 | with the distribution. 13 | 14 | * Neither the name of Qualcomm Technologies, Inc. nor the names of its 15 | contributors may be used to endorse or promote products derived 16 | from this software without specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | SPDX-License-Identifier: BSD-3-Clause 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | -------------------------------------------------------------------------------- /QEfficient/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import os 9 | 10 | # For faster downloads via hf_transfer 11 | # This code is put above import statements as this needs to be executed before 12 | # hf_transfer is imported (will happen on line 15 via leading imports) 13 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" 14 | 15 | # Placeholder for all non-transformer models registered in QEfficient 16 | import QEfficient.utils.model_registery # noqa: F401 17 | from QEfficient.utils.logging_utils import logger 18 | 19 | 20 | def check_qaic_sdk(): 21 | """Check if QAIC SDK is installed""" 22 | try: 23 | import platform 24 | import sys 25 | 26 | sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}") 27 | import qaicrt # noqa: F401 28 | 29 | return True 30 | except ImportError: 31 | return False 32 | 33 | 34 | # Conditionally import QAIC-related modules if the SDK is installed 35 | __version__ = "0.0.1.dev0" 36 | 37 | if check_qaic_sdk(): 38 | from QEfficient.base import ( 39 | QEFFAutoModel, 40 | QEFFAutoModelForCausalLM, 41 | QEFFAutoModelForImageTextToText, 42 | QEFFAutoModelForSpeechSeq2Seq, 43 | QEFFCommonLoader, 44 | ) 45 | from QEfficient.compile.compile_helper import compile 46 | from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter 47 | from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv 48 | from QEfficient.peft import QEffAutoPeftModelForCausalLM 49 | from QEfficient.transformers.transform import transform 50 | 51 | # Users can use QEfficient.export for exporting models to ONNX 52 | export = qualcomm_efficient_converter 53 | 54 | __all__ = [ 55 | "transform", 56 | "export", 57 | "compile", 58 | "cloud_ai_100_exec_kv", 59 | "QEFFAutoModel", 60 | "QEFFAutoModelForCausalLM", 61 | "QEffAutoPeftModelForCausalLM", 62 | "QEFFAutoModelForImageTextToText", 63 | "QEFFAutoModelForSpeechSeq2Seq", 64 | "QEFFCommonLoader", 65 | ] 66 | 67 | else: 68 | logger.warning("QAIC SDK is not installed, eager mode features won't be available!") 
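Example (illustrative sketch): the snippet below strings together the API re-exported by this __init__ — QEFFAutoModelForCausalLM.from_pretrained, the model's export(), and QEfficient.compile — assuming the QAIC SDK is installed. The "gpt2" card mirrors the repository's own examples; the qpc_path and num_cores values are placeholder assumptions, and the keyword names follow the CLI flags defined in QEfficient/cloud/compile.py, so exact signatures may differ between releases.

import QEfficient
from QEfficient import QEFFAutoModelForCausalLM

# Load a HuggingFace model through the QEfficient wrapper.
qeff_model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")

# Export the PyTorch model to ONNX; returns the generated .onnx path.
onnx_path = qeff_model.export()

# Compile the exported ONNX graph into Cloud AI 100 qpc binaries.
QEfficient.compile(
    onnx_path=onnx_path,
    qpc_path="qeff_models/gpt2_qpc",  # assumed output directory for compiled binaries
    num_cores=14,                     # assumed core count; set per target device
    device_group=[0],                 # Cloud AI 100 device id(s)
)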
69 | -------------------------------------------------------------------------------- /QEfficient/base/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from QEfficient.base.common import QEFFCommonLoader # noqa: F401 9 | from QEfficient.transformers.models.modeling_auto import ( # noqa: F401 10 | QEFFAutoModel, 11 | QEFFAutoModelForCausalLM, 12 | QEFFAutoModelForImageTextToText, 13 | QEFFAutoModelForSpeechSeq2Seq, 14 | ) 15 | -------------------------------------------------------------------------------- /QEfficient/base/common.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | """ 9 | MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP dictionary defines the mapping between names of the varities of Transformer model defined in 10 | QEFF_MODEL_TYPE and the classes that implement the methods i.e.(compile, export etc.) for those types. 11 | 12 | QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model. 13 | """ 14 | 15 | import os 16 | from typing import Any 17 | 18 | from transformers import AutoConfig 19 | 20 | from QEfficient.base.modeling_qeff import QEFFBaseModel 21 | from QEfficient.transformers.modeling_utils import MODEL_CLASS_MAPPING 22 | from QEfficient.utils import login_and_download_hf_lm 23 | 24 | 25 | class QEFFCommonLoader: 26 | """ 27 | Provides HuggingFace model loading interface same as transformers APIs. 28 | Supports loading any model on HuggingFace. 29 | Wrapper on top of Auto Classes 30 | """ 31 | 32 | def __init__(self, *args: Any, **kwds: Any) -> None: 33 | raise EnvironmentError( 34 | f"{self.__class__.__name__} is designed to be instantiated " 35 | f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)`" 36 | ) 37 | 38 | @classmethod 39 | def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> QEFFBaseModel: 40 | """ 41 | Downloads HuggingFace model if already doesn't exist locally, returns QEFFAutoModel object based on type of model. 42 | """ 43 | config = AutoConfig.from_pretrained(pretrained_model_name_or_path) 44 | 45 | class_name = MODEL_CLASS_MAPPING.get(config.__class__.__name__, None) 46 | if class_name: 47 | module = __import__("QEfficient.transformers.models.modeling_auto") 48 | model_class = getattr(module, class_name) 49 | else: 50 | raise NotImplementedError( 51 | f"Unknown architecture={config.__class__.__name__}, either use specific auto model class for loading the model or raise an issue for support!" 
52 | ) 53 | 54 | local_model_dir = kwargs.pop("local_model_dir", None) 55 | if not os.path.isdir(pretrained_model_name_or_path) and local_model_dir is None: 56 | pretrained_model_name_or_path = login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs) 57 | hf_token = kwargs.pop("hf_token", None) 58 | continuous_batching = True if kwargs.pop("full_batch_size", None) else False 59 | 60 | qeff_model = model_class.from_pretrained( 61 | pretrained_model_name_or_path=(local_model_dir if local_model_dir else pretrained_model_name_or_path), 62 | token=hf_token, 63 | continuous_batching=continuous_batching, 64 | **kwargs, 65 | ) 66 | return qeff_model 67 | -------------------------------------------------------------------------------- /QEfficient/base/onnx_transforms.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | from typing import Optional, Tuple 9 | 10 | import numpy as np 11 | from onnx import ModelProto, external_data_helper, numpy_helper 12 | 13 | 14 | class OnnxTransform: 15 | """ 16 | OnnxTransform is the base class for graph modifications on exported onnx. 17 | """ 18 | 19 | def __init__(self): 20 | raise TypeError("Transform classes are not to be instantiated. Directly use the `apply` method.") 21 | 22 | @classmethod 23 | def apply(cls, model: ModelProto, **kwargs) -> Tuple[ModelProto, bool]: 24 | """ 25 | Override this class to apply a transformation. 26 | :param model: The model's ONNX graph to transform 27 | :param kwargs: Parameters needed for specific transforms. All transforms should take **kwargs to ignore unneeded kwargs. 28 | 29 | :returns: ONNX graph after applying the transform 30 | :returns: Boolean indicating whether transform was applied 31 | """ 32 | raise NotImplementedError("Use subclasses for ONNX transform") 33 | 34 | 35 | class FP16ClipTransform(OnnxTransform): 36 | """ 37 | Clips the tensor values to be in FP16 range. 38 | """ 39 | 40 | @classmethod 41 | def apply(cls, model: ModelProto, *, onnx_base_dir: Optional[str] = None, **kwargs) -> Tuple[ModelProto, bool]: 42 | """ 43 | :param onnx_base_dir: Base directory to load tensors (if not already loaded). 44 | """ 45 | finfo = np.finfo(np.float16) 46 | fp16_max = finfo.max 47 | fp16_min = finfo.min 48 | transformed = False 49 | for tensor in external_data_helper._get_all_tensors(model): 50 | nptensor = numpy_helper.to_array(tensor, onnx_base_dir) 51 | if nptensor.dtype == np.float32 and (np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min)): 52 | nptensor = np.clip(nptensor, fp16_min, fp16_max) 53 | new_tensor = numpy_helper.from_array(nptensor, tensor.name) 54 | tensor.CopyFrom(new_tensor) 55 | transformed = True 56 | return model, transformed 57 | 58 | 59 | class SplitTensorsTransform(OnnxTransform): 60 | """ 61 | Split external tensors file 62 | """ 63 | 64 | @classmethod 65 | def apply( 66 | cls, 67 | model: ModelProto, 68 | *, 69 | model_name: str, 70 | onnx_base_dir: Optional[str] = None, 71 | file_chunk_size: int = 10 * 2**30, # 10 GiB 72 | size_threshold: int = 1024, 73 | **kwargs, 74 | ) -> Tuple[ModelProto, bool]: 75 | """ 76 | :param model_name: Used for naming external files. i.e. 
{model_name}_0.onnx.data 77 | :param onnx_base_dir: Base directory to load tensors (if not already loaded). 78 | :param file_chunk_size: Chunk size to split external files into. 79 | :param size_threshold: Only tensors greater than this threshold (in bytes) will be saved externally. 80 | """ 81 | file_num = 0 82 | current_file_size = 0 83 | transformed = False 84 | external_data_helper.load_external_data_for_model(model, onnx_base_dir) 85 | for tensor in external_data_helper._get_all_tensors(model): 86 | if tensor.HasField("raw_data") and ((tsize := len(tensor.raw_data)) > size_threshold): 87 | transformed = True 88 | current_file_size += tsize 89 | if current_file_size > file_chunk_size: 90 | file_num += 1 91 | current_file_size = tsize 92 | external_data_helper.set_external_data(tensor, f"{model_name}_{file_num}.onnx.data") 93 | return model, transformed 94 | -------------------------------------------------------------------------------- /QEfficient/cloud/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/cloud/compile.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import argparse 9 | 10 | import QEfficient 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser(description="Compilation script.") 14 | parser.add_argument("--onnx_path", "--onnx-path", required=True, help="Onnx Model Path") 15 | parser.add_argument( 16 | "--qpc-path", 17 | "--qpc_path", 18 | required=True, 19 | help="Compiled qpc binaries will be stored under this folder", 20 | ) 21 | parser.add_argument("--batch_size", "--batch-size", type=int, default=1, help="Batch size for text generation") 22 | parser.add_argument( 23 | "--prompt_len", 24 | "--prompt-len", 25 | default=32, 26 | type=int, 27 | help="Sequence length for text generation.", 28 | ) 29 | parser.add_argument("--ctx_len", "--ctx-len", default=128, type=int, help="Context length for text generation.") 30 | parser.add_argument( 31 | "--mxfp6", 32 | action="store_true", 33 | help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression", 34 | ) 35 | parser.add_argument( 36 | "--mxint8", 37 | action="store_true", 38 | help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False", 39 | ) 40 | parser.add_argument( 41 | "--num_cores", 42 | "--num-cores", 43 | required=True, 44 | type=int, 45 | help="num cores to compile the model on", 46 | ) 47 | parser.add_argument( 48 | "--custom_io_file_path", 49 | "--custom-io-file-path", 50 | type=str, 51 | help="Path to custom IO file", 52 | ) 53 | parser.add_argument( 54 | "--device_group", 55 | "--device-group", 56 | required=True, 57 | type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")], 58 | help="Cloud AI 100 device ids (comma-separated) e.g. 
[0,1] ", 59 | ) 60 | parser.add_argument( 61 | "--aic_enable_depth_first", 62 | "--aic-enable-depth-first", 63 | action="store_true", 64 | help="If passed, this option will be enabled during compilation, disabled by default", 65 | ) 66 | parser.add_argument( 67 | "--mos", 68 | type=int, 69 | default=-1, 70 | help=" Effort level to reduce the on-chip memory", 71 | ) 72 | parser.add_argument( 73 | "--full_batch_size", 74 | "--full-batch-size", 75 | type=int, 76 | default=None, 77 | help="Set full batch size to enable continuous batching mode, default is None", 78 | ) 79 | parser.add_argument( 80 | "--allow-mxint8-mdp-io", 81 | "--allow_mxint8_mdp_io", 82 | action="store_true", 83 | help="If passed, this option allows MXINT8 compression of MDP IO traffic", 84 | ) 85 | parser.add_argument( 86 | "--enable_qnn", 87 | "--enable-qnn", 88 | nargs="?", 89 | const=True, 90 | type=str, 91 | default=False, 92 | help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\ 93 | If not provided, the default configuration will be used.\ 94 | Sample Config: QEfficient/compile/qnn_config.json", 95 | ) 96 | 97 | args, compiler_options = parser.parse_known_args() 98 | 99 | if isinstance(args.enable_qnn, str): 100 | args.qnn_config = args.enable_qnn 101 | args.enable_qnn = True 102 | 103 | compiler_options_dict = {} 104 | for i in range(0, len(compiler_options)): 105 | if compiler_options[i].startswith("--"): 106 | key = compiler_options[i].lstrip("-").replace("-", "_") 107 | value = ( 108 | compiler_options[i + 1] 109 | if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-") 110 | else True 111 | ) 112 | compiler_options_dict[key] = value 113 | QEfficient.compile(**args.__dict__, **compiler_options_dict) 114 | -------------------------------------------------------------------------------- /QEfficient/cloud/export.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import argparse 9 | import os 10 | from typing import Optional 11 | 12 | from QEfficient.base.common import QEFFCommonLoader 13 | from QEfficient.utils import check_and_assign_cache_dir 14 | from QEfficient.utils.logging_utils import logger 15 | 16 | # Specifically for Docker images. 17 | ROOT_DIR = os.path.dirname(os.path.abspath("")) 18 | 19 | 20 | def get_onnx_model_path( 21 | model_name: str, 22 | cache_dir: Optional[str] = None, 23 | hf_token: Optional[str] = None, 24 | full_batch_size: Optional[int] = None, 25 | local_model_dir: Optional[str] = None, 26 | ): 27 | """ 28 | exports the model to onnx if pre-exported file is not found and returns onnx_model_path 29 | 30 | ``Mandatory`` Args: 31 | :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. 32 | ``Optional`` Args: 33 | :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` 34 | :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Pass model tokenizer. ``Defaults to None.`` 35 | :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` 36 | :local_model_dir (str): Path to custom model weights and config files. 
``Defaults to None.`` 37 | :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` 38 | """ 39 | logger.info(f"Exporting Pytorch {model_name} model to ONNX...") 40 | 41 | qeff_model = QEFFCommonLoader.from_pretrained( 42 | pretrained_model_name_or_path=model_name, 43 | cache_dir=cache_dir, 44 | hf_token=hf_token, 45 | full_batch_size=full_batch_size, 46 | local_model_dir=local_model_dir, 47 | ) 48 | onnx_model_path = qeff_model.export() 49 | logger.info(f"Generated onnx_path: {onnx_model_path}") 50 | return onnx_model_path 51 | 52 | 53 | def main( 54 | model_name: str, 55 | cache_dir: Optional[str] = None, 56 | hf_token: Optional[str] = None, 57 | local_model_dir: Optional[str] = None, 58 | full_batch_size: Optional[int] = None, 59 | ) -> None: 60 | """ 61 | Helper function used by export CLI app for exporting to ONNX Model. 62 | 63 | ``Mandatory`` Args: 64 | :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. 65 | 66 | ``Optional`` Args: 67 | :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` 68 | :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` 69 | :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` 70 | :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` 71 | 72 | .. code-block:: bash 73 | 74 | python -m QEfficient.cloud.export OPTIONS 75 | 76 | """ 77 | cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir) 78 | get_onnx_model_path( 79 | model_name=model_name, 80 | cache_dir=cache_dir, 81 | hf_token=hf_token, 82 | full_batch_size=full_batch_size, 83 | local_model_dir=local_model_dir, 84 | ) 85 | 86 | 87 | if __name__ == "__main__": 88 | parser = argparse.ArgumentParser(description="Export script.") 89 | parser.add_argument("--model_name", "--model-name", required=True, help="HF Model card name/id") 90 | parser.add_argument( 91 | "--local-model-dir", "--local_model_dir", required=False, help="Path to custom model weights and config files" 92 | ) 93 | parser.add_argument( 94 | "--cache_dir", 95 | "--cache-dir", 96 | required=False, 97 | help="Cache_dir to store the HF files", 98 | ) 99 | parser.add_argument( 100 | "--hf-token", "--hf_token", default=None, type=str, required=False, help="HF token id for private HF models" 101 | ) 102 | parser.add_argument( 103 | "--full_batch_size", 104 | "--full-batch-size", 105 | type=int, 106 | default=None, 107 | help="Set full batch size to enable continuous batching mode, default is None", 108 | ) 109 | args = parser.parse_args() 110 | main(**args.__dict__) 111 | -------------------------------------------------------------------------------- /QEfficient/compile/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/compile/qnn_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "converter_args_extension": "", 3 | "context_binary_generator_args_extension": "--log_level debug", 4 | "qnn_compilation_backend": 5 | { 6 | "compiler_enable_depth_first": true, 7 | "compiler_printDDRStats": false, 8 | "compiler_printPerfMetrics": false, 9 | "compiler_stat_level": 10 10 | }, 11 | "SKIP_QNN_CONVERTER_STEP": false 12 | } -------------------------------------------------------------------------------- /QEfficient/customop/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from QEfficient.customop.ctx_scatter_gather import CtxGatherFunc, CtxGatherFunc3D, CtxScatterFunc, CtxScatterFunc3D 9 | from QEfficient.customop.ctx_scatter_gather_cb import ( 10 | CtxGatherFuncCB, 11 | CtxGatherFuncCB3D, 12 | CtxScatterFuncCB, 13 | CtxScatterFuncCB3D, 14 | ) 15 | from QEfficient.customop.rms_norm import CustomRMSNormAIC, GemmaCustomRMSNormAIC 16 | 17 | __all__ = [ 18 | "CtxGatherFunc", 19 | "CtxScatterFunc", 20 | "CtxGatherFunc3D", 21 | "CtxScatterFunc3D", 22 | "CustomRMSNormAIC", 23 | "GemmaCustomRMSNormAIC", 24 | "CtxGatherFuncCB", 25 | "CtxScatterFuncCB", 26 | "CtxGatherFuncCB3D", 27 | "CtxScatterFuncCB3D", 28 | ] 29 | -------------------------------------------------------------------------------- /QEfficient/customop/rms_norm.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import onnxscript 9 | import torch 10 | from torch import nn 11 | 12 | from QEfficient.utils import constants 13 | 14 | ops = getattr(onnxscript, "opset" + str(constants.ONNX_EXPORT_OPSET)) 15 | 16 | 17 | @onnxscript.script(onnxscript.values.Opset(domain="com.qti.aisw.onnx", version=1)) 18 | def CustomRMSNorm(hidden_states: onnxscript.FLOAT, weight: onnxscript.FLOAT, epsilon: float): 19 | weight = ops.Cast(weight, to=1) 20 | variance = ops.ReduceMean(ops.Pow(hidden_states, 2), axes=[-1], keepdims=1) 21 | epsilon = ops.Expand(epsilon, ops.Shape(variance)) 22 | hidden_states = hidden_states * ops.Reciprocal(ops.Sqrt(variance + epsilon)) 23 | return weight * hidden_states 24 | 25 | 26 | class CustomRMSNormFunc(torch.autograd.Function): 27 | @staticmethod 28 | def forward(hidden_states: torch.Tensor, weight: torch.Tensor, epsilon: float): 29 | variance = hidden_states.pow(2).mean(-1, keepdim=True) 30 | hidden_states = hidden_states * torch.rsqrt(variance + epsilon) 31 | return weight * hidden_states 32 | 33 | @staticmethod 34 | def setup_context(ctx, inputs, outputs): 35 | pass 36 | 37 | @staticmethod 38 | def symbolic(g: torch.Graph, hidden_states: torch.Value, weight: torch.Value, epsilon: torch.Value) -> torch.Value: 39 | return g.onnxscript_op(CustomRMSNorm, hidden_states, weight, epsilon_f=epsilon).setTypeAs(hidden_states) 40 | 41 | 42 | class CustomRMSNormAIC(nn.Module): 43 | """ 44 | RMSNorm module that works by replacing the current module with compiler known custom-op. 45 | """ 46 | 47 | def __init__(self, hidden_size, eps=1e-05): 48 | super(CustomRMSNormAIC, self).__init__() 49 | self.variance_epsilon = eps 50 | self.eps = eps # Added to support GemmaRMSNorm 51 | self.weight = torch.nn.Parameter(torch.ones(hidden_size)) 52 | 53 | def forward(self, hidden_states): 54 | return CustomRMSNormFunc.apply( 55 | hidden_states, self.weight, self.variance_epsilon if hasattr(self, "variance_epsilon") else self.eps 56 | ) 57 | 58 | 59 | class GemmaCustomRMSNormAIC(CustomRMSNormAIC): 60 | """ 61 | Modify the init function to add +1 to the weights 62 | """ 63 | 64 | def __qeff_init__(self): 65 | with torch.no_grad(): 66 | self.weight.copy_(self.weight + 1.0) 67 | -------------------------------------------------------------------------------- /QEfficient/exporter/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/finetune/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | 9 | -------------------------------------------------------------------------------- /QEfficient/finetune/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | 9 | -------------------------------------------------------------------------------- /QEfficient/finetune/configs/dataset_config.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from dataclasses import dataclass 9 | 10 | 11 | @dataclass 12 | class samsum_dataset: 13 | dataset: str = "samsum_dataset" 14 | train_split: str = "train" 15 | test_split: str = "validation" 16 | 17 | 18 | @dataclass 19 | class grammar_dataset: 20 | dataset: str = "grammar_dataset" 21 | train_split: str = "train" 22 | test_split: str = "validation" 23 | 24 | 25 | @dataclass 26 | class alpaca_dataset: 27 | dataset: str = "alpaca_dataset" 28 | train_split: str = "train" 29 | test_split: str = "val" 30 | data_path: str = "dataset/alpaca_data.json" 31 | 32 | 33 | @dataclass 34 | class gsm8k_dataset: 35 | dataset: str = "gsm8k_dataset" 36 | train_split: str = "train" 37 | test_split: str = "test" 38 | 39 | 40 | @dataclass 41 | class imdb_dataset: 42 | dataset: str = "imdb_dataset" 43 | train_split: str = "train" 44 | test_split: str = "test" 45 | num_labels: int = 2 46 | 47 | 48 | @dataclass 49 | class custom_dataset: 50 | dataset: str = "custom_dataset" 51 | file: str = "dataset/custom_dataset.py" 52 | train_split: str = "train" 53 | test_split: str = "validation" 54 | data_path: str = "" 55 | -------------------------------------------------------------------------------- /QEfficient/finetune/configs/peft_config.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from dataclasses import dataclass, field 9 | from typing import List 10 | 11 | 12 | @dataclass 13 | class LoraConfig: 14 | """LoRA-specific configuration for parameter-efficient fine-tuning. 15 | 16 | Attributes: 17 | r (int): LoRA rank (default: 8). 18 | lora_alpha (int): LoRA scaling factor (default: 32). 19 | target_modules (List[str]): Modules to apply LoRA to (default: ["q_proj", "v_proj"]). 20 | bias (str): Bias handling in LoRA (default: "none"). 21 | task_type (str): Task type for LoRA (default: "CAUSAL_LM"). 22 | lora_dropout (float): Dropout rate for LoRA (default: 0.0). 23 | inference_mode (bool): Whether model is in inference mode (default: False). 
24 | """ 25 | 26 | r: int = 8 27 | lora_alpha: int = 32 28 | target_modules: List[str] = field(default_factory=lambda: ["q_proj", "v_proj"]) 29 | bias: str = "none" 30 | task_type: str = "CAUSAL_LM" 31 | lora_dropout: float = 0.05 32 | inference_mode: bool = False # should be False for finetuning 33 | 34 | 35 | # CAUTION prefix tuning is currently not supported 36 | @dataclass 37 | class PrefixConfig: 38 | num_virtual_tokens: int = 30 39 | task_type: str = "CAUSAL_LM" 40 | -------------------------------------------------------------------------------- /QEfficient/finetune/data/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/finetune/data/sampler.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import random 9 | from itertools import islice 10 | 11 | import numpy as np 12 | import torch 13 | 14 | 15 | class LengthBasedBatchSampler(torch.utils.data.BatchSampler): 16 | def __init__(self, data_source, batch_size: int, drop_last: bool, shuffle: bool = True) -> None: 17 | if isinstance(next(iter(data_source)), dict): 18 | first_key = next(iter(next(iter(data_source)).keys())) 19 | self.lengths = [len(d[first_key]) for d in data_source] 20 | else: 21 | self.lengths = [len(d) for d in data_source] 22 | self.batch_size = batch_size 23 | self.drop_last = drop_last 24 | self.shuffle = shuffle 25 | 26 | def __iter__(self): 27 | ids = np.argsort(self.lengths, kind="mergesort") 28 | if self.drop_last: 29 | ids = ids[: len(ids) // self.batch_size * self.batch_size] 30 | 31 | batches = [ids[i : i + self.batch_size] for i in range(0, len(ids), self.batch_size)] 32 | 33 | if self.shuffle: 34 | random.shuffle(batches) 35 | 36 | for b in batches: 37 | yield b 38 | 39 | def __len__(self): 40 | if self.drop_last: 41 | return len(self.lengths) // self.batch_size 42 | else: 43 | return len(self.lengths) // self.batch_size + (len(self.lengths) % self.batch_size > 0) 44 | 45 | 46 | class DistributedLengthBasedBatchSampler(torch.utils.data.BatchSampler): 47 | def __init__( 48 | self, data_source, batch_size: int, num_replicas: int, rank: int, shuffle: bool = True, seed: int = 0 49 | ) -> None: 50 | random.seed(seed) 51 | self.batch_sampler = LengthBasedBatchSampler( 52 | data_source, batch_size=batch_size, drop_last=True, shuffle=shuffle 53 | ) 54 | self.num_replicas = num_replicas 55 | self.rank = rank 56 | 57 | def __iter__(self): 58 | max_length = len(self.batch_sampler) // self.num_replicas * self.num_replicas 59 | return islice(self.batch_sampler, self.rank, max_length, self.num_replicas) 60 | 61 | def __len__(self): 62 | return len(self.batch_sampler) // self.num_replicas 63 | -------------------------------------------------------------------------------- /QEfficient/finetune/dataset/__init__.py: 
-------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/finetune/dataset/alpaca_dataset.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import copy 9 | import json 10 | 11 | import torch 12 | from torch.utils.data import Dataset 13 | 14 | PROMPT_DICT = { 15 | "prompt_input": ( 16 | "Below is an instruction that describes a task, paired with an input that provides further context. " 17 | "Write a response that appropriately completes the request.\n\n" 18 | "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" 19 | ), 20 | "prompt_no_input": ( 21 | "Below is an instruction that describes a task. " 22 | "Write a response that appropriately completes the request.\n\n" 23 | "### Instruction:\n{instruction}\n\n### Response:" 24 | ), 25 | } 26 | 27 | 28 | class InstructionDataset(Dataset): 29 | def __init__(self, dataset_config, tokenizer, partition="train", context_length=None): 30 | self.ann = json.load(open(dataset_config.data_path)) 31 | # Use 5% of the dataset for evaluation 32 | eval_length = int(len(self.ann) / 20) 33 | if partition == "train": 34 | self.ann = self.ann[eval_length:] 35 | else: 36 | self.ann = self.ann[:eval_length] 37 | 38 | self.tokenizer = tokenizer 39 | self.context_length = context_length 40 | 41 | def __len__(self): 42 | return len(self.ann) 43 | 44 | def __getitem__(self, index): 45 | IGNORE_INDEX = -100 # The default setting 46 | 47 | ann = self.ann[index] 48 | if ann.get("input", "") == "": 49 | prompt = PROMPT_DICT["prompt_no_input"].format_map(ann) 50 | else: 51 | prompt = PROMPT_DICT["prompt_input"].format_map(ann) 52 | example = prompt + ann["output"] 53 | prompt = torch.tensor( 54 | self.tokenizer.encode(prompt, max_length=self.context_length, pad_to_max_length=True), dtype=torch.int64 55 | ) 56 | example = self.tokenizer.encode(example, max_length=self.context_length, pad_to_max_length=True) 57 | example.append(self.tokenizer.eos_token_id) 58 | example = torch.tensor(example, dtype=torch.int64) 59 | labels = copy.deepcopy(example) 60 | labels[: len(prompt)] = -1 61 | example_mask = example.ge(0) 62 | label_mask = labels.ge(0) 63 | example[~example_mask] = 0 64 | labels[~label_mask] = IGNORE_INDEX 65 | 66 | return { 67 | "input_ids": example.tolist(), 68 | "labels": labels.tolist(), 69 | "attention_mask": example_mask.tolist(), 70 | } 71 | -------------------------------------------------------------------------------- /QEfficient/finetune/dataset/custom_dataset.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import importlib 9 | from pathlib import Path 10 | 11 | 12 | def load_module_from_py_file(py_file: str) -> object: 13 | """ 14 | This method loads a module from a py file which is not in the Python path 15 | """ 16 | module_name = Path(py_file).name 17 | loader = importlib.machinery.SourceFileLoader(module_name, py_file) 18 | spec = importlib.util.spec_from_loader(module_name, loader) 19 | module = importlib.util.module_from_spec(spec) 20 | 21 | loader.exec_module(module) 22 | 23 | return module 24 | 25 | 26 | def get_custom_dataset(dataset_config, tokenizer, split: str): 27 | if ":" in dataset_config.file: 28 | module_path, func_name = dataset_config.file.split(":") 29 | else: 30 | module_path, func_name = dataset_config.file, "get_custom_dataset" 31 | 32 | if not module_path.endswith(".py"): 33 | raise ValueError(f"Dataset file {module_path} is not a .py file.") 34 | 35 | module_path = Path(module_path) 36 | if not module_path.is_file(): 37 | raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") 38 | 39 | module = load_module_from_py_file(module_path.as_posix()) 40 | try: 41 | return getattr(module, func_name)(dataset_config, tokenizer, split) 42 | except AttributeError as e: 43 | print( 44 | f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})." 45 | ) 46 | raise e 47 | 48 | 49 | def get_data_collator(dataset_processer, dataset_config): 50 | if ":" in dataset_config.file: 51 | module_path, func_name = dataset_config.file.split(":") 52 | else: 53 | module_path, func_name = dataset_config.file, "get_data_collator" 54 | 55 | if not module_path.endswith(".py"): 56 | raise ValueError(f"Dataset file {module_path} is not a .py file.") 57 | 58 | module_path = Path(module_path) 59 | if not module_path.is_file(): 60 | raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") 61 | 62 | module = load_module_from_py_file(module_path.as_posix()) 63 | try: 64 | return getattr(module, func_name)(dataset_processer) 65 | except AttributeError: 66 | print(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).") 67 | print("Using the default data_collator instead.") 68 | return None 69 | -------------------------------------------------------------------------------- /QEfficient/finetune/dataset/dataset_config.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from functools import partial 9 | 10 | from QEfficient.finetune.dataset.alpaca_dataset import ( 11 | InstructionDataset as get_alpaca_dataset, 12 | ) 13 | from QEfficient.finetune.dataset.custom_dataset import ( 14 | get_custom_dataset, 15 | get_data_collator, 16 | ) 17 | from QEfficient.finetune.dataset.grammar_dataset import ( 18 | get_dataset as get_grammar_dataset, 19 | ) 20 | from QEfficient.finetune.dataset.gsm8k_dataset import get_gsm8k_dataset 21 | from QEfficient.finetune.dataset.imdb_dataset import ( 22 | get_preprocessed_imdb as get_imdb_dataset, 23 | ) 24 | from QEfficient.finetune.dataset.samsum_dataset import ( 25 | get_preprocessed_samsum as get_samsum_dataset, 26 | ) 27 | 28 | DATASET_PREPROC = { 29 | "alpaca_dataset": partial(get_alpaca_dataset), 30 | "grammar_dataset": get_grammar_dataset, 31 | "samsum_dataset": get_samsum_dataset, 32 | "gsm8k_dataset": get_gsm8k_dataset, 33 | "custom_dataset": get_custom_dataset, 34 | "imdb_dataset": get_imdb_dataset, 35 | } 36 | DATALOADER_COLLATE_FUNC = { 37 | "custom_dataset": get_data_collator, 38 | } 39 | -------------------------------------------------------------------------------- /QEfficient/finetune/dataset/grammar_dataset.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from pathlib import Path 9 | 10 | from datasets import load_dataset 11 | from torch.utils.data import Dataset 12 | 13 | 14 | class grammar(Dataset): 15 | def __init__(self, tokenizer, csv_name=None, context_length=None): 16 | try: 17 | self.dataset = load_dataset( 18 | "csv", 19 | data_files={"train": [csv_name]}, # "eval": "grammar_validation.csv"}, 20 | delimiter=",", 21 | ) 22 | except Exception as e: 23 | print( 24 | "Loading of grammar dataset failed! Please see [here](https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset." 
25 | )
26 | raise e
27 | 
28 | self.context_length = context_length
29 | self.tokenizer = tokenizer
30 | self.print_text = False # print_text
31 | 
32 | def __len__(self):
33 | return self.dataset["train"].shape[0]
34 | 
35 | def convert_to_features(self, example_batch):
36 | # Create prompt and tokenize contexts and questions
37 | 
38 | if self.print_text:
39 | print("Input Text: ", self.clean_text(example_batch["text"]))
40 | 
41 | input_ = example_batch["input"]
42 | target_ = example_batch["target"]
43 | 
44 | prompt = f"Correct this to standard English: {input_}\n---\nCorrected: "
45 | prompt_ids = self.tokenizer.encode(
46 | self.tokenizer.bos_token + prompt,
47 | add_special_tokens=False,
48 | max_length=self.context_length,
49 | pad_to_max_length=True,
50 | )
51 | label_ids = self.tokenizer.encode(
52 | target_ + self.tokenizer.eos_token,
53 | add_special_tokens=False,
54 | max_length=self.context_length,
55 | pad_to_max_length=True,
56 | )
57 | 
58 | sample = {
59 | "input_ids": prompt_ids + label_ids,
60 | "attention_mask": [1] * len(prompt_ids + label_ids),
61 | "labels": [-100] * len(prompt_ids) + label_ids,
62 | }
63 | 
64 | return sample
65 | 
66 | def __getitem__(self, index):
67 | return self.convert_to_features(self.dataset["train"][int(index)])
68 | 
69 | 
70 | def get_dataset(dataset_config, tokenizer, csv_name=None, context_length=None):
71 | """Wrapper function that handles loading of the grammar dataset from the expected local CSV path."""
72 | 
73 | currPath = Path.cwd() / "datasets_grammar" / "grammar_train.csv"
74 | print(f"Loading dataset {currPath}")
75 | csv_name = str(currPath)
76 | print(csv_name)
77 | dataset = grammar(tokenizer=tokenizer, csv_name=csv_name, context_length=context_length)
78 | 
79 | return dataset
80 | 
-------------------------------------------------------------------------------- /QEfficient/finetune/dataset/gsm8k_dataset.py: --------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | #
3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | #
6 | # -----------------------------------------------------------------------------
7 | 
8 | from typing import Dict
9 | 
10 | from datasets import Dataset, load_dataset
11 | 
12 | default_instruction = """### Instruction: Solve the math question using a basic calculator.
13 | Calculator can be invoked using the format: <<expression=answer>>.
14 | "expression" can be one of the 4 arithmetic operations, and "answer" will be filled in for you.
15 | Example: <<20+30=50>>
16 | 
17 | ### Question: {question}
18 | 
19 | ### Answer: """
20 | 
21 | 
22 | def tokenize_and_mask(row: Dict[str, str], *, tokenizer, instruction) -> Dict[str, list]:
23 | start_tokens = {tokenizer(x, add_special_tokens=False)["input_ids"][0] for x in ["<<", " <<"]}
24 | equal_tokens = {tokenizer(x, add_special_tokens=False)["input_ids"][0] for x in ["=", " ="]}
25 | end_tokens = {tokenizer(x, add_special_tokens=False)["input_ids"][0] for x in [">>"]}
26 | 
27 | input_str = tokenizer.bos_token + instruction.format(**row)
28 | ques_ids = tokenizer(input_str, add_special_tokens=False, return_attention_mask=False)["input_ids"]
29 | ans_ids = tokenizer(row["answer"] + tokenizer.eos_token, add_special_tokens=False, return_attention_mask=False)[
30 | "input_ids"
31 | ]
32 | input_ids = ques_ids + ans_ids
33 | 
34 | # State machine to recognize <<expression=answer>> and mask answer
35 | mode = 0
36 | for i, token in enumerate(ans_ids):
37 | if mode == 0 and token in start_tokens:
38 | mode = 1
39 | elif mode == 1 and token in equal_tokens:
40 | mode = 2
41 | elif mode == 2:
42 | ans_ids[i] = -100
43 | if token in end_tokens:
44 | mode = 0
45 | 
46 | labels = [-100] * len(ques_ids) + ans_ids
47 | 
48 | inputs = {"input_ids": input_ids, "labels": labels}
49 | return inputs
50 | 
51 | 
52 | def pad_to_max_length(row: Dict[str, list], *, tokenizer, max_length: int) -> Dict[str, list]:
53 | length = len(row["input_ids"])
54 | return {
55 | "input_ids": row["input_ids"] + [tokenizer.pad_token_id] * (max_length - length),
56 | "attention_mask": [1] * length + [0] * (max_length - length),
57 | "labels": row["labels"] + [-100] * (max_length - length),
58 | }
59 | 
60 | 
61 | def get_gsm8k_dataset(
62 | dataset_config,
63 | tokenizer,
64 | split,
65 | context_length=None,
66 | instruction: str = default_instruction,
67 | ) -> Dataset:
68 | ds = load_dataset("openai/gsm8k", "main", split=split)
69 | ds = ds.map(
70 | tokenize_and_mask,
71 | fn_kwargs={"tokenizer": tokenizer, "instruction": instruction},
72 | remove_columns=["question", "answer"],
73 | )
74 | 
75 | if context_length is not None:
76 | ds = ds.filter(lambda x: len(x["input_ids"]) <= context_length)
77 | ds = ds.map(
78 | pad_to_max_length,
79 | fn_kwargs={"tokenizer": tokenizer, "max_length": context_length},
80 | )
81 | 
82 | ds.set_format("torch")
83 | 
84 | return ds
85 | 
-------------------------------------------------------------------------------- /QEfficient/finetune/dataset/imdb_dataset.py: --------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | #
3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | #
6 | # -----------------------------------------------------------------------------
7 | 
8 | 
9 | from itertools import chain
10 | 
11 | import datasets
12 | 
13 | 
14 | def get_preprocessed_imdb(dataset_config, tokenizer, split, context_length=None):
15 | dataset = datasets.load_dataset("stanfordnlp/imdb", split=split, trust_remote_code=True)
16 | 
17 | if split == "test":
18 | # Test set contains 25000 samples. Not all are required.
19 | # 0-12499 are 0 labeled samples, 12500-24999 are 1 labeled samples.
20 | dataset = dataset.select(chain(range(0, 500), range(12500, 13000)))
21 | 
22 | # Need to shuffle dataset as all the 0 labeled data is organized first and then all the 1 labeled data.
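# For the test split, the select() above keeps 500 label-0 and 500 label-1 reviews so the evaluation subset stays balanced; the fixed seed keeps the shuffle reproducible across runs.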
23 | dataset = dataset.shuffle(seed=42) 24 | 25 | if tokenizer.pad_token is None: 26 | tokenizer.add_special_tokens({"pad_token": "[PAD]"}) 27 | 28 | def tokenize_add_label(sample): 29 | data = tokenizer( 30 | sample["text"], 31 | add_special_tokens=True, 32 | max_length=tokenizer.model_max_length, 33 | ) 34 | 35 | data["labels"] = [sample["label"]] 36 | return data 37 | 38 | dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset.features)) 39 | return dataset 40 | -------------------------------------------------------------------------------- /QEfficient/finetune/dataset/samsum_dataset.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import datasets 9 | 10 | 11 | def get_preprocessed_samsum(dataset_config, tokenizer, split, context_length=None): 12 | dataset = datasets.load_dataset("Samsung/samsum", split=split, trust_remote_code=True) 13 | 14 | prompt = "Summarize this dialog:\n{dialog}\n---\nSummary:\n" 15 | 16 | def apply_prompt_template(sample): 17 | return { 18 | "prompt": prompt.format(dialog=sample["dialogue"]), 19 | "summary": sample["summary"], 20 | } 21 | 22 | dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features)) 23 | 24 | def tokenize_add_label(sample): 25 | prompt = tokenizer.encode( 26 | tokenizer.bos_token + sample["prompt"], 27 | add_special_tokens=False, 28 | max_length=context_length, 29 | pad_to_max_length=True, 30 | ) 31 | summary = tokenizer.encode( 32 | sample["summary"] + tokenizer.eos_token, 33 | add_special_tokens=False, 34 | max_length=context_length, 35 | pad_to_max_length=True, 36 | ) 37 | 38 | sample = { 39 | "input_ids": prompt + summary, 40 | "attention_mask": [1] * (len(prompt) + len(summary)), 41 | "labels": [-100] * len(prompt) + summary, 42 | } 43 | 44 | return sample 45 | 46 | dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset.features)) 47 | 48 | return dataset 49 | -------------------------------------------------------------------------------- /QEfficient/finetune/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/finetune/utils/dataset_utils.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import torch 9 | 10 | # from QEfficient.finetune.data.concatenator import ConcatDataset 11 | from QEfficient.finetune.dataset.dataset_config import DATALOADER_COLLATE_FUNC, DATASET_PREPROC 12 | from QEfficient.finetune.utils.config_utils import get_dataloader_kwargs 13 | 14 | 15 | def get_preprocessed_dataset( 16 | tokenizer, dataset_config, split: str = "train", context_length: int = None 17 | ) -> torch.utils.data.Dataset: 18 | if dataset_config.dataset not in DATASET_PREPROC: 19 | raise NotImplementedError(f"{dataset_config.dataset} is not (yet) implemented") 20 | 21 | def get_split(): 22 | return dataset_config.train_split if split == "train" else dataset_config.test_split 23 | 24 | return DATASET_PREPROC[dataset_config.dataset](dataset_config, tokenizer, get_split(), context_length) 25 | 26 | 27 | def get_custom_data_collator(dataset_processer, dataset_config) -> torch.utils.data.Dataset: 28 | if dataset_config.dataset not in DATALOADER_COLLATE_FUNC: 29 | return None 30 | 31 | return DATALOADER_COLLATE_FUNC[dataset_config.dataset](dataset_processer, dataset_config) 32 | 33 | 34 | def get_dataloader(tokenizer, dataset_config, train_config, split: str = "train"): 35 | dataset = get_preprocessed_dataset(tokenizer, dataset_config, split) 36 | dl_kwargs = get_dataloader_kwargs(train_config, dataset, tokenizer, split) 37 | 38 | # if split == "train" and train_config.batching_strategy == "packing": 39 | # dataset = ConcatDataset(dataset, chunk_size=train_config.context_length) 40 | 41 | # Create data loader 42 | dataloader = torch.utils.data.DataLoader( 43 | dataset, 44 | num_workers=train_config.num_workers_dataloader, 45 | pin_memory=True, 46 | **dl_kwargs, 47 | ) 48 | return dataloader 49 | -------------------------------------------------------------------------------- /QEfficient/finetune/utils/plot_metrics.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import argparse 9 | import json 10 | import os 11 | 12 | import matplotlib.pyplot as plt 13 | 14 | 15 | def plot_metric(data, metric_name, x_label, y_label, title, colors): 16 | plt.figure(figsize=(7, 6)) 17 | 18 | plt.plot( 19 | data[f"train_epoch_{metric_name}"], 20 | label=f"Train Epoch {metric_name.capitalize()}", 21 | color=colors[0], 22 | ) 23 | plt.plot( 24 | data[f"val_epoch_{metric_name}"], 25 | label=f"Validation Epoch {metric_name.capitalize()}", 26 | color=colors[1], 27 | ) 28 | plt.xlabel(x_label) 29 | plt.ylabel(y_label) 30 | plt.title(f"Train and Validation Epoch {title}") 31 | plt.legend() 32 | plt.tight_layout() 33 | 34 | 35 | def plot_single_metric_by_step(data, metric_name, x_label, y_label, title, color): 36 | plt.plot(data[f"{metric_name}"], label=f"{title}", color=color) 37 | plt.xlabel(x_label) 38 | plt.ylabel(y_label) 39 | plt.title(title) 40 | plt.legend() 41 | plt.tight_layout() 42 | 43 | 44 | def plot_metrics_by_step(data, metric_name, x_label, y_label, colors): 45 | plt.figure(figsize=(14, 6)) 46 | 47 | plt.subplot(1, 2, 1) 48 | plot_single_metric_by_step( 49 | data, 50 | f"train_step_{metric_name}", 51 | x_label, 52 | y_label, 53 | f"Train Step {metric_name.capitalize()}", 54 | colors[0], 55 | ) 56 | plt.subplot(1, 2, 2) 57 | plot_single_metric_by_step( 58 | data, 59 | f"val_step_{metric_name}", 60 | x_label, 61 | y_label, 62 | f"Validation Step {metric_name.capitalize()}", 63 | colors[1], 64 | ) 65 | plt.tight_layout() 66 | 67 | 68 | def plot_metrics(file_path): 69 | if not os.path.exists(file_path): 70 | print(f"File {file_path} does not exist.") 71 | return 72 | 73 | with open(file_path, "r") as f: 74 | try: 75 | data = json.load(f) 76 | except json.JSONDecodeError: 77 | print("Invalid JSON file.") 78 | return 79 | 80 | directory = os.path.dirname(file_path) 81 | filename_prefix = os.path.basename(file_path).split(".")[0] 82 | 83 | plot_metric(data, "loss", "Epoch", "Loss", "Loss", ["b", "r"]) 84 | plt.savefig(os.path.join(directory, f"{filename_prefix}_train_and_validation_loss.png")) 85 | plt.close() 86 | 87 | plot_metric(data, "perplexity", "Epoch", "Perplexity", "Perplexity", ["g", "m"]) 88 | plt.savefig(os.path.join(directory, f"{filename_prefix}_train_and_validation_perplexity.png")) 89 | plt.close() 90 | 91 | plot_metrics_by_step(data, "loss", "Step", "Loss", ["b", "r"]) 92 | plt.savefig(os.path.join(directory, f"{filename_prefix}_train_and_validation_loss_by_step.png")) 93 | plt.close() 94 | 95 | plot_metrics_by_step(data, "perplexity", "Step", "Loss", ["g", "m"]) 96 | plt.savefig(os.path.join(directory, f"{filename_prefix}_train_and_validation_perplexity_by_step.png")) 97 | plt.close() 98 | 99 | 100 | if __name__ == "__main__": 101 | parser = argparse.ArgumentParser(description="Plot metrics from JSON file.") 102 | parser.add_argument("--file_path", required=True, type=str, help="Path to the metrics JSON file.") 103 | args = parser.parse_args() 104 | 105 | plot_metrics(args.file_path) 106 | -------------------------------------------------------------------------------- /QEfficient/generation/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/peft/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from QEfficient.peft.auto import QEffAutoPeftModelForCausalLM 9 | from QEfficient.peft.peft_model import QEffPeftModelForCausalLM 10 | 11 | __all__ = [ 12 | "QEffAutoPeftModelForCausalLM", 13 | "QEffPeftModelForCausalLM", 14 | ] 15 | -------------------------------------------------------------------------------- /QEfficient/peft/lora/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | from QEfficient.peft.lora.auto import QEffAutoLoraModelForCausalLM 9 | 10 | __all__ = [ 11 | "QEffAutoLoraModelForCausalLM", 12 | ] 13 | -------------------------------------------------------------------------------- /QEfficient/peft/lora/layers.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import math 9 | from typing import Any 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | from QEfficient.customop import CtxGatherFuncCB 16 | 17 | 18 | class LinearMultiLoRA(nn.Linear): 19 | def multilora_init(self, lora_rank, max_num_adapters): 20 | if lora_rank < 1 or max_num_adapters < 1: 21 | raise ValueError("lora_rank and max_num_adapters must be greater or equal to 1") 22 | 23 | self.max_num_adapters = max_num_adapters 24 | self.lora_rank = lora_rank 25 | 26 | self.lora_a_weights = nn.Parameter( 27 | self.weight.new_zeros(self.max_num_adapters + 1, 1, self.in_features, self.lora_rank) 28 | ) 29 | self.lora_a_weights.requires_grad = False 30 | self.lora_b_weights = nn.Parameter( 31 | self.weight.new_zeros(self.max_num_adapters + 1, 1, self.lora_rank, self.out_features) 32 | ) 33 | self.lora_b_weights.requires_grad = False 34 | self.lora_scalings = torch.full((self.max_num_adapters + 1, 1, 1, 1), 1.0, dtype=torch.float) 35 | 36 | nn.init.kaiming_uniform_(self.lora_a_weights, a=math.sqrt(5)) 37 | nn.init.zeros_(self.lora_b_weights) 38 | 39 | def forward(self, x: torch.Tensor, lora_ids: torch.Tensor): 40 | result = F.linear(x, self.weight, bias=self.bias) 41 | 42 | # multilora implementation: lora_ids 43 | other_indices_a = torch.arange(self.lora_a_weights.shape[2]).view(1, 1, -1) 44 | selected_lora_a_weights = CtxGatherFuncCB.apply( 45 | self.lora_a_weights, lora_ids, other_indices_a 46 | ) # 47 | other_indices_b = torch.arange(self.lora_b_weights.shape[2]).view(1, 1, -1) 48 | selected_lora_b_weights = CtxGatherFuncCB.apply( 49 | self.lora_b_weights, lora_ids, other_indices_b 50 | ) # 51 | other_indices_s = torch.arange(self.lora_scalings.shape[2]).view(1, 1, -1) 52 | selected_lora_scalings = CtxGatherFuncCB.apply( 53 | self.lora_scalings, lora_ids, other_indices_s 54 | ) # 55 | 56 | selected_lora_a_weights = selected_lora_a_weights.squeeze(1) 57 | selected_lora_b_weights = selected_lora_b_weights.squeeze(1) 58 | selected_lora_scalings = selected_lora_scalings.squeeze(1) 59 | 60 | result = result + x @ selected_lora_a_weights @ selected_lora_b_weights * selected_lora_scalings 61 | 62 | return result 63 | 64 | 65 | class LinearBase(nn.Linear): 66 | def forward(self, x: torch.Tensor, **kwargs: Any): 67 | return super().forward(x) 68 | -------------------------------------------------------------------------------- /QEfficient/peft/lora/lora_model.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | from typing import List, Optional, Tuple, Union 9 | 10 | import torch 11 | from transformers.modeling_outputs import ( 12 | CausalLMOutputWithPast, 13 | ) 14 | 15 | from QEfficient.transformers.models.llama.modeling_llama import QEffLlamaForCausalLM 16 | from QEfficient.transformers.models.mistral.modeling_mistral import QEffMistralForCausalLM 17 | 18 | 19 | class QEffLoraModelMistralForCausalLM(QEffMistralForCausalLM): 20 | def forward( 21 | self, 22 | input_ids: torch.LongTensor = None, 23 | attention_mask: Optional[torch.Tensor] = None, 24 | position_ids: Optional[torch.LongTensor] = None, 25 | past_key_values: Optional[List[torch.FloatTensor]] = None, 26 | batch_index: Optional[torch.LongTensor] = None, 27 | inputs_embeds: Optional[torch.FloatTensor] = None, 28 | labels: Optional[torch.LongTensor] = None, 29 | use_cache: Optional[bool] = None, 30 | output_attentions: Optional[bool] = None, 31 | output_hidden_states: Optional[bool] = None, 32 | return_dict: Optional[bool] = None, 33 | cache_position: Optional[torch.LongTensor] = None, 34 | lora_ids: Optional[torch.Tensor] = None, 35 | **kwargs, 36 | ) -> Union[Tuple, CausalLMOutputWithPast]: 37 | kwargs["lora_ids"] = lora_ids 38 | 39 | return super().forward( 40 | input_ids=input_ids, 41 | attention_mask=attention_mask, 42 | position_ids=position_ids, 43 | past_key_values=past_key_values, 44 | batch_index=batch_index, 45 | inputs_embeds=inputs_embeds, 46 | use_cache=use_cache, 47 | output_attentions=output_attentions, 48 | output_hidden_states=output_hidden_states, 49 | return_dict=return_dict, 50 | cache_position=cache_position, 51 | **kwargs, 52 | ) 53 | 54 | 55 | class QEffLoraModelLlamaForCausalLM(QEffLlamaForCausalLM): 56 | def forward( 57 | self, 58 | input_ids: torch.LongTensor = None, 59 | attention_mask: Optional[torch.Tensor] = None, 60 | position_ids: Optional[torch.LongTensor] = None, 61 | past_key_values: Optional[List[torch.FloatTensor]] = None, 62 | batch_index: Optional[torch.LongTensor] = None, 63 | inputs_embeds: Optional[torch.FloatTensor] = None, 64 | labels: Optional[torch.LongTensor] = None, 65 | use_cache: Optional[bool] = None, 66 | output_attentions: Optional[bool] = None, 67 | output_hidden_states: Optional[bool] = None, 68 | return_dict: Optional[bool] = None, 69 | cache_position: Optional[torch.LongTensor] = None, 70 | lora_ids: Optional[torch.Tensor] = None, 71 | **kwargs, 72 | ) -> Union[Tuple, CausalLMOutputWithPast]: 73 | kwargs["lora_ids"] = lora_ids 74 | 75 | return super().forward( 76 | input_ids=input_ids, 77 | attention_mask=attention_mask, 78 | position_ids=position_ids, 79 | past_key_values=past_key_values, 80 | batch_index=batch_index, 81 | inputs_embeds=inputs_embeds, 82 | use_cache=use_cache, 83 | output_attentions=output_attentions, 84 | output_hidden_states=output_hidden_states, 85 | return_dict=return_dict, 86 | cache_position=cache_position, 87 | **kwargs, 88 | ) 89 | -------------------------------------------------------------------------------- /QEfficient/peft/lora/pytorch_transforms.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | from typing import Dict, Optional, Tuple 9 | 10 | from torch import nn 11 | 12 | from QEfficient.base.pytorch_transforms import ModuleMappingTransform 13 | from QEfficient.peft.lora.layers import LinearBase, LinearMultiLoRA 14 | from QEfficient.peft.lora.lora_model import QEffLoraModelLlamaForCausalLM, QEffLoraModelMistralForCausalLM 15 | from QEfficient.transformers.models.llama.modeling_llama import QEffLlamaForCausalLM 16 | from QEfficient.transformers.models.mistral.modeling_mistral import QEffMistralForCausalLM 17 | 18 | 19 | class LoraModelInputsTransform(ModuleMappingTransform): 20 | _module_mapping = { 21 | QEffMistralForCausalLM: QEffLoraModelMistralForCausalLM, 22 | QEffLlamaForCausalLM: QEffLoraModelLlamaForCausalLM, 23 | } 24 | 25 | 26 | class TargetModulesTransform(ModuleMappingTransform): 27 | _module_mapping = {nn.Linear: LinearMultiLoRA} 28 | 29 | _module_mapping_nontarget = {nn.Linear: LinearBase} 30 | 31 | # whole set of supported target modules for now (make sure **kwargs are passed in on modeling file) 32 | all_modules = {"q_proj", "k_proj", "v_proj", "o_proj"} 33 | 34 | # a class method that deals with target module names 35 | @classmethod 36 | def apply( 37 | cls, model: nn.Module, target_modules: Optional[Dict], lora_rank: int, max_num_adapters: int 38 | ) -> Tuple[nn.Module, bool]: 39 | transformed = False 40 | nontarget_modules = {key for key in cls.all_modules if key not in target_modules} 41 | 42 | for name, module in model.named_modules(): 43 | if repl_module := cls._module_mapping.get(type(module)): 44 | if name.split(".")[-1] in target_modules: 45 | module.__class__ = repl_module 46 | if hasattr(module, "multilora_init"): 47 | module.multilora_init(lora_rank, max_num_adapters) 48 | transformed = True 49 | elif name.split(".")[-1] in nontarget_modules: 50 | module.__class__ = cls._module_mapping_nontarget.get(type(module)) 51 | transformed = True 52 | 53 | return model, transformed 54 | -------------------------------------------------------------------------------- /QEfficient/peft/onnx_transforms.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | from typing import Tuple 9 | 10 | import onnx 11 | 12 | from QEfficient.base.onnx_transforms import OnnxTransform 13 | 14 | 15 | class AdapterWeightsToInputsTransform(OnnxTransform): 16 | @classmethod 17 | def apply(cls, model: onnx.ModelProto, *, adapter_name: str, **kwargs) -> Tuple[onnx.ModelProto, bool]: 18 | transformed = False 19 | removed_initializers = [] 20 | 21 | # Find nodes with lora weights as inputs 22 | weight_suffix = f".{adapter_name}.weight" 23 | lora_weight_nodes = { 24 | inp: node for node in model.graph.node for inp in node.input if inp.endswith(weight_suffix) 25 | } 26 | 27 | for i, weight in enumerate(model.graph.initializer): 28 | if weight.name.endswith(weight_suffix): 29 | transformed = True 30 | 31 | # Create input/output for lora weights 32 | new_weight_name = weight.name[: -len(weight_suffix)] + ".weight" 33 | type_proto = onnx.helper.make_tensor_type_proto(weight.data_type, shape=list(weight.dims)) 34 | inp = onnx.ValueInfoProto(name=new_weight_name, type=type_proto) 35 | out = onnx.ValueInfoProto(name=new_weight_name + "_RetainedState", type=type_proto) 36 | model.graph.input.append(inp) 37 | model.graph.output.append(out) 38 | 39 | # Create a node that connects input -> output 40 | node = onnx.helper.make_node("Identity", [inp.name], [out.name], new_weight_name + "_identity") 41 | model.graph.node.append(node) 42 | 43 | # Rename weight input 44 | lora_weight_node = lora_weight_nodes[weight.name] 45 | for j, inp in enumerate(lora_weight_node.input): 46 | if inp == weight.name: 47 | lora_weight_node.input[j] = new_weight_name 48 | 49 | # Remove weight initializers 50 | removed_initializers.append(i) 51 | 52 | if transformed: 53 | for i in sorted(removed_initializers, reverse=True): 54 | model.graph.initializer.pop(i) 55 | 56 | return model, transformed 57 | -------------------------------------------------------------------------------- /QEfficient/peft/peft_model.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | from peft import PeftModelForCausalLM, PeftType 9 | 10 | 11 | class QEffPeftModelForCausalLM(PeftModelForCausalLM): 12 | def forward( 13 | self, 14 | input_ids=None, 15 | attention_mask=None, 16 | position_ids=None, 17 | past_key_values=None, 18 | inputs_embeds=None, 19 | labels=None, 20 | output_attentions=None, 21 | output_hidden_states=None, 22 | return_dict=None, 23 | task_ids=None, 24 | **kwargs, 25 | ): 26 | peft_config = self.active_peft_config 27 | if not peft_config.is_prompt_learning: 28 | if self.base_model.config.model_type == "mpt": 29 | if inputs_embeds is not None: 30 | raise AssertionError("forward in MPTForCausalLM does not support inputs_embeds") 31 | return self.base_model( 32 | input_ids=input_ids, 33 | attention_mask=attention_mask, 34 | position_ids=position_ids, 35 | past_key_values=past_key_values, 36 | labels=labels, 37 | output_attentions=output_attentions, 38 | output_hidden_states=output_hidden_states, 39 | return_dict=return_dict, 40 | **kwargs, 41 | ) 42 | 43 | if peft_config.peft_type == PeftType.POLY: 44 | kwargs["task_ids"] = task_ids 45 | 46 | with self._enable_peft_forward_hooks(**kwargs): 47 | kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} 48 | return self.base_model( 49 | input_ids=input_ids, 50 | attention_mask=attention_mask, 51 | position_ids=position_ids, 52 | past_key_values=past_key_values, 53 | inputs_embeds=inputs_embeds, 54 | labels=labels, 55 | output_attentions=output_attentions, 56 | output_hidden_states=output_hidden_states, 57 | return_dict=return_dict, 58 | **kwargs, 59 | ) 60 | 61 | raise NotImplementedError("Prompt learning methods are not supported from QEfficient") 62 | -------------------------------------------------------------------------------- /QEfficient/peft/pytorch_transforms.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | from peft import PeftModelForCausalLM 9 | 10 | from QEfficient.base.pytorch_transforms import ModuleMappingTransform 11 | from QEfficient.peft.peft_model import QEffPeftModelForCausalLM 12 | 13 | 14 | class PeftModelInputsTransform(ModuleMappingTransform): 15 | _module_mapping = {PeftModelForCausalLM: QEffPeftModelForCausalLM} 16 | -------------------------------------------------------------------------------- /QEfficient/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/modeling_attn_mask_utils.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. 
and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from typing import Optional 9 | 10 | import torch 11 | 12 | 13 | def _create_causal_mask( 14 | position_ids, 15 | target_length, 16 | sliding_window: Optional[int] = None, 17 | ): 18 | """ 19 | A utility attention mask class that allows one to: 20 | - Create a causal 4d mask 21 | - Create a causal 4d mask with slided window 22 | """ 23 | if sliding_window is not None: 24 | query_indices = position_ids.unsqueeze(-1) 25 | kv_indices = torch.arange(target_length).view(1, -1) 26 | # --- Rolling buffer --- 27 | pos_max = position_ids.max(1, keepdim=True).values 28 | kv_start = (pos_max // target_length) * target_length 29 | kv_indices_high = kv_indices + kv_start 30 | kv_indices_low = torch.where(kv_indices_high < target_length, kv_indices, kv_indices_high - target_length) 31 | kv_indices = torch.where(kv_indices_high > pos_max, kv_indices_low, kv_indices_high) 32 | kv_indices = kv_indices.unsqueeze(1) 33 | # ------ 34 | causal_mask = kv_indices > query_indices 35 | attention_mask = causal_mask 36 | 37 | window_indices = query_indices - sliding_window + 1 38 | window_mask = kv_indices < window_indices 39 | attention_mask = attention_mask | window_mask 40 | attention_mask = attention_mask.unsqueeze(1) 41 | else: 42 | query_indices = position_ids.unsqueeze(-1) 43 | kv_indices = torch.arange(target_length).view(1, 1, -1) 44 | attention_mask = kv_indices > query_indices 45 | attention_mask = attention_mask.unsqueeze(1) 46 | 47 | return attention_mask 48 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/codegen/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/falcon/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/gemma/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. 
and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/gemma2/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/gpt2/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/gpt_bigcode/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/gptj/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/granite/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/granitemoe/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/internvl/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/llama/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/llama_swiftkv/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/llava/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/llava_next/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/mistral/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/mixtral_moe/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/mllama/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/mpt/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/phi/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/phi3/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/qwen2/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/starcoder2/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/whisper/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/post_processing.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from QEfficient.transformers.spd.turbo import build_and_attach_turbo 9 | from QEfficient.utils.spd_utils import get_speculative_config, get_speculative_weights 10 | 11 | model_type_registry = dict(turbo=build_and_attach_turbo) 12 | 13 | 14 | def build_and_attach_mlp(model, pretrained_model_name_or_path, speculative_model_type: str, **kwargs): 15 | speculative_config: dict = get_speculative_config(pretrained_model_name_or_path, **kwargs) 16 | speculative_weights: str = get_speculative_weights(pretrained_model_name_or_path, **kwargs) 17 | 18 | if (model_type := speculative_config.get("model_type")) is None: 19 | speculative_config["model_type"] = speculative_model_type 20 | else: 21 | if model_type != speculative_model_type: 22 | raise ValueError( 23 | f"`model_type` key from speculator config ({model_type} does not match input model type ({speculative_model_type})." 24 | ) 25 | func = model_type_registry[speculative_model_type] 26 | model = func(model, speculative_config, speculative_weights) 27 | model.config.speculative_config = speculative_config 28 | return model 29 | -------------------------------------------------------------------------------- /QEfficient/transformers/quantizers/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/quantizers/auto.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | from transformers.quantizers.auto import AUTO_QUANTIZATION_CONFIG_MAPPING, AUTO_QUANTIZER_MAPPING 9 | from transformers.quantizers.quantizer_awq import AwqQuantizer 10 | from transformers.quantizers.quantizer_compressed_tensors import CompressedTensorsHfQuantizer 11 | from transformers.quantizers.quantizer_gptq import GptqHfQuantizer 12 | from transformers.utils.quantization_config import AwqConfig, CompressedTensorsConfig, GPTQConfig 13 | 14 | from QEfficient.transformers.quantizers.quantizer_awq import QEffAwqConfig, QEffAwqQuantizer 15 | from QEfficient.transformers.quantizers.quantizer_compressed_tensors import ( 16 | QEffCompressedTensorsConfig, 17 | QEffCompressedTensorsFP8Quantizer, 18 | QEffFP8Config, 19 | QEffFP8Quantizer, 20 | ) 21 | from QEfficient.transformers.quantizers.quantizer_gptq import QEffGPTQConfig, QEffGPTQQuantizer 22 | 23 | QEFF_AUTO_QUANTIZER_MAPPING = { 24 | "awq": QEffAwqQuantizer, 25 | "gptq": QEffGPTQQuantizer, 26 | "compressed-tensors": QEffCompressedTensorsFP8Quantizer, 27 | "fp8": QEffFP8Quantizer, 28 | } 29 | QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING = { 30 | "awq": QEffAwqConfig, 31 | "gptq": QEffGPTQConfig, 32 | "compressed-tensors": QEffCompressedTensorsConfig, 33 | "fp8": QEffFP8Config, 34 | } 35 | DUPLICATE_AUTO_QUANTIZER_MAPPING = { 36 | "awq": AwqQuantizer, 37 | "gptq": GptqHfQuantizer, 38 | "compressed-tensors": CompressedTensorsHfQuantizer, 39 | "fp8": None, 40 | } 41 | DUPLICATE_AUTO_QUANTIZATION_CONFIG_MAPPING = { 42 | "awq": AwqConfig, 43 | "gptq": GPTQConfig, 44 | "compressed-tensors": CompressedTensorsConfig, 45 | "fp8": None, 46 | } 47 | 48 | 49 | def with_replaced_quantizers(func): 50 | def wrapper(*args, **kwargs): 51 | transformers_replaced_quantization_config_mapping = dict() 52 | transformers_replaced_quantizer_mapping = dict() 53 | 54 | for k in QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.keys(): 55 | # Replace quantization config 56 | transformers_replaced_quantization_config_mapping[k] = AUTO_QUANTIZATION_CONFIG_MAPPING.get(k, None) 57 | AUTO_QUANTIZATION_CONFIG_MAPPING[k] = QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING[k] 58 | 59 | # Replace quantizer 60 | transformers_replaced_quantizer_mapping[k] = AUTO_QUANTIZER_MAPPING.get(k, None) 61 | AUTO_QUANTIZER_MAPPING[k] = QEFF_AUTO_QUANTIZER_MAPPING[k] 62 | 63 | # Call the function for loading quantized models here 64 | out = func(*args, **kwargs) 65 | 66 | # Put back quantization config and quantizer 67 | for k in QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.keys(): 68 | AUTO_QUANTIZATION_CONFIG_MAPPING[k] = transformers_replaced_quantization_config_mapping[k] 69 | AUTO_QUANTIZER_MAPPING[k] = transformers_replaced_quantizer_mapping[k] 70 | 71 | return out 72 | 73 | return wrapper 74 | 75 | 76 | def replace_transformers_quantizers(): 77 | """ 78 | This method lets you import AWQ/GPTQ models on CPU without bypassing the 79 | rule of transformers of need to GPU. 
80 | Just call this method before using 81 | `transformer.AutoModelForCausalLM.from_pretrained` and any AWQ/GPTQ model 82 | that can be supported by QEfficient will be loaded using CPU. 83 | """ 84 | AUTO_QUANTIZER_MAPPING.update(QEFF_AUTO_QUANTIZER_MAPPING) 85 | AUTO_QUANTIZATION_CONFIG_MAPPING.update(QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING) 86 | 87 | 88 | # TODO: Make this a fixture? Or better, always update the quantizer and config in transformers. 89 | # When a user imports QEfficient, these are always available. 90 | def undo_transformers_quantizers(): 91 | """ 92 | This method is used to undo the effects on method `replace_transformers_quantizers`. 93 | After this is called, the transformers library will be used for loading AWQ/GPTQ models. 94 | """ 95 | AUTO_QUANTIZER_MAPPING.update(DUPLICATE_AUTO_QUANTIZER_MAPPING) 96 | AUTO_QUANTIZATION_CONFIG_MAPPING.update(DUPLICATE_AUTO_QUANTIZATION_CONFIG_MAPPING) 97 | -------------------------------------------------------------------------------- /QEfficient/transformers/quantizers/awq.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import torch 9 | import torch.nn as nn 10 | 11 | from QEfficient.transformers.quantizers.quantizer_utils import dequantize_gemm 12 | 13 | 14 | class WQLinear_GEMM(nn.Module): 15 | def __init__(self, bits, group_size, in_features, out_features, bias): 16 | super().__init__() 17 | 18 | if bits != 4: 19 | raise NotImplementedError("Only 4-bit are supported for now.") 20 | 21 | self.in_features = in_features 22 | self.out_features = out_features 23 | self.bits = bits 24 | self.group_size = group_size if group_size != -1 else in_features 25 | 26 | # quick sanity check (make sure alignment) 27 | if self.in_features % self.group_size != 0: 28 | raise ValueError( 29 | f"in_features should be perfectly divisible by group_size, got in_features = {self.in_features}, group_size = {self.group_size} while initializing WQLinear_GEMM module" 30 | ) 31 | if out_features % (32 // self.bits) != 0: 32 | raise ValueError( 33 | f"out_features must be perfectly divisible by number of weights packed into int32 value i.e. 
8, got out_features={self.out_features}" 34 | ) 35 | 36 | # For compatibility with QuantLinearORT 37 | self.g_idx = torch.tensor([i // group_size for i in range(in_features)], dtype=torch.int32) 38 | self.register_buffer( 39 | "qweight", 40 | torch.zeros( 41 | (in_features, out_features // (32 // self.bits)), 42 | dtype=torch.int32, 43 | ), 44 | ) 45 | self.register_buffer( 46 | "qzeros", 47 | torch.zeros( 48 | (in_features // self.group_size, out_features // (32 // self.bits)), 49 | dtype=torch.int32, 50 | ), 51 | ) 52 | self.register_buffer( 53 | "scales", 54 | torch.zeros( 55 | (in_features // self.group_size, out_features), 56 | dtype=torch.float16, 57 | ), 58 | ) 59 | if bias: 60 | self.register_buffer( 61 | "bias", 62 | torch.zeros( 63 | (out_features), 64 | dtype=torch.float16, 65 | ), 66 | ) 67 | else: 68 | self.bias = None 69 | 70 | def forward(self, x): 71 | # Only Inference supported 72 | with torch.no_grad(): 73 | out_shape = x.shape[:-1] + (self.out_features,) 74 | 75 | out = dequantize_gemm(self.qweight, self.qzeros, self.scales, self.bits, self.group_size) 76 | out = torch.matmul(x.float(), out.float()) 77 | 78 | out = out + self.bias if self.bias is not None else out 79 | out = out.reshape(out_shape) 80 | 81 | return out 82 | -------------------------------------------------------------------------------- /QEfficient/transformers/quantizers/gptq.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | import math 9 | 10 | import torch 11 | from torch import nn 12 | 13 | from QEfficient.transformers.quantizers.quantizer_utils import dequantize_gptq 14 | 15 | 16 | class QuantLinearGPTQ(nn.Module): 17 | """ 18 | A quantized linear layer using GPTQ (Generalized Post-Training Quantization). 19 | This class supports only 4-bit quantization and is compatible with QuantLinearORT. 20 | 21 | Research paper link- GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers (https://arxiv.org/abs/2210.17323) 22 | 23 | Attributes: 24 | in_features (int): The number of input features. 25 | out_features (int): The number of output features. 26 | bits (int): The number of bits used for quantization (must be 4). 27 | act_order (None or bool): The activation order. 28 | orig_fp_weight (None or torch.Tensor): The original floating-point weights. 29 | maxq (int): The maximum quantization value. 30 | group_size (int): The group size for quantization. 31 | pack_mode (str): The packing mode, set to "GPTQ". 32 | qweight (torch.Tensor): The quantized weight tensor. 33 | qzeros (torch.Tensor): The quantized zeros tensor. 34 | scales (torch.Tensor): The scales tensor. 35 | g_idx (torch.Tensor): The group index tensor. 36 | bias (torch.Tensor or None): The bias tensor, if applicable. 
37 | """ 38 | 39 | def __init__(self, bits, group_size, in_features, out_features, bias): 40 | super().__init__() 41 | if bits != 4: 42 | raise NotImplementedError("Only 4 bits are supported.") 43 | self.in_features = in_features 44 | self.out_features = out_features 45 | self.bits = bits 46 | self.act_order = None 47 | self.orig_fp_weight = None 48 | self.maxq = 2**self.bits - 1 49 | self.group_size = group_size if group_size != -1 else in_features 50 | self.pack_mode = "GPTQ" 51 | 52 | # For compatibility with QuantLinearORT 53 | self.register_buffer( 54 | "qweight", 55 | torch.zeros((in_features // 32 * self.bits, out_features), dtype=torch.int32), 56 | ) 57 | self.register_buffer( 58 | "qzeros", 59 | torch.zeros((math.ceil(in_features / self.group_size), out_features // 32 * self.bits), dtype=torch.int32), 60 | ) 61 | self.register_buffer( 62 | "scales", 63 | torch.zeros((math.ceil(in_features / self.group_size), out_features), dtype=torch.float16), 64 | ) 65 | self.g_idx = torch.tensor([i // group_size for i in range(in_features)], dtype=torch.int32) 66 | if bias: 67 | self.register_buffer( 68 | "bias", 69 | torch.zeros((out_features), dtype=torch.float16), 70 | ) 71 | else: 72 | self.bias = None 73 | 74 | def forward(self, x): 75 | # Only Inference supported 76 | out, _, _ = dequantize_gptq(self.qweight.T, self.qzeros, self.scales, self.bits, self.g_idx) 77 | out = torch.matmul(x.float(), out.float()) 78 | out = out + self.bias if self.bias is not None else out 79 | 80 | return out 81 | -------------------------------------------------------------------------------- /QEfficient/transformers/quantizers/quantizer_awq.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import torch 9 | from transformers.quantizers.quantizer_awq import AwqQuantizer 10 | from transformers.utils.quantization_config import AwqBackendPackingMethod, AwqConfig, AWQLinearVersion 11 | 12 | from QEfficient.transformers.quantizers.awq import WQLinear_GEMM 13 | from QEfficient.transformers.quantizers.quantizer_utils import ( 14 | get_keys_to_not_convert, 15 | replace_linear_layer_with_target_layer, 16 | replace_quantization_scales, 17 | ) 18 | from QEfficient.utils.logging_utils import logger 19 | 20 | 21 | class QEffAwqConfig(AwqConfig): 22 | def post_init(self): 23 | """ 24 | Safety checker that arguments are correct 25 | """ 26 | 27 | if self.backend not in [AwqBackendPackingMethod.AUTOAWQ]: 28 | raise ValueError( 29 | f"Only quantization backend {AwqBackendPackingMethod.AUTOAWQ} is supported - not recognized backend {self.backend}" 30 | ) 31 | 32 | self.version = AWQLinearVersion.from_str(self.version) 33 | if self.version not in [AWQLinearVersion.GEMM]: 34 | raise ValueError( 35 | f"Only {AWQLinearVersion.GEMM} version in supported - not recognized version {self.version}" 36 | ) 37 | 38 | if self.do_fuse or self.fuse_max_seq_len is not None: 39 | raise ValueError( 40 | f"fused modules are not supported, got do_fuse={self.do_fuse}, fuse_max_seq_len={self.fuse_max_seq_len}" 41 | ) 42 | 43 | if self.bits != 4: 44 | raise ValueError(f"Only 4-bit AWQ quantization is supported, got bits={self.bits}") 45 | 46 | 47 | class QEffAwqQuantizer(AwqQuantizer): 48 | target_cls = WQLinear_GEMM 49 | 50 | def __init__(self, quantization_config: QEffAwqConfig, **kwargs): 51 | super().__init__(quantization_config, **kwargs) 52 | 53 | def validate_environment(self, device_map, **kwargs): 54 | # No need to validate as we will always use pytorch CPU version. 55 | return True 56 | 57 | @property 58 | def is_trainable(self): 59 | return False 60 | 61 | def update_torch_dtype(self, torch_dtype): 62 | if torch_dtype not in [None, torch.float32]: 63 | logger.warning(f"Requested dtype {torch_dtype} is not supported, overriding to None") 64 | return None 65 | 66 | def _process_model_before_weight_loading(self, model, **kwargs): 67 | self.modules_to_not_convert = get_keys_to_not_convert(model) 68 | 69 | if self.quantization_config.modules_to_not_convert is not None: 70 | self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert) 71 | 72 | model, has_been_replaced = replace_linear_layer_with_target_layer( 73 | model, 74 | target_cls=self.target_cls, 75 | quantization_config=self.quantization_config, 76 | modules_to_not_convert=self.modules_to_not_convert, 77 | ) 78 | 79 | model = replace_quantization_scales(model, model.config.model_type) 80 | if not has_been_replaced: 81 | logger.warning( 82 | "You are loading an AWQ model but no linear modules were found in your model." 83 | " Please double check your model architecture, or submit an issue on github if you think this is a bug." 84 | ) 85 | -------------------------------------------------------------------------------- /QEfficient/transformers/spd/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/spd/turbo.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import torch 9 | 10 | from QEfficient.utils.checkpoint_utils import load_checkpoint 11 | 12 | 13 | class ResBlock(torch.nn.Module): 14 | """ 15 | A Residual Block module. 16 | This module performs a linear transformation followed by a SiLU activation, 17 | and then adds the result to the original input, creating a residual connection. 18 | Args: 19 | hidden_size (int): The size of the hidden layers in the block. 20 | """ 21 | 22 | def __init__(self, hidden_size): 23 | super().__init__() 24 | self.linear = torch.nn.Linear(hidden_size, hidden_size) 25 | # Initialize as an identity mapping 26 | torch.nn.init.zeros_(self.linear.weight) 27 | # Use SiLU activation to keep consistent with the Llama model 28 | self.act = torch.nn.SiLU() 29 | 30 | def forward(self, x): 31 | """ 32 | Forward pass of the ResBlock. 33 | Args: 34 | x (torch.Tensor): Input tensor. 35 | Returns: 36 | torch.Tensor: Output after the residual connection and activation. 37 | """ 38 | return x + self.act(self.linear(x)) 39 | 40 | 41 | def post_process_turbo_state_dict(state_dict: dict) -> dict: 42 | """normaize turbo state dict keys 43 | Args: 44 | state_dict (dict): turbo state dict 45 | Returns: 46 | dict: normalized state dict 47 | """ 48 | new_state_dict = dict() 49 | for name, weights in state_dict.items(): 50 | new_name = name.replace("projections.", "") 51 | new_state_dict[new_name] = weights 52 | return new_state_dict 53 | 54 | 55 | def build_and_attach_turbo(model, speculative_config: dict, speculative_weights: str): 56 | """build and attach turbo projections 57 | Args: 58 | model: model to attach projections to 59 | speculative_config (dict): speculative config file used to build projections 60 | Returns: 61 | model: model with turbo projections 62 | """ 63 | hidden_size = model.config.hidden_size 64 | num_layers = speculative_config["turbo_num_layers"] 65 | num_heads = speculative_config["turbo_num_heads"] 66 | projections = torch.nn.ModuleList( 67 | [ 68 | torch.nn.Sequential( 69 | *([ResBlock(hidden_size)] * num_layers), 70 | ) 71 | for _ in range(num_heads) 72 | ], 73 | ) 74 | load_checkpoint(projections, speculative_weights, strict=True, post_process_func=post_process_turbo_state_dict) 75 | model.projections = projections 76 | speculative_config["num_speculative_tokens"] = num_heads 77 | return model 78 | -------------------------------------------------------------------------------- /QEfficient/transformers/transform.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import hashlib 9 | 10 | import torch.nn as nn 11 | import transformers 12 | 13 | from QEfficient.base.modeling_qeff import QEFFBaseModel 14 | from QEfficient.transformers.cache_utils import QEffDynamicCache 15 | from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict 16 | from QEfficient.utils.logging_utils import logger 17 | 18 | 19 | def replace_module_with_qeff_layers(model: nn.Module) -> None: 20 | """ 21 | Replaces the transformers nn.Module classes with optimized QEff classes in place. 22 | 23 | Args: 24 | :model (torch.nn.Module): Base PyTorch model. 25 | """ 26 | # Replace if the module class is registered in TransformersToQEffModulesDict 27 | target_module = TransformersToQEffModulesDict.get(model.__class__) 28 | if target_module is not None: 29 | model.__class__ = target_module 30 | 31 | # Iterate over child modules 32 | for _, module in model.named_children(): 33 | replace_module_with_qeff_layers(module) 34 | 35 | 36 | def get_params_hash(model: nn.Module) -> str: 37 | """ 38 | Creates a hash of all the parameter values (i.e. weights) using the SHA256 algorithm. 39 | 40 | Args: 41 | model (torch.nn.Module): Base PyTorch model. 42 | 43 | Returns: 44 | :str: Hash string 45 | """ 46 | hasher = hashlib.sha256() 47 | for _, params in model.named_parameters(): 48 | hasher.update(params.data.numpy().tobytes()) 49 | 50 | return hasher.hexdigest() 51 | 52 | 53 | def transform_lm(model: nn.Module) -> nn.Module: 54 | """ 55 | Replaces some Transformers torch.nn.Module layers with equivalent optimized modules for Cloud AI 100. 56 | 57 | Args: 58 | model (torch.nn.Module): PyTorch model. 59 | 60 | Returns: 61 | :torch.nn.Module: PyTorch Module with replaced QEff layers. 62 | """ 63 | 64 | # Introducing the qeff_transformed attribute in the model to check the status of the transform 65 | if getattr(model, "qeff_transformed", False): 66 | print("Model is already transformed") 67 | return model 68 | 69 | # Get the hash of all params for checking later 70 | prior_params_hash = get_params_hash(model) 71 | logger.warning(f"The model {model.__class__} layers have been updated to QEff layers in-place") 72 | # Replace with QEff layers 73 | replace_module_with_qeff_layers(model) 74 | 75 | # Check with the new params hash 76 | later_params_hash = get_params_hash(model) 77 | if prior_params_hash != later_params_hash: 78 | raise RuntimeError("Weights were changed in the transform process, please report an issue") 79 | 80 | # Replace the DynamicCache update API 81 | transformers.cache_utils.DynamicCache.update = QEffDynamicCache.update 82 | 83 | setattr(model, "qeff_transformed", True) 84 | return model.eval() 85 | 86 | 87 | def transform(model: QEFFBaseModel, form_factor="cloud"): 88 | """ 89 | This function optimizes any kind of model (i.e. LLM, SD, AWQ, etc.) for Cloud AI 100. 90 | It replaces the torch.nn.Module layers of the passed QEffModel with optimized implementations of the same. 91 | 92 | model (torch.nn.Module): an instance of any class that is a child of `QEFFBaseAutoModelFactory` 93 | form_factor (str): form factor configuration for optimizing the model, available options=["cloud", "edge"]. 
94 | """ 95 | if form_factor != "cloud": 96 | raise ValueError("Only form_factor='cloud' is supported as of now!") 97 | # FIXME: move this to class and use model.transform() 98 | transform_lm(model.model) # type: ignore 99 | return model 100 | -------------------------------------------------------------------------------- /QEfficient/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from QEfficient.transformers.quantizers.auto import ( # noqa: F401 9 | replace_transformers_quantizers, 10 | undo_transformers_quantizers, 11 | ) 12 | from QEfficient.utils._utils import ( # noqa: F401 13 | check_and_assign_cache_dir, 14 | dump_qconfig, 15 | get_num_layers_from_config, 16 | get_num_layers_vlm, 17 | get_onnx_dir_name, 18 | get_padding_shape_from_config, 19 | get_padding_shape_vlm, 20 | get_qpc_dir_path, 21 | hf_download, 22 | load_hf_processor, 23 | load_hf_tokenizer, 24 | login_and_download_hf_lm, 25 | onnx_exists, 26 | padding_check_and_fix, 27 | qpc_exists, 28 | ) 29 | -------------------------------------------------------------------------------- /QEfficient/utils/cache.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | import json 9 | import os 10 | from pathlib import Path 11 | 12 | QEFF_HOME: Path = None 13 | if "QEFF_HOME" in os.environ: 14 | QEFF_HOME = Path(os.environ["QEFF_HOME"]) 15 | elif "XDG_CACHE_HOME" in os.environ: 16 | QEFF_HOME = Path(os.environ["XDG_CACHE_HOME"]) / "qeff_models" 17 | else: 18 | QEFF_HOME = Path("~/.cache/qeff_models").expanduser() 19 | 20 | 21 | def json_serializable(obj): 22 | if isinstance(obj, set): 23 | return sorted(obj) 24 | raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable") 25 | 26 | 27 | def to_hashable(obj) -> bytes: 28 | """ 29 | Converts obj to bytes such that same object will result in same hash 30 | """ 31 | return json.dumps( 32 | obj, 33 | skipkeys=False, 34 | ensure_ascii=True, 35 | check_circular=True, 36 | allow_nan=False, 37 | indent=None, 38 | separators=(",", ":"), 39 | default=json_serializable, 40 | sort_keys=True, 41 | ).encode() 42 | -------------------------------------------------------------------------------- /QEfficient/utils/checkpoint_utils.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from safetensors.torch import load_file 9 | 10 | 11 | def load_checkpoint(model, checkpoint: str, strict=False, post_process_func=None): 12 | """Load weights from a checkpoint ending with the `.safetensors` extension 13 | Args: 14 | model: model to load weights into 15 | checkpoint (str): checkpoint path 16 | strict (bool, optional): strictness of loading weights. Defaults to False. 17 | post_process_func (optional): Optional post-processing of the loaded state dict. Defaults to None. 18 | Returns: 19 | model: model with applied weights 20 | """ 21 | state_dict: dict = load_file(checkpoint) 22 | if post_process_func is not None: 23 | state_dict = post_process_func(state_dict) 24 | model.load_state_dict(state_dict, strict=strict) 25 | return model 26 | -------------------------------------------------------------------------------- /QEfficient/utils/device_utils.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import math 9 | import subprocess 10 | 11 | from QEfficient.utils.constants import Constants 12 | from QEfficient.utils.logging_utils import logger 13 | 14 | 15 | def get_available_device_id(): 16 | """ 17 | API to check for an available device id. 18 | 19 | Return: 20 | :int: Available device id. 21 | """ 22 | 23 | device_id = 0 24 | result = None 25 | 26 | # FIXME: goes into an infinite loop when the user doesn't have permission and the command gives permission denied. 27 | # To reproduce, change the ownership of the available devices. 
28 | while 1: 29 | command = ["/opt/qti-aic/tools/qaic-util", "-q", "-d", f"{device_id}"] 30 | try: 31 | result = subprocess.run(command, capture_output=True, text=True) 32 | except OSError: 33 | logger.warning("Not a Cloud AI 100 device, Command not found", command) 34 | return None 35 | if result: 36 | if "Status:Error" in result.stdout: 37 | device_id += 1 38 | elif "Status:Ready" in result.stdout: 39 | logger.info("device is available.") 40 | return [device_id] 41 | elif "Failed to find requested device ID" in result.stdout: 42 | logger.warning("Failed to find requested device ID") 43 | return None 44 | 45 | 46 | def is_qpc_size_gt_32gb(params: int, mxfp6: bool) -> bool: 47 | if mxfp6: 48 | qpc_size = math.ceil((params * 1) / Constants.GB) 49 | else: 50 | qpc_size = math.ceil((params * 2) / Constants.GB) 51 | 52 | logger.warning(f"Approximate QPC size is: {qpc_size} GB") 53 | num_devices = math.ceil(qpc_size / Constants.MAX_QPC_LIMIT) 54 | logger.warning(f"Number of Devices required: {num_devices}") 55 | return qpc_size > Constants.MAX_QPC_LIMIT 56 | 57 | 58 | def is_multi_qranium_setup_available(): 59 | result = None 60 | command = ["/opt/qti-aic/tools/qaic-util", "-q"] 61 | try: 62 | result = subprocess.run(command, stdout=subprocess.PIPE, universal_newlines=True) 63 | filtered_result = subprocess.run( 64 | ["grep", "Device Capabilities"], input=result.stdout, stdout=subprocess.PIPE, text=True 65 | ) 66 | except OSError: 67 | print("Command not found", command) 68 | return None 69 | 70 | lines = filtered_result.stdout.split("\n") 71 | 72 | # to count the number of devices in MQ enabled set up 73 | hybridboot_mdp_count = 0 74 | for line in lines: 75 | if ("HybridBoot+" in line) and ("MDP+" in line): 76 | hybridboot_mdp_count = hybridboot_mdp_count + 1 77 | 78 | if hybridboot_mdp_count > 0: 79 | print("No: of Devices with MQ enabled available: ", hybridboot_mdp_count) 80 | return True 81 | else: 82 | print("Device in MQ set up not available") 83 | return False 84 | -------------------------------------------------------------------------------- /QEfficient/utils/logging_utils.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import logging 9 | 10 | 11 | class QEffFormatter(logging.Formatter): 12 | """ 13 | Formatter class used to set colors for printing different logging levels of messages on console. 14 | """ 15 | 16 | cyan: str = "\x1b[38;5;14m" 17 | yellow: str = "\x1b[33;20m" 18 | red: str = "\x1b[31;20m" 19 | bold_red: str = "\x1b[31;1m" 20 | reset: str = "\x1b[0m" 21 | common_format: str = "%(levelname)s - %(name)s - %(message)s" # type: ignore 22 | format_with_line_info = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)" # type: ignore 23 | 24 | FORMATS = { 25 | logging.DEBUG: cyan + format_with_line_info + reset, 26 | logging.INFO: cyan + common_format + reset, 27 | logging.WARNING: yellow + common_format + reset, 28 | logging.ERROR: red + format_with_line_info + reset, 29 | logging.CRITICAL: bold_red + format_with_line_info + reset, 30 | } 31 | 32 | def format(self, record): 33 | """ 34 | Overriding the base class method to Choose format based on log level. 
35 | """ 36 | log_fmt = self.FORMATS.get(record.levelno) 37 | formatter = logging.Formatter(log_fmt) 38 | return formatter.format(record) 39 | 40 | 41 | def create_logger() -> logging.Logger: 42 | """ 43 | Creates a logger object with Colored QEffFormatter. 44 | """ 45 | logger = logging.getLogger("QEfficient") 46 | 47 | # create console handler and set level to debug 48 | ch = logging.StreamHandler() 49 | ch.setLevel(logging.INFO) 50 | # define formatter 51 | ch.setFormatter(QEffFormatter()) 52 | 53 | logger.addHandler(ch) 54 | return logger 55 | 56 | 57 | # Define the logger object that can be used for logging purposes throughout the module. 58 | logger = create_logger() 59 | -------------------------------------------------------------------------------- /QEfficient/utils/model_registery.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | 9 | from transformers import AutoConfig, AutoModelForCausalLM 10 | 11 | # Placeholder for all non-transformer models 12 | from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import ( 13 | QEffLlamaSwiftKVConfig, 14 | QEffLlamaSwiftKVForCausalLM, 15 | ) 16 | 17 | # Map of model type to config class, Modelling class and transformer model architecture class 18 | MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS = { 19 | "llama_swiftkv": [QEffLlamaSwiftKVConfig, QEffLlamaSwiftKVForCausalLM, AutoModelForCausalLM], 20 | } 21 | 22 | # loop over all the model types which are not present in transformers and register them 23 | for model_type, model_cls in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items(): 24 | # Register the model config class based on the model type. This will be first element in the tuple 25 | AutoConfig.register(model_type, model_cls[0]) 26 | 27 | # Register the non transformer library Class and config class using AutoModelClass 28 | model_cls[2].register(model_cls[0], model_cls[1]) 29 | -------------------------------------------------------------------------------- /QEfficient/utils/spd_utils.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from pathlib import Path 9 | 10 | from huggingface_hub import hf_hub_download 11 | from transformers import PretrainedConfig 12 | 13 | from QEfficient.utils._utils import filter_kwargs 14 | 15 | 16 | def get_speculative_config(pretrained_model_name_or_path, **kwargs) -> dict: 17 | if not isinstance(pretrained_model_name_or_path, (str, Path)): 18 | raise ValueError( 19 | f"`pretrained_config` must be a string or Path object but is of type {type(pretrained_model_name_or_path)}" 20 | ) 21 | try: 22 | speculative_config, _ = PretrainedConfig.get_config_dict( 23 | pretrained_model_name_or_path, _configuration_file="speculator_config.json", **kwargs 24 | ) 25 | except OSError as err: 26 | raise OSError(f"{err}.\nFile 'speculator_config.json' is expected to exist to apply turbo projections.") 27 | return speculative_config 28 | 29 | 30 | def get_speculative_weights(pretrained_model_name_or_path, **kwargs) -> str: 31 | turbo_weights_file = "speculator.safetensors" 32 | hf_hub_kwargs = filter_kwargs(hf_hub_download, kwargs) 33 | if (local_path := Path(pretrained_model_name_or_path)).exists(): 34 | if not local_path.is_dir(): 35 | raise ValueError(f"local model path {local_path} must point to an existing directory") 36 | weights_path = local_path / turbo_weights_file 37 | if not weights_path.exists(): 38 | raise FileNotFoundError(f"weights path {weights_path} does not exist.") 39 | else: 40 | weights_path = hf_hub_download(pretrained_model_name_or_path, filename=turbo_weights_file, **hf_hub_kwargs) 41 | return str(weights_path) 42 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Docs 2 | 3 | This directory contains the instructions for building static HTML documentation based on [sphinx](https://www.sphinx-doc.org/en/master/). 4 | 5 | 6 | ## Build the docs 7 | Install the packages required for building the documentation: 8 | 9 | ```sh 10 | pip install -r docs/requirements.txt 11 | ``` 12 | 13 | Then change directory to the docs folder to build the docs. 14 | 15 | ```sh 16 | cd docs/ 17 | # To build docs specific to the current branch 18 | sphinx-build -M html . build 19 | # [Optional] To build docs for all the supported branches 20 | sphinx-multiversion . build 21 | ``` 22 | ## Preview the docs locally 23 | 24 | ```bash 25 | cd build/html 26 | python -m http.server 27 | ``` 28 | You can then visit the page in your web browser at `http://localhost:8000` (the default port used by `python -m http.server`). 29 | -------------------------------------------------------------------------------- /docs/_static/my_theme.css: -------------------------------------------------------------------------------- 1 | .wy-nav-content { 2 | max-width: 1200px !important; 3 | } -------------------------------------------------------------------------------- /docs/_templates/versions.html: -------------------------------------------------------------------------------- 1 | 
2 | 3 | Version: Main 4 | 5 | 6 | 7 | Versions 8 | 9 | main 10 | release/v1.18 11 | 12 | 13 |
14 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | # Configuration file for the Sphinx documentation builder. 9 | # 10 | # This file only contains a selection of the most common options. For a full 11 | # list see the documentation: 12 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 13 | 14 | # -- Path setup -------------------------------------------------------------- 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | 23 | sys.path.insert(0, os.path.abspath("..")) 24 | 25 | 26 | # -- Project information ----------------------------------------------------- 27 | 28 | project = "efficient-transformers" 29 | copyright = "2024, Qualcomm" 30 | 31 | # The full version, including alpha/beta/rc tags 32 | release = "main" 33 | 34 | 35 | # -- General configuration --------------------------------------------------- 36 | 37 | # Add any Sphinx extension module names here, as strings. They can be 38 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 39 | # ones. 40 | extensions = ["myst_parser", "sphinx.ext.todo", "sphinx.ext.viewcode", "sphinx.ext.autodoc", "sphinx_multiversion"] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ["_templates"] 44 | 45 | # List of patterns, relative to source directory, that match files and 46 | # directories to ignore when looking for source files. 47 | # This pattern also affects html_static_path and html_extra_path. 48 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 49 | 50 | 51 | # -- Options for HTML output ------------------------------------------------- 52 | 53 | # The theme to use for HTML and HTML Help pages. See the documentation for 54 | # a list of builtin themes. 55 | # 56 | html_theme = "sphinx_rtd_theme" 57 | 58 | 59 | def setup(app): 60 | app.add_css_file("my_theme.css") 61 | 62 | 63 | # Add any paths that contain custom static files (such as style sheets) here, 64 | # relative to this directory. They are copied after the builtin static files, 65 | # so a file named "default.css" will overwrite the builtin "default.css". 
66 | html_static_path = ["_static"] 67 | source = [".md"] 68 | todo_include_todos = True 69 | 70 | suppress_warnings = [ 71 | "ref.rst_pilog", # Suppress warnings about excluded toctree entries 72 | ] 73 | -------------------------------------------------------------------------------- /docs/image/Cloud_AI_100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/efficient-transformers/299ef7938ab84dc90f4d0b5e1e273de40be878ab/docs/image/Cloud_AI_100.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | % QEfficient documentation master file, created by 2 | % sphinx-quickstart on Tue May 28 09:19:122024. 3 | % You can adapt this file completely to your liking, but it should at least 4 | % contain the root `toctree` directive. 5 | 6 | Welcome to Efficient-Transformers Documentation! 7 | ======================================== 8 | 9 | 10 | 13 | 14 | 15 | ```{toctree} 16 | :caption: 'Getting Started' 17 | :maxdepth: 4 18 | 19 | source/introduction 20 | source/validate 21 | ``` 22 | 23 | 24 | ```{toctree} 25 | :caption: 'Installation' 26 | :maxdepth: 2 27 | 28 | source/installation 29 | ``` 30 | 31 | ```{toctree} 32 | :caption: 'Upgrade Efficient-Transformers' 33 | :maxdepth: 2 34 | 35 | source/upgrade 36 | ``` 37 | 38 | ```{toctree} 39 | :caption: 'Inference on Cloud AI 100' 40 | :maxdepth: 4 41 | 42 | source/quick_start 43 | source/cli_api 44 | source/python_api 45 | ``` 46 | 47 | 48 | ```{toctree} 49 | :caption: 'QAIC Finetune' 50 | :maxdepth: 2 51 | 52 | source/finetune 53 | 54 | ``` 55 | 56 | ```{toctree} 57 | :caption: 'Blogs' 58 | :maxdepth: 2 59 | 60 | source/blogs 61 | 62 | ``` 63 | 64 | ```{toctree} 65 | :caption: 'Reference' 66 | :maxdepth: 2 67 | 68 | source/reference 69 | 70 | ``` 71 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx==7.1.2 2 | sphinx-multiversion==0.2.4 3 | sphinx-rtd-theme==2.0.0 4 | myst-parser==3.0.1 5 | -------------------------------------------------------------------------------- /docs/source/blogs.md: -------------------------------------------------------------------------------- 1 | # Train anywhere, Infer on Qualcomm Cloud AI 100 2 | [Click here](https://www.qualcomm.com/developer/blog/2024/01/train-anywhere-infer-qualcomm-cloud-ai-100) 3 | 4 | # How to Quadruple LLM Decoding Performance with Speculative Decoding (SpD) and Microscaling (MX) Formats on Qualcomm® Cloud AI 100 5 | [Click here](https://statics.teams.cdn.office.net/evergreen-assets/safelinks/1/atp-safelinks.html) 6 | 7 | # Power-efficient acceleration for large language models – Qualcomm Cloud AI SDK 8 | [Click here](https://www.qualcomm.com/developer/blog/2023/11/power-efficient-acceleration-large-language-models-qualcomm-cloud-ai-sdk) 9 | 10 | # Qualcomm Cloud AI 100 Accelerates Large Language Model Inference by ~2x Using Microscaling (Mx) Formats 11 | [click here](https://www.qualcomm.com/developer/blog/2024/01/qualcomm-cloud-ai-100-accelerates-large-language-model-inference-2x-using-microscaling-mx) 12 | 13 | # Qualcomm Cloud AI Introduces Efficient Transformers: One API, Infinite Possibilities 14 | [click here](https://www.qualcomm.com/developer/blog/2024/05/qualcomm-cloud-ai-introduces-efficient-transformers-one-api) 15 | 16 | 
-------------------------------------------------------------------------------- /docs/source/cli_api.md: -------------------------------------------------------------------------------- 1 | 2 | # Command Line Interface Use (CLI) 3 | 4 | ```{NOTE} 5 | Use a ``bash`` terminal; if using a ``zsh`` terminal, then ``device_group`` should be in single quotes, e.g. ``'--device_group [0]'`` 6 | ``` 7 | 8 | (infer_api)= 9 | ## `QEfficient.cloud.infer` 10 | ```{eval-rst} 11 | .. automodule:: QEfficient.cloud.infer.main 12 | ``` 13 | ## `QEfficient.cloud.execute` 14 | ```{eval-rst} 15 | .. automodule:: QEfficient.cloud.execute.main 16 | ``` 17 | ## `QEfficient.cloud.compile` 18 | ```{eval-rst} 19 | .. automodule:: QEfficient.compile.compile_helper.compile 20 | .. code-block:: bash 21 | 22 | python -m QEfficient.cloud.compile OPTIONS 23 | ``` 24 | ## `QEfficient.cloud.export` 25 | ```{eval-rst} 26 | .. automodule:: QEfficient.cloud.export.main 27 | 28 | ``` 29 | ## `QEfficient.cloud.finetune` 30 | ```{eval-rst} 31 | .. automodule:: QEfficient.cloud.finetune.main 32 | 33 | ``` -------------------------------------------------------------------------------- /docs/source/finetune.md: -------------------------------------------------------------------------------- 1 | # Finetune Infra 2 | 3 | This repository provides the infrastructure for finetuning models using different hardware accelerators such as QAIC. 4 | The same CLI can be used to run finetuning on GPU by setting the device flag (for finetuning on GPU, install the CUDA-specific torch build). 5 | 6 | ## Installation 7 | 8 | Same as QEfficient, along with QAIC PyTorch eager mode. 9 | 10 | For the QEfficient library: https://github.com/quic/efficient-transformers 11 | 12 | For torch_qaic, assuming QEfficient is already installed: 13 | ```bash 14 | pip install /opt/qti-aic/integrations/torch_qaic/py310/torch_qaic-0.1.0-cp310-cp310-linux_x86_64.whl 15 | ``` 16 | 17 | ## Finetuning 18 | 19 | Export the ENV variables to download and enable private datasets: 20 | ```bash 21 | export HF_DATASETS_TRUST_REMOTE_CODE=True 22 | ``` 23 | 24 | Export the ENV variables to get the device and HW traces and debugging logs: 25 | ```bash 26 | export QAIC_DEVICE_LOG_LEVEL=0 # For Device level logs 27 | export QAIC_DEBUG=1 # To understand the CPU fallback ops 28 | ``` 29 | 30 | ## Dataset Details 31 | 32 | To download the Alpaca dataset, visit this [link](https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/refs/heads/main/alpaca_data.json). Download the dataset and place it under the **dataset** directory. Make sure to update the training configuration accordingly. 33 | ```bash 34 | wget -c https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/refs/heads/main/alpaca_data.json -P dataset/ 35 | ``` 36 | 37 | To download the grammar dataset, visit this [link](https://github.com/meta-llama/llama-cookbook/blob/main/src/llama_cookbook/datasets/grammar_dataset/grammar_dataset_process.ipynb). Download the dataset and place it under the **datasets_grammar** directory. Make sure to update the training configuration accordingly, as sketched below. 
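For reference, a sketch of how the downloaded dataset can then be selected at launch time is shown below. The `--dataset` flag name and the `grammar_dataset` identifier are assumptions inferred from the modules under QEfficient/finetune/dataset/ and QEfficient/finetune/configs/; verify the exact names in QEfficient/finetune/configs/training.py before running.

```bash
# Hypothetical invocation sketch: selects the grammar dataset after it has been
# placed under datasets_grammar/. The --dataset flag and its value are assumptions;
# confirm them against QEfficient/finetune/configs/training.py.
python -m QEfficient.cloud.finetune --device qaic:0 --model_name "meta-llama/Llama-3.2-1B" --dataset grammar_dataset
```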
38 | 39 | 40 | ## Usage 41 | 42 | ### Single SOC finetuning on QAIC 43 | 44 | ```python 45 | python -m QEfficient.cloud.finetune --device qaic:0 --model_name "meta-llama/Llama-3.2-1B" 46 | ``` 47 | You can also configure various training parameters; for more details, check QEfficient/finetune/configs/training.py. Below is an example command line: 48 | ```python 49 | python -m QEfficient.cloud.finetune --device qaic:0 --use-peft --output_dir ./meta-sam --num_epochs 2 --context_length 256 50 | ``` 51 | 52 | ### Distributed training (DDP) on QAIC 53 | 54 | ```python 55 | QAIC_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc-per-node 4 -m QEfficient.cloud.finetune --device qaic --enable_ddp --dist_backend qccl --num_epochs 2 --model_name "meta-llama/Llama-3.2-1B" 56 | ``` 57 | **Note:** `nproc-per-node` is the number of workers (QAIC devices) running locally. 58 | 59 | ## Visualization 60 | 61 | TensorBoard logs are generated inside the runs/ directory with a date and time stamp. 62 | To visualize the data: 63 | 64 | ```python 65 | tensorboard --logdir runs/ --bind_all 66 | ``` -------------------------------------------------------------------------------- /docs/source/installation.md: -------------------------------------------------------------------------------- 1 | # Pre-requisites 2 | System Requirements: 3 | 1. [Supported Linux OS](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/#operating-systems) - Ubuntu, RHEL and AWS Linux 4 | 2. [Cloud AI 100 Platform SDK installed](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/Cloud-AI-SDK/Cloud-AI-SDK/#platform-sdk) 5 | 3. [SDK Pre-requisites](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/Pre-requisites/pre-requisites/) 6 | 4. [Multi-device support enabled for model sharding](https://github.com/quic/cloud-ai-sdk/tree/1.12/utils/multi-device) 7 | 8 | # Installation 9 | 10 | ### 1. Download Apps SDK 11 | * [Cloud AI 100 Apps SDK install](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/Cloud-AI-SDK/Cloud-AI-SDK/) 12 | 13 | ### 2. Install Efficient-Transformers 14 | Uninstall the existing Apps SDK: 15 | ``` 16 | sudo ./uninstall.sh 17 | ``` 18 | Run the install.sh script as root or with sudo to install with root permissions. 
19 | ``` 20 | sudo ./install.sh --enable-qeff 21 | source /opt/qti-aic/dev/python/qeff/bin/activate 22 | ``` 23 | On successful installation, the contents are stored to the /opt/qti-aic path under the dev and exec directories: 24 | ``` 25 | dev exec integrations scripts 26 | ``` 27 | Check the Apps SDK version with the following command 28 | ``` 29 | sudo /opt/qti-aic/tools/qaic-version-util --apps 30 | ``` 31 | Apply chmod commands 32 | ``` 33 | sudo chmod a+x /opt/qti-aic/dev/hexagon_tools/bin/* 34 | sudo chmod a+x /opt/qti-aic/exec/* 35 | ``` 36 | 37 | # Sanity Check 38 | 39 | After above installation methods, you can check if ``QEfficient`` is installed correctly by using 40 | ```bash 41 | python -c "import QEfficient; print(QEfficient.__version__)" 42 | ``` 43 | If the above line executes successfully, you are good to go ahead and start deploying models on ``Cloud AI 100`` cards using ``QEfficient`` library. 44 | -------------------------------------------------------------------------------- /docs/source/python_api.md: -------------------------------------------------------------------------------- 1 | # Python API 2 | 3 | **This page give you an overview about the all the APIs that you might need to integrate the `QEfficient` into your python applications.** 4 | 5 | ## High Level API 6 | 7 | ### `QEFFAutoModelForCausalLM` 8 | 9 | ```{eval-rst} 10 | .. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForCausalLM 11 | :member-order: bysource 12 | :members: 13 | ``` 14 | 15 | (QEFFAutoModel)= 16 | ### `QEFFAutoModel` 17 | 18 | ```{eval-rst} 19 | .. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModel 20 | :member-order: bysource 21 | :members: 22 | ``` 23 | 24 | (QEffAutoPeftModelForCausalLM)= 25 | ### `QEffAutoPeftModelForCausalLM` 26 | 27 | ```{eval-rst} 28 | .. autoclass:: QEfficient.peft.auto.QEffAutoPeftModelForCausalLM 29 | :member-order: bysource 30 | :members: 31 | ``` 32 | 33 | (QEffAutoLoraModelForCausalLM)= 34 | ### `QEffAutoLoraModelForCausalLM` 35 | 36 | ```{eval-rst} 37 | .. autoclass:: QEfficient.peft.lora.auto.QEffAutoLoraModelForCausalLM 38 | :member-order: bysource 39 | :members: 40 | ``` 41 | 42 | (QEFFAutoModelForImageTextToText)= 43 | ### `QEFFAutoModelForImageTextToText` 44 | 45 | ```{eval-rst} 46 | .. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForImageTextToText 47 | :member-order: bysource 48 | :members: 49 | ``` 50 | 51 | (QEFFAutoModelForSpeechSeq2Seq)= 52 | ### `QEFFAutoModelForSpeechSeq2Seq` 53 | 54 | ```{eval-rst} 55 | .. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForSpeechSeq2Seq 56 | :member-order: bysource 57 | :members: 58 | ``` 59 | 60 | ### `export` 61 | 62 | ```{eval-rst} 63 | .. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100 64 | :members: 65 | :show-inheritance: 66 | :exclude-members: convert_to_cloud_kvstyle, convert_to_cloud_bertstyle 67 | .. deprecated:: 68 | This function will be deprecated in version 1.19, please use QEFFAutoModelForCausalLM.export instead 69 | ``` 70 | 71 | ### `compile` 72 | 73 | ```{eval-rst} 74 | .. automodule:: QEfficient.compile.compile_helper 75 | :members: 76 | :show-inheritance: 77 | .. code-block:: python 78 | 79 | import QEfficient 80 | base_path, onnx_model_path = QEfficient.export(model_name="gpt2") 81 | qpc_path = QEfficient.compile(onnx_path=onnx_model_path, qpc_path=os.path.join(base_path, "qpc"), num_cores=14, device_group=[0]) 82 | .. 
deprecated:: 83 | This function will be deprecated in version 1.19, please use QEFFAutoModelForCausalLM.compile instead 84 | ``` 85 | 86 | ### `Execute` 87 | 88 | ```{eval-rst} 89 | .. automodule:: QEfficient.generation.text_generation_inference 90 | :members: 91 | :show-inheritance: 92 | :exclude-members: latency_stats_bertstyle,cloud_ai_100_exec_kv_helper 93 | ``` 94 | ## Low Level API 95 | 96 | ### `convert_to_cloud_kvstyle` 97 | 98 | ```{eval-rst} 99 | .. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100 100 | :members: 101 | :show-inheritance: 102 | :exclude-members: qualcomm_efficient_converter, convert_to_cloud_bertstyle 103 | ``` 104 | 105 | ### `convert_to_cloud_bertstyle` 106 | 107 | ```{eval-rst} 108 | .. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100 109 | :members: 110 | :show-inheritance: 111 | :exclude-members: qualcomm_efficient_converter, convert_to_cloud_kvstyle 112 | ``` 113 | 114 | ### `utils` 115 | 116 | ```{eval-rst} 117 | .. automodule:: QEfficient.utils.device_utils 118 | :members: 119 | :show-inheritance: 120 | ``` 121 | 122 | ```{eval-rst} 123 | .. automodule:: QEfficient.utils.generate_inputs 124 | :members: 125 | :undoc-members: 126 | :show-inheritance: 127 | ``` 128 | 129 | ```{eval-rst} 130 | .. automodule:: QEfficient.utils.run_utils 131 | :members: 132 | :undoc-members: 133 | :show-inheritance: 134 | ``` -------------------------------------------------------------------------------- /docs/source/reference.md: -------------------------------------------------------------------------------- 1 | **References** 2 | # [Qualcomm Cloud AI home](https://www.qualcomm.com/products/technology/processors/cloud-artificial-intelligence) 3 | # [Qualcomm Cloud AI SDK download](https://www.qualcomm.com/products/technology/processors/cloud-artificial-intelligence/cloud-ai-100#Software) 4 | # [Qualcomm Cloud AI API reference](https://quic.github.io/cloud-ai-sdk-pages/latest/API/) 5 | # [User Guide](https://quic.github.io/cloud-ai-sdk-pages/) 6 | # [OCP Microscaling Formats (MX) Specification](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) -------------------------------------------------------------------------------- /docs/source/upgrade.md: -------------------------------------------------------------------------------- 1 | 2 | ## Using GitHub Repository 3 | 4 | ``Warning: Efficient Transformers have been validated to work with the same compatible SDK. Upgrading this may result in certain models becoming incompatible.`` 5 | 6 | ```bash 7 | # Create Python virtual env and activate it. (Required Python 3.10) 8 | 9 | python3.10 -m venv qeff_env 10 | source qeff_env/bin/activate 11 | pip install -U pip 12 | 13 | # Clone and Install the QEfficient Repo. 14 | pip install git+https://github.com/quic/efficient-transformers 15 | 16 | ``` -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /examples/basic_gguf_models.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | # This is the work example of the GGUF models with the AI 100 9 | 10 | from transformers import AutoTokenizer 11 | 12 | from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM 13 | 14 | # Load the model and tokenizer 15 | model_name = "MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF" 16 | gguf_file = "Mistral-7B-Instruct-v0.3.fp16.gguf" 17 | # org_model_name = "mistralai/Mistral-7B-Instruct-v0.3" 18 | 19 | tokenizer = AutoTokenizer.from_pretrained(model_name, gguf_file=gguf_file) 20 | model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=gguf_file) 21 | 22 | generated_qpc_path = model.compile(prefill_seq_len=32, ctx_len=128, num_cores=16, num_devices=1) 23 | model.generate(prompts=["How are you?"], tokenizer=tokenizer) 24 | -------------------------------------------------------------------------------- /examples/cpp_execution/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | project(InferenceSetIOBuffer) 9 | cmake_minimum_required (VERSION 3.15) 10 | 11 | # Find the pybind11 CMake directory using a Python script 12 | execute_process( 13 | COMMAND python -c "import pybind11; print(pybind11.get_cmake_dir())" 14 | OUTPUT_VARIABLE pybind11_DIR 15 | OUTPUT_STRIP_TRAILING_WHITESPACE 16 | ) 17 | 18 | # Set the CMAKE_PREFIX_PATH to include pybind11 19 | set(CMAKE_PREFIX_PATH ${pybind11_DIR} ${CMAKE_PREFIX_PATH}) 20 | set(CMAKE_CXX_STANDARD 17) 21 | find_package(pybind11 REQUIRED) 22 | 23 | pybind11_add_module(InferenceSetIOBuffer MODULE InferenceSetIOBuffer.cpp) 24 | 25 | 26 | include_directories("/opt/qti-aic/dev/inc") 27 | include_directories("examples/cpp_execution") 28 | 29 | target_link_libraries(InferenceSetIOBuffer PRIVATE ${PYTHON_LIBRARIES} pybind11::module pthread dl) 30 | 31 | target_include_directories(InferenceSetIOBuffer PRIVATE ${PYTHON_INCLUDE_DIRS} ${pybind11_INCLUDE_DIRS}) 32 | 33 | set_target_properties( 34 | InferenceSetIOBuffer 35 | PROPERTIES 36 | LINK_FLAGS "-Wl,--no-as-needed" 37 | ) 38 | 39 | set(CMAKE_BUILD_TYPE Debug) 40 | 41 | target_compile_options(InferenceSetIOBuffer PRIVATE 42 | -fstack-protector-all 43 | -fstack-protector-all 44 | -Werror 45 | -Wall 46 | -Wextra 47 | -Wunused-variable 48 | -Wunused-parameter 49 | -Wnon-virtual-dtor 50 | -Wno-missing-field-initializers) 51 | 52 | #Print paths for debugging 53 | message(STATUS "Python Include Dirs: ${PYTHON_INCLUDE_DIRS}") 54 | message(STATUS "Python Include Dirs: ${PYTHON_LIBRARIES}") 55 | message(STATUS "pybind11_DIR: ${pybind11_DIR}") 56 | 
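As a quick sanity check that the CMake build above produced a loadable extension, a minimal sketch is shown below. It assumes the out-of-source `build/` directory created by the steps in the C++ execution README that follows, and it deliberately inspects the module with `dir()` instead of assuming any particular function names exported by InferenceSetIOBuffer.cpp.

```python
# Minimal sketch, assuming the examples/cpp_execution/build/ layout from the README below.
import sys

# Make the compiled pybind11 extension importable; adjust the path if you built elsewhere.
sys.path.insert(0, "examples/cpp_execution/build")

import InferenceSetIOBuffer  # module name comes from pybind11_add_module() in CMakeLists.txt

# List whatever functions the extension actually exposes rather than guessing their names.
print([name for name in dir(InferenceSetIOBuffer) if not name.startswith("_")])
```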
-------------------------------------------------------------------------------- /examples/cpp_execution/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Text Generation using CPP Inference 3 | 4 | ## Overview 5 | This example demonstrates how to execute a model on AI 100 using Efficient Transformers and C++ APIs. The Efficient Transformers library is utilized for transforming, exporting and compiling the model, while the QPC is executed using C++ APIs. It is tested on both x86 and ARM platform. 6 | 7 | > **_NOTE:_** This supports BS>1 and Chunking. 8 | 9 | ## Prerequisite 10 | 1. `pip install pybind11` 11 | 2. Cpp17 or above (Tested on C++17 and g++ version - 11.4.0) 12 | 3. QEfficient [Quick Installation Guide]( https://github.com/quic/efficient-transformers?tab=readme-ov-file#quick-installation) 13 | 14 | ## Setup and Execution 15 | ```bash 16 | 17 | # Compile the cpp file using the following commands 18 | mkdir build 19 | cd build 20 | 21 | cmake .. 22 | make -j 8 23 | 24 | cd ../../../ # Need to be in base folder - efficient-transformers to run below cmd 25 | 26 | # Run the python script to get the generated text 27 | python examples/cpp_execution/text_inference_using_cpp.py --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 14 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first 28 | 29 | ``` 30 | 31 | ## Future Enhancements 32 | 1. DMA Buffer Handling 33 | 2. Continuous Batching 34 | 3. Handling streamer 35 | -------------------------------------------------------------------------------- /examples/embedding_model.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | # This is the work example of the Embedding model with the AI 100 9 | # For more information, visit: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 10 | 11 | import torch 12 | import torch.nn.functional as F 13 | from transformers import AutoTokenizer 14 | 15 | from QEfficient import QEFFAutoModel as AutoModel 16 | 17 | 18 | def mean_pooling(model_output, attention_mask): 19 | token_embeddings = model_output # First element of model_output contains all token embeddings 20 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 21 | return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) 22 | 23 | 24 | # Sentences we want sentence embeddings for 25 | sentences = "This is an example sentence" 26 | 27 | # Load model from HuggingFace Hub 28 | tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") 29 | 30 | 31 | qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") 32 | qeff_model.compile(num_cores=14) 33 | 34 | # Tokenize sentences 35 | encoded_input = tokenizer(sentences, return_tensors="pt") 36 | qeff_output = torch.tensor(qeff_model.generate(encoded_input)) 37 | 38 | # Perform pooling 39 | sentence_embeddings = mean_pooling(qeff_output, encoded_input["attention_mask"]) 40 | 41 | # Normalize embeddings 42 | sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1) 43 | 44 | print("Sentence embeddings:") 45 | print(sentence_embeddings) 46 | -------------------------------------------------------------------------------- /examples/granite_example/granite_vision_inference.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import requests 9 | from PIL import Image 10 | from transformers import AutoProcessor, TextStreamer 11 | 12 | from QEfficient import QEFFAutoModelForImageTextToText 13 | 14 | # Add HuggingFace Token to access the model 15 | HF_TOKEN = "" 16 | 17 | 18 | def run_model( 19 | model_name, 20 | token, 21 | query, 22 | image_url, 23 | kv_offload=False, 24 | prefill_seq_len=5500, 25 | ctx_len=6000, 26 | generation_len=128, 27 | img_size=384, 28 | num_cores=16, 29 | num_devices=1, 30 | ): 31 | ## STEP - 1 Load the Processor and Model 32 | 33 | processor = AutoProcessor.from_pretrained(model_name, token=token) 34 | 35 | # `kv_offload` is used to compile the model in a 2 QPCs.Currently we are not supporting 1 qpc so the flag false is not allowed. 36 | # The `kv_offload` flag should always be set to True. 37 | # The Dual QPC approach splits the model to perform Image Encoding and Output generation in 2 different QPCs. 38 | # The outputs of the Vision Encoder are then passed to the Language model via host in this case. 
39 | 40 | model = QEFFAutoModelForImageTextToText.from_pretrained(model_name, token=token, kv_offload=kv_offload) 41 | 42 | ## STEP - 2 Export & Compile the Model 43 | 44 | model.compile( 45 | prefill_seq_len=prefill_seq_len, 46 | ctx_len=ctx_len, 47 | img_size=img_size, 48 | num_cores=num_cores, 49 | num_devices=num_devices, 50 | mxfp6_matmul=False, 51 | ) 52 | 53 | ## STEP - 3 Load and process the inputs for Inference 54 | 55 | # We resize the image to (w x h) = (1610 x 1109) so that any image works with the model irrespective of its original dimensions 56 | # the model expects a fixed size of height 1109 and width 1610 57 | 58 | image = Image.open(requests.get(image_url, stream=True).raw) 59 | image = image.resize((1610, 1109)) 60 | 61 | messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": query}]}] 62 | input_text = processor.apply_chat_template(messages, add_generation_prompt=True) 63 | inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt") 64 | 65 | ## STEP - 4 Run Inference on the compiled model 66 | 67 | streamer = TextStreamer(processor.tokenizer) 68 | output = model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len) 69 | print(output) 70 | 71 | 72 | if __name__ == "__main__": 73 | # Model name and Input parameters 74 | model_name = "ibm-granite/granite-vision-3.2-2b" 75 | 76 | # Please add prompt here 77 | query = "Describe the image" 78 | 79 | # Please pass an image URL or image path. The format of the image should be jpg. 80 | image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" 81 | 82 | # Compilation parameters for the model 83 | kv_offload = True 84 | prefill_seq_len = 5500 85 | ctx_len = 6000 86 | generation_len = 128 87 | img_size = 384 88 | num_cores = 16 89 | num_devices = 4 90 | 91 | run_model( 92 | model_name=model_name, 93 | token=HF_TOKEN, 94 | query=query, 95 | kv_offload=kv_offload, 96 | image_url=image_url, 97 | prefill_seq_len=prefill_seq_len, 98 | ctx_len=ctx_len, 99 | generation_len=generation_len, 100 | img_size=img_size, 101 | num_cores=num_cores, 102 | num_devices=num_devices, 103 | ) 104 | 105 | 106 | """ 107 | Expected Response: 108 | 109 | The image depicts two cats lying on a pink blanket that is spread out on a red couch. The cats are positioned in a relaxed manner, with their bodies stretched out and their heads resting on the blanket. 110 | The cat on the left is a smaller, tabby cat with a mix of black, gray, and white fur. It has a long, slender body and a distinctive tail that is curled up near its tail end. The cat on the right is a larger, 111 | tabby cat with a mix of gray, black, and brown fur. It has 112 | 113 | """ 114 | -------------------------------------------------------------------------------- /examples/granite_example/readme.md: -------------------------------------------------------------------------------- 1 | # Granite Vision Inference 2 | This directory contains an example script showing how to run inference on Granite-vision-3.2-2b via the QEFFAutoModelForImageTextToText class. 3 | 4 | Currently, only the dual-QPC approach is supported for this model. Continuous batching (CB) is not supported. 5 | 6 | The model expects the following inputs to be fixed: 7 | 8 | 1. Image height = 1109 9 | 2. Image width = 1610 10 | 3. Num patches = 10 11 | 12 | Please reshape any given image to (w x h) = (1610 x 1109) and then pass it to the processor, as sketched below. The processor accepts a path or a URL. Please pass jpg images.
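A minimal sketch of this preprocessing step, assuming the sample COCO image listed below (it mirrors the resize done in `granite_vision_inference.py`):

```python
# Minimal sketch of the expected preprocessing; mirrors granite_vision_inference.py.
import requests
from PIL import Image

# Sample jpg image (also listed below); any jpg path or URL works.
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
image = image.resize((1610, 1109))  # fixed (width x height) expected by the model
```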
13 | 14 | Image used: 15 | 16 | http://images.cocodataset.org/val2017/000000039769.jpg 17 | 18 | 19 | To run example script after package installations: 20 | ```sh 21 | python granite_vision_inference.py 22 | ``` 23 | 24 | Expected output for given sample inputs in the script: 25 | ```sh 26 | The image depicts two cats lying on a pink blanket that is spread out on a red couch. The cats are positioned in a relaxed manner, with their bodies stretched out and their heads resting on the blanket. 27 | The cat on the left is a smaller, tabby cat with a mix of black, gray, and white fur. It has a long, slender body and a distinctive tail that is curled up near its tail end. The cat on the right is a larger, 28 | tabby cat with a mix of gray, black, and brown fur. It has 29 | ``` -------------------------------------------------------------------------------- /examples/image_text_to_text_inference.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import requests 9 | from PIL import Image 10 | from transformers import AutoProcessor, TextStreamer 11 | 12 | from QEfficient import QEFFAutoModelForImageTextToText 13 | 14 | # Add HuggingFace Token to access the model 15 | HF_TOKEN = "" 16 | 17 | 18 | def run_model( 19 | model_name, 20 | token, 21 | query, 22 | image_url, 23 | kv_offload=False, 24 | prefill_seq_len=32, 25 | ctx_len=512, 26 | generation_len=128, 27 | img_size=560, 28 | num_cores=16, 29 | num_devices=1, 30 | ): 31 | ## STEP - 1 Load the Processor and Model 32 | 33 | processor = AutoProcessor.from_pretrained(model_name, token=token) 34 | 35 | # `kv_offload` is used to compile the model in a Single QPC or 2 QPCs. 36 | # The Dual QPC approach splits the model to perform Image Encoding and Output generation in 2 different QPCs. 37 | # The outputs of the Vision Encoder are then passed to the Language model via host in this case. 38 | 39 | model = QEFFAutoModelForImageTextToText.from_pretrained( 40 | model_name, token=token, attn_implementation="eager", kv_offload=kv_offload 41 | ) 42 | 43 | ## STEP - 2 Export & Compile the Model 44 | 45 | model.compile( 46 | prefill_seq_len=prefill_seq_len, 47 | ctx_len=ctx_len, 48 | img_size=img_size, 49 | num_cores=num_cores, 50 | num_devices=num_devices, 51 | mxfp6_matmul=False, 52 | ) 53 | 54 | ## STEP - 3 Load and process the inputs for Inference 55 | 56 | image = Image.open(requests.get(image_url, stream=True).raw) 57 | messages = [ 58 | { 59 | "role": "user", 60 | "content": [ 61 | {"type": "image"}, 62 | {"type": "text", "text": query}, 63 | ], 64 | } 65 | ] 66 | input_text = [processor.apply_chat_template(messages, add_generation_prompt=True)] 67 | 68 | inputs = processor( 69 | text=input_text, 70 | images=image, 71 | return_tensors="pt", 72 | add_special_tokens=False, 73 | padding="max_length", 74 | max_length=prefill_seq_len, 75 | ) 76 | 77 | ## STEP - 4 Run Inference on the compiled model 78 | 79 | streamer = TextStreamer(processor.tokenizer) 80 | model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len) 81 | 82 | 83 | if __name__ == "__main__": 84 | # Model name and Input parameters 85 | model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" 86 | query = "Describe this image." 
87 | image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" 88 | 89 | # Compilation parameters for the model 90 | kv_offload = False 91 | prefill_seq_len = 32 92 | ctx_len = 512 93 | generation_len = 128 94 | img_size = 560 95 | num_cores = 16 96 | num_devices = 1 97 | 98 | run_model( 99 | model_name=model_name, 100 | token=HF_TOKEN, 101 | query=query, 102 | kv_offload=kv_offload, 103 | image_url=image_url, 104 | prefill_seq_len=prefill_seq_len, 105 | ctx_len=ctx_len, 106 | generation_len=generation_len, 107 | img_size=img_size, 108 | num_cores=num_cores, 109 | num_devices=num_devices, 110 | ) 111 | 112 | 113 | """ 114 | Expected Response: 115 | 116 | This image depicts a charming anthropomorphic rabbit standing on a dirt path in front of a picturesque stone cottage, surrounded by a serene landscape. 117 | 118 | The rabbit, with its light brown fur and distinctive long ears, is attired in a stylish blue coat, brown vest, and tan pants, exuding a sense of sophistication. The dirt path, flanked by vibrant flowers and lush greenery, leads to the cottage, which features a thatched roof and a chimney, adding to the rustic charm of the scene. In the background, rolling hills and trees create a breathtaking panorama, while the sky above is a brilliant blue with white clouds, completing the 119 | 120 | """ 121 | -------------------------------------------------------------------------------- /examples/intern_example/readme.md: -------------------------------------------------------------------------------- 1 | # InternVL Inference 2 | This directory contains an example script of how to run inference on InternVL-1B model via QEFFAutoModelForCausalLM class. 3 | 4 | ## Required packages: 5 | - `torch==2.4.1+cpu` 6 | - `torchvision==0.19.1+cpu` 7 | - `timm==1.0.14` 8 | - `einops==0.8.1` 9 | 10 | You can install them using pip: 11 | ```sh 12 | pip install torch==2.4.1+cpu --extra-index-url https://download.pytorch.org/whl/cpu timm==1.0.14 torchvision==0.19.1+cpu einops==0.8.1 13 | ``` 14 | 15 | To run example script after package installations: 16 | ```sh 17 | python internvl_inference.py 18 | ``` 19 | 20 | Expected output for given sample inputs in the script: 21 | ```sh 22 | The image is a promotional graphic for Microsoft Azure. It features a blue background with a hexagonal pattern on the left side. The hexagons are white and are arranged in a way that suggests a network or connectivity theme. 23 | 24 | On the right side of the image, the Microsoft Azure logo is prominently displayed. The logo consists of the Azure name in white, with the Microsoft logo above it, which includes four colored squares (blue, green, yellow, and red). Below the logo, the word "Azure" is written in large white letters. 25 | 26 | Below the logo, there is text that reads: 27 | - "By Dinesh Kumar Wick 28 | ``` -------------------------------------------------------------------------------- /examples/peft_models.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from transformers import AutoTokenizer, TextStreamer 9 | 10 | from QEfficient import QEffAutoPeftModelForCausalLM 11 | 12 | base_model_name = "mistralai/Mistral-7B-v0.1" 13 | tokenizer = AutoTokenizer.from_pretrained(base_model_name) 14 | streamer = TextStreamer(tokenizer) 15 | 16 | m = QEffAutoPeftModelForCausalLM.from_pretrained("predibase/magicoder", "magicoder") 17 | m.export() 18 | m.compile(prefill_seq_len=32, ctx_len=1024) 19 | 20 | # Magicoder adapter 21 | m.set_adapter("magicoder") 22 | inputs = tokenizer("def fibonacci", return_tensors="pt") 23 | m.generate(**inputs, streamer=streamer, max_new_tokens=1024) 24 | 25 | # TLDR, summary generator 26 | m.load_adapter("predibase/tldr_headline_gen", "tldr_headline_gen") 27 | m.set_adapter("tldr_headline_gen") 28 | inputs = tokenizer( 29 | """Summarize this passage in one sentence or less: Jeffrey Berns, CEO of Blockchains LLC, wants the Nevada government to allow companies like \ 30 | his to form local governments on land they own, granting them power over everything from \ 31 | schools to law enforcement. Berns envisions a city based on digital currencies and \ 32 | blockchain storage. His company is proposing to build a 15,000 home town 12 miles east of \ 33 | Reno. Nevada Lawmakers have responded with intrigue and skepticism. The proposed \ 34 | legislation has yet to be formally filed or discussed in public hearings. 35 | 36 | Summary: """, 37 | return_tensors="pt", 38 | ) 39 | m.generate(**inputs, streamer=streamer, max_new_tokens=1024) 40 | 41 | # Math problems 42 | m.load_adapter("predibase/gsm8k", "gsm8k") 43 | m.set_adapter("gsm8k") 44 | inputs = tokenizer( 45 | "James decides to run 3 sprints 3 times a week. He runs 60 meters each sprint. \ 46 | How many total meters does he run a week?", 47 | return_tensors="pt", 48 | ) 49 | m.generate(**inputs, streamer=streamer, max_new_tokens=1024) 50 | 51 | # News explanation 52 | m.load_adapter("predibase/agnews_explained", "agnews_explained") 53 | m.set_adapter("agnews_explained") 54 | inputs = tokenizer( 55 | """Below is a news article. Please classify it under one of the following \ 56 | classes (World, Business, Sports, Sci/Tech) and provide a reasonable coherent explanation for \ 57 | why the article is classified as such. Please format your response as a JSON payload. 58 | 59 | ### Article: US poverty rate climbs, along with number lacking health coverage (AFP) AFP - The \ 60 | number of Americans living in poverty or without health insurance grew last year, a government \ 61 | survey showed, adding potential dynamite in the battle for the White House. 62 | 63 | ### JSON Response 64 | 65 | """, 66 | return_tensors="pt", 67 | ) 68 | m.generate(**inputs, streamer=streamer, max_new_tokens=1024) 69 | -------------------------------------------------------------------------------- /examples/prompts.txt: -------------------------------------------------------------------------------- 1 | My name is 2 | The sun rises from 3 | The flat earth theory is the belief that -------------------------------------------------------------------------------- /examples/speech_to_text/README.md: -------------------------------------------------------------------------------- 1 | # Speech Seq2Seq 2 | This directory contains an example script of how to use the AutoModelForSpeechSeq2Seq class. 
(for now, only Whisper models on audio shorter than 30 seconds have been validated) 3 | 4 | ## Required packages: 5 | - `librosa==0.10.2` 6 | - `soundfile==0.13.1` 7 | 8 | You can install them using pip: 9 | ```sh 10 | pip install librosa==0.10.2 soundfile==0.13.1 11 | ``` 12 | 13 | To run example script after package installations: 14 | ```sh 15 | python speech_seq2seq_models.py 16 | ``` 17 | 18 | Expected output for given data sample: 19 | ```sh 20 | <|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.<|endoftext|> 21 | ``` -------------------------------------------------------------------------------- /examples/speech_to_text/run_whisper_speech_to_text.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from datasets import load_dataset 9 | from transformers import AutoProcessor 10 | 11 | from QEfficient import QEFFAutoModelForSpeechSeq2Seq 12 | 13 | base_model_name = "openai/whisper-tiny" 14 | ctx_len = 25 15 | 16 | ## STEP 1 -- load audio sample, using a standard English dataset, can load specific files if longer audio needs to be tested; also load initial processor 17 | ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") 18 | data = ds[0]["audio"]["array"] 19 | # reshape so the shape corresponds to data with batch size 1 20 | data = data.reshape(-1) 21 | sample_rate = ds[0]["audio"]["sampling_rate"] 22 | processor = AutoProcessor.from_pretrained(base_model_name) 23 | 24 | ## STEP 2 -- init base model 25 | qeff_model = QEFFAutoModelForSpeechSeq2Seq.from_pretrained(base_model_name) 26 | 27 | ## STEP 3 -- export and compile model 28 | qeff_model.compile() 29 | 30 | ## STEP 4 -- generate output for loaded input and processor 31 | exec_info = qeff_model.generate( 32 | inputs=processor(data, sampling_rate=sample_rate, return_tensors="pt"), generation_len=ctx_len 33 | ) 34 | 35 | ## STEP 5 (optional) -- use processor to decode output 36 | print(processor.batch_decode(exec_info.generated_ids)[0]) 37 | -------------------------------------------------------------------------------- /notebooks/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "QEfficient" 3 | dynamic = ["version"] 4 | description = """ 5 | QEfficient is the library interface for the Hugging Face Transformer \ 6 | models for efficient inference on Qualcomm Cloud AI 100""" 7 | readme = "README.md" 8 | license = { file = "LICENSE" } 9 | authors = [{ name = "Qualcomm Cloud AI ML Team" }] 10 | keywords = ["transformers", "Cloud AI 100", "Inference"] 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "Development Status :: 5 - Development/Unstable", 14 | "Intended Audience :: Developers", 15 | "Intended Audience :: Education", 16 | "Operating System :: Linux", 17 | "Programming Language :: Python :: 3.10", 18 | "Topic :: Scientific/Engineering :: Artificial Intelligence for Inference Accelerator", 19 | ] 20 | requires-python = ">=3.8,<3.11" 21 | dependencies = [ 22 | "transformers==4.50.0", 23 | "huggingface-hub==0.27.0", 24 | "hf_transfer==0.1.9", 25 | "peft==0.13.2", 26 | "datasets==2.20.0", 27 | "fsspec==2023.6.0", 28 | "multidict==6.0.4", 29 | "urllib3<2", 30 | "sentencepiece==0.2.0", 31 | "onnx==1.16.0", 32 | "onnxruntime==1.16.3", 33 | "numpy==1.26.4", 34 | "protobuf==3.20.2", 35 | "onnxscript==0.1.0.dev20240327", 36 | "pillow===10.4.0", 37 | "sympy", 38 | "tensorboard", 39 | "fire", 40 | "py7zr", 41 | "torchmetrics==1.7.0", 42 | "torch==2.4.1; platform_machine=='aarch64'", 43 | # Specifying torch cpu package URL per python version, update the list once pytorch releases whl for python>3.11 44 | "torch@https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp38-cp38-linux_x86_64.whl ; python_version=='3.8' and platform_machine=='x86_64'", 45 | "torch@https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp39-cp39-linux_x86_64.whl ; python_version=='3.9' and platform_machine=='x86_64'", 46 | "torch@https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp310-cp310-linux_x86_64.whl ; python_version=='3.10' and platform_machine=='x86_64'", 47 | ] 48 | 49 | [project.optional-dependencies] 50 | test = ["pytest","pytest-mock"] 51 | docs = ["Sphinx==7.1.2","sphinx-rtd-theme==2.0.0","myst-parser==3.0.1","sphinx-multiversion"] 52 | quality = ["black", "ruff", "hf_doc_builder@git+https://github.com/huggingface/doc-builder.git"] 53 | 54 | [build-system] 55 | requires = ["setuptools>=62.0.0"] 56 | build-backend = "setuptools.build_meta" 57 | 58 | [tool.setuptools.packages.find] 59 | include = ["QEfficient*"] 60 | namespaces = false 61 | 62 | [tool.setuptools.dynamic.version] 63 | attr = "QEfficient.__version__" 64 | 65 | [tool.ruff] 66 | line-length = 120 67 | # Enable the isort rules. 68 | lint.extend-select = ["I"] 69 | 70 | [tool.pytest.ini_options] 71 | addopts = "-W ignore -s -v" 72 | junit_logging = "all" 73 | doctest_optionflags = "NUMBER NORMALIZE_WHITESPACE ELLIPSIS" 74 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /scripts/finetune/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /scripts/finetune/run_ft_model.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import os 9 | import warnings 10 | 11 | import torch 12 | from peft import AutoPeftModelForCausalLM 13 | from transformers import AutoModelForCausalLM, AutoTokenizer 14 | 15 | from QEfficient.finetune.configs.training import TrainConfig 16 | 17 | # Suppress all warnings 18 | warnings.filterwarnings("ignore") 19 | 20 | try: 21 | import torch_qaic # noqa: F401 22 | 23 | device = "qaic:0" 24 | except ImportError as e: 25 | print(f"Warning: {e}. Moving ahead without these qaic modules.") 26 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 27 | 28 | train_config = TrainConfig() 29 | model = AutoModelForCausalLM.from_pretrained( 30 | train_config.model_name, 31 | use_cache=False, 32 | attn_implementation="sdpa", 33 | torch_dtype=torch.float16 if torch.cuda.is_available() or device == "qaic:0" else None, 34 | ) 35 | 36 | # Load the tokenizer and add special tokens 37 | tokenizer = AutoTokenizer.from_pretrained( 38 | train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name 39 | ) 40 | if not tokenizer.pad_token_id: 41 | tokenizer.pad_token_id = tokenizer.eos_token_id 42 | 43 | eval_prompt = """ 44 | Summarize this dialog: 45 | A: Hi Tom, are you busy tomorrow’s afternoon? 46 | B: I’m pretty sure I am. What’s up? 47 | A: Can you go with me to the animal shelter?. 48 | B: What do you want to do? 49 | A: I want to get a puppy for my son. 50 | B: That will make him so happy. 
51 | --- 52 | Summary: 53 | """ 54 | 55 | model_input = tokenizer(eval_prompt, return_tensors="pt") 56 | 57 | model.to(device) 58 | model_input.to(device) 59 | model.eval() 60 | 61 | with torch.inference_mode(): 62 | print( 63 | tokenizer.decode( 64 | model.generate(**model_input, max_new_tokens=50, do_sample=False)[0], 65 | skip_special_tokens=True, 66 | ) 67 | ) 68 | 69 | trained_weights_path = os.path.join(train_config.output_dir, "trained_weights") 70 | list_paths = [d for d in os.listdir(trained_weights_path) if os.path.isdir(os.path.join(trained_weights_path, d))] 71 | max_index = max([int(path[5:]) for path in list_paths]) 72 | 73 | save_dir = os.path.join(trained_weights_path, "step_" + str(max_index)) 74 | 75 | # Load PEFT model on CPU 76 | model = AutoPeftModelForCausalLM.from_pretrained(save_dir) 77 | # Merge LoRA and base model and save 78 | merged_model = model.merge_and_unload() 79 | merged_model.save_pretrained(train_config.output_dir, safe_serialization=True) 80 | model_id = train_config.output_dir 81 | 82 | # Load Model with PEFT adapter 83 | model_peft = AutoModelForCausalLM.from_pretrained(model_id, use_cache=False, attn_implementation="sdpa") 84 | 85 | model_peft.to(device) 86 | model_peft.eval() 87 | with torch.inference_mode(): 88 | print( 89 | tokenizer.decode( 90 | model_peft.generate(**model_input, max_new_tokens=50, do_sample=False)[0], 91 | skip_special_tokens=True, 92 | ) 93 | ) 94 | -------------------------------------------------------------------------------- /scripts/perplexity_computation/README.md: -------------------------------------------------------------------------------- 1 | # Perplexity Calculator 2 | 3 | This script calculates the perplexity for ONNX, QPC, or Torch models using the WikiText-2 dataset. It supports different model types and configurations. 4 | 5 | ## Table of Contents 6 | 7 | - Requirements 8 | - Installation 9 | - Usage 10 | - Example 11 | - Arguments 12 | - Output Details 13 | 14 | ## Requirements 15 | 16 | - Python 3.8+ 17 | - Required Python packages: 18 | - `QEfficient` 19 | - `datasets==2.20` 20 | 21 | ## Installation 22 | 23 | - Install QEfficient and update the datasets package to 2.20 24 | 25 | ## Usage 26 | 27 | To run the script, use the following command: 28 | 29 | ```bash 30 | python calculate_perplexity.py --model_type --model_name [--model_path ] [--dataset_name ] [--ctx_len ] [--prompt_len ] [--batch_size ] [--stride ] [--num_samples ] [--qpc_device_id ] [--log_file ] 31 | 32 | python perplexity_calculator_cloud.py --model_type torch --model_name meta-llama/Meta-Llama-3-8B-Instruct --num_samples 1 33 | ``` 34 | 35 | ## Arguments (Help Section) 36 | ```bash 37 | --model_path: Path to ONNX or QPC model (optional for Torch Original models). 38 | --model_type: Type of model (onnx, qpc, or torch) (required). 39 | --model_name: Name of the HuggingFace Model Card Name/tokenizer (required). 40 | --dataset_name: Name of the dataset (default: wikitext-2-raw-v1). 41 | --ctx_len: Context length (default: 2048). 42 | --prompt_len: Prompt length (default: 1). 43 | --batch_size: Batch size (default: 1). 44 | --stride: Stride for dataset (default: 1024). 45 | --num_samples: Number of samples to use (-1 for all) (default: -1). 46 | --qpc_device_id: QAIC device ids (comma-separated) (default: [0]). 47 | --log_file: Log file name (default: perplexity_results.log). 48 | ``` 49 | 50 | ## Output Details 51 | The script logs the following information: 52 | 53 | - Perplexity and loss for the specified model. 
(For the original Torch model, it also dumps the target values for FP16 and MXFP6 precision.) 54 | - Total time taken for evaluation. 55 | - Detailed configuration and results in the specified log file. 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /scripts/perplexity_computation/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /scripts/replicate_kv_head/README.md: -------------------------------------------------------------------------------- 1 | # KV-Head Replication 2 | 3 | This example contains a sample script for replicating key-value (KV) heads for the Llama-3-8B-Instruct model. The script performs the following steps: 4 | 1. Runs inference with the original model. 5 | 2. Replicates the KV heads. 6 | 3. Runs inference on the modified model to validate the changes. 7 | 4. Exports the modified model to ONNX format. 8 | 9 | 10 | ## Installation 11 | 12 | Install efficient-transformers and the required libraries using https://github.com/quic/efficient-transformers#quick-installation 13 | 14 | 15 | ## Usage 16 | You can run the script with different parameters using the command line. Below is an example of how to use the script: 17 | 18 | 1. **(Optional)** If you are using a gated repository, export the `HF_TOKEN`: 19 | ```sh 20 | export HF_TOKEN=<hf_token> 21 | ``` 22 | 23 | 2. **Run the script** with the desired parameters: 24 | ```sh 25 | python script.py --model_name "meta-llama/Meta-Llama-3-8B-Instruct" --prompt "Hello, world!" --repeat 3 26 | ``` 27 | 28 | Replace `<hf_token>` with your actual token. 29 | 30 | ### Arguments 31 | - **--model_name**: Model card name to use (default: "meta-llama/Meta-Llama-3-8B-Instruct"). 32 | - **--prompt**: Prompt to use for the model (default: "My name is"). 33 | - **--repeat**: Factor to repeat key-value heads (default: 2). 34 | - **--num_attention_heads**: Number of attention heads (default: None). This is an optional parameter; if not given explicitly, it will be read from config.json. 35 | - **--hidden_size**: Hidden size (default: None). This is an optional parameter; if not given explicitly, it will be read from config.json. -------------------------------------------------------------------------------- /scripts/replicate_kv_head/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /scripts/specializations.json: -------------------------------------------------------------------------------- 1 | { 2 | "specializations": [ 3 | { 4 | "full_batch_size": "4", 5 | "batch_size": "1", 6 | "seq_len": "8", 7 | "ctx_len": "32" 8 | }, 9 | { 10 | "full_batch_size": "4", 11 | "batch_size": "1", 12 | "seq_len": "1", 13 | "ctx_len": "32" 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Tests 2 | This directory contains the tests for the project. Below is the list of test functions and required pytest plugins. 3 | 4 | ## Test Functions 5 | ### cloud/test_infer.py 6 | - test_infer function 7 | 8 | ### cloud/test_export.py 9 | - test_export function 10 | 11 | ### cloud/test_compile.py 12 | - test_compile function 13 | 14 | ### cloud/test_execute.py 15 | - test_execute function 16 | 17 | ## Required Plugins 18 | - `pytest` 19 | - `pytest-mock` 20 | 21 | You can install them using pip: 22 | ```sh 23 | pip install pytest pytest-mock 24 | ``` 25 | Alternatively, if you have specified these dependencies in your `pyproject.toml`, you can install them using the test extra: 26 | ```sh 27 | pip install .[test] 28 | ``` 29 | 30 | ## Running the Tests 31 | To run the tests, navigate to the root directory of the project and use the following command: 32 | ```sh 33 | pytest -v -s 34 | ``` 35 | If you want to see the reason why tests were skipped, use the following command instead: 36 | ```sh 37 | pytest -v -rs 38 | ``` 39 | If you want to run a specific test file or test function, you can specify it like this: 40 | ```sh 41 | pytest tests/cloud/test_infer.py 42 | ``` 43 | ```sh 44 | pytest tests/cloud/test_infer.py::test_infer 45 | ``` 46 | ### Note 47 | To run all the tests, follow the instructions below: 48 | ```sh 49 | cd tests/cloud # navigate to the directory where conftest.py is present 50 | pytest -v --all # use --all option 51 | ``` 52 | ## Cleanup 53 | Some tests create temporary files or directories. To ensure a clean state after running the tests, use the provided fixtures or cleanup scripts as described in `conftest.py`. 54 | 55 | ## Test Coverage 56 | If you want to measure test coverage, you can use the `pytest-cov` plugin. Install it using: 57 | ```sh 58 | pip install pytest-cov 59 | ``` 60 | Then run the tests with coverage: 61 | ```sh 62 | pytest --cov=QEfficient/cloud 63 | ``` 64 | It will show the code coverage of that particular directory. 65 | 66 | 67 | ## Test Report 68 | If you want to generate an HTML report for the test execution, you can use the `pytest-html` plugin. Install it using: 69 | ```sh 70 | pip install pytest-html 71 | ``` 72 | Then run the tests with the HTML report enabled: 73 | ```sh 74 | pytest --html=report.html 75 | ``` 76 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /tests/base/test_modeling_qeff.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from types import SimpleNamespace 9 | 10 | import onnx 11 | import pytest 12 | 13 | from QEfficient.base.modeling_qeff import QEFFBaseModel 14 | 15 | 16 | def test_compiler_invalid_file(tmp_path): 17 | qeff_obj = SimpleNamespace() 18 | 19 | invalid_file = tmp_path / "invalid.onnx" 20 | with open(invalid_file, "wb") as fp: 21 | fp.write(chr(0).encode() * 100) 22 | 23 | with pytest.raises(RuntimeError): 24 | QEFFBaseModel._compile(qeff_obj, invalid_file, tmp_path) 25 | 26 | 27 | def test_compiler_invalid_flag(tmp_path): 28 | qeff_obj = SimpleNamespace() 29 | 30 | onnx_model = onnx.parser.parse_model(""" 31 | < 32 | ir_version: 8, 33 | opset_import: ["": 17] 34 | > 35 | test_compiler(float x) => (float y) 36 | { 37 | y = Identity(x) 38 | } 39 | """) 40 | valid_file = tmp_path / "valid.onnx" 41 | onnx.save(onnx_model, valid_file) 42 | 43 | with pytest.raises(RuntimeError): 44 | QEFFBaseModel._compile( 45 | qeff_obj, valid_file, tmp_path, convert_tofp16=True, compile_only=True, aic_binary_dir=tmp_path 46 | ) 47 | -------------------------------------------------------------------------------- /tests/base/test_pytorch_transforms.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | import pytest 9 | import torch 10 | from torch import nn 11 | 12 | from QEfficient.base.pytorch_transforms import ModuleMappingTransform, ModuleMutatorTransform 13 | 14 | 15 | class TestModel(nn.Module): 16 | def __init__(self): 17 | super().__init__() 18 | 19 | self.a = nn.Linear(32, 64) 20 | self.b = nn.Linear(64, 32) 21 | 22 | def forward(self, x): 23 | x = self.a(x) 24 | x = self.b(x) 25 | return x 26 | 27 | 28 | def test_module_mapping_transform(): 29 | with pytest.raises(TypeError): 30 | ModuleMappingTransform() 31 | 32 | class TestTransform(ModuleMappingTransform): 33 | _module_mapping = {nn.Linear: nn.Identity} 34 | 35 | model = TestModel() 36 | x = torch.rand(1, 32) 37 | y1 = model(x) 38 | assert torch.any(y1 != x) 39 | 40 | model, transformed = TestTransform.apply(model) 41 | assert transformed 42 | y2 = model(x) 43 | assert torch.all(y2 == x) 44 | 45 | 46 | def test_module_mutator_transform(): 47 | with pytest.raises(TypeError): 48 | ModuleMutatorTransform() 49 | 50 | class TestTransform(ModuleMutatorTransform): 51 | _match_class = nn.Linear 52 | 53 | @classmethod 54 | def mutate(cls, original_module: nn.Module, parent_module: nn.Module): 55 | return nn.Identity() 56 | 57 | model = TestModel() 58 | prev_ids = [id(model.a), id(model.b)] 59 | x = torch.rand(1, 32) 60 | y1 = model(x) 61 | assert torch.any(y1 != x) 62 | model, transformed = TestTransform.apply(model) 63 | assert transformed 64 | assert not ([id(model.a), id(model.b)] == prev_ids) 65 | y2 = model(x) 66 | assert torch.all(y2 == x) 67 | -------------------------------------------------------------------------------- /tests/cloud/high_level_testing.json: -------------------------------------------------------------------------------- 1 | { 2 | "license": "SEE LICENSE IN LICENSE FILE", 3 | "model_name" : ["gpt2"], 4 | "num_cores" : [16], 5 | "prompt" : ["My name is"], 6 | "prompts_txt_file_path" : ["examples/prompts.txt"], 7 | "aic_enable_depth_first" : [1], 8 | "mos" : [1], 9 | "cache_dir" : [null], 10 | "hf_token" : [null], 11 | "batch_size" : [1], 12 | "prompt_len" : [32], 13 | "ctx_len" : [128], 14 | "mxfp6" : [1], 15 | "mxint8" : [1], 16 | "device_group" : [null], 17 | "full_batch_size" : [null,3], 18 | "enable_qnn" : [false, true], 19 | "qnn_config" : [null, "QEfficient/compile/qnn_config.json"] 20 | } 21 | -------------------------------------------------------------------------------- /tests/cloud/test_compile_and_execute.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import os 9 | 10 | import pytest 11 | import yaml 12 | 13 | import QEfficient 14 | from QEfficient.cloud.execute import main as execute 15 | from QEfficient.cloud.export import get_onnx_model_path 16 | 17 | 18 | @pytest.mark.on_qaic 19 | @pytest.mark.cli 20 | def test_compile(setup, mocker): 21 | """ 22 | test_compile is a HL compile api testing function, 23 | checks compile api code flow, object creations, internal api calls, internal returns. 24 | --------- 25 | Parameters: 26 | setup: is a fixture defined in conftest.py module. 
27 | mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. 28 | """ 29 | ms = setup 30 | onnx_model_path = get_onnx_model_path( 31 | model_name=ms.model_name, 32 | cache_dir=ms.cache_dir, 33 | hf_token=ms.hf_token, 34 | full_batch_size=ms.full_batch_size, 35 | local_model_dir=ms.local_model_dir, 36 | ) 37 | 38 | base_key = "past_key." 39 | base_value = "past_value." 40 | precision = "float16" 41 | 42 | data = [] 43 | 44 | for i in range(12): 45 | data.append({"IOName": f"{base_key}{i}", "Precision": precision}) 46 | data.append({"IOName": f"{base_value}{i}", "Precision": precision}) 47 | 48 | for i in range(12): 49 | data.append({"IOName": f"{base_key}{i}_RetainedState", "Precision": precision}) 50 | data.append({"IOName": f"{base_value}{i}_RetainedState", "Precision": precision}) 51 | 52 | with open(((onnx_model_path.parent) / "custom_io.yaml"), "w") as file: 53 | yaml.dump(data, file) 54 | 55 | qpc_path = QEfficient.compile( 56 | onnx_path=onnx_model_path, 57 | qpc_path=os.path.dirname(ms.qpc_dir_path()), 58 | num_cores=ms.num_cores, 59 | device_group=ms.device_group, 60 | custom_io_file_path=(onnx_model_path.parent) / "custom_io.yaml", 61 | aic_enable_depth_first=ms.aic_enable_depth_first, 62 | mos=ms.mos, 63 | batch_size=ms.batch_size, 64 | prompt_len=ms.prompt_len, 65 | ctx_len=ms.ctx_len, 66 | mxfp6=ms.mxfp6, 67 | mxint8=ms.mxint8, 68 | full_batch_size=ms.full_batch_size, 69 | enable_qnn=ms.enable_qnn, 70 | ) 71 | 72 | execute( 73 | model_name=ms.model_name, 74 | qpc_path=qpc_path, 75 | prompt=ms.prompt, 76 | prompts_txt_file_path=ms.prompts_txt_file_path, 77 | generation_len=ms.generation_len, 78 | hf_token=ms.hf_token, 79 | full_batch_size=ms.full_batch_size, 80 | ) 81 | -------------------------------------------------------------------------------- /tests/cloud/test_export.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | 9 | import pytest 10 | 11 | from QEfficient.cloud.export import main as export 12 | 13 | 14 | @pytest.mark.cli 15 | def test_export(setup, mocker): 16 | """ 17 | test_export is a HL export api testing function, 18 | checks export api code flow, object creations, internal api calls, internal returns. 19 | --------- 20 | Parameters: 21 | setup: is a fixture defined in conftest.py module. 22 | mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. 23 | """ 24 | ms = setup 25 | 26 | export( 27 | model_name=ms.model_name, 28 | hf_token=ms.hf_token, 29 | local_model_dir=ms.local_model_dir, 30 | full_batch_size=ms.full_batch_size, 31 | ) 32 | -------------------------------------------------------------------------------- /tests/cloud/test_infer.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | 9 | import pytest 10 | 11 | from QEfficient.cloud.infer import main as infer 12 | 13 | 14 | @pytest.mark.on_qaic 15 | @pytest.mark.cli 16 | @pytest.mark.usefixtures("clean_up_after_test") 17 | def test_infer(setup, mocker): 18 | """ 19 | test_infer is a HL infer api testing function, 20 | checks infer api code flow, object creations, internal api calls, internal returns. 21 | --------- 22 | Parameters: 23 | setup: is a fixture defined in conftest.py module. 24 | mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. 25 | --------- 26 | Ref: https://docs.pytest.org/en/7.1.x/how-to/fixtures.html 27 | Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html 28 | """ 29 | ms = setup 30 | infer( 31 | model_name=ms.model_name, 32 | num_cores=ms.num_cores, 33 | prompt=ms.prompt, 34 | local_model_dir=ms.local_model_dir, 35 | prompts_txt_file_path=ms.prompts_txt_file_path, 36 | aic_enable_depth_first=ms.aic_enable_depth_first, 37 | mos=ms.mos, 38 | hf_token=ms.hf_token, 39 | batch_size=ms.batch_size, 40 | prompt_len=ms.prompt_len, 41 | ctx_len=ms.ctx_len, 42 | generation_len=ms.generation_len, 43 | mxfp6=ms.mxfp6, 44 | mxint8=ms.mxint8, 45 | full_batch_size=ms.full_batch_size, 46 | enable_qnn=ms.enable_qnn, 47 | ) 48 | -------------------------------------------------------------------------------- /tests/cloud/test_infer_vlm.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import pytest 9 | 10 | from QEfficient.cloud.infer import main as infer 11 | 12 | 13 | @pytest.mark.on_qaic 14 | @pytest.mark.cli 15 | @pytest.mark.multimodal 16 | @pytest.mark.usefixtures("clean_up_after_test") 17 | def test_vlm_cli(setup, mocker): 18 | ms = setup 19 | # Taking some values from setup fixture and assigning other's based on model's requirement. 20 | # For example, mxint8 is not required for VLM models, so assigning False. 21 | infer( 22 | model_name="llava-hf/llava-1.5-7b-hf", 23 | num_cores=ms.num_cores, 24 | prompt="Describe the image.", 25 | prompts_txt_file_path=None, 26 | aic_enable_depth_first=ms.aic_enable_depth_first, 27 | mos=ms.mos, 28 | batch_size=1, 29 | full_batch_size=None, 30 | prompt_len=1024, 31 | ctx_len=2048, 32 | generation_len=20, 33 | mxfp6=False, 34 | mxint8=False, 35 | local_model_dir=None, 36 | cache_dir=None, 37 | hf_token=ms.hf_token, 38 | enable_qnn=False, 39 | qnn_config=None, 40 | image_url="https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg", 41 | ) 42 | -------------------------------------------------------------------------------- /tests/peft/test_peft_onnx_transforms.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import textwrap 9 | 10 | import onnx 11 | 12 | from QEfficient.peft.onnx_transforms import AdapterWeightsToInputsTransform 13 | 14 | 15 | def test_adapter_weights_to_inputs_transform(): 16 | external_tensors_file = "weight.raw" 17 | adapter_name = "testAdapter1" 18 | test_onnx = onnx.parser.parse_model(f""" 19 | < 20 | ir_version: 8, 21 | opset_import: ["" : 17] 22 | > 23 | test_adapter_weights (float[n, 32] input) => (float[n, 32] output) 24 | < 25 | float[32, 32] layer1_{adapter_name}_weight = [ "location" : "{external_tensors_file}" ], 26 | float[32, 32] layer2_{adapter_name}_weight = [ "location" : "{external_tensors_file}" ] 27 | > 28 | {{ 29 | layer1output = MatMul (input, layer1_{adapter_name}_weight) 30 | output = MatMul (layer1output, layer2_{adapter_name}_weight) 31 | }} 32 | """) 33 | 34 | out_onnx, transformed = AdapterWeightsToInputsTransform.apply(test_onnx, adapter_name=adapter_name) 35 | assert not transformed 36 | 37 | # Currently the onnx parser doesn't support using "." in identifier 38 | # Replace _ with . 39 | for init in test_onnx.graph.initializer: 40 | init.name = init.name.replace("_", ".") 41 | for node in test_onnx.graph.node: 42 | for i, inp in enumerate(node.input): 43 | node.input[i] = inp.replace("_", ".") 44 | for i, out in enumerate(node.output): 45 | node.output[i] = out.replace("_", ".") 46 | 47 | out_onnx, transformed = AdapterWeightsToInputsTransform.apply(test_onnx, adapter_name=adapter_name) 48 | assert transformed 49 | assert ( 50 | onnx.printer.to_text(out_onnx) 51 | == textwrap.dedent(""" 52 | < 53 | ir_version: 8, 54 | opset_import: ["" : 17] 55 | > 56 | test_adapter_weights (float[n,32] input, float[32,32] layer1.weight, float[32,32] layer2.weight) => (float[n,32] output, float[32,32] layer1.weight_RetainedState, float[32,32] layer2.weight_RetainedState) { 57 | layer1output = MatMul (input, layer1.weight) 58 | output = MatMul (layer1output, layer2.weight) 59 | layer1.weight_RetainedState = Identity (layer1.weight) 60 | layer2.weight_RetainedState = Identity (layer2.weight) 61 | } 62 | """).strip() 63 | ) 64 | -------------------------------------------------------------------------------- /tests/text_generation/test_text_generation.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import os 9 | 10 | import pytest 11 | from transformers import AutoModelForCausalLM 12 | 13 | from QEfficient.generation.text_generation_inference import TextGeneration 14 | from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM 15 | from QEfficient.utils import hf_download 16 | from QEfficient.utils._utils import load_hf_tokenizer 17 | from QEfficient.utils.constants import Constants 18 | from QEfficient.utils.device_utils import get_available_device_id 19 | 20 | configs = [pytest.param("gpt2", 2, None, 32, id="gpt2_config")] 21 | 22 | 23 | def load_causal_lm_model(model_config): 24 | """ 25 | Function to load model from huggingface and transform to KV model 26 | -------- 27 | 28 | :model_config: Dict 29 | 30 | :return model_hf, params 31 | """ 32 | model_path = hf_download( 33 | repo_id=model_config["model_name"], 34 | ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], 35 | ) 36 | model_hf = AutoModelForCausalLM.from_pretrained( 37 | model_path, 38 | use_cache=True, 39 | num_hidden_layers=model_config["n_layer"], 40 | attn_implementation="eager", 41 | low_cpu_mem_usage=False, 42 | ) # Run models for single layers only 43 | params = sum(p.numel() for p in model_hf.parameters()) 44 | model_hf.eval() 45 | return model_hf, params 46 | 47 | 48 | # Use @pytest.mark.parametrize to apply the configurations 49 | @pytest.mark.on_qaic 50 | @pytest.mark.parametrize("model_name, n_layer, full_batch_size, max_gen_len", configs) 51 | def test_generate_text_stream( 52 | model_name: str, 53 | n_layer: int, 54 | full_batch_size: int, 55 | max_gen_len: int, 56 | prompt_len: int = Constants.PROMPT_LEN, 57 | ctx_len: int = Constants.CTX_LEN, 58 | ): 59 | """ 60 | Validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. 61 | ``Mandatory`` Args: 62 | :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` 63 | :prompt_len (int): Prompt length for the model to compile. 64 | :ctx_len (int): Maximum context length to compile the model. 65 | :n_layers (int): Number of layers for the Model. 
66 | """ 67 | model_config = {"model_name": model_name, "n_layer": n_layer} 68 | model_hf, _ = load_causal_lm_model(model_config) 69 | 70 | tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) 71 | 72 | qeff_model = QEFFAutoModelForCausalLM(model_hf) 73 | 74 | qeff_model.export() 75 | device_id = get_available_device_id() 76 | 77 | if not device_id: 78 | pytest.skip("No available devices to run model on Cloud AI 100") 79 | 80 | qpc_path = qeff_model.compile( 81 | prefill_seq_len=prompt_len, 82 | ctx_len=ctx_len, 83 | num_cores=14, 84 | mxfp6=False, 85 | aic_enable_depth_first=False, 86 | full_batch_size=full_batch_size, 87 | ) 88 | 89 | exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR, generation_len=max_gen_len) 90 | cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size 91 | cloud_ai_100_output = [tokenizer.decode(token, skip_special_tokens=True) for token in cloud_ai_100_tokens[0]] 92 | 93 | text_generator = TextGeneration( 94 | tokenizer=tokenizer, 95 | qpc_path=qpc_path, 96 | ctx_len=ctx_len, 97 | full_batch_size=full_batch_size, 98 | ) 99 | stream_tokens = [] 100 | for decoded_tokens in text_generator.generate_stream_tokens(Constants.INPUT_STR, generation_len=max_gen_len): 101 | stream_tokens.extend(decoded_tokens) 102 | 103 | assert cloud_ai_100_output == stream_tokens, ( 104 | f"Deviation in output observed while comparing regular execution and streamed output: {cloud_ai_100_output} != {stream_tokens}" 105 | ) 106 | assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) 107 | -------------------------------------------------------------------------------- /tests/transformers/models/test_embedding_models.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import os 9 | from typing import Optional 10 | 11 | import numpy as np 12 | import onnxruntime as ort 13 | import pytest 14 | from transformers import AutoModel, AutoTokenizer 15 | 16 | from QEfficient.transformers.models.modeling_auto import QEFFAutoModel 17 | from QEfficient.utils._utils import create_json 18 | from QEfficient.utils.constants import Constants, QnnConstants 19 | 20 | embed_test_models = [ 21 | # model_name, architecture 22 | "sentence-transformers/multi-qa-mpnet-base-cos-v1", # MPNetForMaskedLM 23 | "BAAI/bge-reranker-v2-m3", # XLMRobertaForSequenceClassification 24 | "BAAI/bge-small-en-v1.5", # BertModel 25 | ] 26 | 27 | 28 | def check_embed_pytorch_vs_ort_vs_ai100( 29 | model_name: str, 30 | seq_len: int = Constants.CTX_LEN, 31 | n_layer: int = 1, 32 | enable_qnn: Optional[bool] = False, 33 | qnn_config: Optional[str] = None, 34 | ): 35 | # Prepare input 36 | tokenizer = AutoTokenizer.from_pretrained(model_name) 37 | inputs = tokenizer("My name is", return_tensors="pt") 38 | 39 | # Original PyTorch model 40 | pt_model = AutoModel.from_pretrained( 41 | model_name, 42 | num_hidden_layers=n_layer, 43 | attn_implementation="eager", 44 | trust_remote_code=True, 45 | ) 46 | 47 | pt_outputs = pt_model(**inputs) 48 | pt_embeddings = pt_outputs[0][0].detach().numpy() 49 | # Pytorch transformed model 50 | qeff_model = QEFFAutoModel(pt_model, pretrained_model_name_or_path=model_name) 51 | qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False) 52 | qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy() 53 | mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings)) 54 | print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad) 55 | assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}" 56 | 57 | onnx_model = qeff_model.export() 58 | ort_session = ort.InferenceSession(str(onnx_model)) 59 | 60 | # Prepare the inputs for ONNX Runtime 61 | input_ids = np.array(inputs["input_ids"]) 62 | attention_mask = np.array(inputs["attention_mask"]) 63 | 64 | onnx_inputs = {"input_ids": input_ids, "attention_mask": attention_mask} 65 | # Run inference 66 | onnx_outputs = ort_session.run(None, onnx_inputs) 67 | 68 | # Compare Transformed PyTorch and ONNX outputs 69 | 70 | pt_embeddings = pt_outputs[0][0].detach().numpy() 71 | onnx_embeddings = onnx_outputs[0] 72 | mad = np.mean(np.abs(pt_embeddings - onnx_embeddings)) 73 | print("Mad for onnx and PyTorch is ", mad) 74 | assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}" 75 | 76 | qeff_model.compile( 77 | num_cores=14, 78 | enable_qnn=enable_qnn, 79 | qnn_config=qnn_config, 80 | ) 81 | ai100_output = qeff_model.generate(inputs=inputs) 82 | 83 | # Compare ONNX and AI 100 outputs 84 | mad = np.mean(np.abs(ai100_output - onnx_outputs[0])) 85 | print("Mad for onnx and AI 100 output is ", mad) 86 | assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" 87 | assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) 88 | 89 | 90 | @pytest.mark.on_qaic 91 | @pytest.mark.parametrize("model_name", embed_test_models) 92 | def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name): 93 | """ 94 | Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. 
95 | """ 96 | check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1) 97 | 98 | 99 | @pytest.mark.on_qaic 100 | @pytest.mark.qnn 101 | @pytest.mark.parametrize("model_name", embed_test_models) 102 | def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name): 103 | """ 104 | QNN Compilation path test. 105 | Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. 106 | """ 107 | qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") 108 | create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) 109 | 110 | check_embed_pytorch_vs_ort_vs_ai100( 111 | model_name=model_name, seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path 112 | ) 113 | -------------------------------------------------------------------------------- /tests/utils/test_cache.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import random 9 | 10 | import pytest 11 | 12 | from QEfficient.utils.cache import to_hashable 13 | 14 | 15 | def get_random_string(length: int) -> str: 16 | return "".join([chr(random.randint(0x20, 0x7E)) for _ in range(length)]) 17 | 18 | 19 | def test_to_hashable_dict(): 20 | dct = {get_random_string(i): i for i in range(5)} 21 | dct = dict(sorted(dct.items())) 22 | hash1 = to_hashable(dct) 23 | 24 | dct = dict(reversed(dct.items())) 25 | hash2 = to_hashable(dct) 26 | 27 | assert hash1 == hash2 28 | 29 | 30 | def test_to_hashable_set(): 31 | assert to_hashable(set(range(4))) == to_hashable(set(range(4 - 1, -1, -1))) 32 | 33 | 34 | @pytest.mark.parametrize("value", [float("nan"), float("inf"), -float("inf")]) 35 | def test_to_hashable_float_nan(value): 36 | with pytest.raises(ValueError): 37 | to_hashable(value) 38 | -------------------------------------------------------------------------------- /tests/vllm/test_qaic_output_consistency.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import random 9 | 10 | import pytest 11 | from vllm import LLM, SamplingParams 12 | 13 | # Model to test 14 | test_models = [ 15 | "TinyLlama/TinyLlama-1.1B-Chat-v1.0", 16 | ] 17 | 18 | # Constants for configuration 19 | SEQ_LEN = 128 20 | CTX_LEN = 256 21 | DECOE_BSZ = 4 22 | DTYPE = "mxfp6" 23 | KV_DTYPE = "mxint8" 24 | 25 | 26 | @pytest.mark.vllm 27 | @pytest.mark.parametrize("model_name", test_models) 28 | def test_output_consistency(model_name): 29 | """This pytest function is used to check the consistency of vLLM. 30 | 1) Single prompt test to check if the output generated in 5 different 31 | runs yields the same results 32 | 2) Multiple prompt check to test if multiple prompts yield same results 33 | if run in different slots. 34 | 35 | Parameters 36 | ---------- 37 | model_name : string 38 | Huggingface model card name. 
39 | """ 40 | sampling_params = SamplingParams(temperature=0.0, max_tokens=None) 41 | 42 | # Creating LLM Object 43 | qllm = LLM( 44 | model=model_name, 45 | max_num_seqs=DECOE_BSZ, 46 | max_model_len=CTX_LEN, 47 | max_seq_len_to_capture=SEQ_LEN, 48 | quantization=DTYPE, 49 | kv_cache_dtype=KV_DTYPE, 50 | device="qaic", 51 | ) 52 | 53 | # Single prompt test 54 | single_prompt = ["My name is"] 55 | 56 | single_prompt_output = qllm.generate(single_prompt * 5, sampling_params) 57 | 58 | check_output = [] 59 | for i, op in enumerate(single_prompt_output): 60 | check_output.append(op.outputs[0].text) 61 | 62 | # Assertion to check the consistency of single prompt. 63 | assert len(set(check_output)) == 1, "Outputs from different slots for same prompt does not match!!" 64 | 65 | # Multiple prompt test 66 | outputDict = dict() 67 | multiple_prompt = [ 68 | "My name is", 69 | "How to eat mangosteen?", 70 | "How many people died in World War II", 71 | "Hello ", 72 | "Who is the president of United States", 73 | "Who is the president of India", 74 | "When it snowfalls in San Diego", 75 | "In which country yamana river flows", 76 | "How many people died in World War II", 77 | "Thy youth is proud livery, so gazed on now", 78 | "Will be a tattered weed, of small worth held:Then being asked where all thy beauty lies", 79 | "Where all the treasure of thy lusty days", 80 | "To say, within thine own deep-sunken eyes", 81 | "Where is Statue of Liberty located?", 82 | ] 83 | 84 | for p in multiple_prompt: 85 | outputDict[p] = [] 86 | 87 | for _ in range(5): 88 | random.shuffle(multiple_prompt) 89 | multiple_prompt_output = qllm.generate(multiple_prompt, sampling_params) 90 | for i, op in enumerate(multiple_prompt_output): 91 | generated_text = op.outputs[0].text 92 | outputDict[multiple_prompt[i]].append(str(multiple_prompt[i] + generated_text)) 93 | 94 | # Assertion to check multiple prompts. 95 | for key in outputDict.keys(): 96 | assert len(set(outputDict[key])) == 1, "Outputs from different slots for same prompt does not match!!" 97 | 98 | # Assertion to check if any prompts are missed. 99 | assert len(multiple_prompt) == len(multiple_prompt_output), ( 100 | "Number of Generated Tokens do not match the number of valid inputs!!" 101 | ) 102 | --------------------------------------------------------------------------------