├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── lint-format.yml │ └── quic-organization-repolinter.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CODE-OF-CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── QEfficient ├── __init__.py ├── base │ ├── __init__.py │ ├── common.py │ ├── modeling_qeff.py │ ├── onnx_transforms.py │ └── pytorch_transforms.py ├── cloud │ ├── __init__.py │ ├── compile.py │ ├── execute.py │ ├── export.py │ ├── finetune.py │ └── infer.py ├── compile │ ├── __init__.py │ ├── compile_helper.py │ ├── qnn_compiler.py │ └── qnn_config.json ├── customop │ ├── __init__.py │ ├── ctx_scatter_gather.py │ ├── ctx_scatter_gather_cb.py │ ├── matmulnbits.py │ └── rms_norm.py ├── exporter │ ├── __init__.py │ ├── export_hf_to_cloud_ai_100.py │ └── export_utils.py ├── finetune │ ├── __init__.py │ ├── configs │ │ ├── __init__.py │ │ ├── dataset_config.py │ │ ├── peft_config.py │ │ └── training.py │ ├── data │ │ ├── __init__.py │ │ └── sampler.py │ ├── dataset │ │ ├── __init__.py │ │ ├── alpaca_dataset.py │ │ ├── custom_dataset.py │ │ ├── dataset_config.py │ │ ├── grammar_dataset.py │ │ ├── gsm8k_dataset.py │ │ ├── imdb_dataset.py │ │ └── samsum_dataset.py │ ├── eval.py │ └── utils │ │ ├── __init__.py │ │ ├── config_utils.py │ │ ├── dataset_utils.py │ │ ├── plot_metrics.py │ │ └── train_utils.py ├── generation │ ├── __init__.py │ ├── cloud_infer.py │ └── text_generation_inference.py ├── peft │ ├── __init__.py │ ├── auto.py │ ├── lora │ │ ├── __init__.py │ │ ├── auto.py │ │ ├── layers.py │ │ ├── lora_model.py │ │ └── pytorch_transforms.py │ ├── onnx_transforms.py │ ├── peft_model.py │ └── pytorch_transforms.py ├── transformers │ ├── __init__.py │ ├── cache_utils.py │ ├── modeling_attn_mask_utils.py │ ├── modeling_utils.py │ ├── models │ │ ├── __init__.py │ │ ├── codegen │ │ │ ├── __init__.py │ │ │ └── modeling_codegen.py │ │ ├── falcon │ │ │ ├── __init__.py │ │ │ └── modeling_falcon.py │ │ ├── gemma │ │ │ ├── __init__.py │ │ │ └── modeling_gemma.py │ │ ├── gemma2 │ │ │ ├── __init__.py │ │ │ └── modeling_gemma2.py │ │ ├── gpt2 │ │ │ ├── __init__.py │ │ │ └── modeling_gpt2.py │ │ ├── gpt_bigcode │ │ │ ├── __init__.py │ │ │ └── modeling_gpt_bigcode.py │ │ ├── gptj │ │ │ ├── __init__.py │ │ │ └── modeling_gptj.py │ │ ├── granite │ │ │ ├── __init__.py │ │ │ └── modeling_granite.py │ │ ├── granitemoe │ │ │ ├── __init__.py │ │ │ └── modeling_granitemoe.py │ │ ├── internvl │ │ │ ├── __init__.py │ │ │ └── modeling_internvl.py │ │ ├── llama │ │ │ ├── __init__.py │ │ │ └── modeling_llama.py │ │ ├── llama_swiftkv │ │ │ ├── __init__.py │ │ │ └── modeling_llama_swiftkv.py │ │ ├── llava │ │ │ ├── __init__.py │ │ │ └── modeling_llava.py │ │ ├── llava_next │ │ │ ├── __init__.py │ │ │ └── modeling_llava_next.py │ │ ├── mistral │ │ │ ├── __init__.py │ │ │ └── modeling_mistral.py │ │ ├── mixtral_moe │ │ │ ├── __init__.py │ │ │ └── modeling_mixtral.py │ │ ├── mllama │ │ │ ├── __init__.py │ │ │ └── modeling_mllama.py │ │ ├── modeling_auto.py │ │ ├── mpt │ │ │ ├── __init__.py │ │ │ └── modeling_mpt.py │ │ ├── phi │ │ │ ├── __init__.py │ │ │ └── modeling_phi.py │ │ ├── phi3 │ │ │ ├── __init__.py │ │ │ └── modeling_phi3.py │ │ ├── pytorch_transforms.py │ │ ├── qwen2 │ │ │ ├── __init__.py │ │ │ └── modeling_qwen2.py │ │ ├── starcoder2 │ │ │ ├── __init__.py │ │ │ └── modeling_starcoder2.py │ │ └── whisper │ │ │ ├── __init__.py │ │ │ └── modeling_whisper.py │ ├── post_processing.py │ ├── quantizers │ │ ├── __init__.py │ │ ├── 
auto.py │ │ ├── awq.py │ │ ├── gptq.py │ │ ├── quant_transforms.py │ │ ├── quantizer_awq.py │ │ ├── quantizer_compressed_tensors.py │ │ ├── quantizer_gptq.py │ │ └── quantizer_utils.py │ ├── spd │ │ ├── __init__.py │ │ ├── spd_transform_forward.py │ │ └── turbo.py │ └── transform.py └── utils │ ├── __init__.py │ ├── _utils.py │ ├── cache.py │ ├── checkpoint_utils.py │ ├── constants.py │ ├── device_utils.py │ ├── generate_inputs.py │ ├── generate_qnn_network_specialization_config.py │ ├── logging_utils.py │ ├── model_registery.py │ ├── run_utils.py │ ├── spd_utils.py │ └── test_utils.py ├── README.md ├── docs ├── README.md ├── _static │ └── my_theme.css ├── _templates │ └── versions.html ├── conf.py ├── image │ └── Cloud_AI_100.png ├── index.md ├── requirements.txt └── source │ ├── blogs.md │ ├── cli_api.md │ ├── finetune.md │ ├── image │ ├── Cloud_AI_100.png │ └── kv_cache_cloudai100.png │ ├── installation.md │ ├── introduction.md │ ├── python_api.md │ ├── quick_start.md │ ├── reference.md │ ├── upgrade.md │ └── validate.md ├── examples ├── __init__.py ├── basic_gguf_models.py ├── cpp_execution │ ├── CMakeLists.txt │ ├── InferenceSetIOBuffer.cpp │ ├── README.md │ └── text_inference_using_cpp.py ├── draft_spd_inference.py ├── embedding_model.py ├── granite_example │ ├── granite_vision_inference.py │ └── readme.md ├── image_text_to_text_inference.py ├── intern_example │ ├── internvl_inference.py │ └── readme.md ├── lora_models.py ├── multiprojs_spd_inference.py ├── peft_models.py ├── pld_spd_inference.py ├── prompts.txt └── speech_to_text │ ├── README.md │ └── run_whisper_speech_to_text.py ├── notebooks ├── QEfficientGPT2.ipynb ├── QEfficientMPT.ipynb └── __init__.py ├── pyproject.toml ├── scripts ├── Jenkinsfile ├── __init__.py ├── finetune │ ├── __init__.py │ └── run_ft_model.py ├── perplexity_computation │ ├── README.md │ ├── __init__.py │ └── calculate_perplexity.py ├── replicate_kv_head │ ├── README.md │ ├── __init__.py │ └── replicate_kv_heads.py └── specializations.json └── tests ├── README.md ├── __init__.py ├── base ├── test_modeling_qeff.py ├── test_onnx_transforms.py └── test_pytorch_transforms.py ├── cloud ├── conftest.py ├── high_level_testing.json ├── test_compile_and_execute.py ├── test_export.py ├── test_infer.py └── test_infer_vlm.py ├── finetune └── test_finetune.py ├── peft ├── lora │ └── test_lora_model.py ├── test_peft_model.py └── test_peft_onnx_transforms.py ├── text_generation └── test_text_generation.py ├── transformers ├── models │ ├── test_causal_lm_models.py │ ├── test_embedding_models.py │ ├── test_image_text_to_text_models.py │ ├── test_prefix_caching.py │ └── test_speech_seq2seq_models.py ├── spd │ ├── test_pld_inference.py │ └── test_spd_inference.py ├── test_causal_lm.py ├── test_speech_seq2seq.py └── test_transformer_pytorch_transforms.py ├── utils └── test_cache.py └── vllm └── test_qaic_output_consistency.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | # Default owners 9 | # review when someone opens a pull request and assign appropriate reviewer 10 | * @quic-rishinr @ochougul @quic-hemagnih @quic-amitraj 11 | pyproject.toml @carlstreeter-quic 12 | 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Command Used to run / script used 16 | 2. Error details 17 | 18 | **Expected behavior** 19 | A clear and concise description of what you expected to happen. 20 | 21 | **Screenshots** 22 | If applicable, add screenshots to help explain your problem. 23 | 24 | **Environment (please complete the following information):** 25 | - OS: [e.g. iOS] 26 | - Environment details with packages version etc. 27 | - Version/Branch/Commit ID [e.g. 22] 28 | 29 | **Additional context** 30 | Add any other context about the problem here. 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/lint-format.yml: -------------------------------------------------------------------------------- 1 | name: Lint & Format 2 | on: [pull_request] 3 | jobs: 4 | lint: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v4 8 | - run: pip3 install ruff 9 | - run: ruff check 10 | env: 11 | RUFF_OUTPUT_FORMAT: github 12 | format: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - run: pip3 install ruff 17 | - run: ruff format --check 18 | env: 19 | RUFF_OUTPUT_FORMAT: github 20 | -------------------------------------------------------------------------------- /.github/workflows/quic-organization-repolinter.yml: -------------------------------------------------------------------------------- 1 | name: QuIC Organization Repolinter 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | jobs: 10 | repolinter: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout Repo 14 | uses: actions/checkout@v2 15 | - name: Verify repolinter config file is present 16 | id: check_files 17 | uses: andstor/file-existence-action@v1 18 | with: 19 | files: "repolint.json" 20 | - name: Run Repolinter with local repolint.json 21 | if: steps.check_files.outputs.files_exists == 'true' 22 | uses: todogroup/repolinter-action@v1 23 | with: 24 | config_file: "repolint.json" 25 | - name: Run Repolinter with default ruleset 26 | if: steps.check_files.outputs.files_exists == 'false' 27 | uses: todogroup/repolinter-action@v1 28 | with: 29 | config_url: "https://raw.githubusercontent.com/quic/.github/main/repolint.json" 30 | 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | .Python 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | wheels/ 20 | share/python-wheels/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | MANIFEST 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Unit test / coverage reports 33 | htmlcov/ 34 | .tox/ 35 | .nox/ 36 | .coverage 37 | .coverage.* 38 | .cache 39 | nosetests.xml 40 | coverage.xml 41 | *.cover 42 | *.py,cover 43 | .hypothesis/ 44 | .pytest_cache/ 45 | cover/ 46 | 47 | # PyBuilder 48 | .pybuilder/ 49 | target/ 50 | 51 | # Jupyter Notebook 52 | .ipynb_checkpoints 53 | 54 | # IPython 55 | profile_default/ 56 | ipython_config.py 57 | 58 | # Environments 59 | .env 60 | .venv 61 | env/ 62 | venv/ 63 | ENV/ 64 | env.bak/ 65 | venv.bak/ 66 | 67 | # Spyder project settings 68 | .spyderproject 69 | .spyproject 70 | 71 | # Rope project settings 72 | .ropeproject 73 | 74 | # mypy 75 | .mypy_cache/ 76 | .dmypy.json 77 | dmypy.json 78 | 79 | # Pyre type checker 80 | .pyre/ 81 | 82 | # pytype static type analyzer 83 | .pytype/ 84 | 85 | # Cython debug symbols 86 | cython_debug/ 87 | 88 | # Local Files 89 | cache_dir 90 | qeff_models 91 | .vscode/* 92 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | # Ruff version. 4 | rev: v0.5.2 5 | hooks: 6 | # Run the linter. 7 | - id: ruff 8 | types_or: [ python, pyi, jupyter ] 9 | args: [ --fix ] 10 | # Run the formatter. 11 | - id: ruff-format 12 | types_or: [ python, pyi, jupyter ] 13 | -------------------------------------------------------------------------------- /CODE-OF-CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 
39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team. All complaints will be reviewed 59 | and investigated and will result in a response that is deemed necessary and 60 | appropriate to the circumstances. The project team is obligated to maintain 61 | confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing to PROJECT 2 | 3 | Hi there! 4 | We’re thrilled that you’d like to contribute to this project. 5 | Your help is essential for keeping this project great and for making it better. 6 | 7 | ## Branching Strategy 8 | 9 | In general, contributors should develop on branches based off of `main` and pull requests should be made against `main`. 10 | 11 | ## Submitting a pull request 12 | 13 | 1. Please read our [code of conduct](CODE-OF-CONDUCT.md) and [license](LICENSE). 14 | 1. Fork and clone the repository. 15 | 1. Create a new branch based on `main`: `git checkout -b main`. 16 | 1. Make your changes, add tests, and make sure the tests still pass. 17 | 1. Commit your changes using the [DCO](http://developercertificate.org/). You can attest to the DCO by commiting with the **-s** or **--signoff** options or manually adding the "Signed-off-by". 18 | 1. Push to your fork and submit a pull request from your branch to `main`. 19 | 1. Pat yourself on the back and wait for your pull request to be reviewed. 20 | 21 | Here are a few things you can do that will increase the likelihood of your pull request to be accepted: 22 | 23 | - Follow the existing style where possible. 24 | - Write tests. 25 | - Keep your change as focused as possible. 26 | If you want to make multiple independent changes, please consider submitting them as separate pull requests. 27 | - Write a [good commit message](http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html). 
28 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use Ubuntu 20.04 as the base image 2 | # Create a temp image that has build tools that we can use to build wheel 3 | # files for dependencies only available as source. 4 | FROM docker-registry.qualcomm.com/library/ubuntu:20.04 5 | 6 | # Update the package lists and install required packages 7 | RUN apt-get update && apt-get install -y \ 8 | git \ 9 | tmux \ 10 | python3.10 \ 11 | python3.10-venv \ 12 | python3-pip 13 | 14 | # pip recognizes this variable 15 | ENV PIP_CACHE_DIR=/var/cache/pip 16 | WORKDIR /app 17 | 18 | # Sample command to register and clone the repository 19 | # Clone the GitHub repository 20 | RUN git config --global user.email none@none.com && \ 21 | git config --global user.name none 22 | 23 | RUN mkdir -p /app/qefficient-library 24 | COPY . /app/qefficient-library 25 | 26 | # Create Virtual Env for the docker image 27 | RUN python3.10 -m venv /app/llm_env 28 | RUN . /app/llm_env/bin/activate 29 | WORKDIR /app/qefficient-library 30 | 31 | # Install the required Python packages 32 | 33 | RUN pip install torch==2.0.0+cpu --extra-index-url https://download.pytorch.org/whl/cpu --no-deps 34 | RUN pip install datasets==2.17.0 fsspec==2023.10.0 multidict==6.0.5 sentencepiece --no-deps 35 | 36 | RUN python3.10 -m pip install . 37 | WORKDIR /app/qefficient-library 38 | 39 | # Set the environment variable for the model card name and token ID 40 | ENV HF_HOME = "/app/qefficient-library/docs" 41 | ENV MODEL_NAME = "" 42 | ENV CACHE_DIR = "" 43 | ENV TOKEN_ID = "" 44 | 45 | # Print a success message 46 | CMD ["echo", "qefficient-transformers repository cloned and setup installed inside Docker image."] 47 | CMD ["echo", "Starting the Model Download and Export to Onnx Stage for QEff."] 48 | CMD python3.10 -m QEfficient.cloud.export --model-name "$MODEL_NAME" 49 | 50 | # Example usage: 51 | # docker build -t qefficient-library . 52 | 53 | # Minimum System Requirements Before running docker containers: 54 | # 1. Clear the tmp space. 55 | # 2. For smaller models, 32GiB RAM is sufficient, but larger LLMs we require good CPU/RAM (Context 7B model would require atleast 64GiB). 56 | # 3. The exact minimum system configuration are tough to decide, since its all function of model parameters. 57 | 58 | # docker run -e MODEL_NAME=gpt2 -e TOKEN_ID= qefficient-library -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following 11 | disclaimer in the documentation and/or other materials provided 12 | with the distribution. 13 | 14 | * Neither the name of Qualcomm Technologies, Inc. nor the names of its 15 | contributors may be used to endorse or promote products derived 16 | from this software without specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | SPDX-License-Identifier: BSD-3-Clause 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | -------------------------------------------------------------------------------- /QEfficient/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import os 9 | 10 | # For faster downloads via hf_transfer 11 | # This code is put above import statements as this needs to be executed before 12 | # hf_transfer is imported (will happen on line 15 via leading imports) 13 | os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" 14 | 15 | # Placeholder for all non-transformer models registered in QEfficient 16 | import QEfficient.utils.model_registery # noqa: F401 17 | from QEfficient.utils.logging_utils import logger 18 | 19 | 20 | def check_qaic_sdk(): 21 | """Check if QAIC SDK is installed""" 22 | try: 23 | import platform 24 | import sys 25 | 26 | sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}") 27 | import qaicrt # noqa: F401 28 | 29 | return True 30 | except ImportError: 31 | return False 32 | 33 | 34 | # Conditionally import QAIC-related modules if the SDK is installed 35 | __version__ = "0.0.1.dev0" 36 | 37 | if check_qaic_sdk(): 38 | from QEfficient.base import ( 39 | QEFFAutoModel, 40 | QEFFAutoModelForCausalLM, 41 | QEFFAutoModelForImageTextToText, 42 | QEFFAutoModelForSpeechSeq2Seq, 43 | QEFFCommonLoader, 44 | ) 45 | from QEfficient.compile.compile_helper import compile 46 | from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter 47 | from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv 48 | from QEfficient.peft import QEffAutoPeftModelForCausalLM 49 | from QEfficient.transformers.transform import transform 50 | 51 | # Users can use QEfficient.export for exporting models to ONNX 52 | export = qualcomm_efficient_converter 53 | 54 | __all__ = [ 55 | "transform", 56 | "export", 57 | "compile", 58 | "cloud_ai_100_exec_kv", 59 | "QEFFAutoModel", 60 | "QEFFAutoModelForCausalLM", 61 | "QEffAutoPeftModelForCausalLM", 62 | "QEFFAutoModelForImageTextToText", 63 | "QEFFAutoModelForSpeechSeq2Seq", 64 | "QEFFCommonLoader", 65 | ] 66 | 67 | else: 68 | logger.warning("QAIC SDK is not installed, eager mode features won't be available!") 
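Example (illustrative sketch): the snippet below strings together the API re-exported by this __init__ — QEFFAutoModelForCausalLM.from_pretrained, the model's export(), and QEfficient.compile — assuming the QAIC SDK is installed. The "gpt2" card mirrors the repository's own examples; the qpc_path and num_cores values are placeholder assumptions, and the keyword names follow the CLI flags defined in QEfficient/cloud/compile.py, so exact signatures may differ between releases.

import QEfficient
from QEfficient import QEFFAutoModelForCausalLM

# Load a HuggingFace model through the QEfficient wrapper.
qeff_model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")

# Export the PyTorch model to ONNX; returns the generated .onnx path.
onnx_path = qeff_model.export()

# Compile the exported ONNX graph into Cloud AI 100 qpc binaries.
QEfficient.compile(
    onnx_path=onnx_path,
    qpc_path="qeff_models/gpt2_qpc",  # assumed output directory for compiled binaries
    num_cores=14,                     # assumed core count; set per target device
    device_group=[0],                 # Cloud AI 100 device id(s)
)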
69 | -------------------------------------------------------------------------------- /QEfficient/base/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from QEfficient.base.common import QEFFCommonLoader # noqa: F401 9 | from QEfficient.transformers.models.modeling_auto import ( # noqa: F401 10 | QEFFAutoModel, 11 | QEFFAutoModelForCausalLM, 12 | QEFFAutoModelForImageTextToText, 13 | QEFFAutoModelForSpeechSeq2Seq, 14 | ) 15 | -------------------------------------------------------------------------------- /QEfficient/base/common.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | """ 9 | MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP dictionary defines the mapping between names of the varities of Transformer model defined in 10 | QEFF_MODEL_TYPE and the classes that implement the methods i.e.(compile, export etc.) for those types. 11 | 12 | QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model. 13 | """ 14 | 15 | import os 16 | from typing import Any 17 | 18 | from transformers import AutoConfig 19 | 20 | from QEfficient.base.modeling_qeff import QEFFBaseModel 21 | from QEfficient.transformers.modeling_utils import MODEL_CLASS_MAPPING 22 | from QEfficient.utils import login_and_download_hf_lm 23 | 24 | 25 | class QEFFCommonLoader: 26 | """ 27 | Provides HuggingFace model loading interface same as transformers APIs. 28 | Supports loading any model on HuggingFace. 29 | Wrapper on top of Auto Classes 30 | """ 31 | 32 | def __init__(self, *args: Any, **kwds: Any) -> None: 33 | raise EnvironmentError( 34 | f"{self.__class__.__name__} is designed to be instantiated " 35 | f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)`" 36 | ) 37 | 38 | @classmethod 39 | def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> QEFFBaseModel: 40 | """ 41 | Downloads HuggingFace model if already doesn't exist locally, returns QEFFAutoModel object based on type of model. 42 | """ 43 | config = AutoConfig.from_pretrained(pretrained_model_name_or_path) 44 | 45 | class_name = MODEL_CLASS_MAPPING.get(config.__class__.__name__, None) 46 | if class_name: 47 | module = __import__("QEfficient.transformers.models.modeling_auto") 48 | model_class = getattr(module, class_name) 49 | else: 50 | raise NotImplementedError( 51 | f"Unknown architecture={config.__class__.__name__}, either use specific auto model class for loading the model or raise an issue for support!" 
52 | ) 53 | 54 | local_model_dir = kwargs.pop("local_model_dir", None) 55 | if not os.path.isdir(pretrained_model_name_or_path) and local_model_dir is None: 56 | pretrained_model_name_or_path = login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs) 57 | hf_token = kwargs.pop("hf_token", None) 58 | continuous_batching = True if kwargs.pop("full_batch_size", None) else False 59 | 60 | qeff_model = model_class.from_pretrained( 61 | pretrained_model_name_or_path=(local_model_dir if local_model_dir else pretrained_model_name_or_path), 62 | token=hf_token, 63 | continuous_batching=continuous_batching, 64 | **kwargs, 65 | ) 66 | return qeff_model 67 | -------------------------------------------------------------------------------- /QEfficient/base/onnx_transforms.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | from typing import Optional, Tuple 9 | 10 | import numpy as np 11 | from onnx import ModelProto, external_data_helper, numpy_helper 12 | 13 | 14 | class OnnxTransform: 15 | """ 16 | OnnxTransform is the base class for graph modifications on exported onnx. 17 | """ 18 | 19 | def __init__(self): 20 | raise TypeError("Transform classes are not to be instantiated. Directly use the `apply` method.") 21 | 22 | @classmethod 23 | def apply(cls, model: ModelProto, **kwargs) -> Tuple[ModelProto, bool]: 24 | """ 25 | Override this class to apply a transformation. 26 | :param model: The model's ONNX graph to transform 27 | :param kwargs: Parameters needed for specific transforms. All transforms should take **kwargs to ignore unneeded kwargs. 28 | 29 | :returns: ONNX graph after applying the transform 30 | :returns: Boolean indicating whether transform was applied 31 | """ 32 | raise NotImplementedError("Use subclasses for ONNX transform") 33 | 34 | 35 | class FP16ClipTransform(OnnxTransform): 36 | """ 37 | Clips the tensor values to be in FP16 range. 38 | """ 39 | 40 | @classmethod 41 | def apply(cls, model: ModelProto, *, onnx_base_dir: Optional[str] = None, **kwargs) -> Tuple[ModelProto, bool]: 42 | """ 43 | :param onnx_base_dir: Base directory to load tensors (if not already loaded). 44 | """ 45 | finfo = np.finfo(np.float16) 46 | fp16_max = finfo.max 47 | fp16_min = finfo.min 48 | transformed = False 49 | for tensor in external_data_helper._get_all_tensors(model): 50 | nptensor = numpy_helper.to_array(tensor, onnx_base_dir) 51 | if nptensor.dtype == np.float32 and (np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min)): 52 | nptensor = np.clip(nptensor, fp16_min, fp16_max) 53 | new_tensor = numpy_helper.from_array(nptensor, tensor.name) 54 | tensor.CopyFrom(new_tensor) 55 | transformed = True 56 | return model, transformed 57 | 58 | 59 | class SplitTensorsTransform(OnnxTransform): 60 | """ 61 | Split external tensors file 62 | """ 63 | 64 | @classmethod 65 | def apply( 66 | cls, 67 | model: ModelProto, 68 | *, 69 | model_name: str, 70 | onnx_base_dir: Optional[str] = None, 71 | file_chunk_size: int = 10 * 2**30, # 10 GiB 72 | size_threshold: int = 1024, 73 | **kwargs, 74 | ) -> Tuple[ModelProto, bool]: 75 | """ 76 | :param model_name: Used for naming external files. i.e. 
{model_name}_0.onnx.data 77 | :param onnx_base_dir: Base directory to load tensors (if not already loaded). 78 | :param file_chunk_size: Chunk size to split external files into. 79 | :param size_threshold: Only tensors greater than this threshold (in bytes) will be saved externally. 80 | """ 81 | file_num = 0 82 | current_file_size = 0 83 | transformed = False 84 | external_data_helper.load_external_data_for_model(model, onnx_base_dir) 85 | for tensor in external_data_helper._get_all_tensors(model): 86 | if tensor.HasField("raw_data") and ((tsize := len(tensor.raw_data)) > size_threshold): 87 | transformed = True 88 | current_file_size += tsize 89 | if current_file_size > file_chunk_size: 90 | file_num += 1 91 | current_file_size = tsize 92 | external_data_helper.set_external_data(tensor, f"{model_name}_{file_num}.onnx.data") 93 | return model, transformed 94 | -------------------------------------------------------------------------------- /QEfficient/cloud/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/cloud/compile.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import argparse 9 | 10 | import QEfficient 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser(description="Compilation script.") 14 | parser.add_argument("--onnx_path", "--onnx-path", required=True, help="Onnx Model Path") 15 | parser.add_argument( 16 | "--qpc-path", 17 | "--qpc_path", 18 | required=True, 19 | help="Compiled qpc binaries will be stored under this folder", 20 | ) 21 | parser.add_argument("--batch_size", "--batch-size", type=int, default=1, help="Batch size for text generation") 22 | parser.add_argument( 23 | "--prompt_len", 24 | "--prompt-len", 25 | default=32, 26 | type=int, 27 | help="Sequence length for text generation.", 28 | ) 29 | parser.add_argument("--ctx_len", "--ctx-len", default=128, type=int, help="Context length for text generation.") 30 | parser.add_argument( 31 | "--mxfp6", 32 | action="store_true", 33 | help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression", 34 | ) 35 | parser.add_argument( 36 | "--mxint8", 37 | action="store_true", 38 | help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False", 39 | ) 40 | parser.add_argument( 41 | "--num_cores", 42 | "--num-cores", 43 | required=True, 44 | type=int, 45 | help="num cores to compile the model on", 46 | ) 47 | parser.add_argument( 48 | "--custom_io_file_path", 49 | "--custom-io-file-path", 50 | type=str, 51 | help="Path to custom IO file", 52 | ) 53 | parser.add_argument( 54 | "--device_group", 55 | "--device-group", 56 | required=True, 57 | type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")], 58 | help="Cloud AI 100 device ids (comma-separated) e.g. 
[0,1] ", 59 | ) 60 | parser.add_argument( 61 | "--aic_enable_depth_first", 62 | "--aic-enable-depth-first", 63 | action="store_true", 64 | help="If passed, this option will be enabled during compilation, disabled by default", 65 | ) 66 | parser.add_argument( 67 | "--mos", 68 | type=int, 69 | default=-1, 70 | help=" Effort level to reduce the on-chip memory", 71 | ) 72 | parser.add_argument( 73 | "--full_batch_size", 74 | "--full-batch-size", 75 | type=int, 76 | default=None, 77 | help="Set full batch size to enable continuous batching mode, default is None", 78 | ) 79 | parser.add_argument( 80 | "--allow-mxint8-mdp-io", 81 | "--allow_mxint8_mdp_io", 82 | action="store_true", 83 | help="If passed, this option allows MXINT8 compression of MDP IO traffic", 84 | ) 85 | parser.add_argument( 86 | "--enable_qnn", 87 | "--enable-qnn", 88 | nargs="?", 89 | const=True, 90 | type=str, 91 | default=False, 92 | help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\ 93 | If not provided, the default configuration will be used.\ 94 | Sample Config: QEfficient/compile/qnn_config.json", 95 | ) 96 | 97 | args, compiler_options = parser.parse_known_args() 98 | 99 | if isinstance(args.enable_qnn, str): 100 | args.qnn_config = args.enable_qnn 101 | args.enable_qnn = True 102 | 103 | compiler_options_dict = {} 104 | for i in range(0, len(compiler_options)): 105 | if compiler_options[i].startswith("--"): 106 | key = compiler_options[i].lstrip("-").replace("-", "_") 107 | value = ( 108 | compiler_options[i + 1] 109 | if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-") 110 | else True 111 | ) 112 | compiler_options_dict[key] = value 113 | QEfficient.compile(**args.__dict__, **compiler_options_dict) 114 | -------------------------------------------------------------------------------- /QEfficient/cloud/export.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import argparse 9 | import os 10 | from typing import Optional 11 | 12 | from QEfficient.base.common import QEFFCommonLoader 13 | from QEfficient.utils import check_and_assign_cache_dir 14 | from QEfficient.utils.logging_utils import logger 15 | 16 | # Specifically for Docker images. 17 | ROOT_DIR = os.path.dirname(os.path.abspath("")) 18 | 19 | 20 | def get_onnx_model_path( 21 | model_name: str, 22 | cache_dir: Optional[str] = None, 23 | hf_token: Optional[str] = None, 24 | full_batch_size: Optional[int] = None, 25 | local_model_dir: Optional[str] = None, 26 | ): 27 | """ 28 | exports the model to onnx if pre-exported file is not found and returns onnx_model_path 29 | 30 | ``Mandatory`` Args: 31 | :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. 32 | ``Optional`` Args: 33 | :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` 34 | :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Pass model tokenizer. ``Defaults to None.`` 35 | :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` 36 | :local_model_dir (str): Path to custom model weights and config files. 
``Defaults to None.`` 37 | :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` 38 | """ 39 | logger.info(f"Exporting Pytorch {model_name} model to ONNX...") 40 | 41 | qeff_model = QEFFCommonLoader.from_pretrained( 42 | pretrained_model_name_or_path=model_name, 43 | cache_dir=cache_dir, 44 | hf_token=hf_token, 45 | full_batch_size=full_batch_size, 46 | local_model_dir=local_model_dir, 47 | ) 48 | onnx_model_path = qeff_model.export() 49 | logger.info(f"Generated onnx_path: {onnx_model_path}") 50 | return onnx_model_path 51 | 52 | 53 | def main( 54 | model_name: str, 55 | cache_dir: Optional[str] = None, 56 | hf_token: Optional[str] = None, 57 | local_model_dir: Optional[str] = None, 58 | full_batch_size: Optional[int] = None, 59 | ) -> None: 60 | """ 61 | Helper function used by export CLI app for exporting to ONNX Model. 62 | 63 | ``Mandatory`` Args: 64 | :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. 65 | 66 | ``Optional`` Args: 67 | :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` 68 | :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` 69 | :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` 70 | :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` 71 | 72 | .. code-block:: bash 73 | 74 | python -m QEfficient.cloud.export OPTIONS 75 | 76 | """ 77 | cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir) 78 | get_onnx_model_path( 79 | model_name=model_name, 80 | cache_dir=cache_dir, 81 | hf_token=hf_token, 82 | full_batch_size=full_batch_size, 83 | local_model_dir=local_model_dir, 84 | ) 85 | 86 | 87 | if __name__ == "__main__": 88 | parser = argparse.ArgumentParser(description="Export script.") 89 | parser.add_argument("--model_name", "--model-name", required=True, help="HF Model card name/id") 90 | parser.add_argument( 91 | "--local-model-dir", "--local_model_dir", required=False, help="Path to custom model weights and config files" 92 | ) 93 | parser.add_argument( 94 | "--cache_dir", 95 | "--cache-dir", 96 | required=False, 97 | help="Cache_dir to store the HF files", 98 | ) 99 | parser.add_argument( 100 | "--hf-token", "--hf_token", default=None, type=str, required=False, help="HF token id for private HF models" 101 | ) 102 | parser.add_argument( 103 | "--full_batch_size", 104 | "--full-batch-size", 105 | type=int, 106 | default=None, 107 | help="Set full batch size to enable continuous batching mode, default is None", 108 | ) 109 | args = parser.parse_args() 110 | main(**args.__dict__) 111 | -------------------------------------------------------------------------------- /QEfficient/compile/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/compile/qnn_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "converter_args_extension": "", 3 | "context_binary_generator_args_extension": "--log_level debug", 4 | "qnn_compilation_backend": 5 | { 6 | "compiler_enable_depth_first": true, 7 | "compiler_printDDRStats": false, 8 | "compiler_printPerfMetrics": false, 9 | "compiler_stat_level": 10 10 | }, 11 | "SKIP_QNN_CONVERTER_STEP": false 12 | } -------------------------------------------------------------------------------- /QEfficient/customop/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from QEfficient.customop.ctx_scatter_gather import CtxGatherFunc, CtxGatherFunc3D, CtxScatterFunc, CtxScatterFunc3D 9 | from QEfficient.customop.ctx_scatter_gather_cb import ( 10 | CtxGatherFuncCB, 11 | CtxGatherFuncCB3D, 12 | CtxScatterFuncCB, 13 | CtxScatterFuncCB3D, 14 | ) 15 | from QEfficient.customop.rms_norm import CustomRMSNormAIC, GemmaCustomRMSNormAIC 16 | 17 | __all__ = [ 18 | "CtxGatherFunc", 19 | "CtxScatterFunc", 20 | "CtxGatherFunc3D", 21 | "CtxScatterFunc3D", 22 | "CustomRMSNormAIC", 23 | "GemmaCustomRMSNormAIC", 24 | "CtxGatherFuncCB", 25 | "CtxScatterFuncCB", 26 | "CtxGatherFuncCB3D", 27 | "CtxScatterFuncCB3D", 28 | ] 29 | -------------------------------------------------------------------------------- /QEfficient/customop/rms_norm.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import onnxscript 9 | import torch 10 | from torch import nn 11 | 12 | from QEfficient.utils import constants 13 | 14 | ops = getattr(onnxscript, "opset" + str(constants.ONNX_EXPORT_OPSET)) 15 | 16 | 17 | @onnxscript.script(onnxscript.values.Opset(domain="com.qti.aisw.onnx", version=1)) 18 | def CustomRMSNorm(hidden_states: onnxscript.FLOAT, weight: onnxscript.FLOAT, epsilon: float): 19 | weight = ops.Cast(weight, to=1) 20 | variance = ops.ReduceMean(ops.Pow(hidden_states, 2), axes=[-1], keepdims=1) 21 | epsilon = ops.Expand(epsilon, ops.Shape(variance)) 22 | hidden_states = hidden_states * ops.Reciprocal(ops.Sqrt(variance + epsilon)) 23 | return weight * hidden_states 24 | 25 | 26 | class CustomRMSNormFunc(torch.autograd.Function): 27 | @staticmethod 28 | def forward(hidden_states: torch.Tensor, weight: torch.Tensor, epsilon: float): 29 | variance = hidden_states.pow(2).mean(-1, keepdim=True) 30 | hidden_states = hidden_states * torch.rsqrt(variance + epsilon) 31 | return weight * hidden_states 32 | 33 | @staticmethod 34 | def setup_context(ctx, inputs, outputs): 35 | pass 36 | 37 | @staticmethod 38 | def symbolic(g: torch.Graph, hidden_states: torch.Value, weight: torch.Value, epsilon: torch.Value) -> torch.Value: 39 | return g.onnxscript_op(CustomRMSNorm, hidden_states, weight, epsilon_f=epsilon).setTypeAs(hidden_states) 40 | 41 | 42 | class CustomRMSNormAIC(nn.Module): 43 | """ 44 | RMSNorm module that works by replacing the current module with compiler known custom-op. 45 | """ 46 | 47 | def __init__(self, hidden_size, eps=1e-05): 48 | super(CustomRMSNormAIC, self).__init__() 49 | self.variance_epsilon = eps 50 | self.eps = eps # Added to support GemmaRMSNorm 51 | self.weight = torch.nn.Parameter(torch.ones(hidden_size)) 52 | 53 | def forward(self, hidden_states): 54 | return CustomRMSNormFunc.apply( 55 | hidden_states, self.weight, self.variance_epsilon if hasattr(self, "variance_epsilon") else self.eps 56 | ) 57 | 58 | 59 | class GemmaCustomRMSNormAIC(CustomRMSNormAIC): 60 | """ 61 | Modify the init function to add +1 to the weights 62 | """ 63 | 64 | def __qeff_init__(self): 65 | with torch.no_grad(): 66 | self.weight.copy_(self.weight + 1.0) 67 | -------------------------------------------------------------------------------- /QEfficient/exporter/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/finetune/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | 9 | -------------------------------------------------------------------------------- /QEfficient/finetune/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | 9 | -------------------------------------------------------------------------------- /QEfficient/finetune/configs/dataset_config.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from dataclasses import dataclass 9 | 10 | 11 | @dataclass 12 | class samsum_dataset: 13 | dataset: str = "samsum_dataset" 14 | train_split: str = "train" 15 | test_split: str = "validation" 16 | 17 | 18 | @dataclass 19 | class grammar_dataset: 20 | dataset: str = "grammar_dataset" 21 | train_split: str = "train" 22 | test_split: str = "validation" 23 | 24 | 25 | @dataclass 26 | class alpaca_dataset: 27 | dataset: str = "alpaca_dataset" 28 | train_split: str = "train" 29 | test_split: str = "val" 30 | data_path: str = "dataset/alpaca_data.json" 31 | 32 | 33 | @dataclass 34 | class gsm8k_dataset: 35 | dataset: str = "gsm8k_dataset" 36 | train_split: str = "train" 37 | test_split: str = "test" 38 | 39 | 40 | @dataclass 41 | class imdb_dataset: 42 | dataset: str = "imdb_dataset" 43 | train_split: str = "train" 44 | test_split: str = "test" 45 | num_labels: int = 2 46 | 47 | 48 | @dataclass 49 | class custom_dataset: 50 | dataset: str = "custom_dataset" 51 | file: str = "dataset/custom_dataset.py" 52 | train_split: str = "train" 53 | test_split: str = "validation" 54 | data_path: str = "" 55 | -------------------------------------------------------------------------------- /QEfficient/finetune/configs/peft_config.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from dataclasses import dataclass, field 9 | from typing import List 10 | 11 | 12 | @dataclass 13 | class LoraConfig: 14 | """LoRA-specific configuration for parameter-efficient fine-tuning. 15 | 16 | Attributes: 17 | r (int): LoRA rank (default: 8). 18 | lora_alpha (int): LoRA scaling factor (default: 32). 19 | target_modules (List[str]): Modules to apply LoRA to (default: ["q_proj", "v_proj"]). 20 | bias (str): Bias handling in LoRA (default: "none"). 21 | task_type (str): Task type for LoRA (default: "CAUSAL_LM"). 22 | lora_dropout (float): Dropout rate for LoRA (default: 0.0). 23 | inference_mode (bool): Whether model is in inference mode (default: False). 
24 | """ 25 | 26 | r: int = 8 27 | lora_alpha: int = 32 28 | target_modules: List[str] = field(default_factory=lambda: ["q_proj", "v_proj"]) 29 | bias: str = "none" 30 | task_type: str = "CAUSAL_LM" 31 | lora_dropout: float = 0.05 32 | inference_mode: bool = False # should be False for finetuning 33 | 34 | 35 | # CAUTION prefix tuning is currently not supported 36 | @dataclass 37 | class PrefixConfig: 38 | num_virtual_tokens: int = 30 39 | task_type: str = "CAUSAL_LM" 40 | -------------------------------------------------------------------------------- /QEfficient/finetune/data/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/finetune/data/sampler.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import random 9 | from itertools import islice 10 | 11 | import numpy as np 12 | import torch 13 | 14 | 15 | class LengthBasedBatchSampler(torch.utils.data.BatchSampler): 16 | def __init__(self, data_source, batch_size: int, drop_last: bool, shuffle: bool = True) -> None: 17 | if isinstance(next(iter(data_source)), dict): 18 | first_key = next(iter(next(iter(data_source)).keys())) 19 | self.lengths = [len(d[first_key]) for d in data_source] 20 | else: 21 | self.lengths = [len(d) for d in data_source] 22 | self.batch_size = batch_size 23 | self.drop_last = drop_last 24 | self.shuffle = shuffle 25 | 26 | def __iter__(self): 27 | ids = np.argsort(self.lengths, kind="mergesort") 28 | if self.drop_last: 29 | ids = ids[: len(ids) // self.batch_size * self.batch_size] 30 | 31 | batches = [ids[i : i + self.batch_size] for i in range(0, len(ids), self.batch_size)] 32 | 33 | if self.shuffle: 34 | random.shuffle(batches) 35 | 36 | for b in batches: 37 | yield b 38 | 39 | def __len__(self): 40 | if self.drop_last: 41 | return len(self.lengths) // self.batch_size 42 | else: 43 | return len(self.lengths) // self.batch_size + (len(self.lengths) % self.batch_size > 0) 44 | 45 | 46 | class DistributedLengthBasedBatchSampler(torch.utils.data.BatchSampler): 47 | def __init__( 48 | self, data_source, batch_size: int, num_replicas: int, rank: int, shuffle: bool = True, seed: int = 0 49 | ) -> None: 50 | random.seed(seed) 51 | self.batch_sampler = LengthBasedBatchSampler( 52 | data_source, batch_size=batch_size, drop_last=True, shuffle=shuffle 53 | ) 54 | self.num_replicas = num_replicas 55 | self.rank = rank 56 | 57 | def __iter__(self): 58 | max_length = len(self.batch_sampler) // self.num_replicas * self.num_replicas 59 | return islice(self.batch_sampler, self.rank, max_length, self.num_replicas) 60 | 61 | def __len__(self): 62 | return len(self.batch_sampler) // self.num_replicas 63 | -------------------------------------------------------------------------------- /QEfficient/finetune/dataset/__init__.py: 
-------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/finetune/dataset/alpaca_dataset.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import copy 9 | import json 10 | 11 | import torch 12 | from torch.utils.data import Dataset 13 | 14 | PROMPT_DICT = { 15 | "prompt_input": ( 16 | "Below is an instruction that describes a task, paired with an input that provides further context. " 17 | "Write a response that appropriately completes the request.\n\n" 18 | "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" 19 | ), 20 | "prompt_no_input": ( 21 | "Below is an instruction that describes a task. " 22 | "Write a response that appropriately completes the request.\n\n" 23 | "### Instruction:\n{instruction}\n\n### Response:" 24 | ), 25 | } 26 | 27 | 28 | class InstructionDataset(Dataset): 29 | def __init__(self, dataset_config, tokenizer, partition="train", context_length=None): 30 | self.ann = json.load(open(dataset_config.data_path)) 31 | # Use 5% of the dataset for evaluation 32 | eval_length = int(len(self.ann) / 20) 33 | if partition == "train": 34 | self.ann = self.ann[eval_length:] 35 | else: 36 | self.ann = self.ann[:eval_length] 37 | 38 | self.tokenizer = tokenizer 39 | self.context_length = context_length 40 | 41 | def __len__(self): 42 | return len(self.ann) 43 | 44 | def __getitem__(self, index): 45 | IGNORE_INDEX = -100 # The default setting 46 | 47 | ann = self.ann[index] 48 | if ann.get("input", "") == "": 49 | prompt = PROMPT_DICT["prompt_no_input"].format_map(ann) 50 | else: 51 | prompt = PROMPT_DICT["prompt_input"].format_map(ann) 52 | example = prompt + ann["output"] 53 | prompt = torch.tensor( 54 | self.tokenizer.encode(prompt, max_length=self.context_length, pad_to_max_length=True), dtype=torch.int64 55 | ) 56 | example = self.tokenizer.encode(example, max_length=self.context_length, pad_to_max_length=True) 57 | example.append(self.tokenizer.eos_token_id) 58 | example = torch.tensor(example, dtype=torch.int64) 59 | labels = copy.deepcopy(example) 60 | labels[: len(prompt)] = -1 61 | example_mask = example.ge(0) 62 | label_mask = labels.ge(0) 63 | example[~example_mask] = 0 64 | labels[~label_mask] = IGNORE_INDEX 65 | 66 | return { 67 | "input_ids": example.tolist(), 68 | "labels": labels.tolist(), 69 | "attention_mask": example_mask.tolist(), 70 | } 71 | -------------------------------------------------------------------------------- /QEfficient/finetune/dataset/custom_dataset.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import importlib 9 | from pathlib import Path 10 | 11 | 12 | def load_module_from_py_file(py_file: str) -> object: 13 | """ 14 | This method loads a module from a py file which is not in the Python path 15 | """ 16 | module_name = Path(py_file).name 17 | loader = importlib.machinery.SourceFileLoader(module_name, py_file) 18 | spec = importlib.util.spec_from_loader(module_name, loader) 19 | module = importlib.util.module_from_spec(spec) 20 | 21 | loader.exec_module(module) 22 | 23 | return module 24 | 25 | 26 | def get_custom_dataset(dataset_config, tokenizer, split: str): 27 | if ":" in dataset_config.file: 28 | module_path, func_name = dataset_config.file.split(":") 29 | else: 30 | module_path, func_name = dataset_config.file, "get_custom_dataset" 31 | 32 | if not module_path.endswith(".py"): 33 | raise ValueError(f"Dataset file {module_path} is not a .py file.") 34 | 35 | module_path = Path(module_path) 36 | if not module_path.is_file(): 37 | raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") 38 | 39 | module = load_module_from_py_file(module_path.as_posix()) 40 | try: 41 | return getattr(module, func_name)(dataset_config, tokenizer, split) 42 | except AttributeError as e: 43 | print( 44 | f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})." 45 | ) 46 | raise e 47 | 48 | 49 | def get_data_collator(dataset_processer, dataset_config): 50 | if ":" in dataset_config.file: 51 | module_path, func_name = dataset_config.file.split(":") 52 | else: 53 | module_path, func_name = dataset_config.file, "get_data_collator" 54 | 55 | if not module_path.endswith(".py"): 56 | raise ValueError(f"Dataset file {module_path} is not a .py file.") 57 | 58 | module_path = Path(module_path) 59 | if not module_path.is_file(): 60 | raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") 61 | 62 | module = load_module_from_py_file(module_path.as_posix()) 63 | try: 64 | return getattr(module, func_name)(dataset_processer) 65 | except AttributeError: 66 | print(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).") 67 | print("Using the default data_collator instead.") 68 | return None 69 | -------------------------------------------------------------------------------- /QEfficient/finetune/dataset/dataset_config.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from functools import partial 9 | 10 | from QEfficient.finetune.dataset.alpaca_dataset import ( 11 | InstructionDataset as get_alpaca_dataset, 12 | ) 13 | from QEfficient.finetune.dataset.custom_dataset import ( 14 | get_custom_dataset, 15 | get_data_collator, 16 | ) 17 | from QEfficient.finetune.dataset.grammar_dataset import ( 18 | get_dataset as get_grammar_dataset, 19 | ) 20 | from QEfficient.finetune.dataset.gsm8k_dataset import get_gsm8k_dataset 21 | from QEfficient.finetune.dataset.imdb_dataset import ( 22 | get_preprocessed_imdb as get_imdb_dataset, 23 | ) 24 | from QEfficient.finetune.dataset.samsum_dataset import ( 25 | get_preprocessed_samsum as get_samsum_dataset, 26 | ) 27 | 28 | DATASET_PREPROC = { 29 | "alpaca_dataset": partial(get_alpaca_dataset), 30 | "grammar_dataset": get_grammar_dataset, 31 | "samsum_dataset": get_samsum_dataset, 32 | "gsm8k_dataset": get_gsm8k_dataset, 33 | "custom_dataset": get_custom_dataset, 34 | "imdb_dataset": get_imdb_dataset, 35 | } 36 | DATALOADER_COLLATE_FUNC = { 37 | "custom_dataset": get_data_collator, 38 | } 39 | -------------------------------------------------------------------------------- /QEfficient/finetune/dataset/grammar_dataset.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from pathlib import Path 9 | 10 | from datasets import load_dataset 11 | from torch.utils.data import Dataset 12 | 13 | 14 | class grammar(Dataset): 15 | def __init__(self, tokenizer, csv_name=None, context_length=None): 16 | try: 17 | self.dataset = load_dataset( 18 | "csv", 19 | data_files={"train": [csv_name]}, # "eval": "grammar_validation.csv"}, 20 | delimiter=",", 21 | ) 22 | except Exception as e: 23 | print( 24 | "Loading of grammar dataset failed! Please see [here](https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset." 
25 | )
26 | raise e
27 | 
28 | self.context_length = context_length
29 | self.tokenizer = tokenizer
30 | self.print_text = False # print_text
31 | 
32 | def __len__(self):
33 | return self.dataset["train"].shape[0]
34 | 
35 | def convert_to_features(self, example_batch):
36 | # Create prompt and tokenize contexts and questions
37 | 
38 | if self.print_text:
39 | print("Input Text: ", self.clean_text(example_batch["text"]))
40 | 
41 | input_ = example_batch["input"]
42 | target_ = example_batch["target"]
43 | 
44 | prompt = f"Correct this to standard English: {input_}\n---\nCorrected: "
45 | prompt_ids = self.tokenizer.encode(
46 | self.tokenizer.bos_token + prompt,
47 | add_special_tokens=False,
48 | max_length=self.context_length,
49 | pad_to_max_length=True,
50 | )
51 | label_ids = self.tokenizer.encode(
52 | target_ + self.tokenizer.eos_token,
53 | add_special_tokens=False,
54 | max_length=self.context_length,
55 | pad_to_max_length=True,
56 | )
57 | 
58 | sample = {
59 | "input_ids": prompt_ids + label_ids,
60 | "attention_mask": [1] * len(prompt_ids + label_ids),
61 | "labels": [-100] * len(prompt_ids) + label_ids,
62 | }
63 | 
64 | return sample
65 | 
66 | def __getitem__(self, index):
67 | return self.convert_to_features(self.dataset["train"][int(index)])
68 | 
69 | 
70 | def get_dataset(dataset_config, tokenizer, csv_name=None, context_length=None):
71 | """Wrapper function that handles loading of the grammar dataset from the expected local CSV path."""
72 | 
73 | currPath = Path.cwd() / "datasets_grammar" / "grammar_train.csv"
74 | print(f"Loading dataset {currPath}")
75 | csv_name = str(currPath)
76 | print(csv_name)
77 | dataset = grammar(tokenizer=tokenizer, csv_name=csv_name, context_length=context_length)
78 | 
79 | return dataset
80 | 
-------------------------------------------------------------------------------- /QEfficient/finetune/dataset/gsm8k_dataset.py: --------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | #
3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | #
6 | # -----------------------------------------------------------------------------
7 | 
8 | from typing import Dict
9 | 
10 | from datasets import Dataset, load_dataset
11 | 
12 | default_instruction = """### Instruction: Solve the math question using a basic calculator.
13 | Calculator can be invoked using the format: <<expression=answer>>.
14 | "expression" can be one of the 4 arithmetic operations, and "answer" will be filled in for you.
15 | Example: <<20+30=50>>
16 | 
17 | ### Question: {question}
18 | 
19 | ### Answer: """
20 | 
21 | 
22 | def tokenize_and_mask(row: Dict[str, str], *, tokenizer, instruction) -> Dict[str, list]:
23 | start_tokens = {tokenizer(x, add_special_tokens=False)["input_ids"][0] for x in ["<<", " <<"]}
24 | equal_tokens = {tokenizer(x, add_special_tokens=False)["input_ids"][0] for x in ["=", " ="]}
25 | end_tokens = {tokenizer(x, add_special_tokens=False)["input_ids"][0] for x in [">>"]}
26 | 
27 | input_str = tokenizer.bos_token + instruction.format(**row)
28 | ques_ids = tokenizer(input_str, add_special_tokens=False, return_attention_mask=False)["input_ids"]
29 | ans_ids = tokenizer(row["answer"] + tokenizer.eos_token, add_special_tokens=False, return_attention_mask=False)[
30 | "input_ids"
31 | ]
32 | input_ids = ques_ids + ans_ids
33 | 
34 | # State machine to recognize <<expression=answer>> and mask answer
35 | mode = 0
36 | for i, token in enumerate(ans_ids):
37 | if mode == 0 and token in start_tokens:
38 | mode = 1
39 | elif mode == 1 and token in equal_tokens:
40 | mode = 2
41 | elif mode == 2:
42 | ans_ids[i] = -100
43 | if token in end_tokens:
44 | mode = 0
45 | 
46 | labels = [-100] * len(ques_ids) + ans_ids
47 | 
48 | inputs = {"input_ids": input_ids, "labels": labels}
49 | return inputs
50 | 
51 | 
52 | def pad_to_max_length(row: Dict[str, list], *, tokenizer, max_length: int) -> Dict[str, list]:
53 | length = len(row["input_ids"])
54 | return {
55 | "input_ids": row["input_ids"] + [tokenizer.pad_token_id] * (max_length - length),
56 | "attention_mask": [1] * length + [0] * (max_length - length),
57 | "labels": row["labels"] + [-100] * (max_length - length),
58 | }
59 | 
60 | 
61 | def get_gsm8k_dataset(
62 | dataset_config,
63 | tokenizer,
64 | split,
65 | context_length=None,
66 | instruction: str = default_instruction,
67 | ) -> Dataset:
68 | ds = load_dataset("openai/gsm8k", "main", split=split)
69 | ds = ds.map(
70 | tokenize_and_mask,
71 | fn_kwargs={"tokenizer": tokenizer, "instruction": instruction},
72 | remove_columns=["question", "answer"],
73 | )
74 | 
75 | if context_length is not None:
76 | ds = ds.filter(lambda x: len(x["input_ids"]) <= context_length)
77 | ds = ds.map(
78 | pad_to_max_length,
79 | fn_kwargs={"tokenizer": tokenizer, "max_length": context_length},
80 | )
81 | 
82 | ds.set_format("torch")
83 | 
84 | return ds
85 | 
-------------------------------------------------------------------------------- /QEfficient/finetune/dataset/imdb_dataset.py: --------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | #
3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | #
6 | # -----------------------------------------------------------------------------
7 | 
8 | 
9 | from itertools import chain
10 | 
11 | import datasets
12 | 
13 | 
14 | def get_preprocessed_imdb(dataset_config, tokenizer, split, context_length=None):
15 | dataset = datasets.load_dataset("stanfordnlp/imdb", split=split, trust_remote_code=True)
16 | 
17 | if split == "test":
18 | # Test set contains 25000 samples. Not all are required.
19 | # 0-12499 are 0 labeled samples, 12500-24999 are 1 labeled samples.
20 | dataset = dataset.select(chain(range(0, 500), range(12500, 13000)))
21 | 
22 | # Need to shuffle dataset as all the 0 labeled data is organized first and then all the 1 labeled data.
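# For the test split, the select() above keeps 500 label-0 and 500 label-1 reviews so the evaluation subset stays balanced; the fixed seed keeps the shuffle reproducible across runs.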
23 | dataset = dataset.shuffle(seed=42) 24 | 25 | if tokenizer.pad_token is None: 26 | tokenizer.add_special_tokens({"pad_token": "[PAD]"}) 27 | 28 | def tokenize_add_label(sample): 29 | data = tokenizer( 30 | sample["text"], 31 | add_special_tokens=True, 32 | max_length=tokenizer.model_max_length, 33 | ) 34 | 35 | data["labels"] = [sample["label"]] 36 | return data 37 | 38 | dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset.features)) 39 | return dataset 40 | -------------------------------------------------------------------------------- /QEfficient/finetune/dataset/samsum_dataset.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import datasets 9 | 10 | 11 | def get_preprocessed_samsum(dataset_config, tokenizer, split, context_length=None): 12 | dataset = datasets.load_dataset("Samsung/samsum", split=split, trust_remote_code=True) 13 | 14 | prompt = "Summarize this dialog:\n{dialog}\n---\nSummary:\n" 15 | 16 | def apply_prompt_template(sample): 17 | return { 18 | "prompt": prompt.format(dialog=sample["dialogue"]), 19 | "summary": sample["summary"], 20 | } 21 | 22 | dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features)) 23 | 24 | def tokenize_add_label(sample): 25 | prompt = tokenizer.encode( 26 | tokenizer.bos_token + sample["prompt"], 27 | add_special_tokens=False, 28 | max_length=context_length, 29 | pad_to_max_length=True, 30 | ) 31 | summary = tokenizer.encode( 32 | sample["summary"] + tokenizer.eos_token, 33 | add_special_tokens=False, 34 | max_length=context_length, 35 | pad_to_max_length=True, 36 | ) 37 | 38 | sample = { 39 | "input_ids": prompt + summary, 40 | "attention_mask": [1] * (len(prompt) + len(summary)), 41 | "labels": [-100] * len(prompt) + summary, 42 | } 43 | 44 | return sample 45 | 46 | dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset.features)) 47 | 48 | return dataset 49 | -------------------------------------------------------------------------------- /QEfficient/finetune/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/finetune/utils/dataset_utils.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import torch 9 | 10 | # from QEfficient.finetune.data.concatenator import ConcatDataset 11 | from QEfficient.finetune.dataset.dataset_config import DATALOADER_COLLATE_FUNC, DATASET_PREPROC 12 | from QEfficient.finetune.utils.config_utils import get_dataloader_kwargs 13 | 14 | 15 | def get_preprocessed_dataset( 16 | tokenizer, dataset_config, split: str = "train", context_length: int = None 17 | ) -> torch.utils.data.Dataset: 18 | if dataset_config.dataset not in DATASET_PREPROC: 19 | raise NotImplementedError(f"{dataset_config.dataset} is not (yet) implemented") 20 | 21 | def get_split(): 22 | return dataset_config.train_split if split == "train" else dataset_config.test_split 23 | 24 | return DATASET_PREPROC[dataset_config.dataset](dataset_config, tokenizer, get_split(), context_length) 25 | 26 | 27 | def get_custom_data_collator(dataset_processer, dataset_config) -> torch.utils.data.Dataset: 28 | if dataset_config.dataset not in DATALOADER_COLLATE_FUNC: 29 | return None 30 | 31 | return DATALOADER_COLLATE_FUNC[dataset_config.dataset](dataset_processer, dataset_config) 32 | 33 | 34 | def get_dataloader(tokenizer, dataset_config, train_config, split: str = "train"): 35 | dataset = get_preprocessed_dataset(tokenizer, dataset_config, split) 36 | dl_kwargs = get_dataloader_kwargs(train_config, dataset, tokenizer, split) 37 | 38 | # if split == "train" and train_config.batching_strategy == "packing": 39 | # dataset = ConcatDataset(dataset, chunk_size=train_config.context_length) 40 | 41 | # Create data loader 42 | dataloader = torch.utils.data.DataLoader( 43 | dataset, 44 | num_workers=train_config.num_workers_dataloader, 45 | pin_memory=True, 46 | **dl_kwargs, 47 | ) 48 | return dataloader 49 | -------------------------------------------------------------------------------- /QEfficient/finetune/utils/plot_metrics.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import argparse 9 | import json 10 | import os 11 | 12 | import matplotlib.pyplot as plt 13 | 14 | 15 | def plot_metric(data, metric_name, x_label, y_label, title, colors): 16 | plt.figure(figsize=(7, 6)) 17 | 18 | plt.plot( 19 | data[f"train_epoch_{metric_name}"], 20 | label=f"Train Epoch {metric_name.capitalize()}", 21 | color=colors[0], 22 | ) 23 | plt.plot( 24 | data[f"val_epoch_{metric_name}"], 25 | label=f"Validation Epoch {metric_name.capitalize()}", 26 | color=colors[1], 27 | ) 28 | plt.xlabel(x_label) 29 | plt.ylabel(y_label) 30 | plt.title(f"Train and Validation Epoch {title}") 31 | plt.legend() 32 | plt.tight_layout() 33 | 34 | 35 | def plot_single_metric_by_step(data, metric_name, x_label, y_label, title, color): 36 | plt.plot(data[f"{metric_name}"], label=f"{title}", color=color) 37 | plt.xlabel(x_label) 38 | plt.ylabel(y_label) 39 | plt.title(title) 40 | plt.legend() 41 | plt.tight_layout() 42 | 43 | 44 | def plot_metrics_by_step(data, metric_name, x_label, y_label, colors): 45 | plt.figure(figsize=(14, 6)) 46 | 47 | plt.subplot(1, 2, 1) 48 | plot_single_metric_by_step( 49 | data, 50 | f"train_step_{metric_name}", 51 | x_label, 52 | y_label, 53 | f"Train Step {metric_name.capitalize()}", 54 | colors[0], 55 | ) 56 | plt.subplot(1, 2, 2) 57 | plot_single_metric_by_step( 58 | data, 59 | f"val_step_{metric_name}", 60 | x_label, 61 | y_label, 62 | f"Validation Step {metric_name.capitalize()}", 63 | colors[1], 64 | ) 65 | plt.tight_layout() 66 | 67 | 68 | def plot_metrics(file_path): 69 | if not os.path.exists(file_path): 70 | print(f"File {file_path} does not exist.") 71 | return 72 | 73 | with open(file_path, "r") as f: 74 | try: 75 | data = json.load(f) 76 | except json.JSONDecodeError: 77 | print("Invalid JSON file.") 78 | return 79 | 80 | directory = os.path.dirname(file_path) 81 | filename_prefix = os.path.basename(file_path).split(".")[0] 82 | 83 | plot_metric(data, "loss", "Epoch", "Loss", "Loss", ["b", "r"]) 84 | plt.savefig(os.path.join(directory, f"{filename_prefix}_train_and_validation_loss.png")) 85 | plt.close() 86 | 87 | plot_metric(data, "perplexity", "Epoch", "Perplexity", "Perplexity", ["g", "m"]) 88 | plt.savefig(os.path.join(directory, f"{filename_prefix}_train_and_validation_perplexity.png")) 89 | plt.close() 90 | 91 | plot_metrics_by_step(data, "loss", "Step", "Loss", ["b", "r"]) 92 | plt.savefig(os.path.join(directory, f"{filename_prefix}_train_and_validation_loss_by_step.png")) 93 | plt.close() 94 | 95 | plot_metrics_by_step(data, "perplexity", "Step", "Loss", ["g", "m"]) 96 | plt.savefig(os.path.join(directory, f"{filename_prefix}_train_and_validation_perplexity_by_step.png")) 97 | plt.close() 98 | 99 | 100 | if __name__ == "__main__": 101 | parser = argparse.ArgumentParser(description="Plot metrics from JSON file.") 102 | parser.add_argument("--file_path", required=True, type=str, help="Path to the metrics JSON file.") 103 | args = parser.parse_args() 104 | 105 | plot_metrics(args.file_path) 106 | -------------------------------------------------------------------------------- /QEfficient/generation/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/peft/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from QEfficient.peft.auto import QEffAutoPeftModelForCausalLM 9 | from QEfficient.peft.peft_model import QEffPeftModelForCausalLM 10 | 11 | __all__ = [ 12 | "QEffAutoPeftModelForCausalLM", 13 | "QEffPeftModelForCausalLM", 14 | ] 15 | -------------------------------------------------------------------------------- /QEfficient/peft/lora/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | from QEfficient.peft.lora.auto import QEffAutoLoraModelForCausalLM 9 | 10 | __all__ = [ 11 | "QEffAutoLoraModelForCausalLM", 12 | ] 13 | -------------------------------------------------------------------------------- /QEfficient/peft/lora/layers.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import math 9 | from typing import Any 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | from QEfficient.customop import CtxGatherFuncCB 16 | 17 | 18 | class LinearMultiLoRA(nn.Linear): 19 | def multilora_init(self, lora_rank, max_num_adapters): 20 | if lora_rank < 1 or max_num_adapters < 1: 21 | raise ValueError("lora_rank and max_num_adapters must be greater or equal to 1") 22 | 23 | self.max_num_adapters = max_num_adapters 24 | self.lora_rank = lora_rank 25 | 26 | self.lora_a_weights = nn.Parameter( 27 | self.weight.new_zeros(self.max_num_adapters + 1, 1, self.in_features, self.lora_rank) 28 | ) 29 | self.lora_a_weights.requires_grad = False 30 | self.lora_b_weights = nn.Parameter( 31 | self.weight.new_zeros(self.max_num_adapters + 1, 1, self.lora_rank, self.out_features) 32 | ) 33 | self.lora_b_weights.requires_grad = False 34 | self.lora_scalings = torch.full((self.max_num_adapters + 1, 1, 1, 1), 1.0, dtype=torch.float) 35 | 36 | nn.init.kaiming_uniform_(self.lora_a_weights, a=math.sqrt(5)) 37 | nn.init.zeros_(self.lora_b_weights) 38 | 39 | def forward(self, x: torch.Tensor, lora_ids: torch.Tensor): 40 | result = F.linear(x, self.weight, bias=self.bias) 41 | 42 | # multilora implementation: lora_ids 43 | other_indices_a = torch.arange(self.lora_a_weights.shape[2]).view(1, 1, -1) 44 | selected_lora_a_weights = CtxGatherFuncCB.apply( 45 | self.lora_a_weights, lora_ids, other_indices_a 46 | ) # 47 | other_indices_b = torch.arange(self.lora_b_weights.shape[2]).view(1, 1, -1) 48 | selected_lora_b_weights = CtxGatherFuncCB.apply( 49 | self.lora_b_weights, lora_ids, other_indices_b 50 | ) # 51 | other_indices_s = torch.arange(self.lora_scalings.shape[2]).view(1, 1, -1) 52 | selected_lora_scalings = CtxGatherFuncCB.apply( 53 | self.lora_scalings, lora_ids, other_indices_s 54 | ) # 55 | 56 | selected_lora_a_weights = selected_lora_a_weights.squeeze(1) 57 | selected_lora_b_weights = selected_lora_b_weights.squeeze(1) 58 | selected_lora_scalings = selected_lora_scalings.squeeze(1) 59 | 60 | result = result + x @ selected_lora_a_weights @ selected_lora_b_weights * selected_lora_scalings 61 | 62 | return result 63 | 64 | 65 | class LinearBase(nn.Linear): 66 | def forward(self, x: torch.Tensor, **kwargs: Any): 67 | return super().forward(x) 68 | -------------------------------------------------------------------------------- /QEfficient/peft/lora/lora_model.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | from typing import List, Optional, Tuple, Union 9 | 10 | import torch 11 | from transformers.modeling_outputs import ( 12 | CausalLMOutputWithPast, 13 | ) 14 | 15 | from QEfficient.transformers.models.llama.modeling_llama import QEffLlamaForCausalLM 16 | from QEfficient.transformers.models.mistral.modeling_mistral import QEffMistralForCausalLM 17 | 18 | 19 | class QEffLoraModelMistralForCausalLM(QEffMistralForCausalLM): 20 | def forward( 21 | self, 22 | input_ids: torch.LongTensor = None, 23 | attention_mask: Optional[torch.Tensor] = None, 24 | position_ids: Optional[torch.LongTensor] = None, 25 | past_key_values: Optional[List[torch.FloatTensor]] = None, 26 | batch_index: Optional[torch.LongTensor] = None, 27 | inputs_embeds: Optional[torch.FloatTensor] = None, 28 | labels: Optional[torch.LongTensor] = None, 29 | use_cache: Optional[bool] = None, 30 | output_attentions: Optional[bool] = None, 31 | output_hidden_states: Optional[bool] = None, 32 | return_dict: Optional[bool] = None, 33 | cache_position: Optional[torch.LongTensor] = None, 34 | lora_ids: Optional[torch.Tensor] = None, 35 | **kwargs, 36 | ) -> Union[Tuple, CausalLMOutputWithPast]: 37 | kwargs["lora_ids"] = lora_ids 38 | 39 | return super().forward( 40 | input_ids=input_ids, 41 | attention_mask=attention_mask, 42 | position_ids=position_ids, 43 | past_key_values=past_key_values, 44 | batch_index=batch_index, 45 | inputs_embeds=inputs_embeds, 46 | use_cache=use_cache, 47 | output_attentions=output_attentions, 48 | output_hidden_states=output_hidden_states, 49 | return_dict=return_dict, 50 | cache_position=cache_position, 51 | **kwargs, 52 | ) 53 | 54 | 55 | class QEffLoraModelLlamaForCausalLM(QEffLlamaForCausalLM): 56 | def forward( 57 | self, 58 | input_ids: torch.LongTensor = None, 59 | attention_mask: Optional[torch.Tensor] = None, 60 | position_ids: Optional[torch.LongTensor] = None, 61 | past_key_values: Optional[List[torch.FloatTensor]] = None, 62 | batch_index: Optional[torch.LongTensor] = None, 63 | inputs_embeds: Optional[torch.FloatTensor] = None, 64 | labels: Optional[torch.LongTensor] = None, 65 | use_cache: Optional[bool] = None, 66 | output_attentions: Optional[bool] = None, 67 | output_hidden_states: Optional[bool] = None, 68 | return_dict: Optional[bool] = None, 69 | cache_position: Optional[torch.LongTensor] = None, 70 | lora_ids: Optional[torch.Tensor] = None, 71 | **kwargs, 72 | ) -> Union[Tuple, CausalLMOutputWithPast]: 73 | kwargs["lora_ids"] = lora_ids 74 | 75 | return super().forward( 76 | input_ids=input_ids, 77 | attention_mask=attention_mask, 78 | position_ids=position_ids, 79 | past_key_values=past_key_values, 80 | batch_index=batch_index, 81 | inputs_embeds=inputs_embeds, 82 | use_cache=use_cache, 83 | output_attentions=output_attentions, 84 | output_hidden_states=output_hidden_states, 85 | return_dict=return_dict, 86 | cache_position=cache_position, 87 | **kwargs, 88 | ) 89 | -------------------------------------------------------------------------------- /QEfficient/peft/lora/pytorch_transforms.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | from typing import Dict, Optional, Tuple 9 | 10 | from torch import nn 11 | 12 | from QEfficient.base.pytorch_transforms import ModuleMappingTransform 13 | from QEfficient.peft.lora.layers import LinearBase, LinearMultiLoRA 14 | from QEfficient.peft.lora.lora_model import QEffLoraModelLlamaForCausalLM, QEffLoraModelMistralForCausalLM 15 | from QEfficient.transformers.models.llama.modeling_llama import QEffLlamaForCausalLM 16 | from QEfficient.transformers.models.mistral.modeling_mistral import QEffMistralForCausalLM 17 | 18 | 19 | class LoraModelInputsTransform(ModuleMappingTransform): 20 | _module_mapping = { 21 | QEffMistralForCausalLM: QEffLoraModelMistralForCausalLM, 22 | QEffLlamaForCausalLM: QEffLoraModelLlamaForCausalLM, 23 | } 24 | 25 | 26 | class TargetModulesTransform(ModuleMappingTransform): 27 | _module_mapping = {nn.Linear: LinearMultiLoRA} 28 | 29 | _module_mapping_nontarget = {nn.Linear: LinearBase} 30 | 31 | # whole set of supported target modules for now (make sure **kwargs are passed in on modeling file) 32 | all_modules = {"q_proj", "k_proj", "v_proj", "o_proj"} 33 | 34 | # a class method that deals with target module names 35 | @classmethod 36 | def apply( 37 | cls, model: nn.Module, target_modules: Optional[Dict], lora_rank: int, max_num_adapters: int 38 | ) -> Tuple[nn.Module, bool]: 39 | transformed = False 40 | nontarget_modules = {key for key in cls.all_modules if key not in target_modules} 41 | 42 | for name, module in model.named_modules(): 43 | if repl_module := cls._module_mapping.get(type(module)): 44 | if name.split(".")[-1] in target_modules: 45 | module.__class__ = repl_module 46 | if hasattr(module, "multilora_init"): 47 | module.multilora_init(lora_rank, max_num_adapters) 48 | transformed = True 49 | elif name.split(".")[-1] in nontarget_modules: 50 | module.__class__ = cls._module_mapping_nontarget.get(type(module)) 51 | transformed = True 52 | 53 | return model, transformed 54 | -------------------------------------------------------------------------------- /QEfficient/peft/onnx_transforms.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | from typing import Tuple 9 | 10 | import onnx 11 | 12 | from QEfficient.base.onnx_transforms import OnnxTransform 13 | 14 | 15 | class AdapterWeightsToInputsTransform(OnnxTransform): 16 | @classmethod 17 | def apply(cls, model: onnx.ModelProto, *, adapter_name: str, **kwargs) -> Tuple[onnx.ModelProto, bool]: 18 | transformed = False 19 | removed_initializers = [] 20 | 21 | # Find nodes with lora weights as inputs 22 | weight_suffix = f".{adapter_name}.weight" 23 | lora_weight_nodes = { 24 | inp: node for node in model.graph.node for inp in node.input if inp.endswith(weight_suffix) 25 | } 26 | 27 | for i, weight in enumerate(model.graph.initializer): 28 | if weight.name.endswith(weight_suffix): 29 | transformed = True 30 | 31 | # Create input/output for lora weights 32 | new_weight_name = weight.name[: -len(weight_suffix)] + ".weight" 33 | type_proto = onnx.helper.make_tensor_type_proto(weight.data_type, shape=list(weight.dims)) 34 | inp = onnx.ValueInfoProto(name=new_weight_name, type=type_proto) 35 | out = onnx.ValueInfoProto(name=new_weight_name + "_RetainedState", type=type_proto) 36 | model.graph.input.append(inp) 37 | model.graph.output.append(out) 38 | 39 | # Create a node that connects input -> output 40 | node = onnx.helper.make_node("Identity", [inp.name], [out.name], new_weight_name + "_identity") 41 | model.graph.node.append(node) 42 | 43 | # Rename weight input 44 | lora_weight_node = lora_weight_nodes[weight.name] 45 | for j, inp in enumerate(lora_weight_node.input): 46 | if inp == weight.name: 47 | lora_weight_node.input[j] = new_weight_name 48 | 49 | # Remove weight initializers 50 | removed_initializers.append(i) 51 | 52 | if transformed: 53 | for i in sorted(removed_initializers, reverse=True): 54 | model.graph.initializer.pop(i) 55 | 56 | return model, transformed 57 | -------------------------------------------------------------------------------- /QEfficient/peft/peft_model.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | from peft import PeftModelForCausalLM, PeftType 9 | 10 | 11 | class QEffPeftModelForCausalLM(PeftModelForCausalLM): 12 | def forward( 13 | self, 14 | input_ids=None, 15 | attention_mask=None, 16 | position_ids=None, 17 | past_key_values=None, 18 | inputs_embeds=None, 19 | labels=None, 20 | output_attentions=None, 21 | output_hidden_states=None, 22 | return_dict=None, 23 | task_ids=None, 24 | **kwargs, 25 | ): 26 | peft_config = self.active_peft_config 27 | if not peft_config.is_prompt_learning: 28 | if self.base_model.config.model_type == "mpt": 29 | if inputs_embeds is not None: 30 | raise AssertionError("forward in MPTForCausalLM does not support inputs_embeds") 31 | return self.base_model( 32 | input_ids=input_ids, 33 | attention_mask=attention_mask, 34 | position_ids=position_ids, 35 | past_key_values=past_key_values, 36 | labels=labels, 37 | output_attentions=output_attentions, 38 | output_hidden_states=output_hidden_states, 39 | return_dict=return_dict, 40 | **kwargs, 41 | ) 42 | 43 | if peft_config.peft_type == PeftType.POLY: 44 | kwargs["task_ids"] = task_ids 45 | 46 | with self._enable_peft_forward_hooks(**kwargs): 47 | kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} 48 | return self.base_model( 49 | input_ids=input_ids, 50 | attention_mask=attention_mask, 51 | position_ids=position_ids, 52 | past_key_values=past_key_values, 53 | inputs_embeds=inputs_embeds, 54 | labels=labels, 55 | output_attentions=output_attentions, 56 | output_hidden_states=output_hidden_states, 57 | return_dict=return_dict, 58 | **kwargs, 59 | ) 60 | 61 | raise NotImplementedError("Prompt learning methods are not supported from QEfficient") 62 | -------------------------------------------------------------------------------- /QEfficient/peft/pytorch_transforms.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | from peft import PeftModelForCausalLM 9 | 10 | from QEfficient.base.pytorch_transforms import ModuleMappingTransform 11 | from QEfficient.peft.peft_model import QEffPeftModelForCausalLM 12 | 13 | 14 | class PeftModelInputsTransform(ModuleMappingTransform): 15 | _module_mapping = {PeftModelForCausalLM: QEffPeftModelForCausalLM} 16 | -------------------------------------------------------------------------------- /QEfficient/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/modeling_attn_mask_utils.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. 
and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from typing import Optional 9 | 10 | import torch 11 | 12 | 13 | def _create_causal_mask( 14 | position_ids, 15 | target_length, 16 | sliding_window: Optional[int] = None, 17 | ): 18 | """ 19 | A utility attention mask class that allows one to: 20 | - Create a causal 4d mask 21 | - Create a causal 4d mask with slided window 22 | """ 23 | if sliding_window is not None: 24 | query_indices = position_ids.unsqueeze(-1) 25 | kv_indices = torch.arange(target_length).view(1, -1) 26 | # --- Rolling buffer --- 27 | pos_max = position_ids.max(1, keepdim=True).values 28 | kv_start = (pos_max // target_length) * target_length 29 | kv_indices_high = kv_indices + kv_start 30 | kv_indices_low = torch.where(kv_indices_high < target_length, kv_indices, kv_indices_high - target_length) 31 | kv_indices = torch.where(kv_indices_high > pos_max, kv_indices_low, kv_indices_high) 32 | kv_indices = kv_indices.unsqueeze(1) 33 | # ------ 34 | causal_mask = kv_indices > query_indices 35 | attention_mask = causal_mask 36 | 37 | window_indices = query_indices - sliding_window + 1 38 | window_mask = kv_indices < window_indices 39 | attention_mask = attention_mask | window_mask 40 | attention_mask = attention_mask.unsqueeze(1) 41 | else: 42 | query_indices = position_ids.unsqueeze(-1) 43 | kv_indices = torch.arange(target_length).view(1, 1, -1) 44 | attention_mask = kv_indices > query_indices 45 | attention_mask = attention_mask.unsqueeze(1) 46 | 47 | return attention_mask 48 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/codegen/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/falcon/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/gemma/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. 
and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/gemma2/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/gpt2/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/gpt_bigcode/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/gptj/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/granite/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/granitemoe/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/internvl/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/llama/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/llama_swiftkv/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/llava/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/llava_next/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/mistral/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/mixtral_moe/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/mllama/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/mpt/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/phi/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/phi3/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/qwen2/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/starcoder2/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/models/whisper/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/post_processing.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from QEfficient.transformers.spd.turbo import build_and_attach_turbo 9 | from QEfficient.utils.spd_utils import get_speculative_config, get_speculative_weights 10 | 11 | model_type_registry = dict(turbo=build_and_attach_turbo) 12 | 13 | 14 | def build_and_attach_mlp(model, pretrained_model_name_or_path, speculative_model_type: str, **kwargs): 15 | speculative_config: dict = get_speculative_config(pretrained_model_name_or_path, **kwargs) 16 | speculative_weights: str = get_speculative_weights(pretrained_model_name_or_path, **kwargs) 17 | 18 | if (model_type := speculative_config.get("model_type")) is None: 19 | speculative_config["model_type"] = speculative_model_type 20 | else: 21 | if model_type != speculative_model_type: 22 | raise ValueError( 23 | f"`model_type` key from speculator config ({model_type} does not match input model type ({speculative_model_type})." 24 | ) 25 | func = model_type_registry[speculative_model_type] 26 | model = func(model, speculative_config, speculative_weights) 27 | model.config.speculative_config = speculative_config 28 | return model 29 | -------------------------------------------------------------------------------- /QEfficient/transformers/quantizers/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /QEfficient/transformers/quantizers/auto.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | from transformers.quantizers.auto import AUTO_QUANTIZATION_CONFIG_MAPPING, AUTO_QUANTIZER_MAPPING 9 | from transformers.quantizers.quantizer_awq import AwqQuantizer 10 | from transformers.quantizers.quantizer_compressed_tensors import CompressedTensorsHfQuantizer 11 | from transformers.quantizers.quantizer_gptq import GptqHfQuantizer 12 | from transformers.utils.quantization_config import AwqConfig, CompressedTensorsConfig, GPTQConfig 13 | 14 | from QEfficient.transformers.quantizers.quantizer_awq import QEffAwqConfig, QEffAwqQuantizer 15 | from QEfficient.transformers.quantizers.quantizer_compressed_tensors import ( 16 | QEffCompressedTensorsConfig, 17 | QEffCompressedTensorsFP8Quantizer, 18 | QEffFP8Config, 19 | QEffFP8Quantizer, 20 | ) 21 | from QEfficient.transformers.quantizers.quantizer_gptq import QEffGPTQConfig, QEffGPTQQuantizer 22 | 23 | QEFF_AUTO_QUANTIZER_MAPPING = { 24 | "awq": QEffAwqQuantizer, 25 | "gptq": QEffGPTQQuantizer, 26 | "compressed-tensors": QEffCompressedTensorsFP8Quantizer, 27 | "fp8": QEffFP8Quantizer, 28 | } 29 | QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING = { 30 | "awq": QEffAwqConfig, 31 | "gptq": QEffGPTQConfig, 32 | "compressed-tensors": QEffCompressedTensorsConfig, 33 | "fp8": QEffFP8Config, 34 | } 35 | DUPLICATE_AUTO_QUANTIZER_MAPPING = { 36 | "awq": AwqQuantizer, 37 | "gptq": GptqHfQuantizer, 38 | "compressed-tensors": CompressedTensorsHfQuantizer, 39 | "fp8": None, 40 | } 41 | DUPLICATE_AUTO_QUANTIZATION_CONFIG_MAPPING = { 42 | "awq": AwqConfig, 43 | "gptq": GPTQConfig, 44 | "compressed-tensors": CompressedTensorsConfig, 45 | "fp8": None, 46 | } 47 | 48 | 49 | def with_replaced_quantizers(func): 50 | def wrapper(*args, **kwargs): 51 | transformers_replaced_quantization_config_mapping = dict() 52 | transformers_replaced_quantizer_mapping = dict() 53 | 54 | for k in QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.keys(): 55 | # Replace quantization config 56 | transformers_replaced_quantization_config_mapping[k] = AUTO_QUANTIZATION_CONFIG_MAPPING.get(k, None) 57 | AUTO_QUANTIZATION_CONFIG_MAPPING[k] = QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING[k] 58 | 59 | # Replace quantizer 60 | transformers_replaced_quantizer_mapping[k] = AUTO_QUANTIZER_MAPPING.get(k, None) 61 | AUTO_QUANTIZER_MAPPING[k] = QEFF_AUTO_QUANTIZER_MAPPING[k] 62 | 63 | # Call the function for loading quantized models here 64 | out = func(*args, **kwargs) 65 | 66 | # Put back quantization config and quantizer 67 | for k in QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.keys(): 68 | AUTO_QUANTIZATION_CONFIG_MAPPING[k] = transformers_replaced_quantization_config_mapping[k] 69 | AUTO_QUANTIZER_MAPPING[k] = transformers_replaced_quantizer_mapping[k] 70 | 71 | return out 72 | 73 | return wrapper 74 | 75 | 76 | def replace_transformers_quantizers(): 77 | """ 78 | This method lets you import AWQ/GPTQ models on CPU without bypassing the 79 | rule of transformers of need to GPU. 
80 | Just call this method before using 81 | `transformer.AutoModelForCausalLM.from_pretrained` and any AWQ/GPTQ model 82 | that can be supported by QEfficient will be loaded using CPU. 83 | """ 84 | AUTO_QUANTIZER_MAPPING.update(QEFF_AUTO_QUANTIZER_MAPPING) 85 | AUTO_QUANTIZATION_CONFIG_MAPPING.update(QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING) 86 | 87 | 88 | # TODO: Make this a fixture? Or better, always update the quantizer and config in transformers. 89 | # When a user imports QEfficient, these are always available. 90 | def undo_transformers_quantizers(): 91 | """ 92 | This method is used to undo the effects on method `replace_transformers_quantizers`. 93 | After this is called, the transformers library will be used for loading AWQ/GPTQ models. 94 | """ 95 | AUTO_QUANTIZER_MAPPING.update(DUPLICATE_AUTO_QUANTIZER_MAPPING) 96 | AUTO_QUANTIZATION_CONFIG_MAPPING.update(DUPLICATE_AUTO_QUANTIZATION_CONFIG_MAPPING) 97 | -------------------------------------------------------------------------------- /QEfficient/transformers/quantizers/awq.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import torch 9 | import torch.nn as nn 10 | 11 | from QEfficient.transformers.quantizers.quantizer_utils import dequantize_gemm 12 | 13 | 14 | class WQLinear_GEMM(nn.Module): 15 | def __init__(self, bits, group_size, in_features, out_features, bias): 16 | super().__init__() 17 | 18 | if bits != 4: 19 | raise NotImplementedError("Only 4-bit are supported for now.") 20 | 21 | self.in_features = in_features 22 | self.out_features = out_features 23 | self.bits = bits 24 | self.group_size = group_size if group_size != -1 else in_features 25 | 26 | # quick sanity check (make sure alignment) 27 | if self.in_features % self.group_size != 0: 28 | raise ValueError( 29 | f"in_features should be perfectly divisible by group_size, got in_features = {self.in_features}, group_size = {self.group_size} while initializing WQLinear_GEMM module" 30 | ) 31 | if out_features % (32 // self.bits) != 0: 32 | raise ValueError( 33 | f"out_features must be perfectly divisible by number of weights packed into int32 value i.e. 
8, got out_features={self.out_features}" 34 | ) 35 | 36 | # For compatibility with QuantLinearORT 37 | self.g_idx = torch.tensor([i // group_size for i in range(in_features)], dtype=torch.int32) 38 | self.register_buffer( 39 | "qweight", 40 | torch.zeros( 41 | (in_features, out_features // (32 // self.bits)), 42 | dtype=torch.int32, 43 | ), 44 | ) 45 | self.register_buffer( 46 | "qzeros", 47 | torch.zeros( 48 | (in_features // self.group_size, out_features // (32 // self.bits)), 49 | dtype=torch.int32, 50 | ), 51 | ) 52 | self.register_buffer( 53 | "scales", 54 | torch.zeros( 55 | (in_features // self.group_size, out_features), 56 | dtype=torch.float16, 57 | ), 58 | ) 59 | if bias: 60 | self.register_buffer( 61 | "bias", 62 | torch.zeros( 63 | (out_features), 64 | dtype=torch.float16, 65 | ), 66 | ) 67 | else: 68 | self.bias = None 69 | 70 | def forward(self, x): 71 | # Only Inference supported 72 | with torch.no_grad(): 73 | out_shape = x.shape[:-1] + (self.out_features,) 74 | 75 | out = dequantize_gemm(self.qweight, self.qzeros, self.scales, self.bits, self.group_size) 76 | out = torch.matmul(x.float(), out.float()) 77 | 78 | out = out + self.bias if self.bias is not None else out 79 | out = out.reshape(out_shape) 80 | 81 | return out 82 | -------------------------------------------------------------------------------- /QEfficient/transformers/quantizers/gptq.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | import math 9 | 10 | import torch 11 | from torch import nn 12 | 13 | from QEfficient.transformers.quantizers.quantizer_utils import dequantize_gptq 14 | 15 | 16 | class QuantLinearGPTQ(nn.Module): 17 | """ 18 | A quantized linear layer using GPTQ (Generalized Post-Training Quantization). 19 | This class supports only 4-bit quantization and is compatible with QuantLinearORT. 20 | 21 | Research paper link- GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers (https://arxiv.org/abs/2210.17323) 22 | 23 | Attributes: 24 | in_features (int): The number of input features. 25 | out_features (int): The number of output features. 26 | bits (int): The number of bits used for quantization (must be 4). 27 | act_order (None or bool): The activation order. 28 | orig_fp_weight (None or torch.Tensor): The original floating-point weights. 29 | maxq (int): The maximum quantization value. 30 | group_size (int): The group size for quantization. 31 | pack_mode (str): The packing mode, set to "GPTQ". 32 | qweight (torch.Tensor): The quantized weight tensor. 33 | qzeros (torch.Tensor): The quantized zeros tensor. 34 | scales (torch.Tensor): The scales tensor. 35 | g_idx (torch.Tensor): The group index tensor. 36 | bias (torch.Tensor or None): The bias tensor, if applicable. 
37 | """ 38 | 39 | def __init__(self, bits, group_size, in_features, out_features, bias): 40 | super().__init__() 41 | if bits != 4: 42 | raise NotImplementedError("Only 4 bits are supported.") 43 | self.in_features = in_features 44 | self.out_features = out_features 45 | self.bits = bits 46 | self.act_order = None 47 | self.orig_fp_weight = None 48 | self.maxq = 2**self.bits - 1 49 | self.group_size = group_size if group_size != -1 else in_features 50 | self.pack_mode = "GPTQ" 51 | 52 | # For compatibility with QuantLinearORT 53 | self.register_buffer( 54 | "qweight", 55 | torch.zeros((in_features // 32 * self.bits, out_features), dtype=torch.int32), 56 | ) 57 | self.register_buffer( 58 | "qzeros", 59 | torch.zeros((math.ceil(in_features / self.group_size), out_features // 32 * self.bits), dtype=torch.int32), 60 | ) 61 | self.register_buffer( 62 | "scales", 63 | torch.zeros((math.ceil(in_features / self.group_size), out_features), dtype=torch.float16), 64 | ) 65 | self.g_idx = torch.tensor([i // group_size for i in range(in_features)], dtype=torch.int32) 66 | if bias: 67 | self.register_buffer( 68 | "bias", 69 | torch.zeros((out_features), dtype=torch.float16), 70 | ) 71 | else: 72 | self.bias = None 73 | 74 | def forward(self, x): 75 | # Only Inference supported 76 | out, _, _ = dequantize_gptq(self.qweight.T, self.qzeros, self.scales, self.bits, self.g_idx) 77 | out = torch.matmul(x.float(), out.float()) 78 | out = out + self.bias if self.bias is not None else out 79 | 80 | return out 81 | -------------------------------------------------------------------------------- /QEfficient/transformers/quantizers/quantizer_awq.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import torch 9 | from transformers.quantizers.quantizer_awq import AwqQuantizer 10 | from transformers.utils.quantization_config import AwqBackendPackingMethod, AwqConfig, AWQLinearVersion 11 | 12 | from QEfficient.transformers.quantizers.awq import WQLinear_GEMM 13 | from QEfficient.transformers.quantizers.quantizer_utils import ( 14 | get_keys_to_not_convert, 15 | replace_linear_layer_with_target_layer, 16 | replace_quantization_scales, 17 | ) 18 | from QEfficient.utils.logging_utils import logger 19 | 20 | 21 | class QEffAwqConfig(AwqConfig): 22 | def post_init(self): 23 | """ 24 | Safety checker that arguments are correct 25 | """ 26 | 27 | if self.backend not in [AwqBackendPackingMethod.AUTOAWQ]: 28 | raise ValueError( 29 | f"Only quantization backend {AwqBackendPackingMethod.AUTOAWQ} is supported - not recognized backend {self.backend}" 30 | ) 31 | 32 | self.version = AWQLinearVersion.from_str(self.version) 33 | if self.version not in [AWQLinearVersion.GEMM]: 34 | raise ValueError( 35 | f"Only {AWQLinearVersion.GEMM} version in supported - not recognized version {self.version}" 36 | ) 37 | 38 | if self.do_fuse or self.fuse_max_seq_len is not None: 39 | raise ValueError( 40 | f"fused modules are not supported, got do_fuse={self.do_fuse}, fuse_max_seq_len={self.fuse_max_seq_len}" 41 | ) 42 | 43 | if self.bits != 4: 44 | raise ValueError(f"Only 4-bit AWQ quantization is supported, got bits={self.bits}") 45 | 46 | 47 | class QEffAwqQuantizer(AwqQuantizer): 48 | target_cls = WQLinear_GEMM 49 | 50 | def __init__(self, quantization_config: QEffAwqConfig, **kwargs): 51 | super().__init__(quantization_config, **kwargs) 52 | 53 | def validate_environment(self, device_map, **kwargs): 54 | # No need to validate as we will always use pytorch CPU version. 55 | return True 56 | 57 | @property 58 | def is_trainable(self): 59 | return False 60 | 61 | def update_torch_dtype(self, torch_dtype): 62 | if torch_dtype not in [None, torch.float32]: 63 | logger.warning(f"Requested dtype {torch_dtype} is not supported, overriding to None") 64 | return None 65 | 66 | def _process_model_before_weight_loading(self, model, **kwargs): 67 | self.modules_to_not_convert = get_keys_to_not_convert(model) 68 | 69 | if self.quantization_config.modules_to_not_convert is not None: 70 | self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert) 71 | 72 | model, has_been_replaced = replace_linear_layer_with_target_layer( 73 | model, 74 | target_cls=self.target_cls, 75 | quantization_config=self.quantization_config, 76 | modules_to_not_convert=self.modules_to_not_convert, 77 | ) 78 | 79 | model = replace_quantization_scales(model, model.config.model_type) 80 | if not has_been_replaced: 81 | logger.warning( 82 | "You are loading an AWQ model but no linear modules were found in your model." 83 | " Please double check your model architecture, or submit an issue on github if you think this is a bug." 84 | ) 85 | -------------------------------------------------------------------------------- /QEfficient/transformers/spd/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /QEfficient/transformers/spd/turbo.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import torch 9 | 10 | from QEfficient.utils.checkpoint_utils import load_checkpoint 11 | 12 | 13 | class ResBlock(torch.nn.Module): 14 | """ 15 | A Residual Block module. 16 | This module performs a linear transformation followed by a SiLU activation, 17 | and then adds the result to the original input, creating a residual connection. 18 | Args: 19 | hidden_size (int): The size of the hidden layers in the block. 20 | """ 21 | 22 | def __init__(self, hidden_size): 23 | super().__init__() 24 | self.linear = torch.nn.Linear(hidden_size, hidden_size) 25 | # Initialize as an identity mapping 26 | torch.nn.init.zeros_(self.linear.weight) 27 | # Use SiLU activation to keep consistent with the Llama model 28 | self.act = torch.nn.SiLU() 29 | 30 | def forward(self, x): 31 | """ 32 | Forward pass of the ResBlock. 33 | Args: 34 | x (torch.Tensor): Input tensor. 35 | Returns: 36 | torch.Tensor: Output after the residual connection and activation. 37 | """ 38 | return x + self.act(self.linear(x)) 39 | 40 | 41 | def post_process_turbo_state_dict(state_dict: dict) -> dict: 42 | """normaize turbo state dict keys 43 | Args: 44 | state_dict (dict): turbo state dict 45 | Returns: 46 | dict: normalized state dict 47 | """ 48 | new_state_dict = dict() 49 | for name, weights in state_dict.items(): 50 | new_name = name.replace("projections.", "") 51 | new_state_dict[new_name] = weights 52 | return new_state_dict 53 | 54 | 55 | def build_and_attach_turbo(model, speculative_config: dict, speculative_weights: str): 56 | """build and attach turbo projections 57 | Args: 58 | model: model to attach projections to 59 | speculative_config (dict): speculative config file used to build projections 60 | Returns: 61 | model: model with turbo projections 62 | """ 63 | hidden_size = model.config.hidden_size 64 | num_layers = speculative_config["turbo_num_layers"] 65 | num_heads = speculative_config["turbo_num_heads"] 66 | projections = torch.nn.ModuleList( 67 | [ 68 | torch.nn.Sequential( 69 | *([ResBlock(hidden_size)] * num_layers), 70 | ) 71 | for _ in range(num_heads) 72 | ], 73 | ) 74 | load_checkpoint(projections, speculative_weights, strict=True, post_process_func=post_process_turbo_state_dict) 75 | model.projections = projections 76 | speculative_config["num_speculative_tokens"] = num_heads 77 | return model 78 | -------------------------------------------------------------------------------- /QEfficient/transformers/transform.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import hashlib 9 | 10 | import torch.nn as nn 11 | import transformers 12 | 13 | from QEfficient.base.modeling_qeff import QEFFBaseModel 14 | from QEfficient.transformers.cache_utils import QEffDynamicCache 15 | from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict 16 | from QEfficient.utils.logging_utils import logger 17 | 18 | 19 | def replace_module_with_qeff_layers(model: nn.Module) -> None: 20 | """ 21 | Replaces the transformers nn.Module classes with optimized QEff classes in place. 22 | 23 | Args: 24 | :model (torch.nn.Module): Base PyTorch model. 25 | """ 26 | # Replace if the module class is registered in TransformersToQEffModulesDict 27 | target_module = TransformersToQEffModulesDict.get(model.__class__) 28 | if target_module is not None: 29 | model.__class__ = target_module 30 | 31 | # Iterate over child modules 32 | for _, module in model.named_children(): 33 | replace_module_with_qeff_layers(module) 34 | 35 | 36 | def get_params_hash(model: nn.Module) -> str: 37 | """ 38 | Creates a hash of all the parameter values (i.e. weights) using the SHA256 algorithm. 39 | 40 | Args: 41 | model (torch.nn.Module): Base PyTorch model. 42 | 43 | Returns: 44 | :str: Hash string 45 | """ 46 | hasher = hashlib.sha256() 47 | for _, params in model.named_parameters(): 48 | hasher.update(params.data.numpy().tobytes()) 49 | 50 | return hasher.hexdigest() 51 | 52 | 53 | def transform_lm(model: nn.Module) -> nn.Module: 54 | """ 55 | Replaces some Transformers torch.nn.Module layers with equivalent optimized modules for Cloud AI 100. 56 | 57 | Args: 58 | model (torch.nn.Module): PyTorch model. 59 | 60 | Returns: 61 | :torch.nn.Module: PyTorch Module with replaced QEff layers. 62 | """ 63 | 64 | # Introducing the qeff_transformed attribute in the model to check the status of the transform 65 | if getattr(model, "qeff_transformed", False): 66 | print("Model is already transformed") 67 | return model 68 | 69 | # Get the hash of all params for checking later 70 | prior_params_hash = get_params_hash(model) 71 | logger.warning(f"The model {model.__class__} layers have been updated to QEff layers in-place") 72 | # Replace with QEff layers 73 | replace_module_with_qeff_layers(model) 74 | 75 | # Check with the new params hash 76 | later_params_hash = get_params_hash(model) 77 | if prior_params_hash != later_params_hash: 78 | raise RuntimeError("Weights were changed in the transform process, please report an issue") 79 | 80 | # Replace the DynamicCache update API 81 | transformers.cache_utils.DynamicCache.update = QEffDynamicCache.update 82 | 83 | setattr(model, "qeff_transformed", True) 84 | return model.eval() 85 | 86 | 87 | def transform(model: QEFFBaseModel, form_factor="cloud"): 88 | """ 89 | This function optimizes any kind of model (i.e. LLM, SD, AWQ, etc.) for Cloud AI 100. 90 | It replaces the torch.nn.Module layers of the passed QEffModel with optimized implementations of the same. 91 | 92 | model (torch.nn.Module): an instance of any class that is a child of `QEFFBaseAutoModelFactory` 93 | form_factor (str): form factor configuration for optimizing the model, available options=["cloud", "edge"]. 
94 | """ 95 | if form_factor != "cloud": 96 | raise ValueError("Only form_factor='cloud' is supported as of now!") 97 | # FIXME: move this to class and use model.transform() 98 | transform_lm(model.model) # type: ignore 99 | return model 100 | -------------------------------------------------------------------------------- /QEfficient/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from QEfficient.transformers.quantizers.auto import ( # noqa: F401 9 | replace_transformers_quantizers, 10 | undo_transformers_quantizers, 11 | ) 12 | from QEfficient.utils._utils import ( # noqa: F401 13 | check_and_assign_cache_dir, 14 | dump_qconfig, 15 | get_num_layers_from_config, 16 | get_num_layers_vlm, 17 | get_onnx_dir_name, 18 | get_padding_shape_from_config, 19 | get_padding_shape_vlm, 20 | get_qpc_dir_path, 21 | hf_download, 22 | load_hf_processor, 23 | load_hf_tokenizer, 24 | login_and_download_hf_lm, 25 | onnx_exists, 26 | padding_check_and_fix, 27 | qpc_exists, 28 | ) 29 | -------------------------------------------------------------------------------- /QEfficient/utils/cache.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | import json 9 | import os 10 | from pathlib import Path 11 | 12 | QEFF_HOME: Path = None 13 | if "QEFF_HOME" in os.environ: 14 | QEFF_HOME = Path(os.environ["QEFF_HOME"]) 15 | elif "XDG_CACHE_HOME" in os.environ: 16 | QEFF_HOME = Path(os.environ["XDG_CACHE_HOME"]) / "qeff_models" 17 | else: 18 | QEFF_HOME = Path("~/.cache/qeff_models").expanduser() 19 | 20 | 21 | def json_serializable(obj): 22 | if isinstance(obj, set): 23 | return sorted(obj) 24 | raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable") 25 | 26 | 27 | def to_hashable(obj) -> bytes: 28 | """ 29 | Converts obj to bytes such that same object will result in same hash 30 | """ 31 | return json.dumps( 32 | obj, 33 | skipkeys=False, 34 | ensure_ascii=True, 35 | check_circular=True, 36 | allow_nan=False, 37 | indent=None, 38 | separators=(",", ":"), 39 | default=json_serializable, 40 | sort_keys=True, 41 | ).encode() 42 | -------------------------------------------------------------------------------- /QEfficient/utils/checkpoint_utils.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from safetensors.torch import load_file 9 | 10 | 11 | def load_checkpoint(model, checkpoint: str, strict=False, post_process_func=None): 12 | """Load weights from a checkpoint ending with the `.safetensors` extension 13 | Args: 14 | model: model to load weights into 15 | checkpoint (str): checkpoint path 16 | strict (bool, optional): strictness of loading weights. Defaults to False. 17 | post_process_func (optional): Optional post-processing of the loaded state dict. Defaults to None. 18 | Returns: 19 | model: model with applied weights 20 | """ 21 | state_dict: dict = load_file(checkpoint) 22 | if post_process_func is not None: 23 | state_dict = post_process_func(state_dict) 24 | model.load_state_dict(state_dict, strict=strict) 25 | return model 26 | -------------------------------------------------------------------------------- /QEfficient/utils/device_utils.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import math 9 | import subprocess 10 | 11 | from QEfficient.utils.constants import Constants 12 | from QEfficient.utils.logging_utils import logger 13 | 14 | 15 | def get_available_device_id(): 16 | """ 17 | API to check for an available device id. 18 | 19 | Return: 20 | :int: Available device id. 21 | """ 22 | 23 | device_id = 0 24 | result = None 25 | 26 | # FIXME: goes into an infinite loop when the user doesn't have permission and the command gives permission denied. 27 | # To reproduce, change the ownership of the available devices. 
28 | while 1: 29 | command = ["/opt/qti-aic/tools/qaic-util", "-q", "-d", f"{device_id}"] 30 | try: 31 | result = subprocess.run(command, capture_output=True, text=True) 32 | except OSError: 33 | logger.warning("Not a Cloud AI 100 device, Command not found", command) 34 | return None 35 | if result: 36 | if "Status:Error" in result.stdout: 37 | device_id += 1 38 | elif "Status:Ready" in result.stdout: 39 | logger.info("device is available.") 40 | return [device_id] 41 | elif "Failed to find requested device ID" in result.stdout: 42 | logger.warning("Failed to find requested device ID") 43 | return None 44 | 45 | 46 | def is_qpc_size_gt_32gb(params: int, mxfp6: bool) -> bool: 47 | if mxfp6: 48 | qpc_size = math.ceil((params * 1) / Constants.GB) 49 | else: 50 | qpc_size = math.ceil((params * 2) / Constants.GB) 51 | 52 | logger.warning(f"Approximate QPC size is: {qpc_size} GB") 53 | num_devices = math.ceil(qpc_size / Constants.MAX_QPC_LIMIT) 54 | logger.warning(f"Number of Devices required: {num_devices}") 55 | return qpc_size > Constants.MAX_QPC_LIMIT 56 | 57 | 58 | def is_multi_qranium_setup_available(): 59 | result = None 60 | command = ["/opt/qti-aic/tools/qaic-util", "-q"] 61 | try: 62 | result = subprocess.run(command, stdout=subprocess.PIPE, universal_newlines=True) 63 | filtered_result = subprocess.run( 64 | ["grep", "Device Capabilities"], input=result.stdout, stdout=subprocess.PIPE, text=True 65 | ) 66 | except OSError: 67 | print("Command not found", command) 68 | return None 69 | 70 | lines = filtered_result.stdout.split("\n") 71 | 72 | # to count the number of devices in MQ enabled set up 73 | hybridboot_mdp_count = 0 74 | for line in lines: 75 | if ("HybridBoot+" in line) and ("MDP+" in line): 76 | hybridboot_mdp_count = hybridboot_mdp_count + 1 77 | 78 | if hybridboot_mdp_count > 0: 79 | print("No: of Devices with MQ enabled available: ", hybridboot_mdp_count) 80 | return True 81 | else: 82 | print("Device in MQ set up not available") 83 | return False 84 | -------------------------------------------------------------------------------- /QEfficient/utils/logging_utils.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import logging 9 | 10 | 11 | class QEffFormatter(logging.Formatter): 12 | """ 13 | Formatter class used to set colors for printing different logging levels of messages on console. 14 | """ 15 | 16 | cyan: str = "\x1b[38;5;14m" 17 | yellow: str = "\x1b[33;20m" 18 | red: str = "\x1b[31;20m" 19 | bold_red: str = "\x1b[31;1m" 20 | reset: str = "\x1b[0m" 21 | common_format: str = "%(levelname)s - %(name)s - %(message)s" # type: ignore 22 | format_with_line_info = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)" # type: ignore 23 | 24 | FORMATS = { 25 | logging.DEBUG: cyan + format_with_line_info + reset, 26 | logging.INFO: cyan + common_format + reset, 27 | logging.WARNING: yellow + common_format + reset, 28 | logging.ERROR: red + format_with_line_info + reset, 29 | logging.CRITICAL: bold_red + format_with_line_info + reset, 30 | } 31 | 32 | def format(self, record): 33 | """ 34 | Overriding the base class method to Choose format based on log level. 
35 | """ 36 | log_fmt = self.FORMATS.get(record.levelno) 37 | formatter = logging.Formatter(log_fmt) 38 | return formatter.format(record) 39 | 40 | 41 | def create_logger() -> logging.Logger: 42 | """ 43 | Creates a logger object with Colored QEffFormatter. 44 | """ 45 | logger = logging.getLogger("QEfficient") 46 | 47 | # create console handler and set level to debug 48 | ch = logging.StreamHandler() 49 | ch.setLevel(logging.INFO) 50 | # define formatter 51 | ch.setFormatter(QEffFormatter()) 52 | 53 | logger.addHandler(ch) 54 | return logger 55 | 56 | 57 | # Define the logger object that can be used for logging purposes throughout the module. 58 | logger = create_logger() 59 | -------------------------------------------------------------------------------- /QEfficient/utils/model_registery.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | 9 | from transformers import AutoConfig, AutoModelForCausalLM 10 | 11 | # Placeholder for all non-transformer models 12 | from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import ( 13 | QEffLlamaSwiftKVConfig, 14 | QEffLlamaSwiftKVForCausalLM, 15 | ) 16 | 17 | # Map of model type to config class, Modelling class and transformer model architecture class 18 | MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS = { 19 | "llama_swiftkv": [QEffLlamaSwiftKVConfig, QEffLlamaSwiftKVForCausalLM, AutoModelForCausalLM], 20 | } 21 | 22 | # loop over all the model types which are not present in transformers and register them 23 | for model_type, model_cls in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items(): 24 | # Register the model config class based on the model type. This will be first element in the tuple 25 | AutoConfig.register(model_type, model_cls[0]) 26 | 27 | # Register the non transformer library Class and config class using AutoModelClass 28 | model_cls[2].register(model_cls[0], model_cls[1]) 29 | -------------------------------------------------------------------------------- /QEfficient/utils/spd_utils.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from pathlib import Path 9 | 10 | from huggingface_hub import hf_hub_download 11 | from transformers import PretrainedConfig 12 | 13 | from QEfficient.utils._utils import filter_kwargs 14 | 15 | 16 | def get_speculative_config(pretrained_model_name_or_path, **kwargs) -> dict: 17 | if not isinstance(pretrained_model_name_or_path, (str, Path)): 18 | raise ValueError( 19 | f"`pretrained_config` must be a string or Path object but is of type {type(pretrained_model_name_or_path)}" 20 | ) 21 | try: 22 | speculative_config, _ = PretrainedConfig.get_config_dict( 23 | pretrained_model_name_or_path, _configuration_file="speculator_config.json", **kwargs 24 | ) 25 | except OSError as err: 26 | raise OSError(f"{err}.\nFile 'speculator_config.json' is expected to exist to apply turbo projections.") 27 | return speculative_config 28 | 29 | 30 | def get_speculative_weights(pretrained_model_name_or_path, **kwargs) -> str: 31 | turbo_weights_file = "speculator.safetensors" 32 | hf_hub_kwargs = filter_kwargs(hf_hub_download, kwargs) 33 | if (local_path := Path(pretrained_model_name_or_path)).exists(): 34 | if not local_path.is_dir(): 35 | raise ValueError(f"local model path {local_path} must point to an existing directory") 36 | weights_path = local_path / turbo_weights_file 37 | if not weights_path.exists(): 38 | raise FileNotFoundError(f"weights path {weights_path} does not exist.") 39 | else: 40 | weights_path = hf_hub_download(pretrained_model_name_or_path, filename=turbo_weights_file, **hf_hub_kwargs) 41 | return str(weights_path) 42 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Docs 2 | 3 | This directory contains the instructions for building static HTML documentation based on [sphinx](https://www.sphinx-doc.org/en/master/). 4 | 5 | 6 | ## Build the docs 7 | Install the packages required for building the documentation: 8 | 9 | ```sh 10 | pip install -r docs/requirements.txt 11 | ``` 12 | 13 | Then change directory to the docs folder to build the docs. 14 | 15 | ```sh 16 | cd docs/ 17 | # To build docs specific to the current branch 18 | sphinx-build -M html . build 19 | # [Optional] To build docs for all the supported branches 20 | sphinx-multiversion . build 21 | ``` 22 | ## Preview the docs locally 23 | 24 | ```bash 25 | cd build/html 26 | python -m http.server 27 | ``` 28 | You can then visit the page in your web browser at `http://localhost:8000` (the default port used by `python -m http.server`). 29 | -------------------------------------------------------------------------------- /docs/_static/my_theme.css: -------------------------------------------------------------------------------- 1 | .wy-nav-content { 2 | max-width: 1200px !important; 3 | } -------------------------------------------------------------------------------- /docs/_templates/versions.html: -------------------------------------------------------------------------------- 1 | 
2 | 3 | Version: Main 4 | 5 | 6 | 7 | Versions 8 | 9 | main 10 | release/v1.18 11 | 12 | 13 |
14 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | # Configuration file for the Sphinx documentation builder. 9 | # 10 | # This file only contains a selection of the most common options. For a full 11 | # list see the documentation: 12 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 13 | 14 | # -- Path setup -------------------------------------------------------------- 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | 23 | sys.path.insert(0, os.path.abspath("..")) 24 | 25 | 26 | # -- Project information ----------------------------------------------------- 27 | 28 | project = "efficient-transformers" 29 | copyright = "2024, Qualcomm" 30 | 31 | # The full version, including alpha/beta/rc tags 32 | release = "main" 33 | 34 | 35 | # -- General configuration --------------------------------------------------- 36 | 37 | # Add any Sphinx extension module names here, as strings. They can be 38 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 39 | # ones. 40 | extensions = ["myst_parser", "sphinx.ext.todo", "sphinx.ext.viewcode", "sphinx.ext.autodoc", "sphinx_multiversion"] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ["_templates"] 44 | 45 | # List of patterns, relative to source directory, that match files and 46 | # directories to ignore when looking for source files. 47 | # This pattern also affects html_static_path and html_extra_path. 48 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 49 | 50 | 51 | # -- Options for HTML output ------------------------------------------------- 52 | 53 | # The theme to use for HTML and HTML Help pages. See the documentation for 54 | # a list of builtin themes. 55 | # 56 | html_theme = "sphinx_rtd_theme" 57 | 58 | 59 | def setup(app): 60 | app.add_css_file("my_theme.css") 61 | 62 | 63 | # Add any paths that contain custom static files (such as style sheets) here, 64 | # relative to this directory. They are copied after the builtin static files, 65 | # so a file named "default.css" will overwrite the builtin "default.css". 
66 | html_static_path = ["_static"] 67 | source = [".md"] 68 | todo_include_todos = True 69 | 70 | suppress_warnings = [ 71 | "ref.rst_pilog", # Suppress warnings about excluded toctree entries 72 | ] 73 | -------------------------------------------------------------------------------- /docs/image/Cloud_AI_100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quic/efficient-transformers/299ef7938ab84dc90f4d0b5e1e273de40be878ab/docs/image/Cloud_AI_100.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | % QEfficient documentation master file, created by 2 | % sphinx-quickstart on Tue May 28 09:19:122024. 3 | % You can adapt this file completely to your liking, but it should at least 4 | % contain the root `toctree` directive. 5 | 6 | Welcome to Efficient-Transformers Documentation! 7 | ======================================== 8 | 9 | 10 | 13 | 14 | 15 | ```{toctree} 16 | :caption: 'Getting Started' 17 | :maxdepth: 4 18 | 19 | source/introduction 20 | source/validate 21 | ``` 22 | 23 | 24 | ```{toctree} 25 | :caption: 'Installation' 26 | :maxdepth: 2 27 | 28 | source/installation 29 | ``` 30 | 31 | ```{toctree} 32 | :caption: 'Upgrade Efficient-Transformers' 33 | :maxdepth: 2 34 | 35 | source/upgrade 36 | ``` 37 | 38 | ```{toctree} 39 | :caption: 'Inference on Cloud AI 100' 40 | :maxdepth: 4 41 | 42 | source/quick_start 43 | source/cli_api 44 | source/python_api 45 | ``` 46 | 47 | 48 | ```{toctree} 49 | :caption: 'QAIC Finetune' 50 | :maxdepth: 2 51 | 52 | source/finetune 53 | 54 | ``` 55 | 56 | ```{toctree} 57 | :caption: 'Blogs' 58 | :maxdepth: 2 59 | 60 | source/blogs 61 | 62 | ``` 63 | 64 | ```{toctree} 65 | :caption: 'Reference' 66 | :maxdepth: 2 67 | 68 | source/reference 69 | 70 | ``` 71 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx==7.1.2 2 | sphinx-multiversion==0.2.4 3 | sphinx-rtd-theme==2.0.0 4 | myst-parser==3.0.1 5 | -------------------------------------------------------------------------------- /docs/source/blogs.md: -------------------------------------------------------------------------------- 1 | # Train anywhere, Infer on Qualcomm Cloud AI 100 2 | [Click here](https://www.qualcomm.com/developer/blog/2024/01/train-anywhere-infer-qualcomm-cloud-ai-100) 3 | 4 | # How to Quadruple LLM Decoding Performance with Speculative Decoding (SpD) and Microscaling (MX) Formats on Qualcomm® Cloud AI 100 5 | [Click here](https://statics.teams.cdn.office.net/evergreen-assets/safelinks/1/atp-safelinks.html) 6 | 7 | # Power-efficient acceleration for large language models – Qualcomm Cloud AI SDK 8 | [Click here](https://www.qualcomm.com/developer/blog/2023/11/power-efficient-acceleration-large-language-models-qualcomm-cloud-ai-sdk) 9 | 10 | # Qualcomm Cloud AI 100 Accelerates Large Language Model Inference by ~2x Using Microscaling (Mx) Formats 11 | [click here](https://www.qualcomm.com/developer/blog/2024/01/qualcomm-cloud-ai-100-accelerates-large-language-model-inference-2x-using-microscaling-mx) 12 | 13 | # Qualcomm Cloud AI Introduces Efficient Transformers: One API, Infinite Possibilities 14 | [click here](https://www.qualcomm.com/developer/blog/2024/05/qualcomm-cloud-ai-introduces-efficient-transformers-one-api) 15 | 16 | 
-------------------------------------------------------------------------------- /docs/source/cli_api.md: -------------------------------------------------------------------------------- 1 | 2 | # Command Line Interface Use (CLI) 3 | 4 | ```{NOTE} 5 | Use a ``bash`` terminal; if using a ``zsh`` terminal, then ``device_group`` should be in single quotes, e.g. ``'--device_group [0]'`` 6 | ``` 7 | 8 | (infer_api)= 9 | ## `QEfficient.cloud.infer` 10 | ```{eval-rst} 11 | .. automodule:: QEfficient.cloud.infer.main 12 | ``` 13 | ## `QEfficient.cloud.execute` 14 | ```{eval-rst} 15 | .. automodule:: QEfficient.cloud.execute.main 16 | ``` 17 | ## `QEfficient.cloud.compile` 18 | ```{eval-rst} 19 | .. automodule:: QEfficient.compile.compile_helper.compile 20 | .. code-block:: bash 21 | 22 | python -m QEfficient.cloud.compile OPTIONS 23 | ``` 24 | ## `QEfficient.cloud.export` 25 | ```{eval-rst} 26 | .. automodule:: QEfficient.cloud.export.main 27 | 28 | ``` 29 | ## `QEfficient.cloud.finetune` 30 | ```{eval-rst} 31 | .. automodule:: QEfficient.cloud.finetune.main 32 | 33 | ``` -------------------------------------------------------------------------------- /docs/source/finetune.md: -------------------------------------------------------------------------------- 1 | # Finetune Infra 2 | 3 | This repository provides the infrastructure for finetuning models using different hardware accelerators such as QAIC. 4 | The same CLI can be used to run finetuning on GPU by setting the device flag (for finetuning on GPU, install the CUDA-specific torch build). 5 | 6 | ## Installation 7 | 8 | Same as QEfficient, along with QAIC PyTorch eager mode. 9 | 10 | For the QEfficient library: https://github.com/quic/efficient-transformers 11 | 12 | For torch_qaic, assuming QEfficient is already installed: 13 | ```bash 14 | pip install /opt/qti-aic/integrations/torch_qaic/py310/torch_qaic-0.1.0-cp310-cp310-linux_x86_64.whl 15 | ``` 16 | 17 | ## Finetuning 18 | 19 | Export the ENV variables to download and enable private datasets: 20 | ```bash 21 | export HF_DATASETS_TRUST_REMOTE_CODE=True 22 | ``` 23 | 24 | Export the ENV variables to get the device and HW traces and debugging logs: 25 | ```bash 26 | export QAIC_DEVICE_LOG_LEVEL=0 # For Device level logs 27 | export QAIC_DEBUG=1 # To understand the CPU fallback ops 28 | ``` 29 | 30 | ## Dataset Details 31 | 32 | To download the Alpaca dataset, visit this [link](https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/refs/heads/main/alpaca_data.json). Download the dataset and place it under the **dataset** directory. Make sure to update the training configuration accordingly. 33 | ```bash 34 | wget -c https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/refs/heads/main/alpaca_data.json -P dataset/ 35 | ``` 36 | 37 | To download the grammar dataset, visit this [link](https://github.com/meta-llama/llama-cookbook/blob/main/src/llama_cookbook/datasets/grammar_dataset/grammar_dataset_process.ipynb). Download the dataset and place it under the **datasets_grammar** directory. Make sure to update the training configuration accordingly, as sketched below. 
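For reference, a sketch of how the downloaded dataset can then be selected at launch time is shown below. The `--dataset` flag name and the `grammar_dataset` identifier are assumptions inferred from the modules under QEfficient/finetune/dataset/ and QEfficient/finetune/configs/; verify the exact names in QEfficient/finetune/configs/training.py before running.

```bash
# Hypothetical invocation sketch: selects the grammar dataset after it has been
# placed under datasets_grammar/. The --dataset flag and its value are assumptions;
# confirm them against QEfficient/finetune/configs/training.py.
python -m QEfficient.cloud.finetune --device qaic:0 --model_name "meta-llama/Llama-3.2-1B" --dataset grammar_dataset
```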
38 | 39 | 40 | ## Usage 41 | 42 | ### Single SOC finetuning on QAIC 43 | 44 | ```python 45 | python -m QEfficient.cloud.finetune --device qaic:0 --model_name "meta-llama/Llama-3.2-1B" 46 | ``` 47 | You can also configure various training parameters; for more details, check QEfficient/finetune/configs/training.py. Below is an example command line: 48 | ```python 49 | python -m QEfficient.cloud.finetune --device qaic:0 --use-peft --output_dir ./meta-sam --num_epochs 2 --context_length 256 50 | ``` 51 | 52 | ### Distributed training (DDP) on QAIC 53 | 54 | ```python 55 | QAIC_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc-per-node 4 -m QEfficient.cloud.finetune --device qaic --enable_ddp --dist_backend qccl --num_epochs 2 --model_name "meta-llama/Llama-3.2-1B" 56 | ``` 57 | **Note:** `nproc-per-node` is the number of workers (QAIC devices) running locally. 58 | 59 | ## Visualization 60 | 61 | TensorBoard logs are generated inside the runs/ directory with a date and time stamp. 62 | To visualize the data: 63 | 64 | ```python 65 | tensorboard --logdir runs/ --bind_all 66 | ``` -------------------------------------------------------------------------------- /docs/source/installation.md: -------------------------------------------------------------------------------- 1 | # Pre-requisites 2 | System Requirements: 3 | 1. [Supported Linux OS](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/#operating-systems) - Ubuntu, RHEL and AWS Linux 4 | 2. [Cloud AI 100 Platform SDK installed](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/Cloud-AI-SDK/Cloud-AI-SDK/#platform-sdk) 5 | 3. [SDK Pre-requisites](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/Pre-requisites/pre-requisites/) 6 | 4. [Multi-device support enabled for model sharding](https://github.com/quic/cloud-ai-sdk/tree/1.12/utils/multi-device) 7 | 8 | # Installation 9 | 10 | ### 1. Download Apps SDK 11 | * [Cloud AI 100 Apps SDK install](https://quic.github.io/cloud-ai-sdk-pages/latest/Getting-Started/Installation/Cloud-AI-SDK/Cloud-AI-SDK/) 12 | 13 | ### 2. Install Efficient-Transformers 14 | Uninstall the existing Apps SDK: 15 | ``` 16 | sudo ./uninstall.sh 17 | ``` 18 | Run the install.sh script as root or with sudo to install with root permissions. 
19 | ``` 20 | sudo ./install.sh --enable-qeff 21 | source /opt/qti-aic/dev/python/qeff/bin/activate 22 | ``` 23 | On successful installation, the contents are stored to the /opt/qti-aic path under the dev and exec directories: 24 | ``` 25 | dev exec integrations scripts 26 | ``` 27 | Check the Apps SDK version with the following command 28 | ``` 29 | sudo /opt/qti-aic/tools/qaic-version-util --apps 30 | ``` 31 | Apply chmod commands 32 | ``` 33 | sudo chmod a+x /opt/qti-aic/dev/hexagon_tools/bin/* 34 | sudo chmod a+x /opt/qti-aic/exec/* 35 | ``` 36 | 37 | # Sanity Check 38 | 39 | After above installation methods, you can check if ``QEfficient`` is installed correctly by using 40 | ```bash 41 | python -c "import QEfficient; print(QEfficient.__version__)" 42 | ``` 43 | If the above line executes successfully, you are good to go ahead and start deploying models on ``Cloud AI 100`` cards using ``QEfficient`` library. 44 | -------------------------------------------------------------------------------- /docs/source/python_api.md: -------------------------------------------------------------------------------- 1 | # Python API 2 | 3 | **This page give you an overview about the all the APIs that you might need to integrate the `QEfficient` into your python applications.** 4 | 5 | ## High Level API 6 | 7 | ### `QEFFAutoModelForCausalLM` 8 | 9 | ```{eval-rst} 10 | .. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForCausalLM 11 | :member-order: bysource 12 | :members: 13 | ``` 14 | 15 | (QEFFAutoModel)= 16 | ### `QEFFAutoModel` 17 | 18 | ```{eval-rst} 19 | .. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModel 20 | :member-order: bysource 21 | :members: 22 | ``` 23 | 24 | (QEffAutoPeftModelForCausalLM)= 25 | ### `QEffAutoPeftModelForCausalLM` 26 | 27 | ```{eval-rst} 28 | .. autoclass:: QEfficient.peft.auto.QEffAutoPeftModelForCausalLM 29 | :member-order: bysource 30 | :members: 31 | ``` 32 | 33 | (QEffAutoLoraModelForCausalLM)= 34 | ### `QEffAutoLoraModelForCausalLM` 35 | 36 | ```{eval-rst} 37 | .. autoclass:: QEfficient.peft.lora.auto.QEffAutoLoraModelForCausalLM 38 | :member-order: bysource 39 | :members: 40 | ``` 41 | 42 | (QEFFAutoModelForImageTextToText)= 43 | ### `QEFFAutoModelForImageTextToText` 44 | 45 | ```{eval-rst} 46 | .. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForImageTextToText 47 | :member-order: bysource 48 | :members: 49 | ``` 50 | 51 | (QEFFAutoModelForSpeechSeq2Seq)= 52 | ### `QEFFAutoModelForSpeechSeq2Seq` 53 | 54 | ```{eval-rst} 55 | .. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForSpeechSeq2Seq 56 | :member-order: bysource 57 | :members: 58 | ``` 59 | 60 | ### `export` 61 | 62 | ```{eval-rst} 63 | .. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100 64 | :members: 65 | :show-inheritance: 66 | :exclude-members: convert_to_cloud_kvstyle, convert_to_cloud_bertstyle 67 | .. deprecated:: 68 | This function will be deprecated in version 1.19, please use QEFFAutoModelForCausalLM.export instead 69 | ``` 70 | 71 | ### `compile` 72 | 73 | ```{eval-rst} 74 | .. automodule:: QEfficient.compile.compile_helper 75 | :members: 76 | :show-inheritance: 77 | .. code-block:: python 78 | 79 | import QEfficient 80 | base_path, onnx_model_path = QEfficient.export(model_name="gpt2") 81 | qpc_path = QEfficient.compile(onnx_path=onnx_model_path, qpc_path=os.path.join(base_path, "qpc"), num_cores=14, device_group=[0]) 82 | .. 
deprecated:: 83 | This function will be deprecated in version 1.19, please use QEFFAutoModelForCausalLM.compile instead 84 | ``` 85 | 86 | ### `Execute` 87 | 88 | ```{eval-rst} 89 | .. automodule:: QEfficient.generation.text_generation_inference 90 | :members: 91 | :show-inheritance: 92 | :exclude-members: latency_stats_bertstyle,cloud_ai_100_exec_kv_helper 93 | ``` 94 | ## Low Level API 95 | 96 | ### `convert_to_cloud_kvstyle` 97 | 98 | ```{eval-rst} 99 | .. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100 100 | :members: 101 | :show-inheritance: 102 | :exclude-members: qualcomm_efficient_converter, convert_to_cloud_bertstyle 103 | ``` 104 | 105 | ### `convert_to_cloud_bertstyle` 106 | 107 | ```{eval-rst} 108 | .. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100 109 | :members: 110 | :show-inheritance: 111 | :exclude-members: qualcomm_efficient_converter, convert_to_cloud_kvstyle 112 | ``` 113 | 114 | ### `utils` 115 | 116 | ```{eval-rst} 117 | .. automodule:: QEfficient.utils.device_utils 118 | :members: 119 | :show-inheritance: 120 | ``` 121 | 122 | ```{eval-rst} 123 | .. automodule:: QEfficient.utils.generate_inputs 124 | :members: 125 | :undoc-members: 126 | :show-inheritance: 127 | ``` 128 | 129 | ```{eval-rst} 130 | .. automodule:: QEfficient.utils.run_utils 131 | :members: 132 | :undoc-members: 133 | :show-inheritance: 134 | ``` -------------------------------------------------------------------------------- /docs/source/reference.md: -------------------------------------------------------------------------------- 1 | **References** 2 | # [Qualcomm Cloud AI home](https://www.qualcomm.com/products/technology/processors/cloud-artificial-intelligence) 3 | # [Qualcomm Cloud AI SDK download](https://www.qualcomm.com/products/technology/processors/cloud-artificial-intelligence/cloud-ai-100#Software) 4 | # [Qualcomm Cloud AI API reference](https://quic.github.io/cloud-ai-sdk-pages/latest/API/) 5 | # [User Guide](https://quic.github.io/cloud-ai-sdk-pages/) 6 | # [OCP Microscaling Formats (MX) Specification](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) -------------------------------------------------------------------------------- /docs/source/upgrade.md: -------------------------------------------------------------------------------- 1 | 2 | ## Using GitHub Repository 3 | 4 | ``Warning: Efficient Transformers have been validated to work with the same compatible SDK. Upgrading this may result in certain models becoming incompatible.`` 5 | 6 | ```bash 7 | # Create Python virtual env and activate it. (Required Python 3.10) 8 | 9 | python3.10 -m venv qeff_env 10 | source qeff_env/bin/activate 11 | pip install -U pip 12 | 13 | # Clone and Install the QEfficient Repo. 14 | pip install git+https://github.com/quic/efficient-transformers 15 | 16 | ``` -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /examples/basic_gguf_models.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | # This is the work example of the GGUF models with the AI 100 9 | 10 | from transformers import AutoTokenizer 11 | 12 | from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM 13 | 14 | # Load the model and tokenizer 15 | model_name = "MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF" 16 | gguf_file = "Mistral-7B-Instruct-v0.3.fp16.gguf" 17 | # org_model_name = "mistralai/Mistral-7B-Instruct-v0.3" 18 | 19 | tokenizer = AutoTokenizer.from_pretrained(model_name, gguf_file=gguf_file) 20 | model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=gguf_file) 21 | 22 | generated_qpc_path = model.compile(prefill_seq_len=32, ctx_len=128, num_cores=16, num_devices=1) 23 | model.generate(prompts=["How are you?"], tokenizer=tokenizer) 24 | -------------------------------------------------------------------------------- /examples/cpp_execution/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | project(InferenceSetIOBuffer) 9 | cmake_minimum_required (VERSION 3.15) 10 | 11 | # Find the pybind11 CMake directory using a Python script 12 | execute_process( 13 | COMMAND python -c "import pybind11; print(pybind11.get_cmake_dir())" 14 | OUTPUT_VARIABLE pybind11_DIR 15 | OUTPUT_STRIP_TRAILING_WHITESPACE 16 | ) 17 | 18 | # Set the CMAKE_PREFIX_PATH to include pybind11 19 | set(CMAKE_PREFIX_PATH ${pybind11_DIR} ${CMAKE_PREFIX_PATH}) 20 | set(CMAKE_CXX_STANDARD 17) 21 | find_package(pybind11 REQUIRED) 22 | 23 | pybind11_add_module(InferenceSetIOBuffer MODULE InferenceSetIOBuffer.cpp) 24 | 25 | 26 | include_directories("/opt/qti-aic/dev/inc") 27 | include_directories("examples/cpp_execution") 28 | 29 | target_link_libraries(InferenceSetIOBuffer PRIVATE ${PYTHON_LIBRARIES} pybind11::module pthread dl) 30 | 31 | target_include_directories(InferenceSetIOBuffer PRIVATE ${PYTHON_INCLUDE_DIRS} ${pybind11_INCLUDE_DIRS}) 32 | 33 | set_target_properties( 34 | InferenceSetIOBuffer 35 | PROPERTIES 36 | LINK_FLAGS "-Wl,--no-as-needed" 37 | ) 38 | 39 | set(CMAKE_BUILD_TYPE Debug) 40 | 41 | target_compile_options(InferenceSetIOBuffer PRIVATE 42 | -fstack-protector-all 43 | -fstack-protector-all 44 | -Werror 45 | -Wall 46 | -Wextra 47 | -Wunused-variable 48 | -Wunused-parameter 49 | -Wnon-virtual-dtor 50 | -Wno-missing-field-initializers) 51 | 52 | #Print paths for debugging 53 | message(STATUS "Python Include Dirs: ${PYTHON_INCLUDE_DIRS}") 54 | message(STATUS "Python Include Dirs: ${PYTHON_LIBRARIES}") 55 | message(STATUS "pybind11_DIR: ${pybind11_DIR}") 56 | 
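As a quick sanity check that the CMake build above produced a loadable extension, a minimal sketch is shown below. It assumes the out-of-source `build/` directory created by the steps in the C++ execution README that follows, and it deliberately inspects the module with `dir()` instead of assuming any particular function names exported by InferenceSetIOBuffer.cpp.

```python
# Minimal sketch, assuming the examples/cpp_execution/build/ layout from the README below.
import sys

# Make the compiled pybind11 extension importable; adjust the path if you built elsewhere.
sys.path.insert(0, "examples/cpp_execution/build")

import InferenceSetIOBuffer  # module name comes from pybind11_add_module() in CMakeLists.txt

# List whatever functions the extension actually exposes rather than guessing their names.
print([name for name in dir(InferenceSetIOBuffer) if not name.startswith("_")])
```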
-------------------------------------------------------------------------------- /examples/cpp_execution/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Text Generation using CPP Inference 3 | 4 | ## Overview 5 | This example demonstrates how to execute a model on AI 100 using Efficient Transformers and C++ APIs. The Efficient Transformers library is utilized for transforming, exporting and compiling the model, while the QPC is executed using C++ APIs. It is tested on both x86 and ARM platform. 6 | 7 | > **_NOTE:_** This supports BS>1 and Chunking. 8 | 9 | ## Prerequisite 10 | 1. `pip install pybind11` 11 | 2. Cpp17 or above (Tested on C++17 and g++ version - 11.4.0) 12 | 3. QEfficient [Quick Installation Guide]( https://github.com/quic/efficient-transformers?tab=readme-ov-file#quick-installation) 13 | 14 | ## Setup and Execution 15 | ```bash 16 | 17 | # Compile the cpp file using the following commands 18 | mkdir build 19 | cd build 20 | 21 | cmake .. 22 | make -j 8 23 | 24 | cd ../../../ # Need to be in base folder - efficient-transformers to run below cmd 25 | 26 | # Run the python script to get the generated text 27 | python examples/cpp_execution/text_inference_using_cpp.py --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 14 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first 28 | 29 | ``` 30 | 31 | ## Future Enhancements 32 | 1. DMA Buffer Handling 33 | 2. Continuous Batching 34 | 3. Handling streamer 35 | -------------------------------------------------------------------------------- /examples/embedding_model.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | # This is the work example of the Embedding model with the AI 100 9 | # For more information, visit: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 10 | 11 | import torch 12 | import torch.nn.functional as F 13 | from transformers import AutoTokenizer 14 | 15 | from QEfficient import QEFFAutoModel as AutoModel 16 | 17 | 18 | def mean_pooling(model_output, attention_mask): 19 | token_embeddings = model_output # First element of model_output contains all token embeddings 20 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 21 | return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) 22 | 23 | 24 | # Sentences we want sentence embeddings for 25 | sentences = "This is an example sentence" 26 | 27 | # Load model from HuggingFace Hub 28 | tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") 29 | 30 | 31 | qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") 32 | qeff_model.compile(num_cores=14) 33 | 34 | # Tokenize sentences 35 | encoded_input = tokenizer(sentences, return_tensors="pt") 36 | qeff_output = torch.tensor(qeff_model.generate(encoded_input)) 37 | 38 | # Perform pooling 39 | sentence_embeddings = mean_pooling(qeff_output, encoded_input["attention_mask"]) 40 | 41 | # Normalize embeddings 42 | sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1) 43 | 44 | print("Sentence embeddings:") 45 | print(sentence_embeddings) 46 | -------------------------------------------------------------------------------- /examples/granite_example/granite_vision_inference.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import requests 9 | from PIL import Image 10 | from transformers import AutoProcessor, TextStreamer 11 | 12 | from QEfficient import QEFFAutoModelForImageTextToText 13 | 14 | # Add HuggingFace Token to access the model 15 | HF_TOKEN = "" 16 | 17 | 18 | def run_model( 19 | model_name, 20 | token, 21 | query, 22 | image_url, 23 | kv_offload=False, 24 | prefill_seq_len=5500, 25 | ctx_len=6000, 26 | generation_len=128, 27 | img_size=384, 28 | num_cores=16, 29 | num_devices=1, 30 | ): 31 | ## STEP - 1 Load the Processor and Model 32 | 33 | processor = AutoProcessor.from_pretrained(model_name, token=token) 34 | 35 | # `kv_offload` is used to compile the model in a 2 QPCs.Currently we are not supporting 1 qpc so the flag false is not allowed. 36 | # The `kv_offload` flag should always be set to True. 37 | # The Dual QPC approach splits the model to perform Image Encoding and Output generation in 2 different QPCs. 38 | # The outputs of the Vision Encoder are then passed to the Language model via host in this case. 
39 | 40 | model = QEFFAutoModelForImageTextToText.from_pretrained(model_name, token=token, kv_offload=kv_offload) 41 | 42 | ## STEP - 2 Export & Compile the Model 43 | 44 | model.compile( 45 | prefill_seq_len=prefill_seq_len, 46 | ctx_len=ctx_len, 47 | img_size=img_size, 48 | num_cores=num_cores, 49 | num_devices=num_devices, 50 | mxfp6_matmul=False, 51 | ) 52 | 53 | ## STEP - 3 Load and process the inputs for Inference 54 | 55 | # We resize the image to (w x h) = (1610 x 1109) so that any image works with the model irrespective of its original dimensions 56 | # the model expects a fixed size of height 1109 and width 1610 57 | 58 | image = Image.open(requests.get(image_url, stream=True).raw) 59 | image = image.resize((1610, 1109)) 60 | 61 | messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": query}]}] 62 | input_text = processor.apply_chat_template(messages, add_generation_prompt=True) 63 | inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt") 64 | 65 | ## STEP - 4 Run Inference on the compiled model 66 | 67 | streamer = TextStreamer(processor.tokenizer) 68 | output = model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len) 69 | print(output) 70 | 71 | 72 | if __name__ == "__main__": 73 | # Model name and Input parameters 74 | model_name = "ibm-granite/granite-vision-3.2-2b" 75 | 76 | # Please add prompt here 77 | query = "Describe the image" 78 | 79 | # Please pass an image URL or image path. The format of the image should be jpg. 80 | image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" 81 | 82 | # Compilation parameters for the model 83 | kv_offload = True 84 | prefill_seq_len = 5500 85 | ctx_len = 6000 86 | generation_len = 128 87 | img_size = 384 88 | num_cores = 16 89 | num_devices = 4 90 | 91 | run_model( 92 | model_name=model_name, 93 | token=HF_TOKEN, 94 | query=query, 95 | kv_offload=kv_offload, 96 | image_url=image_url, 97 | prefill_seq_len=prefill_seq_len, 98 | ctx_len=ctx_len, 99 | generation_len=generation_len, 100 | img_size=img_size, 101 | num_cores=num_cores, 102 | num_devices=num_devices, 103 | ) 104 | 105 | 106 | """ 107 | Expected Response: 108 | 109 | The image depicts two cats lying on a pink blanket that is spread out on a red couch. The cats are positioned in a relaxed manner, with their bodies stretched out and their heads resting on the blanket. 110 | The cat on the left is a smaller, tabby cat with a mix of black, gray, and white fur. It has a long, slender body and a distinctive tail that is curled up near its tail end. The cat on the right is a larger, 111 | tabby cat with a mix of gray, black, and brown fur. It has 112 | 113 | """ 114 | -------------------------------------------------------------------------------- /examples/granite_example/readme.md: -------------------------------------------------------------------------------- 1 | # Granite Vision Inference 2 | This directory contains an example script showing how to run inference on Granite-vision-3.2-2b via the QEFFAutoModelForImageTextToText class. 3 | 4 | Currently, only the dual-QPC approach is supported for this model. Continuous batching (CB) is not supported. 5 | 6 | The model expects the following inputs to be fixed: 7 | 8 | 1. Image height = 1109 9 | 2. Image width = 1610 10 | 3. Num patches = 10 11 | 12 | Please reshape any given image to (w x h) = (1610 x 1109) and then pass it to the processor, as sketched below. The processor accepts a path or a URL. Please pass jpg images.
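A minimal sketch of this preprocessing step, assuming the sample COCO image listed below (it mirrors the resize done in `granite_vision_inference.py`):

```python
# Minimal sketch of the expected preprocessing; mirrors granite_vision_inference.py.
import requests
from PIL import Image

# Sample jpg image (also listed below); any jpg path or URL works.
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
image = image.resize((1610, 1109))  # fixed (width x height) expected by the model
```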
13 | 14 | Image used: 15 | 16 | http://images.cocodataset.org/val2017/000000039769.jpg 17 | 18 | 19 | To run example script after package installations: 20 | ```sh 21 | python granite_vision_inference.py 22 | ``` 23 | 24 | Expected output for given sample inputs in the script: 25 | ```sh 26 | The image depicts two cats lying on a pink blanket that is spread out on a red couch. The cats are positioned in a relaxed manner, with their bodies stretched out and their heads resting on the blanket. 27 | The cat on the left is a smaller, tabby cat with a mix of black, gray, and white fur. It has a long, slender body and a distinctive tail that is curled up near its tail end. The cat on the right is a larger, 28 | tabby cat with a mix of gray, black, and brown fur. It has 29 | ``` -------------------------------------------------------------------------------- /examples/image_text_to_text_inference.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import requests 9 | from PIL import Image 10 | from transformers import AutoProcessor, TextStreamer 11 | 12 | from QEfficient import QEFFAutoModelForImageTextToText 13 | 14 | # Add HuggingFace Token to access the model 15 | HF_TOKEN = "" 16 | 17 | 18 | def run_model( 19 | model_name, 20 | token, 21 | query, 22 | image_url, 23 | kv_offload=False, 24 | prefill_seq_len=32, 25 | ctx_len=512, 26 | generation_len=128, 27 | img_size=560, 28 | num_cores=16, 29 | num_devices=1, 30 | ): 31 | ## STEP - 1 Load the Processor and Model 32 | 33 | processor = AutoProcessor.from_pretrained(model_name, token=token) 34 | 35 | # `kv_offload` is used to compile the model in a Single QPC or 2 QPCs. 36 | # The Dual QPC approach splits the model to perform Image Encoding and Output generation in 2 different QPCs. 37 | # The outputs of the Vision Encoder are then passed to the Language model via host in this case. 38 | 39 | model = QEFFAutoModelForImageTextToText.from_pretrained( 40 | model_name, token=token, attn_implementation="eager", kv_offload=kv_offload 41 | ) 42 | 43 | ## STEP - 2 Export & Compile the Model 44 | 45 | model.compile( 46 | prefill_seq_len=prefill_seq_len, 47 | ctx_len=ctx_len, 48 | img_size=img_size, 49 | num_cores=num_cores, 50 | num_devices=num_devices, 51 | mxfp6_matmul=False, 52 | ) 53 | 54 | ## STEP - 3 Load and process the inputs for Inference 55 | 56 | image = Image.open(requests.get(image_url, stream=True).raw) 57 | messages = [ 58 | { 59 | "role": "user", 60 | "content": [ 61 | {"type": "image"}, 62 | {"type": "text", "text": query}, 63 | ], 64 | } 65 | ] 66 | input_text = [processor.apply_chat_template(messages, add_generation_prompt=True)] 67 | 68 | inputs = processor( 69 | text=input_text, 70 | images=image, 71 | return_tensors="pt", 72 | add_special_tokens=False, 73 | padding="max_length", 74 | max_length=prefill_seq_len, 75 | ) 76 | 77 | ## STEP - 4 Run Inference on the compiled model 78 | 79 | streamer = TextStreamer(processor.tokenizer) 80 | model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len) 81 | 82 | 83 | if __name__ == "__main__": 84 | # Model name and Input parameters 85 | model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" 86 | query = "Describe this image." 
87 | image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" 88 | 89 | # Compilation parameters for the model 90 | kv_offload = False 91 | prefill_seq_len = 32 92 | ctx_len = 512 93 | generation_len = 128 94 | img_size = 560 95 | num_cores = 16 96 | num_devices = 1 97 | 98 | run_model( 99 | model_name=model_name, 100 | token=HF_TOKEN, 101 | query=query, 102 | kv_offload=kv_offload, 103 | image_url=image_url, 104 | prefill_seq_len=prefill_seq_len, 105 | ctx_len=ctx_len, 106 | generation_len=generation_len, 107 | img_size=img_size, 108 | num_cores=num_cores, 109 | num_devices=num_devices, 110 | ) 111 | 112 | 113 | """ 114 | Expected Response: 115 | 116 | This image depicts a charming anthropomorphic rabbit standing on a dirt path in front of a picturesque stone cottage, surrounded by a serene landscape. 117 | 118 | The rabbit, with its light brown fur and distinctive long ears, is attired in a stylish blue coat, brown vest, and tan pants, exuding a sense of sophistication. The dirt path, flanked by vibrant flowers and lush greenery, leads to the cottage, which features a thatched roof and a chimney, adding to the rustic charm of the scene. In the background, rolling hills and trees create a breathtaking panorama, while the sky above is a brilliant blue with white clouds, completing the 119 | 120 | """ 121 | -------------------------------------------------------------------------------- /examples/intern_example/readme.md: -------------------------------------------------------------------------------- 1 | # InternVL Inference 2 | This directory contains an example script of how to run inference on InternVL-1B model via QEFFAutoModelForCausalLM class. 3 | 4 | ## Required packages: 5 | - `torch==2.4.1+cpu` 6 | - `torchvision==0.19.1+cpu` 7 | - `timm==1.0.14` 8 | - `einops==0.8.1` 9 | 10 | You can install them using pip: 11 | ```sh 12 | pip install torch==2.4.1+cpu --extra-index-url https://download.pytorch.org/whl/cpu timm==1.0.14 torchvision==0.19.1+cpu einops==0.8.1 13 | ``` 14 | 15 | To run example script after package installations: 16 | ```sh 17 | python internvl_inference.py 18 | ``` 19 | 20 | Expected output for given sample inputs in the script: 21 | ```sh 22 | The image is a promotional graphic for Microsoft Azure. It features a blue background with a hexagonal pattern on the left side. The hexagons are white and are arranged in a way that suggests a network or connectivity theme. 23 | 24 | On the right side of the image, the Microsoft Azure logo is prominently displayed. The logo consists of the Azure name in white, with the Microsoft logo above it, which includes four colored squares (blue, green, yellow, and red). Below the logo, the word "Azure" is written in large white letters. 25 | 26 | Below the logo, there is text that reads: 27 | - "By Dinesh Kumar Wick 28 | ``` -------------------------------------------------------------------------------- /examples/peft_models.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from transformers import AutoTokenizer, TextStreamer 9 | 10 | from QEfficient import QEffAutoPeftModelForCausalLM 11 | 12 | base_model_name = "mistralai/Mistral-7B-v0.1" 13 | tokenizer = AutoTokenizer.from_pretrained(base_model_name) 14 | streamer = TextStreamer(tokenizer) 15 | 16 | m = QEffAutoPeftModelForCausalLM.from_pretrained("predibase/magicoder", "magicoder") 17 | m.export() 18 | m.compile(prefill_seq_len=32, ctx_len=1024) 19 | 20 | # Magicoder adapter 21 | m.set_adapter("magicoder") 22 | inputs = tokenizer("def fibonacci", return_tensors="pt") 23 | m.generate(**inputs, streamer=streamer, max_new_tokens=1024) 24 | 25 | # TLDR, summary generator 26 | m.load_adapter("predibase/tldr_headline_gen", "tldr_headline_gen") 27 | m.set_adapter("tldr_headline_gen") 28 | inputs = tokenizer( 29 | """Summarize this passage in one sentence or less: Jeffrey Berns, CEO of Blockchains LLC, wants the Nevada government to allow companies like \ 30 | his to form local governments on land they own, granting them power over everything from \ 31 | schools to law enforcement. Berns envisions a city based on digital currencies and \ 32 | blockchain storage. His company is proposing to build a 15,000 home town 12 miles east of \ 33 | Reno. Nevada Lawmakers have responded with intrigue and skepticism. The proposed \ 34 | legislation has yet to be formally filed or discussed in public hearings. 35 | 36 | Summary: """, 37 | return_tensors="pt", 38 | ) 39 | m.generate(**inputs, streamer=streamer, max_new_tokens=1024) 40 | 41 | # Math problems 42 | m.load_adapter("predibase/gsm8k", "gsm8k") 43 | m.set_adapter("gsm8k") 44 | inputs = tokenizer( 45 | "James decides to run 3 sprints 3 times a week. He runs 60 meters each sprint. \ 46 | How many total meters does he run a week?", 47 | return_tensors="pt", 48 | ) 49 | m.generate(**inputs, streamer=streamer, max_new_tokens=1024) 50 | 51 | # News explanation 52 | m.load_adapter("predibase/agnews_explained", "agnews_explained") 53 | m.set_adapter("agnews_explained") 54 | inputs = tokenizer( 55 | """Below is a news article. Please classify it under one of the following \ 56 | classes (World, Business, Sports, Sci/Tech) and provide a reasonable coherent explanation for \ 57 | why the article is classified as such. Please format your response as a JSON payload. 58 | 59 | ### Article: US poverty rate climbs, along with number lacking health coverage (AFP) AFP - The \ 60 | number of Americans living in poverty or without health insurance grew last year, a government \ 61 | survey showed, adding potential dynamite in the battle for the White House. 62 | 63 | ### JSON Response 64 | 65 | """, 66 | return_tensors="pt", 67 | ) 68 | m.generate(**inputs, streamer=streamer, max_new_tokens=1024) 69 | -------------------------------------------------------------------------------- /examples/prompts.txt: -------------------------------------------------------------------------------- 1 | My name is 2 | The sun rises from 3 | The flat earth theory is the belief that -------------------------------------------------------------------------------- /examples/speech_to_text/README.md: -------------------------------------------------------------------------------- 1 | # Speech Seq2Seq 2 | This directory contains an example script of how to use the AutoModelForSpeechSeq2Seq class. 
(for now, only Whisper models on audio shorter than 30 seconds have been validated) 3 | 4 | ## Required packages: 5 | - `librosa==0.10.2` 6 | - `soundfile==0.13.1` 7 | 8 | You can install them using pip: 9 | ```sh 10 | pip install librosa==0.10.2 soundfile==0.13.1 11 | ``` 12 | 13 | To run example script after package installations: 14 | ```sh 15 | python speech_seq2seq_models.py 16 | ``` 17 | 18 | Expected output for given data sample: 19 | ```sh 20 | <|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.<|endoftext|> 21 | ``` -------------------------------------------------------------------------------- /examples/speech_to_text/run_whisper_speech_to_text.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from datasets import load_dataset 9 | from transformers import AutoProcessor 10 | 11 | from QEfficient import QEFFAutoModelForSpeechSeq2Seq 12 | 13 | base_model_name = "openai/whisper-tiny" 14 | ctx_len = 25 15 | 16 | ## STEP 1 -- load audio sample, using a standard English dataset, can load specific files if longer audio needs to be tested; also load initial processor 17 | ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") 18 | data = ds[0]["audio"]["array"] 19 | # reshape so the shape corresponds to data with batch size 1 20 | data = data.reshape(-1) 21 | sample_rate = ds[0]["audio"]["sampling_rate"] 22 | processor = AutoProcessor.from_pretrained(base_model_name) 23 | 24 | ## STEP 2 -- init base model 25 | qeff_model = QEFFAutoModelForSpeechSeq2Seq.from_pretrained(base_model_name) 26 | 27 | ## STEP 3 -- export and compile model 28 | qeff_model.compile() 29 | 30 | ## STEP 4 -- generate output for loaded input and processor 31 | exec_info = qeff_model.generate( 32 | inputs=processor(data, sampling_rate=sample_rate, return_tensors="pt"), generation_len=ctx_len 33 | ) 34 | 35 | ## STEP 5 (optional) -- use processor to decode output 36 | print(processor.batch_decode(exec_info.generated_ids)[0]) 37 | -------------------------------------------------------------------------------- /notebooks/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "QEfficient" 3 | dynamic = ["version"] 4 | description = """ 5 | QEfficient is the library interface for the Hugging Face Transformer \ 6 | models for efficient inference on Qualcomm Cloud AI 100""" 7 | readme = "README.md" 8 | license = { file = "LICENSE" } 9 | authors = [{ name = "Qualcomm Cloud AI ML Team" }] 10 | keywords = ["transformers", "Cloud AI 100", "Inference"] 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "Development Status :: 5 - Development/Unstable", 14 | "Intended Audience :: Developers", 15 | "Intended Audience :: Education", 16 | "Operating System :: Linux", 17 | "Programming Language :: Python :: 3.10", 18 | "Topic :: Scientific/Engineering :: Artificial Intelligence for Inference Accelerator", 19 | ] 20 | requires-python = ">=3.8,<3.11" 21 | dependencies = [ 22 | "transformers==4.50.0", 23 | "huggingface-hub==0.27.0", 24 | "hf_transfer==0.1.9", 25 | "peft==0.13.2", 26 | "datasets==2.20.0", 27 | "fsspec==2023.6.0", 28 | "multidict==6.0.4", 29 | "urllib3<2", 30 | "sentencepiece==0.2.0", 31 | "onnx==1.16.0", 32 | "onnxruntime==1.16.3", 33 | "numpy==1.26.4", 34 | "protobuf==3.20.2", 35 | "onnxscript==0.1.0.dev20240327", 36 | "pillow===10.4.0", 37 | "sympy", 38 | "tensorboard", 39 | "fire", 40 | "py7zr", 41 | "torchmetrics==1.7.0", 42 | "torch==2.4.1; platform_machine=='aarch64'", 43 | # Specifying torch cpu package URL per python version, update the list once pytorch releases whl for python>3.11 44 | "torch@https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp38-cp38-linux_x86_64.whl ; python_version=='3.8' and platform_machine=='x86_64'", 45 | "torch@https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp39-cp39-linux_x86_64.whl ; python_version=='3.9' and platform_machine=='x86_64'", 46 | "torch@https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp310-cp310-linux_x86_64.whl ; python_version=='3.10' and platform_machine=='x86_64'", 47 | ] 48 | 49 | [project.optional-dependencies] 50 | test = ["pytest","pytest-mock"] 51 | docs = ["Sphinx==7.1.2","sphinx-rtd-theme==2.0.0","myst-parser==3.0.1","sphinx-multiversion"] 52 | quality = ["black", "ruff", "hf_doc_builder@git+https://github.com/huggingface/doc-builder.git"] 53 | 54 | [build-system] 55 | requires = ["setuptools>=62.0.0"] 56 | build-backend = "setuptools.build_meta" 57 | 58 | [tool.setuptools.packages.find] 59 | include = ["QEfficient*"] 60 | namespaces = false 61 | 62 | [tool.setuptools.dynamic.version] 63 | attr = "QEfficient.__version__" 64 | 65 | [tool.ruff] 66 | line-length = 120 67 | # Enable the isort rules. 68 | lint.extend-select = ["I"] 69 | 70 | [tool.pytest.ini_options] 71 | addopts = "-W ignore -s -v" 72 | junit_logging = "all" 73 | doctest_optionflags = "NUMBER NORMALIZE_WHITESPACE ELLIPSIS" 74 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /scripts/finetune/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /scripts/finetune/run_ft_model.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import os 9 | import warnings 10 | 11 | import torch 12 | from peft import AutoPeftModelForCausalLM 13 | from transformers import AutoModelForCausalLM, AutoTokenizer 14 | 15 | from QEfficient.finetune.configs.training import TrainConfig 16 | 17 | # Suppress all warnings 18 | warnings.filterwarnings("ignore") 19 | 20 | try: 21 | import torch_qaic # noqa: F401 22 | 23 | device = "qaic:0" 24 | except ImportError as e: 25 | print(f"Warning: {e}. Moving ahead without these qaic modules.") 26 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 27 | 28 | train_config = TrainConfig() 29 | model = AutoModelForCausalLM.from_pretrained( 30 | train_config.model_name, 31 | use_cache=False, 32 | attn_implementation="sdpa", 33 | torch_dtype=torch.float16 if torch.cuda.is_available() or device == "qaic:0" else None, 34 | ) 35 | 36 | # Load the tokenizer and add special tokens 37 | tokenizer = AutoTokenizer.from_pretrained( 38 | train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name 39 | ) 40 | if not tokenizer.pad_token_id: 41 | tokenizer.pad_token_id = tokenizer.eos_token_id 42 | 43 | eval_prompt = """ 44 | Summarize this dialog: 45 | A: Hi Tom, are you busy tomorrow’s afternoon? 46 | B: I’m pretty sure I am. What’s up? 47 | A: Can you go with me to the animal shelter?. 48 | B: What do you want to do? 49 | A: I want to get a puppy for my son. 50 | B: That will make him so happy. 
51 | --- 52 | Summary: 53 | """ 54 | 55 | model_input = tokenizer(eval_prompt, return_tensors="pt") 56 | 57 | model.to(device) 58 | model_input.to(device) 59 | model.eval() 60 | 61 | with torch.inference_mode(): 62 | print( 63 | tokenizer.decode( 64 | model.generate(**model_input, max_new_tokens=50, do_sample=False)[0], 65 | skip_special_tokens=True, 66 | ) 67 | ) 68 | 69 | trained_weights_path = os.path.join(train_config.output_dir, "trained_weights") 70 | list_paths = [d for d in os.listdir(trained_weights_path) if os.path.isdir(os.path.join(trained_weights_path, d))] 71 | max_index = max([int(path[5:]) for path in list_paths]) 72 | 73 | save_dir = os.path.join(trained_weights_path, "step_" + str(max_index)) 74 | 75 | # Load PEFT model on CPU 76 | model = AutoPeftModelForCausalLM.from_pretrained(save_dir) 77 | # Merge LoRA and base model and save 78 | merged_model = model.merge_and_unload() 79 | merged_model.save_pretrained(train_config.output_dir, safe_serialization=True) 80 | model_id = train_config.output_dir 81 | 82 | # Load Model with PEFT adapter 83 | model_peft = AutoModelForCausalLM.from_pretrained(model_id, use_cache=False, attn_implementation="sdpa") 84 | 85 | model_peft.to(device) 86 | model_peft.eval() 87 | with torch.inference_mode(): 88 | print( 89 | tokenizer.decode( 90 | model_peft.generate(**model_input, max_new_tokens=50, do_sample=False)[0], 91 | skip_special_tokens=True, 92 | ) 93 | ) 94 | -------------------------------------------------------------------------------- /scripts/perplexity_computation/README.md: -------------------------------------------------------------------------------- 1 | # Perplexity Calculator 2 | 3 | This script calculates the perplexity for ONNX, QPC, or Torch models using the WikiText-2 dataset. It supports different model types and configurations. 4 | 5 | ## Table of Contents 6 | 7 | - Requirements 8 | - Installation 9 | - Usage 10 | - Example 11 | - Arguments 12 | - Output Details 13 | 14 | ## Requirements 15 | 16 | - Python 3.8+ 17 | - Required Python packages: 18 | - `QEfficient` 19 | - `datasets==2.20` 20 | 21 | ## Installation 22 | 23 | - Install QEfficient and update the datasets package to 2.20 24 | 25 | ## Usage 26 | 27 | To run the script, use the following command: 28 | 29 | ```bash 30 | python calculate_perplexity.py --model_type --model_name [--model_path ] [--dataset_name ] [--ctx_len ] [--prompt_len ] [--batch_size ] [--stride ] [--num_samples ] [--qpc_device_id ] [--log_file ] 31 | 32 | python perplexity_calculator_cloud.py --model_type torch --model_name meta-llama/Meta-Llama-3-8B-Instruct --num_samples 1 33 | ``` 34 | 35 | ## Arguments (Help Section) 36 | ```bash 37 | --model_path: Path to ONNX or QPC model (optional for Torch Original models). 38 | --model_type: Type of model (onnx, qpc, or torch) (required). 39 | --model_name: Name of the HuggingFace Model Card Name/tokenizer (required). 40 | --dataset_name: Name of the dataset (default: wikitext-2-raw-v1). 41 | --ctx_len: Context length (default: 2048). 42 | --prompt_len: Prompt length (default: 1). 43 | --batch_size: Batch size (default: 1). 44 | --stride: Stride for dataset (default: 1024). 45 | --num_samples: Number of samples to use (-1 for all) (default: -1). 46 | --qpc_device_id: QAIC device ids (comma-separated) (default: [0]). 47 | --log_file: Log file name (default: perplexity_results.log). 48 | ``` 49 | 50 | ## Output Details 51 | The script logs the following information: 52 | 53 | - Perplexity and loss for the specified model. 
(For the original Torch model, it also dumps the target values for FP16 and MXFP6 precision.) 54 | - Total time taken for evaluation. 55 | - Detailed configuration and results in the specified log file. 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /scripts/perplexity_computation/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /scripts/replicate_kv_head/README.md: -------------------------------------------------------------------------------- 1 | # KV-Head Replication 2 | 3 | This example contains a sample script for replicating key-value (KV) heads for the Llama-3-8B-Instruct model. The script performs the following steps: 4 | 1. Runs inference with the original model. 5 | 2. Replicates the KV heads. 6 | 3. Runs inference on the modified model to validate the changes. 7 | 4. Exports the modified model to ONNX format. 8 | 9 | 10 | ## Installation 11 | 12 | Install efficient-transformers and the required libraries using https://github.com/quic/efficient-transformers#quick-installation 13 | 14 | 15 | ## Usage 16 | You can run the script with different parameters using the command line. Below is an example of how to use the script: 17 | 18 | 1. **(Optional)** If you are using a gated repository, export the `HF_TOKEN`: 19 | ```sh 20 | export HF_TOKEN=<hf_token> 21 | ``` 22 | 23 | 2. **Run the script** with the desired parameters: 24 | ```sh 25 | python script.py --model_name "meta-llama/Meta-Llama-3-8B-Instruct" --prompt "Hello, world!" --repeat 3 26 | ``` 27 | 28 | Replace `<hf_token>` with your actual token. 29 | 30 | ### Arguments 31 | - **--model_name**: Model card name to use (default: "meta-llama/Meta-Llama-3-8B-Instruct"). 32 | - **--prompt**: Prompt to use for the model (default: "My name is"). 33 | - **--repeat**: Factor to repeat key-value heads (default: 2). 34 | - **--num_attention_heads**: Number of attention heads (default: None). This is an optional parameter; if not given explicitly, it will be read from config.json. 35 | - **--hidden_size**: Hidden size (default: None). This is an optional parameter; if not given explicitly, it will be read from config.json. -------------------------------------------------------------------------------- /scripts/replicate_kv_head/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /scripts/specializations.json: -------------------------------------------------------------------------------- 1 | { 2 | "specializations": [ 3 | { 4 | "full_batch_size": "4", 5 | "batch_size": "1", 6 | "seq_len": "8", 7 | "ctx_len": "32" 8 | }, 9 | { 10 | "full_batch_size": "4", 11 | "batch_size": "1", 12 | "seq_len": "1", 13 | "ctx_len": "32" 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Tests 2 | This directory contains the tests for the project. Below is the list of test functions and required pytest plugins. 3 | 4 | ## Test Functions 5 | ### cloud/test_infer.py 6 | - test_infer function 7 | 8 | ### cloud/test_export.py 9 | - test_export function 10 | 11 | ### cloud/test_compile.py 12 | - test_compile function 13 | 14 | ### cloud/test_execute.py 15 | - test_execute function 16 | 17 | ## Required Plugins 18 | - `pytest` 19 | - `pytest-mock` 20 | 21 | You can install them using pip: 22 | ```sh 23 | pip install pytest pytest-mock 24 | ``` 25 | Alternatively, if you have specified these dependencies in your `pyproject.toml`, you can install them using the test extra: 26 | ```sh 27 | pip install .[test] 28 | ``` 29 | 30 | ## Running the Tests 31 | To run the tests, navigate to the root directory of the project and use the following command: 32 | ```sh 33 | pytest -v -s 34 | ``` 35 | If you want to see the reason why tests were skipped, use the following command instead: 36 | ```sh 37 | pytest -v -rs 38 | ``` 39 | If you want to run a specific test file or test function, you can specify it like this: 40 | ```sh 41 | pytest tests/cloud/test_infer.py 42 | ``` 43 | ```sh 44 | pytest tests/cloud/test_infer.py::test_infer 45 | ``` 46 | ### Note 47 | To run all the tests, follow the instructions below: 48 | ```sh 49 | cd tests/cloud # navigate to the directory where conftest.py is present 50 | pytest -v --all # use --all option 51 | ``` 52 | ## Cleanup 53 | Some tests create temporary files or directories. To ensure a clean state after running the tests, use the provided fixtures or cleanup scripts as described in `conftest.py`. 54 | 55 | ## Test Coverage 56 | If you want to measure test coverage, you can use the `pytest-cov` plugin. Install it using: 57 | ```sh 58 | pip install pytest-cov 59 | ``` 60 | Then run the tests with coverage: 61 | ```sh 62 | pytest --cov=QEfficient/cloud 63 | ``` 64 | It will show the code coverage of that particular directory. 65 | 66 | 67 | ## Test Report 68 | If you want to generate an HTML report for the test execution, you can use the `pytest-html` plugin. Install it using: 69 | ```sh 70 | pip install pytest-html 71 | ``` 72 | Then run the tests with the HTML report enabled: 73 | ```sh 74 | pytest --html=report.html 75 | ``` 76 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | -------------------------------------------------------------------------------- /tests/base/test_modeling_qeff.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | from types import SimpleNamespace 9 | 10 | import onnx 11 | import pytest 12 | 13 | from QEfficient.base.modeling_qeff import QEFFBaseModel 14 | 15 | 16 | def test_compiler_invalid_file(tmp_path): 17 | qeff_obj = SimpleNamespace() 18 | 19 | invalid_file = tmp_path / "invalid.onnx" 20 | with open(invalid_file, "wb") as fp: 21 | fp.write(chr(0).encode() * 100) 22 | 23 | with pytest.raises(RuntimeError): 24 | QEFFBaseModel._compile(qeff_obj, invalid_file, tmp_path) 25 | 26 | 27 | def test_compiler_invalid_flag(tmp_path): 28 | qeff_obj = SimpleNamespace() 29 | 30 | onnx_model = onnx.parser.parse_model(""" 31 | < 32 | ir_version: 8, 33 | opset_import: ["": 17] 34 | > 35 | test_compiler(float x) => (float y) 36 | { 37 | y = Identity(x) 38 | } 39 | """) 40 | valid_file = tmp_path / "valid.onnx" 41 | onnx.save(onnx_model, valid_file) 42 | 43 | with pytest.raises(RuntimeError): 44 | QEFFBaseModel._compile( 45 | qeff_obj, valid_file, tmp_path, convert_tofp16=True, compile_only=True, aic_binary_dir=tmp_path 46 | ) 47 | -------------------------------------------------------------------------------- /tests/base/test_pytorch_transforms.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ---------------------------------------------------------------------------- 7 | 8 | import pytest 9 | import torch 10 | from torch import nn 11 | 12 | from QEfficient.base.pytorch_transforms import ModuleMappingTransform, ModuleMutatorTransform 13 | 14 | 15 | class TestModel(nn.Module): 16 | def __init__(self): 17 | super().__init__() 18 | 19 | self.a = nn.Linear(32, 64) 20 | self.b = nn.Linear(64, 32) 21 | 22 | def forward(self, x): 23 | x = self.a(x) 24 | x = self.b(x) 25 | return x 26 | 27 | 28 | def test_module_mapping_transform(): 29 | with pytest.raises(TypeError): 30 | ModuleMappingTransform() 31 | 32 | class TestTransform(ModuleMappingTransform): 33 | _module_mapping = {nn.Linear: nn.Identity} 34 | 35 | model = TestModel() 36 | x = torch.rand(1, 32) 37 | y1 = model(x) 38 | assert torch.any(y1 != x) 39 | 40 | model, transformed = TestTransform.apply(model) 41 | assert transformed 42 | y2 = model(x) 43 | assert torch.all(y2 == x) 44 | 45 | 46 | def test_module_mutator_transform(): 47 | with pytest.raises(TypeError): 48 | ModuleMutatorTransform() 49 | 50 | class TestTransform(ModuleMutatorTransform): 51 | _match_class = nn.Linear 52 | 53 | @classmethod 54 | def mutate(cls, original_module: nn.Module, parent_module: nn.Module): 55 | return nn.Identity() 56 | 57 | model = TestModel() 58 | prev_ids = [id(model.a), id(model.b)] 59 | x = torch.rand(1, 32) 60 | y1 = model(x) 61 | assert torch.any(y1 != x) 62 | model, transformed = TestTransform.apply(model) 63 | assert transformed 64 | assert not ([id(model.a), id(model.b)] == prev_ids) 65 | y2 = model(x) 66 | assert torch.all(y2 == x) 67 | -------------------------------------------------------------------------------- /tests/cloud/high_level_testing.json: -------------------------------------------------------------------------------- 1 | { 2 | "license": "SEE LICENSE IN LICENSE FILE", 3 | "model_name" : ["gpt2"], 4 | "num_cores" : [16], 5 | "prompt" : ["My name is"], 6 | "prompts_txt_file_path" : ["examples/prompts.txt"], 7 | "aic_enable_depth_first" : [1], 8 | "mos" : [1], 9 | "cache_dir" : [null], 10 | "hf_token" : [null], 11 | "batch_size" : [1], 12 | "prompt_len" : [32], 13 | "ctx_len" : [128], 14 | "mxfp6" : [1], 15 | "mxint8" : [1], 16 | "device_group" : [null], 17 | "full_batch_size" : [null,3], 18 | "enable_qnn" : [false, true], 19 | "qnn_config" : [null, "QEfficient/compile/qnn_config.json"] 20 | } 21 | -------------------------------------------------------------------------------- /tests/cloud/test_compile_and_execute.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import os 9 | 10 | import pytest 11 | import yaml 12 | 13 | import QEfficient 14 | from QEfficient.cloud.execute import main as execute 15 | from QEfficient.cloud.export import get_onnx_model_path 16 | 17 | 18 | @pytest.mark.on_qaic 19 | @pytest.mark.cli 20 | def test_compile(setup, mocker): 21 | """ 22 | test_compile is a HL compile api testing function, 23 | checks compile api code flow, object creations, internal api calls, internal returns. 24 | --------- 25 | Parameters: 26 | setup: is a fixture defined in conftest.py module. 
27 | mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. 28 | """ 29 | ms = setup 30 | onnx_model_path = get_onnx_model_path( 31 | model_name=ms.model_name, 32 | cache_dir=ms.cache_dir, 33 | hf_token=ms.hf_token, 34 | full_batch_size=ms.full_batch_size, 35 | local_model_dir=ms.local_model_dir, 36 | ) 37 | 38 | base_key = "past_key." 39 | base_value = "past_value." 40 | precision = "float16" 41 | 42 | data = [] 43 | 44 | for i in range(12): 45 | data.append({"IOName": f"{base_key}{i}", "Precision": precision}) 46 | data.append({"IOName": f"{base_value}{i}", "Precision": precision}) 47 | 48 | for i in range(12): 49 | data.append({"IOName": f"{base_key}{i}_RetainedState", "Precision": precision}) 50 | data.append({"IOName": f"{base_value}{i}_RetainedState", "Precision": precision}) 51 | 52 | with open(((onnx_model_path.parent) / "custom_io.yaml"), "w") as file: 53 | yaml.dump(data, file) 54 | 55 | qpc_path = QEfficient.compile( 56 | onnx_path=onnx_model_path, 57 | qpc_path=os.path.dirname(ms.qpc_dir_path()), 58 | num_cores=ms.num_cores, 59 | device_group=ms.device_group, 60 | custom_io_file_path=(onnx_model_path.parent) / "custom_io.yaml", 61 | aic_enable_depth_first=ms.aic_enable_depth_first, 62 | mos=ms.mos, 63 | batch_size=ms.batch_size, 64 | prompt_len=ms.prompt_len, 65 | ctx_len=ms.ctx_len, 66 | mxfp6=ms.mxfp6, 67 | mxint8=ms.mxint8, 68 | full_batch_size=ms.full_batch_size, 69 | enable_qnn=ms.enable_qnn, 70 | ) 71 | 72 | execute( 73 | model_name=ms.model_name, 74 | qpc_path=qpc_path, 75 | prompt=ms.prompt, 76 | prompts_txt_file_path=ms.prompts_txt_file_path, 77 | generation_len=ms.generation_len, 78 | hf_token=ms.hf_token, 79 | full_batch_size=ms.full_batch_size, 80 | ) 81 | -------------------------------------------------------------------------------- /tests/cloud/test_export.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | 9 | import pytest 10 | 11 | from QEfficient.cloud.export import main as export 12 | 13 | 14 | @pytest.mark.cli 15 | def test_export(setup, mocker): 16 | """ 17 | test_export is a HL export api testing function, 18 | checks export api code flow, object creations, internal api calls, internal returns. 19 | --------- 20 | Parameters: 21 | setup: is a fixture defined in conftest.py module. 22 | mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. 23 | """ 24 | ms = setup 25 | 26 | export( 27 | model_name=ms.model_name, 28 | hf_token=ms.hf_token, 29 | local_model_dir=ms.local_model_dir, 30 | full_batch_size=ms.full_batch_size, 31 | ) 32 | -------------------------------------------------------------------------------- /tests/cloud/test_infer.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | 9 | import pytest 10 | 11 | from QEfficient.cloud.infer import main as infer 12 | 13 | 14 | @pytest.mark.on_qaic 15 | @pytest.mark.cli 16 | @pytest.mark.usefixtures("clean_up_after_test") 17 | def test_infer(setup, mocker): 18 | """ 19 | test_infer is a HL infer api testing function, 20 | checks infer api code flow, object creations, internal api calls, internal returns. 21 | --------- 22 | Parameters: 23 | setup: is a fixture defined in conftest.py module. 24 | mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. 25 | --------- 26 | Ref: https://docs.pytest.org/en/7.1.x/how-to/fixtures.html 27 | Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html 28 | """ 29 | ms = setup 30 | infer( 31 | model_name=ms.model_name, 32 | num_cores=ms.num_cores, 33 | prompt=ms.prompt, 34 | local_model_dir=ms.local_model_dir, 35 | prompts_txt_file_path=ms.prompts_txt_file_path, 36 | aic_enable_depth_first=ms.aic_enable_depth_first, 37 | mos=ms.mos, 38 | hf_token=ms.hf_token, 39 | batch_size=ms.batch_size, 40 | prompt_len=ms.prompt_len, 41 | ctx_len=ms.ctx_len, 42 | generation_len=ms.generation_len, 43 | mxfp6=ms.mxfp6, 44 | mxint8=ms.mxint8, 45 | full_batch_size=ms.full_batch_size, 46 | enable_qnn=ms.enable_qnn, 47 | ) 48 | -------------------------------------------------------------------------------- /tests/cloud/test_infer_vlm.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import pytest 9 | 10 | from QEfficient.cloud.infer import main as infer 11 | 12 | 13 | @pytest.mark.on_qaic 14 | @pytest.mark.cli 15 | @pytest.mark.multimodal 16 | @pytest.mark.usefixtures("clean_up_after_test") 17 | def test_vlm_cli(setup, mocker): 18 | ms = setup 19 | # Taking some values from setup fixture and assigning other's based on model's requirement. 20 | # For example, mxint8 is not required for VLM models, so assigning False. 21 | infer( 22 | model_name="llava-hf/llava-1.5-7b-hf", 23 | num_cores=ms.num_cores, 24 | prompt="Describe the image.", 25 | prompts_txt_file_path=None, 26 | aic_enable_depth_first=ms.aic_enable_depth_first, 27 | mos=ms.mos, 28 | batch_size=1, 29 | full_batch_size=None, 30 | prompt_len=1024, 31 | ctx_len=2048, 32 | generation_len=20, 33 | mxfp6=False, 34 | mxint8=False, 35 | local_model_dir=None, 36 | cache_dir=None, 37 | hf_token=ms.hf_token, 38 | enable_qnn=False, 39 | qnn_config=None, 40 | image_url="https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg", 41 | ) 42 | -------------------------------------------------------------------------------- /tests/peft/test_peft_onnx_transforms.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import textwrap 9 | 10 | import onnx 11 | 12 | from QEfficient.peft.onnx_transforms import AdapterWeightsToInputsTransform 13 | 14 | 15 | def test_adapter_weights_to_inputs_transform(): 16 | external_tensors_file = "weight.raw" 17 | adapter_name = "testAdapter1" 18 | test_onnx = onnx.parser.parse_model(f""" 19 | < 20 | ir_version: 8, 21 | opset_import: ["" : 17] 22 | > 23 | test_adapter_weights (float[n, 32] input) => (float[n, 32] output) 24 | < 25 | float[32, 32] layer1_{adapter_name}_weight = [ "location" : "{external_tensors_file}" ], 26 | float[32, 32] layer2_{adapter_name}_weight = [ "location" : "{external_tensors_file}" ] 27 | > 28 | {{ 29 | layer1output = MatMul (input, layer1_{adapter_name}_weight) 30 | output = MatMul (layer1output, layer2_{adapter_name}_weight) 31 | }} 32 | """) 33 | 34 | out_onnx, transformed = AdapterWeightsToInputsTransform.apply(test_onnx, adapter_name=adapter_name) 35 | assert not transformed 36 | 37 | # Currently the onnx parser doesn't support using "." in identifier 38 | # Replace _ with . 39 | for init in test_onnx.graph.initializer: 40 | init.name = init.name.replace("_", ".") 41 | for node in test_onnx.graph.node: 42 | for i, inp in enumerate(node.input): 43 | node.input[i] = inp.replace("_", ".") 44 | for i, out in enumerate(node.output): 45 | node.output[i] = out.replace("_", ".") 46 | 47 | out_onnx, transformed = AdapterWeightsToInputsTransform.apply(test_onnx, adapter_name=adapter_name) 48 | assert transformed 49 | assert ( 50 | onnx.printer.to_text(out_onnx) 51 | == textwrap.dedent(""" 52 | < 53 | ir_version: 8, 54 | opset_import: ["" : 17] 55 | > 56 | test_adapter_weights (float[n,32] input, float[32,32] layer1.weight, float[32,32] layer2.weight) => (float[n,32] output, float[32,32] layer1.weight_RetainedState, float[32,32] layer2.weight_RetainedState) { 57 | layer1output = MatMul (input, layer1.weight) 58 | output = MatMul (layer1output, layer2.weight) 59 | layer1.weight_RetainedState = Identity (layer1.weight) 60 | layer2.weight_RetainedState = Identity (layer2.weight) 61 | } 62 | """).strip() 63 | ) 64 | -------------------------------------------------------------------------------- /tests/text_generation/test_text_generation.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import os 9 | 10 | import pytest 11 | from transformers import AutoModelForCausalLM 12 | 13 | from QEfficient.generation.text_generation_inference import TextGeneration 14 | from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM 15 | from QEfficient.utils import hf_download 16 | from QEfficient.utils._utils import load_hf_tokenizer 17 | from QEfficient.utils.constants import Constants 18 | from QEfficient.utils.device_utils import get_available_device_id 19 | 20 | configs = [pytest.param("gpt2", 2, None, 32, id="gpt2_config")] 21 | 22 | 23 | def load_causal_lm_model(model_config): 24 | """ 25 | Function to load model from huggingface and transform to KV model 26 | -------- 27 | 28 | :model_config: Dict 29 | 30 | :return model_hf, params 31 | """ 32 | model_path = hf_download( 33 | repo_id=model_config["model_name"], 34 | ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], 35 | ) 36 | model_hf = AutoModelForCausalLM.from_pretrained( 37 | model_path, 38 | use_cache=True, 39 | num_hidden_layers=model_config["n_layer"], 40 | attn_implementation="eager", 41 | low_cpu_mem_usage=False, 42 | ) # Run models for single layers only 43 | params = sum(p.numel() for p in model_hf.parameters()) 44 | model_hf.eval() 45 | return model_hf, params 46 | 47 | 48 | # Use @pytest.mark.parametrize to apply the configurations 49 | @pytest.mark.on_qaic 50 | @pytest.mark.parametrize("model_name, n_layer, full_batch_size, max_gen_len", configs) 51 | def test_generate_text_stream( 52 | model_name: str, 53 | n_layer: int, 54 | full_batch_size: int, 55 | max_gen_len: int, 56 | prompt_len: int = Constants.PROMPT_LEN, 57 | ctx_len: int = Constants.CTX_LEN, 58 | ): 59 | """ 60 | Validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. 61 | ``Mandatory`` Args: 62 | :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` 63 | :prompt_len (int): Prompt length for the model to compile. 64 | :ctx_len (int): Maximum context length to compile the model. 65 | :n_layers (int): Number of layers for the Model. 
66 | """ 67 | model_config = {"model_name": model_name, "n_layer": n_layer} 68 | model_hf, _ = load_causal_lm_model(model_config) 69 | 70 | tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) 71 | 72 | qeff_model = QEFFAutoModelForCausalLM(model_hf) 73 | 74 | qeff_model.export() 75 | device_id = get_available_device_id() 76 | 77 | if not device_id: 78 | pytest.skip("No available devices to run model on Cloud AI 100") 79 | 80 | qpc_path = qeff_model.compile( 81 | prefill_seq_len=prompt_len, 82 | ctx_len=ctx_len, 83 | num_cores=14, 84 | mxfp6=False, 85 | aic_enable_depth_first=False, 86 | full_batch_size=full_batch_size, 87 | ) 88 | 89 | exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR, generation_len=max_gen_len) 90 | cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size 91 | cloud_ai_100_output = [tokenizer.decode(token, skip_special_tokens=True) for token in cloud_ai_100_tokens[0]] 92 | 93 | text_generator = TextGeneration( 94 | tokenizer=tokenizer, 95 | qpc_path=qpc_path, 96 | ctx_len=ctx_len, 97 | full_batch_size=full_batch_size, 98 | ) 99 | stream_tokens = [] 100 | for decoded_tokens in text_generator.generate_stream_tokens(Constants.INPUT_STR, generation_len=max_gen_len): 101 | stream_tokens.extend(decoded_tokens) 102 | 103 | assert cloud_ai_100_output == stream_tokens, ( 104 | f"Deviation in output observed while comparing regular execution and streamed output: {cloud_ai_100_output} != {stream_tokens}" 105 | ) 106 | assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) 107 | -------------------------------------------------------------------------------- /tests/transformers/models/test_embedding_models.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import os 9 | from typing import Optional 10 | 11 | import numpy as np 12 | import onnxruntime as ort 13 | import pytest 14 | from transformers import AutoModel, AutoTokenizer 15 | 16 | from QEfficient.transformers.models.modeling_auto import QEFFAutoModel 17 | from QEfficient.utils._utils import create_json 18 | from QEfficient.utils.constants import Constants, QnnConstants 19 | 20 | embed_test_models = [ 21 | # model_name, architecture 22 | "sentence-transformers/multi-qa-mpnet-base-cos-v1", # MPNetForMaskedLM 23 | "BAAI/bge-reranker-v2-m3", # XLMRobertaForSequenceClassification 24 | "BAAI/bge-small-en-v1.5", # BertModel 25 | ] 26 | 27 | 28 | def check_embed_pytorch_vs_ort_vs_ai100( 29 | model_name: str, 30 | seq_len: int = Constants.CTX_LEN, 31 | n_layer: int = 1, 32 | enable_qnn: Optional[bool] = False, 33 | qnn_config: Optional[str] = None, 34 | ): 35 | # Prepare input 36 | tokenizer = AutoTokenizer.from_pretrained(model_name) 37 | inputs = tokenizer("My name is", return_tensors="pt") 38 | 39 | # Original PyTorch model 40 | pt_model = AutoModel.from_pretrained( 41 | model_name, 42 | num_hidden_layers=n_layer, 43 | attn_implementation="eager", 44 | trust_remote_code=True, 45 | ) 46 | 47 | pt_outputs = pt_model(**inputs) 48 | pt_embeddings = pt_outputs[0][0].detach().numpy() 49 | # Pytorch transformed model 50 | qeff_model = QEFFAutoModel(pt_model, pretrained_model_name_or_path=model_name) 51 | qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False) 52 | qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy() 53 | mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings)) 54 | print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad) 55 | assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}" 56 | 57 | onnx_model = qeff_model.export() 58 | ort_session = ort.InferenceSession(str(onnx_model)) 59 | 60 | # Prepare the inputs for ONNX Runtime 61 | input_ids = np.array(inputs["input_ids"]) 62 | attention_mask = np.array(inputs["attention_mask"]) 63 | 64 | onnx_inputs = {"input_ids": input_ids, "attention_mask": attention_mask} 65 | # Run inference 66 | onnx_outputs = ort_session.run(None, onnx_inputs) 67 | 68 | # Compare Transformed PyTorch and ONNX outputs 69 | 70 | pt_embeddings = pt_outputs[0][0].detach().numpy() 71 | onnx_embeddings = onnx_outputs[0] 72 | mad = np.mean(np.abs(pt_embeddings - onnx_embeddings)) 73 | print("Mad for onnx and PyTorch is ", mad) 74 | assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}" 75 | 76 | qeff_model.compile( 77 | num_cores=14, 78 | enable_qnn=enable_qnn, 79 | qnn_config=qnn_config, 80 | ) 81 | ai100_output = qeff_model.generate(inputs=inputs) 82 | 83 | # Compare ONNX and AI 100 outputs 84 | mad = np.mean(np.abs(ai100_output - onnx_outputs[0])) 85 | print("Mad for onnx and AI 100 output is ", mad) 86 | assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" 87 | assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) 88 | 89 | 90 | @pytest.mark.on_qaic 91 | @pytest.mark.parametrize("model_name", embed_test_models) 92 | def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name): 93 | """ 94 | Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. 
95 | """ 96 | check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1) 97 | 98 | 99 | @pytest.mark.on_qaic 100 | @pytest.mark.qnn 101 | @pytest.mark.parametrize("model_name", embed_test_models) 102 | def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name): 103 | """ 104 | QNN Compilation path test. 105 | Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. 106 | """ 107 | qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") 108 | create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) 109 | 110 | check_embed_pytorch_vs_ort_vs_ai100( 111 | model_name=model_name, seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path 112 | ) 113 | -------------------------------------------------------------------------------- /tests/utils/test_cache.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import random 9 | 10 | import pytest 11 | 12 | from QEfficient.utils.cache import to_hashable 13 | 14 | 15 | def get_random_string(length: int) -> str: 16 | return "".join([chr(random.randint(0x20, 0x7E)) for _ in range(length)]) 17 | 18 | 19 | def test_to_hashable_dict(): 20 | dct = {get_random_string(i): i for i in range(5)} 21 | dct = dict(sorted(dct.items())) 22 | hash1 = to_hashable(dct) 23 | 24 | dct = dict(reversed(dct.items())) 25 | hash2 = to_hashable(dct) 26 | 27 | assert hash1 == hash2 28 | 29 | 30 | def test_to_hashable_set(): 31 | assert to_hashable(set(range(4))) == to_hashable(set(range(4 - 1, -1, -1))) 32 | 33 | 34 | @pytest.mark.parametrize("value", [float("nan"), float("inf"), -float("inf")]) 35 | def test_to_hashable_float_nan(value): 36 | with pytest.raises(ValueError): 37 | to_hashable(value) 38 | -------------------------------------------------------------------------------- /tests/vllm/test_qaic_output_consistency.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # 3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # 6 | # ----------------------------------------------------------------------------- 7 | 8 | import random 9 | 10 | import pytest 11 | from vllm import LLM, SamplingParams 12 | 13 | # Model to test 14 | test_models = [ 15 | "TinyLlama/TinyLlama-1.1B-Chat-v1.0", 16 | ] 17 | 18 | # Constants for configuration 19 | SEQ_LEN = 128 20 | CTX_LEN = 256 21 | DECOE_BSZ = 4 22 | DTYPE = "mxfp6" 23 | KV_DTYPE = "mxint8" 24 | 25 | 26 | @pytest.mark.vllm 27 | @pytest.mark.parametrize("model_name", test_models) 28 | def test_output_consistency(model_name): 29 | """This pytest function is used to check the consistency of vLLM. 30 | 1) Single prompt test to check if the output generated in 5 different 31 | runs yields the same results 32 | 2) Multiple prompt check to test if multiple prompts yield same results 33 | if run in different slots. 34 | 35 | Parameters 36 | ---------- 37 | model_name : string 38 | Huggingface model card name. 
39 | """ 40 | sampling_params = SamplingParams(temperature=0.0, max_tokens=None) 41 | 42 | # Creating LLM Object 43 | qllm = LLM( 44 | model=model_name, 45 | max_num_seqs=DECOE_BSZ, 46 | max_model_len=CTX_LEN, 47 | max_seq_len_to_capture=SEQ_LEN, 48 | quantization=DTYPE, 49 | kv_cache_dtype=KV_DTYPE, 50 | device="qaic", 51 | ) 52 | 53 | # Single prompt test 54 | single_prompt = ["My name is"] 55 | 56 | single_prompt_output = qllm.generate(single_prompt * 5, sampling_params) 57 | 58 | check_output = [] 59 | for i, op in enumerate(single_prompt_output): 60 | check_output.append(op.outputs[0].text) 61 | 62 | # Assertion to check the consistency of single prompt. 63 | assert len(set(check_output)) == 1, "Outputs from different slots for same prompt does not match!!" 64 | 65 | # Multiple prompt test 66 | outputDict = dict() 67 | multiple_prompt = [ 68 | "My name is", 69 | "How to eat mangosteen?", 70 | "How many people died in World War II", 71 | "Hello ", 72 | "Who is the president of United States", 73 | "Who is the president of India", 74 | "When it snowfalls in San Diego", 75 | "In which country yamana river flows", 76 | "How many people died in World War II", 77 | "Thy youth is proud livery, so gazed on now", 78 | "Will be a tattered weed, of small worth held:Then being asked where all thy beauty lies", 79 | "Where all the treasure of thy lusty days", 80 | "To say, within thine own deep-sunken eyes", 81 | "Where is Statue of Liberty located?", 82 | ] 83 | 84 | for p in multiple_prompt: 85 | outputDict[p] = [] 86 | 87 | for _ in range(5): 88 | random.shuffle(multiple_prompt) 89 | multiple_prompt_output = qllm.generate(multiple_prompt, sampling_params) 90 | for i, op in enumerate(multiple_prompt_output): 91 | generated_text = op.outputs[0].text 92 | outputDict[multiple_prompt[i]].append(str(multiple_prompt[i] + generated_text)) 93 | 94 | # Assertion to check multiple prompts. 95 | for key in outputDict.keys(): 96 | assert len(set(outputDict[key])) == 1, "Outputs from different slots for same prompt does not match!!" 97 | 98 | # Assertion to check if any prompts are missed. 99 | assert len(multiple_prompt) == len(multiple_prompt_output), ( 100 | "Number of Generated Tokens do not match the number of valid inputs!!" 101 | ) 102 | --------------------------------------------------------------------------------