├── .clang-format ├── .githooks └── check-license.py ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── dependabot.yml └── workflows │ ├── documentation.yml │ ├── style.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── bandit.yaml ├── dev_requirements.txt ├── docs ├── Doxyfile ├── build_doc.py └── source │ ├── .gitignore │ ├── _static │ └── .gitkeep │ ├── _templates │ └── .gitkeep │ ├── adding_operations.md │ ├── conf.py │ ├── cpp_reference.rst │ ├── developer.md │ ├── index.rst │ ├── llm.md │ ├── llm_perf.png │ ├── llm_performance.md │ ├── npu.md │ ├── npu_arch.png │ ├── python │ ├── intel_npu_acceleration_library.backend.rst │ ├── intel_npu_acceleration_library.functional.rst │ ├── intel_npu_acceleration_library.nn.rst │ ├── intel_npu_acceleration_library.rst │ └── modules.rst │ ├── setup.md │ └── usage.md ├── examples ├── Audio-Spectrogram-Transformer.py ├── NPU compilation tutorial.ipynb ├── compile_model.py ├── cpp │ ├── CMakeLists.txt │ ├── README.md │ └── main.cpp ├── llama.py ├── llama3.py ├── llava.py ├── matmul.py ├── model_compilation_demo.ipynb ├── phi-2.py ├── phi-3.py ├── qwen2_math_7b.py ├── t5.py ├── tiny_llama_chat.py └── train_mnist.py ├── include └── intel_npu_acceleration_library │ ├── common.h │ ├── conversion.h │ ├── inference.h │ ├── nn_factory.h │ └── parameters.h ├── intel_npu_acceleration_library ├── __init__.py ├── _version.py ├── backend │ ├── __init__.py │ ├── base.py │ ├── bindings.py │ ├── compression.py │ ├── convolution.py │ ├── factory.py │ ├── linear.py │ ├── matmul.py │ ├── mlp.py │ ├── ops.py │ ├── qlinear.py │ ├── qmatmul.py │ ├── runtime.py │ ├── sdpa.py │ ├── tensor.py │ └── utils.py ├── compiler.py ├── device.py ├── dtypes.py ├── functional │ ├── __init__.py │ └── scaled_dot_product_attention.py ├── modelling.py ├── nn │ ├── __init__.py │ ├── autograd.py │ ├── conv.py │ ├── functional.py │ ├── linear.py │ ├── llm.py │ └── module.py ├── optimizations.py └── quantization.py ├── licensing ├── dev-third-party-programs.txt ├── documentation-third-party-programs.txt └── third-party-programs.txt ├── mypy.ini ├── requirements.txt ├── script ├── export.py ├── gen_leaderboard_doc.py ├── llm_leaderboard.py ├── profile_llm.py ├── profile_matmul.py ├── profile_mlp.py └── quantize_model.py ├── setup.cfg ├── setup.py ├── src └── bindings.cpp └── test └── python ├── conftest.py ├── test_basic.py ├── test_bindings.py ├── test_compile.py ├── test_conv.py ├── test_device.py ├── test_dtypes.py ├── test_factory.py ├── test_layers.py ├── test_llm.py ├── test_matmul.py ├── test_module.py ├── test_op.py ├── test_optimizations.py ├── test_profiling.py ├── test_quantization.py ├── test_sdpa.py ├── test_tensor.py └── test_training.py /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | 3 | Language: Cpp 4 | Standard: Cpp11 5 | 6 | AccessModifierOffset: -4 7 | AlignAfterOpenBracket: Align 8 | AllowAllArgumentsOnNextLine: false 9 | AllowAllConstructorInitializersOnNextLine: true 10 | AllowShortBlocksOnASingleLine: false 11 | AllowShortCaseLabelsOnASingleLine: false 12 | AllowShortFunctionsOnASingleLine: None 13 | AllowShortIfStatementsOnASingleLine: Never 14 | AllowShortLambdasOnASingleLine: Empty 15 | AllowShortLoopsOnASingleLine: false 16 | AlwaysBreakBeforeMultilineStrings: false 17 | BreakInheritanceList: AfterColon 18 | ColumnLimit: 120 19 | 
ConstructorInitializerIndentWidth: 8 20 | ContinuationIndentWidth: 8 21 | DerivePointerAlignment: false 22 | FixNamespaceComments: true 23 | IncludeBlocks: Preserve 24 | IndentCaseLabels: false 25 | IndentWidth: 4 26 | PointerAlignment: Left 27 | SpaceBeforeCpp11BracedList: false 28 | SpaceBeforeCtorInitializerColon: false 29 | UseTab: Never 30 | StatementMacros: ['CASE', 'HW_OPS_CASE'] -------------------------------------------------------------------------------- /.githooks/check-license.py: -------------------------------------------------------------------------------- 1 | #! python 2 | # 3 | # Copyright © 2024 Intel Corporation 4 | # SPDX-License-Identifier: Apache 2.0 5 | # 6 | 7 | import datetime 8 | import sys 9 | import os 10 | 11 | LICENSE_TYPE = "Apache 2.0" 12 | LICENSE_STR = f"SPDX-License-Identifier: {LICENSE_TYPE}" 13 | 14 | COPYRIGHT = f"Copyright © {datetime.datetime.now().year} Intel Corporation" 15 | 16 | if __name__ == "__main__": 17 | ret = 0 18 | for filename in sys.argv: 19 | _, file_extension = os.path.splitext(filename) 20 | if "CMakeLists.txt" in filename or file_extension in [ 21 | ".h", 22 | ".hpp", 23 | ".cpp", 24 | ".c", 25 | ".py", 26 | ".js", 27 | ".sh", 28 | ]: 29 | with open(filename, encoding="utf-8") as fp: 30 | text = fp.read() 31 | if LICENSE_STR not in text: 32 | print(f"[pre-commit] {filename} does not have a valid license!") 33 | ret = 1 34 | if COPYRIGHT not in text: 35 | print(f"[pre-commit] {filename} does not have a valid copyright!") 36 | ret = 1 37 | 38 | sys.exit(ret) 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 
18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | permissions: read-all 3 | 4 | on: 5 | workflow_dispatch: 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 3.10 13 | uses: actions/setup-python@v3 14 | with: 15 | python-version: "3.10" 16 | - name: Install packet 17 | run: | 18 | sudo apt-get install -y doxygen 19 | python -m pip install --upgrade pip 20 | pip install .[dev] 21 | - name: Run tests 22 | run: | 23 | cd docs 24 | python build_doc.py gh-deploy 25 | -------------------------------------------------------------------------------- /.github/workflows/style.yml: -------------------------------------------------------------------------------- 1 | name: Style 2 | permissions: read-all 3 | 4 | on: 5 | workflow_dispatch: 6 | pull_request: 7 | push: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: Set up Python 3.10 17 | uses: actions/setup-python@v3 18 | with: 19 | python-version: "3.10" 20 | - name: Install packet 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install .[dev] 24 | - name: Install pre-commit 25 | run: | 26 | pip install pre-commit 27 | pre-commit install 28 | - name: Run tests 29 | run: pre-commit run --all-files 30 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | permissions: read-all 3 | 4 | on: 5 | workflow_dispatch: 6 | pull_request: 7 | push: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | build: 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 17 | os: [windows-latest] 18 | steps: 19 | - uses: actions/checkout@v3 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v3 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install TBB on ubuntu 25 | if: matrix.os == 'ubuntu-latest' 26 | run: sudo apt install libtbb-dev 27 | - name: Install packet 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install tox tox-gh-actions 31 | - name: Run tests 32 | run: tox 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | cache 3 | logs 4 | models 5 
| ov 6 | *.json 7 | build 8 | *.egg-info 9 | dist 10 | lib 11 | *.csv 12 | *.png 13 | *.bin 14 | *.xml 15 | 16 | _build 17 | site 18 | data 19 | xml 20 | .coverage* 21 | coverity 22 | .tox 23 | nc_workspace -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2022 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | default_install_hook_types: [pre-commit, commit-msg] 7 | repos: 8 | - repo: local 9 | hooks: 10 | - id: check-license 11 | name: Check license 12 | entry: ./.githooks/check-license.py 13 | language: python 14 | stages: [commit] 15 | 16 | - repo: https://github.com/pre-commit/mirrors-clang-format 17 | rev: v10.0.1 18 | hooks: 19 | - id: clang-format 20 | - repo: https://github.com/psf/black 21 | rev: 22.6.0 22 | hooks: 23 | - id: black 24 | - repo: https://github.com/PyCQA/bandit 25 | rev: 1.7.7 26 | hooks: 27 | - id: bandit 28 | args: ["-c", "bandit.yaml"] 29 | - repo: https://github.com/pre-commit/pre-commit-hooks 30 | rev: v4.1.0 31 | hooks: 32 | - id: check-merge-conflict 33 | - id: check-json 34 | - id: check-executables-have-shebangs 35 | - id: check-symlinks 36 | - id: debug-statements 37 | - id: mixed-line-ending 38 | - id: trailing-whitespace 39 | - repo: https://github.com/PyCQA/flake8 40 | rev: 6.1.0 41 | hooks: 42 | - id: flake8 43 | additional_dependencies: [ 44 | 'flake8-blind-except', 45 | 'flake8-docstrings', 46 | 'flake8-bugbear', 47 | 'flake8-comprehensions', 48 | 'flake8-docstrings-complete', 49 | 'flake8-implicit-str-concat', 50 | 'pydocstyle>=5.0.0', 51 | ] 52 | exclude: docs/.*|setup.py|test/.*|script/.*|examples/.* 53 | - repo: https://github.com/regebro/pyroma 54 | rev: "4.2" 55 | hooks: 56 | - id: pyroma 57 | - repo: https://github.com/pre-commit/mirrors-mypy 58 | rev: 'v1.8.0' 59 | hooks: 60 | - id: mypy 61 | exclude: 'docs|script|test|venv|examples' 62 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | cmake_minimum_required(VERSION 3.16) 7 | include(FetchContent) 8 | 9 | project(intel_npu_acceleration_library) 10 | 11 | set(CMAKE_CXX_STANDARD 14 CACHE STRING "C++ standard to conform to") 12 | 13 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU") 14 | # lots of warnings and all warnings as errors 15 | message(STATUS "Setting GCC/Clang specific flags for the entire build") 16 | add_compile_options(-Wall -Wextra -Werror -pedantic -Wdouble-promotion -Wfloat-conversion -march=native) 17 | set(CMAKE_CXX_FLAGS_DEBUG "-g") 18 | set(CMAKE_CXX_FLAGS_RELEASE "-O3") 19 | elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") 20 | message(STATUS "Setting Visual Studio specific flags for the entire build") 21 | add_compile_options(/W3 /WX /arch:AVX2 /arch:SSE2) 22 | add_link_options(/WX) 23 | else() 24 | message(AUTHOR_WARNING "-- Building with unrecognised compiler, not setting any specific flags") 25 | endif() 26 | 27 | function(get_linux_lsb_release_information) 28 | find_program(LSB_RELEASE_CMD lsb_release) 29 | if(NOT LSB_RELEASE_CMD) 30 | message(FATAL_ERROR "Command lsb_release cannot be found") 31 | endif() 32 | 33 | execute_process(COMMAND "${LSB_RELEASE_CMD}" --short --id OUTPUT_VARIABLE LSB_RELEASE_ID 
OUTPUT_STRIP_TRAILING_WHITESPACE) 34 | execute_process(COMMAND "${LSB_RELEASE_CMD}" --short --release OUTPUT_VARIABLE LSB_RELEASE_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) 35 | 36 | set(LSB_RELEASE_ID "${LSB_RELEASE_ID}" PARENT_SCOPE) 37 | set(LSB_RELEASE_VERSION "${LSB_RELEASE_VERSION}" PARENT_SCOPE) 38 | endfunction() 39 | 40 | set(OV_VERSION_SHORT "2024.4") 41 | set(OV_VERSION "2024.4.0.16579.c3152d32c9c_x86_64") 42 | set(OV_STORAGE_URL "https://storage.openvinotoolkit.org/repositories/openvino/packages") 43 | set(OV_NIGHTLY_COMMIT "2024.3.0-15502-66093834e38") 44 | 45 | if (WIN32) 46 | if(NOT OV_LIBRARY_URL) 47 | if (${OV_VERSION_SHORT} STREQUAL "nightly") 48 | set(OV_PLATFORM "${OV_NIGHTLY_COMMIT}") 49 | else() 50 | set(OV_PLATFORM "windows") 51 | endif() 52 | set(OV_LIBRARY_URL "${OV_STORAGE_URL}/${OV_VERSION_SHORT}/${OV_PLATFORM}/w_openvino_toolkit_windows_${OV_VERSION}.zip") 53 | endif() 54 | elseif(UNIX) 55 | if(NOT OV_LIBRARY_URL) 56 | get_linux_lsb_release_information() 57 | if (LSB_RELEASE_ID STREQUAL "Ubuntu") 58 | if (${LSB_RELEASE_VERSION} STREQUAL "18.04" OR ${LSB_RELEASE_VERSION} STREQUAL "20.04" OR ${LSB_RELEASE_VERSION} STREQUAL "22.04" OR ${LSB_RELEASE_VERSION} STREQUAL "24.04") 59 | string(REPLACE ".04" "" LSB_RELEASE_VERSION_SHORT ${LSB_RELEASE_VERSION}) 60 | if (${OV_VERSION_SHORT} STREQUAL "nightly") 61 | set(OV_PLATFORM "${OV_NIGHTLY_COMMIT}") 62 | else() 63 | set(OV_PLATFORM "linux") 64 | endif() 65 | 66 | set(OV_LIBRARY_URL "${OV_STORAGE_URL}/${OV_VERSION_SHORT}/${OV_PLATFORM}/l_openvino_toolkit_ubuntu${LSB_RELEASE_VERSION_SHORT}_${OV_VERSION}.tgz") 67 | else() 68 | message(FATAL_ERROR "Ubuntu version ${LSB_RELEASE_VERSION} is unsupported") 69 | endif() 70 | else() 71 | message(FATAL_ERROR "Linux distribution ${LSB_RELEASE_ID} is unsupported") 72 | endif() 73 | 74 | endif() 75 | else() 76 | message(FATAL_ERROR "Unsupported architecture") 77 | endif () 78 | 79 | message(STATUS "OpenVINO library URL: ${OV_LIBRARY_URL}") 80 | 81 | FetchContent_Declare( 82 | openvino 83 | URL ${OV_LIBRARY_URL} 84 | ) 85 | FetchContent_MakeAvailable(openvino) 86 | 87 | find_package(OpenVINO REQUIRED PATHS ${openvino_SOURCE_DIR}/runtime/cmake) 88 | 89 | if (WIN32) 90 | set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) 91 | file(GLOB OpenVINObin ${openvino_SOURCE_DIR}/runtime/bin/intel64/Release/*) 92 | file(COPY ${OpenVINObin} DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/Release) 93 | 94 | file(GLOB TBBlib ${openvino_SOURCE_DIR}/runtime/3rdparty/tbb/bin/*) 95 | file(COPY ${TBBlib} DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/Release) 96 | else() 97 | file(GLOB OpenVINObin ${openvino_SOURCE_DIR}/runtime/lib/intel64/*) 98 | file(COPY ${OpenVINObin} DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) 99 | endif() 100 | 101 | if(SETUPTOOL_BUILD) 102 | file(GLOB OpenVINOPython ${openvino_SOURCE_DIR}/python/openvino/*) 103 | file(COPY ${OpenVINOPython} DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/../external/openvino) 104 | endif() 105 | 106 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 107 | 108 | include_directories(include) 109 | 110 | # Create the Python module 111 | add_library(intel_npu_acceleration_library SHARED src/bindings.cpp) 112 | 113 | # Link the OpenVINO libraries 114 | target_link_libraries(intel_npu_acceleration_library PRIVATE openvino::runtime) 115 | if (UNIX) 116 | set_target_properties(intel_npu_acceleration_library PROPERTIES LINK_FLAGS "-Wl,-rpath,./") 117 | endif (UNIX) 118 | 
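The target above builds the C++ core as a shared library (by default `intel_npu_acceleration_library.dll` on Windows and `libintel_npu_acceleration_library.so` on Linux) that the Python package drives through `ctypes`. The snippet below is a minimal sketch of that loading step, not the project's actual implementation: the build-directory layout and the `load_library` helper are assumptions for illustration, and the real lookup and function-signature setup live in `intel_npu_acceleration_library/backend/bindings.py`.

```python
# Hedged sketch: load the shared library produced by the CMake build above.
# The paths and the helper name are illustrative assumptions; the library's own
# loading logic is implemented in intel_npu_acceleration_library/backend/bindings.py.
import ctypes
import sys
from pathlib import Path


def load_library(build_dir: Path) -> ctypes.CDLL:
    """Load the compiled NPU bindings from a CMake build directory."""
    if sys.platform == "win32":
        # On Windows the runtime DLLs are copied next to the Release output (see above).
        lib_path = build_dir / "Release" / "intel_npu_acceleration_library.dll"
    else:
        lib_path = build_dir / "libintel_npu_acceleration_library.so"
    return ctypes.CDLL(str(lib_path))


lib = load_library(Path("build"))
# Exported C functions are then given explicit ctypes signatures (restype/argtypes)
# before use, as done for the real entry points in bindings.py.
```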
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | CommunityCodeOfConduct AT intel DOT com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. 
Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series of 86 | actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or permanent 93 | ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within the 113 | community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.1, available at 119 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 120 | 121 | Community Impact Guidelines were inspired by 122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 126 | [https://www.contributor-covenant.org/translations][translations]. 127 | 128 | [homepage]: https://www.contributor-covenant.org 129 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 130 | [Mozilla CoC]: https://github.com/mozilla/diversity 131 | [FAQ]: https://www.contributor-covenant.org/faq -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ## License 4 | 5 | Intel® NPU Acceleration Library is licensed under the terms in [LICENSE](LICENSE). By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. 6 | 7 | ## Sign your work 8 | 9 | Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. 
The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ```text 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 17 | 660 York Street, Suite 102, 18 | San Francisco, CA 94110 USA 19 | 20 | Everyone is permitted to copy and distribute verbatim copies of this 21 | license document, but changing it is not allowed. 22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 48 | ``` 49 | 50 | Then you just add a line to every git commit message: 51 | 52 | Signed-off-by: Joe Smith 53 | 54 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 55 | 56 | If you set your `user.name` and `user.email` git configs, you can sign your 57 | commit automatically with `git commit -s`. -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. 3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 
-------------------------------------------------------------------------------- /bandit.yaml: -------------------------------------------------------------------------------- 1 | exclude_dirs: ['test', 'docs', '.githooks', 'script', 'examples'] 2 | skips: [] -------------------------------------------------------------------------------- /dev_requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-xdist 3 | pytest-cov 4 | scikit-learn <= 1.5.2 5 | pre-commit; sys_platform == 'darwin' 6 | sphinx 7 | breathe 8 | sphinx-book-theme 9 | myst-parser 10 | ghp-import 11 | pyroma 12 | mypy -------------------------------------------------------------------------------- /docs/Doxyfile: -------------------------------------------------------------------------------- 1 | INPUT = ../include/intel_npu_acceleration_library 2 | FILE_PATTERNS = *.cpp *.h 3 | GENERATE_HTML = NO 4 | GENERATE_LATEX = NO 5 | GENERATE_XML = YES # Important for Breathe 6 | FULL_PATH_NAMES = NO 7 | ENABLE_PREPROCESSING = YES 8 | MACRO_EXPANSION = YES 9 | EXPAND_ONLY_PREDEF = YES 10 | -------------------------------------------------------------------------------- /docs/build_doc.py: -------------------------------------------------------------------------------- 1 | #! python 2 | # 3 | # Copyright © 2024 Intel Corporation 4 | # SPDX-License-Identifier: Apache 2.0 5 | # 6 | 7 | from http.server import HTTPServer, SimpleHTTPRequestHandler 8 | from ghp_import import ghp_import 9 | from typing import List, Union 10 | import subprocess 11 | import argparse 12 | import shutil 13 | import os 14 | 15 | 16 | def define_and_parse_args(): 17 | parser = argparse.ArgumentParser(description="Build documentations") 18 | parser.add_argument( 19 | "action", 20 | type=str, 21 | choices=["build", "serve", "gh-deploy"], 22 | help="Name of the model to export", 23 | ) 24 | 25 | return parser.parse_args() 26 | 27 | 28 | doc_root = os.path.dirname(os.path.abspath(__file__)) 29 | root = os.path.abspath(os.path.join(doc_root, "..")) 30 | doxygen_available = shutil.which("doxygen") is not None 31 | 32 | 33 | def clean_dirs(dir_names: Union[List[str], str]) -> None: 34 | if isinstance(dir_names, str): 35 | dir_names = [dir_names] 36 | for name in dir_names: 37 | xml_dir = os.path.join(doc_root, name) 38 | if os.path.exists(xml_dir) and os.path.isdir(xml_dir): 39 | shutil.rmtree(xml_dir) 40 | 41 | 42 | def build_doc(): 43 | 44 | clean_dirs(["build", "xml"]) 45 | 46 | if not doxygen_available: 47 | raise RuntimeError("Doxygen is needed to build documentation") 48 | 49 | yield subprocess.check_output( 50 | ["doxygen", "Doxyfile"], cwd=doc_root, stderr=subprocess.STDOUT 51 | ).decode() 52 | yield subprocess.check_output( 53 | ["sphinx-apidoc", "-o", "source/python", "../intel_npu_acceleration_library"], 54 | cwd=doc_root, 55 | stderr=subprocess.STDOUT, 56 | ).decode() 57 | yield subprocess.check_output( 58 | ["sphinx-build", "-b", "html", "source", "build"], 59 | cwd=doc_root, 60 | stderr=subprocess.STDOUT, 61 | ).decode() 62 | 63 | clean_dirs("xml") 64 | 65 | 66 | def build(): 67 | for out in build_doc(): 68 | print(out) 69 | 70 | 71 | class Handler(SimpleHTTPRequestHandler): 72 | def __init__(self, *args, **kwargs): 73 | super().__init__(*args, directory="build", **kwargs) 74 | 75 | 76 | def serve(hostname="localhost", port=8000): 77 | build() 78 | server_address = (hostname, port) 79 | httpd = HTTPServer(server_address, Handler) 80 | print(f"Serving at address {hostname}:{port}") 81 | 
httpd.serve_forever() 82 | 83 | 84 | def get_git_sha() -> str: 85 | return ( 86 | subprocess.check_output( 87 | ["git", "rev-parse", "--short", "HEAD"], 88 | cwd=root, 89 | ) 90 | .decode() 91 | .strip() 92 | ) 93 | 94 | 95 | def deploy(): 96 | build() 97 | 98 | message = f"Deployed with sha {get_git_sha()}" 99 | 100 | try: 101 | ghp_import( 102 | os.path.join(doc_root, "build"), 103 | mesg=message, 104 | remote="origin", 105 | branch="gh-pages", 106 | push=True, 107 | force=True, 108 | use_shell=False, 109 | no_history=False, 110 | nojekyll=True, 111 | ) 112 | except ghp_import.GhpError as e: 113 | raise RuntimeError(f"Failed to deploy to GitHub. Error: \n{e.message}") 114 | 115 | 116 | if __name__ == "__main__": 117 | args = define_and_parse_args() 118 | 119 | if args.action == "build": 120 | build() 121 | elif args.action == "serve": 122 | serve() 123 | elif args.action == "gh-deploy": 124 | deploy() 125 | else: 126 | raise RuntimeError(f"Unsuported action: {args.action}") 127 | -------------------------------------------------------------------------------- /docs/source/.gitignore: -------------------------------------------------------------------------------- 1 | !*.png -------------------------------------------------------------------------------- /docs/source/_static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-npu-acceleration-library/073ad6a3a1eb20fdd1ba00d72c7241586372ebee/docs/source/_static/.gitkeep -------------------------------------------------------------------------------- /docs/source/_templates/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-npu-acceleration-library/073ad6a3a1eb20fdd1ba00d72c7241586372ebee/docs/source/_templates/.gitkeep -------------------------------------------------------------------------------- /docs/source/adding_operations.md: -------------------------------------------------------------------------------- 1 | # Adding New Operations in the Library 2 | 3 | This document outlines the process for integrating a new operation into the existing code library. The integration process involves several key steps: defining the operation's interface, implementing the operation ensuring compatibility with the library's architecture, and providing testing to validate the operation. 4 | 5 | An example of implementing new operations can be found here: [Implementing reduce operations](https://github.com/intel/intel-npu-acceleration-library/commit/4f17015a75c146fe8d569ac71a2e2a0a960fc652) 6 | 7 | ## Step 1: Defining the OpenVINO interface 8 | 9 | The first step is defining the call to the OpenVino method of the new operation through the OpenVINO Runtime C++ API. This is done in the `nn_factory.h` header. In this file, a new operation is created by interfacing with the OpenVINO operation. This includes specifying input and output parameters, and data types of the operation's interface and then calling and returning the OpenVINO method. The interface should align with the library's existing design patterns and naming conventions. 
10 | 11 | A simple example of defining a new operation: 12 | ``` 13 | ov::op::Op* new_operation(ov::op::Op* input) { 14 | auto new_operation = std::make_shared(input->output(0)); 15 | operations.push_back(new_operation); 16 | return new_operation.get(); 17 | } 18 | ``` 19 | ## Step 2: Defining the C++ bindings 20 | 21 | The next step is defining the C++ binding in the `binding.cpp` source file. This is the method that will be called in Python. This method has the operation's input node as a parameter and additional arguments of the operation are defined in the method. 22 | 23 | An example of defining the binding: 24 | ``` 25 | intel_npu_acceleration_library_DLL_API ov::op::Op* new_operation(intel_npu_acceleration_library::ModelFactory* factory, ov::op::Op* input) { 26 | return factory->new_operation(input); 27 | } 28 | ``` 29 | 30 | ## Step 3: Adding new operation to list of supported operation 31 | 32 | The new operation is added to the list of supported NPU operations in the `ops.py` script. 33 | The information of the new operation that must be provided is: 34 | - the operation name 35 | - the number of inputs 36 | - the optional parameters types 37 | 38 | ## Step 4: Adding extra functionality to the operation's function 39 | Ctypes is used to interface between C++ and Python. (Documentation is found here: [Python Ctypes](https://docs.python.org/3/library/ctypes.html)) 40 | 41 | If there is additional logic that you may want to add to the function, this can be done by defining a Python function that calls the C++ method in the `factory.py` file. 42 | Otherwise, if you directly call the functions to C++, then you do not need to define a Python function. 43 | 44 | ## Step 5: Adding PyTorch wrapper for the new operation 45 | Additionally, to define a wrapper to use PyTorch native functions, this can be implemented in the `functional.py` file. In this step, a function of the same name as the PyTorch equivalent is created, which is used instead of the PyTorch implementation of the operation. 46 | If there is additional logic that you may want to add to the function to interface with the new operation, it can also be added in this function. 47 | 48 | It is common for the new operation to have the same name as the PyTorch equivalent operation, however this is not always the case and to show which operation we are referring to, we refer to the newly implemented operation as `new_operation` and the PyTorch operation and `operation`. 49 | 50 | The basic structure of PyTorch wrapper for a PyTorch operation, referred to as `torch.operation`, which returns the output of the implemented `new_operation`: 51 | ``` 52 | @implements(torch.operation) 53 | def operation(x: Tensor) -> Tensor: 54 | """Return the output tensor of the operation. 55 | 56 | Args: 57 | x (Tensor): The input tensor. 58 | Returns: 59 | Tensor: Output tensor. 60 | """ 61 | return generate_op(x, "new_operation") 62 | ``` 63 | ## Step 6: Building the library 64 | To update the library, run the command: 65 | ``` 66 | pip install . 67 | ``` 68 | 69 | ## Step 7: Adding tests for the new operation 70 | A test for the new operation can be added in the `test_op.py` script. The new operation should be compared with a reference to ensure correct implementation. 
71 | 72 | The following is a basic structure to use the new operation: 73 | ``` 74 | X = torch.rand((16, 128)).to(torch.float16) # defining the input tensor 75 | 76 | model = NNFactory() 77 | input = model.parameter(X.shape) # creating the input node 78 | _ = model.new_operation(input) # _ = torch.operation(input) is equivalent if using the PyTorch wrapper 79 | model.compile() 80 | out = model.run(X.numpy()) 81 | ``` 82 | 83 | Using pytest to run all of the tests in the file: 84 | ``` 85 | pytest 86 | ``` 87 | 88 | Using pytest to run a single test in the file: 89 | ``` 90 | pytest :: 91 | ``` -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #! python 2 | # 3 | # Copyright © 2024 Intel Corporation 4 | # SPDX-License-Identifier: Apache 2.0 5 | # 6 | 7 | import glob 8 | import os 9 | import sys 10 | 11 | 12 | repo_root = os.path.abspath( 13 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "../..") 14 | ) 15 | sys.path.insert(0, os.path.join(repo_root, "intel_npu_acceleration_library")) 16 | 17 | project = "Intel® NPU Acceleration Library" 18 | copyright = "2024, Intel Corporation" 19 | author = "Intel Corporation" 20 | 21 | # -- General configuration --------------------------------------------------- 22 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 23 | 24 | 25 | templates_path = ["_templates"] 26 | exclude_patterns = [] 27 | 28 | # Add Breathe extension 29 | extensions = [ 30 | # 'sphinx.ext.autodoc', 31 | "sphinx.ext.napoleon", 32 | "breathe", 33 | "myst_parser", 34 | ] 35 | 36 | # autodoc_default_options = { 37 | # 'ignore-module-all': False 38 | # } 39 | 40 | source_suffix = [".rst", ".md"] 41 | 42 | # Breathe Configuration 43 | breathe_default_project = "Intel® NPU Acceleration Library" 44 | breathe_projects = {"Intel® NPU Acceleration Library": "../xml"} 45 | 46 | # -- Options for HTML output ------------------------------------------------- 47 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 48 | 49 | html_theme = "sphinx_book_theme" 50 | html_static_path = ["_static"] 51 | -------------------------------------------------------------------------------- /docs/source/cpp_reference.rst: -------------------------------------------------------------------------------- 1 | C++ API Reference 2 | ================= 3 | 4 | .. doxygenindex:: 5 | :project: Intel® NPU Acceleration Library 6 | -------------------------------------------------------------------------------- /docs/source/developer.md: -------------------------------------------------------------------------------- 1 | # Developer Guide 2 | 3 | Install developer packages by typing 4 | 5 | ```bash 6 | pip install .[dev] 7 | ``` 8 | 9 | It is suggested to install the package locally by using `pip install -e .[dev]` 10 | 11 | ## Git hooks 12 | 13 | All developers should install the git hooks that are tracked in the `.githooks` directory. We use the pre-commit framework for hook management. The recommended way of installing it is using pip: 14 | 15 | ```bash 16 | pre-commit install 17 | ``` 18 | 19 | If you want to manually run all pre-commit hooks on a repository, run `pre-commit run --all-files`. To run individual hooks use `pre-commit run `. 
20 | 21 | Uninstalling the hooks can be done using 22 | 23 | ```bash 24 | pre-commit uninstall 25 | ``` 26 | 27 | ## Testing the library 28 | 29 | ### Python test 30 | 31 | Python test uses `pytest` library. Type 32 | 33 | ```bash 34 | cd test/python && pytest 35 | ``` 36 | 37 | to run the full test suite. 38 | 39 | ## Build the documentation 40 | 41 | This project uses `sphinx` to build and deploy the documentation. To serve locally the documentation type 42 | 43 | ```bash 44 | mkdocs serve 45 | ``` 46 | 47 | to deploy it into github pages type 48 | 49 | ```bash 50 | cd docs 51 | python build_doc.py gh-deploy 52 | ``` 53 | 54 | ## Generate python packages 55 | 56 | On windows: 57 | 58 | ```bat 59 | python setup.py sdist 60 | set CIBW_BUILD=cp* 61 | cibuildwheel --platform windows --output-dir dist 62 | ``` 63 | 64 | 65 | ## Publishing packets 66 | 67 | Install twine 68 | ```bat 69 | python3 -m pip install --upgrade twine 70 | ``` 71 | 72 | Then check on the built sdist and wheel that are properly formatted (all files should return a green `PASSED`) 73 | 74 | ```bat 75 | twine check dist/* 76 | ``` 77 | 78 | Upload the packets to `testpypi` 79 | 80 | ```bat 81 | twine upload --repository testpypi dist/* 82 | ``` 83 | 84 | To upload them to the real index (**verify first with testpypi**) 85 | ```bat 86 | twine upload dist/* 87 | ``` -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Intel® NPU Acceleration Library documentation master file, created by 2 | sphinx-quickstart on Wed Feb 7 11:48:32 2024. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Intel® NPU Acceleration Library's documentation! 7 | ===================================== 8 | 9 | The Intel® NPU Acceleration Library is a Python library designed to boost the efficiency of your applications by leveraging the power of the Intel Neural Processing Unit (NPU) to perform high-speed computations on compatible hardware. 10 | 11 | Installation 12 | ------------- 13 | 14 | Check that your system has an available NPU (`how-to `_). 15 | 16 | You can install the packet in your machine with 17 | 18 | .. code-block:: bash 19 | 20 | pip install intel-npu-acceleration-library 21 | 22 | 23 | Run a LLaMA model on the NPU 24 | ---------------------------- 25 | 26 | To run LLM models you need to install the `transformers` library 27 | 28 | 29 | .. code-block:: bash 30 | 31 | pip install transformers 32 | 33 | You are now up and running! You can create a simple script like the following one to run a LLM on the NPU 34 | 35 | 36 | .. 
code-block:: python 37 | :emphasize-lines: 2, 7 38 | 39 | from transformers import AutoTokenizer, TextStreamer 40 | from intel_npu_acceleration_library import NPUModelForCausalLM 41 | import torch 42 | 43 | model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 44 | 45 | model = NPUModelForCausalLM.from_pretrained(model_id, use_cache=True, dtype=torch.int8).eval() 46 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) 47 | tokenizer.pad_token_id = tokenizer.eos_token_id 48 | streamer = TextStreamer(tokenizer, skip_special_tokens=True) 49 | 50 | query = input("Ask something: ") 51 | prefix = tokenizer(query, return_tensors="pt")["input_ids"] 52 | 53 | generation_kwargs = dict( 54 | input_ids=prefix, 55 | streamer=streamer, 56 | do_sample=True, 57 | top_k=50, 58 | top_p=0.9, 59 | max_new_tokens=512, 60 | ) 61 | 62 | print("Run inference") 63 | _ = model.generate(**generation_kwargs) 64 | 65 | 66 | Take note that you only need to use `intel_npu_acceleration_library.compile` to offload the heavy computation to the NPU. 67 | 68 | Feel free to check `Usage `_ and `LLM `_ and the `examples `_ folder for additional use-cases and examples. 69 | 70 | 71 | 72 | Site map 73 | ---------------------------- 74 | 75 | .. toctree:: 76 | Quickstart 77 | NPU overview 78 | usage.md 79 | setup.md 80 | :maxdepth: 1 81 | :caption: Library overview: 82 | 83 | 84 | .. toctree:: 85 | llm.md 86 | llm_performance.md 87 | :maxdepth: 1 88 | :caption: Applications: 89 | 90 | 91 | 92 | .. toctree:: 93 | developer.md 94 | adding_operations.md 95 | :maxdepth: 1 96 | :caption: Developements guide: 97 | 98 | 99 | 100 | .. toctree:: 101 | Python API Reference 102 | cpp_reference.rst 103 | :maxdepth: 1 104 | :caption: API Reference: 105 | 106 | 107 | 108 | 109 | Indices and tables 110 | ================== 111 | 112 | * :ref:`genindex` 113 | * :ref:`modindex` 114 | * :ref:`search` 115 | -------------------------------------------------------------------------------- /docs/source/llm.md: -------------------------------------------------------------------------------- 1 | # Large Language models 2 | 3 | 4 | ## Run an LLM on the NPU 5 | 6 | You can use your existing LLM inference script on the NPU with a simple line of code 7 | 8 | ```python 9 | # First import the library 10 | import intel_npu_acceleration_library 11 | 12 | # Call the compile function to offload kernels to the NPU. 13 | model = intel_npu_acceleration_library.compile(model) 14 | ``` 15 | 16 | Here a full example: 17 | 18 | ```python 19 | from torch.profiler import profile, ProfilerActivity 20 | from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM 21 | from threading import Thread 22 | import intel_npu_acceleration_library 23 | import torch 24 | import time 25 | import sys 26 | 27 | model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 28 | 29 | model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=True).eval() 30 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) 31 | tokenizer.pad_token_id = tokenizer.eos_token_id 32 | streamer = TextStreamer(tokenizer, skip_special_tokens=True) 33 | 34 | 35 | print("Compile model for the NPU") 36 | model = intel_npu_acceleration_library.compile(model) 37 | 38 | query = "What is the meaning of life?" 
39 | prefix = tokenizer(query, return_tensors="pt")["input_ids"] 40 | 41 | 42 | generation_kwargs = dict( 43 | input_ids=prefix, 44 | streamer=streamer, 45 | do_sample=True, 46 | top_k=50, 47 | top_p=0.9, 48 | ) 49 | 50 | print("Run inference") 51 | _ = model.generate(**generation_kwargs) 52 | 53 | ``` 54 | -------------------------------------------------------------------------------- /docs/source/llm_perf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-npu-acceleration-library/073ad6a3a1eb20fdd1ba00d72c7241586372ebee/docs/source/llm_perf.png -------------------------------------------------------------------------------- /docs/source/npu.md: -------------------------------------------------------------------------------- 1 | # Quick overview of Intel's Neural Processing Unit (NPU) 2 | 3 | The Intel NPU is an AI accelerator integrated into Intel Core Ultra processors, characterized by a unique architecture comprising compute acceleration and data transfer capabilities. Its compute acceleration is facilitated by Neural Compute Engines, which consist of hardware acceleration blocks for AI operations like Matrix Multiplication and Convolution, alongside Streaming Hybrid Architecture Vector Engines for general computing tasks. 4 | 5 | ![Intel NPU architecture](npu_arch.png) 6 | 7 | - **Scalable Multi-Tile Design:** The heart of the NPU's compute acceleration capability lies in its scalable tiled based architecture known as Neural Compute Engines. 8 | - **Hardware Acceleration Blocks:** These engines are equipped with specific hardware blocks designed to handle AI operations that demand high levels of computation, such as Matrix Multiplication and Convolution. 9 | - **Streaming Hybrid Architecture:** Alongside the dedicated AI operation units, the Neural Compute Engines are built with Streaming Hybrid Architecture Vector Engines (SHAVE). This enables them to perform high-performance parallel computing for general compute needs. 10 | - **DMA Engines:** Direct Memory Access (DMA) engines are integral to the NPU, responsible for moving data efficiently between the system memory DRAM and the software-managed cache. 11 | - **Memory Management:** The incorporation of a built-in device MMU, alongside an IOMMU, allows support for multiple concurrent hardware contexts. This is crucial for maintaining security isolation between these contexts in line with the Microsoft Compute Driver Model (MCDM) architectural standards. 12 | 13 | ## The Role of Software 14 | 15 | While the hardware is undoubtedly advanced, the true "magic" of the Intel NPU is realized through a sophisticated MLIR based compiler. It is through compiler technology that Intel's NPU reaches its full potential by optimizing and orchestrating AI workloads. 16 | 17 | - **Parallel Workload Execution:** The compiler ensures that AI tasks are executed in parallel, directing both compute and data flows in a tiling pattern with built-in and programmable control flows. 18 | - **Maximizing Compute Utilization:** By prioritizing execution primarily out of scratchpad SRAM and reducing the data transfers between SRAM and DRAM, the compiler helps in achieving optimum performance-to-power ratios for AI workloads. 
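In practice this compiler stack is reached from Python through the library's `compile` entry point, which takes a PyTorch module and offloads the supported kernels to the NPU. The snippet below is a minimal, hedged sketch: the toy model and the tensor shapes are placeholders, while `intel_npu_acceleration_library.compile` is the same call used in the LLM example above and throughout the `examples` folder.

```python
# Minimal sketch: hand a small PyTorch module to the NPU compiler stack described above.
# The Sequential model and the shapes are illustrative placeholders.
import torch
import intel_npu_acceleration_library

model = torch.nn.Sequential(
    torch.nn.Linear(256, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 256),
).eval()

# compile() offloads the supported layers to the NPU; the MLIR-based compiler handles
# the tiling, scheduling and SRAM/DRAM data movement described above.
npu_model = intel_npu_acceleration_library.compile(model)

with torch.no_grad():
    out = npu_model(torch.rand(8, 256, dtype=torch.float16))
print(out.shape)
```

See `usage.md` and the `examples` folder for more complete variants, including quantization and the `torch.compile` NPU backend.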
19 | 20 | Some useful links 21 | 22 | - Intel AI PC ([link](https://www.intel.com/content/www/us/en/products/docs/processors/core-ultra/ai-pc.html?wapkw=NPU)) 23 | - Intel Core Ultra Processor line ([link](https://www.intel.com/content/www/us/en/products/docs/processors/core-ultra/core-ultra-series-1-product-brief.html?wapkw=NPU)) 24 | - AI Acceleration and NPU explained ([video](https://www.youtube.com/watch?v=QSzNoX0qplE)) 25 | -------------------------------------------------------------------------------- /docs/source/npu_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-npu-acceleration-library/073ad6a3a1eb20fdd1ba00d72c7241586372ebee/docs/source/npu_arch.png -------------------------------------------------------------------------------- /docs/source/python/intel_npu_acceleration_library.backend.rst: -------------------------------------------------------------------------------- 1 | intel\_npu\_acceleration\_library.backend package 2 | ========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | intel\_npu\_acceleration\_library.backend.base module 8 | ------------------------------ 9 | 10 | .. automodule:: intel_npu_acceleration_library.backend.base 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | intel\_npu\_acceleration\_library.backend.factory module 16 | --------------------------------- 17 | 18 | .. automodule:: intel_npu_acceleration_library.backend.factory 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | intel\_npu\_acceleration\_library.backend.linear module 24 | -------------------------------- 25 | 26 | .. automodule:: intel_npu_acceleration_library.backend.linear 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | intel\_npu\_acceleration\_library.backend.matmul module 32 | -------------------------------- 33 | 34 | .. automodule:: intel_npu_acceleration_library.backend.matmul 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | intel\_npu\_acceleration\_library.backend.mlp module 40 | ----------------------------- 41 | 42 | .. automodule:: intel_npu_acceleration_library.backend.mlp 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | intel\_npu\_acceleration\_library.backend.qlinear module 48 | --------------------------------- 49 | 50 | .. automodule:: intel_npu_acceleration_library.backend.qlinear 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | intel\_npu\_acceleration\_library.backend.qmatmul module 56 | --------------------------------- 57 | 58 | .. automodule:: intel_npu_acceleration_library.backend.qmatmul 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | intel\_npu\_acceleration\_library.backend.runtime module 64 | --------------------------------- 65 | 66 | .. automodule:: intel_npu_acceleration_library.backend.runtime 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | Module contents 72 | --------------- 73 | 74 | .. 
automodule:: intel_npu_acceleration_library.backend 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | -------------------------------------------------------------------------------- /docs/source/python/intel_npu_acceleration_library.functional.rst: -------------------------------------------------------------------------------- 1 | intel\_npu\_acceleration\_library.functional package 2 | ==================================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | intel\_npu\_acceleration\_library.functional.scaled\_dot\_product\_attention module 8 | ----------------------------------------------------------------------------------- 9 | 10 | .. automodule:: intel_npu_acceleration_library.functional.scaled_dot_product_attention 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: intel_npu_acceleration_library.functional 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/python/intel_npu_acceleration_library.nn.rst: -------------------------------------------------------------------------------- 1 | intel\_npu\_acceleration\_library.nn package 2 | ===================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | intel\_npu\_acceleration\_library.nn.autograd module 8 | ----------------------------- 9 | 10 | .. automodule:: intel_npu_acceleration_library.nn.autograd 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | intel\_npu\_acceleration\_library.nn.linear module 16 | --------------------------- 17 | 18 | .. automodule:: intel_npu_acceleration_library.nn.linear 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | intel\_npu\_acceleration\_library.nn.llm module 24 | ------------------------ 25 | 26 | .. automodule:: intel_npu_acceleration_library.nn.llm 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. automodule:: intel_npu_acceleration_library.nn 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /docs/source/python/intel_npu_acceleration_library.rst: -------------------------------------------------------------------------------- 1 | intel\_npu\_acceleration\_library package 2 | ================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | intel_npu_acceleration_library.backend 11 | intel_npu_acceleration_library.nn 12 | intel_npu_acceleration_library.functional 13 | 14 | Submodules 15 | ---------- 16 | 17 | intel\_npu\_acceleration\_library.bindings module 18 | -------------------------- 19 | 20 | .. automodule:: intel_npu_acceleration_library.bindings 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | intel\_npu\_acceleration\_library.compiler module 26 | -------------------------- 27 | 28 | .. automodule:: intel_npu_acceleration_library.compiler 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | intel\_npu\_acceleration\_library.optimizations module 34 | ------------------------------- 35 | 36 | .. automodule:: intel_npu_acceleration_library.optimizations 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | 41 | intel\_npu\_acceleration\_library.quantization module 42 | ------------------------------ 43 | 44 | .. 
automodule:: intel_npu_acceleration_library.quantization 45 | :members: 46 | :undoc-members: 47 | :show-inheritance: 48 | 49 | Module contents 50 | --------------- 51 | 52 | .. automodule:: intel_npu_acceleration_library 53 | :members: 54 | :undoc-members: 55 | :show-inheritance: 56 | -------------------------------------------------------------------------------- /docs/source/python/modules.rst: -------------------------------------------------------------------------------- 1 | intel_npu_acceleration_library 2 | ========= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | intel_npu_acceleration_library 8 | -------------------------------------------------------------------------------- /docs/source/setup.md: -------------------------------------------------------------------------------- 1 | # Advanced Setup 2 | 3 | You can install the package by typing 4 | 5 | ```bash 6 | pip install "intel-npu-acceleration-library @ git+https://github.com/intel/intel-npu-acceleration-library.git" 7 | ``` 8 | 9 | To build the package you need a compiler in your system (Visual Studio 2019 suggested for Windows build). MacOS is not yet supported. 10 | 11 | For development packages use (after cloning the repo) 12 | 13 | ```bash 14 | pip install .[dev] 15 | ``` 16 | -------------------------------------------------------------------------------- /docs/source/usage.md: -------------------------------------------------------------------------------- 1 | # Basic usage 2 | 3 | For implemented examples, please check the `examples` folder 4 | 5 | ## Run a single MatMul in the NPU 6 | 7 | ```python 8 | from intel_npu_acceleration_library.backend import MatMul 9 | import numpy as np 10 | 11 | inC, outC, batch = ... # Define your own values 12 | 13 | # Create both inputs 14 | X1 = np.random.uniform(-1, 1, (batch, inC)).astype(np.float16) 15 | X2 = np.random.uniform(-1, 1, (outC, inC)).astype(np.float16) 16 | 17 | mm = MatMul(inC, outC, batch, profile=False) 18 | 19 | result = mm.run(X1, X2) 20 | 21 | ``` 22 | 23 | ## Compile a model for the NPU 24 | 25 | If you have `pytorch`>=2.0.0 installed you can use torch compile to optimize your model for the NPU 26 | 27 | ```python 28 | import intel_npu_acceleration_library 29 | import torch 30 | 31 | # Compile model for the NPU 32 | # model a torch.nn.Module class. Model can be quantized JIT 33 | optimized_model = torch.compile(model, backend="npu") 34 | 35 | # Use the model as usual 36 | 37 | ``` 38 | 39 | In windows torch.compile is not supported yet. So you might want to use the explicit function `intel_npu_acceleration_library.compile`. This is true also if you use a `pytorch` version < 2.0.0 40 | 41 | To do this, you just need to call the `compile` function with your model and the compiler configuration `CompilerConfig` to compile and optimize the model for the NPU. 42 | ```python 43 | import intel_npu_acceleration_library 44 | from intel_npu_acceleration_library.compiler import CompilerConfig 45 | compiler_conf = CompilerConfig(dtype=torch.int8) 46 | optimized_model = intel_npu_acceleration_library.compile(model, compiler_conf) 47 | 48 | # Use the model as usual 49 | 50 | ``` 51 | 52 | To compile and optimize a single layer of a model to be pushed to the NPU as one block, you can set `use_to=True` in the the compiler configuration `CompilerConfig`. 
53 | ```python 54 | import intel_npu_acceleration_library 55 | from intel_npu_acceleration_library.compiler import CompilerConfig 56 | compiler_conf = CompilerConfig(use_to=True, dtype=torch.int8) 57 | optimized_block = intel_npu_acceleration_library.compile(single_block, compiler_conf) 58 | 59 | ``` 60 | 61 | ## Training (**Experimental!**) 62 | 63 | It is possible to use the Intel® NPU Acceleration Library to train a model. As before, you just need to call the `compile` function, this time with `training=True`. This allows you to use the same training script you use on other devices with very minimal modifications. 64 | 65 | ```python 66 | import intel_npu_acceleration_library 67 | from intel_npu_acceleration_library.compiler import CompilerConfig 68 | compiler_conf = CompilerConfig(dtype=torch.float32, training=True) 69 | compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf) 70 | ``` 71 | -------------------------------------------------------------------------------- /examples/Audio-Spectrogram-Transformer.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | 7 | import sys 8 | import subprocess 9 | import pkg_resources 10 | 11 | required = {"librosa", "soundfile", "datasets", "intel-npu-acceleration-library"} 12 | installed = {pkg.key for pkg in pkg_resources.working_set} 13 | missing = required - installed 14 | 15 | if missing: 16 | # install the missing packages with pip in a subprocess: 17 | subprocess.check_call([sys.executable, "-m", "pip", "install", *missing]) 18 | from transformers import AutoFeatureExtractor, ASTForAudioClassification 19 | from datasets import load_dataset 20 | import torch 21 | import intel_npu_acceleration_library 22 | 23 | dataset = load_dataset( 24 | "hf-internal-testing/librispeech_asr_demo", 25 | "clean", 26 | split="validation", 27 | trust_remote_code=True, 28 | ) 29 | dataset = dataset.sort("id") 30 | sampling_rate = dataset.features["audio"].sampling_rate 31 | 32 | feature_extractor = AutoFeatureExtractor.from_pretrained( 33 | "MIT/ast-finetuned-audioset-10-10-0.4593" 34 | ) 35 | model = ASTForAudioClassification.from_pretrained( 36 | "MIT/ast-finetuned-audioset-10-10-0.4593" 37 | ) 38 | print("Compile model for the NPU") 39 | model = intel_npu_acceleration_library.compile(model) 40 | 41 | # audio file is decoded on the fly 42 | inputs = feature_extractor( 43 | dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt" 44 | ) 45 | 46 | with torch.no_grad(): 47 | logits = model(**inputs).logits 48 | 49 | predicted_class_ids = torch.argmax(logits, dim=-1).item() 50 | predicted_label = model.config.id2label[predicted_class_ids] 51 | print(predicted_label) 52 | 53 | # compute loss - target_label is e.g.
"down" 54 | target_label = model.config.id2label[0] 55 | inputs["labels"] = torch.tensor([model.config.label2id[target_label]]) 56 | loss = model(**inputs).loss 57 | print(round(loss.item(), 2)) 58 | -------------------------------------------------------------------------------- /examples/compile_model.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | 7 | from intel_npu_acceleration_library import compile 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | from sklearn.metrics import r2_score 10 | import intel_npu_acceleration_library 11 | import pytest 12 | import torch 13 | import sys 14 | 15 | # Define a 16 | class NN(torch.nn.Module): 17 | def __init__(self, hidden_dim: int, intermediate_dim: int) -> None: 18 | super().__init__() 19 | self.l1 = torch.nn.Linear(hidden_dim, intermediate_dim) 20 | self.l2 = torch.nn.Linear(intermediate_dim, hidden_dim) 21 | self.relu = torch.nn.functional.relu 22 | 23 | def forward(self, x): 24 | return self.relu(self.l2(self.relu(self.l1(x)))) 25 | 26 | 27 | if __name__ == "__main__": 28 | 29 | # Define a NN module 30 | model = NN(32, 128) 31 | # Generate the input 32 | x = torch.rand((16, 32), dtype=torch.float16) - 0.5 33 | 34 | # Get the reference output 35 | with torch.no_grad(): 36 | y_ref = model(x.to(torch.float32)) 37 | 38 | # Compile the model 39 | print("Compile the model for the NPU") 40 | if sys.platform == "win32": 41 | # Windows do not support torch.compile 42 | print( 43 | "Windows do not support torch.compile, fallback to intel_npu_acceleration_library.compile" 44 | ) 45 | compiler_conf = CompilerConfig() 46 | compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf) 47 | else: 48 | compiled_model = torch.compile(model, backend="npu") 49 | 50 | # Get the NPU output 51 | with torch.no_grad(): 52 | y = compiled_model(x) 53 | 54 | print(f"Reference vs actual R2 score: {r2_score(y_ref.numpy(), y.numpy()):.2f}") 55 | -------------------------------------------------------------------------------- /examples/cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | cmake_minimum_required(VERSION 3.16) 7 | include(FetchContent) 8 | 9 | project(intel_npu_acceleration_library_example) 10 | 11 | set(CMAKE_CXX_STANDARD 14) 12 | 13 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU") 14 | add_compile_options(-march=native) 15 | endif() 16 | 17 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 18 | 19 | 20 | FetchContent_Declare( 21 | intel_npu_acceleration_library 22 | GIT_REPOSITORY "https://github.com/intel/intel-npu-acceleration-library" 23 | GIT_TAG "main" 24 | ) 25 | FetchContent_MakeAvailable(intel_npu_acceleration_library) 26 | 27 | 28 | find_package(OpenVINO REQUIRED PATHS ${openvino_SOURCE_DIR}/runtime/cmake) 29 | 30 | include_directories(${intel_npu_acceleration_library_SOURCE_DIR}/include) 31 | 32 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 33 | 34 | add_executable(intel_npu_acceleration_library_example main.cpp) 35 | 36 | target_link_libraries(intel_npu_acceleration_library_example PRIVATE openvino::runtime) 37 | -------------------------------------------------------------------------------- /examples/cpp/README.md: 
-------------------------------------------------------------------------------- 1 | 2 | # Create a custom C++ application using Intel NPU acceleration Library 3 | 4 | The example demonstrates how to create a custom C++ application using the Intel NPU acceleration Library. It showcases the usage of the library's features and functionalities for accelerating neural network inference on Intel NPUs. The provided code snippet shows the build process using CMake, where the project is configured and built in the Release configuration. 5 | 6 | ## Build 7 | 8 | To build the custom C++ application using the Intel NPU acceleration Library, follow these steps: 9 | 10 | 1. Run the following commands to configure and build the project in the Release configuration: 11 | ``` 12 | cmake -S . -B build 13 | cmake --build build --config Release 14 | ``` 15 | 2. Once the build process is complete, you can find the executable file at `build\Release\intel_npu_acceleration_library_example.exe` (on windows) 16 | 17 | Make sure you have the necessary dependencies, compiler and libraries installed before building the application. 18 | 19 | -------------------------------------------------------------------------------- /examples/cpp/main.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright © 2024 Intel Corporation 3 | // SPDX-License-Identifier: Apache 2.0 4 | // 5 | 6 | #include "intel_npu_acceleration_library/nn_factory.h" 7 | 8 | using namespace intel_npu_acceleration_library; 9 | #include 10 | 11 | int main() { 12 | const size_t batch = 128, inC = 256, outC = 512, N = 100000; 13 | 14 | std::cout << "Create a ModelFactory" << std::endl; 15 | auto factory = std::make_shared("NPU"); 16 | 17 | // create parameter 18 | auto input = factory->parameter({batch, inC}, ov::element::f16); 19 | auto weights = factory->parameter({outC, inC}, ov::element::f16); 20 | auto bias = factory->parameter({1, outC}, ov::element::f16); 21 | 22 | // create matmul 23 | auto matmul = factory->matmul(input, weights); 24 | auto matmul_bias = factory->eltwise_add(matmul, bias); 25 | factory->result(matmul_bias); 26 | 27 | // Compile the model 28 | factory->compile(); 29 | 30 | // Save OV model 31 | std::cout << "Saving model to matmul.xml" << std::endl; 32 | factory->saveModel("matmul.xml"); 33 | 34 | // Here you can create float16 buffers and run inference by using 35 | half_ptr input_buffer = new uint16_t[batch * inC]; 36 | half_ptr weights_buffer = new uint16_t[outC * inC]; 37 | half_ptr bias_buffer = new uint16_t[outC]; 38 | half_ptr output_buffer = new uint16_t[batch * outC]; 39 | 40 | memset(input_buffer, 0, batch * inC * sizeof(uint16_t)); 41 | memset(weights_buffer, 0, outC * inC * sizeof(uint16_t)); 42 | memset(output_buffer, 0, batch * outC * sizeof(uint16_t)); 43 | memset(bias_buffer, 0, outC * sizeof(uint16_t)); 44 | 45 | factory->setInputTensor(input_buffer, 0); 46 | factory->setInputTensor(weights_buffer, 1); 47 | factory->setInputTensor(bias_buffer, 2); 48 | factory->setOutputTensor(output_buffer, 0); 49 | 50 | // Run inference 51 | std::cout << "Run inference on " << N << " workloads" << std::endl; 52 | for (auto idx = 0; idx < N; idx++) 53 | factory->run(); 54 | std::cout << "Inference done" << std::endl; 55 | 56 | delete[] input_buffer; 57 | delete[] weights_buffer; 58 | delete[] bias_buffer; 59 | delete[] output_buffer; 60 | return 0; 61 | } -------------------------------------------------------------------------------- /examples/llama.py: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers import AutoTokenizer, TextStreamer 7 | from intel_npu_acceleration_library import NPUModelForCausalLM, int4 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | 10 | model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 11 | 12 | compiler_conf = CompilerConfig(dtype=int4) 13 | model = NPUModelForCausalLM.from_pretrained( 14 | model_id, use_cache=True, config=compiler_conf, attn_implementation="sdpa" 15 | ).eval() 16 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) 17 | tokenizer.pad_token_id = tokenizer.eos_token_id 18 | streamer = TextStreamer(tokenizer, skip_special_tokens=True) 19 | 20 | 21 | query = input("Ask something: ") 22 | prefix = tokenizer(query, return_tensors="pt")["input_ids"] 23 | 24 | 25 | generation_kwargs = dict( 26 | input_ids=prefix, 27 | streamer=streamer, 28 | do_sample=True, 29 | top_k=50, 30 | top_p=0.9, 31 | max_new_tokens=512, 32 | ) 33 | 34 | print("Run inference") 35 | _ = model.generate(**generation_kwargs) 36 | -------------------------------------------------------------------------------- /examples/llama3.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers import AutoTokenizer, TextStreamer 7 | from intel_npu_acceleration_library import NPUModelForCausalLM, int4 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | 10 | model_id = "meta-llama/Meta-Llama-3-8B-Instruct" 11 | 12 | compiler_conf = CompilerConfig(dtype=int4) 13 | model = NPUModelForCausalLM.from_pretrained( 14 | model_id, use_cache=True, config=compiler_conf 15 | ).eval() 16 | tokenizer = AutoTokenizer.from_pretrained(model_id) 17 | streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True) 18 | 19 | print("Run inference with Llama3 on NPU\n") 20 | 21 | 22 | query = input(">") 23 | 24 | 25 | messages = [ 26 | { 27 | "role": "system", 28 | "content": "You are an helpful chatbot that can provide information about the Intel NPU", 29 | }, 30 | {"role": "user", "content": query}, 31 | ] 32 | 33 | input_ids = tokenizer.apply_chat_template( 34 | messages, add_generation_prompt=True, return_tensors="pt" 35 | ).to(model.device) 36 | 37 | terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")] 38 | 39 | 40 | outputs = model.generate( 41 | input_ids, 42 | max_new_tokens=256, 43 | eos_token_id=terminators, 44 | do_sample=True, 45 | streamer=streamer, 46 | ) 47 | -------------------------------------------------------------------------------- /examples/llava.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | import requests 7 | from PIL import Image 8 | from transformers import ( 9 | LlavaForConditionalGeneration, 10 | AutoTokenizer, 11 | CLIPImageProcessor, 12 | TextStreamer, 13 | ) 14 | from transformers.feature_extraction_utils import BatchFeature 15 | from intel_npu_acceleration_library.compiler import CompilerConfig 16 | import intel_npu_acceleration_library 17 | import torch 18 | 19 | 20 | checkpoint = "Intel/llava-gemma-2b" 21 | 22 | # Load model 23 | model = 
LlavaForConditionalGeneration.from_pretrained(checkpoint) 24 | 25 | compiler_conf = CompilerConfig() 26 | model = intel_npu_acceleration_library.compile(model, compiler_conf) 27 | 28 | image_processor = CLIPImageProcessor.from_pretrained(checkpoint) 29 | tokenizer = AutoTokenizer.from_pretrained(checkpoint) 30 | 31 | streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True) 32 | 33 | # Prepare inputs 34 | # Use gemma chat template 35 | prompt = tokenizer.apply_chat_template( 36 | [{"role": "user", "content": "\nWhat's the content of the image?"}], 37 | tokenize=False, 38 | add_generation_prompt=True, 39 | ) 40 | text_inputs = tokenizer(prompt, return_tensors="pt") 41 | 42 | # clean the console 43 | print("\033[H\033[J") 44 | print("LLaVA Gemma Chatbot\n") 45 | print("Please provide an image URL to generate a response.\n") 46 | url = input("Image URL: ") 47 | 48 | print("Description: ", end="", flush=True) 49 | # url = "https://www.ilankelman.org/stopsigns/australia.jpg" 50 | image = Image.open(requests.get(url, stream=True).raw) 51 | 52 | pixel_values = image_processor(image, return_tensors="pt")["pixel_values"] 53 | 54 | inputs = BatchFeature(data={**text_inputs, "pixel_values": pixel_values}) 55 | 56 | # Generate 57 | model.generate(**inputs, max_new_tokens=150, streamer=streamer) 58 | -------------------------------------------------------------------------------- /examples/matmul.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend import MatMul 7 | import numpy as np 8 | 9 | 10 | def run_matmul(inC, outC, batch): 11 | 12 | # Create both inputs 13 | X1 = np.random.uniform(-1, 1, (batch, inC)).astype(np.float16) 14 | X2 = np.random.uniform(-1, 1, (outC, inC)).astype(np.float16) 15 | 16 | mm = MatMul(inC, outC, batch, profile=False) 17 | 18 | return mm.run(X1, X2) 19 | 20 | 21 | if __name__ == "__main__": 22 | result = run_matmul(128, 128, 32) 23 | print(result) 24 | -------------------------------------------------------------------------------- /examples/phi-2.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from langchain.prompts import PromptTemplate 7 | from langchain.chains import LLMChain 8 | from langchain.llms import HuggingFacePipeline 9 | from transformers import AutoTokenizer, pipeline, TextStreamer 10 | from intel_npu_acceleration_library.compiler import CompilerConfig 11 | import intel_npu_acceleration_library as npu_lib 12 | 13 | model_id = "microsoft/Phi-2" 14 | 15 | compiler_conf = CompilerConfig(dtype=npu_lib.int4) 16 | model = npu_lib.NPUModelForCausalLM.from_pretrained( 17 | model_id, use_cache=True, config=compiler_conf 18 | ).eval() 19 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) 20 | streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True) 21 | 22 | pipe = pipeline( 23 | "text-generation", 24 | model=model, 25 | tokenizer=tokenizer, 26 | max_length=256, 27 | temperature=0.9, 28 | top_p=0.95, 29 | repetition_penalty=1.2, 30 | streamer=streamer, 31 | ) 32 | 33 | local_llm = HuggingFacePipeline(pipeline=pipe) 34 | pipe.model.config.pad_token_id = pipe.model.config.eos_token_id 35 | 36 | 37 | template = """Question: {question} 38 | 39 | Answer: """ 40 | 41 | prompt = 
PromptTemplate(template=template, input_variables=["question"]) 42 | 43 | llm_chain = LLMChain(prompt=prompt, llm=local_llm) 44 | 45 | question = "What's the distance between the Earth and the Moon?" 46 | 47 | llm_chain.run(question) 48 | -------------------------------------------------------------------------------- /examples/phi-3.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | import torch 7 | from transformers import AutoTokenizer, pipeline, TextStreamer 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | import intel_npu_acceleration_library as npu_lib 10 | import warnings 11 | 12 | torch.random.manual_seed(0) 13 | 14 | compiler_conf = CompilerConfig(dtype=npu_lib.int4) 15 | model = npu_lib.NPUModelForCausalLM.from_pretrained( 16 | "microsoft/Phi-3-mini-4k-instruct", 17 | config=compiler_conf, 18 | torch_dtype="auto", 19 | ) 20 | 21 | tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") 22 | streamer = TextStreamer(tokenizer, skip_prompt=True) 23 | 24 | messages = [ 25 | { 26 | "role": "system", 27 | "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.", 28 | }, 29 | { 30 | "role": "user", 31 | "content": "Can you provide ways to eat combinations of bananas and dragonfruits?", 32 | }, 33 | ] 34 | 35 | pipe = pipeline( 36 | "text-generation", 37 | model=model, 38 | tokenizer=tokenizer, 39 | ) 40 | 41 | generation_args = { 42 | "max_new_tokens": 500, 43 | "return_full_text": False, 44 | "temperature": 0.7, 45 | "do_sample": True, 46 | "streamer": streamer, 47 | } 48 | 49 | with warnings.catch_warnings(): 50 | warnings.simplefilter("ignore") 51 | pipe(messages, **generation_args) 52 | -------------------------------------------------------------------------------- /examples/qwen2_math_7b.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers import AutoTokenizer, TextStreamer 7 | from intel_npu_acceleration_library import NPUModelForCausalLM, int8 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | import time 10 | 11 | model_id = "Qwen/Qwen2-Math-7B-Instruct" 12 | 13 | compiler_conf = CompilerConfig(dtype=int8) 14 | model = NPUModelForCausalLM.from_pretrained( 15 | model_id, use_cache=True, config=compiler_conf 16 | ).eval() 17 | tokenizer = AutoTokenizer.from_pretrained(model_id) 18 | streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True) 19 | 20 | print("Run inference with Qwen2-Math-7B on NPU\n") 21 | 22 | # sample query: Find the value of $x$ that satisfies the equation $4x+5 = 6x+7$. 
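# For reference, the algebra the model is expected to reproduce for the sample query:
#   4x + 5 = 6x + 7  =>  5 - 7 = 6x - 4x  =>  -2 = 2x  =>  x = -1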
23 | 24 | query = input(">") 25 | 26 | messages = [ 27 | { 28 | "role": "system", 29 | "content": "You are an helpful chatbot", 30 | }, 31 | {"role": "user", "content": query}, 32 | ] 33 | 34 | text = tokenizer.apply_chat_template( 35 | messages, tokenize=False, add_generation_prompt=True 36 | ) 37 | 38 | model_inputs = tokenizer([text], return_tensors="pt").to(model.device) 39 | 40 | # Measure the start time 41 | start_time = time.time() 42 | 43 | generated_ids = model.generate( 44 | **model_inputs, 45 | max_new_tokens=512, 46 | do_sample=True, 47 | temperature=0.01, 48 | streamer=streamer, 49 | ) 50 | 51 | generated_ids = [ 52 | output_ids[len(input_ids) :] 53 | for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) 54 | ] 55 | 56 | # Calculate the total number of generated tokens 57 | num_tokens_generated = sum(len(tokens) for tokens in generated_ids) 58 | 59 | response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] 60 | 61 | # Measure the end time 62 | end_time = time.time() 63 | 64 | # Calculate the number of tokens generated 65 | num_tokens_generated = sum(len(tokens) for tokens in generated_ids) 66 | 67 | # Calculate the tokens per second 68 | time_taken = end_time - start_time 69 | print("Total generated tokens:", num_tokens_generated) 70 | print("Total Time taken:", time_taken) 71 | 72 | tokens_per_second = num_tokens_generated / time_taken 73 | 74 | # Print the tokens per second 75 | print(f"Tokens per second: {tokens_per_second:.2f}") 76 | -------------------------------------------------------------------------------- /examples/t5.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers import AutoTokenizer, TextStreamer 7 | from intel_npu_acceleration_library import NPUModelForSeq2SeqLM 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | 10 | model_id = "google/flan-t5-small" 11 | 12 | compiler_conf = CompilerConfig() 13 | model = NPUModelForSeq2SeqLM.from_pretrained( 14 | model_id, use_cache=True, config=compiler_conf 15 | ).eval() 16 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) 17 | tokenizer.pad_token_id = tokenizer.eos_token_id 18 | streamer = TextStreamer(tokenizer, skip_special_tokens=True) 19 | 20 | query = input("Ask something: ") 21 | prefix = tokenizer(query, return_tensors="pt")["input_ids"] 22 | 23 | 24 | generation_kwargs = dict( 25 | input_ids=prefix, 26 | streamer=streamer, 27 | do_sample=True, 28 | top_k=50, 29 | top_p=0.9, 30 | max_new_tokens=512, 31 | ) 32 | 33 | print("Run inference") 34 | _ = model.generate(**generation_kwargs) 35 | -------------------------------------------------------------------------------- /examples/tiny_llama_chat.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers import pipeline, TextStreamer, set_seed 7 | from intel_npu_acceleration_library.compiler import CompilerConfig 8 | import intel_npu_acceleration_library 9 | import torch 10 | import os 11 | 12 | model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 13 | 14 | print("Loading the model...") 15 | pipe = pipeline( 16 | "text-generation", model=model_id, torch_dtype=torch.bfloat16, device_map="auto" 17 | ) 18 | print("Compiling the model for NPU...") 19 | compiler_conf = 
CompilerConfig(dtype=torch.int8) 20 | pipe.model = intel_npu_acceleration_library.compile(pipe.model, compiler_conf) 21 | 22 | streamer = TextStreamer(pipe.tokenizer, skip_special_tokens=True, skip_prompt=True) 23 | 24 | set_seed(42) 25 | 26 | 27 | messages = [ 28 | { 29 | "role": "system", 30 | "content": "You are a friendly chatbot. You can ask me anything.", 31 | }, 32 | ] 33 | 34 | print("NPU Chatbot is ready! Please ask a question. Type 'exit' to quit.") 35 | while True: 36 | query = input("User: ") 37 | if query.lower() == "exit": 38 | break 39 | messages.append({"role": "user", "content": query}) 40 | 41 | prompt = pipe.tokenizer.apply_chat_template( 42 | messages, tokenize=False, add_generation_prompt=True 43 | ) 44 | print("Assistant: ", end="", flush=True) 45 | out = pipe( 46 | prompt, 47 | max_new_tokens=512, 48 | do_sample=True, 49 | temperature=0.7, 50 | top_k=50, 51 | top_p=0.95, 52 | streamer=streamer, 53 | ) 54 | 55 | reply = out[0]["generated_text"].split("<|assistant|>")[-1].strip() 56 | messages.append({"role": "assistant", "content": reply}) 57 | -------------------------------------------------------------------------------- /examples/train_mnist.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | 7 | import torch 8 | from torch import nn 9 | import intel_npu_acceleration_library 10 | from intel_npu_acceleration_library.compiler import CompilerConfig 11 | from torch.utils.data import DataLoader 12 | from torchvision import datasets 13 | from torchvision.transforms import ToTensor 14 | import numpy as np 15 | 16 | # Set random seeds 17 | np.random.seed(0) 18 | torch.manual_seed(0) 19 | 20 | training_data = datasets.FashionMNIST( 21 | root="data", train=True, download=True, transform=ToTensor() 22 | ) 23 | 24 | test_data = datasets.FashionMNIST( 25 | root="data", train=False, download=True, transform=ToTensor() 26 | ) 27 | 28 | train_dataloader = DataLoader(training_data, batch_size=64) 29 | test_dataloader = DataLoader(test_data, batch_size=64) 30 | 31 | 32 | class NeuralNetwork(nn.Module): 33 | def __init__(self): 34 | super().__init__() 35 | self.flatten = nn.Flatten() 36 | self.linear_relu_stack = nn.Sequential( 37 | nn.Linear(28 * 28, 512), 38 | nn.ReLU(), 39 | nn.Linear(512, 512), 40 | nn.ReLU(), 41 | nn.Linear(512, 10), 42 | ) 43 | 44 | def forward(self, x): 45 | x = self.flatten(x) 46 | logits = self.linear_relu_stack(x) 47 | return logits 48 | 49 | 50 | def train_loop(dataloader, model, loss_fn, optimizer): 51 | size = len(dataloader.dataset) 52 | # Set the model to training mode - important for batch normalization and dropout layers 53 | # Unnecessary in this situation but added for best practices 54 | model.train() 55 | for batch, (X, y) in enumerate(dataloader): 56 | # Compute prediction and loss 57 | pred = model(X) 58 | loss = loss_fn(pred, y) 59 | 60 | # Backpropagation 61 | loss.backward() 62 | optimizer.step() 63 | optimizer.zero_grad() 64 | 65 | if batch % 100 == 0: 66 | loss, current = loss.item(), (batch + 1) * len(X) 67 | print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]") 68 | 69 | 70 | def test_loop(dataloader, model, loss_fn): 71 | # Set the model to evaluation mode - important for batch normalization and dropout layers 72 | # Unnecessary in this situation but added for best practices 73 | model.eval() 74 | size = len(dataloader.dataset) 75 | num_batches = len(dataloader) 76 | test_loss, correct = 0, 0 77 | 78 | 
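    # correct accumulates how many predictions match the labels; dividing by the
    # dataset size after the loop turns it into an accuracy figure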
# Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode 79 | # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True 80 | with torch.no_grad(): 81 | for X, y in dataloader: 82 | pred = model(X) 83 | test_loss += loss_fn(pred, y).item() 84 | correct += (pred.argmax(1) == y).type(torch.float).sum().item() 85 | 86 | test_loss /= num_batches 87 | correct /= size 88 | print( 89 | f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n" 90 | ) 91 | 92 | 93 | model = NeuralNetwork() 94 | compiler_conf = CompilerConfig(dtype=torch.float32, training=True) 95 | model = intel_npu_acceleration_library.compile(model, compiler_conf) 96 | 97 | learning_rate = 1e-3 98 | batch_size = 64 99 | 100 | # Initialize the loss function 101 | loss_fn = nn.CrossEntropyLoss() 102 | 103 | optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) 104 | 105 | epochs = 10 106 | for t in range(epochs): 107 | print(f"Epoch {t+1}\n-------------------------------") 108 | train_loop(train_dataloader, model, loss_fn, optimizer) 109 | test_loop(test_dataloader, model, loss_fn) 110 | print("Done!") 111 | -------------------------------------------------------------------------------- /include/intel_npu_acceleration_library/common.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright © 2024 Intel Corporation 3 | // SPDX-License-Identifier: Apache 2.0 4 | // 5 | 6 | #pragma once 7 | #include "openvino/openvino.hpp" 8 | #include "openvino/opsets/opset1.hpp" 9 | #include "openvino/opsets/opset13.hpp" 10 | #include "openvino/opsets/opset4.hpp" 11 | #include "openvino/opsets/opset5.hpp" 12 | #include "openvino/opsets/opset6.hpp" 13 | #include "openvino/opsets/opset7.hpp" 14 | #include "openvino/opsets/opset8.hpp" 15 | #include "openvino/opsets/opset9.hpp" 16 | #include "openvino/runtime/intel_npu/properties.hpp" 17 | 18 | #if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) 19 | #define intel_npu_acceleration_library_DLL_API __attribute__((visibility("default"))) 20 | #elif defined(_MSC_VER) 21 | #define intel_npu_acceleration_library_DLL_API __declspec(dllexport) 22 | #endif 23 | 24 | namespace intel_npu_acceleration_library { 25 | 26 | static constexpr ov::Property npu_compiler_type{"NPU_COMPILER_TYPE"}; 27 | static constexpr ov::Property npu_parameters{"NPU_COMPILATION_MODE_PARAMS"}; 28 | 29 | /** 30 | * @brief Return true if the NPU is available on the system, otherwise return false 31 | * 32 | * @param core ov::Cor object 33 | * @return true NPU AI accelerator is available 34 | * @return false NPU AI accelerator is not available 35 | */ 36 | bool _isNPUAvailable(ov::Core& core) { 37 | std::vector availableDevices = core.get_available_devices(); 38 | return std::find(availableDevices.begin(), availableDevices.end(), "NPU") != availableDevices.end(); 39 | } 40 | 41 | uint32_t driver_version(ov::Core& core) { 42 | return static_cast(core.get_property("NPU", ov::intel_npu::driver_version)); 43 | } 44 | 45 | ov::element::Type_t dtype_from_string(const std::string& dtype) { 46 | if (dtype == "int8" || dtype == "i8") { 47 | return ov::element::Type_t::i8; 48 | } else if (dtype == "int4" || dtype == "i4") { 49 | return ov::element::Type_t::i4; 50 | } else if (dtype == "int16" || dtype == "i16") { 51 | return ov::element::Type_t::i16; 52 | } else if (dtype == "int32" || dtype == "i32") { 53 | return ov::element::Type_t::i32; 54 | } else if 
(dtype == "int64" || dtype == "i64") { 55 | return ov::element::Type_t::i64; 56 | } 57 | if (dtype == "float16" || dtype == "half" || dtype == "f16") { 58 | return ov::element::Type_t::f16; 59 | } 60 | if (dtype == "float32" || dtype == "f32") { 61 | return ov::element::Type_t::f32; 62 | } 63 | if (dtype == "float64" || dtype == "f64") { 64 | return ov::element::Type_t::f64; 65 | } 66 | if (dtype == "bfloat16" || dtype == "bf16") { 67 | return ov::element::Type_t::bf16; 68 | } else { 69 | throw std::invalid_argument("Unsupported datatype: " + dtype); 70 | } 71 | } 72 | 73 | } // namespace intel_npu_acceleration_library 74 | 75 | // Define half pointer as uint16_t pointer datatype 76 | #define half_ptr uint16_t* -------------------------------------------------------------------------------- /include/intel_npu_acceleration_library/conversion.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright © 2024 Intel Corporation 3 | // SPDX-License-Identifier: Apache 2.0 4 | // 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "intel_npu_acceleration_library/common.h" 13 | 14 | namespace intel_npu_acceleration_library { 15 | 16 | /** 17 | * @brief Compress a int8 vector to I4 format. 18 | * 19 | * @param src pointer to the source int8 buffer 20 | * @param dst pointer to the destination uint8 buffer 21 | * @param size size of the src and dst buffers 22 | */ 23 | void compressToI4(const int8_t* src, uint8_t* dst, size_t size) { 24 | for (size_t i = 0; i < size / 2; i++) { 25 | dst[i] = (src[2 * i] & 0x0F) | ((src[2 * i + 1] & 0x0F) << 4); 26 | } 27 | } 28 | 29 | /** 30 | * @brief Convert a int8 vector to fp16 given a scalar scale. 31 | * 32 | * @param src pointer to the source int8 buffer 33 | * @param scale Float scale 34 | * @param dst pointer to the destination float16 buffer 35 | * @param size size of the src and dst buffers 36 | */ 37 | void vector_to_fp16(const int8_t* src, float scale, half_ptr dst, size_t size) { 38 | constexpr size_t VEC_SIZE = 8; // Use AVX2: process 8 values per loop iteration for 32-bit floats 39 | __m256 scale_vec = _mm256_set1_ps(scale); // Broadcast scale 40 | 41 | for (size_t idx = 0; idx < size; idx += VEC_SIZE) { 42 | // Load int8_t and extend to int32_t for conversion 43 | __m128i input_8 = _mm_loadl_epi64((__m128i const*)(src + idx)); // Load 8 int8_t values 44 | __m256i input_32 = _mm256_cvtepi8_epi32(input_8); // Extend to 32-bit integers 45 | 46 | // Convert integers to float and apply scaling 47 | __m256 float_vec = _mm256_mul_ps(_mm256_cvtepi32_ps(input_32), scale_vec); 48 | 49 | // Convert float to fp16 50 | __m128i fp16_vec = _mm256_cvtps_ph(float_vec, _MM_FROUND_TO_NEAREST_INT); 51 | 52 | // Store the result 53 | _mm_store_si128((__m128i*)(dst + idx), fp16_vec); 54 | } 55 | } 56 | 57 | /** 58 | * @brief Convert a int8 array to fp16 given a per output channel scale vector. 
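 * Per element this computes output[c][i] = fp16(scale[c] * (float)input[c][i]), i.e. a
 * per-output-channel dequantization of the int8 weights into half precision.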
59 | * 60 | * @param input pointer to the source int8 buffer of shape [output_channels, input_channels] 61 | * @param scale pointer of a float scale vector of shape [output_channels] 62 | * @param output dst pointer to the destination float16 buffer of shape [output_channels, input_channels] 63 | * @param input_channels number of input channels 64 | * @param output_channels number of output channels 65 | */ 66 | void array_to_fp16_worker(const int8_t* input, float* scale, half_ptr output, size_t input_channels, 67 | size_t output_channels) { 68 | for (size_t idx = 0; idx < output_channels; idx++) { 69 | vector_to_fp16(input + idx * input_channels, scale[idx], output + idx * input_channels, input_channels); 70 | } 71 | } 72 | 73 | /** 74 | * @brief Convert a int8 array to fp16 given a per output channel scale vector. 75 | * 76 | * @param input pointer to the source int8 buffer of shape [output_channels, input_channels] 77 | * @param scale pointer of a float scale vector of shape [output_channels] 78 | * @param output dst pointer to the destination float16 buffer of shape [output_channels, input_channels] 79 | * @param input_channels number of input channels 80 | * @param output_channels number of output channels 81 | * @param num_threads number of parallel threads to use 82 | */ 83 | void to_fp16(const int8_t* input, float* scale, half_ptr output, size_t input_channels, size_t output_channels, 84 | unsigned int num_threads) { 85 | std::vector threads; 86 | 87 | // Calculate chunk size per thread 88 | size_t channels_per_thread = (output_channels + num_threads - 1) / num_threads; // Ceiling division 89 | 90 | for (unsigned int i = 0; i < num_threads; ++i) { 91 | size_t start_channel = i * channels_per_thread; 92 | size_t end_channel = std::min((i + 1) * channels_per_thread, output_channels); 93 | 94 | if (start_channel < output_channels) { 95 | threads.emplace_back(array_to_fp16_worker, input + start_channel * input_channels, scale + start_channel, 96 | output + start_channel * input_channels, input_channels, end_channel - start_channel); 97 | } 98 | } 99 | 100 | // Join threads 101 | for (auto& t : threads) { 102 | if (t.joinable()) { 103 | t.join(); 104 | } 105 | } 106 | } 107 | } // namespace intel_npu_acceleration_library 108 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from .compiler import compile 7 | from .dtypes import int4, int8, float16 8 | from ._version import __version__ 9 | from .modelling import NPUModel, NPUAutoModel, NPUModelForCausalLM, NPUModelForSeq2SeqLM 10 | from .device import enable_npu_device 11 | 12 | enable_npu_device() 13 | 14 | __all__ = [ 15 | "compile", 16 | "int4", 17 | "int8", 18 | "float16", 19 | "__version__", 20 | "NPUModel", 21 | "NPUAutoModel", 22 | "NPUModelForCausalLM", 23 | "NPUModelForSeq2SeqLM", 24 | ] 25 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/_version.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | __version__ = "v1.4.0" 7 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/__init__.py: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | from .bindings import lib 6 | from .utils import npu_available, get_driver_version, check_npu_and_driver_version 7 | from .mlp import MLP 8 | from .convolution import Convolution 9 | from .matmul import MatMul 10 | from .linear import Linear 11 | from .qmatmul import QMatMul 12 | from .qlinear import QLinear 13 | from .tensor import Tensor 14 | from .factory import NNFactory 15 | from .sdpa import SDPA, SimpleSDPA 16 | from .runtime import run_matmul, run_factory, clear_cache 17 | 18 | check_npu_and_driver_version() 19 | 20 | __all__ = [ 21 | "Tensor", 22 | "NNFactory", 23 | "MLP", 24 | "MatMul", 25 | "Linear", 26 | "QMatMul", 27 | "QLinear", 28 | "Convolution", 29 | "SDPA", 30 | "SimpleSDPA", 31 | "run_matmul", 32 | "run_factory", 33 | "clear_cache", 34 | "npu_available", 35 | "get_driver_version", 36 | "lib", 37 | ] 38 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/compression.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend.bindings import lib as backend_lib 7 | import numpy as np 8 | 9 | 10 | def compress_to_i4(weights: np.ndarray) -> np.ndarray: 11 | """Compress a int8 array to int4. 12 | 13 | Args: 14 | weights (np.ndarray): input array 15 | 16 | Returns: 17 | np.ndarray: compressed array 18 | """ 19 | compressed_weights = np.zeros( 20 | (weights.shape[0], weights.shape[1] // 2), dtype=np.uint8 21 | ) 22 | 23 | backend_lib.compressToI4(weights, compressed_weights, np.prod(weights.shape)) 24 | return compressed_weights 25 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/convolution.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend.factory import NNFactory 7 | from typing import Sequence, Union 8 | import numpy as np 9 | 10 | 11 | class Convolution(NNFactory): 12 | """Linear class, computing a matrix matrix multiplication with weights prefetching.""" 13 | 14 | def __init__( 15 | self, 16 | input_shape: Sequence[int], 17 | weights_shape: Sequence[int], 18 | bias: bool = False, 19 | strides: Union[int, Sequence[int]] = 1, 20 | padding: Union[int, Sequence[int]] = 0, 21 | dilation: Union[int, Sequence[int]] = 1, 22 | groups: int = 1, 23 | profile: bool = False, 24 | device: str = "NPU", 25 | ): 26 | """Initialize the Linear class. 27 | 28 | Args: 29 | input_shape (Sequence[int]): input shape 30 | weights_shape (Sequence[int]): weights shape 31 | bias (bool): Enable/Disable bias. Defaults to False. 32 | strides (Union[int, Sequence[int]], optional): Strides. Defaults to 1. 33 | padding (Union[int, Sequence[int]], optional): Padding. Defaults to 0. 34 | dilation (Union[int, Sequence[int]], optional): Dilation. Defaults to 1. 35 | groups (int, optional): Groups. Defaults to 1. 36 | profile (Optional[bool], optional): Enable/Disable profiling. Defaults to False. 37 | device (str): Target device, default to "NPU". 
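
        Example (illustrative only; an NCHW input and OIHW weight layout are assumed):
            a 3x3 convolution taking 3 input channels to 16 output channels on a
            1x3x32x32 input can be built with
            ``Convolution(input_shape=(1, 3, 32, 32), weights_shape=(16, 3, 3, 3), bias=False)``.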
38 | """ 39 | super().__init__(profile, device) 40 | input = self.parameter(input_shape) 41 | weights = self.parameter(weights_shape) 42 | if bias is not None: 43 | bias_node = self.parameter((1, weights_shape[0], 1, 1)) 44 | else: 45 | bias_node = None 46 | 47 | _ = self.convolution( 48 | input, 49 | weights, 50 | bias=bias_node, 51 | strides=strides, 52 | padding=padding, 53 | dilation=dilation, 54 | groups=groups, 55 | act_dtype=np.float16, 56 | ) 57 | 58 | self.compile() 59 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/linear.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend.factory import NNFactory 7 | import numpy as np 8 | 9 | 10 | class Linear(NNFactory): 11 | """Linear class, computing a matrix matrix multiplication with weights prefetching.""" 12 | 13 | def __init__( 14 | self, 15 | inC: int, 16 | outC: int, 17 | batch: int, 18 | profile: bool = False, 19 | device: str = "NPU", 20 | ): 21 | """Initialize the Linear class. 22 | 23 | Args: 24 | inC (int): input channels 25 | outC (int): output channels 26 | batch (int): batch 27 | profile (bool): Enable/Disable profiling. Defaults to False. 28 | device (str): Target device, default to "NPU". 29 | """ 30 | super().__init__(profile, device) 31 | self.inC, self.outC = inC, outC 32 | self.batch = batch 33 | input = self.parameter((self.batch, self.inC)) 34 | _ = self.linear(input, outC, inC, bias=False) 35 | self.compile() 36 | 37 | def run(self, X: np.ndarray, W: np.ndarray, op_id: str) -> np.ndarray: 38 | """Run the layer: X * W^T. 39 | 40 | Args: 41 | X (np.ndarray): lhs operator 42 | W (np.ndarray): rhs operator 43 | op_id (str): operation id 44 | 45 | Raises: 46 | RuntimeError: Input or weight tensor shape mismatch 47 | 48 | Returns: 49 | np.ndarray: result 50 | """ 51 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 52 | raise RuntimeError( 53 | f"Input shape {X.shape} different from expected one {(self.batch, self.inC)}" 54 | ) 55 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 56 | raise RuntimeError( 57 | f"Weight shape {W.shape} different from expected one {(self.outC, self.inC)}" 58 | ) 59 | 60 | return super().run(X, W, op_id=op_id) 61 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/matmul.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend.factory import NNFactory 7 | import numpy as np 8 | 9 | 10 | class MatMul(NNFactory): 11 | """MatMul class, computing a matrix matrix multiplication.""" 12 | 13 | def __init__( 14 | self, 15 | inC: int, 16 | outC: int, 17 | batch: int, 18 | profile: bool = False, 19 | device: str = "NPU", 20 | ): 21 | """Initialize the MatMul class. 22 | 23 | Args: 24 | inC (int): input channels 25 | outC (int): output channels 26 | batch (int): batch 27 | profile (bool): Enable/Disable profiling. Defaults to False. 28 | device (str): Target device, default to "NPU". 
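
        Example (illustrative sizes, mirroring examples/matmul.py):
            ``mm = MatMul(inC=128, outC=128, batch=32)`` then ``mm.run(X1, X2)`` with
            ``X1`` of shape (32, 128) and ``X2`` of shape (128, 128), both ``np.float16``.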
29 | """ 30 | super().__init__(profile, device) 31 | self.inC, self.outC = inC, outC 32 | self.batch = batch 33 | input = self.parameter((self.batch, self.inC)) 34 | _ = self.linear(input, outC, inC, bias=False) 35 | self.compile() 36 | 37 | def run(self, X: np.ndarray, W: np.ndarray) -> np.ndarray: 38 | """Run the layer: X * W^T. 39 | 40 | Args: 41 | X (np.ndarray): lhs operator 42 | W (np.ndarray): rhs operator 43 | 44 | Raises: 45 | RuntimeError: Input or weight tensor shape mismatch 46 | 47 | Returns: 48 | np.ndarray: result 49 | """ 50 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 51 | raise RuntimeError( 52 | f"Input shape {X.shape} different from expected one {(self.batch, self.inC)}" 53 | ) 54 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 55 | raise RuntimeError( 56 | f"Weight shape {W.shape} different from expected one {(self.outC, self.inC)}" 57 | ) 58 | 59 | return super().run(X, W) 60 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/mlp.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend.factory import NNFactory 7 | from typing import Optional, Sequence 8 | 9 | 10 | class MLP(NNFactory): 11 | """Linear class, computing a matrix matrix multiplication with weights prefetching.""" 12 | 13 | def __init__( 14 | self, 15 | input_shape: Sequence[int], 16 | intermediate_size: int, 17 | activation: str = "swiglu", 18 | bias: Optional[bool] = False, 19 | profile: bool = False, 20 | device: str = "NPU", 21 | **additional_args 22 | ): 23 | """Initialize the Linear class. 24 | 25 | Args: 26 | input_shape (Sequence[int]): input shape channels 27 | intermediate_size (int): intermediate_size 28 | activation (str): activation function to use 29 | bias (Optional[bool], optional): Enable/Disable bias. Defaults to False. 30 | profile (bool): Enable/Disable profiling. Defaults to False. 31 | device (str): Target device, default to "NPU". 
32 | additional_args: additional arguments 33 | """ 34 | super().__init__(profile, device) 35 | self.intermediate_size = intermediate_size 36 | self.batch, self.hidden_size = input_shape 37 | input = self.parameter((self.batch, self.hidden_size)) 38 | 39 | mm1 = self.linear(input, self.intermediate_size, self.hidden_size, bias=bias) 40 | 41 | if activation == "swiglu": 42 | mm2 = self.linear(input, self.intermediate_size, self.hidden_size, bias=bias) # type: ignore[attr-defined] 43 | mm1 = self.eltwise_mul(self.swish(mm1), mm2) # type: ignore[attr-defined] 44 | elif activation == "clamp": 45 | atc_fn = getattr(self, activation) 46 | mm1 = atc_fn(mm1, additional_args.get("min"), additional_args.get("max")) 47 | elif activation == "elu": 48 | atc_fn = getattr(self, activation) 49 | mm1 = atc_fn(mm1, additional_args.get("alpha", 1.0)) 50 | elif activation == "grn": 51 | atc_fn = getattr(self, activation) 52 | mm1 = atc_fn(mm1, additional_args.get("grn_bias")) 53 | else: 54 | atc_fn = getattr(self, activation) 55 | mm1 = atc_fn(mm1) 56 | 57 | _ = self.linear(mm1, self.hidden_size, self.intermediate_size, bias=bias) 58 | self.compile() 59 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/ops.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from dataclasses import dataclass 7 | from functools import lru_cache 8 | from typing import List, Any, Sequence 9 | import ctypes 10 | 11 | 12 | @dataclass(frozen=True) 13 | class SupportedOp: 14 | """A class for supported runtime OPs in the NPU. 15 | 16 | Attrs: 17 | name (str): Operation name 18 | inputs (int): Number of inputs 19 | parameters (Sequence[Any]): Optional parameters type. 20 | """ 21 | 22 | name: str 23 | inputs: int 24 | parameters: Sequence[Any] = () 25 | 26 | 27 | @lru_cache(maxsize=None) 28 | def get_supported_ops() -> List[SupportedOp]: 29 | """Generate a list fo supported operations. 
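
    Each entry describes one operation exposed by the C++ backend: the binding name, the
    number of tensor inputs it takes, and the ctypes types of any extra scalar parameters
    that must be passed alongside the tensors.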
30 | 31 | Returns: 32 | List[SupportedOp]: list fo supported NPU operations 33 | """ 34 | supported_ops = [ 35 | SupportedOp(name="result", inputs=1), 36 | SupportedOp(name="matmul", inputs=2, parameters=[ctypes.c_bool, ctypes.c_bool]), 37 | SupportedOp(name="eltwise_add", inputs=2), 38 | SupportedOp(name="eltwise_mul", inputs=2), 39 | SupportedOp(name="eltwise_div", inputs=2), 40 | SupportedOp(name="abs_act", inputs=1), 41 | SupportedOp(name="acos_act", inputs=1), 42 | SupportedOp(name="asin_act", inputs=1), 43 | SupportedOp(name="atan_act", inputs=1), 44 | SupportedOp(name="ceiling", inputs=1), 45 | SupportedOp( 46 | name="clamp", inputs=1, parameters=[ctypes.c_float, ctypes.c_float] 47 | ), 48 | SupportedOp(name="cos_act", inputs=1), 49 | SupportedOp(name="cosh_act", inputs=1), 50 | SupportedOp(name="erf_act", inputs=1), 51 | SupportedOp(name="elu", inputs=1, parameters=[ctypes.c_float]), 52 | SupportedOp(name="exp_act", inputs=1), 53 | SupportedOp(name="floor_act", inputs=1), 54 | SupportedOp(name="grn", inputs=1, parameters=[ctypes.c_float]), 55 | SupportedOp(name="gelu", inputs=1), 56 | SupportedOp(name="gelu_erf", inputs=1), 57 | SupportedOp(name="log_act", inputs=1), 58 | SupportedOp(name="negative", inputs=1), 59 | SupportedOp(name="relu", inputs=1), 60 | SupportedOp(name="prelu", inputs=2), 61 | SupportedOp(name="sigmoid", inputs=1), 62 | SupportedOp(name="sign", inputs=1), 63 | SupportedOp(name="sin_act", inputs=1), 64 | SupportedOp(name="sinh_act", inputs=1), 65 | SupportedOp(name="sqrt_act", inputs=1), 66 | SupportedOp(name="tan_act", inputs=1), 67 | SupportedOp(name="tanh_act", inputs=1), 68 | SupportedOp(name="acosh_act", inputs=1), 69 | SupportedOp(name="asinh_act", inputs=1), 70 | SupportedOp(name="atanh_act", inputs=1), 71 | SupportedOp(name="hswish", inputs=1), 72 | SupportedOp(name="mish", inputs=1), 73 | SupportedOp(name="softplus", inputs=1), 74 | SupportedOp(name="hsigmoid", inputs=1), 75 | SupportedOp(name="round_act", inputs=1), 76 | SupportedOp(name="softsign", inputs=1), 77 | SupportedOp(name="softmax", inputs=1, parameters=[ctypes.c_int]), 78 | SupportedOp(name="swish", inputs=1), 79 | SupportedOp(name="convert_to_fp16", inputs=1), 80 | SupportedOp( 81 | name="scaled_dot_product_attention", 82 | inputs=4, 83 | parameters=[ctypes.c_bool], 84 | ), 85 | SupportedOp( 86 | name="scaled_dot_product_attention_simple", 87 | inputs=3, 88 | parameters=[ctypes.c_bool], 89 | ), 90 | SupportedOp( 91 | name="normL2", 92 | inputs=2, 93 | parameters=[ctypes.c_float], 94 | ), 95 | SupportedOp( 96 | name="gather", 97 | inputs=3, 98 | parameters=[ctypes.c_int], 99 | ), 100 | SupportedOp(name="reshape", inputs=2), 101 | SupportedOp(name="transpose", inputs=2), 102 | SupportedOp(name="squeeze", inputs=1), 103 | SupportedOp(name="unsqueeze", inputs=2), 104 | SupportedOp( 105 | name="concat", 106 | inputs=2, 107 | parameters=[ctypes.c_int64], 108 | ), 109 | SupportedOp( 110 | name="reduce_max", 111 | inputs=2, 112 | parameters=[ctypes.c_bool], 113 | ), 114 | SupportedOp( 115 | name="reduce_mean", 116 | inputs=2, 117 | parameters=[ctypes.c_bool], 118 | ), 119 | SupportedOp( 120 | name="reduce_min", 121 | inputs=2, 122 | parameters=[ctypes.c_bool], 123 | ), 124 | SupportedOp( 125 | name="reduce_prod", 126 | inputs=2, 127 | parameters=[ctypes.c_bool], 128 | ), 129 | SupportedOp( 130 | name="reduce_sum", 131 | inputs=2, 132 | parameters=[ctypes.c_bool], 133 | ), 134 | SupportedOp(name="adaptive_avg_pool", inputs=2), 135 | SupportedOp(name="adaptive_max_pool", inputs=2), 136 | 
SupportedOp(name="power", inputs=2), 137 | SupportedOp(name="log_softmax", inputs=1, parameters=[ctypes.c_int64]), 138 | ] 139 | return supported_ops 140 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/qlinear.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend.factory import NNFactory 7 | import numpy as np 8 | 9 | 10 | class QLinear(NNFactory): 11 | """Quantized Linear class, computing a matrix matrix multiplication with weights prefetching.""" 12 | 13 | def __init__( 14 | self, 15 | inC: int, 16 | outC: int, 17 | batch: int, 18 | profile: bool = False, 19 | device: str = "NPU", 20 | dtype: np.dtype = np.int8, 21 | ): 22 | """Initialize the QLinear class. 23 | 24 | Args: 25 | inC (int): input channels 26 | outC (int): output channels 27 | batch (int): batch 28 | profile (bool): Enable/Disable profiling. Defaults to False. 29 | device (str): Target device, default to "NPU". 30 | dtype (np.dtype): weights datatype. Defaults to np.int8. 31 | 32 | """ 33 | super().__init__(profile, device) 34 | self.inC, self.outC = inC, outC 35 | self.batch = batch 36 | 37 | input = self.parameter((self.batch, self.inC)) 38 | _ = self.linear(input, outC, inC, bias=False, wt_dtype=dtype) 39 | self.compile() 40 | 41 | def run( 42 | self, X: np.ndarray, W: np.ndarray, scale: np.ndarray, op_id: str 43 | ) -> np.ndarray: 44 | """Run the layer: $X * (W * S)^T$ . 45 | 46 | Args: 47 | X (np.ndarray): activation 48 | W (np.ndarray): quantized weights 49 | scale (np.ndarray): quantization scale 50 | op_id (str): operation id 51 | 52 | Raises: 53 | RuntimeError: Input, weights or scale shape mismatch 54 | 55 | Returns: 56 | np.ndarray: result 57 | """ 58 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 59 | raise RuntimeError( 60 | f"Input shape {X.shape} different from expected one {(self.batch, self.inC)}" 61 | ) 62 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 63 | raise RuntimeError( 64 | f"Weight shape {W.shape} different from expected one {(self.outC, self.inC)}" 65 | ) 66 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 67 | raise RuntimeError( 68 | f"Scale shape {W.shape} different from expected one {(self.outC, 1)}" 69 | ) 70 | 71 | return super().run(X, (W, scale), op_id=op_id) 72 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/qmatmul.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend.factory import NNFactory 7 | import numpy as np 8 | 9 | 10 | class QMatMul(NNFactory): 11 | """Quantized Linear class, computing a matrix matrix multiplication.""" 12 | 13 | def __init__( 14 | self, 15 | inC: int, 16 | outC: int, 17 | batch: int, 18 | profile: bool = False, 19 | device: str = "NPU", 20 | dtype: np.dtype = np.int8, 21 | ): 22 | """Initialize the QMatmul class. 23 | 24 | Args: 25 | inC (int): input channels 26 | outC (int): output channels 27 | batch (int): batch 28 | profile (bool): Enable/Disable profiling. Defaults to False. 29 | device (str): Target device, default to "NPU". 30 | dtype (np.dtype): weights datatype. Defaults to np.int8. 
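
        The weights are expected to be already quantized; at run time they are dequantized
        with a per-output-channel scale, so the kernel effectively computes
        X @ (W * scale).T (see ``run``).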
31 | """ 32 | super().__init__(profile, device) 33 | self.inC, self.outC = inC, outC 34 | self.batch = batch 35 | input = self.parameter((self.batch, self.inC)) 36 | _ = self.linear(input, outC, inC, bias=False, wt_dtype=dtype) 37 | self.compile() 38 | 39 | def run(self, X: np.ndarray, W: np.ndarray, scale: np.ndarray) -> np.ndarray: 40 | """Run the layer: X * (W * S)^T. 41 | 42 | Args: 43 | X (np.ndarray): activation 44 | W (np.ndarray): quantized weights 45 | scale (np.ndarray): quantization scale 46 | 47 | Raises: 48 | RuntimeError: Input, weights or scale shape mismatch 49 | 50 | Returns: 51 | np.ndarray: result 52 | """ 53 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 54 | raise RuntimeError( 55 | f"Input shape {X.shape} different from expected one {(self.batch, self.inC)}" 56 | ) 57 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 58 | raise RuntimeError( 59 | f"Weight shape {W.shape} different from expected one {(self.outC, self.inC)}" 60 | ) 61 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 62 | raise RuntimeError( 63 | f"Scale shape {W.shape} different from expected one {(self.outC, 1)}" 64 | ) 65 | 66 | return super().run(X, (W, scale)) 67 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/sdpa.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | from intel_npu_acceleration_library.backend.factory import NNFactory 6 | from typing import Tuple 7 | import numpy as np 8 | 9 | 10 | class SDPA(NNFactory): 11 | """Implementation of a ScaledDotProductAttention NPU operation.""" 12 | 13 | def __init__( 14 | self, 15 | query_shapes: Tuple[int, int], 16 | key_shapes: Tuple[int, int], 17 | value_shapes: Tuple[int, int], 18 | mask_shapes: Tuple[int, int], 19 | is_causal: bool = False, 20 | profile: bool = False, 21 | device: str = "NPU", 22 | ): 23 | """Initialize the SDPA. 24 | 25 | Args: 26 | query_shapes (Tuple[int, int]): shape of the query tensor 27 | key_shapes (Tuple[int, int]): shape of the key tensor 28 | value_shapes (Tuple[int, int]): shape of the value tensor 29 | mask_shapes (Tuple[int, int]): shape of the mask tensor 30 | is_causal (bool, optional): If the SDPA mask is is_causal or not. Defaults to False. 31 | profile (bool, optional): Enable/Disable profiling. Defaults to False. 32 | device (str, optional): Target device, default to "NPU". 33 | """ 34 | super().__init__(profile, device) 35 | 36 | self.query = self.parameter(query_shapes) 37 | self.key = self.parameter(key_shapes) 38 | self.value = self.parameter(value_shapes) 39 | self.mask = self.parameter(mask_shapes) 40 | 41 | _ = self.scaled_dot_product_attention( # type: ignore[attr-defined] 42 | self.query, self.key, self.value, self.mask, is_causal 43 | ) 44 | self.compile() 45 | 46 | def run( 47 | self, query: np.ndarray, key: np.ndarray, value: np.ndarray, mask: np.ndarray 48 | ) -> np.ndarray: 49 | """Run the scaled dot product attention kernel. 
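
        This computes the standard scaled dot product attention,
        softmax(Q @ K.T / sqrt(d_k) + mask) @ V for an additive mask, with tensor shapes
        fixed at construction time.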
50 | 51 | Args: 52 | query (np.ndarray): sdpa query tensor 53 | key (np.ndarray): sdpa key tensor 54 | value (np.ndarray): sdpa value tensor 55 | mask (np.ndarray): sdpa mask tensor 56 | 57 | Returns: 58 | np.ndarray: result 59 | """ 60 | return super().run(query, key, value, mask) 61 | 62 | 63 | class SimpleSDPA(NNFactory): 64 | """Implementation of a ScaledDotProductAttention NPU operation.""" 65 | 66 | def __init__( 67 | self, 68 | query_shapes: Tuple[int, int], 69 | key_shapes: Tuple[int, int], 70 | value_shapes: Tuple[int, int], 71 | is_causal: bool = False, 72 | profile: bool = False, 73 | device: str = "NPU", 74 | ): 75 | """Initialize the SDPA. 76 | 77 | Args: 78 | query_shapes (Tuple[int, int]): shape of the query tensor 79 | key_shapes (Tuple[int, int]): shape of the key tensor 80 | value_shapes (Tuple[int, int]): shape of the value tensor 81 | is_causal (bool, optional): If the SDPA mask is is_causal or not. Defaults to False. 82 | profile (bool, optional): Enable/Disable profiling. Defaults to False. 83 | device (str, optional): Target device, default to "NPU". 84 | """ 85 | super().__init__(profile, device) 86 | 87 | self.query = self.parameter(query_shapes) 88 | self.key = self.parameter(key_shapes) 89 | self.value = self.parameter(value_shapes) 90 | 91 | _ = self.scaled_dot_product_attention_simple( # type: ignore[attr-defined] 92 | self.query, self.key, self.value, is_causal 93 | ) 94 | self.compile() 95 | 96 | def run(self, query: np.ndarray, key: np.ndarray, value: np.ndarray) -> np.ndarray: 97 | """Run the scaled dot product attention kernel. 98 | 99 | Args: 100 | query (np.ndarray): sdpa query tensor 101 | key (np.ndarray): sdpa key tensor 102 | value (np.ndarray): sdpa value tensor 103 | 104 | Returns: 105 | np.ndarray: result 106 | """ 107 | return super().run(query, key, value) 108 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from functools import lru_cache 7 | from .bindings import lib 8 | import warnings 9 | import sys 10 | 11 | __min_npu_driver_version__ = 2408 12 | 13 | 14 | @lru_cache 15 | def npu_available() -> bool: 16 | """Return if the NPU is available. 17 | 18 | Returns: 19 | bool: Return True if the NPU is available in the system 20 | """ 21 | return lib.isNPUAvailable() 22 | 23 | 24 | def get_driver_installation_url() -> str: 25 | """Get the driver installation URL. 26 | 27 | Returns: 28 | std: Return the driver installation url 29 | """ 30 | if sys.platform == "win32": 31 | return "Driver Update URL: https://www.intel.com/content/www/us/en/download/794734/intel-npu-driver-windows.html" 32 | elif sys.platform == "linux": 33 | return "Driver Update URL: https://github.com/intel/linux-npu-driver" 34 | else: 35 | return "" 36 | 37 | 38 | @lru_cache 39 | def get_driver_version() -> int: 40 | """Get the driver version for the Intel® NPU Acceleration Library. 41 | 42 | Raises: 43 | RuntimeError: an error is raised if the platform is not supported. 
Currently supported platforms are Windows and Linux 44 | 45 | Returns: 46 | int: NPU driver version 47 | """ 48 | if not npu_available(): 49 | raise RuntimeError("NPU is not available on this system") 50 | 51 | return lib.getNPUDriverVersion() 52 | 53 | 54 | def check_npu_and_driver_version(): 55 | """Check NPU and driver version.""" 56 | if not npu_available(): 57 | warnings.warn( 58 | "NPU is not available in your system. Library will fallback to AUTO device selection mode", 59 | stacklevel=2, 60 | ) 61 | elif get_driver_version() < __min_npu_driver_version__: 62 | 63 | warnings.warn( 64 | f"\nWarning: Outdated Driver Detected!!!\n" 65 | f"Current Driver Version: {get_driver_version()}, Minimum Required Version: {__min_npu_driver_version__}\n" 66 | f"Using an outdated driver may result in reduced performance and unexpected errors and crashes" 67 | f"To avoid these issues, please update your driver to the latest version.\n" 68 | f"{get_driver_installation_url()}\n", 69 | stacklevel=2, 70 | ) 71 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/dtypes.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from dataclasses import dataclass 7 | from typing import Union 8 | import numpy as np 9 | import torch 10 | 11 | 12 | @dataclass(frozen=True) 13 | class NPUDtype: 14 | """Represents a custom data type for NPUs (Neural Processing Units). 15 | 16 | Attrs: 17 | name: str: The name of the data type. 18 | bits: int: The number of bits used to represent the data type. 19 | min: int: The minimum value that can be represented by the data type. 20 | max: int: The maximum value that can be represented by the data type. 21 | torch_dtype: torch.dtype: The corresponding torch data type. 22 | is_floating_point: bool: True if the data type is floating-point, False otherwise. 23 | """ 24 | 25 | name: str 26 | bits: int 27 | min: int 28 | max: int 29 | torch_dtype: torch.dtype 30 | 31 | @property 32 | def is_floating_point(self) -> bool: 33 | """ 34 | Check if the data type is a floating-point type. 35 | 36 | Returns: 37 | bool: True if the data type is floating-point, False otherwise. 38 | """ 39 | return self.torch_dtype.is_floating_point 40 | 41 | def __eq__(self, value: Union["NPUDtype", torch.dtype]) -> bool: 42 | """ 43 | Compare the NPUDtype object with another NPUDtype or torch.dtype object. 44 | 45 | Args: 46 | value (Union["NPUDtype", torch.dtype]): The object to compare with. 47 | 48 | Returns: 49 | bool: True if the objects are equal, False otherwise. 50 | """ 51 | if isinstance(value, torch.dtype): 52 | if value.is_floating_point: 53 | info = torch.finfo(value) 54 | else: 55 | info = torch.iinfo(value) 56 | return ( 57 | self.bits == info.bits 58 | and self.max == info.max 59 | and self.min == info.min 60 | and self.torch_dtype == value 61 | ) 62 | if isinstance(value, type): 63 | value = np.dtype(value) 64 | if value.kind == "f": 65 | info = np.finfo(value) 66 | else: 67 | info = np.iinfo(value) 68 | return ( 69 | self.bits == info.bits and self.max == info.max and self.min == info.min 70 | ) 71 | else: 72 | return super().__eq__(value) 73 | 74 | def __repr__(self) -> str: 75 | """ 76 | Return a string representation of the NPUDtype object. 77 | 78 | Returns: 79 | str: The string representation of the NPUDtype object. 
80 | """ 81 | return self.name 82 | 83 | 84 | float16 = NPUDtype( 85 | "fp16", 86 | 16, 87 | torch.finfo(torch.float16).min, 88 | torch.finfo(torch.float16).max, 89 | torch.float16, 90 | ) 91 | bfloat16 = NPUDtype( 92 | "bf16", 93 | 16, 94 | torch.finfo(torch.bfloat16).min, 95 | torch.finfo(torch.bfloat16).max, 96 | torch.bfloat16, 97 | ) 98 | float32 = NPUDtype( 99 | "fp32", 100 | 32, 101 | torch.finfo(torch.float32).min, 102 | torch.finfo(torch.float32).max, 103 | torch.float32, 104 | ) 105 | float64 = NPUDtype( 106 | "fp64", 107 | 64, 108 | torch.finfo(torch.float64).min, 109 | torch.finfo(torch.float64).max, 110 | torch.float64, 111 | ) 112 | int4 = NPUDtype("int4", 4, -8, 7, torch.int8) 113 | int8 = NPUDtype("int8", 8, -128, 127, torch.int8) 114 | int16 = NPUDtype( 115 | "int16", 16, torch.iinfo(torch.int16).min, torch.iinfo(torch.int16).max, torch.int16 116 | ) 117 | int32 = NPUDtype( 118 | "int32", 32, torch.iinfo(torch.int32).min, torch.iinfo(torch.int32).max, torch.int32 119 | ) 120 | int64 = NPUDtype( 121 | "int64", 64, torch.iinfo(torch.int64).min, torch.iinfo(torch.int64).max, torch.int64 122 | ) 123 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/functional/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from .scaled_dot_product_attention import scaled_dot_product_attention 7 | 8 | __all__ = ["scaled_dot_product_attention"] 9 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/functional/scaled_dot_product_attention.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | from intel_npu_acceleration_library.backend import run_factory, SDPA, SimpleSDPA 6 | from typing import Optional 7 | from functools import partial 8 | import torch 9 | 10 | 11 | def scaled_dot_product_attention( 12 | query: torch.Tensor, 13 | key: torch.Tensor, 14 | value: torch.Tensor, 15 | attn_mask: torch.Tensor = None, 16 | dropout_p: float = 0.0, 17 | is_causal: bool = False, 18 | scale: Optional[float] = None, 19 | ) -> torch.Tensor: 20 | """Execute SDPA kernel. 21 | 22 | Args: 23 | query (torch.Tensor): query tensor 24 | key (torch.Tensor): key tensor 25 | value (torch.Tensor): value tensor 26 | attn_mask (torch.Tensor, optional): attention mask tensor. Defaults to None. 27 | dropout_p (float, optional): optional dropout. Defaults to 0.0. 28 | is_causal (bool, optional): enable causal mask. Defaults to False. 29 | scale (Optional[float], optional): custom scale. Defaults to None. 
30 | 
31 |     Raises:
32 |         RuntimeError: dropout_p != 0 or a custom scale is not supported yet
33 | 
34 |     Returns:
35 |         torch.Tensor: attention output tensor
36 |     """
37 |     if dropout_p != 0:
38 |         raise RuntimeError("dropout_p != 0 is not supported yet")
39 |     if scale is not None:
40 |         raise RuntimeError("custom scale is not supported yet")
41 | 
42 |     if attn_mask is None:
43 |         backend_cls = partial(SimpleSDPA, is_causal=is_causal)  # type: ignore
44 |         return run_factory([query, key, value], [], backend_cls)
45 |     else:
46 |         backend_cls = partial(SDPA, is_causal=is_causal)  # type: ignore
47 |         return run_factory([query, key, value, attn_mask], [], backend_cls)
48 | 
--------------------------------------------------------------------------------
/intel_npu_acceleration_library/modelling.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright © 2024 Intel Corporation
3 | # SPDX-License-Identifier: Apache 2.0
4 | #
5 | from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM
6 | import intel_npu_acceleration_library as npu_lib
7 | from intel_npu_acceleration_library.compiler import CompilerConfig
8 | from functools import partialmethod
9 | from typing import Type, Any, Tuple, Optional
10 | import hashlib
11 | import torch
12 | import os
13 | 
14 | 
15 | def get_cache_dir() -> str:
16 |     """Get the model cache directory.
17 | 
18 |     Returns:
19 |         str: path to the cache directory
20 |     """
21 |     return os.path.join("cache", "models")
22 | 
23 | 
24 | def get_mangled_model_name(model_name: str, *args: Any, **kwargs: Any) -> str:
25 |     """Mangle the model name with all the parameters.
26 | 
27 |     Args:
28 |         model_name (str): model name or path
29 |         args (Any): positional arguments
30 |         kwargs (Any): keyword arguments
31 | 
32 |     Returns:
33 |         str: mangled name
34 |     """
35 |     # append all input parameters and create a string
36 |     arguments_str = f"{[str(arg) for arg in args] + [f'{str(key)}_{str(arg)}' for key, arg in kwargs.items()]}"
37 |     arguments_str_hash = hashlib.sha256(arguments_str.encode("utf-8")).hexdigest()
38 |     mangled_model_name = f"{model_name}_{arguments_str_hash}_{npu_lib.__version__}"
39 |     return mangled_model_name.replace("\\", "_").replace("/", "_")
40 | 
41 | 
42 | def get_model_path(model_name: str, *args: Any, **kwargs: Any) -> Tuple[str, str]:
43 |     """Get the model path.
44 | 
45 |     Args:
46 |         model_name (str): model name or path
47 |         args (Any): positional arguments
48 |         kwargs (Any): keyword arguments
49 | 
50 |     Returns:
51 |         Tuple[str, str]: model directory and full path
52 |     """
53 |     cache_dir = get_cache_dir()
54 |     mangled_model_name = get_mangled_model_name(model_name, *args, **kwargs)
55 |     model_dir_path = os.path.join(cache_dir, mangled_model_name)
56 |     model_path = os.path.join(model_dir_path, "pytorch_npu_model.pt")
57 |     return model_dir_path, model_path
58 | 
59 | 
60 | class NPUModel:
61 |     """Base NPU model class."""
62 | 
63 |     @staticmethod
64 |     def from_pretrained(
65 |         model_name_or_path: str,
66 |         config: CompilerConfig,
67 |         transformers_class: Optional[Type] = None,
68 |         export=True,
69 |         *args: Any,
70 |         **kwargs: Any,
71 |     ) -> torch.nn.Module:
72 |         """Template for the `from_pretrained` static method.
73 | 
74 |         Args:
75 |             model_name_or_path (str): model name or path
76 |             config (CompilerConfig): compiler configuration
77 |             transformers_class (Optional[Type], optional): base class to use. Must have a `from_pretrained` method. Defaults to None.
78 |             export (bool, optional): enable the caching of the model. Defaults to True.
79 |             args (Any): positional arguments
80 |             kwargs (Any): keyword arguments
81 | 
82 |         Raises:
83 |             RuntimeError: Invalid class
84 |             AttributeError: Cannot export model with trust_remote_code=True
85 | 
86 |         Returns:
87 |             torch.nn.Module: compiled model
88 |         """
89 |         if transformers_class is None:
90 |             raise RuntimeError(f"Invalid transformer class {type(transformers_class)}")
91 |         # get the model cache dir and path from the name and arguments
92 |         model_dir_path, model_path = get_model_path(
93 |             model_name_or_path, config.dtype, config.training, *args, **kwargs
94 |         )
95 |         if os.path.isdir(model_dir_path) and os.path.isfile(model_path):
96 |             # Model already exists, so it can be loaded directly
97 |             return torch.load(model_path)
98 |         else:
99 |             # Model does not exist, so it needs to be compiled first
100 |             print(f"Compiling model {model_name_or_path} {config.dtype} for the NPU")
101 |             model = transformers_class.from_pretrained(
102 |                 model_name_or_path, *args, **kwargs
103 |             )
104 |             model = npu_lib.compile(model, config)
105 |             if export:
106 |                 if kwargs.get("trust_remote_code", False):
107 |                     raise AttributeError(
108 |                         "Cannot export model with trust_remote_code=True. Please set trust_remote_code=False or export=False"
109 |                     )
110 |                 print(f"Exporting model {model_name_or_path} to {model_dir_path}")
111 |                 os.makedirs(model_dir_path, exist_ok=True)
112 |                 torch.save(model, model_path)
113 |             return model
114 | 
115 | 
116 | class NPUAutoModel:
117 |     """NPU wrapper for AutoModel.
118 | 
119 |     Attrs:
120 |         from_pretrained: Load a pretrained model
121 |     """
122 | 
123 |     from_pretrained = partialmethod(
124 |         NPUModel.from_pretrained, transformers_class=AutoModel
125 |     )
126 | 
127 | 
128 | class NPUModelForCausalLM:
129 |     """NPU wrapper for AutoModelForCausalLM.
130 | 
131 |     Attrs:
132 |         from_pretrained: Load a pretrained model
133 |     """
134 | 
135 |     from_pretrained = partialmethod(
136 |         NPUModel.from_pretrained, transformers_class=AutoModelForCausalLM
137 |     )
138 | 
139 | 
140 | class NPUModelForSeq2SeqLM:
141 |     """NPU wrapper for AutoModelForSeq2SeqLM.
142 | 143 | Attrs: 144 | from_pretrained: Load a pretrained model 145 | """ 146 | 147 | from_pretrained = partialmethod( 148 | NPUModel.from_pretrained, transformers_class=AutoModelForSeq2SeqLM 149 | ) 150 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/nn/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from .functional import * # noqa 7 | from .linear import Linear, QuantizedLinear # noqa 8 | from .conv import Conv2d # noqa 9 | from .module import Module # noqa 10 | 11 | try: 12 | from .llm import LlamaAttention, PhiMLP # noqa 13 | 14 | llm_modules = ["LlamaAttention", "PhiMLP"] 15 | except ModuleNotFoundError: 16 | # Transformer library is not installed 17 | llm_modules = [] 18 | 19 | 20 | __all__ = ["Module", "Linear", "QuantizedLinear", "Conv2d"] + llm_modules 21 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/nn/autograd.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend import run_matmul 7 | from typing import Optional, Iterable, Union 8 | import torch 9 | 10 | 11 | class AutogradMatMul(torch.autograd.Function): 12 | """Autograd module for Linear operation.""" 13 | 14 | @staticmethod 15 | def forward( 16 | ctx, x: torch.Tensor, w: torch.Tensor, scale: Optional[torch.Tensor] = None 17 | ) -> torch.Tensor: 18 | """Run a linear forward pass. Depending on the datatype of the weights it runs a float or quantized operation. 19 | 20 | Equivalent pytorch code: 21 | result = x @ w.T 22 | 23 | Args: 24 | ctx (Any): the autograd context 25 | x (torch.Tensor): Activation tensor. Its dtype must be torch.float16 26 | w (torch.Tensor): Weight tensor. Its dtype must be torch.float16 27 | scale (Optional[torch.Tensor], optional): Quantization scale. If weights.dtype == torch.int8 then it must be set. Defaults to None. 28 | 29 | Returns: 30 | torch.Tensor: result 31 | """ 32 | result = run_matmul(x, w, scale, None) 33 | ctx.save_for_backward(w, x) 34 | return result 35 | 36 | @staticmethod 37 | def backward(ctx, grad_output: torch.Tensor) -> Iterable[Union[torch.Tensor, None]]: 38 | """Run a linear backward pass. 
39 | 40 | grad_output shape: [batch, output_channels] 41 | x shape: [batch, input_channels] 42 | w shape: [output_channels, input_channels] 43 | 44 | Expected gradients 45 | dl_dx shape: [batch, input_channels] 46 | dl_dw shape: [output_channels, input_channels] 47 | 48 | Equivalent pytorch code: 49 | dl_dx = grad_output @ w.to(torch.float32) 50 | dl_dw = (x.T @ grad_output).T 51 | 52 | Args: 53 | ctx (Any): the autograd context 54 | grad_output (torch.Tensor): output gradient 55 | 56 | Returns: 57 | Iterable[Union[torch.Tensor, None]]: Input and parameters gradients 58 | """ 59 | ( 60 | w, 61 | x, 62 | ) = ctx.saved_tensors 63 | 64 | dl_dx = run_matmul(grad_output, torch.transpose(w, -1, -2)) 65 | dl_dw = run_matmul( 66 | torch.transpose(grad_output, -1, -2), 67 | torch.transpose(x, -1, -2).to(torch.float16), 68 | ) 69 | return dl_dx, dl_dw, None 70 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/nn/linear.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4 7 | from intel_npu_acceleration_library.nn.autograd import AutogradMatMul 8 | from intel_npu_acceleration_library.backend import run_matmul 9 | from intel_npu_acceleration_library.dtypes import NPUDtype 10 | from typing import Optional, Union 11 | import torch 12 | import uuid 13 | import math 14 | 15 | 16 | class Linear(torch.nn.Module): 17 | """Torch Linear operation NPU backend.""" 18 | 19 | def __init__(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None): 20 | """Initialize the Linear class. 21 | 22 | Args: 23 | weight (torch.Tensor): Linear operation weight 24 | bias (Optional[torch.Tensor], optional): Linear operation optional bias. Defaults to None. 25 | """ 26 | super().__init__() 27 | 28 | self.weight = torch.nn.Parameter(weight) 29 | self.bias = torch.nn.Parameter(bias) if isinstance(bias, torch.Tensor) else None 30 | self.outC, self.inC = self.weight.shape 31 | self.op_id = str(uuid.uuid4()) 32 | # assert self.weight.dtype == torch.float16 33 | self._mm = AutogradMatMul.apply 34 | 35 | def forward(self, x: torch.Tensor) -> torch.Tensor: 36 | """Torch module forward method. 37 | 38 | Args: 39 | x (torch.Tensor): Input tensor 40 | 41 | Returns: 42 | torch.Tensor: result 43 | """ 44 | if self.training: 45 | out = self._mm(x, self.weight, None) 46 | else: 47 | out = run_matmul(x, self.weight, None, self.op_id) 48 | 49 | if self.bias is None: 50 | return out 51 | return out + self.bias 52 | 53 | @staticmethod 54 | def fromTorch( 55 | layer: torch.nn.Linear, dtype: torch.dtype = torch.float16 56 | ) -> Union["Linear", "QuantizedLinear"]: 57 | """Generate a NPU Linear layer from a torch one. 58 | 59 | Args: 60 | layer (torch.nn.Linear): the original torch.nn.Linear model to run on the NPU 61 | dtype (torch.dtype): the desired datatype 62 | 63 | Returns: 64 | Union[Linear, QuantizedLinear]: A NPU linear layer 65 | """ 66 | if any(dim > 2**17 for dim in layer.weight.shape): 67 | return layer 68 | return Linear.fromTensor(layer.weight, getattr(layer, "bias", None), dtype) 69 | 70 | @staticmethod 71 | def fromTensor( 72 | weight: torch.Tensor, 73 | bias: Optional[torch.Tensor], 74 | dtype: torch.dtype = torch.float16, 75 | ) -> Union["Linear", "QuantizedLinear"]: 76 | """Generate a NPU Linear layer from a torch one. 
77 | 
78 |         Args:
79 |             weight (torch.Tensor): the original weight tensor
80 |             bias (Optional[torch.Tensor]): the original bias tensor
81 |             dtype (torch.dtype): the desired datatype
82 | 
83 |         Raises:
84 |             RuntimeError: dtype not supported
85 | 
86 |         Returns:
87 |             Union[Linear, QuantizedLinear]: A NPU linear layer
88 |         """
89 |         if dtype.is_floating_point:
90 |             if bias is None:
91 |                 return Linear(weight.to(dtype), None)
92 |             return Linear(weight.to(dtype), bias.to(dtype))
93 |         elif isinstance(dtype, NPUDtype):
94 |             weights_quant, scale = quantize_tensor(weight, (dtype.min, dtype.max))
95 |             if dtype.bits == 4:
96 |                 weights_quant = compress_to_i4(weights_quant)
97 |             return QuantizedLinear(weights_quant, scale, bias)
98 |         elif dtype == torch.int8:
99 |             weights_quant, scale = quantize_tensor(weight)
100 |             return QuantizedLinear(weights_quant, scale, bias)
101 |         else:
102 |             raise RuntimeError(
103 |                 f"intel-npu-acceleration-library does not yet support the requested datatype: {dtype}"
104 |             )
105 | 
106 | 
107 | class QuantizedLinear(torch.nn.Module):
108 |     """Torch Quantized Linear operation NPU backend."""
109 | 
110 |     def __init__(
111 |         self,
112 |         weight: torch.Tensor,
113 |         scale: torch.Tensor,
114 |         bias: Optional[torch.Tensor] = None,
115 |     ):
116 |         """Initialize the QuantizedLinear class.
117 | 
118 |         Args:
119 |             weight (torch.Tensor): Linear operation weight
120 |             scale (torch.Tensor): Quantization scale
121 |             bias (Optional[torch.Tensor], optional): Linear operation optional bias. Defaults to None.
122 | 
123 |         Raises:
124 |             RuntimeError: Quantized weight must be in torch.int8 or torch.uint8 format
125 |         """
126 |         super().__init__()
127 | 
128 |         self.weight = weight
129 |         if self.weight.dtype not in (torch.int8, torch.uint8):
130 |             raise RuntimeError(
131 |                 f"Quantized weight must be in torch.(u)int8 dtype instead of {self.weight.dtype}"
132 |             )
133 |         self.outC, self.inC = self.weight.shape
134 |         if self.weight.dtype == torch.uint8:
135 |             # In case of int4, we need to double the input channels because weights are compressed
136 |             self.inC *= 2
137 |         self.scale = scale * math.sqrt(self.inC)
138 |         self.bias = bias
139 |         self.op_id = str(uuid.uuid4())
140 |         self._mm = AutogradMatMul.apply
141 | 
142 |     def forward(self, x: torch.Tensor) -> torch.Tensor:
143 |         """Torch module forward method.
144 | 
145 |         Args:
146 |             x (torch.Tensor): Input tensor
147 | 
148 |         Raises:
149 |             RuntimeError: Training is not supported for QuantizedLinear layer. Use `.eval()` to do inference only
150 | 
151 |         Returns:
152 |             torch.Tensor: result
153 |         """
154 |         if self.training:
155 |             raise RuntimeError(
156 |                 "Training is not supported for QuantizedLinear layer. Use `.eval()` to do inference only"
157 |             )
158 |         out = run_matmul(x, self.weight, self.scale, self.op_id)
159 | 
160 |         if self.bias is None:
161 |             return out
162 |         return out + self.bias
163 | 
--------------------------------------------------------------------------------
/intel_npu_acceleration_library/optimizations.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright © 2024 Intel Corporation
3 | # SPDX-License-Identifier: Apache 2.0
4 | #
5 | from typing import Dict, List, Any
6 | import torch.nn as nn
7 | import torch.fx as fx
8 | import operator
9 | import torch
10 | 
11 | 
12 | def delattr_recursively(module: nn.Module, target: str):
13 |     """Delete attribute recursively by name in a torch.nn.Module.
14 | 
15 |     Args:
16 |         module (nn.Module): the nn.Module
17 |         target (str): the attribute you want to delete
18 |     """
19 |     *root, name = target.rsplit(".", 1)
20 |     if root:
21 |         root = root[0].split(".")
22 |         delattr_recursively(getattr(module, root[0]), ".".join(root[1:] + [name]))
23 |     else:
24 |         delattr(module, target)
25 | 
26 | 
27 | def fuse_linear_layers(
28 |     model: nn.Module,
29 |     modules: Dict[str, nn.Linear],
30 |     targets: List[str],
31 |     fused_layer_name: str,
32 | ) -> None:
33 |     """Fuse two or more linear layers and append the result to the nn.Module.
34 | 
35 |     Args:
36 |         model (nn.Module): Original nn.Module object
37 |         modules (Dict[str, nn.Linear]): a dictionary of node name: linear layer
38 |         targets (List[str]): list of layer node names
39 |         fused_layer_name (str): fused layer name
40 | 
41 |     Raises:
42 |         ValueError: All linear layers must be of type nn.Linear and must have the same input dimension
43 | 
44 |     """
45 |     # Get the attributes
46 |     layers = [modules[name] for name in targets]
47 | 
48 |     in_features = list({layer.in_features for layer in layers})
49 | 
50 |     # ensure all linear layers have the same input dimensions and are not already fused
51 |     if not all(isinstance(layer, nn.Linear) for layer in layers):
52 |         raise ValueError("All linear layers must be of type nn.Linear")
53 |     if len(in_features) != 1:
54 |         raise ValueError(
55 |             f"All linear layers must have the same input dimensions. Instead found: {in_features}"
56 |         )
57 | 
58 |     # Create the new fused linear layer
59 |     new_out_features = sum([layer.out_features for layer in layers])
60 |     has_bias = any(layer.bias is not None for layer in layers)
61 |     fused_layer = nn.Linear(in_features[0], new_out_features, bias=has_bias)
62 | 
63 |     # Concatenate the weights and biases
64 |     with torch.no_grad():
65 |         start, stop = 0, 0
66 |         for layer in layers:
67 |             stop += layer.out_features
68 |             fused_layer.weight[start:stop, :] = layer.weight
69 | 
70 |             if has_bias:
71 |                 if layer.bias is not None:
72 |                     fused_layer.bias[start:stop] = layer.bias
73 |                 else:
74 |                     fused_layer.bias[start:stop] = torch.zeros_like(
75 |                         fused_layer.bias[start:stop]
76 |                     )
77 |             start = stop
78 | 
79 |     # Replace the original layers in the model with the new fused layer
80 |     setattr(model, fused_layer_name, fused_layer)
81 |     for layer_name in targets:
82 |         delattr_recursively(model, layer_name)
83 | 
84 | 
85 | def horizontal_fusion_linear(model: torch.nn.Module) -> torch.nn.Module:
86 |     """Fuse horizontally two or more linear layers that share the same origin. This will increase NPU hw utilization.
87 | 
88 |     Args:
89 |         model (torch.nn.Module): The original nn.Module
90 | 
91 |     Returns:
92 |         torch.nn.Module: optimized nn.Module where parallel linear operations have been fused into a single bigger one
93 |     """
94 |     fx_model = fx.symbolic_trace(model)
95 |     modules = dict(fx_model.named_modules())
96 |     # new_graph = copy.deepcopy(fx_model.graph)
97 | 
98 |     def node_condition(node: Any) -> bool:
99 |         """Return true if the node is a module and is nn.Linear.
100 | 101 | Args: 102 | node (Any): A torch fx node 103 | 104 | Returns: 105 | bool: return condition 106 | """ 107 | return node.op == "call_module" and isinstance(modules[node.target], nn.Linear) 108 | 109 | # First, find all node with a linear layer 110 | linear_nodes = [node for node in fx_model.graph.nodes if node_condition(node)] 111 | 112 | # Group the linear layers by input node 113 | linear_nodes_parents: Dict[str, List[Any]] = {} 114 | for node in linear_nodes: 115 | linear_nodes_parents.setdefault(node.args[0], []).append(node) 116 | 117 | # Get the ones with size > 1 118 | fused_modules = [ 119 | (source, modules) 120 | for source, modules in linear_nodes_parents.items() 121 | if len(modules) > 1 122 | ] 123 | 124 | for source, layers in fused_modules: 125 | fused_layer_name = "fused_" + "_".join(node.target for node in layers) 126 | fused_layer_name = fused_layer_name.replace(".", "_") 127 | fuse_linear_layers( 128 | fx_model, modules, [layer.target for layer in layers], fused_layer_name 129 | ) 130 | with fx_model.graph.inserting_after(source): 131 | fused_node = fx_model.graph.call_module(fused_layer_name, (source,)) 132 | 133 | with fx_model.graph.inserting_after(fused_node): 134 | 135 | start, stop = 0, 0 136 | for layer in layers: 137 | stop += modules[layer.target].out_features 138 | 139 | layer_slice = fx_model.graph.call_function( 140 | operator.getitem, 141 | args=( 142 | fused_node, 143 | ( 144 | Ellipsis, 145 | slice(start, stop, None), 146 | ), 147 | ), 148 | kwargs={}, 149 | ) 150 | layer.replace_all_uses_with(layer_slice) 151 | fx_model.graph.erase_node(layer) 152 | start = stop 153 | 154 | fx_model.graph.lint() 155 | fx_model.recompile() 156 | 157 | return fx_model 158 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/quantization.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | import intel_npu_acceleration_library.backend.compression as compression 6 | from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion 7 | from intel_npu_acceleration_library.dtypes import int8, int4 8 | from intel_npu_acceleration_library.dtypes import NPUDtype 9 | from neural_compressor.quantization import fit 10 | from typing import Tuple 11 | import logging 12 | import torch 13 | 14 | 15 | def quantize_tensor( 16 | weight: torch.Tensor, min_max_range: Tuple[int, int] = (-128, 127) 17 | ) -> Tuple[torch.Tensor, torch.Tensor]: 18 | """Quantize a fp16 tensor symmetrically. 19 | 20 | Produces a quantize tensor (same shape, dtype == `torch.int8`) and a scale tensor (dtype == `torch.float16) 21 | The quantization equation is the following W = S * W_q 22 | 23 | Args: 24 | weight (torch.Tensor): The tensor to quantize 25 | min_max_range (Tuple[int, int]): The min and max range for the quantized tensor. Defaults to (-128, 127). 
26 | 27 | Raises: 28 | RuntimeError: Error in the quantization step 29 | 30 | Returns: 31 | Tuple[torch.Tensor, torch.Tensor]: Quantized tensor and scale 32 | """ 33 | scale = torch.max(torch.abs(weight), dim=-1).values 34 | 35 | # if any of the elements are zeros set the scale to the max value 36 | if torch.any(scale == 0): 37 | scale = torch.ones_like(scale) * torch.max(torch.abs(weight)) 38 | 39 | # Compute scale and zero point 40 | scale = (scale / max(min_max_range)).to(torch.float16).view(-1, 1) 41 | 42 | weights_quant = torch.floor(weight / scale) 43 | 44 | if not ( 45 | torch.max(weights_quant) <= max(min_max_range) 46 | and torch.min(weights_quant) >= min(min_max_range) 47 | ): 48 | raise RuntimeError( 49 | f"Quantization error: range of quantized weghts = {(torch.min(weights_quant), torch.max(weights_quant))} instead of ({min_max_range})" 50 | ) 51 | return weights_quant.to(torch.int8), scale 52 | 53 | 54 | def compress_to_i4(weights: torch.Tensor) -> torch.Tensor: 55 | """ 56 | Compresses a given tensor to 4-bit representation. 57 | 58 | Args: 59 | weights (torch.Tensor): The input tensor to be compressed. 60 | 61 | Returns: 62 | torch.Tensor: The compressed tensor with 4-bit representation. 63 | """ 64 | return torch.tensor(compression.compress_to_i4(weights.numpy())) 65 | 66 | 67 | def quantize_fit( 68 | model: torch.nn.Module, weights_dtype: str, algorithm: str = "RTN" 69 | ) -> torch.nn.Module: 70 | """Quantize a model with a given configuration. 71 | 72 | Args: 73 | model (torch.nn.Module): The model to quantize 74 | weights_dtype (str): The datatype for the weights 75 | algorithm (str, optional): The quantization algorithm. Defaults to "RTN". 76 | 77 | Raises: 78 | RuntimeError: Quantization error: unsupported datatype 79 | 80 | Returns: 81 | torch.nn.Module: The quantized model 82 | """ 83 | if weights_dtype == "int4": 84 | bits = 4 85 | elif weights_dtype == "int8": 86 | bits = 8 87 | else: 88 | raise RuntimeError(f"Quantization error: unsupported datatype {weights_dtype}") 89 | 90 | conf = PostTrainingQuantConfig( 91 | approach="weight_only", 92 | tuning_criterion=TuningCriterion(timeout=100000), 93 | op_type_dict={ 94 | ".*": { # match all ops 95 | "weight": { 96 | "dtype": weights_dtype, 97 | "bits": bits, 98 | "group_size": -1, 99 | "scheme": "sym", 100 | "algorithm": algorithm, 101 | }, 102 | "activation": { 103 | "dtype": "fp16", 104 | }, 105 | } 106 | }, 107 | ) 108 | 109 | return fit(model=model, conf=conf) 110 | 111 | 112 | def quantize_i8_model( 113 | model: torch.nn.Module, algorithm: str = "RTN" 114 | ) -> torch.nn.Module: 115 | """Quantize a model to 8-bit representation. 116 | 117 | Args: 118 | model (torch.nn.Module): The model to quantize 119 | algorithm (str, optional): The quantization algorithm. Defaults to "RTN". 120 | 121 | Returns: 122 | torch.nn.Module: The quantized model 123 | """ 124 | quantized_model = quantize_fit(model, "int8", algorithm) 125 | 126 | return quantized_model.export_compressed_model( 127 | scale_dtype=torch.float16, use_optimum_format=False 128 | ) 129 | 130 | 131 | def quantize_i4_model( 132 | model: torch.nn.Module, algorithm: str = "RTN" 133 | ) -> torch.nn.Module: 134 | """Quantize a model to 4-bit representation. 135 | 136 | Args: 137 | model (torch.nn.Module): The model to quantize 138 | algorithm (str, optional): The quantization algorithm. Defaults to "RTN". 
139 | 140 | Returns: 141 | torch.nn.Module: The quantized model 142 | """ 143 | quantized_model = quantize_fit(model, "int4", algorithm) 144 | 145 | return quantized_model.export_compressed_model( 146 | compression_dtype=torch.int8, 147 | scale_dtype=torch.float16, 148 | use_optimum_format=False, 149 | ) 150 | 151 | 152 | def quantize_model(model: torch.nn.Module, dtype: NPUDtype) -> torch.nn.Module: 153 | """Quantize a model. 154 | 155 | Args: 156 | model (torch.nn.Module): The model to quantize 157 | dtype (NPUDtype): The desired datatype 158 | 159 | Raises: 160 | RuntimeError: Quantization error: unsupported datatype 161 | 162 | Returns: 163 | torch.nn.Module: The quantized model 164 | """ 165 | # Silence neural compressor logger 166 | logger = logging.getLogger("neural_compressor") 167 | logger.setLevel(logging.ERROR) 168 | 169 | if dtype == int4: 170 | return quantize_i4_model(model) 171 | elif dtype == int8: 172 | return quantize_i8_model(model) 173 | else: 174 | raise RuntimeError(f"Quantization error: unsupported datatype {dtype}") 175 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | implicit_reexport = True 3 | ignore_missing_imports = True 4 | disable_error_code = override -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | torch 3 | transformers>=4.43.0 4 | neural-compressor -------------------------------------------------------------------------------- /script/export.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig 7 | from intel_npu_acceleration_library.compiler import compile 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | from intel_npu_acceleration_library.dtypes import int8, int4 10 | import argparse 11 | import torch 12 | import os 13 | 14 | 15 | def define_and_parse_args(): 16 | parser = argparse.ArgumentParser(description="Export a for NPU") 17 | parser.add_argument("--model", "-m", type=str, help="Name of the model to export") 18 | parser.add_argument( 19 | "--dtype", 20 | type=str, 21 | default="fp16", 22 | choices=["fp16", "int8", "int4"], 23 | help="type of quantization to perform", 24 | ) 25 | parser.add_argument( 26 | "--output", "-o", type=str, default="models", help="Output path" 27 | ) 28 | 29 | return parser.parse_args() 30 | 31 | 32 | def export(model_id, dtype, output): 33 | 34 | print(f"Loading {model_id}") 35 | model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=True) 36 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) 37 | config = AutoConfig.from_pretrained(model_id) 38 | 39 | PATH = os.path.join(output, model_id, dtype) 40 | 41 | tokenizer.save_pretrained(PATH) 42 | config.save_pretrained(PATH) 43 | 44 | if dtype == "fp16": 45 | print(f"Compiling model {model_id}") 46 | dtype = torch.float16 47 | elif dtype == "int8": 48 | print(f"Quantizing & Compiling model {model_id}") 49 | dtype = int8 50 | elif dtype == "int4": 51 | print(f"Quantizing & Compiling model {model_id}") 52 | dtype = int4 53 | else: 54 | raise RuntimeError(f"Invalid dtype {dtype}") 55 | 56 | with torch.no_grad(): 57 | 
compiler_conf = CompilerConfig(dtype=dtype) 58 | compile(model, compiler_conf) 59 | 60 | filename = os.path.join(PATH, "model.pth") 61 | os.makedirs(PATH, exist_ok=True) 62 | 63 | print("Saving model...") 64 | torch.save(model, filename) 65 | 66 | print(f"Model saved in {filename}") 67 | 68 | 69 | if __name__ == "__main__": 70 | args = define_and_parse_args() 71 | export(args.model, args.dtype, args.output) 72 | -------------------------------------------------------------------------------- /script/gen_leaderboard_doc.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | import matplotlib.pyplot as plt 7 | import seaborn as sns 8 | import pandas as pd 9 | import datetime 10 | import json 11 | import glob 12 | import os 13 | 14 | 15 | df = None 16 | for files in glob.glob("./leaderboard_*.json"): 17 | with open(files) as fp: 18 | prof = json.load(fp) 19 | date = datetime_object = datetime.datetime.strptime( 20 | prof["config"]["time"], "%Y-%m-%d_%H-%M-%S" 21 | ) 22 | 23 | new_df = pd.DataFrame.from_records(prof["profiling"]) 24 | new_df["date"] = date 25 | if df is None: 26 | df = new_df 27 | else: 28 | df = pd.concat([df, new_df], axis=0, join="outer") 29 | 30 | df.pop("error") 31 | 32 | 33 | col_to_str = { 34 | "model": "Model", 35 | "context_size": "Context Size", 36 | "tps": "Tokens / s", 37 | "prefill": "Prefill (s)", 38 | "intel_npu_acceleration_library": "Intel® NPU Acceleration Library enabled", 39 | "dtype": "Datatype", 40 | } 41 | 42 | 43 | def plot(df, x, y, hue="context_size", title=None, latest=True): 44 | 45 | filtered = df[(df["date"] == df["date"].max())] if latest else df 46 | 47 | plt.figure(figsize=(16, 9)) 48 | ax = sns.barplot(filtered.dropna(), x=x, y=y, hue=hue) 49 | ax.set_xlabel(col_to_str[x]) 50 | plt.xticks(rotation=45) 51 | if y == "prefill": 52 | ax.set_ylabel(f"Log {col_to_str[y]}") 53 | # ax.set_yscale('log') 54 | else: 55 | ax.set_ylabel(col_to_str[y]) 56 | if title is None: 57 | title = f"{col_to_str[y]} vs {col_to_str[x]}" 58 | ax.set_title(title) 59 | # ax.legend(bbox_to_anchor=(1, 1), loc=2, borderaxespad=0., title=col_to_str[hue]) 60 | ax.legend(title=col_to_str[hue]) 61 | filename = f"data/{x}_{y}_{hue}.png" 62 | os.makedirs("data", exist_ok=True) 63 | print(f"Save image {title} to {filename}") 64 | ax.get_figure().savefig(filename, bbox_inches="tight") 65 | 66 | 67 | sns.color_palette("tab10") 68 | 69 | plot(df[df["context_size"] == 128], "model", "tps", hue="dtype") 70 | plot(df[df["context_size"] == 128], "model", "prefill", hue="dtype") 71 | 72 | plot(df[df["intel_npu_acceleration_library"] == True], "model", "tps") 73 | plot(df[df["intel_npu_acceleration_library"] == True], "model", "prefill") 74 | -------------------------------------------------------------------------------- /script/llm_leaderboard.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | import subprocess 7 | import itertools 8 | import intel_npu_acceleration_library 9 | import platform 10 | import datetime 11 | import socket 12 | import tqdm 13 | import json 14 | import re 15 | 16 | 17 | def profile_model( 18 | model_id, 19 | context_size, 20 | device="NPU", 21 | dtype="float16", 22 | use_intel_npu_acceleration_library=True, 23 | ): 24 | 25 | profiling_data = { 26 | "model": model_id, 27 | "context_size": 
context_size,
28 |         "device": device,
29 |         "dtype": dtype,
30 |         "intel_npu_acceleration_library": use_intel_npu_acceleration_library,
31 |         "prefill": None,
32 |         "tps": None,
33 |         "error": None,
34 |     }
35 |     try:
36 |         disable_intel_npu_acceleration_library = (
37 |             "--disable-intel-npu-acceleration-library"
38 |             if not use_intel_npu_acceleration_library
39 |             else ""
40 |         )
41 |         output = subprocess.check_output(
42 |             f"python profile_llm.py -m {model_id} --context-size {context_size} --device {device} --dtype {dtype} {disable_intel_npu_acceleration_library} ",
43 |             shell=True,
44 |         ).decode()
45 | 
46 |         profiling_line = output.strip().split("\n")[-1].strip()
47 | 
48 |         pattern = r"prefill-phase (\d+\.\d+) s, tokens/s (\d+\.\d+)"
49 | 
50 |         match = re.search(pattern, profiling_line)
51 | 
52 |         # Check if a match is found
53 |         if match:
54 |             # Extract the prefill phase and tokens/s values
55 |             profiling_data["prefill"] = float(match.group(1))
56 |             profiling_data["tps"] = float(match.group(2))
57 |         else:
58 |             profiling_data["error"] = f"parsing error: profiling output {output}"
59 |     except Exception:
60 |         profiling_data["error"] = "runtime error"
61 | 
62 |     return profiling_data
63 | 
64 | 
65 | def save_data(data):
66 |     date = data["config"]["time"].replace(" ", "_").replace(":", "_")
67 |     with open(f"leaderboard_{date}.json", "w") as fp:
68 |         json.dump(data, fp, indent=4)
69 | 
70 | 
71 | def main():
72 | 
73 |     data = {
74 |         "config": {
75 |             "time": datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
76 |             "arch": platform.machine(),
77 |             "version": platform.version(),
78 |             "platform": platform.platform(),
79 |             "processor": platform.processor(),
80 |             "hostname": socket.gethostname(),
81 |             "npu": "yes"
82 |             if intel_npu_acceleration_library.backend.npu_available()
83 |             else "no",
84 |             "unit": "seconds",
85 |         },
86 |         "profiling": [],
87 |     }
88 |     save_data(data)
89 | 
90 |     models = [
91 |         "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
92 |         "microsoft/phi-2",
93 |         "stabilityai/stablelm-3b-4e1t",
94 |         # "qnguyen3/quan-1.8b-chat",
95 |         "facebook/opt-1.3b",
96 |         "gpt2-large",
97 |         "openlm-research/open_llama_3b_v2",
98 |         "EleutherAI/pythia-2.8b-v0",
99 |         "tiiuae/falcon-rw-1b",
100 |         "EleutherAI/gpt-neo-1.3B",
101 |         "stabilityai/stable-code-3b",
102 |         "google/gemma-2b-it",
103 |     ]
104 | 
105 |     contexts = [64, 128, 256, 512]
106 |     use_intel_npu_acceleration_library_lst = [True]
107 |     devices = ["NPU"]
108 |     dtypes = ["float16", "int8"]
109 |     configurations = list(
110 |         itertools.product(
111 |             models, contexts, devices, dtypes, use_intel_npu_acceleration_library_lst
112 |         )
113 |     )
114 | 
115 |     for model, context, device, dtype, use_intel_npu_acceleration_library in tqdm.tqdm(
116 |         configurations
117 |     ):
118 |         profiling_data = profile_model(
119 |             model, context, device, dtype, use_intel_npu_acceleration_library
120 |         )
121 |         data["profiling"].append(profiling_data)
122 |         save_data(data)
123 | 
124 | 
125 | if __name__ == "__main__":
126 | 
127 |     main()
128 | 
--------------------------------------------------------------------------------
/script/profile_llm.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright © 2024 Intel Corporation
3 | # SPDX-License-Identifier: Apache 2.0
4 | #
5 | 
6 | from transformers import AutoTokenizer, AutoModelForCausalLM
7 | from intel_npu_acceleration_library.nn.llm import generate_with_static_shape
8 | from intel_npu_acceleration_library.dtypes import int8, int4
9 | from intel_npu_acceleration_library.compiler import CompilerConfig
10 | 
11 | from torch.profiler 
import profile, ProfilerActivity 12 | import intel_npu_acceleration_library 13 | import argparse 14 | import torch 15 | import time 16 | import os 17 | 18 | 19 | def main( 20 | prompt="List all numbers in the Fibonacci sequence: 1, 1, 2, 3, ", 21 | context_size=512, 22 | max_new_tokens=50, 23 | model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0", 24 | device="NPU", 25 | dtype="float16", 26 | disable_intel_npu_acceleration_library=False, 27 | ): 28 | 29 | # Load tokenizer 30 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 31 | tokenizer.pad_token_id = tokenizer.eos_token_id 32 | # Load model 33 | if os.path.isdir(model_id) and os.path.isfile(f"{model_id}//model.pth"): 34 | compiled = True 35 | model = torch.load(f"{model_id}//model.pth") 36 | model.eval() 37 | else: 38 | compiled = False 39 | model = ( 40 | AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True) 41 | .to("cpu") 42 | .eval() 43 | ) 44 | 45 | if dtype == "float16": 46 | dtype = torch.float16 47 | elif dtype == "int8": 48 | dtype = int8 49 | elif dtype == "int4": 50 | dtype = int4 51 | else: 52 | raise RuntimeError(f"Invalid dtype: {dtype}") 53 | 54 | if not disable_intel_npu_acceleration_library: 55 | if not compiled: 56 | compiler_conf = CompilerConfig(dtype=dtype) 57 | model = intel_npu_acceleration_library.compile(model, compiler_conf) 58 | intel_npu_acceleration_library.nn.llm.warm_up_decoder_model( 59 | tokenizer, model, context_size 60 | ) 61 | 62 | # Tokenize 63 | input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cpu") 64 | 65 | results = generate_with_static_shape( 66 | model, 67 | input_ids=input_ids, 68 | max_length=context_size, 69 | use_past=True, 70 | pad_token_id=tokenizer.pad_token_id, 71 | ) 72 | times = [time.perf_counter()] 73 | idx = 0 74 | with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof: 75 | for new_token_id in results: 76 | times.append(time.perf_counter()) 77 | if idx >= max_new_tokens: 78 | break 79 | idx += 1 80 | token = tokenizer.decode([new_token_id], skip_special_tokens=True) 81 | 82 | print( 83 | prof.key_averages(group_by_input_shape=True).table( 84 | sort_by="self_cpu_time_total", row_limit=20 85 | ) 86 | ) 87 | prof.export_chrome_trace("trace.json") 88 | 89 | elapsed = [y - x for x, y in zip(times, times[1:])] 90 | 91 | prefix_time = elapsed[0] 92 | tps = len(elapsed[1:]) / sum(elapsed[1:]) 93 | 94 | print( 95 | f"model {model_id} (context: {context_size}): prefill-phase {prefix_time:.3f} s, tokens/s {tps:.3f}" 96 | ) 97 | 98 | 99 | def define_and_parse_args(): 100 | parser = argparse.ArgumentParser(description="Profiling a LLM in the NPU") 101 | parser.add_argument( 102 | "--model", 103 | "-m", 104 | type=str, 105 | default="TinyLlama/TinyLlama-1.1B-Chat-v1.0", 106 | help="Model", 107 | ) 108 | parser.add_argument( 109 | "--context-size", 110 | type=int, 111 | default=128, 112 | help="Context size (default: %(default)s)", 113 | ) 114 | parser.add_argument( 115 | "--n-threads", 116 | type=int, 117 | default=None, 118 | help="Set the number of CPU threads to use (default: %(default))", 119 | ) 120 | parser.add_argument( 121 | "--max-new-tokens", 122 | type=int, 123 | default=10, 124 | help="Set the max number of new tokens to generate (default: %(default)s)", 125 | ) 126 | 127 | parser.add_argument( 128 | "--device", 129 | "-d", 130 | default="NPU", 131 | choices=["NPU", "CPU", "GPU"], 132 | help="Select the target device (default: %(default)s)", 133 | ) 134 | parser.add_argument( 135 | "--dtype", 136 | 
default="float16", 137 | choices=["float16", "int8", "int4"], 138 | help="Select the target dtype (default: %(default)s)", 139 | ) 140 | 141 | parser.add_argument( 142 | "--disable-intel-npu-acceleration-library", 143 | action="store_true", 144 | help="Disable Intel® NPU Acceleration Library optimizations", 145 | ) 146 | 147 | return parser.parse_args() 148 | 149 | 150 | if __name__ == "__main__": 151 | args = define_and_parse_args() 152 | 153 | print( 154 | f"Profiling {args.model} with context size {args.context_size} and dtype {args.dtype}" 155 | ) 156 | if args.n_threads: 157 | print(f"Setting number of pytorch thread to {args.n_threads}") 158 | torch.set_num_threads(args.n_threads) 159 | print(f"Pytorch thread: {torch.get_num_threads()}") 160 | 161 | main( 162 | context_size=args.context_size, 163 | model_id=args.model, 164 | max_new_tokens=args.max_new_tokens, 165 | device=args.device, 166 | dtype=args.dtype, 167 | disable_intel_npu_acceleration_library=args.disable_intel_npu_acceleration_library, 168 | ) 169 | -------------------------------------------------------------------------------- /script/profile_matmul.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4 7 | from intel_npu_acceleration_library.dtypes import int4 8 | from intel_npu_acceleration_library.backend import Linear, QLinear 9 | from functools import partial 10 | import numpy as np 11 | import argparse 12 | import torch 13 | import time 14 | import json 15 | 16 | 17 | def print_profile_data(hwp_data, data): 18 | config_keys = ["batch", "inC", "outC", "dtype"] 19 | config = ", ".join([f"{key}: {hwp_data[key]}" for key in config_keys]) 20 | 21 | e2e_runtimes = [elem["runtime"] for elem in data] 22 | print( 23 | f"MatMul ({config}) => HWP: {hwp_data['runtime']:.3f} ms, E2E: {np.mean(e2e_runtimes):.3f} ± {2 * np.std(e2e_runtimes):.3f} ms" 24 | ) 25 | 26 | 27 | def profile(inC, outC, batch, dtype, n_iters=500, skip_first=10): 28 | data = [] 29 | mac = inC * outC * batch 30 | memcpy = (inC + outC) * batch 31 | 32 | X = np.random.uniform(-1, 1, (batch, inC)).astype(np.float16) 33 | W = np.random.uniform(-1, 1, (outC, inC)).astype(np.float16) 34 | 35 | if dtype == "float16": 36 | matmul_csl = Linear 37 | args = [W] 38 | elif dtype == "int8": 39 | weights, scale = quantize_tensor(torch.tensor(W)) 40 | scale *= np.sqrt(inC) 41 | matmul_csl = partial(QLinear, dtype=np.int8) 42 | args = [weights.numpy(), scale.numpy()] 43 | elif dtype == "int4": 44 | weights, scale = quantize_tensor(torch.tensor(W), (int4.min, int4.max)) 45 | scale *= np.sqrt(inC) 46 | weights = compress_to_i4(weights) 47 | matmul_csl = partial(QLinear, dtype=np.uint8) 48 | args = [weights.numpy(), scale.numpy()] 49 | else: 50 | raise RuntimeError(f"Invalid dtype: {dtype}") 51 | 52 | args.append("0000") 53 | 54 | mm_prof = matmul_csl(inC, outC, batch, profile=True) 55 | mm = matmul_csl(inC, outC, batch, profile=False) 56 | 57 | # Get the HWP data 58 | mm_prof.run(X, *args) 59 | with open("profiling.json") as fp: 60 | hwp_runtime = ( 61 | json.load(fp)["taskStatistics"]["total duration"] / 1000.0 62 | ) # in us 63 | hwp_data = dict( 64 | batch=batch, 65 | inC=inC, 66 | outC=outC, 67 | memcpy=memcpy, 68 | mac=mac, 69 | runtime=hwp_runtime, 70 | dtype=dtype, 71 | ) 72 | 73 | for idx in range(n_iters): 74 | t0 = time.perf_counter() 75 | mm.run(X, 
*args) 76 | t1 = time.perf_counter() 77 | if idx > (skip_first - 1): 78 | data.append( 79 | dict( 80 | batch=batch, 81 | inC=inC, 82 | outC=outC, 83 | memcpy=memcpy, 84 | mac=mac, 85 | runtime=(t1 - t0) * 1000, 86 | dtype=W.dtype, 87 | ) 88 | ) 89 | 90 | print_profile_data(hwp_data, data) 91 | 92 | return hwp_data, data 93 | 94 | 95 | def define_and_parse_args(): 96 | parser = argparse.ArgumentParser(description="Profiling a MatMul model in the NPU") 97 | parser.add_argument("--batch", "-b", type=int, required=True, help="MatMul batch") 98 | parser.add_argument( 99 | "--input-channels", "-c", type=int, required=True, help="MatMul input channels" 100 | ) 101 | parser.add_argument( 102 | "--output-channels", 103 | "-k", 104 | type=int, 105 | required=True, 106 | help="MatMul output channels", 107 | ) 108 | parser.add_argument( 109 | "--dtype", 110 | default="float16", 111 | choices=["float16", "int8", "int4"], 112 | help="Select the target dtype (default: %(default)s)", 113 | ) 114 | 115 | return parser.parse_args() 116 | 117 | 118 | if __name__ == "__main__": 119 | args = define_and_parse_args() 120 | profile(args.input_channels, args.output_channels, args.batch, dtype=args.dtype) 121 | -------------------------------------------------------------------------------- /script/profile_mlp.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP 7 | from intel_npu_acceleration_library.dtypes import int8, int4 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | from torch.profiler import profile, ProfilerActivity 10 | from sklearn.metrics import r2_score 11 | import intel_npu_acceleration_library 12 | import argparse 13 | import torch 14 | import numpy as np 15 | 16 | 17 | def main( 18 | seq_len=128, 19 | hidden_size=256, 20 | intermediate_size=512, 21 | dtype="float16", 22 | _profile=False, 23 | enable_graph_mode=False, 24 | ): 25 | 26 | conf = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") 27 | conf.num_hidden_layers = 1 28 | conf.hidden_size = hidden_size 29 | conf.intermediate_size = intermediate_size 30 | 31 | # Define a single Phi-3 MLP layer 32 | mlp = Phi3MLP(conf) 33 | 34 | hidden_states = torch.rand((seq_len, conf.hidden_size)) 35 | 36 | reference = mlp(hidden_states.to(torch.float32)).to(torch.float16) 37 | 38 | if dtype == "float16": 39 | dtype = torch.float16 40 | elif dtype == "int8": 41 | dtype = int8 42 | elif dtype == "int4": 43 | dtype = int4 44 | else: 45 | raise RuntimeError(f"Invalid dtype: {dtype}") 46 | 47 | # Compile model 48 | compiler_conf = CompilerConfig(use_to=enable_graph_mode, dtype=dtype) 49 | model = intel_npu_acceleration_library.compile(mlp, compiler_conf) 50 | if _profile: 51 | model.profile = True 52 | 53 | with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof: 54 | for _ in range(1000): 55 | results = model(hidden_states) 56 | 57 | print( 58 | prof.key_averages(group_by_input_shape=True).table( 59 | sort_by="cpu_time_total", row_limit=20 60 | ) 61 | ) 62 | 63 | prof.export_chrome_trace("trace.json") 64 | 65 | results = results.detach().numpy() 66 | reference = reference.detach().numpy() 67 | 68 | assert results.shape == reference.shape, "Output shape mismatch" 69 | assert np.isfinite(reference).all(), "Pytorch Reference contains NaN or Inf" 70 | assert np.isfinite(results).all(), "NPU output contains 
NaN or Inf" 71 | 72 | if dtype == int4: 73 | assert 1 - r2_score(reference, results) < 0.05 74 | else: 75 | assert 1 - r2_score(reference, results) < 0.001 76 | 77 | 78 | def define_and_parse_args(): 79 | parser = argparse.ArgumentParser(description="Profiling a MLP layer in the NPU") 80 | parser.add_argument( 81 | "--seq-len", 82 | type=int, 83 | default=128, 84 | help="Sequence length (default: %(default)s)", 85 | ) 86 | parser.add_argument( 87 | "--hidden-size", 88 | type=int, 89 | default=256, 90 | help="Hidden size (default: %(default)s)", 91 | ) 92 | parser.add_argument( 93 | "--intermediate-size", 94 | type=int, 95 | default=512, 96 | help="Intermediate size (default: %(default)s)", 97 | ) 98 | parser.add_argument( 99 | "--dtype", 100 | default="float16", 101 | choices=["float16", "int8", "int4"], 102 | help="Select the target dtype (default: %(default)s)", 103 | ) 104 | parser.add_argument( 105 | "--profile", 106 | action="store_true", 107 | default=False, 108 | help="Enable the profiling (default: False)", 109 | ) 110 | parser.add_argument( 111 | "--enable_graph_mode", 112 | action="store_true", 113 | default=False, 114 | help="Enable graph mode for MLP, otherwise use eager mode (default: False)", 115 | ) 116 | 117 | return parser.parse_args() 118 | 119 | 120 | if __name__ == "__main__": 121 | args = define_and_parse_args() 122 | 123 | print( 124 | f"Profiling with sequence length {args.seq_len}, hidden size {args.hidden_size}, intermediate size {args.intermediate_size}, dtype {args.dtype}" 125 | ) 126 | 127 | main( 128 | seq_len=args.seq_len, 129 | hidden_size=args.hidden_size, 130 | intermediate_size=args.intermediate_size, 131 | dtype=args.dtype, 132 | _profile=args.profile, 133 | enable_graph_mode=args.enable_graph_mode, 134 | ) 135 | -------------------------------------------------------------------------------- /script/quantize_model.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers import AutoModelForCausalLM, AutoTokenizer 7 | import intel_npu_acceleration_library as npu_lib 8 | from neural_compressor.config import PostTrainingQuantConfig 9 | from neural_compressor.quantization import fit 10 | from neural_compressor.adaptor.torch_utils.auto_round import get_dataloader 11 | import argparse 12 | import torch 13 | import os 14 | 15 | 16 | def define_and_parse_arguments(): 17 | parser = argparse.ArgumentParser(description="Export a model to NPU") 18 | parser.add_argument( 19 | "-m", 20 | "--model", 21 | type=str, 22 | required=True, 23 | help="The name of the model to export", 24 | ) 25 | parser.add_argument( 26 | "-b", 27 | "--bits", 28 | type=int, 29 | default=4, 30 | help="The number of bits to use for quantization", 31 | ) 32 | parser.add_argument( 33 | "-o", 34 | "--output-dir", 35 | type=str, 36 | default="models", 37 | help="The directory where to save the exported model", 38 | ) 39 | parser.add_argument( 40 | "-s", 41 | "--sequence-lenght", 42 | type=int, 43 | default=2048, 44 | help="The sequence lenght to use for the dataloader", 45 | ) 46 | parser.add_argument( 47 | "-a", 48 | "--algorithm", 49 | type=str, 50 | default="RTN", 51 | help="The quantization algorithm to use", 52 | ) 53 | return parser.parse_args() 54 | 55 | 56 | def export_model( 57 | model_name: str, 58 | algorithm: str, 59 | bits: int = 4, 60 | sequence_lenght: int = 2048, 61 | output_dir: str = "models", 62 | ): 63 | """Quantize and export a 
model. 64 | 65 | Args: 66 | model_name (str): the name of the model to export 67 | algorithm (str, optional): the neural compressor quantization algorithm 68 | bits (int, optional): the number of bits. Defaults to 4. 69 | sequence_lenght (int, optional): the model sequence lenght. Defaults to 2048. 70 | output_dir (str, optional): the output directory. Defaults to "models". 71 | """ 72 | print(f"Exporting model {model_name} with {bits} bits") 73 | output_folder = os.path.join(output_dir, model_name, algorithm, f"int{bits}") 74 | os.makedirs(output_folder, exist_ok=True) 75 | 76 | float_model = AutoModelForCausalLM.from_pretrained(model_name) 77 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 78 | float_model.config.save_pretrained(output_folder) 79 | tokenizer.save_pretrained(output_folder) 80 | 81 | dataloader = get_dataloader(tokenizer, seqlen=sequence_lenght) 82 | 83 | woq_conf = PostTrainingQuantConfig( 84 | approach="weight_only", 85 | op_type_dict={ 86 | ".*": { # match all ops 87 | "weight": { 88 | "dtype": "int", 89 | "bits": bits, 90 | "group_size": -1, 91 | "scheme": "sym", 92 | "algorithm": algorithm.upper(), 93 | }, 94 | "activation": { 95 | "dtype": "fp16", 96 | }, 97 | } 98 | }, 99 | ) 100 | 101 | print("Apply generic model optimizations") 102 | npu_lib.compiler.apply_general_optimizations(float_model) 103 | print("Quantize model") 104 | quantized_model = fit(model=float_model, conf=woq_conf, calib_dataloader=dataloader) 105 | print("Quantize model") 106 | compressed_model = quantized_model.export_compressed_model( 107 | scale_dtype=torch.float16, use_optimum_format=False 108 | ) 109 | 110 | print("Create NPU kernels") 111 | npu_model = npu_lib.compiler.create_npu_kernels(compressed_model) 112 | 113 | torch.save(npu_model, os.path.join(output_folder, "pytorch_npu_model.bin")) 114 | print(f"Model succesfully exported to {output_folder}") 115 | 116 | 117 | if __name__ == "__main__": 118 | args = define_and_parse_arguments() 119 | export_model( 120 | args.model, args.algorithm, args.bits, args.sequence_lenght, args.output_dir 121 | ) 122 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | extend-ignore = E203, E501, D100, D104 4 | [tool:pytest] 5 | addopts = --cov intel_npu_acceleration_library --cov-report term-missing --cov-fail-under=80 --cov-branch 6 | testpaths = 7 | tests/python 8 | [tox:tox] 9 | min_version = 4.0 10 | env_list = 11 | py38 12 | py39 13 | py310 14 | py311 15 | py312 16 | 17 | [testenv] 18 | changedir = {toxinidir}/test/python 19 | deps = -r{toxinidir}/dev_requirements.txt 20 | commands = pytest 21 | 22 | 23 | [gh-actions] 24 | python = 25 | 3.8: py38 26 | 3.9: py39 27 | 3.10: py310 28 | 3.11: py311 29 | 3.12: py312 30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from setuptools import setup, Extension 7 | from setuptools.command.build_ext import build_ext as build_ext_orig 8 | import pathlib 9 | import glob 10 | import os 11 | import re 12 | 13 | 14 | def get_version(): 15 | this_file_path = os.path.dirname(os.path.abspath(__file__)) 16 | with open( 17 | os.path.join(this_file_path, "intel_npu_acceleration_library", "_version.py"), 18 
| "rt", 19 | ) as fp: 20 | verstrline = fp.read() 21 | VSRE = r"^__version__ = ['\"]([^'\"]*)['\"]" 22 | mo = re.search(VSRE, verstrline, re.M) 23 | if mo: 24 | verstr = mo.group(1) 25 | else: 26 | raise RuntimeError("Unable to find version string") 27 | return verstr 28 | 29 | 30 | class CMakeExtension(Extension): 31 | def __init__(self, name): 32 | # don't invoke the original build_ext for this special extension 33 | headers = glob.glob("include/**/*.h") 34 | cpp_sources = glob.glob("src/*.cpp") 35 | requirements = glob.glob("*requirements.txt") 36 | sources = ["CMakeLists.txt"] + requirements + cpp_sources + headers 37 | super().__init__(name, sources=sources) 38 | 39 | 40 | class build_ext(build_ext_orig): 41 | def run(self): 42 | for ext in self.extensions: 43 | self.build_cmake(ext) 44 | super().run() 45 | 46 | def build_cmake(self, ext): 47 | cwd = pathlib.Path().absolute() 48 | 49 | # these dirs will be created in build_py, so if you don't have 50 | # any python sources to bundle, the dirs will be missing 51 | build_temp = pathlib.Path(self.build_temp) 52 | build_temp.mkdir(parents=True, exist_ok=True) 53 | extdir = pathlib.Path(self.get_ext_fullpath(ext.name)) 54 | extdir.mkdir(parents=True, exist_ok=True) 55 | 56 | # example of cmake args 57 | config = "Debug" if self.debug else "Release" 58 | cmake_args = [ 59 | f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={os.path.join(extdir.parent.absolute(), ext.name, "lib")}', 60 | "-DCMAKE_BUILD_TYPE=" + config, 61 | "-DSETUPTOOL_BUILD=True", 62 | ] 63 | 64 | # example of build args 65 | build_args = [ 66 | "--config", 67 | config, 68 | ] 69 | 70 | os.chdir(str(build_temp)) 71 | self.spawn(["cmake", str(cwd)] + cmake_args) 72 | if not self.dry_run: 73 | self.spawn(["cmake", "--build", "."] + build_args) 74 | # Troubleshooting: if fail on line above then delete all possible 75 | # temporary CMake files including "CMakeCache.txt" in top level dir. 
76 | os.chdir(str(cwd)) 77 | 78 | 79 | with open("README.md", "r", encoding="utf-8") as fh: 80 | long_description = fh.read() 81 | 82 | with open("requirements.txt") as fh: 83 | requirements = fh.readlines() 84 | 85 | with open("dev_requirements.txt") as fh: 86 | dev_requirements = fh.readlines() 87 | 88 | setup( 89 | name="intel_npu_acceleration_library", 90 | version=get_version(), 91 | packages=[ 92 | "intel_npu_acceleration_library", 93 | "intel_npu_acceleration_library.backend", 94 | "intel_npu_acceleration_library.nn", 95 | "intel_npu_acceleration_library.functional", 96 | ], 97 | author="Alessandro Palla", 98 | author_email="alessandro.palla@intel.com", 99 | description="Intel® NPU Acceleration Library", 100 | license="Apache License 2.0", 101 | url="https://github.com/intel/intel-npu-acceleration-library", 102 | ext_modules=[CMakeExtension("intel_npu_acceleration_library")], 103 | cmdclass={ 104 | "build_ext": build_ext, 105 | }, 106 | long_description=long_description, 107 | long_description_content_type="text/markdown", 108 | python_requires=">=3.8", 109 | install_requires=requirements, 110 | extras_require={ 111 | "dev": dev_requirements, 112 | }, 113 | classifiers=[ 114 | "Development Status :: 4 - Beta", 115 | "Intended Audience :: Developers", 116 | "Intended Audience :: Education", 117 | "Intended Audience :: Science/Research", 118 | "License :: OSI Approved :: Apache Software License", 119 | "Topic :: Scientific/Engineering", 120 | "Topic :: Scientific/Engineering :: Mathematics", 121 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 122 | "Topic :: Software Development", 123 | "Topic :: Software Development :: Libraries", 124 | "Topic :: Software Development :: Libraries :: Python Modules", 125 | "Programming Language :: C++", 126 | "Programming Language :: Python :: 3", 127 | "Programming Language :: Python :: 3 :: Only", 128 | "Programming Language :: Python :: 3.8", 129 | "Programming Language :: Python :: 3.9", 130 | "Programming Language :: Python :: 3.10", 131 | "Programming Language :: Python :: 3.11", 132 | "Programming Language :: Python :: 3.12", 133 | ], 134 | keywords="intel-npu-acceleration-library, machine learning, llm, intel core ultra", 135 | ) 136 | -------------------------------------------------------------------------------- /test/python/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend import clear_cache 7 | import pytest 8 | 9 | 10 | @pytest.fixture(autouse=True) 11 | def run_before_and_after_tests(): 12 | """Fixture to execute asserts before and after a test is run""" 13 | yield 14 | clear_cache() 15 | -------------------------------------------------------------------------------- /test/python/test_basic.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from sklearn.metrics import r2_score 7 | from intel_npu_acceleration_library.backend import MatMul 8 | import numpy as np 9 | import intel_npu_acceleration_library 10 | import intel_npu_acceleration_library.external.openvino as ov 11 | import pytest 12 | import time 13 | import sys 14 | import os 15 | 16 | 17 | def test_openvino_version(): 18 | version = ov.get_version() 19 | assert version is not None 20 | 21 | 22 | def profile(func): 23 | def wrapper(*args, 
**kwargs): 24 | t0 = time.perf_counter() 25 | ret = func(*args, **kwargs) 26 | t1 = time.perf_counter() 27 | elapsed = t1 - t0 28 | return ret, elapsed 29 | 30 | return wrapper 31 | 32 | 33 | def test_basic_functionality(): 34 | 35 | X = np.random.uniform(-1, 1, (512, 2048)).astype(np.float16) 36 | W = np.random.uniform(-1, 1, (512, 2048)).astype(np.float16) 37 | 38 | mm = MatMul(2048, 512, X.shape[0]) 39 | 40 | @profile 41 | def npu_run(): 42 | return mm.run(X, W) 43 | 44 | @profile 45 | def cpu_run(): 46 | return np.matmul(X, W.T) 47 | 48 | npu_val, npu_latency = npu_run() 49 | cpu_val, cpu_latency = cpu_run() 50 | 51 | assert 1 - r2_score(cpu_val, npu_val) < 0.001 52 | assert npu_latency < cpu_latency 53 | 54 | 55 | def test_save_model(): 56 | 57 | mm = MatMul(2048, 512, 512) 58 | mm.save("model.xml") 59 | assert os.path.isfile("model.xml") 60 | assert os.path.isfile("model.bin") 61 | os.remove("model.xml") 62 | os.remove("model.bin") 63 | 64 | 65 | @pytest.mark.skipif( 66 | not intel_npu_acceleration_library.backend.npu_available(), 67 | reason="Cannot save model if NPU is not available", 68 | ) 69 | def test_save_compiled_model(): 70 | 71 | mm = MatMul(2048, 512, 512) 72 | mm.saveCompiledModel("model.blob") 73 | assert os.path.isfile("model.blob") 74 | os.remove("model.blob") 75 | 76 | 77 | @pytest.mark.skipif( 78 | not intel_npu_acceleration_library.backend.npu_available(), 79 | reason="Skip test if NPU is not available", 80 | ) 81 | @pytest.mark.skipif( 82 | sys.platform != "win32", 83 | reason="Skip test if not on windows platform", 84 | ) 85 | def test_driver_version(): 86 | 87 | version = intel_npu_acceleration_library.backend.get_driver_version() 88 | assert version is not None 89 | -------------------------------------------------------------------------------- /test/python/test_bindings.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | from intel_npu_acceleration_library.backend.bindings import lib as backend_lib 6 | import numpy as np 7 | import pytest 8 | import ctypes 9 | 10 | 11 | @pytest.mark.parametrize("device", ["CPU", "NPU"]) 12 | def test_bindings(device): 13 | 14 | device = ctypes.c_char_p(device.encode()) 15 | matmul = backend_lib.createNNFactory(device, False) 16 | 17 | assert isinstance(matmul, ctypes.POINTER(ctypes.c_char)) 18 | 19 | backend_lib.destroyNNFactory(matmul) 20 | 21 | 22 | @pytest.mark.parametrize("inC", [16, 32, 64, 128]) 23 | @pytest.mark.parametrize("outC", [16, 32, 64, 128]) 24 | @pytest.mark.parametrize("batch", [16, 128]) 25 | @pytest.mark.parametrize("run_op", [True, False]) 26 | def test_factory_bindings(inC, outC, batch, run_op): 27 | 28 | ## Weights 29 | weights = np.zeros((outC, inC)).astype(np.float16) 30 | x = np.zeros((batch, inC)).astype(np.float16) 31 | out = np.empty((batch, outC), dtype=np.float16) 32 | 33 | # Create nn factory 34 | device = ctypes.c_char_p("NPU".encode()) 35 | factory = backend_lib.createNNFactory(device, False) 36 | 37 | # Create linear layer 38 | shape_ptr = np.array((batch, inC), dtype=np.uint32) 39 | dtype = ctypes.c_char_p("float16".encode()) 40 | p0 = backend_lib.parameter(factory, shape_ptr.size, shape_ptr, dtype) 41 | linear = backend_lib.linear(factory, p0, outC, inC, False, dtype, dtype) 42 | backend_lib.result(factory, linear) 43 | backend_lib.compile(factory) 44 | backend_lib.set_output(factory, out.ctypes.data_as(ctypes.c_void_p), 0) 45 | 46 | # Set parameters 47 | param 
= backend_lib.createParameters() 48 | backend_lib.addFloatParameter(param, weights, *weights.shape) 49 | backend_lib.setNNFactoryWeights(factory, param) 50 | 51 | # run 52 | if run_op: 53 | backend_lib.set_activation(factory, x.ctypes.data_as(ctypes.c_void_p), 0) 54 | backend_lib.run(factory) 55 | 56 | # Call destuctors for parameters and weights 57 | backend_lib.destroyNNFactory(factory) 58 | backend_lib.destroyParameters(param) 59 | -------------------------------------------------------------------------------- /test/python/test_compile.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.compiler import compile 7 | from intel_npu_acceleration_library.compiler import CompilerConfig 8 | from intel_npu_acceleration_library.dtypes import int4 9 | from sklearn.metrics import r2_score 10 | import intel_npu_acceleration_library 11 | from packaging.version import Version 12 | import pytest 13 | import torch 14 | import time 15 | import sys 16 | 17 | 18 | class NN(torch.nn.Module): 19 | def __init__( 20 | self, 21 | ) -> None: 22 | super().__init__() 23 | self.l1 = torch.nn.Linear(32, 128, bias=False) 24 | self.l2 = torch.nn.Linear(128, 32, bias=False) 25 | self.relu = torch.nn.functional.relu 26 | 27 | def forward(self, x): 28 | return self.relu(self.l2(self.relu(self.l1(x)))) 29 | 30 | 31 | torch.manual_seed(0) 32 | x = 128 * (torch.rand((16, 32), dtype=torch.float16) - 0.5) 33 | 34 | 35 | @pytest.mark.parametrize("dtype", [torch.float16, torch.int8, int4]) 36 | def test_compilation(dtype): 37 | 38 | torch.manual_seed(42) 39 | model = NN().half() 40 | 41 | y_ref = model(x).detach() 42 | 43 | compiler_conf = CompilerConfig(dtype=dtype) 44 | compiled_model = compile(model, compiler_conf) 45 | 46 | assert compiled_model 47 | 48 | for name, layer in compiled_model.named_children(): 49 | expected_cls = ( 50 | intel_npu_acceleration_library.nn.Linear 51 | if dtype.is_floating_point 52 | else intel_npu_acceleration_library.nn.QuantizedLinear 53 | ) 54 | assert isinstance(layer, expected_cls) 55 | if dtype == int4: 56 | assert layer.weight.dtype == torch.uint8 57 | else: 58 | assert layer.weight.dtype == dtype 59 | if layer.bias is not None: 60 | if dtype.is_floating_point: 61 | assert layer.bias.dtype == dtype 62 | else: 63 | layer.bias.dtype == torch.float32 64 | 65 | t0 = time.perf_counter() 66 | y1 = compiled_model(x).detach() 67 | t1 = time.perf_counter() 68 | 69 | y2 = compiled_model(x).detach() 70 | t2 = time.perf_counter() 71 | 72 | if dtype == int4: 73 | assert 1 - r2_score(y_ref.numpy(), y1.numpy()) < 0.05 74 | else: 75 | assert 1 - r2_score(y_ref.numpy(), y1.numpy()) < 0.01 76 | 77 | assert torch.allclose(y1, y2) 78 | 79 | # Check that for next iteration weights are prefetched 80 | # latency2 = t2 - t1 81 | # latency1 = t1 - t0 82 | # assert latency2 < latency1 83 | 84 | intel_npu_acceleration_library.backend.clear_cache() 85 | 86 | 87 | def test_torch_compile(): 88 | 89 | model = NN() 90 | y_ref = model(x.to(torch.float32)).detach() 91 | 92 | if sys.platform == "win32" and Version(torch.__version__) < Version("2.2.2"): 93 | with pytest.raises(RuntimeError) as e: 94 | compiled_model = torch.compile(model, backend="npu") 95 | assert str(e.value) == "Windows not yet supported for torch.compile" 96 | else: 97 | compiled_model = torch.compile(model, backend="npu") 98 | y = compiled_model(x.to(torch.float32)).detach() 99 | 
assert 1 - r2_score(y_ref.numpy(), y.numpy()) < 0.01 100 | 101 | 102 | @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.int8]) 103 | def test_compile_training(dtype): 104 | 105 | model = NN() 106 | 107 | compiler_conf = CompilerConfig(dtype=dtype, training=True) 108 | compiled_model = compile(model, compiler_conf) 109 | 110 | for name, layer in compiled_model.named_children(): 111 | if dtype == torch.int8: 112 | assert layer.training == False 113 | else: 114 | assert layer.training == True 115 | 116 | 117 | @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.int8, int4]) 118 | def test_compile_inference(dtype): 119 | 120 | model = NN() 121 | 122 | compiler_conf = CompilerConfig(dtype=dtype) 123 | compiled_model = compile(model, compiler_conf) 124 | 125 | for name, layer in compiled_model.named_children(): 126 | assert layer.training == False 127 | -------------------------------------------------------------------------------- /test/python/test_conv.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | 7 | import intel_npu_acceleration_library 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | from sklearn.metrics import r2_score 10 | import pytest 11 | import torch 12 | 13 | 14 | class DummyConv(torch.nn.Module): 15 | def __init__( 16 | self, 17 | in_channels, 18 | out_channels, 19 | kernels, 20 | bias, 21 | groups, 22 | stride=1, 23 | padding=0, 24 | dilation=1, 25 | ): 26 | super().__init__() 27 | if groups == -1: 28 | groups = out_channels 29 | self.conv = torch.nn.Conv2d( 30 | in_channels, 31 | out_channels, 32 | kernels, 33 | bias=bias, 34 | groups=groups, 35 | stride=stride, 36 | padding=padding, 37 | dilation=dilation, 38 | ) 39 | 40 | def forward(self, x): 41 | return self.conv(x) 42 | 43 | 44 | @pytest.mark.parametrize("in_channels", [32, 128, 256]) 45 | @pytest.mark.parametrize("out_channels", [32, 128, 256]) 46 | @pytest.mark.parametrize("kernels", [1, 3]) 47 | @pytest.mark.parametrize("dim", [16, 32]) 48 | @pytest.mark.parametrize("bias", [True, False]) 49 | @pytest.mark.parametrize("dtype", [torch.float16]) 50 | @pytest.mark.parametrize("stride", [1, 2]) 51 | @pytest.mark.parametrize("padding", [0, 1]) 52 | @pytest.mark.parametrize("groups", [1, -1]) 53 | def test_conv( 54 | in_channels, out_channels, kernels, dim, bias, dtype, stride, padding, groups 55 | ): 56 | torch.manual_seed(42) 57 | 58 | if groups != 1 and in_channels != out_channels: 59 | pytest.skip("DW convolutions require in_channels == out_channels") 60 | 61 | with torch.no_grad(): 62 | X = torch.rand((1, in_channels, dim, dim), dtype=torch.float16) 63 | conv = DummyConv( 64 | in_channels, 65 | out_channels, 66 | kernels, 67 | bias=bias, 68 | groups=groups, 69 | stride=stride, 70 | padding=padding, 71 | ).half() 72 | conv.conv.weight.data *= 128 73 | y_ref = conv(X) 74 | 75 | compiler_conf = CompilerConfig(dtype=dtype) 76 | npu_conv = intel_npu_acceleration_library.compile(conv, compiler_conf) 77 | y = npu_conv(X) 78 | 79 | assert y.dtype == y_ref.dtype 80 | assert y.shape == y_ref.shape 81 | if dtype == torch.int8: 82 | assert 1 - r2_score(y_ref.flatten().numpy(), y.flatten().numpy()) < 0.05 83 | else: 84 | assert 1 - r2_score(y_ref.flatten().numpy(), y.flatten().numpy()) < 0.001 85 | -------------------------------------------------------------------------------- /test/python/test_device.py: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.nn.module import NPUModuleWrapper 7 | import torch 8 | 9 | 10 | class NN(torch.nn.Module): 11 | def __init__(self): 12 | super().__init__() 13 | self.l1 = torch.nn.Linear(128, 128) 14 | 15 | def forward(self, x): 16 | return self.l1(x) 17 | 18 | 19 | def test_device(): 20 | 21 | x = torch.rand((128, 128)).to(torch.float16).to("npu") 22 | 23 | model = NN().half().to("npu") 24 | 25 | assert isinstance(model, torch.nn.Module) 26 | assert isinstance(model, NPUModuleWrapper) 27 | 28 | y = model(x) 29 | 30 | assert y.dtype == torch.float16 31 | assert y.device == torch.device("npu") 32 | -------------------------------------------------------------------------------- /test/python/test_dtypes.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | import pytest 7 | from intel_npu_acceleration_library.dtypes import float16, bfloat16, int4, int8 8 | 9 | 10 | @pytest.fixture 11 | def npu_dtypes(): 12 | return [float16, bfloat16, int4, int8] 13 | 14 | 15 | def test_NPUDtype_is_floating_point(npu_dtypes): 16 | for dtype in npu_dtypes: 17 | if dtype in (int4, int8): 18 | assert dtype.is_floating_point == False 19 | else: 20 | assert dtype.is_floating_point == True 21 | -------------------------------------------------------------------------------- /test/python/test_factory.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | import numpy as np 7 | import intel_npu_acceleration_library 8 | import pytest 9 | import os 10 | 11 | 12 | @pytest.mark.parametrize("batch", [16, 128]) 13 | @pytest.mark.parametrize("inC", [256, 512]) 14 | @pytest.mark.parametrize("outC", [256, 512]) 15 | @pytest.mark.parametrize("dtype", [np.float16, np.int8]) 16 | @pytest.mark.parametrize("activation", ["gelu", "swish", "softmax"]) 17 | def test_factory(batch, inC, outC, dtype, activation): 18 | module = intel_npu_acceleration_library.backend.NNFactory() 19 | assert module 20 | 21 | input = module.parameter((batch, inC)) 22 | assert input 23 | 24 | weights = module.parameter((outC, inC), dtype) 25 | assert weights 26 | 27 | if dtype == np.int8: 28 | weights = module.convert_to_fp16(weights) 29 | 30 | mm = module.matmul(input, weights) 31 | assert mm 32 | 33 | act_fn = getattr(module, activation) 34 | if activation == "softmax": 35 | output = act_fn(mm, -1) 36 | else: 37 | output = act_fn(mm) 38 | assert output 39 | 40 | module.compile() 41 | 42 | output_shape = module.get_tensor_shape(output.node) 43 | assert output_shape == (batch, outC) 44 | 45 | filename = f"test_factory_mm_{batch}_{inC}_{outC}_{dtype.__name__}_{activation}" 46 | module.save(f"{filename}.xml") 47 | 48 | assert os.path.isfile(f"{filename}.xml") 49 | 50 | os.remove(f"{filename}.xml") 51 | os.remove(f"{filename}.bin") 52 | -------------------------------------------------------------------------------- /test/python/test_llm.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers.models.llama.modeling_llama import LlamaForCausalLM, 
LlamaConfig 7 | from transformers.models.phi.modeling_phi import PhiConfig, PhiMLP 8 | from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from intel_npu_acceleration_library.dtypes import int8, int4 11 | from intel_npu_acceleration_library.compiler import CompilerConfig 12 | from sklearn.metrics import r2_score 13 | from torch.profiler import profile, ProfilerActivity 14 | import intel_npu_acceleration_library 15 | import pytest 16 | import torch 17 | import numpy as np 18 | 19 | 20 | @pytest.fixture 21 | def config(): 22 | return LlamaConfig(num_hidden_layers=1) 23 | 24 | 25 | @pytest.fixture 26 | def decoder_model(config): 27 | return LlamaForCausalLM(config) 28 | 29 | 30 | @pytest.fixture 31 | def model(): 32 | return AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") 33 | 34 | 35 | @pytest.fixture 36 | def tokenizer(): 37 | return AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") 38 | 39 | 40 | @pytest.mark.parametrize("model_seq_length", [128, 256]) 41 | def test_warm_up(tokenizer, model, model_seq_length): 42 | compiler_conf = CompilerConfig() 43 | compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf) 44 | intel_npu_acceleration_library.nn.llm.warm_up_decoder_model( 45 | tokenizer, compiled_model, model_seq_length 46 | ) 47 | 48 | 49 | @pytest.mark.parametrize("dtype", [torch.float16, torch.int8]) 50 | def test_compilation(tokenizer, decoder_model, dtype): 51 | prefill = tokenizer("test sentence", return_tensors="pt")["input_ids"].to("cpu") 52 | y_ref = decoder_model(prefill).logits.detach() 53 | 54 | compiler_conf = CompilerConfig(dtype=dtype) 55 | compiled_model = intel_npu_acceleration_library.compile( 56 | decoder_model, compiler_conf 57 | ) 58 | 59 | assert compiled_model 60 | 61 | y = compiled_model(prefill).logits.detach() 62 | 63 | assert 1 - r2_score(y_ref.flatten().numpy(), y.flatten().numpy()) < 0.01 64 | 65 | 66 | @torch.no_grad 67 | @pytest.mark.parametrize("seq_len", [16, 128, 256]) 68 | @pytest.mark.parametrize("hidden_size", [256, 512]) 69 | @pytest.mark.parametrize("intermediate_size", [512]) 70 | def test_phi2_mlp(seq_len, hidden_size, intermediate_size): 71 | conf = PhiConfig.from_pretrained("microsoft/phi-2") 72 | conf.num_hidden_layers = 1 73 | conf.hidden_size = hidden_size 74 | conf.intermediate_size = intermediate_size 75 | 76 | mlp = PhiMLP(conf) 77 | 78 | x = torch.rand((seq_len, conf.hidden_size), dtype=torch.float16) 79 | reference = mlp(x.to(torch.float32)).to(torch.float16) 80 | 81 | model = intel_npu_acceleration_library.nn.PhiMLP.fromTorch(mlp) 82 | 83 | assert model 84 | 85 | out = model(x) 86 | 87 | assert 1 - r2_score(reference.numpy(), out.numpy()) < 0.001 88 | 89 | 90 | @torch.no_grad 91 | @pytest.mark.parametrize("seq_len", [16, 128, 256]) 92 | @pytest.mark.parametrize("hidden_size", [256, 512]) 93 | @pytest.mark.parametrize("intermediate_size", [512]) 94 | @pytest.mark.parametrize("dtype", ["float16", "int8", "int4"]) 95 | def test_phi3_mlp_compile(seq_len, hidden_size, intermediate_size, dtype): 96 | conf = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") 97 | conf.num_hidden_layers = 1 98 | conf.hidden_size = hidden_size 99 | conf.intermediate_size = intermediate_size 100 | 101 | if dtype == "int8": 102 | dtype = int8 103 | elif dtype == "int4": 104 | dtype = int4 105 | else: 106 | dtype = torch.float16 107 | 108 | mlp = Phi3MLP(conf) 109 | 110 | hidden_states = 
torch.rand((seq_len, conf.hidden_size)) 111 | 112 | reference = mlp(hidden_states.to(torch.float32)).to(torch.float16).detach().numpy() 113 | 114 | compiler_conf = CompilerConfig(use_to=True, dtype=dtype) 115 | model = intel_npu_acceleration_library.compile(mlp, compiler_conf) 116 | 117 | assert model 118 | 119 | with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof: 120 | out = model(hidden_states) 121 | 122 | print( 123 | prof.key_averages(group_by_input_shape=True).table( 124 | sort_by="cpu_time_total", row_limit=20 125 | ) 126 | ) 127 | 128 | out = out.detach().numpy() 129 | 130 | assert out.shape == reference.shape, "Output shape mismatch" 131 | assert np.isfinite(reference).all(), "Pytorch Reference contains NaN or Inf" 132 | assert np.isfinite(out).all(), "NPU output contains NaN or Inf" 133 | 134 | if dtype == int4: 135 | assert 1 - r2_score(reference, out) < 0.05 136 | else: 137 | assert 1 - r2_score(reference, out) < 0.001 138 | -------------------------------------------------------------------------------- /test/python/test_matmul.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from sklearn.metrics import r2_score 7 | from intel_npu_acceleration_library.backend import MatMul, QMatMul 8 | from intel_npu_acceleration_library.quantization import quantize_tensor 9 | import numpy as np 10 | import itertools 11 | import pytest 12 | import torch 13 | import time 14 | 15 | 16 | channels = [512, 768, 1024, 2048] 17 | batches = [16, 128, 512, 1024] 18 | 19 | 20 | @pytest.mark.parametrize( 21 | "batch,inC,outC", itertools.product(batches, channels, channels) 22 | ) 23 | def test_matmul(batch, inC, outC): 24 | X = torch.rand((batch, inC), requires_grad=False).to(torch.float16) 25 | W = torch.rand((outC, inC), requires_grad=False).to(torch.float16) 26 | 27 | cpu_mm = X @ W.T 28 | 29 | mm = MatMul(inC, outC, batch) 30 | 31 | assert mm 32 | 33 | npu_mm = mm.run(X.numpy(), W.numpy()) 34 | 35 | assert np.isfinite(npu_mm).all() 36 | 37 | assert 1 - r2_score(cpu_mm.numpy(), npu_mm) < 0.001 38 | 39 | 40 | @pytest.mark.parametrize( 41 | "batch,inC,outC", itertools.product(batches, channels, channels) 42 | ) 43 | def test_qmatmul_per_channel_scales(batch, inC, outC): 44 | 45 | X = torch.rand((batch, inC), requires_grad=False).to(torch.float16) - 0.5 46 | W = torch.rand((outC, inC), requires_grad=False).to(torch.float16) 47 | 48 | # Compute reference matmul 49 | cpu_mm = X @ W.T 50 | 51 | assert W.shape == (outC, inC) and W.dtype == torch.float16 52 | 53 | # Quantize the weights 54 | weights_quant, scale = quantize_tensor(W) 55 | 56 | assert scale.shape == (outC, 1) and scale.dtype == torch.float16 57 | assert weights_quant.shape == (outC, inC) and weights_quant.dtype == torch.int8 58 | assert weights_quant.shape == W.shape 59 | 60 | # Conversion done properly 61 | expected_W = weights_quant.to(torch.float16) * scale 62 | assert 1 - r2_score(expected_W.numpy(), W.numpy()) < 0.001 63 | 64 | mm = QMatMul(inC, outC, batch) 65 | 66 | assert mm 67 | 68 | # Adapt for numerically accurate qmatumul 69 | scale *= np.sqrt(inC) 70 | 71 | npu_mm = mm.run(X.numpy(), weights_quant.numpy(), scale.numpy()) 72 | 73 | assert np.isfinite(npu_mm).all() 74 | 75 | assert 1 - r2_score(cpu_mm.numpy(), npu_mm) < 0.001 76 | -------------------------------------------------------------------------------- /test/python/test_profiling.py: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.quantization import quantize_tensor 7 | from intel_npu_acceleration_library.backend import MatMul, QMatMul, Linear, QLinear 8 | import numpy as np 9 | import intel_npu_acceleration_library 10 | import pytest 11 | import json 12 | import torch 13 | import os 14 | 15 | 16 | def check_no_sw_layer(profiling_file): 17 | with open(profiling_file) as f: 18 | data = json.load(f) 19 | 20 | statistics = data["taskStatistics"] 21 | assert statistics["SW duration"] == 0 22 | 23 | 24 | def test_profiling_matmul(): 25 | 26 | if not intel_npu_acceleration_library.backend.npu_available(): 27 | pytest.xfail("NPU not available") 28 | 29 | X = np.random.uniform(-1, 1, (512, 2048)).astype(np.float16) 30 | W = np.random.uniform(-1, 1, (512, 2048)).astype(np.float16) 31 | W_q, scale = quantize_tensor(torch.tensor(W)) 32 | 33 | # Adapt for numerically accurate qmatumul 34 | scale *= np.sqrt(2048) 35 | 36 | if os.path.exists("profiling.json"): 37 | os.remove("profiling.json") 38 | 39 | MatMul(W.shape[1], W.shape[0], X.shape[0], profile=True).run(X, W) 40 | assert os.path.isfile("profiling.json") 41 | check_no_sw_layer("profiling.json") 42 | os.remove("profiling.json") 43 | 44 | QMatMul(W.shape[1], W.shape[0], X.shape[0], profile=True).run( 45 | X, W_q.numpy(), scale.numpy() 46 | ) 47 | QMatMul(W.shape[1], W.shape[0], X.shape[0], profile=True).save("qmatmul.xml") 48 | assert os.path.isfile("profiling.json") 49 | check_no_sw_layer("profiling.json") 50 | os.remove("profiling.json") 51 | os.remove("qmatmul.xml") 52 | 53 | Linear(W.shape[1], W.shape[0], X.shape[0], profile=True).run(X, W, op_id=0) 54 | assert os.path.isfile("profiling.json") 55 | check_no_sw_layer("profiling.json") 56 | os.remove("profiling.json") 57 | 58 | QLinear(W.shape[1], W.shape[0], X.shape[0], profile=True).run( 59 | X, W_q.numpy(), scale.numpy(), op_id=0 60 | ) 61 | assert os.path.isfile("profiling.json") 62 | check_no_sw_layer("profiling.json") 63 | os.remove("profiling.json") 64 | -------------------------------------------------------------------------------- /test/python/test_quantization.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from sklearn.metrics import r2_score 7 | from intel_npu_acceleration_library.compiler import CompilerConfig 8 | import numpy as np 9 | import intel_npu_acceleration_library 10 | import pytest 11 | import torch 12 | 13 | import intel_npu_acceleration_library.quantization 14 | 15 | 16 | class NN(torch.nn.Module): 17 | def __init__(self, inC, outC): 18 | super().__init__() 19 | self.l1 = torch.nn.Linear(inC, outC, bias=False) 20 | 21 | def forward(self, x): 22 | return self.l1(x) 23 | 24 | 25 | @pytest.mark.parametrize("batch", [16, 128]) 26 | @pytest.mark.parametrize("inC", [256, 512]) 27 | @pytest.mark.parametrize("outC", [256, 512]) 28 | def test_explicit_quantization(batch, inC, outC): 29 | module = intel_npu_acceleration_library.backend.NNFactory() 30 | assert module 31 | 32 | input = module.parameter((batch, inC)) 33 | assert input 34 | 35 | output = module.linear(input, outC, inC) 36 | assert output 37 | 38 | module.compile() 39 | 40 | X = np.random.random((batch, inC)).astype(np.float16) 41 | W = np.random.randint(-127, 127, (outC, 
inC)).astype(np.int8) 42 | S = np.random.random((outC, 1)).astype(np.float32) 43 | 44 | w_float = W.astype(np.float16) * S 45 | y_ref = np.matmul(X, w_float.T) 46 | 47 | y = module.run(X, (W, S), op_id="0000") 48 | 49 | assert 1 - r2_score(y_ref, y) < 0.01 50 | 51 | 52 | @pytest.mark.parametrize("batch", [16, 128]) 53 | @pytest.mark.parametrize("inC", [256, 512]) 54 | @pytest.mark.parametrize("outC", [256, 512]) 55 | def test_i8_quantization(batch, inC, outC): 56 | module = intel_npu_acceleration_library.backend.NNFactory() 57 | assert module 58 | 59 | input = module.parameter((batch, inC)) 60 | assert input 61 | 62 | output = module.linear(input, outC, inC, False, wt_dtype=np.int8) 63 | assert output 64 | 65 | module.compile() 66 | 67 | X = np.random.random((batch, inC)).astype(np.float16) 68 | W = np.random.randint(-127, 127, (outC, inC)).astype(np.int8) 69 | S = np.random.random((outC, 1)).astype(np.float16) 70 | 71 | w_float = W.astype(np.float16) * S 72 | y_ref = np.matmul(X, w_float.T) 73 | 74 | y = module.run(X, (W, S * np.sqrt(inC)), op_id="0000") 75 | 76 | assert 1 - r2_score(y_ref, y) < 0.01 77 | 78 | 79 | @pytest.mark.parametrize("batch", [16, 128]) 80 | @pytest.mark.parametrize("inC", [256, 512]) 81 | @pytest.mark.parametrize("outC", [256, 512]) 82 | def test_compiled_quantized(batch, inC, outC): 83 | 84 | intel_npu_acceleration_library.backend.clear_cache() 85 | 86 | torch.manual_seed(0) 87 | X = torch.rand((batch, inC), dtype=torch.float16) - 0.5 88 | # X = np.random.random((batch, inC)).astype(np.float16) 89 | 90 | model = NN(inC, outC) 91 | y_ref = model(X.to(torch.float32)).detach() 92 | 93 | compiler_conf = CompilerConfig(dtype=torch.int8) 94 | compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf) 95 | assert compiled_model 96 | 97 | y1 = compiled_model(X).detach() 98 | 99 | assert 1 - r2_score(y_ref.numpy(), y1.numpy()) < 0.01 100 | 101 | 102 | @pytest.mark.parametrize("batch", [16, 128]) 103 | @pytest.mark.parametrize("inC", [256, 512]) 104 | @pytest.mark.parametrize("outC", [256, 512]) 105 | def test_i4_quantization(batch, inC, outC): 106 | 107 | module = intel_npu_acceleration_library.backend.NNFactory() 108 | assert module 109 | 110 | input = module.parameter((batch, inC)) 111 | assert input 112 | # u8 represents packed i4 dtypes 113 | output = module.linear(input, outC, inC, False, wt_dtype=np.uint8) 114 | assert output 115 | 116 | module.compile() 117 | 118 | X = np.random.random((batch, inC)).astype(np.float16) 119 | S = np.random.random((outC, 1)).astype(np.float16) 120 | W = np.random.randint(-8, 7, (outC, inC)).astype(np.int8) 121 | 122 | w_float = W.astype(np.float16) * S 123 | y_ref = np.matmul(X, w_float.T) 124 | 125 | # Compress the weights for int4 126 | W_npu = intel_npu_acceleration_library.quantization.compress_to_i4( 127 | torch.from_numpy(W) 128 | ).numpy() 129 | 130 | y = module.run(X, (W_npu, S * np.sqrt(inC)), op_id="0000") 131 | 132 | # assert y has no NaN 133 | assert not np.isnan(y).any() 134 | 135 | # assert y has no Inf 136 | assert not np.isinf(y).any() 137 | 138 | # Check for correctness vs reference 139 | assert 1 - r2_score(y_ref.flatten(), y.flatten()) < 0.01 140 | -------------------------------------------------------------------------------- /test/python/test_sdpa.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | from intel_npu_acceleration_library.backend.sdpa import SDPA 6 | 
from intel_npu_acceleration_library.functional import scaled_dot_product_attention 7 | from sklearn.metrics import r2_score 8 | import numpy as np 9 | import pytest 10 | import torch 11 | 12 | 13 | @pytest.mark.parametrize("heads", [16, 32]) 14 | @pytest.mark.parametrize("sequence", [16, 32, 256, 512]) 15 | @pytest.mark.parametrize("dim", [512, 1024]) 16 | @pytest.mark.parametrize("kv_cache", [True, False]) 17 | @pytest.mark.parametrize("is_causal", [False, True]) 18 | def test_sdpa(heads, sequence, dim, kv_cache, is_causal): 19 | 20 | min_value = torch.finfo(torch.float16).min 21 | 22 | query = torch.rand(1, heads, 1 if kv_cache else sequence, dim // heads).to( 23 | torch.float16 24 | ) 25 | key = torch.rand(1, heads, sequence, dim // heads).to(torch.float16) 26 | value = torch.rand(1, heads, sequence, dim // heads).to(torch.float16) 27 | mask = min_value * torch.ones(1, heads, 1 if kv_cache else sequence, sequence).to( 28 | torch.float16 29 | ) 30 | mask = torch.triu(mask) 31 | 32 | npu_sdpa = SDPA( 33 | query.shape, key.shape, value.shape, mask.shape, is_causal=is_causal 34 | ) 35 | 36 | npu_result = npu_sdpa.run(query.numpy(), key.numpy(), value.numpy(), mask.numpy()) 37 | 38 | ref_result = torch.nn.functional.scaled_dot_product_attention( 39 | query, 40 | key, 41 | value, 42 | None if is_causal else mask, 43 | dropout_p=0, 44 | is_causal=is_causal, 45 | scale=None, 46 | ) 47 | 48 | assert npu_result.shape == (1, heads, 1 if kv_cache else sequence, dim // heads) 49 | 50 | assert np.isfinite(npu_result).all() 51 | 52 | r2 = r2_score(ref_result.numpy().flatten(), npu_result.flatten()) 53 | 54 | assert 1 - r2 < 0.05 55 | 56 | 57 | @pytest.mark.parametrize("heads", [16, 32]) 58 | @pytest.mark.parametrize("sequence", [16, 32, 256, 512]) 59 | @pytest.mark.parametrize("dim", [512, 1024]) 60 | @pytest.mark.parametrize("kv_cache", [True, False]) 61 | @pytest.mark.parametrize("is_causal", [False, True]) 62 | @pytest.mark.parametrize("use_mask", [False, True]) 63 | def test_sdpa_runtime(heads, sequence, dim, kv_cache, is_causal, use_mask): 64 | 65 | min_value = torch.finfo(torch.float16).min 66 | 67 | query = torch.rand(1, heads, 1 if kv_cache else sequence, dim // heads).to( 68 | torch.float16 69 | ) 70 | key = torch.rand(1, heads, sequence, dim // heads).to(torch.float16) 71 | value = torch.rand(1, heads, sequence, dim // heads).to(torch.float16) 72 | if use_mask: 73 | mask = min_value * torch.ones( 74 | 1, heads, 1 if kv_cache else sequence, sequence 75 | ).to(torch.float16) 76 | mask = torch.triu(mask) 77 | else: 78 | mask = None 79 | 80 | npu_result = scaled_dot_product_attention( 81 | query, key, value, mask, is_causal=is_causal 82 | ) 83 | 84 | ref_result = torch.nn.functional.scaled_dot_product_attention( 85 | query, 86 | key, 87 | value, 88 | None if is_causal else mask, 89 | dropout_p=0, 90 | is_causal=is_causal, 91 | scale=None, 92 | ) 93 | 94 | assert npu_result.shape == (1, heads, 1 if kv_cache else sequence, dim // heads) 95 | 96 | assert np.isfinite(npu_result).all() 97 | 98 | r2 = r2_score(ref_result.numpy().flatten(), npu_result.numpy().flatten()) 99 | 100 | assert 1 - r2 < 0.05 101 | -------------------------------------------------------------------------------- /test/python/test_training.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | 7 | from sklearn.metrics import r2_score 8 | from intel_npu_acceleration_library import compile 9 | from 
intel_npu_acceleration_library.compiler import CompilerConfig 10 | import torch 11 | import pytest 12 | import copy 13 | 14 | # Three different sizes so that shape-mismatch errors are caught 15 | in_c = 128 16 | out_c = 512 17 | batch = 256 18 | 19 | 20 | class NN(torch.nn.Module): 21 | def __init__(self, inc, outc, bias) -> None: 22 | super().__init__() 23 | self.linear = torch.nn.Linear(inc, outc, bias=bias) 24 | self.linear2 = torch.nn.Linear(outc, inc, bias=bias) 25 | 26 | def forward(self, x): 27 | return self.linear2(torch.nn.functional.relu(self.linear(x))) 28 | 29 | 30 | @pytest.fixture 31 | def model_no_bias(): 32 | compiler_conf = CompilerConfig() 33 | return compile(NN(inc=in_c, outc=out_c, bias=False), compiler_conf) 34 | 35 | 36 | @pytest.fixture 37 | def model(): 38 | compiler_conf = CompilerConfig() 39 | return compile(NN(inc=in_c, outc=out_c, bias=True), compiler_conf) 40 | 41 | 42 | def test_parameters(model, model_no_bias): 43 | assert len(list(model.parameters())) == 4 44 | assert len(list(model_no_bias.parameters())) == 2 45 | 46 | 47 | def test_gradient(): 48 | 49 | npu_model = NN(inc=in_c, outc=out_c, bias=True).half() 50 | cpu_model = NN(inc=in_c, outc=out_c, bias=True).half() 51 | cpu_model.load_state_dict(copy.deepcopy(npu_model.state_dict())) 52 | 53 | # Compile one of the two models on the NPU 54 | compiler_conf = CompilerConfig(training=True) 55 | compile(npu_model, compiler_conf) 56 | 57 | x = torch.rand([batch, in_c]).half() 58 | yref = torch.rand([batch, in_c]).half() 59 | 60 | opt1 = torch.optim.SGD(npu_model.parameters(), lr=0.5) 61 | opt2 = torch.optim.SGD(cpu_model.parameters(), lr=0.5) 62 | 63 | for idx in range(100): 64 | 65 | # Check the parameters are the same 66 | for p1, p2 in zip(npu_model.parameters(), cpu_model.parameters()): 67 | assert p1.dtype == p2.dtype 68 | assert 1 - r2_score(p1.detach().numpy(), p2.detach().numpy()) < 0.001, idx 69 | 70 | opt1.zero_grad() 71 | opt2.zero_grad() 72 | 73 | y1 = npu_model(x) 74 | y2 = cpu_model(x) 75 | 76 | npu_loss = torch.mean((yref - y1) ** 2) 77 | npu_loss.backward() 78 | 79 | cpu_loss = torch.mean((yref - y2) ** 2) 80 | cpu_loss.backward() 81 | 82 | assert (torch.abs(npu_loss - cpu_loss) / cpu_loss).item() < 0.001 83 | 84 | opt1.step() 85 | opt2.step() 86 | --------------------------------------------------------------------------------
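A few example invocations of the profiling and quantization scripts shown above, assuming the package is installed and the commands are run from the repository root; the layer sizes are illustrative only, and the model name is the TinyLlama checkpoint already used by the tests:

python script/profile_matmul.py -b 128 -c 1024 -k 1024 --dtype int8
python script/profile_mlp.py --seq-len 128 --hidden-size 3072 --intermediate-size 8192 --dtype int4 --profile
python script/quantize_model.py -m TinyLlama/TinyLlama-1.1B-Chat-v1.0 -b 4 -a RTN -o models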
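The test files above exercise the high-level compile API in several configurations. The following is a minimal end-to-end sketch distilled from test_compile.py and test_quantization.py: the TinyNet module and its layer sizes are invented for illustration, while the imports, CompilerConfig, and compile() calls mirror what the tests use; running it assumes the library is installed (and an Intel NPU driver for actual offload).

import torch
import intel_npu_acceleration_library
from intel_npu_acceleration_library.compiler import CompilerConfig

class TinyNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = torch.nn.Linear(32, 128, bias=False)
        self.l2 = torch.nn.Linear(128, 32, bias=False)

    def forward(self, x):
        return self.l2(torch.nn.functional.relu(self.l1(x)))

# Compile the half-precision model for the NPU, quantizing weights to int8
model = TinyNet().half()
conf = CompilerConfig(dtype=torch.int8)  # torch.float16 or int4/int8 from .dtypes also appear in the tests
npu_model = intel_npu_acceleration_library.compile(model, conf)

# Inference with a float16 input, as in the tests
y = npu_model(torch.rand((16, 32), dtype=torch.float16))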
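One convention shared by the quantized-matmul tests (test_matmul.py, test_profiling.py, test_quantization.py) is worth calling out: quantize_tensor() returns an int8 weight tensor and a per-output-channel float16 scale such that W is approximately W_q * scale, but before the scale is handed to the NPU kernels it is multiplied by sqrt(inC) ("Adapt for numerically accurate qmatmul"). The sketch below only restates that convention; the sqrt(inC) factor is copied from the tests as-is, its internal justification is not stated in this dump, and QMatMul.run() requires a machine where the NPU backend is available.

import numpy as np
import torch
from intel_npu_acceleration_library.quantization import quantize_tensor
from intel_npu_acceleration_library.backend import QMatMul

inC, outC, batch = 512, 512, 16
X = np.random.uniform(-1, 1, (batch, inC)).astype(np.float16)
W = torch.rand((outC, inC), dtype=torch.float16)

# Per-channel symmetric quantization: W ~= W_q.to(float16) * scale, scale has shape (outC, 1)
W_q, scale = quantize_tensor(W)
y_ref = X @ (W_q.to(torch.float16) * scale).numpy().T

# The NPU kernel expects the scale pre-multiplied by sqrt(inC), as done in the tests
mm = QMatMul(inC, outC, batch)
y_npu = mm.run(X, W_q.numpy(), (scale * np.sqrt(inC)).numpy())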