├── .clang-format ├── .githooks └── check-license.py ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── dependabot.yml └── workflows │ ├── documentation.yml │ ├── style.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── bandit.yaml ├── dev_requirements.txt ├── docs ├── Doxyfile ├── build_doc.py └── source │ ├── .gitignore │ ├── _static │ └── .gitkeep │ ├── _templates │ └── .gitkeep │ ├── adding_operations.md │ ├── conf.py │ ├── cpp_reference.rst │ ├── developer.md │ ├── index.rst │ ├── llm.md │ ├── llm_perf.png │ ├── llm_performance.md │ ├── npu.md │ ├── npu_arch.png │ ├── python │ ├── intel_npu_acceleration_library.backend.rst │ ├── intel_npu_acceleration_library.functional.rst │ ├── intel_npu_acceleration_library.nn.rst │ ├── intel_npu_acceleration_library.rst │ └── modules.rst │ ├── setup.md │ └── usage.md ├── examples ├── Audio-Spectrogram-Transformer.py ├── NPU compilation tutorial.ipynb ├── compile_model.py ├── cpp │ ├── CMakeLists.txt │ ├── README.md │ └── main.cpp ├── llama.py ├── llama3.py ├── llava.py ├── matmul.py ├── model_compilation_demo.ipynb ├── phi-2.py ├── phi-3.py ├── qwen2_math_7b.py ├── t5.py ├── tiny_llama_chat.py └── train_mnist.py ├── include └── intel_npu_acceleration_library │ ├── common.h │ ├── conversion.h │ ├── inference.h │ ├── nn_factory.h │ └── parameters.h ├── intel_npu_acceleration_library ├── __init__.py ├── _version.py ├── backend │ ├── __init__.py │ ├── base.py │ ├── bindings.py │ ├── compression.py │ ├── convolution.py │ ├── factory.py │ ├── linear.py │ ├── matmul.py │ ├── mlp.py │ ├── ops.py │ ├── qlinear.py │ ├── qmatmul.py │ ├── runtime.py │ ├── sdpa.py │ ├── tensor.py │ └── utils.py ├── compiler.py ├── device.py ├── dtypes.py ├── functional │ ├── __init__.py │ └── scaled_dot_product_attention.py ├── modelling.py ├── nn │ ├── __init__.py │ ├── autograd.py │ ├── conv.py │ ├── functional.py │ ├── linear.py │ ├── llm.py │ └── module.py ├── optimizations.py └── quantization.py ├── licensing ├── dev-third-party-programs.txt ├── documentation-third-party-programs.txt └── third-party-programs.txt ├── mypy.ini ├── requirements.txt ├── script ├── export.py ├── gen_leaderboard_doc.py ├── llm_leaderboard.py ├── profile_llm.py ├── profile_matmul.py ├── profile_mlp.py └── quantize_model.py ├── setup.cfg ├── setup.py ├── src └── bindings.cpp └── test └── python ├── conftest.py ├── test_basic.py ├── test_bindings.py ├── test_compile.py ├── test_conv.py ├── test_device.py ├── test_dtypes.py ├── test_factory.py ├── test_layers.py ├── test_llm.py ├── test_matmul.py ├── test_module.py ├── test_op.py ├── test_optimizations.py ├── test_profiling.py ├── test_quantization.py ├── test_sdpa.py ├── test_tensor.py └── test_training.py /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | 3 | Language: Cpp 4 | Standard: Cpp11 5 | 6 | AccessModifierOffset: -4 7 | AlignAfterOpenBracket: Align 8 | AllowAllArgumentsOnNextLine: false 9 | AllowAllConstructorInitializersOnNextLine: true 10 | AllowShortBlocksOnASingleLine: false 11 | AllowShortCaseLabelsOnASingleLine: false 12 | AllowShortFunctionsOnASingleLine: None 13 | AllowShortIfStatementsOnASingleLine: Never 14 | AllowShortLambdasOnASingleLine: Empty 15 | AllowShortLoopsOnASingleLine: false 16 | AlwaysBreakBeforeMultilineStrings: false 17 | BreakInheritanceList: AfterColon 18 | ColumnLimit: 120 19 | 
ConstructorInitializerIndentWidth: 8 20 | ContinuationIndentWidth: 8 21 | DerivePointerAlignment: false 22 | FixNamespaceComments: true 23 | IncludeBlocks: Preserve 24 | IndentCaseLabels: false 25 | IndentWidth: 4 26 | PointerAlignment: Left 27 | SpaceBeforeCpp11BracedList: false 28 | SpaceBeforeCtorInitializerColon: false 29 | UseTab: Never 30 | StatementMacros: ['CASE', 'HW_OPS_CASE'] -------------------------------------------------------------------------------- /.githooks/check-license.py: -------------------------------------------------------------------------------- 1 | #! python 2 | # 3 | # Copyright © 2024 Intel Corporation 4 | # SPDX-License-Identifier: Apache 2.0 5 | # 6 | 7 | import datetime 8 | import sys 9 | import os 10 | 11 | LICENSE_TYPE = "Apache 2.0" 12 | LICENSE_STR = f"SPDX-License-Identifier: {LICENSE_TYPE}" 13 | 14 | COPYRIGHT = f"Copyright © {datetime.datetime.now().year} Intel Corporation" 15 | 16 | if __name__ == "__main__": 17 | ret = 0 18 | for filename in sys.argv: 19 | _, file_extension = os.path.splitext(filename) 20 | if "CMakeLists.txt" in filename or file_extension in [ 21 | ".h", 22 | ".hpp", 23 | ".cpp", 24 | ".c", 25 | ".py", 26 | ".js", 27 | ".sh", 28 | ]: 29 | with open(filename, encoding="utf-8") as fp: 30 | text = fp.read() 31 | if LICENSE_STR not in text: 32 | print(f"[pre-commit] {filename} does not have a valid license!") 33 | ret = 1 34 | if COPYRIGHT not in text: 35 | print(f"[pre-commit] {filename} does not have a valid copyright!") 36 | ret = 1 37 | 38 | sys.exit(ret) 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 
18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | permissions: read-all 3 | 4 | on: 5 | workflow_dispatch: 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 3.10 13 | uses: actions/setup-python@v3 14 | with: 15 | python-version: "3.10" 16 | - name: Install packet 17 | run: | 18 | sudo apt-get install -y doxygen 19 | python -m pip install --upgrade pip 20 | pip install .[dev] 21 | - name: Run tests 22 | run: | 23 | cd docs 24 | python build_doc.py gh-deploy 25 | -------------------------------------------------------------------------------- /.github/workflows/style.yml: -------------------------------------------------------------------------------- 1 | name: Style 2 | permissions: read-all 3 | 4 | on: 5 | workflow_dispatch: 6 | pull_request: 7 | push: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: Set up Python 3.10 17 | uses: actions/setup-python@v3 18 | with: 19 | python-version: "3.10" 20 | - name: Install packet 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install .[dev] 24 | - name: Install pre-commit 25 | run: | 26 | pip install pre-commit 27 | pre-commit install 28 | - name: Run tests 29 | run: pre-commit run --all-files 30 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | permissions: read-all 3 | 4 | on: 5 | workflow_dispatch: 6 | pull_request: 7 | push: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | build: 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 17 | os: [windows-latest] 18 | steps: 19 | - uses: actions/checkout@v3 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v3 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install TBB on ubuntu 25 | if: matrix.os == 'ubuntu-latest' 26 | run: sudo apt install libtbb-dev 27 | - name: Install packet 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install tox tox-gh-actions 31 | - name: Run tests 32 | run: tox 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | cache 3 | logs 4 | models 5 
| ov 6 | *.json 7 | build 8 | *.egg-info 9 | dist 10 | lib 11 | *.csv 12 | *.png 13 | *.bin 14 | *.xml 15 | 16 | _build 17 | site 18 | data 19 | xml 20 | .coverage* 21 | coverity 22 | .tox 23 | nc_workspace -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2022 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | default_install_hook_types: [pre-commit, commit-msg] 7 | repos: 8 | - repo: local 9 | hooks: 10 | - id: check-license 11 | name: Check license 12 | entry: ./.githooks/check-license.py 13 | language: python 14 | stages: [commit] 15 | 16 | - repo: https://github.com/pre-commit/mirrors-clang-format 17 | rev: v10.0.1 18 | hooks: 19 | - id: clang-format 20 | - repo: https://github.com/psf/black 21 | rev: 22.6.0 22 | hooks: 23 | - id: black 24 | - repo: https://github.com/PyCQA/bandit 25 | rev: 1.7.7 26 | hooks: 27 | - id: bandit 28 | args: ["-c", "bandit.yaml"] 29 | - repo: https://github.com/pre-commit/pre-commit-hooks 30 | rev: v4.1.0 31 | hooks: 32 | - id: check-merge-conflict 33 | - id: check-json 34 | - id: check-executables-have-shebangs 35 | - id: check-symlinks 36 | - id: debug-statements 37 | - id: mixed-line-ending 38 | - id: trailing-whitespace 39 | - repo: https://github.com/PyCQA/flake8 40 | rev: 6.1.0 41 | hooks: 42 | - id: flake8 43 | additional_dependencies: [ 44 | 'flake8-blind-except', 45 | 'flake8-docstrings', 46 | 'flake8-bugbear', 47 | 'flake8-comprehensions', 48 | 'flake8-docstrings-complete', 49 | 'flake8-implicit-str-concat', 50 | 'pydocstyle>=5.0.0', 51 | ] 52 | exclude: docs/.*|setup.py|test/.*|script/.*|examples/.* 53 | - repo: https://github.com/regebro/pyroma 54 | rev: "4.2" 55 | hooks: 56 | - id: pyroma 57 | - repo: https://github.com/pre-commit/mirrors-mypy 58 | rev: 'v1.8.0' 59 | hooks: 60 | - id: mypy 61 | exclude: 'docs|script|test|venv|examples' 62 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | cmake_minimum_required(VERSION 3.16) 7 | include(FetchContent) 8 | 9 | project(intel_npu_acceleration_library) 10 | 11 | set(CMAKE_CXX_STANDARD 14 CACHE STRING "C++ standard to conform to") 12 | 13 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU") 14 | # lots of warnings and all warnings as errors 15 | message(STATUS "Setting GCC/Clang specific flags for the entire build") 16 | add_compile_options(-Wall -Wextra -Werror -pedantic -Wdouble-promotion -Wfloat-conversion -march=native) 17 | set(CMAKE_CXX_FLAGS_DEBUG "-g") 18 | set(CMAKE_CXX_FLAGS_RELEASE "-O3") 19 | elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") 20 | message(STATUS "Setting Visual Studio specific flags for the entire build") 21 | add_compile_options(/W3 /WX /arch:AVX2 /arch:SSE2) 22 | add_link_options(/WX) 23 | else() 24 | message(AUTHOR_WARNING "-- Building with unrecognised compiler, not setting any specific flags") 25 | endif() 26 | 27 | function(get_linux_lsb_release_information) 28 | find_program(LSB_RELEASE_CMD lsb_release) 29 | if(NOT LSB_RELEASE_CMD) 30 | message(FATAL_ERROR "Command lsb_release cannot be found") 31 | endif() 32 | 33 | execute_process(COMMAND "${LSB_RELEASE_CMD}" --short --id OUTPUT_VARIABLE LSB_RELEASE_ID 
OUTPUT_STRIP_TRAILING_WHITESPACE) 34 | execute_process(COMMAND "${LSB_RELEASE_CMD}" --short --release OUTPUT_VARIABLE LSB_RELEASE_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) 35 | 36 | set(LSB_RELEASE_ID "${LSB_RELEASE_ID}" PARENT_SCOPE) 37 | set(LSB_RELEASE_VERSION "${LSB_RELEASE_VERSION}" PARENT_SCOPE) 38 | endfunction() 39 | 40 | set(OV_VERSION_SHORT "2024.4") 41 | set(OV_VERSION "2024.4.0.16579.c3152d32c9c_x86_64") 42 | set(OV_STORAGE_URL "https://storage.openvinotoolkit.org/repositories/openvino/packages") 43 | set(OV_NIGHTLY_COMMIT "2024.3.0-15502-66093834e38") 44 | 45 | if (WIN32) 46 | if(NOT OV_LIBRARY_URL) 47 | if (${OV_VERSION_SHORT} STREQUAL "nightly") 48 | set(OV_PLATFORM "${OV_NIGHTLY_COMMIT}") 49 | else() 50 | set(OV_PLATFORM "windows") 51 | endif() 52 | set(OV_LIBRARY_URL "${OV_STORAGE_URL}/${OV_VERSION_SHORT}/${OV_PLATFORM}/w_openvino_toolkit_windows_${OV_VERSION}.zip") 53 | endif() 54 | elseif(UNIX) 55 | if(NOT OV_LIBRARY_URL) 56 | get_linux_lsb_release_information() 57 | if (LSB_RELEASE_ID STREQUAL "Ubuntu") 58 | if (${LSB_RELEASE_VERSION} STREQUAL "18.04" OR ${LSB_RELEASE_VERSION} STREQUAL "20.04" OR ${LSB_RELEASE_VERSION} STREQUAL "22.04" OR ${LSB_RELEASE_VERSION} STREQUAL "24.04") 59 | string(REPLACE ".04" "" LSB_RELEASE_VERSION_SHORT ${LSB_RELEASE_VERSION}) 60 | if (${OV_VERSION_SHORT} STREQUAL "nightly") 61 | set(OV_PLATFORM "${OV_NIGHTLY_COMMIT}") 62 | else() 63 | set(OV_PLATFORM "linux") 64 | endif() 65 | 66 | set(OV_LIBRARY_URL "${OV_STORAGE_URL}/${OV_VERSION_SHORT}/${OV_PLATFORM}/l_openvino_toolkit_ubuntu${LSB_RELEASE_VERSION_SHORT}_${OV_VERSION}.tgz") 67 | else() 68 | message(FATAL_ERROR "Ubuntu version ${LSB_RELEASE_VERSION} is unsupported") 69 | endif() 70 | else() 71 | message(FATAL_ERROR "Linux distribution ${LSB_RELEASE_ID} is unsupported") 72 | endif() 73 | 74 | endif() 75 | else() 76 | message(FATAL_ERROR "Unsupported architecture") 77 | endif () 78 | 79 | message(STATUS "OpenVINO library URL: ${OV_LIBRARY_URL}") 80 | 81 | FetchContent_Declare( 82 | openvino 83 | URL ${OV_LIBRARY_URL} 84 | ) 85 | FetchContent_MakeAvailable(openvino) 86 | 87 | find_package(OpenVINO REQUIRED PATHS ${openvino_SOURCE_DIR}/runtime/cmake) 88 | 89 | if (WIN32) 90 | set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) 91 | file(GLOB OpenVINObin ${openvino_SOURCE_DIR}/runtime/bin/intel64/Release/*) 92 | file(COPY ${OpenVINObin} DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/Release) 93 | 94 | file(GLOB TBBlib ${openvino_SOURCE_DIR}/runtime/3rdparty/tbb/bin/*) 95 | file(COPY ${TBBlib} DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/Release) 96 | else() 97 | file(GLOB OpenVINObin ${openvino_SOURCE_DIR}/runtime/lib/intel64/*) 98 | file(COPY ${OpenVINObin} DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) 99 | endif() 100 | 101 | if(SETUPTOOL_BUILD) 102 | file(GLOB OpenVINOPython ${openvino_SOURCE_DIR}/python/openvino/*) 103 | file(COPY ${OpenVINOPython} DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/../external/openvino) 104 | endif() 105 | 106 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 107 | 108 | include_directories(include) 109 | 110 | # Create the Python module 111 | add_library(intel_npu_acceleration_library SHARED src/bindings.cpp) 112 | 113 | # Link the OpenVINO libraries 114 | target_link_libraries(intel_npu_acceleration_library PRIVATE openvino::runtime) 115 | if (UNIX) 116 | set_target_properties(intel_npu_acceleration_library PROPERTIES LINK_FLAGS "-Wl,-rpath,./") 117 | endif (UNIX) 118 | 
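The target above builds the C++ core as a shared library (by default `intel_npu_acceleration_library.dll` on Windows and `libintel_npu_acceleration_library.so` on Linux) that the Python package drives through `ctypes`. The snippet below is a minimal sketch of that loading step, not the project's actual implementation: the build-directory layout and the `load_library` helper are assumptions for illustration, and the real lookup and function-signature setup live in `intel_npu_acceleration_library/backend/bindings.py`.

```python
# Hedged sketch: load the shared library produced by the CMake build above.
# The paths and the helper name are illustrative assumptions; the library's own
# loading logic is implemented in intel_npu_acceleration_library/backend/bindings.py.
import ctypes
import sys
from pathlib import Path


def load_library(build_dir: Path) -> ctypes.CDLL:
    """Load the compiled NPU bindings from a CMake build directory."""
    if sys.platform == "win32":
        # On Windows the runtime DLLs are copied next to the Release output (see above).
        lib_path = build_dir / "Release" / "intel_npu_acceleration_library.dll"
    else:
        lib_path = build_dir / "libintel_npu_acceleration_library.so"
    return ctypes.CDLL(str(lib_path))


lib = load_library(Path("build"))
# Exported C functions are then given explicit ctypes signatures (restype/argtypes)
# before use, as done for the real entry points in bindings.py.
```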
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | CommunityCodeOfConduct AT intel DOT com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. 
Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series of 86 | actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or permanent 93 | ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within the 113 | community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.1, available at 119 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 120 | 121 | Community Impact Guidelines were inspired by 122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 126 | [https://www.contributor-covenant.org/translations][translations]. 127 | 128 | [homepage]: https://www.contributor-covenant.org 129 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 130 | [Mozilla CoC]: https://github.com/mozilla/diversity 131 | [FAQ]: https://www.contributor-covenant.org/faq -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ## License 4 | 5 | Intel® NPU Acceleration Library is licensed under the terms in [LICENSE](LICENSE). By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. 6 | 7 | ## Sign your work 8 | 9 | Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. 
The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ```text 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 17 | 660 York Street, Suite 102, 18 | San Francisco, CA 94110 USA 19 | 20 | Everyone is permitted to copy and distribute verbatim copies of this 21 | license document, but changing it is not allowed. 22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 48 | ``` 49 | 50 | Then you just add a line to every git commit message: 51 | 52 | Signed-off-by: Joe Smith 53 | 54 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 55 | 56 | If you set your `user.name` and `user.email` git configs, you can sign your 57 | commit automatically with `git commit -s`. -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. 3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 
-------------------------------------------------------------------------------- /bandit.yaml: -------------------------------------------------------------------------------- 1 | exclude_dirs: ['test', 'docs', '.githooks', 'script', 'examples'] 2 | skips: [] -------------------------------------------------------------------------------- /dev_requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-xdist 3 | pytest-cov 4 | scikit-learn <= 1.5.2 5 | pre-commit; sys_platform == 'darwin' 6 | sphinx 7 | breathe 8 | sphinx-book-theme 9 | myst-parser 10 | ghp-import 11 | pyroma 12 | mypy -------------------------------------------------------------------------------- /docs/Doxyfile: -------------------------------------------------------------------------------- 1 | INPUT = ../include/intel_npu_acceleration_library 2 | FILE_PATTERNS = *.cpp *.h 3 | GENERATE_HTML = NO 4 | GENERATE_LATEX = NO 5 | GENERATE_XML = YES # Important for Breathe 6 | FULL_PATH_NAMES = NO 7 | ENABLE_PREPROCESSING = YES 8 | MACRO_EXPANSION = YES 9 | EXPAND_ONLY_PREDEF = YES 10 | -------------------------------------------------------------------------------- /docs/build_doc.py: -------------------------------------------------------------------------------- 1 | #! python 2 | # 3 | # Copyright © 2024 Intel Corporation 4 | # SPDX-License-Identifier: Apache 2.0 5 | # 6 | 7 | from http.server import HTTPServer, SimpleHTTPRequestHandler 8 | from ghp_import import ghp_import 9 | from typing import List, Union 10 | import subprocess 11 | import argparse 12 | import shutil 13 | import os 14 | 15 | 16 | def define_and_parse_args(): 17 | parser = argparse.ArgumentParser(description="Build documentations") 18 | parser.add_argument( 19 | "action", 20 | type=str, 21 | choices=["build", "serve", "gh-deploy"], 22 | help="Name of the model to export", 23 | ) 24 | 25 | return parser.parse_args() 26 | 27 | 28 | doc_root = os.path.dirname(os.path.abspath(__file__)) 29 | root = os.path.abspath(os.path.join(doc_root, "..")) 30 | doxygen_available = shutil.which("doxygen") is not None 31 | 32 | 33 | def clean_dirs(dir_names: Union[List[str], str]) -> None: 34 | if isinstance(dir_names, str): 35 | dir_names = [dir_names] 36 | for name in dir_names: 37 | xml_dir = os.path.join(doc_root, name) 38 | if os.path.exists(xml_dir) and os.path.isdir(xml_dir): 39 | shutil.rmtree(xml_dir) 40 | 41 | 42 | def build_doc(): 43 | 44 | clean_dirs(["build", "xml"]) 45 | 46 | if not doxygen_available: 47 | raise RuntimeError("Doxygen is needed to build documentation") 48 | 49 | yield subprocess.check_output( 50 | ["doxygen", "Doxyfile"], cwd=doc_root, stderr=subprocess.STDOUT 51 | ).decode() 52 | yield subprocess.check_output( 53 | ["sphinx-apidoc", "-o", "source/python", "../intel_npu_acceleration_library"], 54 | cwd=doc_root, 55 | stderr=subprocess.STDOUT, 56 | ).decode() 57 | yield subprocess.check_output( 58 | ["sphinx-build", "-b", "html", "source", "build"], 59 | cwd=doc_root, 60 | stderr=subprocess.STDOUT, 61 | ).decode() 62 | 63 | clean_dirs("xml") 64 | 65 | 66 | def build(): 67 | for out in build_doc(): 68 | print(out) 69 | 70 | 71 | class Handler(SimpleHTTPRequestHandler): 72 | def __init__(self, *args, **kwargs): 73 | super().__init__(*args, directory="build", **kwargs) 74 | 75 | 76 | def serve(hostname="localhost", port=8000): 77 | build() 78 | server_address = (hostname, port) 79 | httpd = HTTPServer(server_address, Handler) 80 | print(f"Serving at address {hostname}:{port}") 81 | 
httpd.serve_forever() 82 | 83 | 84 | def get_git_sha() -> str: 85 | return ( 86 | subprocess.check_output( 87 | ["git", "rev-parse", "--short", "HEAD"], 88 | cwd=root, 89 | ) 90 | .decode() 91 | .strip() 92 | ) 93 | 94 | 95 | def deploy(): 96 | build() 97 | 98 | message = f"Deployed with sha {get_git_sha()}" 99 | 100 | try: 101 | ghp_import( 102 | os.path.join(doc_root, "build"), 103 | mesg=message, 104 | remote="origin", 105 | branch="gh-pages", 106 | push=True, 107 | force=True, 108 | use_shell=False, 109 | no_history=False, 110 | nojekyll=True, 111 | ) 112 | except ghp_import.GhpError as e: 113 | raise RuntimeError(f"Failed to deploy to GitHub. Error: \n{e.message}") 114 | 115 | 116 | if __name__ == "__main__": 117 | args = define_and_parse_args() 118 | 119 | if args.action == "build": 120 | build() 121 | elif args.action == "serve": 122 | serve() 123 | elif args.action == "gh-deploy": 124 | deploy() 125 | else: 126 | raise RuntimeError(f"Unsuported action: {args.action}") 127 | -------------------------------------------------------------------------------- /docs/source/.gitignore: -------------------------------------------------------------------------------- 1 | !*.png -------------------------------------------------------------------------------- /docs/source/_static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-npu-acceleration-library/073ad6a3a1eb20fdd1ba00d72c7241586372ebee/docs/source/_static/.gitkeep -------------------------------------------------------------------------------- /docs/source/_templates/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-npu-acceleration-library/073ad6a3a1eb20fdd1ba00d72c7241586372ebee/docs/source/_templates/.gitkeep -------------------------------------------------------------------------------- /docs/source/adding_operations.md: -------------------------------------------------------------------------------- 1 | # Adding New Operations in the Library 2 | 3 | This document outlines the process for integrating a new operation into the existing code library. The integration process involves several key steps: defining the operation's interface, implementing the operation ensuring compatibility with the library's architecture, and providing testing to validate the operation. 4 | 5 | An example of implementing new operations can be found here: [Implementing reduce operations](https://github.com/intel/intel-npu-acceleration-library/commit/4f17015a75c146fe8d569ac71a2e2a0a960fc652) 6 | 7 | ## Step 1: Defining the OpenVINO interface 8 | 9 | The first step is defining the call to the OpenVino method of the new operation through the OpenVINO Runtime C++ API. This is done in the `nn_factory.h` header. In this file, a new operation is created by interfacing with the OpenVINO operation. This includes specifying input and output parameters, and data types of the operation's interface and then calling and returning the OpenVINO method. The interface should align with the library's existing design patterns and naming conventions. 
10 | 11 | A simple example of defining a new operation: 12 | ``` 13 | ov::op::Op* new_operation(ov::op::Op* input) { 14 | auto new_operation = std::make_shared(input->output(0)); 15 | operations.push_back(new_operation); 16 | return new_operation.get(); 17 | } 18 | ``` 19 | ## Step 2: Defining the C++ bindings 20 | 21 | The next step is defining the C++ binding in the `binding.cpp` source file. This is the method that will be called in Python. This method has the operation's input node as a parameter and additional arguments of the operation are defined in the method. 22 | 23 | An example of defining the binding: 24 | ``` 25 | intel_npu_acceleration_library_DLL_API ov::op::Op* new_operation(intel_npu_acceleration_library::ModelFactory* factory, ov::op::Op* input) { 26 | return factory->new_operation(input); 27 | } 28 | ``` 29 | 30 | ## Step 3: Adding new operation to list of supported operation 31 | 32 | The new operation is added to the list of supported NPU operations in the `ops.py` script. 33 | The information of the new operation that must be provided is: 34 | - the operation name 35 | - the number of inputs 36 | - the optional parameters types 37 | 38 | ## Step 4: Adding extra functionality to the operation's function 39 | Ctypes is used to interface between C++ and Python. (Documentation is found here: [Python Ctypes](https://docs.python.org/3/library/ctypes.html)) 40 | 41 | If there is additional logic that you may want to add to the function, this can be done by defining a Python function that calls the C++ method in the `factory.py` file. 42 | Otherwise, if you directly call the functions to C++, then you do not need to define a Python function. 43 | 44 | ## Step 5: Adding PyTorch wrapper for the new operation 45 | Additionally, to define a wrapper to use PyTorch native functions, this can be implemented in the `functional.py` file. In this step, a function of the same name as the PyTorch equivalent is created, which is used instead of the PyTorch implementation of the operation. 46 | If there is additional logic that you may want to add to the function to interface with the new operation, it can also be added in this function. 47 | 48 | It is common for the new operation to have the same name as the PyTorch equivalent operation, however this is not always the case and to show which operation we are referring to, we refer to the newly implemented operation as `new_operation` and the PyTorch operation and `operation`. 49 | 50 | The basic structure of PyTorch wrapper for a PyTorch operation, referred to as `torch.operation`, which returns the output of the implemented `new_operation`: 51 | ``` 52 | @implements(torch.operation) 53 | def operation(x: Tensor) -> Tensor: 54 | """Return the output tensor of the operation. 55 | 56 | Args: 57 | x (Tensor): The input tensor. 58 | Returns: 59 | Tensor: Output tensor. 60 | """ 61 | return generate_op(x, "new_operation") 62 | ``` 63 | ## Step 6: Building the library 64 | To update the library, run the command: 65 | ``` 66 | pip install . 67 | ``` 68 | 69 | ## Step 7: Adding tests for the new operation 70 | A test for the new operation can be added in the `test_op.py` script. The new operation should be compared with a reference to ensure correct implementation. 
71 | 72 | The following is a basic structure to use the new operation: 73 | ``` 74 | X = torch.rand((16, 128)).to(torch.float16) # defining the input tensor 75 | 76 | model = NNFactory() 77 | input = model.parameter(X.shape) # creating the input node 78 | _ = model.new_operation(input) # _ = torch.operation(input) is equivalent if using the PyTorch wrapper 79 | model.compile() 80 | out = model.run(X.numpy()) 81 | ``` 82 | 83 | Using pytest to run all of the tests in the file: 84 | ``` 85 | pytest 86 | ``` 87 | 88 | Using pytest to run a single test in the file: 89 | ``` 90 | pytest :: 91 | ``` -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #! python 2 | # 3 | # Copyright © 2024 Intel Corporation 4 | # SPDX-License-Identifier: Apache 2.0 5 | # 6 | 7 | import glob 8 | import os 9 | import sys 10 | 11 | 12 | repo_root = os.path.abspath( 13 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "../..") 14 | ) 15 | sys.path.insert(0, os.path.join(repo_root, "intel_npu_acceleration_library")) 16 | 17 | project = "Intel® NPU Acceleration Library" 18 | copyright = "2024, Intel Corporation" 19 | author = "Intel Corporation" 20 | 21 | # -- General configuration --------------------------------------------------- 22 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 23 | 24 | 25 | templates_path = ["_templates"] 26 | exclude_patterns = [] 27 | 28 | # Add Breathe extension 29 | extensions = [ 30 | # 'sphinx.ext.autodoc', 31 | "sphinx.ext.napoleon", 32 | "breathe", 33 | "myst_parser", 34 | ] 35 | 36 | # autodoc_default_options = { 37 | # 'ignore-module-all': False 38 | # } 39 | 40 | source_suffix = [".rst", ".md"] 41 | 42 | # Breathe Configuration 43 | breathe_default_project = "Intel® NPU Acceleration Library" 44 | breathe_projects = {"Intel® NPU Acceleration Library": "../xml"} 45 | 46 | # -- Options for HTML output ------------------------------------------------- 47 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 48 | 49 | html_theme = "sphinx_book_theme" 50 | html_static_path = ["_static"] 51 | -------------------------------------------------------------------------------- /docs/source/cpp_reference.rst: -------------------------------------------------------------------------------- 1 | C++ API Reference 2 | ================= 3 | 4 | .. doxygenindex:: 5 | :project: Intel® NPU Acceleration Library 6 | -------------------------------------------------------------------------------- /docs/source/developer.md: -------------------------------------------------------------------------------- 1 | # Developer Guide 2 | 3 | Install developer packages by typing 4 | 5 | ```bash 6 | pip install .[dev] 7 | ``` 8 | 9 | It is suggested to install the package locally by using `pip install -e .[dev]` 10 | 11 | ## Git hooks 12 | 13 | All developers should install the git hooks that are tracked in the `.githooks` directory. We use the pre-commit framework for hook management. The recommended way of installing it is using pip: 14 | 15 | ```bash 16 | pre-commit install 17 | ``` 18 | 19 | If you want to manually run all pre-commit hooks on a repository, run `pre-commit run --all-files`. To run individual hooks use `pre-commit run `. 
20 | 21 | Uninstalling the hooks can be done using 22 | 23 | ```bash 24 | pre-commit uninstall 25 | ``` 26 | 27 | ## Testing the library 28 | 29 | ### Python test 30 | 31 | Python test uses `pytest` library. Type 32 | 33 | ```bash 34 | cd test/python && pytest 35 | ``` 36 | 37 | to run the full test suite. 38 | 39 | ## Build the documentation 40 | 41 | This project uses `sphinx` to build and deploy the documentation. To serve locally the documentation type 42 | 43 | ```bash 44 | mkdocs serve 45 | ``` 46 | 47 | to deploy it into github pages type 48 | 49 | ```bash 50 | cd docs 51 | python build_doc.py gh-deploy 52 | ``` 53 | 54 | ## Generate python packages 55 | 56 | On windows: 57 | 58 | ```bat 59 | python setup.py sdist 60 | set CIBW_BUILD=cp* 61 | cibuildwheel --platform windows --output-dir dist 62 | ``` 63 | 64 | 65 | ## Publishing packets 66 | 67 | Install twine 68 | ```bat 69 | python3 -m pip install --upgrade twine 70 | ``` 71 | 72 | Then check on the built sdist and wheel that are properly formatted (all files should return a green `PASSED`) 73 | 74 | ```bat 75 | twine check dist/* 76 | ``` 77 | 78 | Upload the packets to `testpypi` 79 | 80 | ```bat 81 | twine upload --repository testpypi dist/* 82 | ``` 83 | 84 | To upload them to the real index (**verify first with testpypi**) 85 | ```bat 86 | twine upload dist/* 87 | ``` -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Intel® NPU Acceleration Library documentation master file, created by 2 | sphinx-quickstart on Wed Feb 7 11:48:32 2024. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Intel® NPU Acceleration Library's documentation! 7 | ===================================== 8 | 9 | The Intel® NPU Acceleration Library is a Python library designed to boost the efficiency of your applications by leveraging the power of the Intel Neural Processing Unit (NPU) to perform high-speed computations on compatible hardware. 10 | 11 | Installation 12 | ------------- 13 | 14 | Check that your system has an available NPU (`how-to `_). 15 | 16 | You can install the packet in your machine with 17 | 18 | .. code-block:: bash 19 | 20 | pip install intel-npu-acceleration-library 21 | 22 | 23 | Run a LLaMA model on the NPU 24 | ---------------------------- 25 | 26 | To run LLM models you need to install the `transformers` library 27 | 28 | 29 | .. code-block:: bash 30 | 31 | pip install transformers 32 | 33 | You are now up and running! You can create a simple script like the following one to run a LLM on the NPU 34 | 35 | 36 | .. 
code-block:: python 37 | :emphasize-lines: 2, 7 38 | 39 | from transformers import AutoTokenizer, TextStreamer 40 | from intel_npu_acceleration_library import NPUModelForCausalLM 41 | import torch 42 | 43 | model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 44 | 45 | model = NPUModelForCausalLM.from_pretrained(model_id, use_cache=True, dtype=torch.int8).eval() 46 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) 47 | tokenizer.pad_token_id = tokenizer.eos_token_id 48 | streamer = TextStreamer(tokenizer, skip_special_tokens=True) 49 | 50 | query = input("Ask something: ") 51 | prefix = tokenizer(query, return_tensors="pt")["input_ids"] 52 | 53 | generation_kwargs = dict( 54 | input_ids=prefix, 55 | streamer=streamer, 56 | do_sample=True, 57 | top_k=50, 58 | top_p=0.9, 59 | max_new_tokens=512, 60 | ) 61 | 62 | print("Run inference") 63 | _ = model.generate(**generation_kwargs) 64 | 65 | 66 | Take note that you only need to use `intel_npu_acceleration_library.compile` to offload the heavy computation to the NPU. 67 | 68 | Feel free to check `Usage `_ and `LLM `_ and the `examples `_ folder for additional use-cases and examples. 69 | 70 | 71 | 72 | Site map 73 | ---------------------------- 74 | 75 | .. toctree:: 76 | Quickstart 77 | NPU overview 78 | usage.md 79 | setup.md 80 | :maxdepth: 1 81 | :caption: Library overview: 82 | 83 | 84 | .. toctree:: 85 | llm.md 86 | llm_performance.md 87 | :maxdepth: 1 88 | :caption: Applications: 89 | 90 | 91 | 92 | .. toctree:: 93 | developer.md 94 | adding_operations.md 95 | :maxdepth: 1 96 | :caption: Developements guide: 97 | 98 | 99 | 100 | .. toctree:: 101 | Python API Reference 102 | cpp_reference.rst 103 | :maxdepth: 1 104 | :caption: API Reference: 105 | 106 | 107 | 108 | 109 | Indices and tables 110 | ================== 111 | 112 | * :ref:`genindex` 113 | * :ref:`modindex` 114 | * :ref:`search` 115 | -------------------------------------------------------------------------------- /docs/source/llm.md: -------------------------------------------------------------------------------- 1 | # Large Language models 2 | 3 | 4 | ## Run an LLM on the NPU 5 | 6 | You can use your existing LLM inference script on the NPU with a simple line of code 7 | 8 | ```python 9 | # First import the library 10 | import intel_npu_acceleration_library 11 | 12 | # Call the compile function to offload kernels to the NPU. 13 | model = intel_npu_acceleration_library.compile(model) 14 | ``` 15 | 16 | Here a full example: 17 | 18 | ```python 19 | from torch.profiler import profile, ProfilerActivity 20 | from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM 21 | from threading import Thread 22 | import intel_npu_acceleration_library 23 | import torch 24 | import time 25 | import sys 26 | 27 | model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 28 | 29 | model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=True).eval() 30 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) 31 | tokenizer.pad_token_id = tokenizer.eos_token_id 32 | streamer = TextStreamer(tokenizer, skip_special_tokens=True) 33 | 34 | 35 | print("Compile model for the NPU") 36 | model = intel_npu_acceleration_library.compile(model) 37 | 38 | query = "What is the meaning of life?" 
39 | prefix = tokenizer(query, return_tensors="pt")["input_ids"] 40 | 41 | 42 | generation_kwargs = dict( 43 | input_ids=prefix, 44 | streamer=streamer, 45 | do_sample=True, 46 | top_k=50, 47 | top_p=0.9, 48 | ) 49 | 50 | print("Run inference") 51 | _ = model.generate(**generation_kwargs) 52 | 53 | ``` 54 | -------------------------------------------------------------------------------- /docs/source/llm_perf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-npu-acceleration-library/073ad6a3a1eb20fdd1ba00d72c7241586372ebee/docs/source/llm_perf.png -------------------------------------------------------------------------------- /docs/source/npu.md: -------------------------------------------------------------------------------- 1 | # Quick overview of Intel's Neural Processing Unit (NPU) 2 | 3 | The Intel NPU is an AI accelerator integrated into Intel Core Ultra processors, characterized by a unique architecture comprising compute acceleration and data transfer capabilities. Its compute acceleration is facilitated by Neural Compute Engines, which consist of hardware acceleration blocks for AI operations like Matrix Multiplication and Convolution, alongside Streaming Hybrid Architecture Vector Engines for general computing tasks. 4 | 5 | ![Intel NPU architecture](npu_arch.png) 6 | 7 | - **Scalable Multi-Tile Design:** The heart of the NPU's compute acceleration capability lies in its scalable tiled based architecture known as Neural Compute Engines. 8 | - **Hardware Acceleration Blocks:** These engines are equipped with specific hardware blocks designed to handle AI operations that demand high levels of computation, such as Matrix Multiplication and Convolution. 9 | - **Streaming Hybrid Architecture:** Alongside the dedicated AI operation units, the Neural Compute Engines are built with Streaming Hybrid Architecture Vector Engines (SHAVE). This enables them to perform high-performance parallel computing for general compute needs. 10 | - **DMA Engines:** Direct Memory Access (DMA) engines are integral to the NPU, responsible for moving data efficiently between the system memory DRAM and the software-managed cache. 11 | - **Memory Management:** The incorporation of a built-in device MMU, alongside an IOMMU, allows support for multiple concurrent hardware contexts. This is crucial for maintaining security isolation between these contexts in line with the Microsoft Compute Driver Model (MCDM) architectural standards. 12 | 13 | ## The Role of Software 14 | 15 | While the hardware is undoubtedly advanced, the true "magic" of the Intel NPU is realized through a sophisticated MLIR based compiler. It is through compiler technology that Intel's NPU reaches its full potential by optimizing and orchestrating AI workloads. 16 | 17 | - **Parallel Workload Execution:** The compiler ensures that AI tasks are executed in parallel, directing both compute and data flows in a tiling pattern with built-in and programmable control flows. 18 | - **Maximizing Compute Utilization:** By prioritizing execution primarily out of scratchpad SRAM and reducing the data transfers between SRAM and DRAM, the compiler helps in achieving optimum performance-to-power ratios for AI workloads. 
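In practice this compiler stack is reached from Python through the library's `compile` entry point, which takes a PyTorch module and offloads the supported kernels to the NPU. The snippet below is a minimal, hedged sketch: the toy model and the tensor shapes are placeholders, while `intel_npu_acceleration_library.compile` is the same call used in the LLM example above and throughout the `examples` folder.

```python
# Minimal sketch: hand a small PyTorch module to the NPU compiler stack described above.
# The Sequential model and the shapes are illustrative placeholders.
import torch
import intel_npu_acceleration_library

model = torch.nn.Sequential(
    torch.nn.Linear(256, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 256),
).eval()

# compile() offloads the supported layers to the NPU; the MLIR-based compiler handles
# the tiling, scheduling and SRAM/DRAM data movement described above.
npu_model = intel_npu_acceleration_library.compile(model)

with torch.no_grad():
    out = npu_model(torch.rand(8, 256, dtype=torch.float16))
print(out.shape)
```

See `usage.md` and the `examples` folder for more complete variants, including quantization and the `torch.compile` NPU backend.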
19 | 20 | Some useful links 21 | 22 | - Intel AI PC ([link](https://www.intel.com/content/www/us/en/products/docs/processors/core-ultra/ai-pc.html?wapkw=NPU)) 23 | - Intel Core Ultra Processor line ([link](https://www.intel.com/content/www/us/en/products/docs/processors/core-ultra/core-ultra-series-1-product-brief.html?wapkw=NPU)) 24 | - AI Acceleration and NPU explained ([video](https://www.youtube.com/watch?v=QSzNoX0qplE)) 25 | -------------------------------------------------------------------------------- /docs/source/npu_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/intel-npu-acceleration-library/073ad6a3a1eb20fdd1ba00d72c7241586372ebee/docs/source/npu_arch.png -------------------------------------------------------------------------------- /docs/source/python/intel_npu_acceleration_library.backend.rst: -------------------------------------------------------------------------------- 1 | intel\_npu\_acceleration\_library.backend package 2 | ========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | intel\_npu\_acceleration\_library.backend.base module 8 | ------------------------------ 9 | 10 | .. automodule:: intel_npu_acceleration_library.backend.base 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | intel\_npu\_acceleration\_library.backend.factory module 16 | --------------------------------- 17 | 18 | .. automodule:: intel_npu_acceleration_library.backend.factory 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | intel\_npu\_acceleration\_library.backend.linear module 24 | -------------------------------- 25 | 26 | .. automodule:: intel_npu_acceleration_library.backend.linear 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | intel\_npu\_acceleration\_library.backend.matmul module 32 | -------------------------------- 33 | 34 | .. automodule:: intel_npu_acceleration_library.backend.matmul 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | intel\_npu\_acceleration\_library.backend.mlp module 40 | ----------------------------- 41 | 42 | .. automodule:: intel_npu_acceleration_library.backend.mlp 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | intel\_npu\_acceleration\_library.backend.qlinear module 48 | --------------------------------- 49 | 50 | .. automodule:: intel_npu_acceleration_library.backend.qlinear 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | intel\_npu\_acceleration\_library.backend.qmatmul module 56 | --------------------------------- 57 | 58 | .. automodule:: intel_npu_acceleration_library.backend.qmatmul 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | intel\_npu\_acceleration\_library.backend.runtime module 64 | --------------------------------- 65 | 66 | .. automodule:: intel_npu_acceleration_library.backend.runtime 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | Module contents 72 | --------------- 73 | 74 | .. 
automodule:: intel_npu_acceleration_library.backend 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | -------------------------------------------------------------------------------- /docs/source/python/intel_npu_acceleration_library.functional.rst: -------------------------------------------------------------------------------- 1 | intel\_npu\_acceleration\_library.functional package 2 | ==================================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | intel\_npu\_acceleration\_library.functional.scaled\_dot\_product\_attention module 8 | ----------------------------------------------------------------------------------- 9 | 10 | .. automodule:: intel_npu_acceleration_library.functional.scaled_dot_product_attention 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: intel_npu_acceleration_library.functional 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/python/intel_npu_acceleration_library.nn.rst: -------------------------------------------------------------------------------- 1 | intel\_npu\_acceleration\_library.nn package 2 | ===================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | intel\_npu\_acceleration\_library.nn.autograd module 8 | ----------------------------- 9 | 10 | .. automodule:: intel_npu_acceleration_library.nn.autograd 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | intel\_npu\_acceleration\_library.nn.linear module 16 | --------------------------- 17 | 18 | .. automodule:: intel_npu_acceleration_library.nn.linear 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | intel\_npu\_acceleration\_library.nn.llm module 24 | ------------------------ 25 | 26 | .. automodule:: intel_npu_acceleration_library.nn.llm 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. automodule:: intel_npu_acceleration_library.nn 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /docs/source/python/intel_npu_acceleration_library.rst: -------------------------------------------------------------------------------- 1 | intel\_npu\_acceleration\_library package 2 | ================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | intel_npu_acceleration_library.backend 11 | intel_npu_acceleration_library.nn 12 | intel_npu_acceleration_library.functional 13 | 14 | Submodules 15 | ---------- 16 | 17 | intel\_npu\_acceleration\_library.bindings module 18 | -------------------------- 19 | 20 | .. automodule:: intel_npu_acceleration_library.bindings 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | intel\_npu\_acceleration\_library.compiler module 26 | -------------------------- 27 | 28 | .. automodule:: intel_npu_acceleration_library.compiler 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | intel\_npu\_acceleration\_library.optimizations module 34 | ------------------------------- 35 | 36 | .. automodule:: intel_npu_acceleration_library.optimizations 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | 41 | intel\_npu\_acceleration\_library.quantization module 42 | ------------------------------ 43 | 44 | .. 
automodule:: intel_npu_acceleration_library.quantization 45 | :members: 46 | :undoc-members: 47 | :show-inheritance: 48 | 49 | Module contents 50 | --------------- 51 | 52 | .. automodule:: intel_npu_acceleration_library 53 | :members: 54 | :undoc-members: 55 | :show-inheritance: 56 | -------------------------------------------------------------------------------- /docs/source/python/modules.rst: -------------------------------------------------------------------------------- 1 | intel_npu_acceleration_library 2 | ========= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | intel_npu_acceleration_library 8 | -------------------------------------------------------------------------------- /docs/source/setup.md: -------------------------------------------------------------------------------- 1 | # Advanced Setup 2 | 3 | You can install the package by typing 4 | 5 | ```bash 6 | pip install "intel-npu-acceleration-library @ git+https://github.com/intel/intel-npu-acceleration-library.git" 7 | ``` 8 | 9 | To build the package you need a compiler in your system (Visual Studio 2019 suggested for Windows build). MacOS is not yet supported. 10 | 11 | For development packages use (after cloning the repo) 12 | 13 | ```bash 14 | pip install .[dev] 15 | ``` 16 | -------------------------------------------------------------------------------- /docs/source/usage.md: -------------------------------------------------------------------------------- 1 | # Basic usage 2 | 3 | For implemented examples, please check the `examples` folder 4 | 5 | ## Run a single MatMul in the NPU 6 | 7 | ```python 8 | from intel_npu_acceleration_library.backend import MatMul 9 | import numpy as np 10 | 11 | inC, outC, batch = ... # Define your own values 12 | 13 | # Create both inputs 14 | X1 = np.random.uniform(-1, 1, (batch, inC)).astype(np.float16) 15 | X2 = np.random.uniform(-1, 1, (outC, inC)).astype(np.float16) 16 | 17 | mm = MatMul(inC, outC, batch, profile=False) 18 | 19 | result = mm.run(X1, X2) 20 | 21 | ``` 22 | 23 | ## Compile a model for the NPU 24 | 25 | If you have `pytorch`>=2.0.0 installed you can use torch compile to optimize your model for the NPU 26 | 27 | ```python 28 | import intel_npu_acceleration_library 29 | import torch 30 | 31 | # Compile model for the NPU 32 | # model a torch.nn.Module class. Model can be quantized JIT 33 | optimized_model = torch.compile(model, backend="npu") 34 | 35 | # Use the model as usual 36 | 37 | ``` 38 | 39 | In windows torch.compile is not supported yet. So you might want to use the explicit function `intel_npu_acceleration_library.compile`. This is true also if you use a `pytorch` version < 2.0.0 40 | 41 | To do this, you just need to call the `compile` function with your model and the compiler configuration `CompilerConfig` to compile and optimize the model for the NPU. 42 | ```python 43 | import intel_npu_acceleration_library 44 | from intel_npu_acceleration_library.compiler import CompilerConfig 45 | compiler_conf = CompilerConfig(dtype=torch.int8) 46 | optimized_model = intel_npu_acceleration_library.compile(model, compiler_conf) 47 | 48 | # Use the model as usual 49 | 50 | ``` 51 | 52 | To compile and optimize a single layer of a model to be pushed to the NPU as one block, you can set `use_to=True` in the the compiler configuration `CompilerConfig`. 
53 | ```python 54 | import intel_npu_acceleration_library 55 | from intel_npu_acceleration_library.compiler import CompilerConfig 56 | compiler_conf = CompilerConfig(use_to=True, dtype=torch.int8) 57 | optimized_block = intel_npu_acceleration_library.compile(single_block, compiler_conf) 58 | 59 | ``` 60 | 61 | ## Training (**Experimental!**) 62 | 63 | It is possible to use the Intel® NPU Acceleration Library to train a model. As before, you just need to call the `compile` function, this time with `training=True`. This allows you to use the same training script you use on other devices with very minimal modifications. 64 | 65 | ```python 66 | import intel_npu_acceleration_library 67 | from intel_npu_acceleration_library.compiler import CompilerConfig 68 | compiler_conf = CompilerConfig(dtype=torch.float32, training=True) 69 | compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf) 70 | ``` 71 | -------------------------------------------------------------------------------- /examples/Audio-Spectrogram-Transformer.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | 7 | import sys 8 | import subprocess 9 | import pkg_resources 10 | 11 | required = {"librosa", "soundfile", "datasets", "intel-npu-acceleration-library"} 12 | installed = {pkg.key for pkg in pkg_resources.working_set} 13 | missing = required - installed 14 | 15 | if missing: 16 | # install the missing packages with pip in a subprocess: 17 | subprocess.check_call([sys.executable, "-m", "pip", "install", *missing]) 18 | from transformers import AutoFeatureExtractor, ASTForAudioClassification 19 | from datasets import load_dataset 20 | import torch 21 | import intel_npu_acceleration_library 22 | 23 | dataset = load_dataset( 24 | "hf-internal-testing/librispeech_asr_demo", 25 | "clean", 26 | split="validation", 27 | trust_remote_code=True, 28 | ) 29 | dataset = dataset.sort("id") 30 | sampling_rate = dataset.features["audio"].sampling_rate 31 | 32 | feature_extractor = AutoFeatureExtractor.from_pretrained( 33 | "MIT/ast-finetuned-audioset-10-10-0.4593" 34 | ) 35 | model = ASTForAudioClassification.from_pretrained( 36 | "MIT/ast-finetuned-audioset-10-10-0.4593" 37 | ) 38 | print("Compile model for the NPU") 39 | model = intel_npu_acceleration_library.compile(model) 40 | 41 | # audio file is decoded on the fly 42 | inputs = feature_extractor( 43 | dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt" 44 | ) 45 | 46 | with torch.no_grad(): 47 | logits = model(**inputs).logits 48 | 49 | predicted_class_ids = torch.argmax(logits, dim=-1).item() 50 | predicted_label = model.config.id2label[predicted_class_ids] 51 | print(predicted_label) 52 | 53 | # compute loss - target_label is e.g.
"down" 54 | target_label = model.config.id2label[0] 55 | inputs["labels"] = torch.tensor([model.config.label2id[target_label]]) 56 | loss = model(**inputs).loss 57 | print(round(loss.item(), 2)) 58 | -------------------------------------------------------------------------------- /examples/compile_model.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | 7 | from intel_npu_acceleration_library import compile 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | from sklearn.metrics import r2_score 10 | import intel_npu_acceleration_library 11 | import pytest 12 | import torch 13 | import sys 14 | 15 | # Define a 16 | class NN(torch.nn.Module): 17 | def __init__(self, hidden_dim: int, intermediate_dim: int) -> None: 18 | super().__init__() 19 | self.l1 = torch.nn.Linear(hidden_dim, intermediate_dim) 20 | self.l2 = torch.nn.Linear(intermediate_dim, hidden_dim) 21 | self.relu = torch.nn.functional.relu 22 | 23 | def forward(self, x): 24 | return self.relu(self.l2(self.relu(self.l1(x)))) 25 | 26 | 27 | if __name__ == "__main__": 28 | 29 | # Define a NN module 30 | model = NN(32, 128) 31 | # Generate the input 32 | x = torch.rand((16, 32), dtype=torch.float16) - 0.5 33 | 34 | # Get the reference output 35 | with torch.no_grad(): 36 | y_ref = model(x.to(torch.float32)) 37 | 38 | # Compile the model 39 | print("Compile the model for the NPU") 40 | if sys.platform == "win32": 41 | # Windows do not support torch.compile 42 | print( 43 | "Windows do not support torch.compile, fallback to intel_npu_acceleration_library.compile" 44 | ) 45 | compiler_conf = CompilerConfig() 46 | compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf) 47 | else: 48 | compiled_model = torch.compile(model, backend="npu") 49 | 50 | # Get the NPU output 51 | with torch.no_grad(): 52 | y = compiled_model(x) 53 | 54 | print(f"Reference vs actual R2 score: {r2_score(y_ref.numpy(), y.numpy()):.2f}") 55 | -------------------------------------------------------------------------------- /examples/cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | cmake_minimum_required(VERSION 3.16) 7 | include(FetchContent) 8 | 9 | project(intel_npu_acceleration_library_example) 10 | 11 | set(CMAKE_CXX_STANDARD 14) 12 | 13 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU") 14 | add_compile_options(-march=native) 15 | endif() 16 | 17 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 18 | 19 | 20 | FetchContent_Declare( 21 | intel_npu_acceleration_library 22 | GIT_REPOSITORY "https://github.com/intel/intel-npu-acceleration-library" 23 | GIT_TAG "main" 24 | ) 25 | FetchContent_MakeAvailable(intel_npu_acceleration_library) 26 | 27 | 28 | find_package(OpenVINO REQUIRED PATHS ${openvino_SOURCE_DIR}/runtime/cmake) 29 | 30 | include_directories(${intel_npu_acceleration_library_SOURCE_DIR}/include) 31 | 32 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 33 | 34 | add_executable(intel_npu_acceleration_library_example main.cpp) 35 | 36 | target_link_libraries(intel_npu_acceleration_library_example PRIVATE openvino::runtime) 37 | -------------------------------------------------------------------------------- /examples/cpp/README.md: 
-------------------------------------------------------------------------------- 1 | 2 | # Create a custom C++ application using Intel NPU acceleration Library 3 | 4 | The example demonstrates how to create a custom C++ application using the Intel NPU acceleration Library. It showcases the usage of the library's features and functionalities for accelerating neural network inference on Intel NPUs. The provided code snippet shows the build process using CMake, where the project is configured and built in the Release configuration. 5 | 6 | ## Build 7 | 8 | To build the custom C++ application using the Intel NPU acceleration Library, follow these steps: 9 | 10 | 1. Run the following commands to configure and build the project in the Release configuration: 11 | ``` 12 | cmake -S . -B build 13 | cmake --build build --config Release 14 | ``` 15 | 2. Once the build process is complete, you can find the executable file at `build\Release\intel_npu_acceleration_library_example.exe` (on windows) 16 | 17 | Make sure you have the necessary dependencies, compiler and libraries installed before building the application. 18 | 19 | -------------------------------------------------------------------------------- /examples/cpp/main.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright © 2024 Intel Corporation 3 | // SPDX-License-Identifier: Apache 2.0 4 | // 5 | 6 | #include "intel_npu_acceleration_library/nn_factory.h" 7 | 8 | using namespace intel_npu_acceleration_library; 9 | #include 10 | 11 | int main() { 12 | const size_t batch = 128, inC = 256, outC = 512, N = 100000; 13 | 14 | std::cout << "Create a ModelFactory" << std::endl; 15 | auto factory = std::make_shared("NPU"); 16 | 17 | // create parameter 18 | auto input = factory->parameter({batch, inC}, ov::element::f16); 19 | auto weights = factory->parameter({outC, inC}, ov::element::f16); 20 | auto bias = factory->parameter({1, outC}, ov::element::f16); 21 | 22 | // create matmul 23 | auto matmul = factory->matmul(input, weights); 24 | auto matmul_bias = factory->eltwise_add(matmul, bias); 25 | factory->result(matmul_bias); 26 | 27 | // Compile the model 28 | factory->compile(); 29 | 30 | // Save OV model 31 | std::cout << "Saving model to matmul.xml" << std::endl; 32 | factory->saveModel("matmul.xml"); 33 | 34 | // Here you can create float16 buffers and run inference by using 35 | half_ptr input_buffer = new uint16_t[batch * inC]; 36 | half_ptr weights_buffer = new uint16_t[outC * inC]; 37 | half_ptr bias_buffer = new uint16_t[outC]; 38 | half_ptr output_buffer = new uint16_t[batch * outC]; 39 | 40 | memset(input_buffer, 0, batch * inC * sizeof(uint16_t)); 41 | memset(weights_buffer, 0, outC * inC * sizeof(uint16_t)); 42 | memset(output_buffer, 0, batch * outC * sizeof(uint16_t)); 43 | memset(bias_buffer, 0, outC * sizeof(uint16_t)); 44 | 45 | factory->setInputTensor(input_buffer, 0); 46 | factory->setInputTensor(weights_buffer, 1); 47 | factory->setInputTensor(bias_buffer, 2); 48 | factory->setOutputTensor(output_buffer, 0); 49 | 50 | // Run inference 51 | std::cout << "Run inference on " << N << " workloads" << std::endl; 52 | for (auto idx = 0; idx < N; idx++) 53 | factory->run(); 54 | std::cout << "Inference done" << std::endl; 55 | 56 | delete[] input_buffer; 57 | delete[] weights_buffer; 58 | delete[] bias_buffer; 59 | delete[] output_buffer; 60 | return 0; 61 | } -------------------------------------------------------------------------------- /examples/llama.py: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers import AutoTokenizer, TextStreamer 7 | from intel_npu_acceleration_library import NPUModelForCausalLM, int4 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | 10 | model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 11 | 12 | compiler_conf = CompilerConfig(dtype=int4) 13 | model = NPUModelForCausalLM.from_pretrained( 14 | model_id, use_cache=True, config=compiler_conf, attn_implementation="sdpa" 15 | ).eval() 16 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) 17 | tokenizer.pad_token_id = tokenizer.eos_token_id 18 | streamer = TextStreamer(tokenizer, skip_special_tokens=True) 19 | 20 | 21 | query = input("Ask something: ") 22 | prefix = tokenizer(query, return_tensors="pt")["input_ids"] 23 | 24 | 25 | generation_kwargs = dict( 26 | input_ids=prefix, 27 | streamer=streamer, 28 | do_sample=True, 29 | top_k=50, 30 | top_p=0.9, 31 | max_new_tokens=512, 32 | ) 33 | 34 | print("Run inference") 35 | _ = model.generate(**generation_kwargs) 36 | -------------------------------------------------------------------------------- /examples/llama3.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers import AutoTokenizer, TextStreamer 7 | from intel_npu_acceleration_library import NPUModelForCausalLM, int4 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | 10 | model_id = "meta-llama/Meta-Llama-3-8B-Instruct" 11 | 12 | compiler_conf = CompilerConfig(dtype=int4) 13 | model = NPUModelForCausalLM.from_pretrained( 14 | model_id, use_cache=True, config=compiler_conf 15 | ).eval() 16 | tokenizer = AutoTokenizer.from_pretrained(model_id) 17 | streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True) 18 | 19 | print("Run inference with Llama3 on NPU\n") 20 | 21 | 22 | query = input(">") 23 | 24 | 25 | messages = [ 26 | { 27 | "role": "system", 28 | "content": "You are an helpful chatbot that can provide information about the Intel NPU", 29 | }, 30 | {"role": "user", "content": query}, 31 | ] 32 | 33 | input_ids = tokenizer.apply_chat_template( 34 | messages, add_generation_prompt=True, return_tensors="pt" 35 | ).to(model.device) 36 | 37 | terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")] 38 | 39 | 40 | outputs = model.generate( 41 | input_ids, 42 | max_new_tokens=256, 43 | eos_token_id=terminators, 44 | do_sample=True, 45 | streamer=streamer, 46 | ) 47 | -------------------------------------------------------------------------------- /examples/llava.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | import requests 7 | from PIL import Image 8 | from transformers import ( 9 | LlavaForConditionalGeneration, 10 | AutoTokenizer, 11 | CLIPImageProcessor, 12 | TextStreamer, 13 | ) 14 | from transformers.feature_extraction_utils import BatchFeature 15 | from intel_npu_acceleration_library.compiler import CompilerConfig 16 | import intel_npu_acceleration_library 17 | import torch 18 | 19 | 20 | checkpoint = "Intel/llava-gemma-2b" 21 | 22 | # Load model 23 | model = 
LlavaForConditionalGeneration.from_pretrained(checkpoint) 24 | 25 | compiler_conf = CompilerConfig() 26 | model = intel_npu_acceleration_library.compile(model, compiler_conf) 27 | 28 | image_processor = CLIPImageProcessor.from_pretrained(checkpoint) 29 | tokenizer = AutoTokenizer.from_pretrained(checkpoint) 30 | 31 | streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True) 32 | 33 | # Prepare inputs 34 | # Use gemma chat template 35 | prompt = tokenizer.apply_chat_template( 36 | [{"role": "user", "content": "\nWhat's the content of the image?"}], 37 | tokenize=False, 38 | add_generation_prompt=True, 39 | ) 40 | text_inputs = tokenizer(prompt, return_tensors="pt") 41 | 42 | # clean the console 43 | print("\033[H\033[J") 44 | print("LLaVA Gemma Chatbot\n") 45 | print("Please provide an image URL to generate a response.\n") 46 | url = input("Image URL: ") 47 | 48 | print("Description: ", end="", flush=True) 49 | # url = "https://www.ilankelman.org/stopsigns/australia.jpg" 50 | image = Image.open(requests.get(url, stream=True).raw) 51 | 52 | pixel_values = image_processor(image, return_tensors="pt")["pixel_values"] 53 | 54 | inputs = BatchFeature(data={**text_inputs, "pixel_values": pixel_values}) 55 | 56 | # Generate 57 | model.generate(**inputs, max_new_tokens=150, streamer=streamer) 58 | -------------------------------------------------------------------------------- /examples/matmul.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend import MatMul 7 | import numpy as np 8 | 9 | 10 | def run_matmul(inC, outC, batch): 11 | 12 | # Create both inputs 13 | X1 = np.random.uniform(-1, 1, (batch, inC)).astype(np.float16) 14 | X2 = np.random.uniform(-1, 1, (outC, inC)).astype(np.float16) 15 | 16 | mm = MatMul(inC, outC, batch, profile=False) 17 | 18 | return mm.run(X1, X2) 19 | 20 | 21 | if __name__ == "__main__": 22 | result = run_matmul(128, 128, 32) 23 | print(result) 24 | -------------------------------------------------------------------------------- /examples/phi-2.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from langchain.prompts import PromptTemplate 7 | from langchain.chains import LLMChain 8 | from langchain.llms import HuggingFacePipeline 9 | from transformers import AutoTokenizer, pipeline, TextStreamer 10 | from intel_npu_acceleration_library.compiler import CompilerConfig 11 | import intel_npu_acceleration_library as npu_lib 12 | 13 | model_id = "microsoft/Phi-2" 14 | 15 | compiler_conf = CompilerConfig(dtype=npu_lib.int4) 16 | model = npu_lib.NPUModelForCausalLM.from_pretrained( 17 | model_id, use_cache=True, config=compiler_conf 18 | ).eval() 19 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) 20 | streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True) 21 | 22 | pipe = pipeline( 23 | "text-generation", 24 | model=model, 25 | tokenizer=tokenizer, 26 | max_length=256, 27 | temperature=0.9, 28 | top_p=0.95, 29 | repetition_penalty=1.2, 30 | streamer=streamer, 31 | ) 32 | 33 | local_llm = HuggingFacePipeline(pipeline=pipe) 34 | pipe.model.config.pad_token_id = pipe.model.config.eos_token_id 35 | 36 | 37 | template = """Question: {question} 38 | 39 | Answer: """ 40 | 41 | prompt = 
PromptTemplate(template=template, input_variables=["question"]) 42 | 43 | llm_chain = LLMChain(prompt=prompt, llm=local_llm) 44 | 45 | question = "What's the distance between the Earth and the Moon?" 46 | 47 | llm_chain.run(question) 48 | -------------------------------------------------------------------------------- /examples/phi-3.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | import torch 7 | from transformers import AutoTokenizer, pipeline, TextStreamer 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | import intel_npu_acceleration_library as npu_lib 10 | import warnings 11 | 12 | torch.random.manual_seed(0) 13 | 14 | compiler_conf = CompilerConfig(dtype=npu_lib.int4) 15 | model = npu_lib.NPUModelForCausalLM.from_pretrained( 16 | "microsoft/Phi-3-mini-4k-instruct", 17 | config=compiler_conf, 18 | torch_dtype="auto", 19 | ) 20 | 21 | tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") 22 | streamer = TextStreamer(tokenizer, skip_prompt=True) 23 | 24 | messages = [ 25 | { 26 | "role": "system", 27 | "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.", 28 | }, 29 | { 30 | "role": "user", 31 | "content": "Can you provide ways to eat combinations of bananas and dragonfruits?", 32 | }, 33 | ] 34 | 35 | pipe = pipeline( 36 | "text-generation", 37 | model=model, 38 | tokenizer=tokenizer, 39 | ) 40 | 41 | generation_args = { 42 | "max_new_tokens": 500, 43 | "return_full_text": False, 44 | "temperature": 0.7, 45 | "do_sample": True, 46 | "streamer": streamer, 47 | } 48 | 49 | with warnings.catch_warnings(): 50 | warnings.simplefilter("ignore") 51 | pipe(messages, **generation_args) 52 | -------------------------------------------------------------------------------- /examples/qwen2_math_7b.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers import AutoTokenizer, TextStreamer 7 | from intel_npu_acceleration_library import NPUModelForCausalLM, int8 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | import time 10 | 11 | model_id = "Qwen/Qwen2-Math-7B-Instruct" 12 | 13 | compiler_conf = CompilerConfig(dtype=int8) 14 | model = NPUModelForCausalLM.from_pretrained( 15 | model_id, use_cache=True, config=compiler_conf 16 | ).eval() 17 | tokenizer = AutoTokenizer.from_pretrained(model_id) 18 | streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True) 19 | 20 | print("Run inference with Qwen2-Math-7B on NPU\n") 21 | 22 | # sample query: Find the value of $x$ that satisfies the equation $4x+5 = 6x+7$. 
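# For reference, the algebra the model is expected to reproduce for the sample query:
#   4x + 5 = 6x + 7  =>  5 - 7 = 6x - 4x  =>  -2 = 2x  =>  x = -1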
23 | 24 | query = input(">") 25 | 26 | messages = [ 27 | { 28 | "role": "system", 29 | "content": "You are an helpful chatbot", 30 | }, 31 | {"role": "user", "content": query}, 32 | ] 33 | 34 | text = tokenizer.apply_chat_template( 35 | messages, tokenize=False, add_generation_prompt=True 36 | ) 37 | 38 | model_inputs = tokenizer([text], return_tensors="pt").to(model.device) 39 | 40 | # Measure the start time 41 | start_time = time.time() 42 | 43 | generated_ids = model.generate( 44 | **model_inputs, 45 | max_new_tokens=512, 46 | do_sample=True, 47 | temperature=0.01, 48 | streamer=streamer, 49 | ) 50 | 51 | generated_ids = [ 52 | output_ids[len(input_ids) :] 53 | for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) 54 | ] 55 | 56 | # Calculate the total number of generated tokens 57 | num_tokens_generated = sum(len(tokens) for tokens in generated_ids) 58 | 59 | response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] 60 | 61 | # Measure the end time 62 | end_time = time.time() 63 | 64 | # Calculate the number of tokens generated 65 | num_tokens_generated = sum(len(tokens) for tokens in generated_ids) 66 | 67 | # Calculate the tokens per second 68 | time_taken = end_time - start_time 69 | print("Total generated tokens:", num_tokens_generated) 70 | print("Total Time taken:", time_taken) 71 | 72 | tokens_per_second = num_tokens_generated / time_taken 73 | 74 | # Print the tokens per second 75 | print(f"Tokens per second: {tokens_per_second:.2f}") 76 | -------------------------------------------------------------------------------- /examples/t5.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers import AutoTokenizer, TextStreamer 7 | from intel_npu_acceleration_library import NPUModelForSeq2SeqLM 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | 10 | model_id = "google/flan-t5-small" 11 | 12 | compiler_conf = CompilerConfig() 13 | model = NPUModelForSeq2SeqLM.from_pretrained( 14 | model_id, use_cache=True, config=compiler_conf 15 | ).eval() 16 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) 17 | tokenizer.pad_token_id = tokenizer.eos_token_id 18 | streamer = TextStreamer(tokenizer, skip_special_tokens=True) 19 | 20 | query = input("Ask something: ") 21 | prefix = tokenizer(query, return_tensors="pt")["input_ids"] 22 | 23 | 24 | generation_kwargs = dict( 25 | input_ids=prefix, 26 | streamer=streamer, 27 | do_sample=True, 28 | top_k=50, 29 | top_p=0.9, 30 | max_new_tokens=512, 31 | ) 32 | 33 | print("Run inference") 34 | _ = model.generate(**generation_kwargs) 35 | -------------------------------------------------------------------------------- /examples/tiny_llama_chat.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers import pipeline, TextStreamer, set_seed 7 | from intel_npu_acceleration_library.compiler import CompilerConfig 8 | import intel_npu_acceleration_library 9 | import torch 10 | import os 11 | 12 | model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 13 | 14 | print("Loading the model...") 15 | pipe = pipeline( 16 | "text-generation", model=model_id, torch_dtype=torch.bfloat16, device_map="auto" 17 | ) 18 | print("Compiling the model for NPU...") 19 | compiler_conf = 
CompilerConfig(dtype=torch.int8) 20 | pipe.model = intel_npu_acceleration_library.compile(pipe.model, compiler_conf) 21 | 22 | streamer = TextStreamer(pipe.tokenizer, skip_special_tokens=True, skip_prompt=True) 23 | 24 | set_seed(42) 25 | 26 | 27 | messages = [ 28 | { 29 | "role": "system", 30 | "content": "You are a friendly chatbot. You can ask me anything.", 31 | }, 32 | ] 33 | 34 | print("NPU Chatbot is ready! Please ask a question. Type 'exit' to quit.") 35 | while True: 36 | query = input("User: ") 37 | if query.lower() == "exit": 38 | break 39 | messages.append({"role": "user", "content": query}) 40 | 41 | prompt = pipe.tokenizer.apply_chat_template( 42 | messages, tokenize=False, add_generation_prompt=True 43 | ) 44 | print("Assistant: ", end="", flush=True) 45 | out = pipe( 46 | prompt, 47 | max_new_tokens=512, 48 | do_sample=True, 49 | temperature=0.7, 50 | top_k=50, 51 | top_p=0.95, 52 | streamer=streamer, 53 | ) 54 | 55 | reply = out[0]["generated_text"].split("<|assistant|>")[-1].strip() 56 | messages.append({"role": "assistant", "content": reply}) 57 | -------------------------------------------------------------------------------- /examples/train_mnist.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | 7 | import torch 8 | from torch import nn 9 | import intel_npu_acceleration_library 10 | from intel_npu_acceleration_library.compiler import CompilerConfig 11 | from torch.utils.data import DataLoader 12 | from torchvision import datasets 13 | from torchvision.transforms import ToTensor 14 | import numpy as np 15 | 16 | # Set random seeds 17 | np.random.seed(0) 18 | torch.manual_seed(0) 19 | 20 | training_data = datasets.FashionMNIST( 21 | root="data", train=True, download=True, transform=ToTensor() 22 | ) 23 | 24 | test_data = datasets.FashionMNIST( 25 | root="data", train=False, download=True, transform=ToTensor() 26 | ) 27 | 28 | train_dataloader = DataLoader(training_data, batch_size=64) 29 | test_dataloader = DataLoader(test_data, batch_size=64) 30 | 31 | 32 | class NeuralNetwork(nn.Module): 33 | def __init__(self): 34 | super().__init__() 35 | self.flatten = nn.Flatten() 36 | self.linear_relu_stack = nn.Sequential( 37 | nn.Linear(28 * 28, 512), 38 | nn.ReLU(), 39 | nn.Linear(512, 512), 40 | nn.ReLU(), 41 | nn.Linear(512, 10), 42 | ) 43 | 44 | def forward(self, x): 45 | x = self.flatten(x) 46 | logits = self.linear_relu_stack(x) 47 | return logits 48 | 49 | 50 | def train_loop(dataloader, model, loss_fn, optimizer): 51 | size = len(dataloader.dataset) 52 | # Set the model to training mode - important for batch normalization and dropout layers 53 | # Unnecessary in this situation but added for best practices 54 | model.train() 55 | for batch, (X, y) in enumerate(dataloader): 56 | # Compute prediction and loss 57 | pred = model(X) 58 | loss = loss_fn(pred, y) 59 | 60 | # Backpropagation 61 | loss.backward() 62 | optimizer.step() 63 | optimizer.zero_grad() 64 | 65 | if batch % 100 == 0: 66 | loss, current = loss.item(), (batch + 1) * len(X) 67 | print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]") 68 | 69 | 70 | def test_loop(dataloader, model, loss_fn): 71 | # Set the model to evaluation mode - important for batch normalization and dropout layers 72 | # Unnecessary in this situation but added for best practices 73 | model.eval() 74 | size = len(dataloader.dataset) 75 | num_batches = len(dataloader) 76 | test_loss, correct = 0, 0 77 | 78 | 
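    # correct accumulates how many predictions match the labels; dividing by the
    # dataset size after the loop turns it into an accuracy figure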
# Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode 79 | # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True 80 | with torch.no_grad(): 81 | for X, y in dataloader: 82 | pred = model(X) 83 | test_loss += loss_fn(pred, y).item() 84 | correct += (pred.argmax(1) == y).type(torch.float).sum().item() 85 | 86 | test_loss /= num_batches 87 | correct /= size 88 | print( 89 | f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n" 90 | ) 91 | 92 | 93 | model = NeuralNetwork() 94 | compiler_conf = CompilerConfig(dtype=torch.float32, training=True) 95 | model = intel_npu_acceleration_library.compile(model, compiler_conf) 96 | 97 | learning_rate = 1e-3 98 | batch_size = 64 99 | 100 | # Initialize the loss function 101 | loss_fn = nn.CrossEntropyLoss() 102 | 103 | optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) 104 | 105 | epochs = 10 106 | for t in range(epochs): 107 | print(f"Epoch {t+1}\n-------------------------------") 108 | train_loop(train_dataloader, model, loss_fn, optimizer) 109 | test_loop(test_dataloader, model, loss_fn) 110 | print("Done!") 111 | -------------------------------------------------------------------------------- /include/intel_npu_acceleration_library/common.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright © 2024 Intel Corporation 3 | // SPDX-License-Identifier: Apache 2.0 4 | // 5 | 6 | #pragma once 7 | #include "openvino/openvino.hpp" 8 | #include "openvino/opsets/opset1.hpp" 9 | #include "openvino/opsets/opset13.hpp" 10 | #include "openvino/opsets/opset4.hpp" 11 | #include "openvino/opsets/opset5.hpp" 12 | #include "openvino/opsets/opset6.hpp" 13 | #include "openvino/opsets/opset7.hpp" 14 | #include "openvino/opsets/opset8.hpp" 15 | #include "openvino/opsets/opset9.hpp" 16 | #include "openvino/runtime/intel_npu/properties.hpp" 17 | 18 | #if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) 19 | #define intel_npu_acceleration_library_DLL_API __attribute__((visibility("default"))) 20 | #elif defined(_MSC_VER) 21 | #define intel_npu_acceleration_library_DLL_API __declspec(dllexport) 22 | #endif 23 | 24 | namespace intel_npu_acceleration_library { 25 | 26 | static constexpr ov::Property npu_compiler_type{"NPU_COMPILER_TYPE"}; 27 | static constexpr ov::Property npu_parameters{"NPU_COMPILATION_MODE_PARAMS"}; 28 | 29 | /** 30 | * @brief Return true if the NPU is available on the system, otherwise return false 31 | * 32 | * @param core ov::Cor object 33 | * @return true NPU AI accelerator is available 34 | * @return false NPU AI accelerator is not available 35 | */ 36 | bool _isNPUAvailable(ov::Core& core) { 37 | std::vector availableDevices = core.get_available_devices(); 38 | return std::find(availableDevices.begin(), availableDevices.end(), "NPU") != availableDevices.end(); 39 | } 40 | 41 | uint32_t driver_version(ov::Core& core) { 42 | return static_cast(core.get_property("NPU", ov::intel_npu::driver_version)); 43 | } 44 | 45 | ov::element::Type_t dtype_from_string(const std::string& dtype) { 46 | if (dtype == "int8" || dtype == "i8") { 47 | return ov::element::Type_t::i8; 48 | } else if (dtype == "int4" || dtype == "i4") { 49 | return ov::element::Type_t::i4; 50 | } else if (dtype == "int16" || dtype == "i16") { 51 | return ov::element::Type_t::i16; 52 | } else if (dtype == "int32" || dtype == "i32") { 53 | return ov::element::Type_t::i32; 54 | } else if 
(dtype == "int64" || dtype == "i64") { 55 | return ov::element::Type_t::i64; 56 | } 57 | if (dtype == "float16" || dtype == "half" || dtype == "f16") { 58 | return ov::element::Type_t::f16; 59 | } 60 | if (dtype == "float32" || dtype == "f32") { 61 | return ov::element::Type_t::f32; 62 | } 63 | if (dtype == "float64" || dtype == "f64") { 64 | return ov::element::Type_t::f64; 65 | } 66 | if (dtype == "bfloat16" || dtype == "bf16") { 67 | return ov::element::Type_t::bf16; 68 | } else { 69 | throw std::invalid_argument("Unsupported datatype: " + dtype); 70 | } 71 | } 72 | 73 | } // namespace intel_npu_acceleration_library 74 | 75 | // Define half pointer as uint16_t pointer datatype 76 | #define half_ptr uint16_t* -------------------------------------------------------------------------------- /include/intel_npu_acceleration_library/conversion.h: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright © 2024 Intel Corporation 3 | // SPDX-License-Identifier: Apache 2.0 4 | // 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "intel_npu_acceleration_library/common.h" 13 | 14 | namespace intel_npu_acceleration_library { 15 | 16 | /** 17 | * @brief Compress a int8 vector to I4 format. 18 | * 19 | * @param src pointer to the source int8 buffer 20 | * @param dst pointer to the destination uint8 buffer 21 | * @param size size of the src and dst buffers 22 | */ 23 | void compressToI4(const int8_t* src, uint8_t* dst, size_t size) { 24 | for (size_t i = 0; i < size / 2; i++) { 25 | dst[i] = (src[2 * i] & 0x0F) | ((src[2 * i + 1] & 0x0F) << 4); 26 | } 27 | } 28 | 29 | /** 30 | * @brief Convert a int8 vector to fp16 given a scalar scale. 31 | * 32 | * @param src pointer to the source int8 buffer 33 | * @param scale Float scale 34 | * @param dst pointer to the destination float16 buffer 35 | * @param size size of the src and dst buffers 36 | */ 37 | void vector_to_fp16(const int8_t* src, float scale, half_ptr dst, size_t size) { 38 | constexpr size_t VEC_SIZE = 8; // Use AVX2: process 8 values per loop iteration for 32-bit floats 39 | __m256 scale_vec = _mm256_set1_ps(scale); // Broadcast scale 40 | 41 | for (size_t idx = 0; idx < size; idx += VEC_SIZE) { 42 | // Load int8_t and extend to int32_t for conversion 43 | __m128i input_8 = _mm_loadl_epi64((__m128i const*)(src + idx)); // Load 8 int8_t values 44 | __m256i input_32 = _mm256_cvtepi8_epi32(input_8); // Extend to 32-bit integers 45 | 46 | // Convert integers to float and apply scaling 47 | __m256 float_vec = _mm256_mul_ps(_mm256_cvtepi32_ps(input_32), scale_vec); 48 | 49 | // Convert float to fp16 50 | __m128i fp16_vec = _mm256_cvtps_ph(float_vec, _MM_FROUND_TO_NEAREST_INT); 51 | 52 | // Store the result 53 | _mm_store_si128((__m128i*)(dst + idx), fp16_vec); 54 | } 55 | } 56 | 57 | /** 58 | * @brief Convert a int8 array to fp16 given a per output channel scale vector. 
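 * Per element this computes output[c][i] = fp16(scale[c] * (float)input[c][i]), i.e. a
 * per-output-channel dequantization of the int8 weights into half precision.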
59 | * 60 | * @param input pointer to the source int8 buffer of shape [output_channels, input_channels] 61 | * @param scale pointer of a float scale vector of shape [output_channels] 62 | * @param output dst pointer to the destination float16 buffer of shape [output_channels, input_channels] 63 | * @param input_channels number of input channels 64 | * @param output_channels number of output channels 65 | */ 66 | void array_to_fp16_worker(const int8_t* input, float* scale, half_ptr output, size_t input_channels, 67 | size_t output_channels) { 68 | for (size_t idx = 0; idx < output_channels; idx++) { 69 | vector_to_fp16(input + idx * input_channels, scale[idx], output + idx * input_channels, input_channels); 70 | } 71 | } 72 | 73 | /** 74 | * @brief Convert a int8 array to fp16 given a per output channel scale vector. 75 | * 76 | * @param input pointer to the source int8 buffer of shape [output_channels, input_channels] 77 | * @param scale pointer of a float scale vector of shape [output_channels] 78 | * @param output dst pointer to the destination float16 buffer of shape [output_channels, input_channels] 79 | * @param input_channels number of input channels 80 | * @param output_channels number of output channels 81 | * @param num_threads number of parallel threads to use 82 | */ 83 | void to_fp16(const int8_t* input, float* scale, half_ptr output, size_t input_channels, size_t output_channels, 84 | unsigned int num_threads) { 85 | std::vector threads; 86 | 87 | // Calculate chunk size per thread 88 | size_t channels_per_thread = (output_channels + num_threads - 1) / num_threads; // Ceiling division 89 | 90 | for (unsigned int i = 0; i < num_threads; ++i) { 91 | size_t start_channel = i * channels_per_thread; 92 | size_t end_channel = std::min((i + 1) * channels_per_thread, output_channels); 93 | 94 | if (start_channel < output_channels) { 95 | threads.emplace_back(array_to_fp16_worker, input + start_channel * input_channels, scale + start_channel, 96 | output + start_channel * input_channels, input_channels, end_channel - start_channel); 97 | } 98 | } 99 | 100 | // Join threads 101 | for (auto& t : threads) { 102 | if (t.joinable()) { 103 | t.join(); 104 | } 105 | } 106 | } 107 | } // namespace intel_npu_acceleration_library 108 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from .compiler import compile 7 | from .dtypes import int4, int8, float16 8 | from ._version import __version__ 9 | from .modelling import NPUModel, NPUAutoModel, NPUModelForCausalLM, NPUModelForSeq2SeqLM 10 | from .device import enable_npu_device 11 | 12 | enable_npu_device() 13 | 14 | __all__ = [ 15 | "compile", 16 | "int4", 17 | "int8", 18 | "float16", 19 | "__version__", 20 | "NPUModel", 21 | "NPUAutoModel", 22 | "NPUModelForCausalLM", 23 | "NPUModelForSeq2SeqLM", 24 | ] 25 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/_version.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | __version__ = "v1.4.0" 7 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/__init__.py: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | from .bindings import lib 6 | from .utils import npu_available, get_driver_version, check_npu_and_driver_version 7 | from .mlp import MLP 8 | from .convolution import Convolution 9 | from .matmul import MatMul 10 | from .linear import Linear 11 | from .qmatmul import QMatMul 12 | from .qlinear import QLinear 13 | from .tensor import Tensor 14 | from .factory import NNFactory 15 | from .sdpa import SDPA, SimpleSDPA 16 | from .runtime import run_matmul, run_factory, clear_cache 17 | 18 | check_npu_and_driver_version() 19 | 20 | __all__ = [ 21 | "Tensor", 22 | "NNFactory", 23 | "MLP", 24 | "MatMul", 25 | "Linear", 26 | "QMatMul", 27 | "QLinear", 28 | "Convolution", 29 | "SDPA", 30 | "SimpleSDPA", 31 | "run_matmul", 32 | "run_factory", 33 | "clear_cache", 34 | "npu_available", 35 | "get_driver_version", 36 | "lib", 37 | ] 38 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/compression.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend.bindings import lib as backend_lib 7 | import numpy as np 8 | 9 | 10 | def compress_to_i4(weights: np.ndarray) -> np.ndarray: 11 | """Compress a int8 array to int4. 12 | 13 | Args: 14 | weights (np.ndarray): input array 15 | 16 | Returns: 17 | np.ndarray: compressed array 18 | """ 19 | compressed_weights = np.zeros( 20 | (weights.shape[0], weights.shape[1] // 2), dtype=np.uint8 21 | ) 22 | 23 | backend_lib.compressToI4(weights, compressed_weights, np.prod(weights.shape)) 24 | return compressed_weights 25 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/convolution.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend.factory import NNFactory 7 | from typing import Sequence, Union 8 | import numpy as np 9 | 10 | 11 | class Convolution(NNFactory): 12 | """Linear class, computing a matrix matrix multiplication with weights prefetching.""" 13 | 14 | def __init__( 15 | self, 16 | input_shape: Sequence[int], 17 | weights_shape: Sequence[int], 18 | bias: bool = False, 19 | strides: Union[int, Sequence[int]] = 1, 20 | padding: Union[int, Sequence[int]] = 0, 21 | dilation: Union[int, Sequence[int]] = 1, 22 | groups: int = 1, 23 | profile: bool = False, 24 | device: str = "NPU", 25 | ): 26 | """Initialize the Linear class. 27 | 28 | Args: 29 | input_shape (Sequence[int]): input shape 30 | weights_shape (Sequence[int]): weights shape 31 | bias (bool): Enable/Disable bias. Defaults to False. 32 | strides (Union[int, Sequence[int]], optional): Strides. Defaults to 1. 33 | padding (Union[int, Sequence[int]], optional): Padding. Defaults to 0. 34 | dilation (Union[int, Sequence[int]], optional): Dilation. Defaults to 1. 35 | groups (int, optional): Groups. Defaults to 1. 36 | profile (Optional[bool], optional): Enable/Disable profiling. Defaults to False. 37 | device (str): Target device, default to "NPU". 
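
        Example (illustrative only; an NCHW input and OIHW weight layout are assumed):
            a 3x3 convolution taking 3 input channels to 16 output channels on a
            1x3x32x32 input can be built with
            ``Convolution(input_shape=(1, 3, 32, 32), weights_shape=(16, 3, 3, 3), bias=False)``.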
38 | """ 39 | super().__init__(profile, device) 40 | input = self.parameter(input_shape) 41 | weights = self.parameter(weights_shape) 42 | if bias is not None: 43 | bias_node = self.parameter((1, weights_shape[0], 1, 1)) 44 | else: 45 | bias_node = None 46 | 47 | _ = self.convolution( 48 | input, 49 | weights, 50 | bias=bias_node, 51 | strides=strides, 52 | padding=padding, 53 | dilation=dilation, 54 | groups=groups, 55 | act_dtype=np.float16, 56 | ) 57 | 58 | self.compile() 59 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/linear.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend.factory import NNFactory 7 | import numpy as np 8 | 9 | 10 | class Linear(NNFactory): 11 | """Linear class, computing a matrix matrix multiplication with weights prefetching.""" 12 | 13 | def __init__( 14 | self, 15 | inC: int, 16 | outC: int, 17 | batch: int, 18 | profile: bool = False, 19 | device: str = "NPU", 20 | ): 21 | """Initialize the Linear class. 22 | 23 | Args: 24 | inC (int): input channels 25 | outC (int): output channels 26 | batch (int): batch 27 | profile (bool): Enable/Disable profiling. Defaults to False. 28 | device (str): Target device, default to "NPU". 29 | """ 30 | super().__init__(profile, device) 31 | self.inC, self.outC = inC, outC 32 | self.batch = batch 33 | input = self.parameter((self.batch, self.inC)) 34 | _ = self.linear(input, outC, inC, bias=False) 35 | self.compile() 36 | 37 | def run(self, X: np.ndarray, W: np.ndarray, op_id: str) -> np.ndarray: 38 | """Run the layer: X * W^T. 39 | 40 | Args: 41 | X (np.ndarray): lhs operator 42 | W (np.ndarray): rhs operator 43 | op_id (str): operation id 44 | 45 | Raises: 46 | RuntimeError: Input or weight tensor shape mismatch 47 | 48 | Returns: 49 | np.ndarray: result 50 | """ 51 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 52 | raise RuntimeError( 53 | f"Input shape {X.shape} different from expected one {(self.batch, self.inC)}" 54 | ) 55 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 56 | raise RuntimeError( 57 | f"Weight shape {W.shape} different from expected one {(self.outC, self.inC)}" 58 | ) 59 | 60 | return super().run(X, W, op_id=op_id) 61 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/matmul.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend.factory import NNFactory 7 | import numpy as np 8 | 9 | 10 | class MatMul(NNFactory): 11 | """MatMul class, computing a matrix matrix multiplication.""" 12 | 13 | def __init__( 14 | self, 15 | inC: int, 16 | outC: int, 17 | batch: int, 18 | profile: bool = False, 19 | device: str = "NPU", 20 | ): 21 | """Initialize the MatMul class. 22 | 23 | Args: 24 | inC (int): input channels 25 | outC (int): output channels 26 | batch (int): batch 27 | profile (bool): Enable/Disable profiling. Defaults to False. 28 | device (str): Target device, default to "NPU". 
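
        Example (illustrative sizes, mirroring examples/matmul.py):
            ``mm = MatMul(inC=128, outC=128, batch=32)`` then ``mm.run(X1, X2)`` with
            ``X1`` of shape (32, 128) and ``X2`` of shape (128, 128), both ``np.float16``.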
29 | """ 30 | super().__init__(profile, device) 31 | self.inC, self.outC = inC, outC 32 | self.batch = batch 33 | input = self.parameter((self.batch, self.inC)) 34 | _ = self.linear(input, outC, inC, bias=False) 35 | self.compile() 36 | 37 | def run(self, X: np.ndarray, W: np.ndarray) -> np.ndarray: 38 | """Run the layer: X * W^T. 39 | 40 | Args: 41 | X (np.ndarray): lhs operator 42 | W (np.ndarray): rhs operator 43 | 44 | Raises: 45 | RuntimeError: Input or weight tensor shape mismatch 46 | 47 | Returns: 48 | np.ndarray: result 49 | """ 50 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 51 | raise RuntimeError( 52 | f"Input shape {X.shape} different from expected one {(self.batch, self.inC)}" 53 | ) 54 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 55 | raise RuntimeError( 56 | f"Weight shape {W.shape} different from expected one {(self.outC, self.inC)}" 57 | ) 58 | 59 | return super().run(X, W) 60 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/mlp.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend.factory import NNFactory 7 | from typing import Optional, Sequence 8 | 9 | 10 | class MLP(NNFactory): 11 | """Linear class, computing a matrix matrix multiplication with weights prefetching.""" 12 | 13 | def __init__( 14 | self, 15 | input_shape: Sequence[int], 16 | intermediate_size: int, 17 | activation: str = "swiglu", 18 | bias: Optional[bool] = False, 19 | profile: bool = False, 20 | device: str = "NPU", 21 | **additional_args 22 | ): 23 | """Initialize the Linear class. 24 | 25 | Args: 26 | input_shape (Sequence[int]): input shape channels 27 | intermediate_size (int): intermediate_size 28 | activation (str): activation function to use 29 | bias (Optional[bool], optional): Enable/Disable bias. Defaults to False. 30 | profile (bool): Enable/Disable profiling. Defaults to False. 31 | device (str): Target device, default to "NPU". 
32 | additional_args: additional arguments 33 | """ 34 | super().__init__(profile, device) 35 | self.intermediate_size = intermediate_size 36 | self.batch, self.hidden_size = input_shape 37 | input = self.parameter((self.batch, self.hidden_size)) 38 | 39 | mm1 = self.linear(input, self.intermediate_size, self.hidden_size, bias=bias) 40 | 41 | if activation == "swiglu": 42 | mm2 = self.linear(input, self.intermediate_size, self.hidden_size, bias=bias) # type: ignore[attr-defined] 43 | mm1 = self.eltwise_mul(self.swish(mm1), mm2) # type: ignore[attr-defined] 44 | elif activation == "clamp": 45 | atc_fn = getattr(self, activation) 46 | mm1 = atc_fn(mm1, additional_args.get("min"), additional_args.get("max")) 47 | elif activation == "elu": 48 | atc_fn = getattr(self, activation) 49 | mm1 = atc_fn(mm1, additional_args.get("alpha", 1.0)) 50 | elif activation == "grn": 51 | atc_fn = getattr(self, activation) 52 | mm1 = atc_fn(mm1, additional_args.get("grn_bias")) 53 | else: 54 | atc_fn = getattr(self, activation) 55 | mm1 = atc_fn(mm1) 56 | 57 | _ = self.linear(mm1, self.hidden_size, self.intermediate_size, bias=bias) 58 | self.compile() 59 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/ops.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from dataclasses import dataclass 7 | from functools import lru_cache 8 | from typing import List, Any, Sequence 9 | import ctypes 10 | 11 | 12 | @dataclass(frozen=True) 13 | class SupportedOp: 14 | """A class for supported runtime OPs in the NPU. 15 | 16 | Attrs: 17 | name (str): Operation name 18 | inputs (int): Number of inputs 19 | parameters (Sequence[Any]): Optional parameters type. 20 | """ 21 | 22 | name: str 23 | inputs: int 24 | parameters: Sequence[Any] = () 25 | 26 | 27 | @lru_cache(maxsize=None) 28 | def get_supported_ops() -> List[SupportedOp]: 29 | """Generate a list fo supported operations. 
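
    Each entry describes one operation exposed by the C++ backend: the binding name, the
    number of tensor inputs it takes, and the ctypes types of any extra scalar parameters
    that must be passed alongside the tensors.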
30 | 31 | Returns: 32 | List[SupportedOp]: list fo supported NPU operations 33 | """ 34 | supported_ops = [ 35 | SupportedOp(name="result", inputs=1), 36 | SupportedOp(name="matmul", inputs=2, parameters=[ctypes.c_bool, ctypes.c_bool]), 37 | SupportedOp(name="eltwise_add", inputs=2), 38 | SupportedOp(name="eltwise_mul", inputs=2), 39 | SupportedOp(name="eltwise_div", inputs=2), 40 | SupportedOp(name="abs_act", inputs=1), 41 | SupportedOp(name="acos_act", inputs=1), 42 | SupportedOp(name="asin_act", inputs=1), 43 | SupportedOp(name="atan_act", inputs=1), 44 | SupportedOp(name="ceiling", inputs=1), 45 | SupportedOp( 46 | name="clamp", inputs=1, parameters=[ctypes.c_float, ctypes.c_float] 47 | ), 48 | SupportedOp(name="cos_act", inputs=1), 49 | SupportedOp(name="cosh_act", inputs=1), 50 | SupportedOp(name="erf_act", inputs=1), 51 | SupportedOp(name="elu", inputs=1, parameters=[ctypes.c_float]), 52 | SupportedOp(name="exp_act", inputs=1), 53 | SupportedOp(name="floor_act", inputs=1), 54 | SupportedOp(name="grn", inputs=1, parameters=[ctypes.c_float]), 55 | SupportedOp(name="gelu", inputs=1), 56 | SupportedOp(name="gelu_erf", inputs=1), 57 | SupportedOp(name="log_act", inputs=1), 58 | SupportedOp(name="negative", inputs=1), 59 | SupportedOp(name="relu", inputs=1), 60 | SupportedOp(name="prelu", inputs=2), 61 | SupportedOp(name="sigmoid", inputs=1), 62 | SupportedOp(name="sign", inputs=1), 63 | SupportedOp(name="sin_act", inputs=1), 64 | SupportedOp(name="sinh_act", inputs=1), 65 | SupportedOp(name="sqrt_act", inputs=1), 66 | SupportedOp(name="tan_act", inputs=1), 67 | SupportedOp(name="tanh_act", inputs=1), 68 | SupportedOp(name="acosh_act", inputs=1), 69 | SupportedOp(name="asinh_act", inputs=1), 70 | SupportedOp(name="atanh_act", inputs=1), 71 | SupportedOp(name="hswish", inputs=1), 72 | SupportedOp(name="mish", inputs=1), 73 | SupportedOp(name="softplus", inputs=1), 74 | SupportedOp(name="hsigmoid", inputs=1), 75 | SupportedOp(name="round_act", inputs=1), 76 | SupportedOp(name="softsign", inputs=1), 77 | SupportedOp(name="softmax", inputs=1, parameters=[ctypes.c_int]), 78 | SupportedOp(name="swish", inputs=1), 79 | SupportedOp(name="convert_to_fp16", inputs=1), 80 | SupportedOp( 81 | name="scaled_dot_product_attention", 82 | inputs=4, 83 | parameters=[ctypes.c_bool], 84 | ), 85 | SupportedOp( 86 | name="scaled_dot_product_attention_simple", 87 | inputs=3, 88 | parameters=[ctypes.c_bool], 89 | ), 90 | SupportedOp( 91 | name="normL2", 92 | inputs=2, 93 | parameters=[ctypes.c_float], 94 | ), 95 | SupportedOp( 96 | name="gather", 97 | inputs=3, 98 | parameters=[ctypes.c_int], 99 | ), 100 | SupportedOp(name="reshape", inputs=2), 101 | SupportedOp(name="transpose", inputs=2), 102 | SupportedOp(name="squeeze", inputs=1), 103 | SupportedOp(name="unsqueeze", inputs=2), 104 | SupportedOp( 105 | name="concat", 106 | inputs=2, 107 | parameters=[ctypes.c_int64], 108 | ), 109 | SupportedOp( 110 | name="reduce_max", 111 | inputs=2, 112 | parameters=[ctypes.c_bool], 113 | ), 114 | SupportedOp( 115 | name="reduce_mean", 116 | inputs=2, 117 | parameters=[ctypes.c_bool], 118 | ), 119 | SupportedOp( 120 | name="reduce_min", 121 | inputs=2, 122 | parameters=[ctypes.c_bool], 123 | ), 124 | SupportedOp( 125 | name="reduce_prod", 126 | inputs=2, 127 | parameters=[ctypes.c_bool], 128 | ), 129 | SupportedOp( 130 | name="reduce_sum", 131 | inputs=2, 132 | parameters=[ctypes.c_bool], 133 | ), 134 | SupportedOp(name="adaptive_avg_pool", inputs=2), 135 | SupportedOp(name="adaptive_max_pool", inputs=2), 136 | 
SupportedOp(name="power", inputs=2), 137 | SupportedOp(name="log_softmax", inputs=1, parameters=[ctypes.c_int64]), 138 | ] 139 | return supported_ops 140 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/qlinear.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend.factory import NNFactory 7 | import numpy as np 8 | 9 | 10 | class QLinear(NNFactory): 11 | """Quantized Linear class, computing a matrix matrix multiplication with weights prefetching.""" 12 | 13 | def __init__( 14 | self, 15 | inC: int, 16 | outC: int, 17 | batch: int, 18 | profile: bool = False, 19 | device: str = "NPU", 20 | dtype: np.dtype = np.int8, 21 | ): 22 | """Initialize the QLinear class. 23 | 24 | Args: 25 | inC (int): input channels 26 | outC (int): output channels 27 | batch (int): batch 28 | profile (bool): Enable/Disable profiling. Defaults to False. 29 | device (str): Target device, default to "NPU". 30 | dtype (np.dtype): weights datatype. Defaults to np.int8. 31 | 32 | """ 33 | super().__init__(profile, device) 34 | self.inC, self.outC = inC, outC 35 | self.batch = batch 36 | 37 | input = self.parameter((self.batch, self.inC)) 38 | _ = self.linear(input, outC, inC, bias=False, wt_dtype=dtype) 39 | self.compile() 40 | 41 | def run( 42 | self, X: np.ndarray, W: np.ndarray, scale: np.ndarray, op_id: str 43 | ) -> np.ndarray: 44 | """Run the layer: $X * (W * S)^T$ . 45 | 46 | Args: 47 | X (np.ndarray): activation 48 | W (np.ndarray): quantized weights 49 | scale (np.ndarray): quantization scale 50 | op_id (str): operation id 51 | 52 | Raises: 53 | RuntimeError: Input, weights or scale shape mismatch 54 | 55 | Returns: 56 | np.ndarray: result 57 | """ 58 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 59 | raise RuntimeError( 60 | f"Input shape {X.shape} different from expected one {(self.batch, self.inC)}" 61 | ) 62 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 63 | raise RuntimeError( 64 | f"Weight shape {W.shape} different from expected one {(self.outC, self.inC)}" 65 | ) 66 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 67 | raise RuntimeError( 68 | f"Scale shape {W.shape} different from expected one {(self.outC, 1)}" 69 | ) 70 | 71 | return super().run(X, (W, scale), op_id=op_id) 72 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/qmatmul.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend.factory import NNFactory 7 | import numpy as np 8 | 9 | 10 | class QMatMul(NNFactory): 11 | """Quantized Linear class, computing a matrix matrix multiplication.""" 12 | 13 | def __init__( 14 | self, 15 | inC: int, 16 | outC: int, 17 | batch: int, 18 | profile: bool = False, 19 | device: str = "NPU", 20 | dtype: np.dtype = np.int8, 21 | ): 22 | """Initialize the QMatmul class. 23 | 24 | Args: 25 | inC (int): input channels 26 | outC (int): output channels 27 | batch (int): batch 28 | profile (bool): Enable/Disable profiling. Defaults to False. 29 | device (str): Target device, default to "NPU". 30 | dtype (np.dtype): weights datatype. Defaults to np.int8. 
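
        The weights are expected to be already quantized; at run time they are dequantized
        with a per-output-channel scale, so the kernel effectively computes
        X @ (W * scale).T (see ``run``).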
31 | """ 32 | super().__init__(profile, device) 33 | self.inC, self.outC = inC, outC 34 | self.batch = batch 35 | input = self.parameter((self.batch, self.inC)) 36 | _ = self.linear(input, outC, inC, bias=False, wt_dtype=dtype) 37 | self.compile() 38 | 39 | def run(self, X: np.ndarray, W: np.ndarray, scale: np.ndarray) -> np.ndarray: 40 | """Run the layer: X * (W * S)^T. 41 | 42 | Args: 43 | X (np.ndarray): activation 44 | W (np.ndarray): quantized weights 45 | scale (np.ndarray): quantization scale 46 | 47 | Raises: 48 | RuntimeError: Input, weights or scale shape mismatch 49 | 50 | Returns: 51 | np.ndarray: result 52 | """ 53 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 54 | raise RuntimeError( 55 | f"Input shape {X.shape} different from expected one {(self.batch, self.inC)}" 56 | ) 57 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 58 | raise RuntimeError( 59 | f"Weight shape {W.shape} different from expected one {(self.outC, self.inC)}" 60 | ) 61 | if not (X.shape[0] == self.batch and X.shape[1] == self.inC): 62 | raise RuntimeError( 63 | f"Scale shape {W.shape} different from expected one {(self.outC, 1)}" 64 | ) 65 | 66 | return super().run(X, (W, scale)) 67 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/sdpa.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | from intel_npu_acceleration_library.backend.factory import NNFactory 6 | from typing import Tuple 7 | import numpy as np 8 | 9 | 10 | class SDPA(NNFactory): 11 | """Implementation of a ScaledDotProductAttention NPU operation.""" 12 | 13 | def __init__( 14 | self, 15 | query_shapes: Tuple[int, int], 16 | key_shapes: Tuple[int, int], 17 | value_shapes: Tuple[int, int], 18 | mask_shapes: Tuple[int, int], 19 | is_causal: bool = False, 20 | profile: bool = False, 21 | device: str = "NPU", 22 | ): 23 | """Initialize the SDPA. 24 | 25 | Args: 26 | query_shapes (Tuple[int, int]): shape of the query tensor 27 | key_shapes (Tuple[int, int]): shape of the key tensor 28 | value_shapes (Tuple[int, int]): shape of the value tensor 29 | mask_shapes (Tuple[int, int]): shape of the mask tensor 30 | is_causal (bool, optional): If the SDPA mask is is_causal or not. Defaults to False. 31 | profile (bool, optional): Enable/Disable profiling. Defaults to False. 32 | device (str, optional): Target device, default to "NPU". 33 | """ 34 | super().__init__(profile, device) 35 | 36 | self.query = self.parameter(query_shapes) 37 | self.key = self.parameter(key_shapes) 38 | self.value = self.parameter(value_shapes) 39 | self.mask = self.parameter(mask_shapes) 40 | 41 | _ = self.scaled_dot_product_attention( # type: ignore[attr-defined] 42 | self.query, self.key, self.value, self.mask, is_causal 43 | ) 44 | self.compile() 45 | 46 | def run( 47 | self, query: np.ndarray, key: np.ndarray, value: np.ndarray, mask: np.ndarray 48 | ) -> np.ndarray: 49 | """Run the scaled dot product attention kernel. 
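
        This computes the standard scaled dot product attention,
        softmax(Q @ K.T / sqrt(d_k) + mask) @ V for an additive mask, with tensor shapes
        fixed at construction time.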
50 | 51 | Args: 52 | query (np.ndarray): sdpa query tensor 53 | key (np.ndarray): sdpa key tensor 54 | value (np.ndarray): sdpa value tensor 55 | mask (np.ndarray): sdpa mask tensor 56 | 57 | Returns: 58 | np.ndarray: result 59 | """ 60 | return super().run(query, key, value, mask) 61 | 62 | 63 | class SimpleSDPA(NNFactory): 64 | """Implementation of a ScaledDotProductAttention NPU operation.""" 65 | 66 | def __init__( 67 | self, 68 | query_shapes: Tuple[int, int], 69 | key_shapes: Tuple[int, int], 70 | value_shapes: Tuple[int, int], 71 | is_causal: bool = False, 72 | profile: bool = False, 73 | device: str = "NPU", 74 | ): 75 | """Initialize the SDPA. 76 | 77 | Args: 78 | query_shapes (Tuple[int, int]): shape of the query tensor 79 | key_shapes (Tuple[int, int]): shape of the key tensor 80 | value_shapes (Tuple[int, int]): shape of the value tensor 81 | is_causal (bool, optional): If the SDPA mask is is_causal or not. Defaults to False. 82 | profile (bool, optional): Enable/Disable profiling. Defaults to False. 83 | device (str, optional): Target device, default to "NPU". 84 | """ 85 | super().__init__(profile, device) 86 | 87 | self.query = self.parameter(query_shapes) 88 | self.key = self.parameter(key_shapes) 89 | self.value = self.parameter(value_shapes) 90 | 91 | _ = self.scaled_dot_product_attention_simple( # type: ignore[attr-defined] 92 | self.query, self.key, self.value, is_causal 93 | ) 94 | self.compile() 95 | 96 | def run(self, query: np.ndarray, key: np.ndarray, value: np.ndarray) -> np.ndarray: 97 | """Run the scaled dot product attention kernel. 98 | 99 | Args: 100 | query (np.ndarray): sdpa query tensor 101 | key (np.ndarray): sdpa key tensor 102 | value (np.ndarray): sdpa value tensor 103 | 104 | Returns: 105 | np.ndarray: result 106 | """ 107 | return super().run(query, key, value) 108 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/backend/utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from functools import lru_cache 7 | from .bindings import lib 8 | import warnings 9 | import sys 10 | 11 | __min_npu_driver_version__ = 2408 12 | 13 | 14 | @lru_cache 15 | def npu_available() -> bool: 16 | """Return if the NPU is available. 17 | 18 | Returns: 19 | bool: Return True if the NPU is available in the system 20 | """ 21 | return lib.isNPUAvailable() 22 | 23 | 24 | def get_driver_installation_url() -> str: 25 | """Get the driver installation URL. 26 | 27 | Returns: 28 | std: Return the driver installation url 29 | """ 30 | if sys.platform == "win32": 31 | return "Driver Update URL: https://www.intel.com/content/www/us/en/download/794734/intel-npu-driver-windows.html" 32 | elif sys.platform == "linux": 33 | return "Driver Update URL: https://github.com/intel/linux-npu-driver" 34 | else: 35 | return "" 36 | 37 | 38 | @lru_cache 39 | def get_driver_version() -> int: 40 | """Get the driver version for the Intel® NPU Acceleration Library. 41 | 42 | Raises: 43 | RuntimeError: an error is raised if the platform is not supported. 
Currently supported platforms are Windows and Linux 44 | 45 | Returns: 46 | int: NPU driver version 47 | """ 48 | if not npu_available(): 49 | raise RuntimeError("NPU is not available on this system") 50 | 51 | return lib.getNPUDriverVersion() 52 | 53 | 54 | def check_npu_and_driver_version(): 55 | """Check NPU and driver version.""" 56 | if not npu_available(): 57 | warnings.warn( 58 | "NPU is not available in your system. Library will fallback to AUTO device selection mode", 59 | stacklevel=2, 60 | ) 61 | elif get_driver_version() < __min_npu_driver_version__: 62 | 63 | warnings.warn( 64 | f"\nWarning: Outdated Driver Detected!!!\n" 65 | f"Current Driver Version: {get_driver_version()}, Minimum Required Version: {__min_npu_driver_version__}\n" 66 | f"Using an outdated driver may result in reduced performance and unexpected errors and crashes" 67 | f"To avoid these issues, please update your driver to the latest version.\n" 68 | f"{get_driver_installation_url()}\n", 69 | stacklevel=2, 70 | ) 71 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/dtypes.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from dataclasses import dataclass 7 | from typing import Union 8 | import numpy as np 9 | import torch 10 | 11 | 12 | @dataclass(frozen=True) 13 | class NPUDtype: 14 | """Represents a custom data type for NPUs (Neural Processing Units). 15 | 16 | Attrs: 17 | name: str: The name of the data type. 18 | bits: int: The number of bits used to represent the data type. 19 | min: int: The minimum value that can be represented by the data type. 20 | max: int: The maximum value that can be represented by the data type. 21 | torch_dtype: torch.dtype: The corresponding torch data type. 22 | is_floating_point: bool: True if the data type is floating-point, False otherwise. 23 | """ 24 | 25 | name: str 26 | bits: int 27 | min: int 28 | max: int 29 | torch_dtype: torch.dtype 30 | 31 | @property 32 | def is_floating_point(self) -> bool: 33 | """ 34 | Check if the data type is a floating-point type. 35 | 36 | Returns: 37 | bool: True if the data type is floating-point, False otherwise. 38 | """ 39 | return self.torch_dtype.is_floating_point 40 | 41 | def __eq__(self, value: Union["NPUDtype", torch.dtype]) -> bool: 42 | """ 43 | Compare the NPUDtype object with another NPUDtype or torch.dtype object. 44 | 45 | Args: 46 | value (Union["NPUDtype", torch.dtype]): The object to compare with. 47 | 48 | Returns: 49 | bool: True if the objects are equal, False otherwise. 50 | """ 51 | if isinstance(value, torch.dtype): 52 | if value.is_floating_point: 53 | info = torch.finfo(value) 54 | else: 55 | info = torch.iinfo(value) 56 | return ( 57 | self.bits == info.bits 58 | and self.max == info.max 59 | and self.min == info.min 60 | and self.torch_dtype == value 61 | ) 62 | if isinstance(value, type): 63 | value = np.dtype(value) 64 | if value.kind == "f": 65 | info = np.finfo(value) 66 | else: 67 | info = np.iinfo(value) 68 | return ( 69 | self.bits == info.bits and self.max == info.max and self.min == info.min 70 | ) 71 | else: 72 | return super().__eq__(value) 73 | 74 | def __repr__(self) -> str: 75 | """ 76 | Return a string representation of the NPUDtype object. 77 | 78 | Returns: 79 | str: The string representation of the NPUDtype object. 
80 | """ 81 | return self.name 82 | 83 | 84 | float16 = NPUDtype( 85 | "fp16", 86 | 16, 87 | torch.finfo(torch.float16).min, 88 | torch.finfo(torch.float16).max, 89 | torch.float16, 90 | ) 91 | bfloat16 = NPUDtype( 92 | "bf16", 93 | 16, 94 | torch.finfo(torch.bfloat16).min, 95 | torch.finfo(torch.bfloat16).max, 96 | torch.bfloat16, 97 | ) 98 | float32 = NPUDtype( 99 | "fp32", 100 | 32, 101 | torch.finfo(torch.float32).min, 102 | torch.finfo(torch.float32).max, 103 | torch.float32, 104 | ) 105 | float64 = NPUDtype( 106 | "fp64", 107 | 64, 108 | torch.finfo(torch.float64).min, 109 | torch.finfo(torch.float64).max, 110 | torch.float64, 111 | ) 112 | int4 = NPUDtype("int4", 4, -8, 7, torch.int8) 113 | int8 = NPUDtype("int8", 8, -128, 127, torch.int8) 114 | int16 = NPUDtype( 115 | "int16", 16, torch.iinfo(torch.int16).min, torch.iinfo(torch.int16).max, torch.int16 116 | ) 117 | int32 = NPUDtype( 118 | "int32", 32, torch.iinfo(torch.int32).min, torch.iinfo(torch.int32).max, torch.int32 119 | ) 120 | int64 = NPUDtype( 121 | "int64", 64, torch.iinfo(torch.int64).min, torch.iinfo(torch.int64).max, torch.int64 122 | ) 123 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/functional/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from .scaled_dot_product_attention import scaled_dot_product_attention 7 | 8 | __all__ = ["scaled_dot_product_attention"] 9 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/functional/scaled_dot_product_attention.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | from intel_npu_acceleration_library.backend import run_factory, SDPA, SimpleSDPA 6 | from typing import Optional 7 | from functools import partial 8 | import torch 9 | 10 | 11 | def scaled_dot_product_attention( 12 | query: torch.Tensor, 13 | key: torch.Tensor, 14 | value: torch.Tensor, 15 | attn_mask: torch.Tensor = None, 16 | dropout_p: float = 0.0, 17 | is_causal: bool = False, 18 | scale: Optional[float] = None, 19 | ) -> torch.Tensor: 20 | """Execute SDPA kernel. 21 | 22 | Args: 23 | query (torch.Tensor): query tensor 24 | key (torch.Tensor): key tensor 25 | value (torch.Tensor): value tensor 26 | attn_mask (torch.Tensor, optional): attention mask tensor. Defaults to None. 27 | dropout_p (float, optional): optional dropout. Defaults to 0.0. 28 | is_causal (bool, optional): enable causal mask. Defaults to False. 29 | scale (Optional[float], optional): custom scale. Defaults to None. 
30 | 
31 |     Raises:
32 |         RuntimeError: dropout_p != 0 or a custom scale is not supported yet
33 | 
34 |     Returns:
35 |         torch.Tensor: attention output tensor
36 |     """
37 |     if dropout_p != 0:
38 |         raise RuntimeError("dropout_p != 0 is not supported yet")
39 |     if scale is not None:
40 |         raise RuntimeError("custom scale is not supported yet")
41 | 
42 |     if attn_mask is None:
43 |         backend_cls = partial(SimpleSDPA, is_causal=is_causal)  # type: ignore
44 |         return run_factory([query, key, value], [], backend_cls)
45 |     else:
46 |         backend_cls = partial(SDPA, is_causal=is_causal)  # type: ignore
47 |         return run_factory([query, key, value, attn_mask], [], backend_cls)
48 | 
--------------------------------------------------------------------------------
/intel_npu_acceleration_library/modelling.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright © 2024 Intel Corporation
3 | # SPDX-License-Identifier: Apache 2.0
4 | #
5 | from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM
6 | import intel_npu_acceleration_library as npu_lib
7 | from intel_npu_acceleration_library.compiler import CompilerConfig
8 | from functools import partialmethod
9 | from typing import Type, Any, Tuple, Optional
10 | import hashlib
11 | import torch
12 | import os
13 | 
14 | 
15 | def get_cache_dir() -> str:
16 |     """Get the model cache directory.
17 | 
18 |     Returns:
19 |         str: path to the cache directory
20 |     """
21 |     return os.path.join("cache", "models")
22 | 
23 | 
24 | def get_mangled_model_name(model_name: str, *args: Any, **kwargs: Any) -> str:
25 |     """Mangle the model name with all the parameters.
26 | 
27 |     Args:
28 |         model_name (str): model name or path
29 |         args (Any): positional arguments
30 |         kwargs (Any): keyword arguments
31 | 
32 |     Returns:
33 |         str: mangled name
34 |     """
35 |     # append all input parameters and create a string
36 |     arguments_str = f"{[str(arg) for arg in args] + [f'{str(key)}_{str(arg)}' for key, arg in kwargs.items()]}"
37 |     arguments_str_hash = hashlib.sha256(arguments_str.encode("utf-8")).hexdigest()
38 |     mangled_model_name = f"{model_name}_{arguments_str_hash}_{npu_lib.__version__}"
39 |     return mangled_model_name.replace("\\", "_").replace("/", "_")
40 | 
41 | 
42 | def get_model_path(model_name: str, *args: Any, **kwargs: Any) -> Tuple[str, str]:
43 |     """Get the model path.
44 | 
45 |     Args:
46 |         model_name (str): model name or path
47 |         args (Any): positional arguments
48 |         kwargs (Any): keyword arguments
49 | 
50 |     Returns:
51 |         Tuple[str, str]: model directory and full path
52 |     """
53 |     cache_dir = get_cache_dir()
54 |     mangled_model_name = get_mangled_model_name(model_name, *args, **kwargs)
55 |     model_dir_path = os.path.join(cache_dir, mangled_model_name)
56 |     model_path = os.path.join(model_dir_path, "pytorch_npu_model.pt")
57 |     return model_dir_path, model_path
58 | 
59 | 
60 | class NPUModel:
61 |     """Base NPU model class."""
62 | 
63 |     @staticmethod
64 |     def from_pretrained(
65 |         model_name_or_path: str,
66 |         config: CompilerConfig,
67 |         transformers_class: Optional[Type] = None,
68 |         export=True,
69 |         *args: Any,
70 |         **kwargs: Any,
71 |     ) -> torch.nn.Module:
72 |         """Template for the `from_pretrained` static method.
73 | 
74 |         Args:
75 |             model_name_or_path (str): model name or path
76 |             config (CompilerConfig): compiler configuration
77 |             transformers_class (Optional[Type], optional): base class to use. Must have a `from_pretrained` method. Defaults to None.
78 |             export (bool, optional): enable the caching of the model. Defaults to True.
79 |             args (Any): positional arguments
80 |             kwargs (Any): keyword arguments
81 | 
82 |         Raises:
83 |             RuntimeError: Invalid class
84 |             AttributeError: Cannot export model with trust_remote_code=True
85 | 
86 |         Returns:
87 |             torch.nn.Module: compiled model
88 |         """
89 |         if transformers_class is None:
90 |             raise RuntimeError(f"Invalid transformer class {type(transformers_class)}")
91 |         # get the model cache dir and path from the name and arguments
92 |         model_dir_path, model_path = get_model_path(
93 |             model_name_or_path, config.dtype, config.training, *args, **kwargs
94 |         )
95 |         if os.path.isdir(model_dir_path) and os.path.isfile(model_path):
96 |             # Model already exists, so it can be loaded directly
97 |             return torch.load(model_path)
98 |         else:
99 |             # Model does not exist, so it needs to be compiled first
100 |             print(f"Compiling model {model_name_or_path} {config.dtype} for the NPU")
101 |             model = transformers_class.from_pretrained(
102 |                 model_name_or_path, *args, **kwargs
103 |             )
104 |             model = npu_lib.compile(model, config)
105 |             if export:
106 |                 if kwargs.get("trust_remote_code", False):
107 |                     raise AttributeError(
108 |                         "Cannot export model with trust_remote_code=True. Please set trust_remote_code=False or export=False"
109 |                     )
110 |                 print(f"Exporting model {model_name_or_path} to {model_dir_path}")
111 |                 os.makedirs(model_dir_path, exist_ok=True)
112 |                 torch.save(model, model_path)
113 |             return model
114 | 
115 | 
116 | class NPUAutoModel:
117 |     """NPU wrapper for AutoModel.
118 | 
119 |     Attrs:
120 |         from_pretrained: Load a pretrained model
121 |     """
122 | 
123 |     from_pretrained = partialmethod(
124 |         NPUModel.from_pretrained, transformers_class=AutoModel
125 |     )
126 | 
127 | 
128 | class NPUModelForCausalLM:
129 |     """NPU wrapper for AutoModelForCausalLM.
130 | 
131 |     Attrs:
132 |         from_pretrained: Load a pretrained model
133 |     """
134 | 
135 |     from_pretrained = partialmethod(
136 |         NPUModel.from_pretrained, transformers_class=AutoModelForCausalLM
137 |     )
138 | 
139 | 
140 | class NPUModelForSeq2SeqLM:
141 |     """NPU wrapper for AutoModelForSeq2SeqLM.
142 | 143 | Attrs: 144 | from_pretrained: Load a pretrained model 145 | """ 146 | 147 | from_pretrained = partialmethod( 148 | NPUModel.from_pretrained, transformers_class=AutoModelForSeq2SeqLM 149 | ) 150 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/nn/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from .functional import * # noqa 7 | from .linear import Linear, QuantizedLinear # noqa 8 | from .conv import Conv2d # noqa 9 | from .module import Module # noqa 10 | 11 | try: 12 | from .llm import LlamaAttention, PhiMLP # noqa 13 | 14 | llm_modules = ["LlamaAttention", "PhiMLP"] 15 | except ModuleNotFoundError: 16 | # Transformer library is not installed 17 | llm_modules = [] 18 | 19 | 20 | __all__ = ["Module", "Linear", "QuantizedLinear", "Conv2d"] + llm_modules 21 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/nn/autograd.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend import run_matmul 7 | from typing import Optional, Iterable, Union 8 | import torch 9 | 10 | 11 | class AutogradMatMul(torch.autograd.Function): 12 | """Autograd module for Linear operation.""" 13 | 14 | @staticmethod 15 | def forward( 16 | ctx, x: torch.Tensor, w: torch.Tensor, scale: Optional[torch.Tensor] = None 17 | ) -> torch.Tensor: 18 | """Run a linear forward pass. Depending on the datatype of the weights it runs a float or quantized operation. 19 | 20 | Equivalent pytorch code: 21 | result = x @ w.T 22 | 23 | Args: 24 | ctx (Any): the autograd context 25 | x (torch.Tensor): Activation tensor. Its dtype must be torch.float16 26 | w (torch.Tensor): Weight tensor. Its dtype must be torch.float16 27 | scale (Optional[torch.Tensor], optional): Quantization scale. If weights.dtype == torch.int8 then it must be set. Defaults to None. 28 | 29 | Returns: 30 | torch.Tensor: result 31 | """ 32 | result = run_matmul(x, w, scale, None) 33 | ctx.save_for_backward(w, x) 34 | return result 35 | 36 | @staticmethod 37 | def backward(ctx, grad_output: torch.Tensor) -> Iterable[Union[torch.Tensor, None]]: 38 | """Run a linear backward pass. 
39 | 40 | grad_output shape: [batch, output_channels] 41 | x shape: [batch, input_channels] 42 | w shape: [output_channels, input_channels] 43 | 44 | Expected gradients 45 | dl_dx shape: [batch, input_channels] 46 | dl_dw shape: [output_channels, input_channels] 47 | 48 | Equivalent pytorch code: 49 | dl_dx = grad_output @ w.to(torch.float32) 50 | dl_dw = (x.T @ grad_output).T 51 | 52 | Args: 53 | ctx (Any): the autograd context 54 | grad_output (torch.Tensor): output gradient 55 | 56 | Returns: 57 | Iterable[Union[torch.Tensor, None]]: Input and parameters gradients 58 | """ 59 | ( 60 | w, 61 | x, 62 | ) = ctx.saved_tensors 63 | 64 | dl_dx = run_matmul(grad_output, torch.transpose(w, -1, -2)) 65 | dl_dw = run_matmul( 66 | torch.transpose(grad_output, -1, -2), 67 | torch.transpose(x, -1, -2).to(torch.float16), 68 | ) 69 | return dl_dx, dl_dw, None 70 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/nn/linear.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4 7 | from intel_npu_acceleration_library.nn.autograd import AutogradMatMul 8 | from intel_npu_acceleration_library.backend import run_matmul 9 | from intel_npu_acceleration_library.dtypes import NPUDtype 10 | from typing import Optional, Union 11 | import torch 12 | import uuid 13 | import math 14 | 15 | 16 | class Linear(torch.nn.Module): 17 | """Torch Linear operation NPU backend.""" 18 | 19 | def __init__(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None): 20 | """Initialize the Linear class. 21 | 22 | Args: 23 | weight (torch.Tensor): Linear operation weight 24 | bias (Optional[torch.Tensor], optional): Linear operation optional bias. Defaults to None. 25 | """ 26 | super().__init__() 27 | 28 | self.weight = torch.nn.Parameter(weight) 29 | self.bias = torch.nn.Parameter(bias) if isinstance(bias, torch.Tensor) else None 30 | self.outC, self.inC = self.weight.shape 31 | self.op_id = str(uuid.uuid4()) 32 | # assert self.weight.dtype == torch.float16 33 | self._mm = AutogradMatMul.apply 34 | 35 | def forward(self, x: torch.Tensor) -> torch.Tensor: 36 | """Torch module forward method. 37 | 38 | Args: 39 | x (torch.Tensor): Input tensor 40 | 41 | Returns: 42 | torch.Tensor: result 43 | """ 44 | if self.training: 45 | out = self._mm(x, self.weight, None) 46 | else: 47 | out = run_matmul(x, self.weight, None, self.op_id) 48 | 49 | if self.bias is None: 50 | return out 51 | return out + self.bias 52 | 53 | @staticmethod 54 | def fromTorch( 55 | layer: torch.nn.Linear, dtype: torch.dtype = torch.float16 56 | ) -> Union["Linear", "QuantizedLinear"]: 57 | """Generate a NPU Linear layer from a torch one. 58 | 59 | Args: 60 | layer (torch.nn.Linear): the original torch.nn.Linear model to run on the NPU 61 | dtype (torch.dtype): the desired datatype 62 | 63 | Returns: 64 | Union[Linear, QuantizedLinear]: A NPU linear layer 65 | """ 66 | if any(dim > 2**17 for dim in layer.weight.shape): 67 | return layer 68 | return Linear.fromTensor(layer.weight, getattr(layer, "bias", None), dtype) 69 | 70 | @staticmethod 71 | def fromTensor( 72 | weight: torch.Tensor, 73 | bias: Optional[torch.Tensor], 74 | dtype: torch.dtype = torch.float16, 75 | ) -> Union["Linear", "QuantizedLinear"]: 76 | """Generate a NPU Linear layer from a torch one. 
77 | 
78 |         Args:
79 |             weight (torch.Tensor): the original weight tensor
80 |             bias (Optional[torch.Tensor]): the original bias tensor
81 |             dtype (torch.dtype): the desired datatype
82 | 
83 |         Raises:
84 |             RuntimeError: dtype not supported
85 | 
86 |         Returns:
87 |             Union[Linear, QuantizedLinear]: A NPU linear layer
88 |         """
89 |         if dtype.is_floating_point:
90 |             if bias is None:
91 |                 return Linear(weight.to(dtype), None)
92 |             return Linear(weight.to(dtype), bias.to(dtype))
93 |         elif isinstance(dtype, NPUDtype):
94 |             weights_quant, scale = quantize_tensor(weight, (dtype.min, dtype.max))
95 |             if dtype.bits == 4:
96 |                 weights_quant = compress_to_i4(weights_quant)
97 |             return QuantizedLinear(weights_quant, scale, bias)
98 |         elif dtype == torch.int8:
99 |             weights_quant, scale = quantize_tensor(weight)
100 |             return QuantizedLinear(weights_quant, scale, bias)
101 |         else:
102 |             raise RuntimeError(
103 |                 f"intel-npu-acceleration-library does not yet support the requested datatype: {dtype}"
104 |             )
105 | 
106 | 
107 | class QuantizedLinear(torch.nn.Module):
108 |     """Torch Quantized Linear operation NPU backend."""
109 | 
110 |     def __init__(
111 |         self,
112 |         weight: torch.Tensor,
113 |         scale: torch.Tensor,
114 |         bias: Optional[torch.Tensor] = None,
115 |     ):
116 |         """Initialize the QuantizedLinear class.
117 | 
118 |         Args:
119 |             weight (torch.Tensor): Linear operation weight
120 |             scale (torch.Tensor): Quantization scale
121 |             bias (Optional[torch.Tensor], optional): Linear operation optional bias. Defaults to None.
122 | 
123 |         Raises:
124 |             RuntimeError: Quantized weight must be in torch.int8 or torch.uint8 format
125 |         """
126 |         super().__init__()
127 | 
128 |         self.weight = weight
129 |         if self.weight.dtype not in (torch.int8, torch.uint8):
130 |             raise RuntimeError(
131 |                 f"Quantized weight must be in torch.(u)int8 dtype instead of {self.weight.dtype}"
132 |             )
133 |         self.outC, self.inC = self.weight.shape
134 |         if self.weight.dtype == torch.uint8:
135 |             # In case of int4, we need to double the input channels because weights are compressed
136 |             self.inC *= 2
137 |         self.scale = scale * math.sqrt(self.inC)
138 |         self.bias = bias
139 |         self.op_id = str(uuid.uuid4())
140 |         self._mm = AutogradMatMul.apply
141 | 
142 |     def forward(self, x: torch.Tensor) -> torch.Tensor:
143 |         """Torch module forward method.
144 | 
145 |         Args:
146 |             x (torch.Tensor): Input tensor
147 | 
148 |         Raises:
149 |             RuntimeError: Training is not supported for QuantizedLinear layer. Use `.eval()` to do inference only
150 | 
151 |         Returns:
152 |             torch.Tensor: result
153 |         """
154 |         if self.training:
155 |             raise RuntimeError(
156 |                 "Training is not supported for QuantizedLinear layer. Use `.eval()` to do inference only"
157 |             )
158 |         out = run_matmul(x, self.weight, self.scale, self.op_id)
159 | 
160 |         if self.bias is None:
161 |             return out
162 |         return out + self.bias
163 | 
--------------------------------------------------------------------------------
/intel_npu_acceleration_library/optimizations.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright © 2024 Intel Corporation
3 | # SPDX-License-Identifier: Apache 2.0
4 | #
5 | from typing import Dict, List, Any
6 | import torch.nn as nn
7 | import torch.fx as fx
8 | import operator
9 | import torch
10 | 
11 | 
12 | def delattr_recursively(module: nn.Module, target: str):
13 |     """Delete attribute recursively by name in a torch.nn.Module.
14 | 
15 |     Args:
16 |         module (nn.Module): the nn.Module
17 |         target (str): the attribute you want to delete
18 |     """
19 |     *root, name = target.rsplit(".", 1)
20 |     if root:
21 |         root = root[0].split(".")
22 |         delattr_recursively(getattr(module, root[0]), ".".join(root[1:] + [name]))
23 |     else:
24 |         delattr(module, target)
25 | 
26 | 
27 | def fuse_linear_layers(
28 |     model: nn.Module,
29 |     modules: Dict[str, nn.Linear],
30 |     targets: List[str],
31 |     fused_layer_name: str,
32 | ) -> None:
33 |     """Fuse two or more linear layers and append the result to the nn.Module.
34 | 
35 |     Args:
36 |         model (nn.Module): Original nn.Module object
37 |         modules (Dict[str, nn.Linear]): a dictionary of node name: linear layer
38 |         targets (List[str]): list of layer node names
39 |         fused_layer_name (str): fused layer name
40 | 
41 |     Raises:
42 |         ValueError: All linear layers must be of type nn.Linear and must have the same input dimension
43 | 
44 |     """
45 |     # Get the attributes
46 |     layers = [modules[name] for name in targets]
47 | 
48 |     in_features = list({layer.in_features for layer in layers})
49 | 
50 |     # ensure all linear layers have the same input dimensions and are not already fused
51 |     if not all(isinstance(layer, nn.Linear) for layer in layers):
52 |         raise ValueError("All linear layers must be of type nn.Linear")
53 |     if len(in_features) != 1:
54 |         raise ValueError(
55 |             f"All linear layers must have the same input dimensions. Instead found: {in_features}"
56 |         )
57 | 
58 |     # Create the new fused linear layer
59 |     new_out_features = sum([layer.out_features for layer in layers])
60 |     has_bias = any(layer.bias is not None for layer in layers)
61 |     fused_layer = nn.Linear(in_features[0], new_out_features, bias=has_bias)
62 | 
63 |     # Concatenate the weights and biases
64 |     with torch.no_grad():
65 |         start, stop = 0, 0
66 |         for layer in layers:
67 |             stop += layer.out_features
68 |             fused_layer.weight[start:stop, :] = layer.weight
69 | 
70 |             if has_bias:
71 |                 if layer.bias is not None:
72 |                     fused_layer.bias[start:stop] = layer.bias
73 |                 else:
74 |                     fused_layer.bias[start:stop] = torch.zeros_like(
75 |                         fused_layer.bias[start:stop]
76 |                     )
77 |             start = stop
78 | 
79 |     # Replace the original layers in the model with the new fused layer
80 |     setattr(model, fused_layer_name, fused_layer)
81 |     for layer_name in targets:
82 |         delattr_recursively(model, layer_name)
83 | 
84 | 
85 | def horizontal_fusion_linear(model: torch.nn.Module) -> torch.nn.Module:
86 |     """Fuse horizontally two or more linear layers that share the same origin. This will increase NPU hw utilization.
87 | 
88 |     Args:
89 |         model (torch.nn.Module): The original nn.Module
90 | 
91 |     Returns:
92 |         torch.nn.Module: optimized nn.Module where parallel linear operations have been fused into a single bigger one
93 |     """
94 |     fx_model = fx.symbolic_trace(model)
95 |     modules = dict(fx_model.named_modules())
96 |     # new_graph = copy.deepcopy(fx_model.graph)
97 | 
98 |     def node_condition(node: Any) -> bool:
99 |         """Return true if the node is a module and is nn.Linear.
100 | 101 | Args: 102 | node (Any): A torch fx node 103 | 104 | Returns: 105 | bool: return condition 106 | """ 107 | return node.op == "call_module" and isinstance(modules[node.target], nn.Linear) 108 | 109 | # First, find all node with a linear layer 110 | linear_nodes = [node for node in fx_model.graph.nodes if node_condition(node)] 111 | 112 | # Group the linear layers by input node 113 | linear_nodes_parents: Dict[str, List[Any]] = {} 114 | for node in linear_nodes: 115 | linear_nodes_parents.setdefault(node.args[0], []).append(node) 116 | 117 | # Get the ones with size > 1 118 | fused_modules = [ 119 | (source, modules) 120 | for source, modules in linear_nodes_parents.items() 121 | if len(modules) > 1 122 | ] 123 | 124 | for source, layers in fused_modules: 125 | fused_layer_name = "fused_" + "_".join(node.target for node in layers) 126 | fused_layer_name = fused_layer_name.replace(".", "_") 127 | fuse_linear_layers( 128 | fx_model, modules, [layer.target for layer in layers], fused_layer_name 129 | ) 130 | with fx_model.graph.inserting_after(source): 131 | fused_node = fx_model.graph.call_module(fused_layer_name, (source,)) 132 | 133 | with fx_model.graph.inserting_after(fused_node): 134 | 135 | start, stop = 0, 0 136 | for layer in layers: 137 | stop += modules[layer.target].out_features 138 | 139 | layer_slice = fx_model.graph.call_function( 140 | operator.getitem, 141 | args=( 142 | fused_node, 143 | ( 144 | Ellipsis, 145 | slice(start, stop, None), 146 | ), 147 | ), 148 | kwargs={}, 149 | ) 150 | layer.replace_all_uses_with(layer_slice) 151 | fx_model.graph.erase_node(layer) 152 | start = stop 153 | 154 | fx_model.graph.lint() 155 | fx_model.recompile() 156 | 157 | return fx_model 158 | -------------------------------------------------------------------------------- /intel_npu_acceleration_library/quantization.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | import intel_npu_acceleration_library.backend.compression as compression 6 | from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion 7 | from intel_npu_acceleration_library.dtypes import int8, int4 8 | from intel_npu_acceleration_library.dtypes import NPUDtype 9 | from neural_compressor.quantization import fit 10 | from typing import Tuple 11 | import logging 12 | import torch 13 | 14 | 15 | def quantize_tensor( 16 | weight: torch.Tensor, min_max_range: Tuple[int, int] = (-128, 127) 17 | ) -> Tuple[torch.Tensor, torch.Tensor]: 18 | """Quantize a fp16 tensor symmetrically. 19 | 20 | Produces a quantize tensor (same shape, dtype == `torch.int8`) and a scale tensor (dtype == `torch.float16) 21 | The quantization equation is the following W = S * W_q 22 | 23 | Args: 24 | weight (torch.Tensor): The tensor to quantize 25 | min_max_range (Tuple[int, int]): The min and max range for the quantized tensor. Defaults to (-128, 127). 
26 | 27 | Raises: 28 | RuntimeError: Error in the quantization step 29 | 30 | Returns: 31 | Tuple[torch.Tensor, torch.Tensor]: Quantized tensor and scale 32 | """ 33 | scale = torch.max(torch.abs(weight), dim=-1).values 34 | 35 | # if any of the elements are zeros set the scale to the max value 36 | if torch.any(scale == 0): 37 | scale = torch.ones_like(scale) * torch.max(torch.abs(weight)) 38 | 39 | # Compute scale and zero point 40 | scale = (scale / max(min_max_range)).to(torch.float16).view(-1, 1) 41 | 42 | weights_quant = torch.floor(weight / scale) 43 | 44 | if not ( 45 | torch.max(weights_quant) <= max(min_max_range) 46 | and torch.min(weights_quant) >= min(min_max_range) 47 | ): 48 | raise RuntimeError( 49 | f"Quantization error: range of quantized weghts = {(torch.min(weights_quant), torch.max(weights_quant))} instead of ({min_max_range})" 50 | ) 51 | return weights_quant.to(torch.int8), scale 52 | 53 | 54 | def compress_to_i4(weights: torch.Tensor) -> torch.Tensor: 55 | """ 56 | Compresses a given tensor to 4-bit representation. 57 | 58 | Args: 59 | weights (torch.Tensor): The input tensor to be compressed. 60 | 61 | Returns: 62 | torch.Tensor: The compressed tensor with 4-bit representation. 63 | """ 64 | return torch.tensor(compression.compress_to_i4(weights.numpy())) 65 | 66 | 67 | def quantize_fit( 68 | model: torch.nn.Module, weights_dtype: str, algorithm: str = "RTN" 69 | ) -> torch.nn.Module: 70 | """Quantize a model with a given configuration. 71 | 72 | Args: 73 | model (torch.nn.Module): The model to quantize 74 | weights_dtype (str): The datatype for the weights 75 | algorithm (str, optional): The quantization algorithm. Defaults to "RTN". 76 | 77 | Raises: 78 | RuntimeError: Quantization error: unsupported datatype 79 | 80 | Returns: 81 | torch.nn.Module: The quantized model 82 | """ 83 | if weights_dtype == "int4": 84 | bits = 4 85 | elif weights_dtype == "int8": 86 | bits = 8 87 | else: 88 | raise RuntimeError(f"Quantization error: unsupported datatype {weights_dtype}") 89 | 90 | conf = PostTrainingQuantConfig( 91 | approach="weight_only", 92 | tuning_criterion=TuningCriterion(timeout=100000), 93 | op_type_dict={ 94 | ".*": { # match all ops 95 | "weight": { 96 | "dtype": weights_dtype, 97 | "bits": bits, 98 | "group_size": -1, 99 | "scheme": "sym", 100 | "algorithm": algorithm, 101 | }, 102 | "activation": { 103 | "dtype": "fp16", 104 | }, 105 | } 106 | }, 107 | ) 108 | 109 | return fit(model=model, conf=conf) 110 | 111 | 112 | def quantize_i8_model( 113 | model: torch.nn.Module, algorithm: str = "RTN" 114 | ) -> torch.nn.Module: 115 | """Quantize a model to 8-bit representation. 116 | 117 | Args: 118 | model (torch.nn.Module): The model to quantize 119 | algorithm (str, optional): The quantization algorithm. Defaults to "RTN". 120 | 121 | Returns: 122 | torch.nn.Module: The quantized model 123 | """ 124 | quantized_model = quantize_fit(model, "int8", algorithm) 125 | 126 | return quantized_model.export_compressed_model( 127 | scale_dtype=torch.float16, use_optimum_format=False 128 | ) 129 | 130 | 131 | def quantize_i4_model( 132 | model: torch.nn.Module, algorithm: str = "RTN" 133 | ) -> torch.nn.Module: 134 | """Quantize a model to 4-bit representation. 135 | 136 | Args: 137 | model (torch.nn.Module): The model to quantize 138 | algorithm (str, optional): The quantization algorithm. Defaults to "RTN". 
139 | 140 | Returns: 141 | torch.nn.Module: The quantized model 142 | """ 143 | quantized_model = quantize_fit(model, "int4", algorithm) 144 | 145 | return quantized_model.export_compressed_model( 146 | compression_dtype=torch.int8, 147 | scale_dtype=torch.float16, 148 | use_optimum_format=False, 149 | ) 150 | 151 | 152 | def quantize_model(model: torch.nn.Module, dtype: NPUDtype) -> torch.nn.Module: 153 | """Quantize a model. 154 | 155 | Args: 156 | model (torch.nn.Module): The model to quantize 157 | dtype (NPUDtype): The desired datatype 158 | 159 | Raises: 160 | RuntimeError: Quantization error: unsupported datatype 161 | 162 | Returns: 163 | torch.nn.Module: The quantized model 164 | """ 165 | # Silence neural compressor logger 166 | logger = logging.getLogger("neural_compressor") 167 | logger.setLevel(logging.ERROR) 168 | 169 | if dtype == int4: 170 | return quantize_i4_model(model) 171 | elif dtype == int8: 172 | return quantize_i8_model(model) 173 | else: 174 | raise RuntimeError(f"Quantization error: unsupported datatype {dtype}") 175 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | implicit_reexport = True 3 | ignore_missing_imports = True 4 | disable_error_code = override -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | torch 3 | transformers>=4.43.0 4 | neural-compressor -------------------------------------------------------------------------------- /script/export.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig 7 | from intel_npu_acceleration_library.compiler import compile 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | from intel_npu_acceleration_library.dtypes import int8, int4 10 | import argparse 11 | import torch 12 | import os 13 | 14 | 15 | def define_and_parse_args(): 16 | parser = argparse.ArgumentParser(description="Export a for NPU") 17 | parser.add_argument("--model", "-m", type=str, help="Name of the model to export") 18 | parser.add_argument( 19 | "--dtype", 20 | type=str, 21 | default="fp16", 22 | choices=["fp16", "int8", "int4"], 23 | help="type of quantization to perform", 24 | ) 25 | parser.add_argument( 26 | "--output", "-o", type=str, default="models", help="Output path" 27 | ) 28 | 29 | return parser.parse_args() 30 | 31 | 32 | def export(model_id, dtype, output): 33 | 34 | print(f"Loading {model_id}") 35 | model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=True) 36 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) 37 | config = AutoConfig.from_pretrained(model_id) 38 | 39 | PATH = os.path.join(output, model_id, dtype) 40 | 41 | tokenizer.save_pretrained(PATH) 42 | config.save_pretrained(PATH) 43 | 44 | if dtype == "fp16": 45 | print(f"Compiling model {model_id}") 46 | dtype = torch.float16 47 | elif dtype == "int8": 48 | print(f"Quantizing & Compiling model {model_id}") 49 | dtype = int8 50 | elif dtype == "int4": 51 | print(f"Quantizing & Compiling model {model_id}") 52 | dtype = int4 53 | else: 54 | raise RuntimeError(f"Invalid dtype {dtype}") 55 | 56 | with torch.no_grad(): 57 | 
compiler_conf = CompilerConfig(dtype=dtype) 58 | compile(model, compiler_conf) 59 | 60 | filename = os.path.join(PATH, "model.pth") 61 | os.makedirs(PATH, exist_ok=True) 62 | 63 | print("Saving model...") 64 | torch.save(model, filename) 65 | 66 | print(f"Model saved in {filename}") 67 | 68 | 69 | if __name__ == "__main__": 70 | args = define_and_parse_args() 71 | export(args.model, args.dtype, args.output) 72 | -------------------------------------------------------------------------------- /script/gen_leaderboard_doc.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | import matplotlib.pyplot as plt 7 | import seaborn as sns 8 | import pandas as pd 9 | import datetime 10 | import json 11 | import glob 12 | import os 13 | 14 | 15 | df = None 16 | for files in glob.glob("./leaderboard_*.json"): 17 | with open(files) as fp: 18 | prof = json.load(fp) 19 | date = datetime_object = datetime.datetime.strptime( 20 | prof["config"]["time"], "%Y-%m-%d_%H-%M-%S" 21 | ) 22 | 23 | new_df = pd.DataFrame.from_records(prof["profiling"]) 24 | new_df["date"] = date 25 | if df is None: 26 | df = new_df 27 | else: 28 | df = pd.concat([df, new_df], axis=0, join="outer") 29 | 30 | df.pop("error") 31 | 32 | 33 | col_to_str = { 34 | "model": "Model", 35 | "context_size": "Context Size", 36 | "tps": "Tokens / s", 37 | "prefill": "Prefill (s)", 38 | "intel_npu_acceleration_library": "Intel® NPU Acceleration Library enabled", 39 | "dtype": "Datatype", 40 | } 41 | 42 | 43 | def plot(df, x, y, hue="context_size", title=None, latest=True): 44 | 45 | filtered = df[(df["date"] == df["date"].max())] if latest else df 46 | 47 | plt.figure(figsize=(16, 9)) 48 | ax = sns.barplot(filtered.dropna(), x=x, y=y, hue=hue) 49 | ax.set_xlabel(col_to_str[x]) 50 | plt.xticks(rotation=45) 51 | if y == "prefill": 52 | ax.set_ylabel(f"Log {col_to_str[y]}") 53 | # ax.set_yscale('log') 54 | else: 55 | ax.set_ylabel(col_to_str[y]) 56 | if title is None: 57 | title = f"{col_to_str[y]} vs {col_to_str[x]}" 58 | ax.set_title(title) 59 | # ax.legend(bbox_to_anchor=(1, 1), loc=2, borderaxespad=0., title=col_to_str[hue]) 60 | ax.legend(title=col_to_str[hue]) 61 | filename = f"data/{x}_{y}_{hue}.png" 62 | os.makedirs("data", exist_ok=True) 63 | print(f"Save image {title} to {filename}") 64 | ax.get_figure().savefig(filename, bbox_inches="tight") 65 | 66 | 67 | sns.color_palette("tab10") 68 | 69 | plot(df[df["context_size"] == 128], "model", "tps", hue="dtype") 70 | plot(df[df["context_size"] == 128], "model", "prefill", hue="dtype") 71 | 72 | plot(df[df["intel_npu_acceleration_library"] == True], "model", "tps") 73 | plot(df[df["intel_npu_acceleration_library"] == True], "model", "prefill") 74 | -------------------------------------------------------------------------------- /script/llm_leaderboard.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | import subprocess 7 | import itertools 8 | import intel_npu_acceleration_library 9 | import platform 10 | import datetime 11 | import socket 12 | import tqdm 13 | import json 14 | import re 15 | 16 | 17 | def profile_model( 18 | model_id, 19 | context_size, 20 | device="NPU", 21 | dtype="float16", 22 | use_intel_npu_acceleration_library=True, 23 | ): 24 | 25 | profiling_data = { 26 | "model": model_id, 27 | "context_size": 
context_size,
28 |         "device": device,
29 |         "dtype": dtype,
30 |         "intel_npu_acceleration_library": use_intel_npu_acceleration_library,
31 |         "prefill": None,
32 |         "tps": None,
33 |         "error": None,
34 |     }
35 |     try:
36 |         disable_intel_npu_acceleration_library = (
37 |             "--disable-intel-npu-acceleration-library"
38 |             if not use_intel_npu_acceleration_library
39 |             else ""
40 |         )
41 |         output = subprocess.check_output(
42 |             f"python profile_llm.py -m {model_id} --context-size {context_size} --device {device} --dtype {dtype} {disable_intel_npu_acceleration_library} ",
43 |             shell=True,
44 |         ).decode()
45 | 
46 |         profiling_line = output.strip().split("\n")[-1].strip()
47 | 
48 |         pattern = r"prefill-phase (\d+\.\d+) s, tokens/s (\d+\.\d+)"
49 | 
50 |         match = re.search(pattern, profiling_line)
51 | 
52 |         # Check if a match is found
53 |         if match:
54 |             # Extract the prefill phase and tokens/s values
55 |             profiling_data["prefill"] = float(match.group(1))
56 |             profiling_data["tps"] = float(match.group(2))
57 |         else:
58 |             profiling_data["error"] = f"parsing error: profiling output {output}"
59 |     except Exception:
60 |         profiling_data["error"] = "runtime error"
61 | 
62 |     return profiling_data
63 | 
64 | 
65 | def save_data(data):
66 |     date = data["config"]["time"].replace(" ", "_").replace(":", "_")
67 |     with open(f"leaderboard_{date}.json", "w") as fp:
68 |         json.dump(data, fp, indent=4)
69 | 
70 | 
71 | def main():
72 | 
73 |     data = {
74 |         "config": {
75 |             "time": datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
76 |             "arch": platform.machine(),
77 |             "version": platform.version(),
78 |             "platform": platform.platform(),
79 |             "processor": platform.processor(),
80 |             "hostname": socket.gethostname(),
81 |             "npu": "yes"
82 |             if intel_npu_acceleration_library.backend.npu_available()
83 |             else "no",
84 |             "unit": "seconds",
85 |         },
86 |         "profiling": [],
87 |     }
88 |     save_data(data)
89 | 
90 |     models = [
91 |         "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
92 |         "microsoft/phi-2",
93 |         "stabilityai/stablelm-3b-4e1t",
94 |         # "qnguyen3/quan-1.8b-chat",
95 |         "facebook/opt-1.3b",
96 |         "gpt2-large",
97 |         "openlm-research/open_llama_3b_v2",
98 |         "EleutherAI/pythia-2.8b-v0",
99 |         "tiiuae/falcon-rw-1b",
100 |         "EleutherAI/gpt-neo-1.3B",
101 |         "stabilityai/stable-code-3b",
102 |         "google/gemma-2b-it",
103 |     ]
104 | 
105 |     contexts = [64, 128, 256, 512]
106 |     use_intel_npu_acceleration_library_lst = [True]
107 |     devices = ["NPU"]
108 |     dtypes = ["float16", "int8"]
109 |     configurations = list(
110 |         itertools.product(
111 |             models, contexts, devices, dtypes, use_intel_npu_acceleration_library_lst
112 |         )
113 |     )
114 | 
115 |     for model, context, device, dtype, use_intel_npu_acceleration_library in tqdm.tqdm(
116 |         configurations
117 |     ):
118 |         profiling_data = profile_model(
119 |             model, context, device, dtype, use_intel_npu_acceleration_library
120 |         )
121 |         data["profiling"].append(profiling_data)
122 |         save_data(data)
123 | 
124 | 
125 | if __name__ == "__main__":
126 | 
127 |     main()
128 | 
--------------------------------------------------------------------------------
/script/profile_llm.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright © 2024 Intel Corporation
3 | # SPDX-License-Identifier: Apache 2.0
4 | #
5 | 
6 | from transformers import AutoTokenizer, AutoModelForCausalLM
7 | from intel_npu_acceleration_library.nn.llm import generate_with_static_shape
8 | from intel_npu_acceleration_library.dtypes import int8, int4
9 | from intel_npu_acceleration_library.compiler import CompilerConfig
10 | 
11 | from torch.profiler 
import profile, ProfilerActivity 12 | import intel_npu_acceleration_library 13 | import argparse 14 | import torch 15 | import time 16 | import os 17 | 18 | 19 | def main( 20 | prompt="List all numbers in the Fibonacci sequence: 1, 1, 2, 3, ", 21 | context_size=512, 22 | max_new_tokens=50, 23 | model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0", 24 | device="NPU", 25 | dtype="float16", 26 | disable_intel_npu_acceleration_library=False, 27 | ): 28 | 29 | # Load tokenizer 30 | tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 31 | tokenizer.pad_token_id = tokenizer.eos_token_id 32 | # Load model 33 | if os.path.isdir(model_id) and os.path.isfile(f"{model_id}//model.pth"): 34 | compiled = True 35 | model = torch.load(f"{model_id}//model.pth") 36 | model.eval() 37 | else: 38 | compiled = False 39 | model = ( 40 | AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True) 41 | .to("cpu") 42 | .eval() 43 | ) 44 | 45 | if dtype == "float16": 46 | dtype = torch.float16 47 | elif dtype == "int8": 48 | dtype = int8 49 | elif dtype == "int4": 50 | dtype = int4 51 | else: 52 | raise RuntimeError(f"Invalid dtype: {dtype}") 53 | 54 | if not disable_intel_npu_acceleration_library: 55 | if not compiled: 56 | compiler_conf = CompilerConfig(dtype=dtype) 57 | model = intel_npu_acceleration_library.compile(model, compiler_conf) 58 | intel_npu_acceleration_library.nn.llm.warm_up_decoder_model( 59 | tokenizer, model, context_size 60 | ) 61 | 62 | # Tokenize 63 | input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cpu") 64 | 65 | results = generate_with_static_shape( 66 | model, 67 | input_ids=input_ids, 68 | max_length=context_size, 69 | use_past=True, 70 | pad_token_id=tokenizer.pad_token_id, 71 | ) 72 | times = [time.perf_counter()] 73 | idx = 0 74 | with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof: 75 | for new_token_id in results: 76 | times.append(time.perf_counter()) 77 | if idx >= max_new_tokens: 78 | break 79 | idx += 1 80 | token = tokenizer.decode([new_token_id], skip_special_tokens=True) 81 | 82 | print( 83 | prof.key_averages(group_by_input_shape=True).table( 84 | sort_by="self_cpu_time_total", row_limit=20 85 | ) 86 | ) 87 | prof.export_chrome_trace("trace.json") 88 | 89 | elapsed = [y - x for x, y in zip(times, times[1:])] 90 | 91 | prefix_time = elapsed[0] 92 | tps = len(elapsed[1:]) / sum(elapsed[1:]) 93 | 94 | print( 95 | f"model {model_id} (context: {context_size}): prefill-phase {prefix_time:.3f} s, tokens/s {tps:.3f}" 96 | ) 97 | 98 | 99 | def define_and_parse_args(): 100 | parser = argparse.ArgumentParser(description="Profiling a LLM in the NPU") 101 | parser.add_argument( 102 | "--model", 103 | "-m", 104 | type=str, 105 | default="TinyLlama/TinyLlama-1.1B-Chat-v1.0", 106 | help="Model", 107 | ) 108 | parser.add_argument( 109 | "--context-size", 110 | type=int, 111 | default=128, 112 | help="Context size (default: %(default)s)", 113 | ) 114 | parser.add_argument( 115 | "--n-threads", 116 | type=int, 117 | default=None, 118 | help="Set the number of CPU threads to use (default: %(default))", 119 | ) 120 | parser.add_argument( 121 | "--max-new-tokens", 122 | type=int, 123 | default=10, 124 | help="Set the max number of new tokens to generate (default: %(default)s)", 125 | ) 126 | 127 | parser.add_argument( 128 | "--device", 129 | "-d", 130 | default="NPU", 131 | choices=["NPU", "CPU", "GPU"], 132 | help="Select the target device (default: %(default)s)", 133 | ) 134 | parser.add_argument( 135 | "--dtype", 136 | 
default="float16", 137 | choices=["float16", "int8", "int4"], 138 | help="Select the target dtype (default: %(default)s)", 139 | ) 140 | 141 | parser.add_argument( 142 | "--disable-intel-npu-acceleration-library", 143 | action="store_true", 144 | help="Disable Intel® NPU Acceleration Library optimizations", 145 | ) 146 | 147 | return parser.parse_args() 148 | 149 | 150 | if __name__ == "__main__": 151 | args = define_and_parse_args() 152 | 153 | print( 154 | f"Profiling {args.model} with context size {args.context_size} and dtype {args.dtype}" 155 | ) 156 | if args.n_threads: 157 | print(f"Setting number of pytorch thread to {args.n_threads}") 158 | torch.set_num_threads(args.n_threads) 159 | print(f"Pytorch thread: {torch.get_num_threads()}") 160 | 161 | main( 162 | context_size=args.context_size, 163 | model_id=args.model, 164 | max_new_tokens=args.max_new_tokens, 165 | device=args.device, 166 | dtype=args.dtype, 167 | disable_intel_npu_acceleration_library=args.disable_intel_npu_acceleration_library, 168 | ) 169 | -------------------------------------------------------------------------------- /script/profile_matmul.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4 7 | from intel_npu_acceleration_library.dtypes import int4 8 | from intel_npu_acceleration_library.backend import Linear, QLinear 9 | from functools import partial 10 | import numpy as np 11 | import argparse 12 | import torch 13 | import time 14 | import json 15 | 16 | 17 | def print_profile_data(hwp_data, data): 18 | config_keys = ["batch", "inC", "outC", "dtype"] 19 | config = ", ".join([f"{key}: {hwp_data[key]}" for key in config_keys]) 20 | 21 | e2e_runtimes = [elem["runtime"] for elem in data] 22 | print( 23 | f"MatMul ({config}) => HWP: {hwp_data['runtime']:.3f} ms, E2E: {np.mean(e2e_runtimes):.3f} ± {2 * np.std(e2e_runtimes):.3f} ms" 24 | ) 25 | 26 | 27 | def profile(inC, outC, batch, dtype, n_iters=500, skip_first=10): 28 | data = [] 29 | mac = inC * outC * batch 30 | memcpy = (inC + outC) * batch 31 | 32 | X = np.random.uniform(-1, 1, (batch, inC)).astype(np.float16) 33 | W = np.random.uniform(-1, 1, (outC, inC)).astype(np.float16) 34 | 35 | if dtype == "float16": 36 | matmul_csl = Linear 37 | args = [W] 38 | elif dtype == "int8": 39 | weights, scale = quantize_tensor(torch.tensor(W)) 40 | scale *= np.sqrt(inC) 41 | matmul_csl = partial(QLinear, dtype=np.int8) 42 | args = [weights.numpy(), scale.numpy()] 43 | elif dtype == "int4": 44 | weights, scale = quantize_tensor(torch.tensor(W), (int4.min, int4.max)) 45 | scale *= np.sqrt(inC) 46 | weights = compress_to_i4(weights) 47 | matmul_csl = partial(QLinear, dtype=np.uint8) 48 | args = [weights.numpy(), scale.numpy()] 49 | else: 50 | raise RuntimeError(f"Invalid dtype: {dtype}") 51 | 52 | args.append("0000") 53 | 54 | mm_prof = matmul_csl(inC, outC, batch, profile=True) 55 | mm = matmul_csl(inC, outC, batch, profile=False) 56 | 57 | # Get the HWP data 58 | mm_prof.run(X, *args) 59 | with open("profiling.json") as fp: 60 | hwp_runtime = ( 61 | json.load(fp)["taskStatistics"]["total duration"] / 1000.0 62 | ) # in us 63 | hwp_data = dict( 64 | batch=batch, 65 | inC=inC, 66 | outC=outC, 67 | memcpy=memcpy, 68 | mac=mac, 69 | runtime=hwp_runtime, 70 | dtype=dtype, 71 | ) 72 | 73 | for idx in range(n_iters): 74 | t0 = time.perf_counter() 75 | mm.run(X, 
*args) 76 | t1 = time.perf_counter() 77 | if idx > (skip_first - 1): 78 | data.append( 79 | dict( 80 | batch=batch, 81 | inC=inC, 82 | outC=outC, 83 | memcpy=memcpy, 84 | mac=mac, 85 | runtime=(t1 - t0) * 1000, 86 | dtype=W.dtype, 87 | ) 88 | ) 89 | 90 | print_profile_data(hwp_data, data) 91 | 92 | return hwp_data, data 93 | 94 | 95 | def define_and_parse_args(): 96 | parser = argparse.ArgumentParser(description="Profiling a MatMul model in the NPU") 97 | parser.add_argument("--batch", "-b", type=int, required=True, help="MatMul batch") 98 | parser.add_argument( 99 | "--input-channels", "-c", type=int, required=True, help="MatMul input channels" 100 | ) 101 | parser.add_argument( 102 | "--output-channels", 103 | "-k", 104 | type=int, 105 | required=True, 106 | help="MatMul output channels", 107 | ) 108 | parser.add_argument( 109 | "--dtype", 110 | default="float16", 111 | choices=["float16", "int8", "int4"], 112 | help="Select the target dtype (default: %(default)s)", 113 | ) 114 | 115 | return parser.parse_args() 116 | 117 | 118 | if __name__ == "__main__": 119 | args = define_and_parse_args() 120 | profile(args.input_channels, args.output_channels, args.batch, dtype=args.dtype) 121 | -------------------------------------------------------------------------------- /script/profile_mlp.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP 7 | from intel_npu_acceleration_library.dtypes import int8, int4 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | from torch.profiler import profile, ProfilerActivity 10 | from sklearn.metrics import r2_score 11 | import intel_npu_acceleration_library 12 | import argparse 13 | import torch 14 | import numpy as np 15 | 16 | 17 | def main( 18 | seq_len=128, 19 | hidden_size=256, 20 | intermediate_size=512, 21 | dtype="float16", 22 | _profile=False, 23 | enable_graph_mode=False, 24 | ): 25 | 26 | conf = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") 27 | conf.num_hidden_layers = 1 28 | conf.hidden_size = hidden_size 29 | conf.intermediate_size = intermediate_size 30 | 31 | # Define a single Phi-3 MLP layer 32 | mlp = Phi3MLP(conf) 33 | 34 | hidden_states = torch.rand((seq_len, conf.hidden_size)) 35 | 36 | reference = mlp(hidden_states.to(torch.float32)).to(torch.float16) 37 | 38 | if dtype == "float16": 39 | dtype = torch.float16 40 | elif dtype == "int8": 41 | dtype = int8 42 | elif dtype == "int4": 43 | dtype = int4 44 | else: 45 | raise RuntimeError(f"Invalid dtype: {dtype}") 46 | 47 | # Compile model 48 | compiler_conf = CompilerConfig(use_to=enable_graph_mode, dtype=dtype) 49 | model = intel_npu_acceleration_library.compile(mlp, compiler_conf) 50 | if _profile: 51 | model.profile = True 52 | 53 | with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof: 54 | for _ in range(1000): 55 | results = model(hidden_states) 56 | 57 | print( 58 | prof.key_averages(group_by_input_shape=True).table( 59 | sort_by="cpu_time_total", row_limit=20 60 | ) 61 | ) 62 | 63 | prof.export_chrome_trace("trace.json") 64 | 65 | results = results.detach().numpy() 66 | reference = reference.detach().numpy() 67 | 68 | assert results.shape == reference.shape, "Output shape mismatch" 69 | assert np.isfinite(reference).all(), "Pytorch Reference contains NaN or Inf" 70 | assert np.isfinite(results).all(), "NPU output contains 
NaN or Inf" 71 | 72 | if dtype == int4: 73 | assert 1 - r2_score(reference, results) < 0.05 74 | else: 75 | assert 1 - r2_score(reference, results) < 0.001 76 | 77 | 78 | def define_and_parse_args(): 79 | parser = argparse.ArgumentParser(description="Profiling a MLP layer in the NPU") 80 | parser.add_argument( 81 | "--seq-len", 82 | type=int, 83 | default=128, 84 | help="Sequence length (default: %(default)s)", 85 | ) 86 | parser.add_argument( 87 | "--hidden-size", 88 | type=int, 89 | default=256, 90 | help="Hidden size (default: %(default)s)", 91 | ) 92 | parser.add_argument( 93 | "--intermediate-size", 94 | type=int, 95 | default=512, 96 | help="Intermediate size (default: %(default)s)", 97 | ) 98 | parser.add_argument( 99 | "--dtype", 100 | default="float16", 101 | choices=["float16", "int8", "int4"], 102 | help="Select the target dtype (default: %(default)s)", 103 | ) 104 | parser.add_argument( 105 | "--profile", 106 | action="store_true", 107 | default=False, 108 | help="Enable the profiling (default: False)", 109 | ) 110 | parser.add_argument( 111 | "--enable_graph_mode", 112 | action="store_true", 113 | default=False, 114 | help="Enable graph mode for MLP, otherwise use eager mode (default: False)", 115 | ) 116 | 117 | return parser.parse_args() 118 | 119 | 120 | if __name__ == "__main__": 121 | args = define_and_parse_args() 122 | 123 | print( 124 | f"Profiling with sequence length {args.seq_len}, hidden size {args.hidden_size}, intermediate size {args.intermediate_size}, dtype {args.dtype}" 125 | ) 126 | 127 | main( 128 | seq_len=args.seq_len, 129 | hidden_size=args.hidden_size, 130 | intermediate_size=args.intermediate_size, 131 | dtype=args.dtype, 132 | _profile=args.profile, 133 | enable_graph_mode=args.enable_graph_mode, 134 | ) 135 | -------------------------------------------------------------------------------- /script/quantize_model.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers import AutoModelForCausalLM, AutoTokenizer 7 | import intel_npu_acceleration_library as npu_lib 8 | from neural_compressor.config import PostTrainingQuantConfig 9 | from neural_compressor.quantization import fit 10 | from neural_compressor.adaptor.torch_utils.auto_round import get_dataloader 11 | import argparse 12 | import torch 13 | import os 14 | 15 | 16 | def define_and_parse_arguments(): 17 | parser = argparse.ArgumentParser(description="Export a model to NPU") 18 | parser.add_argument( 19 | "-m", 20 | "--model", 21 | type=str, 22 | required=True, 23 | help="The name of the model to export", 24 | ) 25 | parser.add_argument( 26 | "-b", 27 | "--bits", 28 | type=int, 29 | default=4, 30 | help="The number of bits to use for quantization", 31 | ) 32 | parser.add_argument( 33 | "-o", 34 | "--output-dir", 35 | type=str, 36 | default="models", 37 | help="The directory where to save the exported model", 38 | ) 39 | parser.add_argument( 40 | "-s", 41 | "--sequence-lenght", 42 | type=int, 43 | default=2048, 44 | help="The sequence lenght to use for the dataloader", 45 | ) 46 | parser.add_argument( 47 | "-a", 48 | "--algorithm", 49 | type=str, 50 | default="RTN", 51 | help="The quantization algorithm to use", 52 | ) 53 | return parser.parse_args() 54 | 55 | 56 | def export_model( 57 | model_name: str, 58 | algorithm: str, 59 | bits: int = 4, 60 | sequence_lenght: int = 2048, 61 | output_dir: str = "models", 62 | ): 63 | """Quantize and export a 
model. 64 | 65 | Args: 66 | model_name (str): the name of the model to export 67 | algorithm (str, optional): the neural compressor quantization algorithm 68 | bits (int, optional): the number of bits. Defaults to 4. 69 | sequence_lenght (int, optional): the model sequence lenght. Defaults to 2048. 70 | output_dir (str, optional): the output directory. Defaults to "models". 71 | """ 72 | print(f"Exporting model {model_name} with {bits} bits") 73 | output_folder = os.path.join(output_dir, model_name, algorithm, f"int{bits}") 74 | os.makedirs(output_folder, exist_ok=True) 75 | 76 | float_model = AutoModelForCausalLM.from_pretrained(model_name) 77 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 78 | float_model.config.save_pretrained(output_folder) 79 | tokenizer.save_pretrained(output_folder) 80 | 81 | dataloader = get_dataloader(tokenizer, seqlen=sequence_lenght) 82 | 83 | woq_conf = PostTrainingQuantConfig( 84 | approach="weight_only", 85 | op_type_dict={ 86 | ".*": { # match all ops 87 | "weight": { 88 | "dtype": "int", 89 | "bits": bits, 90 | "group_size": -1, 91 | "scheme": "sym", 92 | "algorithm": algorithm.upper(), 93 | }, 94 | "activation": { 95 | "dtype": "fp16", 96 | }, 97 | } 98 | }, 99 | ) 100 | 101 | print("Apply generic model optimizations") 102 | npu_lib.compiler.apply_general_optimizations(float_model) 103 | print("Quantize model") 104 | quantized_model = fit(model=float_model, conf=woq_conf, calib_dataloader=dataloader) 105 | print("Quantize model") 106 | compressed_model = quantized_model.export_compressed_model( 107 | scale_dtype=torch.float16, use_optimum_format=False 108 | ) 109 | 110 | print("Create NPU kernels") 111 | npu_model = npu_lib.compiler.create_npu_kernels(compressed_model) 112 | 113 | torch.save(npu_model, os.path.join(output_folder, "pytorch_npu_model.bin")) 114 | print(f"Model succesfully exported to {output_folder}") 115 | 116 | 117 | if __name__ == "__main__": 118 | args = define_and_parse_arguments() 119 | export_model( 120 | args.model, args.algorithm, args.bits, args.sequence_lenght, args.output_dir 121 | ) 122 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | extend-ignore = E203, E501, D100, D104 4 | [tool:pytest] 5 | addopts = --cov intel_npu_acceleration_library --cov-report term-missing --cov-fail-under=80 --cov-branch 6 | testpaths = 7 | tests/python 8 | [tox:tox] 9 | min_version = 4.0 10 | env_list = 11 | py38 12 | py39 13 | py310 14 | py311 15 | py312 16 | 17 | [testenv] 18 | changedir = {toxinidir}/test/python 19 | deps = -r{toxinidir}/dev_requirements.txt 20 | commands = pytest 21 | 22 | 23 | [gh-actions] 24 | python = 25 | 3.8: py38 26 | 3.9: py39 27 | 3.10: py310 28 | 3.11: py311 29 | 3.12: py312 30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from setuptools import setup, Extension 7 | from setuptools.command.build_ext import build_ext as build_ext_orig 8 | import pathlib 9 | import glob 10 | import os 11 | import re 12 | 13 | 14 | def get_version(): 15 | this_file_path = os.path.dirname(os.path.abspath(__file__)) 16 | with open( 17 | os.path.join(this_file_path, "intel_npu_acceleration_library", "_version.py"), 18 
| "rt", 19 | ) as fp: 20 | verstrline = fp.read() 21 | VSRE = r"^__version__ = ['\"]([^'\"]*)['\"]" 22 | mo = re.search(VSRE, verstrline, re.M) 23 | if mo: 24 | verstr = mo.group(1) 25 | else: 26 | raise RuntimeError("Unable to find version string") 27 | return verstr 28 | 29 | 30 | class CMakeExtension(Extension): 31 | def __init__(self, name): 32 | # don't invoke the original build_ext for this special extension 33 | headers = glob.glob("include/**/*.h") 34 | cpp_sources = glob.glob("src/*.cpp") 35 | requirements = glob.glob("*requirements.txt") 36 | sources = ["CMakeLists.txt"] + requirements + cpp_sources + headers 37 | super().__init__(name, sources=sources) 38 | 39 | 40 | class build_ext(build_ext_orig): 41 | def run(self): 42 | for ext in self.extensions: 43 | self.build_cmake(ext) 44 | super().run() 45 | 46 | def build_cmake(self, ext): 47 | cwd = pathlib.Path().absolute() 48 | 49 | # these dirs will be created in build_py, so if you don't have 50 | # any python sources to bundle, the dirs will be missing 51 | build_temp = pathlib.Path(self.build_temp) 52 | build_temp.mkdir(parents=True, exist_ok=True) 53 | extdir = pathlib.Path(self.get_ext_fullpath(ext.name)) 54 | extdir.mkdir(parents=True, exist_ok=True) 55 | 56 | # example of cmake args 57 | config = "Debug" if self.debug else "Release" 58 | cmake_args = [ 59 | f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={os.path.join(extdir.parent.absolute(), ext.name, "lib")}', 60 | "-DCMAKE_BUILD_TYPE=" + config, 61 | "-DSETUPTOOL_BUILD=True", 62 | ] 63 | 64 | # example of build args 65 | build_args = [ 66 | "--config", 67 | config, 68 | ] 69 | 70 | os.chdir(str(build_temp)) 71 | self.spawn(["cmake", str(cwd)] + cmake_args) 72 | if not self.dry_run: 73 | self.spawn(["cmake", "--build", "."] + build_args) 74 | # Troubleshooting: if fail on line above then delete all possible 75 | # temporary CMake files including "CMakeCache.txt" in top level dir. 
76 | os.chdir(str(cwd)) 77 | 78 | 79 | with open("README.md", "r", encoding="utf-8") as fh: 80 | long_description = fh.read() 81 | 82 | with open("requirements.txt") as fh: 83 | requirements = fh.readlines() 84 | 85 | with open("dev_requirements.txt") as fh: 86 | dev_requirements = fh.readlines() 87 | 88 | setup( 89 | name="intel_npu_acceleration_library", 90 | version=get_version(), 91 | packages=[ 92 | "intel_npu_acceleration_library", 93 | "intel_npu_acceleration_library.backend", 94 | "intel_npu_acceleration_library.nn", 95 | "intel_npu_acceleration_library.functional", 96 | ], 97 | author="Alessandro Palla", 98 | author_email="alessandro.palla@intel.com", 99 | description="Intel® NPU Acceleration Library", 100 | license="Apache License 2.0", 101 | url="https://github.com/intel/intel-npu-acceleration-library", 102 | ext_modules=[CMakeExtension("intel_npu_acceleration_library")], 103 | cmdclass={ 104 | "build_ext": build_ext, 105 | }, 106 | long_description=long_description, 107 | long_description_content_type="text/markdown", 108 | python_requires=">=3.8", 109 | install_requires=requirements, 110 | extras_require={ 111 | "dev": dev_requirements, 112 | }, 113 | classifiers=[ 114 | "Development Status :: 4 - Beta", 115 | "Intended Audience :: Developers", 116 | "Intended Audience :: Education", 117 | "Intended Audience :: Science/Research", 118 | "License :: OSI Approved :: Apache Software License", 119 | "Topic :: Scientific/Engineering", 120 | "Topic :: Scientific/Engineering :: Mathematics", 121 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 122 | "Topic :: Software Development", 123 | "Topic :: Software Development :: Libraries", 124 | "Topic :: Software Development :: Libraries :: Python Modules", 125 | "Programming Language :: C++", 126 | "Programming Language :: Python :: 3", 127 | "Programming Language :: Python :: 3 :: Only", 128 | "Programming Language :: Python :: 3.8", 129 | "Programming Language :: Python :: 3.9", 130 | "Programming Language :: Python :: 3.10", 131 | "Programming Language :: Python :: 3.11", 132 | "Programming Language :: Python :: 3.12", 133 | ], 134 | keywords="intel-npu-acceleration-library, machine learning, llm, intel core ultra", 135 | ) 136 | -------------------------------------------------------------------------------- /test/python/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.backend import clear_cache 7 | import pytest 8 | 9 | 10 | @pytest.fixture(autouse=True) 11 | def run_before_and_after_tests(): 12 | """Fixture to execute asserts before and after a test is run""" 13 | yield 14 | clear_cache() 15 | -------------------------------------------------------------------------------- /test/python/test_basic.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from sklearn.metrics import r2_score 7 | from intel_npu_acceleration_library.backend import MatMul 8 | import numpy as np 9 | import intel_npu_acceleration_library 10 | import intel_npu_acceleration_library.external.openvino as ov 11 | import pytest 12 | import time 13 | import sys 14 | import os 15 | 16 | 17 | def test_openvino_version(): 18 | version = ov.get_version() 19 | assert version is not None 20 | 21 | 22 | def profile(func): 23 | def wrapper(*args, 
**kwargs): 24 | t0 = time.perf_counter() 25 | ret = func(*args, **kwargs) 26 | t1 = time.perf_counter() 27 | elapsed = t1 - t0 28 | return ret, elapsed 29 | 30 | return wrapper 31 | 32 | 33 | def test_basic_functionality(): 34 | 35 | X = np.random.uniform(-1, 1, (512, 2048)).astype(np.float16) 36 | W = np.random.uniform(-1, 1, (512, 2048)).astype(np.float16) 37 | 38 | mm = MatMul(2048, 512, X.shape[0]) 39 | 40 | @profile 41 | def npu_run(): 42 | return mm.run(X, W) 43 | 44 | @profile 45 | def cpu_run(): 46 | return np.matmul(X, W.T) 47 | 48 | npu_val, npu_latency = npu_run() 49 | cpu_val, cpu_latency = cpu_run() 50 | 51 | assert 1 - r2_score(cpu_val, npu_val) < 0.001 52 | assert npu_latency < cpu_latency 53 | 54 | 55 | def test_save_model(): 56 | 57 | mm = MatMul(2048, 512, 512) 58 | mm.save("model.xml") 59 | assert os.path.isfile("model.xml") 60 | assert os.path.isfile("model.bin") 61 | os.remove("model.xml") 62 | os.remove("model.bin") 63 | 64 | 65 | @pytest.mark.skipif( 66 | not intel_npu_acceleration_library.backend.npu_available(), 67 | reason="Cannot save model if NPU is not available", 68 | ) 69 | def test_save_compiled_model(): 70 | 71 | mm = MatMul(2048, 512, 512) 72 | mm.saveCompiledModel("model.blob") 73 | assert os.path.isfile("model.blob") 74 | os.remove("model.blob") 75 | 76 | 77 | @pytest.mark.skipif( 78 | not intel_npu_acceleration_library.backend.npu_available(), 79 | reason="Skip test if NPU is not available", 80 | ) 81 | @pytest.mark.skipif( 82 | sys.platform != "win32", 83 | reason="Skip test if not on windows platform", 84 | ) 85 | def test_driver_version(): 86 | 87 | version = intel_npu_acceleration_library.backend.get_driver_version() 88 | assert version is not None 89 | -------------------------------------------------------------------------------- /test/python/test_bindings.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | from intel_npu_acceleration_library.backend.bindings import lib as backend_lib 6 | import numpy as np 7 | import pytest 8 | import ctypes 9 | 10 | 11 | @pytest.mark.parametrize("device", ["CPU", "NPU"]) 12 | def test_bindings(device): 13 | 14 | device = ctypes.c_char_p(device.encode()) 15 | matmul = backend_lib.createNNFactory(device, False) 16 | 17 | assert isinstance(matmul, ctypes.POINTER(ctypes.c_char)) 18 | 19 | backend_lib.destroyNNFactory(matmul) 20 | 21 | 22 | @pytest.mark.parametrize("inC", [16, 32, 64, 128]) 23 | @pytest.mark.parametrize("outC", [16, 32, 64, 128]) 24 | @pytest.mark.parametrize("batch", [16, 128]) 25 | @pytest.mark.parametrize("run_op", [True, False]) 26 | def test_factory_bindings(inC, outC, batch, run_op): 27 | 28 | ## Weights 29 | weights = np.zeros((outC, inC)).astype(np.float16) 30 | x = np.zeros((batch, inC)).astype(np.float16) 31 | out = np.empty((batch, outC), dtype=np.float16) 32 | 33 | # Create nn factory 34 | device = ctypes.c_char_p("NPU".encode()) 35 | factory = backend_lib.createNNFactory(device, False) 36 | 37 | # Create linear layer 38 | shape_ptr = np.array((batch, inC), dtype=np.uint32) 39 | dtype = ctypes.c_char_p("float16".encode()) 40 | p0 = backend_lib.parameter(factory, shape_ptr.size, shape_ptr, dtype) 41 | linear = backend_lib.linear(factory, p0, outC, inC, False, dtype, dtype) 42 | backend_lib.result(factory, linear) 43 | backend_lib.compile(factory) 44 | backend_lib.set_output(factory, out.ctypes.data_as(ctypes.c_void_p), 0) 45 | 46 | # Set parameters 47 | param 
= backend_lib.createParameters() 48 | backend_lib.addFloatParameter(param, weights, *weights.shape) 49 | backend_lib.setNNFactoryWeights(factory, param) 50 | 51 | # run 52 | if run_op: 53 | backend_lib.set_activation(factory, x.ctypes.data_as(ctypes.c_void_p), 0) 54 | backend_lib.run(factory) 55 | 56 | # Call destuctors for parameters and weights 57 | backend_lib.destroyNNFactory(factory) 58 | backend_lib.destroyParameters(param) 59 | -------------------------------------------------------------------------------- /test/python/test_compile.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.compiler import compile 7 | from intel_npu_acceleration_library.compiler import CompilerConfig 8 | from intel_npu_acceleration_library.dtypes import int4 9 | from sklearn.metrics import r2_score 10 | import intel_npu_acceleration_library 11 | from packaging.version import Version 12 | import pytest 13 | import torch 14 | import time 15 | import sys 16 | 17 | 18 | class NN(torch.nn.Module): 19 | def __init__( 20 | self, 21 | ) -> None: 22 | super().__init__() 23 | self.l1 = torch.nn.Linear(32, 128, bias=False) 24 | self.l2 = torch.nn.Linear(128, 32, bias=False) 25 | self.relu = torch.nn.functional.relu 26 | 27 | def forward(self, x): 28 | return self.relu(self.l2(self.relu(self.l1(x)))) 29 | 30 | 31 | torch.manual_seed(0) 32 | x = 128 * (torch.rand((16, 32), dtype=torch.float16) - 0.5) 33 | 34 | 35 | @pytest.mark.parametrize("dtype", [torch.float16, torch.int8, int4]) 36 | def test_compilation(dtype): 37 | 38 | torch.manual_seed(42) 39 | model = NN().half() 40 | 41 | y_ref = model(x).detach() 42 | 43 | compiler_conf = CompilerConfig(dtype=dtype) 44 | compiled_model = compile(model, compiler_conf) 45 | 46 | assert compiled_model 47 | 48 | for name, layer in compiled_model.named_children(): 49 | expected_cls = ( 50 | intel_npu_acceleration_library.nn.Linear 51 | if dtype.is_floating_point 52 | else intel_npu_acceleration_library.nn.QuantizedLinear 53 | ) 54 | assert isinstance(layer, expected_cls) 55 | if dtype == int4: 56 | assert layer.weight.dtype == torch.uint8 57 | else: 58 | assert layer.weight.dtype == dtype 59 | if layer.bias is not None: 60 | if dtype.is_floating_point: 61 | assert layer.bias.dtype == dtype 62 | else: 63 | layer.bias.dtype == torch.float32 64 | 65 | t0 = time.perf_counter() 66 | y1 = compiled_model(x).detach() 67 | t1 = time.perf_counter() 68 | 69 | y2 = compiled_model(x).detach() 70 | t2 = time.perf_counter() 71 | 72 | if dtype == int4: 73 | assert 1 - r2_score(y_ref.numpy(), y1.numpy()) < 0.05 74 | else: 75 | assert 1 - r2_score(y_ref.numpy(), y1.numpy()) < 0.01 76 | 77 | assert torch.allclose(y1, y2) 78 | 79 | # Check that for next iteration weights are prefetched 80 | # latency2 = t2 - t1 81 | # latency1 = t1 - t0 82 | # assert latency2 < latency1 83 | 84 | intel_npu_acceleration_library.backend.clear_cache() 85 | 86 | 87 | def test_torch_compile(): 88 | 89 | model = NN() 90 | y_ref = model(x.to(torch.float32)).detach() 91 | 92 | if sys.platform == "win32" and Version(torch.__version__) < Version("2.2.2"): 93 | with pytest.raises(RuntimeError) as e: 94 | compiled_model = torch.compile(model, backend="npu") 95 | assert str(e.value) == "Windows not yet supported for torch.compile" 96 | else: 97 | compiled_model = torch.compile(model, backend="npu") 98 | y = compiled_model(x.to(torch.float32)).detach() 99 | 
assert 1 - r2_score(y_ref.numpy(), y.numpy()) < 0.01 100 | 101 | 102 | @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.int8]) 103 | def test_compile_training(dtype): 104 | 105 | model = NN() 106 | 107 | compiler_conf = CompilerConfig(dtype=dtype, training=True) 108 | compiled_model = compile(model, compiler_conf) 109 | 110 | for name, layer in compiled_model.named_children(): 111 | if dtype == torch.int8: 112 | assert layer.training == False 113 | else: 114 | assert layer.training == True 115 | 116 | 117 | @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.int8, int4]) 118 | def test_compile_inference(dtype): 119 | 120 | model = NN() 121 | 122 | compiler_conf = CompilerConfig(dtype=dtype) 123 | compiled_model = compile(model, compiler_conf) 124 | 125 | for name, layer in compiled_model.named_children(): 126 | assert layer.training == False 127 | -------------------------------------------------------------------------------- /test/python/test_conv.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | 7 | import intel_npu_acceleration_library 8 | from intel_npu_acceleration_library.compiler import CompilerConfig 9 | from sklearn.metrics import r2_score 10 | import pytest 11 | import torch 12 | 13 | 14 | class DummyConv(torch.nn.Module): 15 | def __init__( 16 | self, 17 | in_channels, 18 | out_channels, 19 | kernels, 20 | bias, 21 | groups, 22 | stride=1, 23 | padding=0, 24 | dilation=1, 25 | ): 26 | super().__init__() 27 | if groups == -1: 28 | groups = out_channels 29 | self.conv = torch.nn.Conv2d( 30 | in_channels, 31 | out_channels, 32 | kernels, 33 | bias=bias, 34 | groups=groups, 35 | stride=stride, 36 | padding=padding, 37 | dilation=dilation, 38 | ) 39 | 40 | def forward(self, x): 41 | return self.conv(x) 42 | 43 | 44 | @pytest.mark.parametrize("in_channels", [32, 128, 256]) 45 | @pytest.mark.parametrize("out_channels", [32, 128, 256]) 46 | @pytest.mark.parametrize("kernels", [1, 3]) 47 | @pytest.mark.parametrize("dim", [16, 32]) 48 | @pytest.mark.parametrize("bias", [True, False]) 49 | @pytest.mark.parametrize("dtype", [torch.float16]) 50 | @pytest.mark.parametrize("stride", [1, 2]) 51 | @pytest.mark.parametrize("padding", [0, 1]) 52 | @pytest.mark.parametrize("groups", [1, -1]) 53 | def test_conv( 54 | in_channels, out_channels, kernels, dim, bias, dtype, stride, padding, groups 55 | ): 56 | torch.manual_seed(42) 57 | 58 | if groups != 1 and in_channels != out_channels: 59 | pytest.skip("DW convolutions require in_channels == out_channels") 60 | 61 | with torch.no_grad(): 62 | X = torch.rand((1, in_channels, dim, dim), dtype=torch.float16) 63 | conv = DummyConv( 64 | in_channels, 65 | out_channels, 66 | kernels, 67 | bias=bias, 68 | groups=groups, 69 | stride=stride, 70 | padding=padding, 71 | ).half() 72 | conv.conv.weight.data *= 128 73 | y_ref = conv(X) 74 | 75 | compiler_conf = CompilerConfig(dtype=dtype) 76 | npu_conv = intel_npu_acceleration_library.compile(conv, compiler_conf) 77 | y = npu_conv(X) 78 | 79 | assert y.dtype == y_ref.dtype 80 | assert y.shape == y_ref.shape 81 | if dtype == torch.int8: 82 | assert 1 - r2_score(y_ref.flatten().numpy(), y.flatten().numpy()) < 0.05 83 | else: 84 | assert 1 - r2_score(y_ref.flatten().numpy(), y.flatten().numpy()) < 0.001 85 | -------------------------------------------------------------------------------- /test/python/test_device.py: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.nn.module import NPUModuleWrapper 7 | import torch 8 | 9 | 10 | class NN(torch.nn.Module): 11 | def __init__(self): 12 | super().__init__() 13 | self.l1 = torch.nn.Linear(128, 128) 14 | 15 | def forward(self, x): 16 | return self.l1(x) 17 | 18 | 19 | def test_device(): 20 | 21 | x = torch.rand((128, 128)).to(torch.float16).to("npu") 22 | 23 | model = NN().half().to("npu") 24 | 25 | assert isinstance(model, torch.nn.Module) 26 | assert isinstance(model, NPUModuleWrapper) 27 | 28 | y = model(x) 29 | 30 | assert y.dtype == torch.float16 31 | assert y.device == torch.device("npu") 32 | -------------------------------------------------------------------------------- /test/python/test_dtypes.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | import pytest 7 | from intel_npu_acceleration_library.dtypes import float16, bfloat16, int4, int8 8 | 9 | 10 | @pytest.fixture 11 | def npu_dtypes(): 12 | return [float16, bfloat16, int4, int8] 13 | 14 | 15 | def test_NPUDtype_is_floating_point(npu_dtypes): 16 | for dtype in npu_dtypes: 17 | if dtype in (int4, int8): 18 | assert dtype.is_floating_point == False 19 | else: 20 | assert dtype.is_floating_point == True 21 | -------------------------------------------------------------------------------- /test/python/test_factory.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | import numpy as np 7 | import intel_npu_acceleration_library 8 | import pytest 9 | import os 10 | 11 | 12 | @pytest.mark.parametrize("batch", [16, 128]) 13 | @pytest.mark.parametrize("inC", [256, 512]) 14 | @pytest.mark.parametrize("outC", [256, 512]) 15 | @pytest.mark.parametrize("dtype", [np.float16, np.int8]) 16 | @pytest.mark.parametrize("activation", ["gelu", "swish", "softmax"]) 17 | def test_factory(batch, inC, outC, dtype, activation): 18 | module = intel_npu_acceleration_library.backend.NNFactory() 19 | assert module 20 | 21 | input = module.parameter((batch, inC)) 22 | assert input 23 | 24 | weights = module.parameter((outC, inC), dtype) 25 | assert weights 26 | 27 | if dtype == np.int8: 28 | weights = module.convert_to_fp16(weights) 29 | 30 | mm = module.matmul(input, weights) 31 | assert mm 32 | 33 | act_fn = getattr(module, activation) 34 | if activation == "softmax": 35 | output = act_fn(mm, -1) 36 | else: 37 | output = act_fn(mm) 38 | assert output 39 | 40 | module.compile() 41 | 42 | output_shape = module.get_tensor_shape(output.node) 43 | assert output_shape == (batch, outC) 44 | 45 | filename = f"test_factory_mm_{batch}_{inC}_{outC}_{dtype.__name__}_{activation}" 46 | module.save(f"{filename}.xml") 47 | 48 | assert os.path.isfile(f"{filename}.xml") 49 | 50 | os.remove(f"{filename}.xml") 51 | os.remove(f"{filename}.bin") 52 | -------------------------------------------------------------------------------- /test/python/test_llm.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from transformers.models.llama.modeling_llama import LlamaForCausalLM, 
LlamaConfig 7 | from transformers.models.phi.modeling_phi import PhiConfig, PhiMLP 8 | from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from intel_npu_acceleration_library.dtypes import int8, int4 11 | from intel_npu_acceleration_library.compiler import CompilerConfig 12 | from sklearn.metrics import r2_score 13 | from torch.profiler import profile, ProfilerActivity 14 | import intel_npu_acceleration_library 15 | import pytest 16 | import torch 17 | import numpy as np 18 | 19 | 20 | @pytest.fixture 21 | def config(): 22 | return LlamaConfig(num_hidden_layers=1) 23 | 24 | 25 | @pytest.fixture 26 | def decoder_model(config): 27 | return LlamaForCausalLM(config) 28 | 29 | 30 | @pytest.fixture 31 | def model(): 32 | return AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") 33 | 34 | 35 | @pytest.fixture 36 | def tokenizer(): 37 | return AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") 38 | 39 | 40 | @pytest.mark.parametrize("model_seq_length", [128, 256]) 41 | def test_warm_up(tokenizer, model, model_seq_length): 42 | compiler_conf = CompilerConfig() 43 | compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf) 44 | intel_npu_acceleration_library.nn.llm.warm_up_decoder_model( 45 | tokenizer, compiled_model, model_seq_length 46 | ) 47 | 48 | 49 | @pytest.mark.parametrize("dtype", [torch.float16, torch.int8]) 50 | def test_compilation(tokenizer, decoder_model, dtype): 51 | prefill = tokenizer("test sentence", return_tensors="pt")["input_ids"].to("cpu") 52 | y_ref = decoder_model(prefill).logits.detach() 53 | 54 | compiler_conf = CompilerConfig(dtype=dtype) 55 | compiled_model = intel_npu_acceleration_library.compile( 56 | decoder_model, compiler_conf 57 | ) 58 | 59 | assert compiled_model 60 | 61 | y = compiled_model(prefill).logits.detach() 62 | 63 | assert 1 - r2_score(y_ref.flatten().numpy(), y.flatten().numpy()) < 0.01 64 | 65 | 66 | @torch.no_grad 67 | @pytest.mark.parametrize("seq_len", [16, 128, 256]) 68 | @pytest.mark.parametrize("hidden_size", [256, 512]) 69 | @pytest.mark.parametrize("intermediate_size", [512]) 70 | def test_phi2_mlp(seq_len, hidden_size, intermediate_size): 71 | conf = PhiConfig.from_pretrained("microsoft/phi-2") 72 | conf.num_hidden_layers = 1 73 | conf.hidden_size = hidden_size 74 | conf.intermediate_size = intermediate_size 75 | 76 | mlp = PhiMLP(conf) 77 | 78 | x = torch.rand((seq_len, conf.hidden_size), dtype=torch.float16) 79 | reference = mlp(x.to(torch.float32)).to(torch.float16) 80 | 81 | model = intel_npu_acceleration_library.nn.PhiMLP.fromTorch(mlp) 82 | 83 | assert model 84 | 85 | out = model(x) 86 | 87 | assert 1 - r2_score(reference.numpy(), out.numpy()) < 0.001 88 | 89 | 90 | @torch.no_grad 91 | @pytest.mark.parametrize("seq_len", [16, 128, 256]) 92 | @pytest.mark.parametrize("hidden_size", [256, 512]) 93 | @pytest.mark.parametrize("intermediate_size", [512]) 94 | @pytest.mark.parametrize("dtype", ["float16", "int8", "int4"]) 95 | def test_phi3_mlp_compile(seq_len, hidden_size, intermediate_size, dtype): 96 | conf = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") 97 | conf.num_hidden_layers = 1 98 | conf.hidden_size = hidden_size 99 | conf.intermediate_size = intermediate_size 100 | 101 | if dtype == "int8": 102 | dtype = int8 103 | elif dtype == "int4": 104 | dtype = int4 105 | else: 106 | dtype = torch.float16 107 | 108 | mlp = Phi3MLP(conf) 109 | 110 | hidden_states = 
torch.rand((seq_len, conf.hidden_size)) 111 | 112 | reference = mlp(hidden_states.to(torch.float32)).to(torch.float16).detach().numpy() 113 | 114 | compiler_conf = CompilerConfig(use_to=True, dtype=dtype) 115 | model = intel_npu_acceleration_library.compile(mlp, compiler_conf) 116 | 117 | assert model 118 | 119 | with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof: 120 | out = model(hidden_states) 121 | 122 | print( 123 | prof.key_averages(group_by_input_shape=True).table( 124 | sort_by="cpu_time_total", row_limit=20 125 | ) 126 | ) 127 | 128 | out = out.detach().numpy() 129 | 130 | assert out.shape == reference.shape, "Output shape mismatch" 131 | assert np.isfinite(reference).all(), "Pytorch Reference contains NaN or Inf" 132 | assert np.isfinite(out).all(), "NPU output contains NaN or Inf" 133 | 134 | if dtype == int4: 135 | assert 1 - r2_score(reference, out) < 0.05 136 | else: 137 | assert 1 - r2_score(reference, out) < 0.001 138 | -------------------------------------------------------------------------------- /test/python/test_matmul.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from sklearn.metrics import r2_score 7 | from intel_npu_acceleration_library.backend import MatMul, QMatMul 8 | from intel_npu_acceleration_library.quantization import quantize_tensor 9 | import numpy as np 10 | import itertools 11 | import pytest 12 | import torch 13 | import time 14 | 15 | 16 | channels = [512, 768, 1024, 2048] 17 | batches = [16, 128, 512, 1024] 18 | 19 | 20 | @pytest.mark.parametrize( 21 | "batch,inC,outC", itertools.product(batches, channels, channels) 22 | ) 23 | def test_matmul(batch, inC, outC): 24 | X = torch.rand((batch, inC), requires_grad=False).to(torch.float16) 25 | W = torch.rand((outC, inC), requires_grad=False).to(torch.float16) 26 | 27 | cpu_mm = X @ W.T 28 | 29 | mm = MatMul(inC, outC, batch) 30 | 31 | assert mm 32 | 33 | npu_mm = mm.run(X.numpy(), W.numpy()) 34 | 35 | assert np.isfinite(npu_mm).all() 36 | 37 | assert 1 - r2_score(cpu_mm.numpy(), npu_mm) < 0.001 38 | 39 | 40 | @pytest.mark.parametrize( 41 | "batch,inC,outC", itertools.product(batches, channels, channels) 42 | ) 43 | def test_qmatmul_per_channel_scales(batch, inC, outC): 44 | 45 | X = torch.rand((batch, inC), requires_grad=False).to(torch.float16) - 0.5 46 | W = torch.rand((outC, inC), requires_grad=False).to(torch.float16) 47 | 48 | # Compute reference matmul 49 | cpu_mm = X @ W.T 50 | 51 | assert W.shape == (outC, inC) and W.dtype == torch.float16 52 | 53 | # Quantize the weights 54 | weights_quant, scale = quantize_tensor(W) 55 | 56 | assert scale.shape == (outC, 1) and scale.dtype == torch.float16 57 | assert weights_quant.shape == (outC, inC) and weights_quant.dtype == torch.int8 58 | assert weights_quant.shape == W.shape 59 | 60 | # Conversion done properly 61 | expected_W = weights_quant.to(torch.float16) * scale 62 | assert 1 - r2_score(expected_W.numpy(), W.numpy()) < 0.001 63 | 64 | mm = QMatMul(inC, outC, batch) 65 | 66 | assert mm 67 | 68 | # Adapt for numerically accurate qmatumul 69 | scale *= np.sqrt(inC) 70 | 71 | npu_mm = mm.run(X.numpy(), weights_quant.numpy(), scale.numpy()) 72 | 73 | assert np.isfinite(npu_mm).all() 74 | 75 | assert 1 - r2_score(cpu_mm.numpy(), npu_mm) < 0.001 76 | -------------------------------------------------------------------------------- /test/python/test_profiling.py: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from intel_npu_acceleration_library.quantization import quantize_tensor 7 | from intel_npu_acceleration_library.backend import MatMul, QMatMul, Linear, QLinear 8 | import numpy as np 9 | import intel_npu_acceleration_library 10 | import pytest 11 | import json 12 | import torch 13 | import os 14 | 15 | 16 | def check_no_sw_layer(profiling_file): 17 | with open(profiling_file) as f: 18 | data = json.load(f) 19 | 20 | statistics = data["taskStatistics"] 21 | assert statistics["SW duration"] == 0 22 | 23 | 24 | def test_profiling_matmul(): 25 | 26 | if not intel_npu_acceleration_library.backend.npu_available(): 27 | pytest.xfail("NPU not available") 28 | 29 | X = np.random.uniform(-1, 1, (512, 2048)).astype(np.float16) 30 | W = np.random.uniform(-1, 1, (512, 2048)).astype(np.float16) 31 | W_q, scale = quantize_tensor(torch.tensor(W)) 32 | 33 | # Adapt for numerically accurate qmatumul 34 | scale *= np.sqrt(2048) 35 | 36 | if os.path.exists("profiling.json"): 37 | os.remove("profiling.json") 38 | 39 | MatMul(W.shape[1], W.shape[0], X.shape[0], profile=True).run(X, W) 40 | assert os.path.isfile("profiling.json") 41 | check_no_sw_layer("profiling.json") 42 | os.remove("profiling.json") 43 | 44 | QMatMul(W.shape[1], W.shape[0], X.shape[0], profile=True).run( 45 | X, W_q.numpy(), scale.numpy() 46 | ) 47 | QMatMul(W.shape[1], W.shape[0], X.shape[0], profile=True).save("qmatmul.xml") 48 | assert os.path.isfile("profiling.json") 49 | check_no_sw_layer("profiling.json") 50 | os.remove("profiling.json") 51 | os.remove("qmatmul.xml") 52 | 53 | Linear(W.shape[1], W.shape[0], X.shape[0], profile=True).run(X, W, op_id=0) 54 | assert os.path.isfile("profiling.json") 55 | check_no_sw_layer("profiling.json") 56 | os.remove("profiling.json") 57 | 58 | QLinear(W.shape[1], W.shape[0], X.shape[0], profile=True).run( 59 | X, W_q.numpy(), scale.numpy(), op_id=0 60 | ) 61 | assert os.path.isfile("profiling.json") 62 | check_no_sw_layer("profiling.json") 63 | os.remove("profiling.json") 64 | -------------------------------------------------------------------------------- /test/python/test_quantization.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | from sklearn.metrics import r2_score 7 | from intel_npu_acceleration_library.compiler import CompilerConfig 8 | import numpy as np 9 | import intel_npu_acceleration_library 10 | import pytest 11 | import torch 12 | 13 | import intel_npu_acceleration_library.quantization 14 | 15 | 16 | class NN(torch.nn.Module): 17 | def __init__(self, inC, outC): 18 | super().__init__() 19 | self.l1 = torch.nn.Linear(inC, outC, bias=False) 20 | 21 | def forward(self, x): 22 | return self.l1(x) 23 | 24 | 25 | @pytest.mark.parametrize("batch", [16, 128]) 26 | @pytest.mark.parametrize("inC", [256, 512]) 27 | @pytest.mark.parametrize("outC", [256, 512]) 28 | def test_explicit_quantization(batch, inC, outC): 29 | module = intel_npu_acceleration_library.backend.NNFactory() 30 | assert module 31 | 32 | input = module.parameter((batch, inC)) 33 | assert input 34 | 35 | output = module.linear(input, outC, inC) 36 | assert output 37 | 38 | module.compile() 39 | 40 | X = np.random.random((batch, inC)).astype(np.float16) 41 | W = np.random.randint(-127, 127, (outC, 
inC)).astype(np.int8) 42 | S = np.random.random((outC, 1)).astype(np.float32) 43 | 44 | w_float = W.astype(np.float16) * S 45 | y_ref = np.matmul(X, w_float.T) 46 | 47 | y = module.run(X, (W, S), op_id="0000") 48 | 49 | assert 1 - r2_score(y_ref, y) < 0.01 50 | 51 | 52 | @pytest.mark.parametrize("batch", [16, 128]) 53 | @pytest.mark.parametrize("inC", [256, 512]) 54 | @pytest.mark.parametrize("outC", [256, 512]) 55 | def test_i8_quantization(batch, inC, outC): 56 | module = intel_npu_acceleration_library.backend.NNFactory() 57 | assert module 58 | 59 | input = module.parameter((batch, inC)) 60 | assert input 61 | 62 | output = module.linear(input, outC, inC, False, wt_dtype=np.int8) 63 | assert output 64 | 65 | module.compile() 66 | 67 | X = np.random.random((batch, inC)).astype(np.float16) 68 | W = np.random.randint(-127, 127, (outC, inC)).astype(np.int8) 69 | S = np.random.random((outC, 1)).astype(np.float16) 70 | 71 | w_float = W.astype(np.float16) * S 72 | y_ref = np.matmul(X, w_float.T) 73 | 74 | y = module.run(X, (W, S * np.sqrt(inC)), op_id="0000") 75 | 76 | assert 1 - r2_score(y_ref, y) < 0.01 77 | 78 | 79 | @pytest.mark.parametrize("batch", [16, 128]) 80 | @pytest.mark.parametrize("inC", [256, 512]) 81 | @pytest.mark.parametrize("outC", [256, 512]) 82 | def test_compiled_quantized(batch, inC, outC): 83 | 84 | intel_npu_acceleration_library.backend.clear_cache() 85 | 86 | torch.manual_seed(0) 87 | X = torch.rand((batch, inC), dtype=torch.float16) - 0.5 88 | # X = np.random.random((batch, inC)).astype(np.float16) 89 | 90 | model = NN(inC, outC) 91 | y_ref = model(X.to(torch.float32)).detach() 92 | 93 | compiler_conf = CompilerConfig(dtype=torch.int8) 94 | compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf) 95 | assert compiled_model 96 | 97 | y1 = compiled_model(X).detach() 98 | 99 | assert 1 - r2_score(y_ref.numpy(), y1.numpy()) < 0.01 100 | 101 | 102 | @pytest.mark.parametrize("batch", [16, 128]) 103 | @pytest.mark.parametrize("inC", [256, 512]) 104 | @pytest.mark.parametrize("outC", [256, 512]) 105 | def test_i4_quantization(batch, inC, outC): 106 | 107 | module = intel_npu_acceleration_library.backend.NNFactory() 108 | assert module 109 | 110 | input = module.parameter((batch, inC)) 111 | assert input 112 | # u8 represents packed i4 dtypes 113 | output = module.linear(input, outC, inC, False, wt_dtype=np.uint8) 114 | assert output 115 | 116 | module.compile() 117 | 118 | X = np.random.random((batch, inC)).astype(np.float16) 119 | S = np.random.random((outC, 1)).astype(np.float16) 120 | W = np.random.randint(-8, 7, (outC, inC)).astype(np.int8) 121 | 122 | w_float = W.astype(np.float16) * S 123 | y_ref = np.matmul(X, w_float.T) 124 | 125 | # Compress the weights for int4 126 | W_npu = intel_npu_acceleration_library.quantization.compress_to_i4( 127 | torch.from_numpy(W) 128 | ).numpy() 129 | 130 | y = module.run(X, (W_npu, S * np.sqrt(inC)), op_id="0000") 131 | 132 | # assert y has no NaN 133 | assert not np.isnan(y).any() 134 | 135 | # assert y has no Inf 136 | assert not np.isinf(y).any() 137 | 138 | # Check for correctness vs reference 139 | assert 1 - r2_score(y_ref.flatten(), y.flatten()) < 0.01 140 | -------------------------------------------------------------------------------- /test/python/test_sdpa.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | from intel_npu_acceleration_library.backend.sdpa import SDPA 6 | 
from intel_npu_acceleration_library.functional import scaled_dot_product_attention 7 | from sklearn.metrics import r2_score 8 | import numpy as np 9 | import pytest 10 | import torch 11 | 12 | 13 | @pytest.mark.parametrize("heads", [16, 32]) 14 | @pytest.mark.parametrize("sequence", [16, 32, 256, 512]) 15 | @pytest.mark.parametrize("dim", [512, 1024]) 16 | @pytest.mark.parametrize("kv_cache", [True, False]) 17 | @pytest.mark.parametrize("is_causal", [False, True]) 18 | def test_sdpa(heads, sequence, dim, kv_cache, is_causal): 19 | 20 | min_value = torch.finfo(torch.float16).min 21 | 22 | query = torch.rand(1, heads, 1 if kv_cache else sequence, dim // heads).to( 23 | torch.float16 24 | ) 25 | key = torch.rand(1, heads, sequence, dim // heads).to(torch.float16) 26 | value = torch.rand(1, heads, sequence, dim // heads).to(torch.float16) 27 | mask = min_value * torch.ones(1, heads, 1 if kv_cache else sequence, sequence).to( 28 | torch.float16 29 | ) 30 | mask = torch.triu(mask) 31 | 32 | npu_sdpa = SDPA( 33 | query.shape, key.shape, value.shape, mask.shape, is_causal=is_causal 34 | ) 35 | 36 | npu_result = npu_sdpa.run(query.numpy(), key.numpy(), value.numpy(), mask.numpy()) 37 | 38 | ref_result = torch.nn.functional.scaled_dot_product_attention( 39 | query, 40 | key, 41 | value, 42 | None if is_causal else mask, 43 | dropout_p=0, 44 | is_causal=is_causal, 45 | scale=None, 46 | ) 47 | 48 | assert npu_result.shape == (1, heads, 1 if kv_cache else sequence, dim // heads) 49 | 50 | assert np.isfinite(npu_result).all() 51 | 52 | r2 = r2_score(ref_result.numpy().flatten(), npu_result.flatten()) 53 | 54 | assert 1 - r2 < 0.05 55 | 56 | 57 | @pytest.mark.parametrize("heads", [16, 32]) 58 | @pytest.mark.parametrize("sequence", [16, 32, 256, 512]) 59 | @pytest.mark.parametrize("dim", [512, 1024]) 60 | @pytest.mark.parametrize("kv_cache", [True, False]) 61 | @pytest.mark.parametrize("is_causal", [False, True]) 62 | @pytest.mark.parametrize("use_mask", [False, True]) 63 | def test_sdpa_runtime(heads, sequence, dim, kv_cache, is_causal, use_mask): 64 | 65 | min_value = torch.finfo(torch.float16).min 66 | 67 | query = torch.rand(1, heads, 1 if kv_cache else sequence, dim // heads).to( 68 | torch.float16 69 | ) 70 | key = torch.rand(1, heads, sequence, dim // heads).to(torch.float16) 71 | value = torch.rand(1, heads, sequence, dim // heads).to(torch.float16) 72 | if use_mask: 73 | mask = min_value * torch.ones( 74 | 1, heads, 1 if kv_cache else sequence, sequence 75 | ).to(torch.float16) 76 | mask = torch.triu(mask) 77 | else: 78 | mask = None 79 | 80 | npu_result = scaled_dot_product_attention( 81 | query, key, value, mask, is_causal=is_causal 82 | ) 83 | 84 | ref_result = torch.nn.functional.scaled_dot_product_attention( 85 | query, 86 | key, 87 | value, 88 | None if is_causal else mask, 89 | dropout_p=0, 90 | is_causal=is_causal, 91 | scale=None, 92 | ) 93 | 94 | assert npu_result.shape == (1, heads, 1 if kv_cache else sequence, dim // heads) 95 | 96 | assert np.isfinite(npu_result).all() 97 | 98 | r2 = r2_score(ref_result.numpy().flatten(), npu_result.numpy().flatten()) 99 | 100 | assert 1 - r2 < 0.05 101 | -------------------------------------------------------------------------------- /test/python/test_training.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2024 Intel Corporation 3 | # SPDX-License-Identifier: Apache 2.0 4 | # 5 | 6 | 7 | from sklearn.metrics import r2_score 8 | from intel_npu_acceleration_library import compile 9 | from 
intel_npu_acceleration_library.compiler import CompilerConfig 10 | import torch 11 | import pytest 12 | import copy 13 | 14 | # Three different sizes so that shape-mismatch errors are caught 15 | in_c = 128 16 | out_c = 512 17 | batch = 256 18 | 19 | 20 | class NN(torch.nn.Module): 21 | def __init__(self, inc, outc, bias) -> None: 22 | super().__init__() 23 | self.linear = torch.nn.Linear(inc, outc, bias=bias) 24 | self.linear2 = torch.nn.Linear(outc, inc, bias=bias) 25 | 26 | def forward(self, x): 27 | return self.linear2(torch.nn.functional.relu(self.linear(x))) 28 | 29 | 30 | @pytest.fixture 31 | def model_no_bias(): 32 | compiler_conf = CompilerConfig() 33 | return compile(NN(inc=in_c, outc=out_c, bias=False), compiler_conf) 34 | 35 | 36 | @pytest.fixture 37 | def model(): 38 | compiler_conf = CompilerConfig() 39 | return compile(NN(inc=in_c, outc=out_c, bias=True), compiler_conf) 40 | 41 | 42 | def test_parameters(model, model_no_bias): 43 | assert len(list(model.parameters())) == 4 44 | assert len(list(model_no_bias.parameters())) == 2 45 | 46 | 47 | def test_gradient(): 48 | 49 | npu_model = NN(inc=in_c, outc=out_c, bias=True).half() 50 | cpu_model = NN(inc=in_c, outc=out_c, bias=True).half() 51 | cpu_model.load_state_dict(copy.deepcopy(npu_model.state_dict())) 52 | 53 | # Compile one of the two models on the NPU 54 | compiler_conf = CompilerConfig(training=True) 55 | compile(npu_model, compiler_conf) 56 | 57 | x = torch.rand([batch, in_c]).half() 58 | yref = torch.rand([batch, in_c]).half() 59 | 60 | opt1 = torch.optim.SGD(npu_model.parameters(), lr=0.5) 61 | opt2 = torch.optim.SGD(cpu_model.parameters(), lr=0.5) 62 | 63 | for idx in range(100): 64 | 65 | # Check the parameters are the same 66 | for p1, p2 in zip(npu_model.parameters(), cpu_model.parameters()): 67 | assert p1.dtype == p2.dtype 68 | assert 1 - r2_score(p1.detach().numpy(), p2.detach().numpy()) < 0.001, idx 69 | 70 | opt1.zero_grad() 71 | opt2.zero_grad() 72 | 73 | y1 = npu_model(x) 74 | y2 = cpu_model(x) 75 | 76 | npu_loss = torch.mean((yref - y1) ** 2) 77 | npu_loss.backward() 78 | 79 | cpu_loss = torch.mean((yref - y2) ** 2) 80 | cpu_loss.backward() 81 | 82 | assert (torch.abs(npu_loss - cpu_loss) / cpu_loss).item() < 0.001 83 | 84 | opt1.step() 85 | opt2.step() 86 | --------------------------------------------------------------------------------
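A few example invocations of the profiling and quantization scripts shown above, assuming the package is installed and the commands are run from the repository root; the layer sizes are illustrative only, and the model name is the TinyLlama checkpoint already used by the tests:

python script/profile_matmul.py -b 128 -c 1024 -k 1024 --dtype int8
python script/profile_mlp.py --seq-len 128 --hidden-size 3072 --intermediate-size 8192 --dtype int4 --profile
python script/quantize_model.py -m TinyLlama/TinyLlama-1.1B-Chat-v1.0 -b 4 -a RTN -o models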
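The test files above exercise the high-level compile API in several configurations. The following is a minimal end-to-end sketch distilled from test_compile.py and test_quantization.py: the TinyNet module and its layer sizes are invented for illustration, while the imports, CompilerConfig, and compile() calls mirror what the tests use; running it assumes the library is installed (and an Intel NPU driver for actual offload).

import torch
import intel_npu_acceleration_library
from intel_npu_acceleration_library.compiler import CompilerConfig

class TinyNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = torch.nn.Linear(32, 128, bias=False)
        self.l2 = torch.nn.Linear(128, 32, bias=False)

    def forward(self, x):
        return self.l2(torch.nn.functional.relu(self.l1(x)))

# Compile the half-precision model for the NPU, quantizing weights to int8
model = TinyNet().half()
conf = CompilerConfig(dtype=torch.int8)  # torch.float16 or int4/int8 from .dtypes also appear in the tests
npu_model = intel_npu_acceleration_library.compile(model, conf)

# Inference with a float16 input, as in the tests
y = npu_model(torch.rand((16, 32), dtype=torch.float16))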
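One convention shared by the quantized-matmul tests (test_matmul.py, test_profiling.py, test_quantization.py) is worth calling out: quantize_tensor() returns an int8 weight tensor and a per-output-channel float16 scale such that W is approximately W_q * scale, but before the scale is handed to the NPU kernels it is multiplied by sqrt(inC) ("Adapt for numerically accurate qmatmul"). The sketch below only restates that convention; the sqrt(inC) factor is copied from the tests as-is, its internal justification is not stated in this dump, and QMatMul.run() requires a machine where the NPU backend is available.

import numpy as np
import torch
from intel_npu_acceleration_library.quantization import quantize_tensor
from intel_npu_acceleration_library.backend import QMatMul

inC, outC, batch = 512, 512, 16
X = np.random.uniform(-1, 1, (batch, inC)).astype(np.float16)
W = torch.rand((outC, inC), dtype=torch.float16)

# Per-channel symmetric quantization: W ~= W_q.to(float16) * scale, scale has shape (outC, 1)
W_q, scale = quantize_tensor(W)
y_ref = X @ (W_q.to(torch.float16) * scale).numpy().T

# The NPU kernel expects the scale pre-multiplied by sqrt(inC), as done in the tests
mm = QMatMul(inC, outC, batch)
y_npu = mm.run(X, W_q.numpy(), (scale * np.sqrt(inC)).numpy())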