├── .clang-format ├── .clang-tidy ├── .editorconfig ├── .github ├── license_template.txt ├── pull_request_template.md └── workflows │ ├── Scaner_BDBA.yml │ ├── Scaner_Trivil.yml │ ├── cpp-graph-test.yml │ ├── docker │ ├── codeScan.dockerfile │ └── devel.dockerfile │ ├── format_scan.yml │ ├── scripts │ ├── change_color.sh │ ├── formatScan │ │ ├── bandit.sh │ │ ├── clangformat.sh │ │ ├── clangtidy.sh │ │ ├── cloc.sh │ │ ├── hadolint.sh │ │ ├── nlp_dict.txt │ │ ├── pydocstyle.sh │ │ ├── pylint.sh │ │ └── trellix.sh │ ├── install_binary.sh │ ├── models │ │ ├── calculate_percertiles.py │ │ ├── cpp_graph_inference.sh │ │ └── generate_report.sh │ ├── prepare_env.sh │ ├── prepare_env_with_conda.bat │ └── prepare_env_with_conda.sh │ ├── trellix.yml │ ├── unit-test-bestla.yml │ ├── unit-test-llmruntime.yml │ ├── unitTest │ ├── env_setup.sh │ └── unittest_llmruntime.sh │ └── windows-test.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── CMakePresets.json ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── bestla ├── CMakeLists.txt ├── CMakePresets.json ├── README.md ├── bestla │ ├── bestla.h │ ├── bestla_device.h │ ├── bestla_epilogue.h │ ├── bestla_gemm.h │ ├── bestla_jit.h │ ├── bestla_parallel.h │ ├── bestla_prologue_a.h │ ├── bestla_prologue_b.h │ ├── bestla_storage.h │ ├── bestla_utils.h │ ├── bestla_wrapper.h │ ├── kernel_avx2.h │ ├── kernel_avx512_bf16.h │ ├── kernel_avx512_fp16.h │ ├── kernel_avx512_vnni.h │ ├── kernel_avx512f.h │ ├── kernel_avx_vnni.h │ ├── kernel_jit.h │ ├── kernel_jit_injector.h │ ├── kernel_ref.h │ ├── kernel_wrapper.h │ ├── sycl │ │ ├── sycl_device.h │ │ ├── sycl_epilogue.h │ │ ├── sycl_gemm.h │ │ ├── sycl_prologue_a.h │ │ ├── sycl_prologue_b.h │ │ ├── sycl_storage.h │ │ ├── sycl_utils.h │ │ └── sycl_wrapper.h │ └── ut │ │ ├── bestla.cpp │ │ ├── bestla_benchmark.cpp │ │ ├── bestla_epilogue.cpp │ │ ├── bestla_gemm.cpp │ │ ├── bestla_parallel.cpp │ │ ├── bestla_prologue_a.cpp │ │ ├── bestla_prologue_b.cpp │ │ ├── bestla_ut.cpp │ │ ├── bestla_ut.h │ │ ├── bestla_utils.cpp │ │ ├── bestla_wrapper.cpp │ │ ├── kernel_intrin.cpp │ │ ├── kernel_jit.cpp │ │ ├── kernel_ut.h │ │ ├── kernel_wrapper.cpp │ │ ├── sycl_benchmark.cpp │ │ ├── sycl_gemm.cpp │ │ ├── sycl_misc.cpp │ │ └── sycl_ut.h └── cmake │ ├── FindSIMD.cmake │ └── sycl.cmake ├── clang-format.py ├── developer_document.md ├── docker ├── DockerFile └── README.md ├── docs ├── advanced_usage.md ├── continuous_batching.md ├── customized_stop.md ├── fused_attention.md ├── gguf.md ├── gptq_and_awq.md ├── imgs │ ├── Attention.PNG │ ├── FFN.PNG │ ├── ORCA_batching.png │ ├── shift-rope.svg │ └── shuffle-attn.svg ├── infinite_inference.md ├── install.md ├── prompt_template.md ├── supported_models.md └── tensor_parallelism.md ├── neural_speed ├── CMakeLists.txt ├── __init__.py ├── application │ ├── CMakeLists.txt │ ├── audio_run.cpp │ ├── common.cpp │ ├── common.h │ ├── main_pybind.cpp │ ├── main_run.cpp │ ├── pybind_gptj.cpp │ ├── quant_model.cpp │ ├── quant_whisper.cpp │ └── whisper_pybind.cpp ├── cmake │ ├── ClangTidy.cmake │ ├── Common.cmake │ └── ISA.cmake ├── convert │ ├── __init__.py │ ├── common.py │ ├── convert-hf-to-gguf.py │ ├── convert_baichuan.py │ ├── convert_bloom.py │ ├── convert_chatglm.py │ ├── convert_dolly.py │ ├── convert_falcon.py │ ├── convert_gemma.py │ ├── convert_gptj.py │ ├── convert_gptneox.py │ ├── convert_grok.py │ ├── convert_llama.py │ ├── convert_mistral.py │ ├── convert_mixtral.py │ ├── convert_mpt.py │ ├── convert_opt.py │ ├── convert_phi.py │ 
├── convert_phi3.py │ ├── convert_quantized_baichuan.py │ ├── convert_quantized_falcon.py │ ├── convert_quantized_gptj.py │ ├── convert_quantized_llama.py │ ├── convert_quantized_mistral.py │ ├── convert_quantized_mixtral.py │ ├── convert_quantized_phi.py │ ├── convert_quantized_qwen.py │ ├── convert_qwen.py │ ├── convert_stablelm.py │ ├── convert_starcoder.py │ └── convert_whisper.py ├── core │ ├── CMakeLists.txt │ ├── README.md │ ├── data_types.h │ ├── layers │ │ ├── CMakeLists.txt │ │ ├── Ops.h │ │ ├── argsort.cpp │ │ ├── argsort.h │ │ ├── bestla_common.hpp │ │ ├── bestla_defs.h │ │ ├── bestla_gemm.cpp │ │ ├── bestla_gemm.h │ │ ├── conv.cpp │ │ ├── conv.h │ │ ├── ele_reduce.h │ │ ├── ele_wise.h │ │ ├── inner_product.cpp │ │ ├── ip_fusion_ffn.cpp │ │ ├── ip_fusion_qkv.cpp │ │ ├── layers.h │ │ ├── memory.cpp │ │ ├── memory.h │ │ ├── mha_dense.cpp │ │ ├── mha_dense.h │ │ ├── mha_dense_tests.cpp │ │ ├── mha_dense_wrapper.h │ │ ├── ne_bestla.cpp │ │ ├── ne_bestla_sycl.cpp │ │ ├── ne_test_layers_utils.hpp │ │ └── vec_dot.h │ ├── ne.h │ ├── ne_bestla.h │ ├── ne_layers.c │ ├── ne_layers.h │ ├── parallel_context.cpp │ ├── parallel_context.h │ └── shared_memory_ccl.hpp ├── models │ ├── CMakeLists.txt │ ├── baichuan │ │ ├── baichuan.cpp │ │ ├── baichuan.h │ │ └── baichuan_utils.cpp │ ├── bloom │ │ ├── bloom.cpp │ │ ├── bloom.h │ │ └── bloom_utils.cpp │ ├── chatglm │ │ ├── chatglm.cpp │ │ ├── chatglm.h │ │ ├── chatglm2.cpp │ │ ├── chatglm2.h │ │ ├── chatglm2_utils.cpp │ │ └── chatglm_utils.cpp │ ├── falcon │ │ ├── falcon.cpp │ │ ├── falcon.h │ │ └── falcon_utils.cpp │ ├── gemma │ │ ├── gemma.cpp │ │ ├── gemma.h │ │ └── gemma_utils.cpp │ ├── gptj │ │ ├── gptj.cpp │ │ ├── gptj.h │ │ └── gptj_utils.cpp │ ├── gptneox │ │ ├── gptneox.cpp │ │ ├── gptneox.h │ │ └── gptneox_utils.cpp │ ├── grok │ │ ├── grok.cpp │ │ ├── grok.h │ │ └── grok_utils.cpp │ ├── llama │ │ ├── llama.cpp │ │ ├── llama.h │ │ └── llama_utils.cpp │ ├── model_utils │ │ ├── CMakeLists.txt │ │ ├── arg_parse.cpp │ │ ├── gguf.h │ │ ├── model_config.h │ │ ├── model_files.h │ │ ├── model_types.h │ │ ├── model_utils.cpp │ │ ├── model_utils.h │ │ ├── pool.cpp │ │ ├── pool.h │ │ ├── quant_config.h │ │ ├── quant_utils.cpp │ │ ├── quant_utils.h │ │ ├── scheduler.cpp │ │ ├── scheduler.h │ │ ├── util.cpp │ │ └── util.h │ ├── models.h │ ├── mpt │ │ ├── mpt.cpp │ │ ├── mpt.h │ │ └── mpt_utils.cpp │ ├── opt │ │ ├── opt.cpp │ │ ├── opt.h │ │ └── opt_utils.cpp │ ├── phi │ │ ├── phi.cpp │ │ ├── phi.h │ │ ├── phi3.cpp │ │ ├── phi3.h │ │ ├── phi3_utils.cpp │ │ └── phi_utils.cpp │ ├── qwen │ │ ├── qwen.cpp │ │ ├── qwen.h │ │ └── qwen_utils.cpp │ ├── requirements │ │ ├── baichuan.sh │ │ ├── baichuan13b-gptq.sh │ │ ├── chatglm-6b.sh │ │ ├── common.txt │ │ ├── mistral.sh │ │ └── mixtral-gptq.sh │ ├── stablelm │ │ ├── stablelm.cpp │ │ ├── stablelm.h │ │ └── stablelm_utils.cpp │ ├── starcoder │ │ ├── starcoder.cpp │ │ ├── starcoder.h │ │ └── starcoder_utils.cpp │ └── whisper │ │ ├── dr_wav.h │ │ ├── whisper.cpp │ │ ├── whisper.h │ │ └── whisper_utils.cpp └── vectors │ ├── CMakeLists.txt │ ├── cpu │ ├── CMakeLists.txt │ ├── quantize.h │ ├── simd.h │ ├── vec.hpp │ ├── vec.hpp.gch │ ├── vec_arithmetic.cpp │ ├── vec_arithmetic.hpp │ ├── vec_base.hpp │ ├── vec_compare.cpp │ ├── vec_compare.hpp │ ├── vec_convert.cpp │ ├── vec_convert.hpp │ ├── vec_load.cpp │ ├── vec_load.hpp │ ├── vec_set.cpp │ ├── vec_set.hpp │ ├── vec_store.cpp │ └── vec_store.hpp │ ├── ele_reduce.cpp │ ├── ele_reduce.h │ ├── fp16.h │ ├── gpu │ ├── CMakeLists.txt │ ├── ele_reduce.cpp │ ├── 
ele_wise.cpp │ ├── reduce.h │ ├── test.cpp │ ├── vector_func.h │ └── vector_kernel.h │ └── parallel_for.h ├── requirements.txt ├── scripts ├── accuracy.py ├── cal_acc.py ├── cal_diff.py ├── convert.py ├── huggingface.py ├── inference.py ├── load_peft_and_merge.py ├── ns_evaluator.py ├── perplexity.py ├── python_api_example.py ├── python_api_example_for_gguf.py ├── python_api_example_for_gptq.py ├── python_api_example_for_model_server.py ├── python_api_example_for_modelscope.py ├── quantize.py ├── requirements.txt ├── run.py └── whisper_example.py ├── security.md ├── setup.py └── tests ├── model-test ├── calculate_percentiles.py ├── cpp_graph_inference.sh ├── cpp_graph_prompts.json └── run_tp.sh ├── requirements.txt ├── test_model_server.py ├── test_modelscope.py └── test_python_api.py /.clang-format: -------------------------------------------------------------------------------- 1 | Language: Cpp 2 | BasedOnStyle: Google 3 | DerivePointerAlignment: false 4 | ColumnLimit: 120 5 | SpaceBeforeParens: ControlStatements 6 | SpaceBeforeRangeBasedForLoopColon: true 7 | SortIncludes: false 8 | -------------------------------------------------------------------------------- /.clang-tidy: -------------------------------------------------------------------------------- 1 | Checks: > 2 | -*, 3 | readability-identifier-naming, 4 | readability-const-return-type, 5 | readability-redundant-smartptr-get, 6 | readability-misleading-indentation, 7 | readability-redundant-control-flow, 8 | readability-redundant-member-init, 9 | readability-redundant-string-cstr, 10 | readability-redundant-string-init, 11 | readability-simplify-subscript-expr, 12 | readability-static-accessed-through-instance, 13 | readability-static-definition-in-anonymous-namespace, 14 | readability-uniqueptr-delete-release, 15 | readability-container-size-empty, 16 | # readability-delete-null-pointer, // not applicable for gcc/msvc 17 | readability-make-member-function-const, 18 | readability-redundant-access-specifiers, 19 | performance-for-range-copy, 20 | performance-implicit-conversion-in-loop, 21 | performance-inefficient-algorithm, 22 | performance-inefficient-string-concatenation, 23 | performance-inefficient-vector-operation, 24 | performance-move-const-arg, 25 | performance-unnecessary-copy-initialization, 26 | performance-unnecessary-value-param, 27 | performance-no-automatic-move, 28 | performance-trivially-destructible, 29 | modernize-make-shared, 30 | modernize-use-bool-literals, 31 | modernize-use-emplace, 32 | modernize-use-equals-default, 33 | modernize-use-override, 34 | modernize-use-nullptr, 35 | modernize-use-using, 36 | bugprone-assert-side-effect, 37 | bugprone-copy-constructor-init, 38 | bugprone-forward-declaration-namespace, 39 | bugprone-move-forwarding-reference, 40 | bugprone-parent-virtual-call, 41 | bugprone-too-small-loop-variable, 42 | bugprone-undefined-memory-manipulation, 43 | bugprone-unhandled-self-assignment, 44 | bugprone-multiple-statement-macro, 45 | bugprone-macro-parentheses, 46 | # google-default-arguments, 47 | misc-misplaced-const, 48 | misc-definitions-in-headers, 49 | misc-redundant-expression, 50 | misc-uniqueptr-reset-release, 51 | misc-unused-alias-decls, 52 | misc-unused-using-decls, 53 | cppcoreguidelines-prefer-member-initializer, 54 | 55 | CheckOptions: 56 | - key: readability-identifier-naming.ClassCase 57 | value: lower_case 58 | - key: readability-identifier-naming.StructCase 59 | value: lower_case
60 | - key: readability-identifier-naming.ClassSuffix 61 | value: _t 62 | - key: readability-identifier-naming.StructSuffix 63 | value: _t 64 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | indent_style = space 6 | indent_size = 2 7 | end_of_line = lf 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | 11 | [*.py] 12 | indent_size = 4 13 | -------------------------------------------------------------------------------- /.github/license_template.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Intel Corporation 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Type of Change 2 | 3 | feature or bug fix or documentation or others 4 | API changed or not 5 | 6 | ## Description 7 | 8 | detailed description 9 | Issues: xxx 10 | 11 | ## Expected Behavior & Potential Risk 12 | 13 | the expected behavior triggered by this PR 14 | 15 | ## How has this PR been tested? 16 | 17 | how to reproduce the test (including hardware information) 18 | 19 | ## Dependency Change?
20 | 21 | any library dependency introduced or removed 22 | -------------------------------------------------------------------------------- /.github/workflows/Scaner_BDBA.yml: -------------------------------------------------------------------------------- 1 | name: Scanner BDBA 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | permissions: write-all 7 | jobs: 8 | bdba_job: 9 | name: BDBA Scan 10 | uses: intel-innersource/frameworks.ai.infrastructure.code-scan-tools/.github/workflows/Scanner_Bdba.yml@one-ci-cd 11 | with: 12 | repos: ${{ github.event.repository.name }} 13 | refs: ${{ github.ref_name }} 14 | group: "22" 15 | runners: "['inner-source']" 16 | secrets: 17 | token: ${{ secrets.GITHUB_TOKEN }} 18 | BDBA_TOKEN: ${{ secrets.BDBA_TOKEN }} 19 | -------------------------------------------------------------------------------- /.github/workflows/Scaner_Trivil.yml: -------------------------------------------------------------------------------- 1 | name: Trivy Scan for Containers 2 | 3 | on: 4 | workflow_dispatch: 5 | permissions: write-all 6 | jobs: 7 | trivy_container_job: 8 | uses: "intel-innersource/frameworks.ai.infrastructure.code-scan-tools/.github/workflows/Scanner_Trivy.yml@one-ci-cd" 9 | with: 10 | container: ${{ vars.TRIVY_CONTAINER_NAME }} 11 | runners: "['inner-source']" 12 | lmc: false 13 | -------------------------------------------------------------------------------- /.github/workflows/docker/codeScan.dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | ARG UBUNTU_VER=22.04 17 | FROM ubuntu:${UBUNTU_VER} as devel 18 | 19 | # See http://bugs.python.org/issue19846 20 | ENV LANG C.UTF-8 21 | 22 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 23 | aspell \ 24 | aspell-en \ 25 | python3 \ 26 | python3-pip \ 27 | python3-dev \ 28 | python3-distutils \ 29 | build-essential \ 30 | cloc \ 31 | python3.10-venv \ 32 | git 33 | 34 | RUN ln -sf $(which python3) /usr/bin/python 35 | 36 | RUN python -m pip install --no-cache-dir pylint==2.17.5\ 37 | bandit==1.7.4\ 38 | pyspelling\ 39 | pydocstyle 40 | 41 | WORKDIR / 42 | -------------------------------------------------------------------------------- /.github/workflows/docker/devel.dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ARG UBUNTU_VER=22.04 16 | FROM ubuntu:${UBUNTU_VER} as devel 17 | 18 | # See http://bugs.python.org/issue19846 19 | ENV LANG C.UTF-8 20 | 21 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 22 | python3 \ 23 | python3-pip \ 24 | python3-dev \ 25 | python3-distutils \ 26 | autoconf \ 27 | build-essential \ 28 | git \ 29 | libgl1-mesa-glx \ 30 | libglib2.0-0 \ 31 | numactl \ 32 | time \ 33 | wget \ 34 | bc \ 35 | gawk \ 36 | jq \ 37 | python3.10-venv \ 38 | vim 39 | 40 | RUN ln -sf $(which python3) /usr/bin/python 41 | 42 | RUN python -m pip --no-cache-dir install --upgrade pip 43 | RUN python -m pip install --no-cache-dir setuptools 44 | 45 | RUN pip list 46 | 47 | WORKDIR / 48 | 49 | -------------------------------------------------------------------------------- /.github/workflows/format_scan.yml: -------------------------------------------------------------------------------- 1 | name: Format Scan 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | paths: 7 | - neural_speed/** 8 | - bestla/** 9 | - scripts/** 10 | - clang-format.py 11 | - setup.py 12 | - .github/workflows/format_scan.yml 13 | - .github/workflows/scripts/formatScan/** 14 | - "!bestla/*.md" 15 | workflow_dispatch: 16 | 17 | # If there is a new commit, the previous jobs will be canceled 18 | concurrency: 19 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 20 | cancel-in-progress: true 21 | 22 | env: 23 | DOCKER_CONFIG_NAME: "commonDockerConfig" 24 | REPO_NAME: "code-scan" 25 | REPO_TAG: "1.0" 26 | DOCKER_FILE_NAME: "codeScan" 27 | CONTAINER_NAME: "codeScan" 28 | 29 | jobs: 30 | format-scan: 31 | runs-on: ubuntu-latest 32 | strategy: 33 | matrix: 34 | job_name: [ 35 | "pylint", 36 | "bandit", 37 | "clangformat", 38 | "cloc", 39 | "clangtidy", 40 | # "pydocstyle", 41 | #"pyspelling", 42 | "hadolint" 43 | ] 44 | fail-fast: false 45 | steps: 46 | - name: Docker Clean Up 47 | run: | 48 | docker ps -a 49 | if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}'$) ]]; then 50 | docker start ${{ env.CONTAINER_NAME }} 51 | echo "remove left files through container ..." 52 | docker exec ${{ env.CONTAINER_NAME }} bash -c "ls -a /neural-speed && rm -fr /neural-speed/* && rm -fr /neural-speed/.* || true" 53 | fi 54 | 55 | - name: Checkout out Repo 56 | uses: actions/checkout@v3 57 | 58 | - name: Docker Build 59 | run: | 60 | docker build -f ${{ github.workspace }}/.github/workflows/docker/${{ env.DOCKER_FILE_NAME }}.dockerfile -t ${{ env.REPO_NAME }}:${{ env.REPO_TAG }} . 
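# the image built above is tagged code-scan:1.0 (REPO_NAME:REPO_TAG from the env block) and is reused by the Docker Run step below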
61 | 62 | - name: Docker Run 63 | run: | 64 | if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}'$) ]]; then 65 | docker stop ${{ env.CONTAINER_NAME }} 66 | docker rm -vf ${{ env.CONTAINER_NAME }} || true 67 | fi 68 | docker run -dit --memory="4g" --memory-reservation="1g" --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }} --shm-size="1g" \ 69 | -v ${{ github.workspace }}:/neural-speed \ 70 | ${{ env.REPO_NAME }}:${{ env.REPO_TAG }} 71 | 72 | - name: Env build 73 | run: | 74 | docker exec ${{ env.CONTAINER_NAME }} \ 75 | bash /neural-speed/.github/workflows/scripts/prepare_env.sh 76 | 77 | - name: Code scan check 78 | run: | 79 | docker exec ${{ env.CONTAINER_NAME }} \ 80 | bash -c "bash /neural-speed/.github/workflows/scripts/formatScan/${{ matrix.job_name }}.sh" 81 | 82 | - name: Publish pipeline artifact 83 | if: ${{ !cancelled() }} 84 | uses: actions/upload-artifact@v3 85 | with: 86 | name: ${{ matrix.job_name }} 87 | path: ${{ github.workspace }}/.github/workflows/scripts/formatScan/${{ matrix.job_name }}.* 88 | -------------------------------------------------------------------------------- /.github/workflows/scripts/change_color.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -------------- general approach start---------------- 4 | 5 | # 1. import this file: 6 | # source path/change_color.sh 7 | # 2. use COLOR/BG: 8 | # $VARIABLE_NAME && output_content && $RESET 9 | # 3. COLOR + BG: 10 | # $COLOR/BG_VARIABLE_NAME && $BG/COLOR_VARIABLE_NAME && output_content && $RESET 11 | # 4. custom 12 | # build your own escape code (change the numbers) 13 | # txt number range (30, 37) 14 | # bg number range (40, 47) 15 | # special effects number range (1, 7) 16 | # echo -en "\\E[number1;number2;number3m" 17 | # e.g. BG_GRAY+LIGHT_RED = "echo -en \\E[47;31m" 18 | 19 | # -------------- general approach end---------------- 20 | 21 | # general setting 22 | # ------------- light_color start---------------- 23 | # black 24 | LIGHT_BLACK="echo -en \\E[30m" 25 | # red 26 | LIGHT_RED="echo -en \\E[31m" 27 | # green 28 | LIGHT_GREEN="echo -en \\E[32m" 29 | # yellow 30 | LIGHT_YELLOW="echo -en \\E[33m" 31 | # blue 32 | LIGHT_BLUE="echo -en \\E[34m" 33 | # purple 34 | LIGHT_PURPLE="echo -en \\E[35m" 35 | # cyan 36 | LIGHT_CYAN="echo -en \\E[36m" 37 | # gray 38 | LIGHT_GRAY="echo -en \\E[37m" 39 | # ------------- light_color end---------------- 40 | 41 | # ------------- bold_color start---------------- 42 | # black 43 | BOLD_BLACK="echo -en \\E[1;30m" 44 | # red 45 | BOLD_RED="echo -en \\E[1;31m" 46 | # green 47 | BOLD_GREEN="echo -en \\E[1;32m" 48 | # yellow 49 | BOLD_YELLOW="echo -en \\E[1;33m" 50 | # blue 51 | BOLD_BLUE="echo -en \\E[1;34m" 52 | # purple 53 | BOLD_PURPLE="echo -en \\E[1;35m" 54 | # cyan 55 | BOLD_CYAN="echo -en \\E[1;36m" 56 | # gray 57 | BOLD_GRAY="echo -en \\E[1;37m" 58 | # ------------- bold_color end---------------- 59 | 60 | # ------------- background_color start---------------- 61 | # black 62 | BG_BLACK="echo -en \\E[40m" 63 | # red 64 | BG_RED="echo -en \\E[41m" 65 | # green 66 | BG_GREEN="echo -en \\E[42m" 67 | # yellow 68 | BG_YELLOW="echo -en \\E[43m" 69 | # blue 70 | BG_BLUE="echo -en \\E[44m" 71 | # purple 72 | BG_PURPLE="echo -en \\E[45m" 73 | # cyan 74 | BG_CYAN="echo -en \\E[46m" 75 | # gray 76 | BG_GRAY="echo -en \\E[47m" 77 | # ------------- background_color end---------------- 78 | 79 | # close 80 | RESET="echo -en \\E[0m" 81 |
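change_color.sh above is sourced by the formatScan scripts that follow; a minimal usage sketch, using only the variables the script itself defines:

```bash
#!/bin/bash
source .github/workflows/scripts/change_color.sh
# each variable holds an `echo -en` command, so "executing" it emits the escape code
$BOLD_RED && echo "scan failed" && $RESET               # bold red text, then restore defaults
$BG_GRAY && $LIGHT_RED && echo "highlighted" && $RESET  # background + foreground combined
```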
-------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/bandit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source /neural-speed/.github/workflows/scripts/change_color.sh 3 | pip install bandit==1.7.4 4 | log_dir=/neural-speed/.github/workflows/scripts/formatScan 5 | python -m bandit -r -lll -iii /neural-speed >${log_dir}/bandit.log 6 | exit_code=$? 7 | 8 | $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" 9 | cat ${log_dir}/bandit.log 10 | $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET 11 | 12 | if [ ${exit_code} -ne 0 ]; then 13 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view Bandit error details." && $RESET 14 | exit 1 15 | fi 16 | 17 | $BOLD_PURPLE && echo "Congratulations, Bandit check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." && $RESET 18 | exit 0 19 | -------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/clangformat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source /neural-speed/.github/workflows/scripts/change_color.sh 3 | 4 | pip install clang-format==14.0.0 5 | log_dir=/neural-speed/.github/workflows/scripts/formatScan 6 | log_path=${log_dir}/clangformat.log 7 | 8 | cd /neural-speed 9 | git config --global --add safe.directory "*" 10 | 11 | python clang-format.py 12 | 13 | echo "run git diff" 14 | git diff 2>&1 | tee -a ${log_path} 15 | 16 | if [[ ! -f ${log_path} ]] || [[ $(grep -c "diff" ${log_path}) != 0 ]]; then 17 | exit 1 18 | fi 19 | $BOLD_PURPLE && echo "Congratulations, check passed!" && $LIGHT_PURPLE && echo "You can click on the artifact button to see the log details." && $RESET 20 | exit 0 21 | -------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/clangtidy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /neural-speed/.github/workflows/scripts/change_color.sh 4 | 5 | pip install cmake ninja clang-tidy==16.0.4 6 | REPO_DIR=/neural-speed 7 | log_dir=/neural-speed/.github/workflows/scripts/formatScan 8 | log_path=${log_dir}/clangtidy.log 9 | 10 | # compile binary 11 | cd ${REPO_DIR} 12 | mkdir build 13 | cd build 14 | cmake .. -G Ninja -DNS_USE_CLANG_TIDY=CHECK -DBTLA_ENABLE_OPENMP=OFF -DNS_USE_OMP=OFF 15 | ninja 2>&1 | tee ${log_path} 16 | 17 | if [[ ! -f ${log_path} ]] || [[ $(grep -c "warning:" ${log_path}) != 0 ]] || [[ $(grep -c "error" ${log_path}) != 0 ]]; then 18 | exit 1 19 | fi 20 | $BOLD_PURPLE && echo "Congratulations, check passed!" && $LIGHT_PURPLE && echo "You can click on the artifact button to see the log details."
&& $RESET 21 | exit 0 22 | -------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/cloc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /neural-speed/.github/workflows/scripts/change_color.sh 4 | log_dir=/neural-speed/.github/workflows/scripts/formatScan 5 | cloc --include-lang=Python --csv --out=${log_dir}/cloc.csv /neural-speed 6 | -------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/hadolint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2024 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | source /neural-speed/.github/workflows/scripts/change_color.sh 17 | log_dir=/neural-speed/.github/workflows/scripts/formatScan 18 | 19 | find . -type f \( -name "Dockerfile*" \) -print -exec hadolint --ignore DL3006 --ignore DL3007 --ignore DL3008 {} \; 2>&1 | tee ${log_dir}/hadolint.log 20 | 21 | if [[ $(grep -c "error" ${log_dir}/hadolint.log) != 0 ]]; then 22 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and check error details." && $RESET 23 | exit 1 24 | fi 25 | 26 | $BOLD_PURPLE && echo "Congratulations, Hadolint check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." && $RESET 27 | exit 0 -------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/nlp_dict.txt: -------------------------------------------------------------------------------- 1 | aadd 2 | aas 3 | alse 4 | ans 5 | bu 6 | charactor 7 | daa 8 | datas 9 | dota 10 | dout 11 | endianess 12 | fo 13 | followings 14 | haa 15 | inout 16 | iterm 17 | mata 18 | matc 19 | mone 20 | nd 21 | ore 22 | ot 23 | parm 24 | ques 25 | rouge 26 | ser 27 | sie 28 | te 29 | tne 30 | tye 31 | ue 32 | wya 33 | -------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/pydocstyle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /neural-speed/.github/workflows/scripts/change_color.sh 4 | 5 | REPO_DIR=/neural-speed 6 | log_dir=/neural-speed/.github/workflows/scripts/formatScan 7 | pydocstyle --convention=google ${REPO_DIR} >${log_dir}/pydocstyle.log 8 | exit_code=$? 
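# $? must be read on the very next line: the log-printing commands below would overwrite the pydocstyle exit status before the final check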
9 | 10 | $BOLD_YELLOW && echo " ----------------- Current pydocstyle cmd start --------------------------" && $RESET 11 | echo "pydocstyle --convention=google ${REPO_DIR} >${log_dir}/pydocstyle.log" 12 | $BOLD_YELLOW && echo " ----------------- Current pydocstyle cmd end --------------------------" && $RESET 13 | 14 | $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" 15 | cat $log_dir/pydocstyle.log 16 | $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET 17 | 18 | if [ ${exit_code} -ne 0 ]; then 19 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view error details." && $RESET 20 | exit 1 21 | fi 22 | 23 | $BOLD_PURPLE && echo "Congratulations, check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." && $RESET 24 | exit 0 25 | -------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/pylint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /neural-speed/.github/workflows/scripts/change_color.sh 4 | cd /neural-speed 5 | $BOLD_YELLOW && echo "---------------- git submodule update --init --recursive -------------" && $RESET 6 | git config --global --add safe.directory "*" 7 | git submodule update --init --recursive 8 | 9 | $BOLD_YELLOW && echo "---------------- install NeuralSpeed -------------" && $RESET 10 | export PYTHONPATH=`pwd` 11 | pip list 12 | 13 | 14 | cd /neural-speed 15 | log_dir=/neural-speed/.github/workflows/scripts/formatScan 16 | if [ -f "requirements.txt" ]; then 17 | python -m pip install --default-timeout=100 -r requirements.txt 18 | pip list 19 | else 20 | echo "Not found requirements.txt file." 21 | fi 22 | 23 | echo "[DEBUG] list pipdeptree..." 24 | pip install pipdeptree 25 | pipdeptree 26 | 27 | python -m pylint -f json --disable=R,C,W,E1129 \ 28 | --enable=line-too-long \ 29 | --max-line-length=120 \ 30 | --disable=no-name-in-module,import-error,no-member,undefined-variable,no-value-for-parameter,unexpected-keyword-arg,not-callable,no-self-argument,too-many-format-args,invalid-unary-operand-type,too-many-function-args \ 31 | --extension-pkg-whitelist=numpy,nltk \ 32 | --ignored-classes=TensorProto,NodeProto \ 33 | --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py,cv2,PIL.Image \ 34 | /neural-speed/neural_speed >${log_dir}/pylint.json 35 | exit_code=$? 36 | 37 | $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" && $RESET 38 | cat ${log_dir}/pylint.json 39 | $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET 40 | 41 | if [ ${exit_code} -ne 0 ]; then 42 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and view Pylint error details." && $RESET 43 | exit 1 44 | fi 45 | $BOLD_PURPLE && echo "Congratulations, Pylint check passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." 
&& $RESET 46 | exit 0 47 | -------------------------------------------------------------------------------- /.github/workflows/scripts/formatScan/trellix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2024 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | source /neural-speed/.github/workflows/scripts/change_color.sh 17 | log_dir=/neural-speed/.github/workflows/scripts/formatScan 18 | 19 | 20 | echo "---Updating definition (DAT) files ---" 21 | DEFS_URL=https://update.nai.com/products/commonupdater/current/vscandat1000/dat/0000 22 | echo "Finding latest defs at $DEFS_URL/avvdat.ini..." \ 23 | && wget -q $DEFS_URL/avvdat.ini \ 24 | && echo "SUCCESS" || exit 1 25 | 26 | inifile="avvdat.ini" 27 | filename=`awk -F"=" '$2 ~ /avvdat.*zip/ { print $2 } ' $inifile` 28 | filename2="$(echo -e "${filename}" | tr -d '[:space:]')" 29 | 30 | if [ -z "$filename2" ] 31 | then 32 | echo "Cannot get defs information from INI file:" 33 | cat $inifile 34 | exit 1 35 | fi 36 | 37 | echo "Downloading latest defs from $DEFS_URL/$filename2..." \ 38 | && wget -q $DEFS_URL/$filename2 \ 39 | && echo "SUCCESS" || exit 1 40 | 41 | echo "Extracting latest defs..." \ 42 | && unzip -o $filename2 -d /usr/local/uvscan \ 43 | && echo "SUCCESS" || exit 1 44 | 45 | echo "--- Scanning ---" 46 | ENV_SCAN_OPTS="--analyze --mime --program --recursive --unzip --threads 4 --summary --verbose --html=${workspace}/.github/workflows/scripts/formatScan/report.html" 47 | echo "Scan Options: $ENV_SCAN_OPTS" 48 | 49 | rm -r ${workspace}/avvdat* 50 | rm -r ${workspace}/.git 51 | uvscan $ENV_SCAN_OPTS ${workspace} 2>&1 | tee ${log_dir}/trellix.log 52 | 53 | 54 | if [[ $(grep "Possibly Infected" ${log_dir}/trellix.log | sed 's/[^0-9]//g') != 0 ]]; then 55 | $BOLD_RED && echo "Error!! Please Click on the artifact button to download and check error details." && $RESET 56 | exit 1 57 | fi 58 | 59 | $BOLD_PURPLE && echo "Congratulations, Trellix Scan passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details."
&& $RESET 60 | exit 0 -------------------------------------------------------------------------------- /.github/workflows/scripts/install_binary.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source /neural-speed/.github/workflows/scripts/change_color.sh 3 | 4 | cd /neural-speed 5 | $BOLD_YELLOW && echo "---------------- git submodule update --init --recursive -------------" && $RESET 6 | git config --global --add safe.directory "*" 7 | git submodule update --init --recursive 8 | 9 | 10 | $BOLD_YELLOW && echo "---------------- run python setup.py sdist bdist_wheel -------------" && $RESET 11 | python setup.py bdist_wheel 12 | 13 | $BOLD_YELLOW && echo "---------------- pip install binary -------------" && $RESET 14 | pip install dist/neural_speed*.whl 15 | pip list 16 | -------------------------------------------------------------------------------- /.github/workflows/scripts/prepare_env.sh: -------------------------------------------------------------------------------- 1 | cd /neural-speed 2 | 3 | pip install -U pip 4 | 5 | if [ -f "requirements.txt" ]; then 6 | python -m pip install --default-timeout=100 -r requirements.txt 7 | pip list 8 | else 9 | echo "Not found requirements.txt file." 10 | fi 11 | -------------------------------------------------------------------------------- /.github/workflows/scripts/prepare_env_with_conda.bat: -------------------------------------------------------------------------------- 1 | SET conda_env_name=windows_build_ns 2 | SET python_version=3.10 3 | cd ../../.. 4 | 5 | FOR /F %%i IN ('conda info -e ^| find /c "%conda_env_name%"') do SET CONDA_COUNT=%%i 6 | if %CONDA_COUNT% EQU 0 ( 7 | CALL conda create python=%python_version% -y -n %conda_env_name% 8 | ) 9 | 10 | IF %ERRORLEVEL% NEQ 0 ( 11 | echo "Could not create new conda environment." 12 | exit 1 13 | ) 14 | CALL conda activate %conda_env_name% 15 | CALL pip uninstall neural-speed -y 16 | echo "pip list all the components------------->" 17 | CALL pip list 18 | CALL pip config set global.proxy proxy-prc.intel.com:913 19 | CALL pip install -U pip 20 | echo "Installing requirements for validation scripts..." 21 | CALL pip install -i https://pypi.python.org/simple setuptools_scm 22 | CALL pip install -r requirements.txt 23 | echo "pip list all the components------------->" 24 | CALL pip list 25 | echo "------------------------------------------" 26 | IF %ERRORLEVEL% NEQ 0 ( 27 | echo "Could not install requirements." 28 | exit 1 29 | ) 30 | 31 | git submodule update --init --recursive 32 | python setup.py sdist bdist_wheel 33 | IF %ERRORLEVEL% NEQ 0 ( 34 | echo "Could not build binary." 35 | exit 1 36 | ) 37 | -------------------------------------------------------------------------------- /.github/workflows/scripts/prepare_env_with_conda.sh: -------------------------------------------------------------------------------- 1 | cd ${WORKING_DIR} 2 | conda_env_name=$1 3 | python_version=$2 4 | if [[ -z "${conda_env_name}" ]] || [[ -z "${python_version}" ]]; then 5 | $BOLD_RED && echo "need provide with conda env name and python version" && $RESET 6 | exit 1 7 | fi 8 | source ~/.bashrc 9 | conda create -n ${conda_env_name} python=${python_version} -y 10 | source activate ${conda_env_name} || conda activate ${conda_env_name} 11 | pip install -U pip 12 | 13 | if [ -f "requirements.txt" ]; then 14 | python -m pip install --default-timeout=100 -r requirements.txt 15 | pip list 16 | else 17 | echo "Not found requirements.txt file." 
18 | fi 19 | -------------------------------------------------------------------------------- /.github/workflows/trellix.yml: -------------------------------------------------------------------------------- 1 | name: Trellix Command Line Scanner 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | permissions: write-all 7 | jobs: 8 | Trellix: 9 | runs-on: inner-source 10 | steps: 11 | - name: Clean Up Working Directory 12 | run: sudo rm -rf ${{github.workspace}}/* 13 | 14 | - name: Checkout out Repo 15 | uses: actions/checkout@v4 16 | 17 | - name: Run Trellix Scanner 18 | env: 19 | workspace: ${{ github.workspace }} 20 | run: bash .github/workflows/scripts/formatScan/trellix.sh 21 | 22 | - name: Publish pipeline artifact 23 | if: ${{ !cancelled() }} 24 | uses: actions/upload-artifact@v4 25 | with: 26 | path: ${{ github.workspace }}/.github/workflows/scripts/formatScan/report.html 27 | -------------------------------------------------------------------------------- /.github/workflows/unit-test-bestla.yml: -------------------------------------------------------------------------------- 1 | name: Bestla Unit Test 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | paths: 7 | - bestla/** 8 | - .github/workflows/unit-test-bestla.yml 9 | - '!bestla/README.md' 10 | workflow_dispatch: 11 | inputs: 12 | compiler_version: 13 | description: 'compiler_version' 14 | required: false 15 | type: string 16 | default: '13.2.0' 17 | 18 | # If there is a new commit, the previous jobs will be canceled 19 | concurrency: 20 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 21 | cancel-in-progress: true 22 | 23 | env: 24 | INPUT_COMPILER_VERSION: ${{ inputs.compiler_version || '13.2.0' }} 25 | WORKING_DIR: ${{ github.workspace }} 26 | CONTAINER_NAME: "utTest" 27 | 28 | jobs: 29 | unit-test: 30 | runs-on: [self-hosted, linux, X64, spr] 31 | steps: 32 | - name: Docker Clean Up 33 | run: | 34 | if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}-${{ runner.name }}'$) ]]; then 35 | docker start ${{ env.CONTAINER_NAME }}-${{ runner.name }} 36 | echo "remove left files through container ..." 37 | docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} bash -c "ls -a /neural-speed && rm -fr /neural-speed/* && rm -fr /neural-speed/.* || true" 38 | fi 39 | - name: Checkout out Repo 40 | uses: actions/checkout@v3 41 | with: 42 | submodules: "recursive" 43 | fetch-tags: true 44 | 45 | - name: Env build 46 | run: | 47 | echo "set up conda env for bestla unit test" 48 | source ~/.bashrc 49 | bash ${{ github.workspace }}/.github/workflows/scripts/prepare_env_with_conda.sh "unit-test-bestla" "3.10" 50 | conda activate unit-test-bestla || source activate unit-test-bestla 51 | conda install --update-deps -c conda-forge gxx==${{ env.INPUT_COMPILER_VERSION }} gcc==${{ env.INPUT_COMPILER_VERSION }} gxx_linux-64==${{ env.INPUT_COMPILER_VERSION }} libstdcxx-ng sysroot_linux-64 -y 52 | 53 | - name: Run UT 54 | run: | 55 | #source /opt/rh/gcc-toolset-12/enable 56 | source ~/.bashrc 57 | conda activate unit-test-bestla || source activate unit-test-bestla 58 | export LD_LIBRARY_PATH=${HOME}/miniforge/envs/unit-test-bestla/lib/:$LD_LIBRARY_PATH 59 | cd ${{ github.workspace }}/bestla && mkdir build && cd build && cmake ..
-DBTLA_UT_ALL=ON && make -j 60 | ./bestla_ut 2>&1 | tee unit_test_bestla.log 61 | 62 | - name: Check Result 63 | run: | 64 | if [[ $(grep -c "No such file or directory" ${{ github.workspace }}/bestla/build/unit_test_bestla.log) != 0 ]]; then 65 | echo "neural-speed Compile Failed" 66 | exit 1 67 | fi 68 | if [[ $(grep -c "Case Failed" ${{ github.workspace }}/bestla/build/unit_test_bestla.log) != 0 ]]; then 69 | echo "UT Failed! Please check UT log." 70 | exit 1 71 | fi 72 | 73 | - name: Publish pipeline artifact 74 | uses: actions/upload-artifact@v3 75 | if: ${{ !cancelled() }} 76 | with: 77 | name: Bestla Unit Test 78 | path: ${{ github.workspace }}/bestla/build/unit_test*.* 79 | -------------------------------------------------------------------------------- /.github/workflows/unit-test-llmruntime.yml: -------------------------------------------------------------------------------- 1 | name: Python Unit Test 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | paths: 7 | - neural_speed/** 8 | - bestla/** 9 | - tests/** 10 | - .github/workflows/unit-test-llmruntime.yml 11 | - .github/workflows/unitTest/** 12 | - 'CMakeLists.txt' 13 | - 'setup.py' 14 | - '!**/*.md' 15 | workflow_dispatch: 16 | 17 | # If there is a new commit, the previous jobs will be canceled 18 | concurrency: 19 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 20 | cancel-in-progress: true 21 | 22 | env: 23 | DOCKER_CONFIG_NAME: "commonDockerConfig" 24 | REPO_NAME: "neural-speed" 25 | REPO_TAG: "py39" 26 | DOCKER_FILE_NAME: "devel" 27 | CONTAINER_NAME: "utTest" 28 | 29 | jobs: 30 | unit-test: 31 | runs-on: [self-hosted, linux, X64, llmruntime-node] 32 | steps: 33 | - name: Load environment variables 34 | run: cat ~/actions-runner3/.env >> $GITHUB_ENV 35 | 36 | - name: Docker Clean Up 37 | run: | 38 | docker ps -a 39 | if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}-${{ runner.name }}'$) ]]; then 40 | docker start ${{ env.CONTAINER_NAME }}-${{ runner.name }} 41 | echo "remove left files through container ..." 42 | docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} bash -c "ls -a /neural-speed && rm -fr /neural-speed/* && rm -fr /neural-speed/.* || true" 43 | fi 44 | 45 | - name: Checkout out Repo 46 | uses: actions/checkout@v3 47 | with: 48 | submodules: "recursive" 49 | fetch-tags: true 50 | 51 | - name: Docker Build 52 | run: | 53 | docker build -f ${{ github.workspace }}/.github/workflows/docker/${{ env.DOCKER_FILE_NAME }}.dockerfile --build-arg http_proxy="${{ env.HTTP_PROXY }}" --build-arg https_proxy="${{ env.HTTPS_PROXY }}" -t ${{ env.REPO_NAME }}:${{ env.REPO_TAG }} . 
54 | 55 | - name: Docker Run 56 | run: | 57 | if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}-${{ runner.name }}'$) ]]; then 58 | docker stop ${{ env.CONTAINER_NAME }}-${{ runner.name }} 59 | docker rm -vf ${{ env.CONTAINER_NAME }}-${{ runner.name }} || true 60 | fi 61 | docker run -dit --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }}-${{ runner.name }} -v /dev/shm:/dev/shm \ 62 | -e http_proxy="${{ env.HTTP_PROXY }}" \ 63 | -e https_proxy="${{ env.HTTPS_PROXY }}" \ 64 | -v ${{ github.workspace }}:/neural-speed \ 65 | -v /tf_dataset2:/tf_dataset2 \ 66 | -v ~/.cache/oneAPI:/cache \ 67 | ${{ env.REPO_NAME }}:${{ env.REPO_TAG }} 68 | 69 | - name: Env build 70 | run: | 71 | docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} \ 72 | bash /neural-speed/.github/workflows/scripts/prepare_env.sh 73 | 74 | - name: Binary build 75 | run: | 76 | docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} \ 77 | bash -c "cd /neural-speed/.github/workflows/scripts \ 78 | && bash install_binary.sh" 79 | 80 | - name: Run UT 81 | run: | 82 | docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} \ 83 | bash -c "cd /neural-speed/.github/workflows/unitTest \ 84 | && bash unittest_llmruntime.sh" 85 | 86 | - name: Publish pipeline artifact 87 | uses: actions/upload-artifact@v3 88 | if: ${{ !cancelled() }} 89 | with: 90 | name: Python Unit Test 91 | path: ${{ github.workspace }}/log_dir/unit_test*.* 92 | -------------------------------------------------------------------------------- /.github/workflows/unitTest/env_setup.sh: -------------------------------------------------------------------------------- 1 | pip list 2 | 3 | # Install test requirements 4 | echo "Install Tests Requirements" 5 | cd $1 || exit 1 6 | pwd 7 | if [ -f "requirements.txt" ]; then 8 | python -m pip install --default-timeout=100 -r requirements.txt 9 | pip list 10 | else 11 | echo "Not found requirements.txt file." 12 | fi 13 | 14 | pip install coverage 15 | pip install pytest 16 | -------------------------------------------------------------------------------- /.github/workflows/unitTest/unittest_llmruntime.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source /neural-speed/.github/workflows/scripts/change_color.sh 3 | test_install_backend="true" 4 | LOG_DIR=/neural-speed/log_dir 5 | mkdir -p ${LOG_DIR} 6 | WORKING_DIR="/neural-speed/tests" 7 | 8 | # -------------------LLM Runtime Test------------------- 9 | function llmruntime_test() { 10 | cd ${WORKING_DIR} 11 | local ut_log_name=${LOG_DIR}/unit_test_llm_runtime.log 12 | find . -name "test*.py" | sed 's,\.\/,python ,g' | sed 's/$/ --verbose/' >run.sh 13 | # run UT 14 | $BOLD_YELLOW && echo "cat run.sh..." && $RESET 15 | cat run.sh | tee ${ut_log_name} 16 | $BOLD_YELLOW && echo "------UT start-------" && $RESET 17 | bash run.sh 2>&1 | tee -a ${ut_log_name} 18 | $BOLD_YELLOW && echo "------UT end -------" && $RESET 19 | 20 | if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || 21 | [ $(grep -c "OK" ${ut_log_name}) == 0 ] || 22 | [ $(grep -c "Segmentation fault" ${ut_log_name}) != 0 ] || 23 | [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] || 24 | [ $(grep -c "==ERROR:" ${ut_log_name}) != 0 ] || 25 | [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ]; then 26 | $BOLD_RED && echo "Find errors in engine test, please check the output..." && $RESET 27 | exit 1 28 | else 29 | $BOLD_GREEN && echo "engine test finished successfully!" 
&& $RESET 30 | fi 31 | } 32 | 33 | function main() { 34 | bash /neural-speed/.github/workflows/unitTest/env_setup.sh "${WORKING_DIR}" 35 | llmruntime_test 36 | } 37 | 38 | main 39 | -------------------------------------------------------------------------------- /.github/workflows/windows-test.yml: -------------------------------------------------------------------------------- 1 | name: Windows Binary Test 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | paths: 7 | - ".github/workflows/windows-test.yml" 8 | - "requirements.txt" 9 | - "setup.py" 10 | - "neural_speed/**" 11 | - "bestla/**" 12 | - '!bestla/ut/**' 13 | - '!bestla/xbyak/**' 14 | - '!bestla/xbyak/*.md' 15 | - '!neural_speed/*.md' 16 | 17 | workflow_dispatch: 18 | 19 | # If there is a new commit, the previous jobs will be canceled 20 | concurrency: 21 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 22 | cancel-in-progress: true 23 | 24 | env: 25 | SCRIPT_PATH: ${{ github.workspace }}\.github\workflows\scripts 26 | WORKING_DIR: ${{ github.workspace }} 27 | 28 | jobs: 29 | Windows-Binary-Test: 30 | runs-on: 'Windows' 31 | steps: 32 | - name: Checkout out Repo 33 | uses: actions/checkout@v4 34 | with: 35 | submodules: "recursive" 36 | fetch-tags: true 37 | path: "a" 38 | 39 | - name: Binary build 40 | shell: cmd 41 | run: | 42 | SET HTTP_PROXY=http://proxy-dmz.intel.com:912 43 | SET HTTPS_PROXY=http://proxy-dmz.intel.com:912 44 | SET http_proxy=http://proxy-dmz.intel.com:912 45 | SET https_proxy=http://proxy-dmz.intel.com:912 46 | cd ${{ github.workspace }}\a\.github\workflows\scripts 47 | prepare_env_with_conda.bat 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### ignore binary files in llm-runtime ### 2 | /neural_speed/* 3 | !/neural_speed/*.* 4 | !/neural_speed/*/ 5 | 6 | *.exe 7 | *.dll 8 | *.dylib 9 | *.pyd 10 | *.so 11 | *.so.* 12 | 13 | 14 | .vs 15 | .vscode 16 | /out 17 | __pycache__ 18 | neural_speed.egg-info/ 19 | build 20 | runtime_outs 21 | out 22 | debug/ 23 | .eggs/ 24 | dist/ 25 | .cache/ 26 | .clangd 27 | CMakeUserPresets.json 28 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/pybind11"] 2 | path = third_party/pybind11 3 | url = https://github.com/pybind/pybind11.git 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autofix_prs: true 3 | autoupdate_schedule: quarterly 4 | 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v4.5.0 8 | hooks: 9 | - id: debug-statements 10 | - id: file-contents-sorter 11 | files: | 12 | (?x)^( 13 | .github/workflows/scripts/formatScan/nlp_dict.txt 14 | )$ 15 | args: [--unique] 16 | - id: end-of-file-fixer 17 | files: (.*\.(py|md|rst|yaml|yml))$ 18 | exclude: | 19 | (?x)^( 20 | third-party/.+ 21 | )$ 22 | - id: check-json 23 | - id: check-yaml 24 | exclude: | 25 | (?x)^( 26 | third-party/.+ 27 | )$ 28 | - id: requirements-txt-fixer 29 | exclude: | 30 | (?x)^( 31 | third-party/.+ 32 | )$ 33 | - id: trailing-whitespace 34 | files: (.*\.(py|rst|cmake|yaml|yml))$ 35 | exclude: | 36 | (?x)^( 37 | third-party/.+ 38 | )$ 39 | 40 | - repo: 
https://github.com/codespell-project/codespell 41 | rev: v2.2.6 42 | hooks: 43 | - id: codespell 44 | args: 45 | [-w, --ignore-words=.github/workflows/scripts/formatScan/nlp_dict.txt] 46 | exclude: | 47 | (?x)^( 48 | .+.po|.+.ts|.+.js|.+.map|.+.js.map|.+.css.map| 49 | .github/workflows/scripts/formatScan/nlp_dict.txt| 50 | tests/model-test/cpp_graph_prompts.json 51 | )$ 52 | 53 | - repo: https://github.com/Lucas-C/pre-commit-hooks 54 | rev: v1.5.5 55 | hooks: 56 | - id: insert-license 57 | files: | 58 | (?x)^( 59 | neural_speed/.*(py|yaml|yml|sh)| 60 | bestla/.*(py|yaml|yml|sh)| 61 | tests/.*(py|yaml|yml|sh) 62 | )$ 63 | args: 64 | [ 65 | --license-filepath=.github/license_template.txt, 66 | --use-current-year, 67 | --detect-license-in-X-top-lines=40, 68 | --skip-license-insertion-comment=Copyright, 69 | ] 70 | # - repo: https://github.com/asottile/yesqa 71 | # rev: v1.5.0 72 | # hooks: 73 | # - id: yesqa 74 | # name: Unused noqa 75 | # 76 | # - repo: https://github.com/pycqa/isort 77 | # rev: 5.13.2 78 | # hooks: 79 | # - id: isort 80 | # exclude: | 81 | # (?x)^( 82 | # examples/.+ 83 | # )$ 84 | # 85 | # - repo: https://github.com/PyCQA/docformatter 86 | # rev: v1.7.5 87 | # hooks: 88 | # - id: docformatter 89 | # args: [ 90 | # --in-place, 91 | # --wrap-summaries=0, # 0 means disable wrap 92 | # --wrap-descriptions=0, # 0 means disable wrap 93 | # --black, 94 | # --style=google, 95 | # ] 96 | # exclude: | 97 | # (?x)^( 98 | # examples/.+ 99 | # )$ 100 | # 101 | # - repo: https://github.com/psf/black.git 102 | # rev: 23.12.1 103 | # hooks: 104 | # - id: black 105 | # files: (.*\.py)$ 106 | # exclude: | 107 | # (?x)^( 108 | # examples/.+ 109 | # )$ 110 | # 111 | # - repo: https://github.com/asottile/blacken-docs 112 | # rev: 1.16.0 113 | # hooks: 114 | # - id: blacken-docs 115 | # args: [--line-length=120, --skip-errors] 116 | # exclude: | 117 | # (?x)^( 118 | # examples/.+| 119 | # docs/source-app 120 | # )$ 121 | # 122 | # - repo: https://github.com/astral-sh/ruff-pre-commit 123 | # rev: v0.1.9 124 | # hooks: 125 | # - id: ruff 126 | # args: [--fix, --exit-non-zero-on-fix, --no-cache] 127 | # exclude: | 128 | # (?x)^( 129 | # examples/.+ 130 | # )$ 131 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ### License 4 | 5 | Neural Speed is licensed under the terms in [LICENSE]. By contributing to the project, you agree to the license and copyright terms therein and release your contribution under these terms. 6 | 7 | ### Sign your work 8 | 9 | Please use the sign-off line at the end of the patch. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ``` 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 17 | 660 York Street, Suite 102, 18 | San Francisco, CA 94110 USA 19 | 20 | Everyone is permitted to copy and distribute verbatim copies of this 21 | license document, but changing it is not allowed.
22 | 23 | Developer's Certificate of Origin 1.1 24 | 25 | By making a contribution to this project, I certify that: 26 | 27 | (a) The contribution was created in whole or in part by me and I 28 | have the right to submit it under the open source license 29 | indicated in the file; or 30 | 31 | (b) The contribution is based upon previous work that, to the best 32 | of my knowledge, is covered under an appropriate open source 33 | license and I have the right under that license to submit that 34 | work with modifications, whether created in whole or in part 35 | by me, under the same open source license (unless I am 36 | permitted to submit under a different license), as indicated 37 | in the file; or 38 | 39 | (c) The contribution was provided directly to me by some other 40 | person who certified (a), (b) or (c) and I have not modified 41 | it. 42 | 43 | (d) I understand and agree that this project and the contribution 44 | are public and that a record of the contribution (including all 45 | personal information I submit with it, including my sign-off) is 46 | maintained indefinitely and may be redistributed consistent with 47 | this project or the open source license(s) involved. 48 | ``` 49 | 50 | Then you just add a line to every git commit message: 51 | 52 | Signed-off-by: Joe Smith <joe.smith@email.com> 53 | 54 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 55 | 56 | If you set your `user.name` and `user.email` git configs, you can sign your 57 | commit automatically with `git commit -s`. 58 | -------------------------------------------------------------------------------- /bestla/bestla/bestla.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
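// bestla.h: BesTLA's core public definitions: the BTLA_CODE status codes, the BTLA_ISA dispatch levels, the bit-packed BTLA_DTYPE element-type encoding, and the BTLA_PROLOGUEB_IDS weight-format identifiers.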
14 | #pragma once 15 | #include <stdint.h> 16 | enum class BTLA_CODE { 17 | Success = 0, 18 | InvalidParam = 1, 19 | InvalidISA = 2, 20 | RuntimeError = 4, 21 | NotSupport = 8, 22 | }; 23 | enum class BTLA_ISA : uint8_t { 24 | NoSIMD = 0, 25 | AVX, 26 | AVX2, 27 | AVX_VNNI, 28 | AVX512F, 29 | AVX512BW, 30 | AVX512_VNNI, 31 | AVX512_BF16, 32 | AVX512_FP16, 33 | AMX_BF16, 34 | AMX_INT8, 35 | AMX_FP16, 36 | ISA_COUNT, 37 | }; 38 | enum class BTLA_DTYPE : uint32_t { 39 | EleBitsMask = 0xff, 40 | EleBitsShift = 0, 41 | EleBitsUndef = 0, 42 | EleBits1 = 1, 43 | EleBits2 = 2, 44 | EleBits3 = 3, 45 | EleBits4 = 4, 46 | EleBits5 = 5, 47 | EleBits6 = 6, 48 | EleBits7 = 7, 49 | EleBits8 = 8, 50 | EleBits16 = 16, 51 | EleBits32 = 32, 52 | EleBits64 = 64, 53 | TypeMask = 0xff00, 54 | TypeShift = 8, 55 | TypeFloat = 0 << TypeShift, 56 | TypeInt = 1 << TypeShift, 57 | SubTypeMask = 0xff0000, 58 | SubTypeShift = 16, 59 | SubType0 = 0 << SubTypeShift, 60 | SubType1 = 1 << SubTypeShift, 61 | SubType2 = 2 << SubTypeShift, 62 | SubType3 = 3 << SubTypeShift, 63 | SubType4 = 4 << SubTypeShift, 64 | F64 = EleBits64 | TypeFloat, 65 | F32 = EleBits32 | TypeFloat, 66 | F16 = EleBits16 | TypeFloat, 67 | BF16 = EleBits16 | TypeFloat | SubType1, 68 | F8_E4M3 = EleBits8 | TypeFloat, 69 | F8_E5M2 = EleBits8 | TypeFloat | SubType1, 70 | F8_E3M4 = EleBits8 | TypeFloat | SubType2, 71 | F8_E8M0 = EleBits8 | TypeFloat | SubType3, 72 | DQ8_BNB = EleBits8 | TypeFloat | SubType4, 73 | S8 = EleBits8 | TypeInt, 74 | U8 = EleBits8 | TypeInt | SubType1, 75 | S1_CLIP = EleBits1 | TypeInt, 76 | S2_CLIP = EleBits2 | TypeInt, 77 | S3_CLIP = EleBits3 | TypeInt, 78 | S4_CLIP = EleBits4 | TypeInt, 79 | S5_CLIP = EleBits5 | TypeInt, 80 | S6_CLIP = EleBits6 | TypeInt, 81 | S7_CLIP = EleBits7 | TypeInt, 82 | F4_E2M1 = EleBits4 | TypeFloat, 83 | F4_BNB = EleBits4 | TypeFloat | SubType1, 84 | F4_NF4 = EleBits4 | TypeFloat | SubType2, 85 | S32 = EleBits32 | TypeInt, 86 | U32 = EleBits32 | TypeInt | SubType1, 87 | }; 88 | 89 | enum class BTLA_ELTWISEOP { GELU, SWISH, TANH, EXP, LOW_PRECISION_EXP, RELU, LINEAR }; 90 | 91 | enum class BTLA_PROLOGUEB_IDS : uint32_t { 92 | Undef = (uint32_t)-1, 93 | Begin = 0, 94 | NormalBegin = Begin, 95 | WeightPack = NormalBegin, 96 | NormalEnd, 97 | KBlockBegin = NormalEnd, 98 | WeightKBlockNInteger = KBlockBegin, 99 | WeightKBlockNFloat, 100 | KBlockEnd, 101 | End, 102 | }; 103 | -------------------------------------------------------------------------------- /bestla/bestla/sycl/sycl_epilogue.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
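A note on the BTLA_DTYPE values defined in bestla.h above: each enumerator packs the element bit-width (bits 0-7), the base type (bits 8-15), and a sub-type tag (bits 16-23) into one uint32_t, so the masks and shifts in the enum recover each field. A minimal decode sketch (the `field` helper is illustrative, not part of the library):

```cpp
#include <cstdint>
// assumes bestla.h (above) is in scope; dt is shorthand for its enum
using dt = BTLA_DTYPE;
constexpr uint32_t field(dt v, dt mask, dt shift) {
  return (static_cast<uint32_t>(v) & static_cast<uint32_t>(mask)) >> static_cast<uint32_t>(shift);
}
// BF16 = EleBits16 | TypeFloat | SubType1, so:
static_assert(field(dt::BF16, dt::EleBitsMask, dt::EleBitsShift) == 16, "16-bit elements");
static_assert(field(dt::BF16, dt::TypeMask, dt::TypeShift) == 0, "TypeFloat encodes as 0");
static_assert(field(dt::BF16, dt::SubTypeMask, dt::SubTypeShift) == 1, "SubType1 distinguishes bf16 from fp16");
```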
14 | #pragma once 15 | 16 | #ifdef BTLA_SYCL 17 | #include 18 | 19 | #include "sycl_utils.h" 20 | 21 | namespace bestla { 22 | namespace sycl_epilogue { 23 | template <typename DstT> 24 | struct ParamOutputBase { 25 | DstT* C; 26 | int ldc; 27 | }; 28 | template <class GemmCoreT, typename DstT> 29 | class OutputBase { 30 | public: 31 | using CType = typename GemmCoreT::TACC; 32 | using DstType = DstT; 33 | using Param = ParamOutputBase<DstType>; 34 | static inline void store(const Param& _param, CType* tmpAcc, const sycl_utils::nd_item_helper<GemmCoreT>& helper) { 35 | #pragma unroll 36 | for (int im = 0; im < GemmCoreT::TileM; im++) { 37 | #pragma unroll 38 | for (int in = 0; in < GemmCoreT::TileN; in++) { 39 | _param.C[(helper.item_g_m() + im) * _param.ldc + helper.item_g_n() + in] = tmpAcc[im * GemmCoreT::TileN + in]; 40 | } 41 | } 42 | } 43 | 44 | static inline void store_tail(const Param& _param, CType* tmpAcc, const sycl_utils::nd_item_helper<GemmCoreT>& helper, 45 | int m_tail) { 46 | if (m_tail) { 47 | for (int im = 0; im < m_tail; im++) { 48 | #pragma unroll 49 | for (int in = 0; in < GemmCoreT::TileN; in++) { 50 | _param.C[(helper.item_g_m() + im) * _param.ldc + helper.item_g_n() + in] = tmpAcc[im * GemmCoreT::TileN + in]; 51 | } 52 | } 53 | } 54 | } 55 | }; 56 | 57 | } // namespace sycl_epilogue 58 | } // namespace bestla 59 | #endif 60 | -------------------------------------------------------------------------------- /bestla/bestla/sycl/sycl_prologue_a.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | #pragma once 15 | 16 | #ifdef BTLA_SYCL 17 | #include 18 | 19 | #include "bestla/bestla_utils.h" 20 | #include 21 | 22 | namespace bestla { 23 | namespace sycl_prologue_a { 24 | 25 | template <typename SrcT> 26 | struct ParamActivationBase { 27 | const SrcT* A; 28 | int lda; 29 | }; 30 | template <class GemmCoreT, typename SrcT> 31 | class ActivationBase { 32 | public: 33 | using AType = typename GemmCoreT::TA; 34 | using SrcType = SrcT; 35 | using Param = ParamActivationBase<SrcType>; 36 | static inline void getActivation(const Param& _param, AType* aptr, sycl_utils::nd_item_helper<GemmCoreT>& helper) {} 37 | }; 38 | 39 | } // namespace sycl_prologue_a 40 | } // namespace bestla 41 | #endif 42 | -------------------------------------------------------------------------------- /bestla/bestla/ut/bestla.cpp: -------------------------------------------------------------------------------- 1 | #include "../bestla.h" 2 | -------------------------------------------------------------------------------- /bestla/bestla/ut/bestla_ut.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() { 5 | printf("BesTLA UT done\n"); 6 | return 0; 7 | } 8 | -------------------------------------------------------------------------------- /bestla/bestla/ut/bestla_utils.cpp: -------------------------------------------------------------------------------- 1 | 2 | namespace bestla { 3 | namespace ut {} // namespace ut 4 | } // namespace bestla 5 | -------------------------------------------------------------------------------- /bestla/bestla/ut/kernel_ut.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "bestla_ut.h" 3 | #include "kernel_ref.h" 4 | 5 | #ifdef _MSC_VER 6 | #define __PRETTY_FUNCTION__ __FUNCSIG__ 7 | #endif 8 | -------------------------------------------------------------------------------- /bestla/bestla/ut/sycl_ut.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "sycl/sycl_device.h" 4 | 5 | namespace bestla { 6 | namespace sycl_ut { 7 | 8 | class UT_Device { 9 | public: 10 | static bestla::sycl_device::SyclDevice* get() { 11 | static bestla::sycl_device::SyclDevice Instance(true); 12 | return &Instance; 13 | } 14 | }; 15 | }; // namespace sycl_ut 16 | } // namespace bestla 17 | -------------------------------------------------------------------------------- /bestla/cmake/sycl.cmake: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.23) 2 | 3 | find_package(IntelSYCL REQUIRED) 4 | -------------------------------------------------------------------------------- /clang-format.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
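# Usage sketch (mirrors the defaults set in __main__ below):
#   python clang-format.py --dirs neural_speed bestla
# Requires clang-format (clang-format.exe on Windows) on PATH; '--style=file'
# picks up the repository's .clang-format configuration.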
14 | 15 | import cmd 16 | import os 17 | import platform 18 | import sys 19 | import glob 20 | import argparse 21 | import fnmatch 22 | import subprocess 23 | 24 | ProjectEXT = ['h', 'hpp', 'c', 'cpp'] 25 | 26 | 27 | def glob_files(dirs): 28 | files = [] 29 | for directory in dirs: 30 | for root, _, filenames in os.walk(directory): 31 | for ext in ProjectEXT: 32 | for filename in fnmatch.filter(filenames, '*.' + ext): 33 | files.append(os.path.join(root, filename)) 34 | return files 35 | 36 | 37 | if sys.platform == "linux": 38 | ClangBin = 'clang-format' 39 | elif sys.platform == 'win32': 40 | ClangBin = 'clang-format.exe' 41 | 42 | 43 | def clang_format_dir(args): 44 | files = glob_files(args.dirs) 45 | for file in files: 46 | cmds = [ClangBin, '-i', '--style=file', file] 47 | subprocess.run(cmds, check=True) 48 | 49 | 50 | def parse_args(argv=None): 51 | if argv is None: 52 | argv = sys.argv 53 | parser = argparse.ArgumentParser(description='Recursively clang-format') 54 | parser.add_argument('--dirs', nargs='+', help='paths to clang-format') 55 | args = parser.parse_args(argv[1:]) 56 | if not args.dirs: 57 | sys.exit(-1) 58 | return args 59 | 60 | 61 | if __name__ == '__main__': 62 | if len(sys.argv) == 1: 63 | args = parse_args(['', '--dirs', 'neural_speed', 'bestla']) 64 | else: 65 | args = parse_args() 66 | clang_format_dir(args) 67 | -------------------------------------------------------------------------------- /docker/DockerFile: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2023 Intel Corporation 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | 4 | ARG IMAGE_NAME=ubuntu 5 | ARG IMAGE_TAG=22.04 6 | FROM ${IMAGE_NAME}:${IMAGE_TAG} as base 7 | 8 | FROM base as neural-speed 9 | 10 | ARG PYTHON=python3.10 11 | 12 | ENV DEBIAN_FRONTEND=noninteractive 13 | 14 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 15 | libgl1-mesa-glx \ 16 | libglib2.0-0 \ 17 | ${PYTHON} \ 18 | python3-pip 19 | 20 | RUN ln -sf $(which ${PYTHON}) /usr/bin/python 21 | 22 | RUN ${PYTHON} -m pip install -U pip 23 | 24 | FROM neural-speed as devel 25 | 26 | ENV DEBIAN_FRONTEND=noninteractive 27 | 28 | ENV LANG C.UTF-8 29 | ARG PYTHON=python3.10 30 | 31 | RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \ 32 | autoconf \ 33 | build-essential \ 34 | ca-certificates \ 35 | cmake \ 36 | git \ 37 | gcc g++ make 38 | 39 | RUN mkdir -p /neural_speed 40 | WORKDIR /neural_speed 41 | COPY . /neural_speed 42 | 43 | RUN pip install cmake ninja psutil && \ 44 | cd /neural_speed && \ 45 | git submodule update --init --recursive && \ 46 | mkdir -p build && cd build && cmake .. -G Ninja && ninja && cd .. && \ 47 | pip install -r requirements.txt 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Docker 2 | Follow these instructions to set up and run our provided Docker image. 3 | 4 | ## Set Up Docker Image 5 | Build or Pull the provided docker images. 6 | 7 | ### Build Docker Image 8 | ```bash 9 | git clone https://github.com/intel/neural-speed.git neuralspeed 10 | cd neuralspeed 11 | docker build -f docker/DockerFile -t neuralspeed:latest . 
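# The base image is parameterized in docker/DockerFile (ARG IMAGE_NAME=ubuntu, ARG IMAGE_TAG=22.04),
# so you can build against another base, e.g. (assuming the requested tag provides the packages the
# DockerFile installs):
docker build --build-arg IMAGE_TAG=20.04 -f docker/DockerFile -t neuralspeed:20.04 .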
12 | ``` 13 | If you need to use a proxy, please use the following command: 14 | ```bash 15 | docker build --build-arg http_proxy=${http_proxy} --build-arg https_proxy=${https_proxy} -f docker/DockerFile -t neuralspeed:latest . 16 | ``` 17 | 18 | ### Pull From Docker Hub 19 | 20 | 21 | ## Use Docker Image 22 | Start a container from the image and attach to it. 23 | ```bash 24 | docker run -itd --name="neural-speed-docker" neuralspeed:latest /bin/bash 25 | docker exec -it neural-speed-docker /bin/bash 26 | ``` 27 | 28 | ## Run Simple Test 29 | ```bash 30 | docker exec -it neural-speed-docker /bin/bash 31 | cd /neural_speed 32 | ## convert to model.bin 33 | python scripts/convert.py --outtype f32 --outfile llama-fp32.bin ${input_model_path} 34 | ## quantize to Q4 with groupsize=128 35 | ./build/bin/quant_llama --model_file llama-fp32.bin --out_file llama-q4_j_i8_g128.bin --weight_dtype int4 --group_size 128 --compute_dtype int8 --scale_dtype fp32 --alg sym 36 | ## inference 37 | ./build/bin/run_llama --seed 1234 -b 2047 -c 64 -n 32 -m llama-q4_j_i8_g128.bin -p "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun" 38 | ``` 39 | -------------------------------------------------------------------------------- /docs/customized_stop.md: -------------------------------------------------------------------------------- 1 | You can customize the stopping criteria according to your own needs by processing the input_ids to determine if text generation needs to be stopped. 2 | Here is a simple example, which requires a minimum generation length of 80 tokens. Once the `min_length` is met, encountering a terminator `eos_token_id` will end the generation. 3 | 4 | ```python 5 | import torch 6 | from typing import List 7 | from transformers import StoppingCriteria, StoppingCriteriaList 8 | 9 | class StopOnTokens(StoppingCriteria): 10 | def __init__(self, min_length: int, start_length: int, stop_token_id: List[int]): 11 | self.min_length = min_length 12 | self.start_length = start_length 13 | self.stop_token_id = stop_token_id 14 | 15 | def __call__( 16 | self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs 17 | ) -> bool: 18 | if input_ids.shape[-1] - self.start_length > self.min_length: 19 | for stop_id in self.stop_token_id: 20 | if input_ids[0][input_ids.shape[-1] - 1] == stop_id: 21 | return True 22 | return False 23 | 24 | stopping_criteria = StoppingCriteriaList( 25 | [ 26 | StopOnTokens( 27 | min_length=80, 28 | start_length=inputs.shape[1], 29 | stop_token_id=[tokenizer.eos_token_id], 30 | ) 31 | ] 32 | ) 33 | 34 | outputs = model.generate(inputs, streamer=streamer, stopping_criteria=stopping_criteria) 35 | ``` 36 | -------------------------------------------------------------------------------- /docs/gguf.md: -------------------------------------------------------------------------------- 1 | GGUF 2 | ======= 3 | 4 | Neural Speed also supports GGUF models generated by [llama.cpp](https://github.com/ggerganov/llama.cpp); download the model and use llama.cpp to create the GGUF file first. 5 | 6 | Validated models: [llama2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), [falcon-7b](https://huggingface.co/tiiuae/falcon-7b), [falcon-40b](https://huggingface.co/tiiuae/falcon-40b), [mpt-7b](https://huggingface.co/mosaicml/mpt-7b), [mpt-40b](https://huggingface.co/mosaicml/mpt-40b) and [bloom-7b1](https://huggingface.co/bigscience/bloomz-7b1).
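As noted above, the GGUF file itself is produced with llama.cpp; a conversion sketch (the converter script name and flags vary across llama.cpp versions, so treat the exact command as an assumption):

```bash
# Hypothetical llama.cpp conversion step; check your llama.cpp checkout for the exact
# script name (convert.py in older releases, convert_hf_to_gguf.py in newer ones).
python convert_hf_to_gguf.py /model_path/Llama-2-7b-chat-hf --outtype f32
```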
7 | 8 | Please check the [list](./supported_models.md) for more validated GGUF models from Hugging Face. 9 | 10 | ## Examples 11 | 12 | How to create the GGUF file in Neural Speed: 13 | ```bash 14 | # Example: 15 | # please provide the local model path as the arg, 16 | # which means you need to `git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf` first. 17 | python neural_speed/convert/convert-hf-to-gguf.py /model_path/Llama-2-7b-chat-hf/ 18 | 19 | ``` 20 | 21 | How to load the GGUF bin file in Neural Speed: 22 | 23 | ```python 24 | prompt = "Once upon a time" 25 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) 26 | inputs = tokenizer(prompt, return_tensors="pt").input_ids 27 | streamer = TextStreamer(tokenizer) 28 | 29 | model = Model() 30 | model.init_from_bin(args.model_name, gguf_path) 31 | outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True) 32 | 33 | # Please check this script for more details and input parameters. 34 | # python scripts/python_api_example_for_gguf.py --model_name falcon --model_path /home/model/falcon-7b -m /home/model/falcon-7b/ggml-model-f32.gguf 35 | ``` 36 | 37 | Note: These GGUF models can be accelerated by [Neural Speed BesTLA](https://github.com/intel/neural-speed/blob/c0312283f528d4a9ffebc283cd0f15a7a8eabf1a/bestla/README.md#L1). 38 | 39 | How to accelerate GGUF models with BesTLA: 40 | ```bash 41 | # quantize first, then re-run the python_api_example_for_gguf.py step above 42 | ./build/bin/quant_falcon --model_file /home/model/falcon-7b/ggml-model-f32.gguf --out_file ne-falcon-q4_j.bin --weight_dtype int4 --compute_dtype int8 43 | 44 | python scripts/python_api_example_for_gguf.py --model_name falcon --model_path /home/model/falcon-7b -m ne-falcon-q4_j.bin 45 | ``` 46 | 47 | How to load the GGUF bin file in [intel-extension-for-transformers](https://github.com/intel/intel-extension-for-transformers/pull/1151): 48 | ```python 49 | from transformers import AutoTokenizer, TextStreamer 50 | from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig 51 | 52 | # Specify the GGUF repo on Hugging Face 53 | model_name = "TheBloke/Llama-2-7B-Chat-GGUF" 54 | # Download the specific GGUF model file from the above repo 55 | gguf_file = "llama-2-7b-chat.Q4_0.gguf" 56 | # Make sure you have been granted access to this model on Hugging Face. 57 | tokenizer_name = "meta-llama/Llama-2-7b-chat-hf" 58 | 59 | prompt = "Once upon a time" 60 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True) 61 | inputs = tokenizer(prompt, return_tensors="pt").input_ids 62 | streamer = TextStreamer(tokenizer) 63 | model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=gguf_file) 64 | outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300) 65 | ``` 66 | -------------------------------------------------------------------------------- /docs/gptq_and_awq.md: -------------------------------------------------------------------------------- 1 | GPTQ & AWQ 2 | ======= 3 | 4 | Neural Speed supports multiple weight-only quantization algorithms, such as GPTQ and AWQ. 5 | 6 | For more algorithm details, please check [GPTQ](https://arxiv.org/abs/2210.17323) and [AWQ](https://arxiv.org/abs/2306.00978).
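Such pre-quantized checkpoints can also be converted ahead of time through `neural_speed.convert.convert_model` with `use_quantized_model=True`, which dispatches to the matching `convert_quantized_*.py` script (a sketch; the local checkpoint path and output file name are assumptions):

```python
from neural_speed.convert import convert_model

# Assumed local path to a GPTQ checkpoint downloaded from the list below.
convert_model("./Llama-2-7B-Chat-GPTQ", "llama-2-7b-chat-gptq.bin", outtype="f32", use_quantized_model=True)
```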
7 | 8 | Validated GPTQ & AWQ models directly from Hugging Face: 9 | * [Llama-2-7B-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ) & [Llama-2-13B-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-13B-Chat-GPTQ) & [Llama-2-7B-AWQ](https://huggingface.co/TheBloke/Llama-2-7B-AWQ) & [Llama-2-13B-chat-AWQ](https://huggingface.co/TheBloke/Llama-2-13B-chat-AWQ) 10 | * [CodeLlama-7B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) & [CodeLlama-13B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GPTQ) & [CodeLlama-7B-AWQ](https://huggingface.co/TheBloke/CodeLlama-7B-AWQ) & [CodeLlama-13B-AWQ](https://huggingface.co/TheBloke/CodeLlama-13B-AWQ) 11 | * [Mistral-7B-Instruct-v0.1-GPTQ](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GPTQ) & [Mistral-7B-Instruct-v0.1-AWQ](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-AWQ) 12 | * [Mixtral-8x7B-Instruct-v0.1-GPTQ](https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ) & [Mixtral-8x7B-Instruct-v0.1-AWQ](https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ) 13 | * [Qwen-7B-Chat-GPTQ](https://huggingface.co/TheBloke/Qwen-7B-Chat-GPTQ) & [Qwen-7B-Chat-AWQ](https://huggingface.co/TheBloke/Qwen-7B-Chat-AWQ) & [Qwen1.5-7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4) 14 | * [SOLAR-10.7B-v1.0-GPTQ](https://huggingface.co/TheBloke/SOLAR-10.7B-v1.0-GPTQ) 15 | * [Baichuan2-13B-Chat-GPTQ](https://hf-mirror.com/TheBloke/Baichuan2-13B-Chat-GPTQ) 16 | * [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b/tree/main) 17 | * [onlinex/phi-1_5-gptq-4bit](https://hf-mirror.com/onlinex/phi-1_5-gptq-4bit) 18 | 19 | For more details, please check the list of [supported_models](./supported_models.md). 20 | 21 | ## Examples 22 | 23 | How to run GPTQ or AWQ models in Neural Speed: 24 | ```python 25 | import sys 26 | from transformers import AutoTokenizer, TextStreamer 27 | from neural_speed import Model 28 | 29 | if len(sys.argv) != 2: 30 | sys.exit("Usage: python python_api_example.py model_path") 31 | model_name = sys.argv[1] 32 | 33 | prompt = "Once upon a time, a little girl" 34 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 35 | inputs = tokenizer(prompt, return_tensors="pt").input_ids 36 | streamer = TextStreamer(tokenizer) 37 | 38 | model = Model() 39 | # Inference with GPTQ models. 40 | model.init(model_name, weight_dtype="int4", compute_dtype="int8", use_gptq=True) 41 | # Inference with AWQ models. 42 | # model.init(model_name, weight_dtype="int4", compute_dtype="int8", use_awq=True) 43 | 44 | outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True) 45 | ``` 46 | 47 | Note: we have provided the [script](../scripts/python_api_example.py) to run these models.
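A minimal invocation of that script (assuming a locally downloaded GPTQ checkpoint directory):

```bash
python scripts/python_api_example.py ./Llama-2-7B-Chat-GPTQ
```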
48 | -------------------------------------------------------------------------------- /docs/imgs/Attention.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/neural-speed/bfd5d3c17dee18a20e2768f855f2d8fe132fc579/docs/imgs/Attention.PNG -------------------------------------------------------------------------------- /docs/imgs/FFN.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/neural-speed/bfd5d3c17dee18a20e2768f855f2d8fe132fc579/docs/imgs/FFN.PNG -------------------------------------------------------------------------------- /docs/imgs/ORCA_batching.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/neural-speed/bfd5d3c17dee18a20e2768f855f2d8fe132fc579/docs/imgs/ORCA_batching.png -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | ## Install 2 | 3 | ### Build Python package 4 | ```shell 5 | pip install -r requirements.txt 6 | pip install . 7 | ``` 8 | 9 | ### Build executable only 10 | 11 | ```shell 12 | # Linux and WSL 13 | git submodule update --init --recursive 14 | mkdir build 15 | cd build 16 | cmake .. -G Ninja 17 | ninja 18 | ``` 19 | 20 | ```powershell 21 | # Windows 22 | # Install VisualStudio 2022 and open 'Developer PowerShell for VS 2022' 23 | mkdir build 24 | cd build 25 | cmake .. 26 | cmake --build . -j --config Release 27 | ``` 28 | -------------------------------------------------------------------------------- /neural_speed/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | include(cmake/ISA.cmake) 16 | include(cmake/Common.cmake) 17 | include(cmake/ClangTidy.cmake) 18 | 19 | set(COMMON_HEADER_DIRS ./) 20 | if(NS_GPU) 21 | list(APPEND COMMON_HEADER_DIRS ${GPU_ROOT}/include) 22 | list(APPEND COMMON_LIB_DIRS ${GPU_ROOT}) 23 | endif() 24 | 25 | include_directories(${COMMON_HEADER_DIRS}) 26 | link_directories(${COMMON_LIB_DIRS}) 27 | 28 | add_subdirectory(core) 29 | add_subdirectory(vectors) 30 | add_subdirectory(models) 31 | 32 | if (NS_BUILD_APPLICATIONS) 33 | add_subdirectory(application) 34 | endif() 35 | -------------------------------------------------------------------------------- /neural_speed/application/quant_model.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // Defines sigaction on msys: 15 | #ifndef _GNU_SOURCE 16 | #define _GNU_SOURCE 17 | #endif 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #include "common.h" 29 | #include "models/model_utils/quant_utils.h" 30 | #include "models/model_utils/model_utils.h" 31 | 32 | std::shared_ptr<quant_layer_base> get_model_quant_layer(const std::string& model_name) { 33 | return ql_registry::create_ql(model_name); 34 | } 35 | 36 | int main(int argc, char** argv) { 37 | model_init_backend(); 38 | quant_params q_params; 39 | #ifdef MODEL_NAME 40 | q_params.model_name = MODEL_NAME; 41 | #endif 42 | 43 | if (quant_params_parse(argc, argv, q_params) == false) { 44 | return 1; 45 | } 46 | model_archs mt = model_name_to_arch::init().find(q_params.model_name); 47 | if (mt == MODEL_UNKNOWN) { 48 | fprintf(stderr, "error: please set model_name\n"); 49 | exit(1); 50 | } 51 | q_params.model_arch = mt; 52 | 53 | const std::string fname_inp = q_params.model_file; 54 | const std::string fname_out = q_params.out_file; 55 | ne_ftype ftype = quant_params_to_ftype(q_params); 56 | printf("%s: quant_params_to_ftype: %d\n", __func__, ftype); 57 | const int nthread = q_params.nthread; 58 | 59 | const int64_t t_main_start_us = ne_time_us(); 60 | 61 | int64_t t_quantize_us = 0; 62 | auto quant_layer = get_model_quant_layer(q_params.model_name); 63 | // load the model 64 | { 65 | const int64_t t_start_us = ne_time_us(); 66 | 67 | if (model_quantize(q_params, quant_layer)) { 68 | fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); 69 | return 1; 70 | } 71 | 72 | t_quantize_us = ne_time_us() - t_start_us; 73 | } 74 | // report timing 75 | { 76 | const int64_t t_main_end_us = ne_time_us(); 77 | 78 | printf("\n"); 79 | printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0); 80 | printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0); 81 | } 82 | 83 | return 0; 84 | } 85 | -------------------------------------------------------------------------------- /neural_speed/application/quant_whisper.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
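// Illustrative invocation (a sketch; the file names are assumptions and the flags follow
// quant_params_parse, as with the other quant tools in this directory):
//   ./build/bin/quant_whisper --model_file whisper-f32.bin --out_file whisper-q4_0.bin
// Note that this tool only accepts q4_0 output (see the ftype check in main below).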
14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include //NOLINT 23 | #include "models/model_utils/quant_utils.h" 24 | #include "common.h" 25 | 26 | #define F_OK 0 27 | 28 | inline bool exists_model(const std::string& name) { return (access(name.c_str(), F_OK) != -1); } 29 | int main(int argc, char** argv) { 30 | quant_params q_params; 31 | if (quant_params_parse(argc, argv, q_params) == false) { 32 | return 1; 33 | } 34 | 35 | // needed to initialize f16 tables 36 | { 37 | struct ne_init_params params = {0, nullptr, false}; 38 | struct ne_context* ctx = ne_init(params); 39 | ne_free(ctx); 40 | } 41 | const std::string fname_inp = q_params.model_file; 42 | const std::string fname_out = q_params.out_file; 43 | // printf("input_model_file:%s \n",fname_inp.c_str()); 44 | 45 | const ne_ftype ftype = quant_params_to_ftype(q_params); 46 | if (ftype != NE_FTYPE_MOSTLY_Q4_0) { 47 | fprintf(stderr, "%s: ITREX currently only supports quantizing the model to q4_0\n", __func__); 48 | return 1; 49 | } 50 | 51 | const int64_t t_main_start_us = ne_time_us(); 52 | 53 | int64_t t_quantize_us = 0; 54 | 55 | // load the model 56 | { 57 | const int64_t t_start_us = ne_time_us(); 58 | if (exists_model(fname_inp)) { 59 | if (!whisper_model_quantize(fname_inp, fname_out, ne_ftype(ftype))) { 60 | fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); 61 | return 1; 62 | } 63 | } else { 64 | fprintf(stderr, "%s: model does not exist: '%s'\n", __func__, fname_inp.c_str()); 65 | return 1; 66 | } 67 | 68 | t_quantize_us = ne_time_us() - t_start_us; 69 | } 70 | 71 | // report timing 72 | { 73 | const int64_t t_main_end_us = ne_time_us(); 74 | 75 | printf("\n"); 76 | printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0f); 77 | printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f); 78 | } 79 | 80 | return 0; 81 | } 82 | -------------------------------------------------------------------------------- /neural_speed/cmake/ClangTidy.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
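# Usage sketch: configure with -DNS_USE_CLANG_TIDY=CHECK to run clang-tidy during
# compilation, or -DNS_USE_CLANG_TIDY=FIX to also apply the suggested fixes, e.g.:
#   cmake .. -DNS_USE_CLANG_TIDY=CHECK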
14 | 15 | if (NS_USE_CLANG_TIDY MATCHES "(CHECK|FIX)" AND ${CMAKE_VERSION} VERSION_LESS "3.6.0") 16 | message(FATAL_ERROR "Using clang-tidy requires CMake 3.6.0 or newer") 17 | elseif(NS_USE_CLANG_TIDY MATCHES "(CHECK|FIX)") 18 | find_program(CLANG_TIDY NAMES clang-tidy) 19 | if(NOT CLANG_TIDY) 20 | message(FATAL_ERROR "Clang-tidy not found") 21 | else() 22 | add_compile_definitions(CLANGTIDY) 23 | if(NS_USE_CLANG_TIDY STREQUAL "CHECK") 24 | set(CMAKE_CXX_CLANG_TIDY ${CLANG_TIDY}) 25 | message(STATUS "Using clang-tidy to run checks") 26 | elseif(NS_USE_CLANG_TIDY STREQUAL "FIX") 27 | set(CMAKE_CXX_CLANG_TIDY ${CLANG_TIDY} -fix) 28 | message(STATUS "Using clang-tidy to run checks and fix found issues") 29 | endif() 30 | endif() 31 | endif() 32 | -------------------------------------------------------------------------------- /neural_speed/cmake/ISA.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | if (MSVC) 16 | if(NS_F16C) 17 | add_compile_definitions(__F16C__) 18 | endif() 19 | if (NS_AVX512) 20 | add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>) 21 | add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>) 22 | # MSVC has no compile-time flags enabling specific 23 | # AVX512 extensions, nor does it define the 24 | # macros corresponding to the extensions. 25 | # Do it manually. 26 | if (NS_AVX512_VBMI) 27 | add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>) 28 | add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>) 29 | endif() 30 | if (NS_AVX512_VNNI) 31 | add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>) 32 | add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>) 33 | endif() 34 | elseif (NS_AVX2) 35 | add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>) 36 | add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>) 37 | elseif (NS_AVX) 38 | add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>) 39 | add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>) 40 | endif() 41 | else() 42 | if (NS_F16C) 43 | add_compile_options(-mf16c) 44 | endif() 45 | if (NS_FMA) 46 | add_compile_options(-mfma) 47 | endif() 48 | if (NS_AVX) 49 | add_compile_options(-mavx) 50 | endif() 51 | if (NS_AVX2) 52 | add_compile_options(-mavx2) 53 | endif() 54 | if (NS_AVX512) 55 | add_compile_options(-mavx512f) 56 | add_compile_options(-mavx512bw) 57 | endif() 58 | if (NS_AVX512_VBMI) 59 | add_compile_options(-mavx512vbmi) 60 | endif() 61 | if (NS_AVX512_VNNI) 62 | add_compile_options(-mavx512vnni) 63 | endif() 64 | if (NS_AMX) 65 | add_compile_options(-mamx-tile -mamx-int8 -mamx-bf16) 66 | endif() 67 | endif() 68 | -------------------------------------------------------------------------------- /neural_speed/convert/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2023 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from pathlib import Path 19 | import subprocess 20 | 21 | model_maps = { 22 | "gpt_neox": "gptneox", 23 | "gpt_bigcode": "starcoder", 24 | "whisper": "whisper", 25 | "qwen2": "qwen", 26 | "RefinedWebModel": "falcon", 27 | "RefinedWeb": "falcon", 28 | "phi-msft": "phi" 29 | } 30 | 31 | 32 | def convert_model(model, outfile, outtype="f32", format="NE", model_hub="huggingface", use_quantized_model=False): 33 | if model_hub == "modelscope": 34 | from modelscope import AutoConfig 35 | else: 36 | from transformers import AutoConfig 37 | config = AutoConfig.from_pretrained(model, trust_remote_code=True) 38 | model_type = model_maps.get(config.model_type, config.model_type) 39 | 40 | cmd = [] 41 | if use_quantized_model: 42 | path = Path(Path(__file__).parent.absolute(), "convert_quantized_{}.py".format(model_type)) 43 | else: 44 | path = Path(Path(__file__).parent.absolute(), "convert_{}.py".format(model_type)) 45 | 46 | cmd.extend(["python", path]) 47 | cmd.extend(["--outfile", outfile]) 48 | cmd.extend(["--outtype", outtype]) 49 | if model_type in {"phi", "stablelm"}: 50 | cmd.extend(["--format", format]) 51 | cmd.extend(["--model_hub", model_hub]) 52 | cmd.extend([model]) 53 | 54 | print("cmd:", cmd) 55 | subprocess.run(cmd, check=True) 56 | -------------------------------------------------------------------------------- /neural_speed/core/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | find_package(Threads REQUIRED) 16 | file(GLOB layers_srcs "layers/*.cpp") 17 | file(GLOB test_srcs "layers/*test*.cpp") 18 | list(REMOVE_ITEM layers_srcs ${test_srcs}) 19 | set(sources ne_layers.c ${layers_srcs}) 20 | 21 | add_shareable_library_w_warning(ne_layers "${sources}") 22 | 23 | target_include_directories(ne_layers PUBLIC .) 
24 | target_compile_features(ne_layers PUBLIC c_std_11) # don't bump 25 | set_target_properties(ne_layers PROPERTIES POSITION_INDEPENDENT_CODE ON) 26 | if (NS_TP) 27 | find_package(oneCCL REQUIRED) 28 | find_package(MPI REQUIRED) 29 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 30 | add_library(parallel_context STATIC parallel_context.cpp) 31 | target_link_libraries(ne_layers PUBLIC Threads::Threads bestla ne_vec MPI::MPI_CXX ccl parallel_context) 32 | else () 33 | target_link_libraries(ne_layers PUBLIC Threads::Threads bestla ne_vec) 34 | endif() 35 | 36 | if(NOT WIN32) 37 | target_link_libraries(ne_layers PUBLIC rt) 38 | else() 39 | target_link_options(ne_layers PUBLIC /STACK:5242880 /F5242880) 40 | endif() 41 | 42 | 43 | if (NS_BUILD_TESTS) 44 | 45 | function(add_test_target src) # ARGN: additional source 46 | get_filename_component(test_target ${src} NAME_WE) 47 | get_filename_component(src_dir ${src} DIRECTORY) 48 | string(REGEX REPLACE [/\\] "_" src_dir ${src_dir}) 49 | if(src_dir) 50 | set (test_target "${src_dir}_${test_target}") 51 | endif() 52 | set (test_target "test_${test_target}") 53 | add_executable_w_warning(${test_target} ${src} ${ARGN}) 54 | target_compile_definitions(${test_target} PRIVATE NS_TESTS) 55 | target_compile_options(${test_target} PRIVATE -fsanitize=address) 56 | target_link_options(${test_target} PRIVATE -fsanitize=address) 57 | target_include_directories(${test_target} PUBLIC .) 58 | target_link_libraries(${test_target} PUBLIC Threads::Threads bestla ne_vec) 59 | if(NOT WIN32) 60 | target_link_libraries(${test_target} PUBLIC rt) 61 | endif() 62 | add_test(NAME ${test_target} COMMAND ${test_target}) 63 | set_tests_properties(${test_target} PROPERTIES LABELS "${src_dir}_test") 64 | endfunction() 65 | 66 | add_test_target(layers/mha_dense.cpp layers/mha_dense_tests.cpp) 67 | 68 | endif() 69 | -------------------------------------------------------------------------------- /neural_speed/core/layers/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /neural_speed/core/layers/Ops.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #pragma once 15 | #ifdef __cplusplus 16 | extern "C" { 17 | #endif 18 | 19 | // available tensor operations: 20 | enum ne_op { 21 | NE_OP_NONE = 0, 22 | 23 | NE_OP_DUP, 24 | NE_OP_ADD, 25 | NE_OP_ADD1, 26 | NE_OP_ACC, 27 | NE_OP_SUB, 28 | NE_OP_MUL, 29 | NE_OP_DIV, 30 | NE_OP_SQR, 31 | NE_OP_SQRT, 32 | NE_OP_LOG, 33 | NE_OP_SUM, 34 | NE_OP_SUM_ROWS, 35 | NE_OP_TANH, 36 | NE_OP_MEAN, 37 | NE_OP_REPEAT, 38 | NE_OP_ABS, 39 | NE_OP_SGN, 40 | NE_OP_NEG, 41 | NE_OP_STEP, 42 | NE_OP_RELU, 43 | NE_OP_GELU, 44 | NE_OP_SILU, 45 | NE_OP_SILU_BACK, 46 | NE_OP_NORM, // normalize 47 | NE_OP_RMS_NORM, 48 | NE_OP_RMS_NORM_BACK, 49 | NE_OP_RMS_ARGSORT, 50 | 51 | NE_OP_MUL_MAT, 52 | NE_OP_MUL_MAT_BIAS, 53 | NE_OP_MUL_MAT_ID, 54 | NE_OP_SCALE, 55 | NE_OP_SET, 56 | NE_OP_CPY, 57 | NE_OP_CONT, 58 | NE_OP_RESHAPE, 59 | NE_OP_VIEW, 60 | NE_OP_PERMUTE, 61 | NE_OP_TRANSPOSE, 62 | NE_OP_GET_ROWS, 63 | NE_OP_GET_ROWS_BACK, 64 | NE_OP_DIAG, 65 | NE_OP_DIAG_MASK_INF, 66 | NE_OP_DIAG_MASK_ZERO, 67 | NE_OP_PADDING_MASK_INF, 68 | NE_OP_SOFT_MAX, 69 | NE_OP_ROPE, 70 | NE_OP_ROPE_BACK, 71 | NE_OP_ALIBI, 72 | NE_OP_CLAMP, 73 | NE_OP_CONV_1D_1S, 74 | NE_OP_CONV_1D_2S, 75 | 76 | // LLM related 77 | NE_OP_MUL_QKV, 78 | NE_OP_MUL_FFN_SILU, 79 | NE_OP_MUL_FFN_GELU, 80 | NE_OP_MUL_FFN_GELU_MUL, 81 | NE_OP_MUL_FFN_ADD_GELU, 82 | NE_OP_MUL_ID_FFN_SILU, 83 | NE_OP_MUL_ID_FFN_GELU, 84 | NE_OP_FLASH_ATTN, 85 | NE_OP_FLASH_ATTN_KV_UPDATE, 86 | NE_OP_FLASH_FF, 87 | 88 | NE_OP_MAP_UNARY, 89 | NE_OP_MAP_BINARY, 90 | 91 | NE_OP_SPLIT, 92 | NE_OP_ALL_REDUCE, 93 | NE_OP_TP_CONCAT, 94 | NE_OP_DUMP_TENSOR, 95 | NE_OP_DEBUG, 96 | NE_OP_CONV_1D, 97 | NE_OP_ARGSORT, 98 | NE_OP_COUNT, 99 | }; 100 | 101 | #ifdef __cplusplus 102 | } 103 | #endif 104 | -------------------------------------------------------------------------------- /neural_speed/core/layers/argsort.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
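// Behavior sketch (added note): for each row, dst receives a descending argsort of src0,
// since the comparator below orders indices by src_data[pos1] > src_data[pos2].
// Example: src row {0.1f, 0.9f, 0.5f} -> dst row {1, 2, 0}.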
14 | 15 | #include "argsort.h" 16 | #include 17 | #include 18 | 19 | static void ne_compute_forward_argsort_f32(const struct ne_compute_params* params, const struct ne_tensor* src0, 20 | struct ne_tensor* dst) { 21 | if (params->type == NE_TASK_INIT || params->type == NE_TASK_FINALIZE) { 22 | return; 23 | } 24 | const int64_t ne00 = src0->ne[0]; 25 | const int64_t ne01 = src0->ne[1]; 26 | const int64_t ne02 = src0->ne[2]; 27 | const int64_t ne03 = src0->ne[3]; 28 | 29 | const int64_t ne0 = dst->ne[0]; 30 | const int64_t ne1 = dst->ne[1]; 31 | const int64_t ne2 = dst->ne[2]; 32 | const int64_t ne3 = dst->ne[3]; 33 | 34 | const size_t nb00 = src0->nb[0]; 35 | 36 | const size_t nb01 = src0->nb[1]; 37 | const size_t nb02 = src0->nb[2]; 38 | const size_t nb03 = src0->nb[3]; 39 | 40 | const size_t nb0 = dst->nb[0]; 41 | const size_t nb1 = dst->nb[1]; 42 | const size_t nb2 = dst->nb[2]; 43 | const size_t nb3 = dst->nb[3]; 44 | const int ith = params->ith; 45 | const int nth = params->nth; 46 | 47 | const int64_t nr = src0->ne[1] * src0->ne[2] * src0->ne[3]; 48 | 49 | for (int64_t i = ith; i < nr; i += nth) { 50 | int32_t* dst_data = (int32_t*)((char*)dst->data + i * nb1); 51 | const float* src_data = (float*)((char*)src0->data + i * nb01); 52 | 53 | for (int64_t j = 0; j < ne0; j++) { 54 | dst_data[j] = j; 55 | } 56 | std::sort(dst_data, dst_data + ne0, [src_data](int pos1, int pos2) { return (src_data[pos1] > src_data[pos2]); }); 57 | } 58 | } 59 | void ne_compute_forward_argsort(const struct ne_compute_params* params, const struct ne_tensor* src0, 60 | struct ne_tensor* dst) { 61 | switch (src0->type) { 62 | case NE_TYPE_F32: { 63 | ne_compute_forward_argsort_f32(params, src0, dst); 64 | } break; 65 | default: { 66 | NE_ASSERT(false); 67 | } break; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /neural_speed/core/layers/argsort.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | #include "core/ne.h" 17 | #include "core/data_types.h" 18 | 19 | #ifdef __cplusplus 20 | extern "C" { 21 | #endif 22 | 23 | void ne_compute_forward_argsort(const struct ne_compute_params* params, const struct ne_tensor* src0, 24 | struct ne_tensor* dst); 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | -------------------------------------------------------------------------------- /neural_speed/core/layers/bestla_gemm.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /*++ 16 | Module Name: 17 | 18 | bestla_gemm.h 19 | 20 | Abstract: 21 | 22 | C APIs of BesTLA GEMMs. 23 | --*/ 24 | 25 | #pragma once 26 | 27 | #include "data_types.h" 28 | #include "bestla/bestla.h" 29 | 30 | struct BTLA_GEMM_DATA_PACKED_PARAMS { 31 | const float* A = nullptr; /**< address of A (float32 matrix)*/ 32 | const void* B = nullptr; /**< address of B (packed nbits blob)*/ 33 | float* C = nullptr; /**< address of result matrix */ 34 | int lda = 0; /**< leading dimension of A */ 35 | int ldc = 0; /**< leading dimension of C*/ 36 | }; 37 | 38 | size_t BTLAGemmPackBSize(size_t N, size_t K, size_t BlkSize, BTLA_DTYPE QuantType, BTLA_DTYPE ScaleDtype, bool isAsym, 39 | ne_comp_type CompType, int* shuffle_indice); 40 | 41 | bool BTLAGemmQuantPackB(void* PackedBuf, const float* FpData, size_t N, size_t K, size_t ldb, size_t BlkSize, 42 | BTLA_DTYPE QuantType, BTLA_DTYPE ScaleDtype, bool isAsym, ne_comp_type CompType, bool isTrans, 43 | void* ThreadPool); 44 | 45 | // QData: K*N quantized int8 weight 46 | // Scales: K/BlkSize * N scales 47 | // Zp: K/BlkSize * N zero points 48 | bool BTLAGemmPackB(void* PackedBuf, const int8_t* QData, const float* Scales, const int8_t* Zp, size_t N, size_t K, 49 | size_t ldb, size_t BlkSize, BTLA_DTYPE QuantType, BTLA_DTYPE ScaleDtype, bool isAsym, 50 | ne_comp_type CompType, int* shuffle_indice, void* ThreadPool); 51 | 52 | bool BTLAGemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, void* ThreadPool); 53 | 54 | bool BTLAGemmBatchDriver(const size_t M, const size_t N, const size_t K, const size_t BatchN, 55 | const BTLA_GEMM_DATA_PACKED_PARAMS* DataParams, int8_t* WorkSpace, void* ThreadPool); 56 | 57 | bool BTLALayerNorm(size_t norm_count, size_t norm_size, bool isrms, float epsilon, const float* FpIn, float* FpOut, 58 | void* ThreadPool); 59 | -------------------------------------------------------------------------------- /neural_speed/core/layers/conv.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
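// Call-sequence sketch for the BesTLA C API declared in bestla_gemm.h above (illustrative
// only; quantization parameters, buffer allocation, workspace and threadpool setup are elided,
// and comp_type stands for whichever ne_comp_type value you target):
//   size_t packed_size = BTLAGemmPackBSize(N, K, BlkSize, BTLA_DTYPE::S4_CLIP,
//                                          BTLA_DTYPE::F32, /*isAsym=*/false, comp_type, nullptr);
//   BTLAGemmQuantPackB(packed_b, fp32_b, N, K, /*ldb=*/N, BlkSize, BTLA_DTYPE::S4_CLIP,
//                      BTLA_DTYPE::F32, /*isAsym=*/false, comp_type, /*isTrans=*/false, threadpool);
//   BTLA_GEMM_DATA_PACKED_PARAMS params{A, packed_b, C, /*lda=*/K, /*ldc=*/N};
//   BTLAGemmBatchDriver(M, N, K, /*BatchN=*/1, &params, workspace, threadpool);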
14 | #pragma once 15 | 16 | #include "core/ne.h" 17 | #include "core/data_types.h" 18 | 19 | #ifdef __cplusplus 20 | extern "C" { 21 | #endif 22 | 23 | void ne_compute_forward_conv_1d_s1_ph(const struct ne_compute_params* params, const struct ne_tensor* src0, 24 | const struct ne_tensor* src1, struct ne_tensor* dst); 25 | void ne_compute_forward_conv_1d_2s(const struct ne_compute_params* params, const struct ne_tensor* src0, 26 | const struct ne_tensor* src1, struct ne_tensor* dst); 27 | void ne_compute_forward_conv_1d(const struct ne_compute_params* params, const struct ne_tensor* src0, 28 | const struct ne_tensor* src1, struct ne_tensor* dst); 29 | void ne_compute_forward_conv_1d_1s(const struct ne_compute_params* params, const struct ne_tensor* src0, 30 | const struct ne_tensor* src1, struct ne_tensor* dst); 31 | void ne_compute_forward_conv_1d_2s(const struct ne_compute_params* params, const struct ne_tensor* src0, 32 | const struct ne_tensor* src1, struct ne_tensor* dst); 33 | #ifdef __cplusplus 34 | } 35 | #endif 36 | -------------------------------------------------------------------------------- /neural_speed/core/layers/ele_reduce.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #pragma once 15 | 16 | #include "core/data_types.h" 17 | #include "vectors/cpu/simd.h" 18 | #include "vec_dot.h" 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | inline static void ne_vec_norm_f32(const int n, float* s, const float* x) { 25 | ne_vec_dot_f32(n, s, x, x); 26 | *s = sqrtf(*s); 27 | } 28 | 29 | inline static void ne_vec_sum_f32(const int n, float* s, const float* x) { 30 | ne_float sum = 0.0; 31 | for (int i = 0; i < n; ++i) { 32 | sum += (ne_float)x[i]; 33 | } 34 | *s = sum; 35 | } 36 | 37 | inline static void ne_vec_sum_ggf(const int n, ne_float* s, const float* x) { 38 | ne_float sum = 0.0; 39 | for (int i = 0; i < n; ++i) { 40 | sum += (ne_float)x[i]; 41 | } 42 | *s = sum; 43 | } 44 | 45 | inline static void ne_vec_max_f32(const int n, float* s, const float* x) { 46 | float max = -INFINITY; 47 | for (int i = 0; i < n; ++i) { 48 | max = x[i] > max ? x[i] : max; 49 | } 50 | *s = max; 51 | } 52 | 53 | inline static void ne_vec_norm_inv_f32(const int n, float* s, const float* x) { 54 | ne_vec_norm_f32(n, s, x); 55 | *s = 1.f / (*s); 56 | } 57 | 58 | #ifdef __cplusplus 59 | } 60 | #endif 61 | -------------------------------------------------------------------------------- /neural_speed/core/layers/layers.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #include "ele_wise.h" 15 | #include "ele_reduce.h" 16 | 17 | #include "conv.h" 18 | #include "memory.h" 19 | #include "argsort.h" 20 | -------------------------------------------------------------------------------- /neural_speed/core/layers/memory.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "memory.h" 16 | 17 | void ne_attention_padding_mask_f32_forward(const int bs, const int nr_qk, const int qlen, const int ith, const int nth, 18 | const void* padding, const float p_value, struct ne_tensor* dst) { 19 | // mask padding token (padding left) 20 | for (int b = 0; b < bs; b++) { 21 | const int n_padding = (reinterpret_cast<const int*>(padding))[b]; 22 | if (n_padding == 0) continue; 23 | for (int k = 0; k < (nr_qk / bs); k++) { 24 | for (int j = ith; j < qlen; j += nth) { 25 | // it will not affect the next token if we don't mask the pad_token row 26 | ne_vec_set_f32(n_padding, 27 | reinterpret_cast<float*>(reinterpret_cast<char*>(dst->data) + b * dst->nb[3] + k * dst->nb[2] + 28 | j * dst->nb[1]), 29 | p_value); 30 | } 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /neural_speed/core/layers/memory.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
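// Behavior sketch for ne_attention_padding_mask_f32_forward (defined in memory.cpp above):
// with left padding, batch b has the first padding[b] positions of every row set to p_value.
// Example: bs = 2, padding = {2, 0}, p_value = -INFINITY masks columns 0..1 of all rows in
// batch 0 and leaves batch 1 untouched.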
14 | 15 | #pragma once 16 | 17 | #include "ele_wise.h" 18 | #include "core/ne.h" 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | void ne_attention_padding_mask_f32_forward(const int bs, const int nr_qk, const int qlen, const int ith, const int nth, 25 | const void* padding, const float p_value, struct ne_tensor* dst); 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | -------------------------------------------------------------------------------- /neural_speed/core/layers/ne_test_layers_utils.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #ifndef NE_CORE_GRAPH_NE_TEST_LAYERS_UTILS_H 15 | #define NE_CORE_GRAPH_NE_TEST_LAYERS_UTILS_H 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "bestla/bestla_utils.h" 25 | 26 | #ifndef NS_TESTS 27 | static_assert(false, "Only include this header file for testing!"); 28 | #endif 29 | 30 | template <typename T> 31 | inline void init_vector(T* v, size_t size, float v_min = -10, float v_max = 10, int seed = 5489u) { 32 | float low_value = std::max(v_min, static_cast<float>(std::numeric_limits<T>::lowest()) + 1); 33 | std::mt19937 gen(seed); 34 | std::uniform_real_distribution<float> u(low_value, v_max); 35 | for (size_t i = 0; i < size; ++i) v[i] = u(gen); 36 | } 37 | 38 | template <> 39 | inline void init_vector(bestla::utils::bf16* v, size_t size, float v_min, float v_max, int seed) { 40 | std::mt19937 gen(seed); 41 | std::uniform_real_distribution<float> u(v_min, v_max); 42 | for (size_t i = 0; i < size; ++i) v[i] = bestla::utils::bf16(u(gen)); 43 | } 44 | 45 | template <> 46 | inline void init_vector(bestla::utils::fp16* v, size_t size, float v_min, float v_max, int seed) { 47 | std::mt19937 gen(seed); 48 | std::uniform_real_distribution<float> u(v_min, v_max); 49 | for (size_t i = 0; i < size; ++i) v[i] = bestla::utils::fp16(u(gen)); 50 | } 51 | 52 | template <typename T> 53 | inline void init_vector(std::vector<T>* v, float v_min = -10, float v_max = 10, int seed = 5489u) { 54 | init_vector(v->data(), v->size(), v_min, v_max, seed); 55 | } 56 | 57 | template <typename T> 58 | struct s_is_u8s8 { 59 | enum { value = false }; 60 | }; 61 | 62 | template <> 63 | struct s_is_u8s8<int8_t> { 64 | enum { value = true }; 65 | }; 66 | 67 | template <> 68 | struct s_is_u8s8<uint8_t> { 69 | enum { value = true }; 70 | }; 71 | 72 | template <typename T> 73 | inline typename std::enable_if<!s_is_u8s8<T>::value, float>::type get_err(const T& a, const T& b) { 74 | // we compare float relative error ratio here 75 | return fabs(static_cast<float>(a) - static_cast<float>(b)) / 76 | std::max(static_cast<float>(fabs(static_cast<float>(b))), 1.0f); 77 | } 78 | template <typename T> 79 | inline typename std::enable_if<s_is_u8s8<T>::value, float>::type get_err(const T& a, const T& b) { 80 | // for quantized value, error ratio was calculated with its data range 81 | return fabs(static_cast<float>(a) - static_cast<float>(b)) / UINT8_MAX; 82 | } 83 | 84 | template <typename T> 85 | bool compare_data(const T*
buf1, const T* buf2, size_t size, float eps = 1e-6) { 86 | if (buf1 == buf2) return false; 87 | 88 | for (size_t i = 0; i < size; ++i) { 89 | if (get_err(buf1[i], buf2[i]) > eps) { 90 | std::cerr << static_cast<float>(buf1[i]) << " vs " << static_cast<float>(buf2[i]) << " idx=" << i << std::endl; 91 | return false; 92 | } 93 | } 94 | return true; 95 | } 96 | #endif // NE_CORE_GRAPH_NE_TEST_LAYERS_UTILS_H 97 | -------------------------------------------------------------------------------- /neural_speed/core/parallel_context.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #pragma once 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | // Opaque handle for a C++ class used from C code 21 | typedef struct parallel_context parallel_context; 22 | 23 | enum parallel_mode { 24 | TENSOR_NO_CHANGE, 25 | TENSOR_1D_ROW, 26 | TENSOR_1D_COL, 27 | TENSOR_2D_ROW, 28 | TENSOR_2D_COL, 29 | 30 | TENSOR_3D_INPUT, 31 | TENSOR_3D_WEIGHT, 32 | TENSOR_3D_OUTPUT, 33 | TENSOR_3D_INPUT_X_WEIGHT, 34 | TENSOR_3D_OUTPUT_X_WEIGHT, 35 | 36 | TENSOR_2P5D_ROW, 37 | TENSOR_2P5D_COL, 38 | TENSOR_2P5D_DEP 39 | }; 40 | parallel_context* init_parallel_context(); 41 | int get_tp_size(parallel_context* p); 42 | int get_tp_rank(parallel_context* p); 43 | bool is_master(parallel_context* p); 44 | void barrier(parallel_context* p); 45 | void broadcast(parallel_context* p, float* buffer, size_t count); 46 | void alltoall(parallel_context* p, float* send_buffer, float* recv_buffer, size_t count); 47 | void reduce_add(parallel_context* p, float* send_buffer, float* recv_buffer, size_t count); 48 | 49 | #ifdef __cplusplus 50 | } 51 | #endif 52 | -------------------------------------------------------------------------------- /neural_speed/models/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
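# Illustrative expansion, not part of the original file: add_model(), defined below,
# turns each architecture into its own warning-enabled, position-independent C++11
# library linked against ne_layers and bestla. For example,
#   add_model(llama llama/llama.cpp llama/llama_utils.cpp ${MODEL_UTILS_SOURCE})
# behaves roughly like:
#   add_library_w_warning(llama llama/llama.cpp llama/llama_utils.cpp ${MODEL_UTILS_SOURCE})
#   target_compile_features(llama PUBLIC cxx_std_11)
#   set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
#   target_link_libraries(llama PUBLIC ne_layers bestla)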
14 | 15 | file(GLOB MODEL_UTILS_SOURCE "model_utils/*.cpp") 16 | 17 | function(add_model target) 18 | add_library_w_warning(${target} ${ARGN}) # no (gpt) model utils needed 19 | target_compile_features(${target} PUBLIC cxx_std_11) # don't bump 20 | set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON) 21 | target_link_libraries(${target} PUBLIC ne_layers bestla) 22 | endfunction() 23 | 24 | add_model(llama llama/llama.cpp llama/llama_utils.cpp ${MODEL_UTILS_SOURCE}) 25 | add_model(gptj gptj/gptj.cpp gptj/gptj_utils.cpp ${MODEL_UTILS_SOURCE}) 26 | add_model(mpt mpt/mpt.cpp mpt/mpt_utils.cpp ${MODEL_UTILS_SOURCE}) 27 | add_model(gptneox gptneox/gptneox.cpp gptneox/gptneox_utils.cpp ${MODEL_UTILS_SOURCE}) 28 | add_model(starcoder starcoder/starcoder.cpp starcoder/starcoder_utils.cpp ${MODEL_UTILS_SOURCE}) 29 | add_model(falcon falcon/falcon.cpp falcon/falcon_utils.cpp ${MODEL_UTILS_SOURCE}) 30 | add_model(opt opt/opt.cpp opt/opt_utils.cpp ${MODEL_UTILS_SOURCE}) 31 | add_model(bloom bloom/bloom.cpp bloom/bloom_utils.cpp ${MODEL_UTILS_SOURCE}) 32 | add_model(baichuan baichuan/baichuan.cpp baichuan/baichuan_utils.cpp ${MODEL_UTILS_SOURCE}) 33 | add_model(qwen qwen/qwen.cpp qwen/qwen_utils.cpp ${MODEL_UTILS_SOURCE}) 34 | add_model(whisper whisper/whisper.cpp whisper/whisper_utils.cpp ${MODEL_UTILS_SOURCE}) 35 | add_model(chatglm chatglm/chatglm.cpp chatglm/chatglm_utils.cpp ${MODEL_UTILS_SOURCE}) 36 | add_model(chatglm2 chatglm/chatglm2.cpp chatglm/chatglm2_utils.cpp ${MODEL_UTILS_SOURCE}) 37 | add_model(gemma gemma/gemma.cpp gemma/gemma_utils.cpp ${MODEL_UTILS_SOURCE}) 38 | add_model(phi phi/phi.cpp phi/phi_utils.cpp ${MODEL_UTILS_SOURCE}) 39 | add_model(stablelm stablelm/stablelm.cpp stablelm/stablelm_utils.cpp ${MODEL_UTILS_SOURCE}) 40 | add_model(chatglm3 chatglm/chatglm2.cpp chatglm/chatglm2_utils.cpp ${MODEL_UTILS_SOURCE}) 41 | add_model(grok grok/grok.cpp grok/grok_utils.cpp ${MODEL_UTILS_SOURCE}) 42 | add_model(phi3 phi/phi3.cpp phi/phi3_utils.cpp ${MODEL_UTILS_SOURCE}) 43 | -------------------------------------------------------------------------------- /neural_speed/models/baichuan/baichuan.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
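// Illustrative note, not part of the original header: every model header in this
// directory follows the same pattern -- a <name>_mem_req(n_layers, ratio) helper maps
// a layer count to three scratch-buffer sizes, and init() (defined in the matching
// *_utils.cpp, not shown here) is expected to call it once the GGUF hyperparameters
// are known, e.g.
//   scratch = baichuan_mem_req(40, scratch_size_ratio);  // Baichuan-13B: 4096/2048/4096 MB
// Unrecognized layer counts fall through to MODEL_ASSERT(false).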
14 | 15 | #ifndef BAICHUAN_H 16 | #define BAICHUAN_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum baichuan_model { 22 | BAICHUAN_UNKNOWN, 23 | BAICHUAN_13B, 24 | }; 25 | 26 | static const model_scratch baichuan_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 27 | switch (n_layers) { 28 | case 40: 29 | return { 30 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 31 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 33 | }; 34 | case 32: 35 | return { 36 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 37 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 38 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 39 | }; 40 | default: 41 | MODEL_ASSERT(false); 42 | } 43 | } 44 | 45 | class BAICHUAN : public IModel { 46 | private: 47 | model_archs name = MODEL_BAICHUAN; 48 | std::unique_ptr<model_model_loader> ml; 49 | uint32_t n_layer, n_embd, n_ff, n_vocab; 50 | int n_gpu_layer; 51 | bool use_mmap, use_mlock, vocab_only; 52 | model_scratch scratch; 53 | 54 | public: 55 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 56 | bool vocab_only_) override; 57 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 58 | }; 59 | 60 | #endif // BAICHUAN_H 61 | -------------------------------------------------------------------------------- /neural_speed/models/bloom/bloom.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef BLOOM_H 16 | #define BLOOM_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum bloom_model { 22 | BLOOM_UNKNOWN, 23 | BLOOM_7B, 24 | }; 25 | 26 | static const model_scratch bloom_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 27 | switch (n_layers) { 28 | case 30: 29 | return { 30 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 31 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 33 | }; 34 | default: 35 | MODEL_ASSERT(false); 36 | } 37 | } 38 | 39 | class BLOOM : public IModel { 40 | private: 41 | model_archs arch = MODEL_BLOOM; 42 | std::unique_ptr<model_model_loader> ml; 43 | uint32_t n_layer, n_embd, n_ff, n_vocab; 44 | int n_gpu_layer; 45 | bool use_mmap, use_mlock, vocab_only; 46 | model_scratch scratch; 47 | 48 | public: 49 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 50 | bool vocab_only_) override; 51 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 52 | }; 53 | 54 | #endif // BLOOM_H 55 | -------------------------------------------------------------------------------- /neural_speed/models/chatglm/chatglm.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef CHATGLM1_H 16 | #define CHATGLM1_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum chatglm_model { 22 | CHATGLM_UNKNOWN, 23 | CHATGLM_6B, 24 | }; 25 | 26 | static const model_scratch chatglm_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 27 | switch (n_layers) { 28 | case 28: 29 | return { 30 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 31 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 33 | }; 34 | default: 35 | MODEL_ASSERT(false); 36 | } 37 | } 38 | 39 | class CHATGLM : public IModel { 40 | private: 41 | model_archs name = MODEL_CHATGLM; 42 | std::unique_ptr<model_model_loader> ml; 43 | uint32_t n_layer, n_embd, n_ff, n_vocab; 44 | int n_gpu_layer; 45 | bool use_mmap, use_mlock, vocab_only; 46 | model_scratch scratch; 47 | 48 | public: 49 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 50 | bool vocab_only_) override; 51 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 52 | }; 53 | 54 | #endif // CHATGLM1_H 55 | -------------------------------------------------------------------------------- /neural_speed/models/chatglm/chatglm2.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef CHATGLM2_H 16 | #define CHATGLM2_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum chatglm2_model { 22 | CHATGLM2_UNKNOWN, 23 | CHATGLM2_6B, 24 | }; 25 | 26 | static const model_scratch chatglm_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 27 | switch (n_layers) { 28 | case 28: 29 | return { 30 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 31 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 33 | }; 34 | case 40: 35 | return { 36 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 37 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 38 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 39 | }; 40 | default: 41 | MODEL_ASSERT(false); 42 | } 43 | } 44 | 45 | class CHATGLM2 : public IModel { 46 | private: 47 | model_archs name = MODEL_CHATGLM2; 48 | std::unique_ptr<model_model_loader> ml; 49 | uint32_t n_layer, n_embd, n_ff, n_vocab; 50 | int n_gpu_layer; 51 | bool use_mmap, use_mlock, vocab_only; 52 | model_scratch scratch; 53 | 54 | public: 55 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 56 | bool vocab_only_) override; 57 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 58 | }; 59 | 60 | #endif // CHATGLM2_H 61 | -------------------------------------------------------------------------------- /neural_speed/models/falcon/falcon.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef FALCON_H 16 | #define FALCON_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum falcon_model { 22 | FALCON_UNKNOWN, 23 | FALCON_7B, 24 | }; 25 | 26 | static const model_scratch falcon_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 27 | switch (n_layers) { 28 | case 32: 29 | return { 30 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 31 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 33 | }; 34 | case 60: 35 | return { 36 | static_cast<size_t>(scratch_size_ratio * 2 * 3072) * MB, 37 | static_cast<size_t>(scratch_size_ratio * 2 * 2048) * MB, 38 | static_cast<size_t>(scratch_size_ratio * 2 * 3072) * MB, 39 | }; 40 | case 80: 41 | return { 42 | static_cast<size_t>(scratch_size_ratio * 3 * 3072) * MB, 43 | static_cast<size_t>(scratch_size_ratio * 3 * 2048) * MB, 44 | static_cast<size_t>(scratch_size_ratio * 3 * 3072) * MB, 45 | }; 46 | default: 47 | MODEL_ASSERT(false); 48 | } 49 | } 50 | 51 | class FALCON : public IModel { 52 | private: 53 | model_archs arch = MODEL_FALCON; 54 | std::unique_ptr<model_model_loader> ml; 55 | uint32_t n_layer, n_embd, n_ff, n_vocab, n_head_kv; 56 | int n_gpu_layer; 57 | bool use_mmap, use_mlock, vocab_only; 58 | model_scratch scratch; 59 | 60 | public: 61 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 62 | bool vocab_only_) override; 63 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 64 | }; 65 | 66 | #endif // FALCON_H 67 | -------------------------------------------------------------------------------- /neural_speed/models/gemma/gemma.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef GEMMA_H 16 | #define GEMMA_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum gemma_model { 22 | GEMMA_2B, 23 | GEMMA_7B, 24 | }; 25 | 26 | static const model_scratch gemma_mem_req(int n_layers, float enlarge_scale = 1.0f) { 27 | switch (n_layers) { 28 | case 18: 29 | return { 30 | static_cast<size_t>(enlarge_scale * 1024) * MB, 31 | static_cast<size_t>(enlarge_scale * 1024) * MB, 32 | static_cast<size_t>(enlarge_scale * 1608) * MB, 33 | }; 34 | case 28: 35 | return { 36 | static_cast<size_t>(enlarge_scale * 1024) * MB, 37 | static_cast<size_t>(enlarge_scale * 1024) * MB, 38 | static_cast<size_t>(enlarge_scale * 1608) * MB, 39 | }; 40 | default: 41 | MODEL_ASSERT(false); 42 | } 43 | } 44 | 45 | class Gemma : public IModel { 46 | private: 47 | model_archs arch = MODEL_GEMMA; 48 | std::unique_ptr<model_model_loader> ml; 49 | uint32_t n_layer, n_embd, n_ff, n_vocab, n_head, n_head_kv, n_expert, n_expert_used, n_embd_head_k; 50 | int n_gpu_layer; 51 | bool use_mmap, use_mlock, vocab_only; 52 | model_scratch scratch; 53 | 54 | public: 55 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 56 | bool vocab_only_) override; 57 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 58 | }; 59 | 60 | #endif // GEMMA_H 61 | -------------------------------------------------------------------------------- /neural_speed/models/gptj/gptj.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef GPTJ_H 16 | #define GPTJ_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum gptj_model { 22 | GPTJ_UNKNOWN, 23 | GPTJ_7B, 24 | GPTJ_13B, 25 | GPTJ_30B, 26 | GPTJ_65B, 27 | }; 28 | 29 | static const model_scratch gptj_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 30 | switch (n_layers) { 31 | case 28: 32 | // should be enough for batch=8 * beam=4 33 | return { 34 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 35 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 36 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 37 | }; 38 | default: 39 | MODEL_ASSERT(false); 40 | } 41 | } 42 | 43 | class GPTJ : public IModel { 44 | private: 45 | model_archs arch = MODEL_GPTJ; 46 | std::unique_ptr<model_model_loader> ml; 47 | uint32_t n_layer, n_embd, n_ff, n_vocab; 48 | int n_gpu_layer; 49 | bool use_mmap, use_mlock, vocab_only; 50 | model_scratch scratch; 51 | 52 | public: 53 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 54 | bool vocab_only_) override; 55 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 56 | }; 57 | 58 | #endif // GPTJ_H 59 | -------------------------------------------------------------------------------- /neural_speed/models/gptneox/gptneox.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef GPTNEOX_H 16 | #define GPTNEOX_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum gptneox_model { 22 | GPTNEOX_UNKNOWN, 23 | GPTNEOX_7B, 24 | }; 25 | 26 | static const model_scratch gptneox_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 27 | switch (n_layers) { 28 | case 44: 29 | return { 30 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 31 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 33 | }; 34 | case 32: 35 | return { 36 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 37 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 38 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 39 | }; 40 | case 28: // 5.8B 41 | return { 42 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 43 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 44 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 45 | }; 46 | default: 47 | MODEL_ASSERT(false); 48 | } 49 | } 50 | 51 | class GPTNEOX : public IModel { 52 | private: 53 | model_archs arch = MODEL_GPTNEOX; 54 | std::unique_ptr<model_model_loader> ml; 55 | uint32_t n_layer, n_embd, n_ff, n_vocab; 56 | int n_gpu_layer; 57 | bool use_mmap, use_mlock, vocab_only; 58 | model_scratch scratch; 59 | 60 | public: 61 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 62 | bool vocab_only_) override; 63 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 64 | }; 65 | 66 | #endif // GPTNEOX_H 67 | -------------------------------------------------------------------------------- /neural_speed/models/grok/grok.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef GROK_H 16 | #define GROK_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum grok_model { 22 | GROK_314B, 23 | }; 24 | 25 | static const model_scratch grok_mem_req(int n_layers, float enlarge_scale = 1.0f) { 26 | switch (n_layers) { 27 | case 64: 28 | return { 29 | static_cast<size_t>(enlarge_scale * 4096) * MB, 30 | static_cast<size_t>(enlarge_scale * 2048) * MB, 31 | static_cast<size_t>(enlarge_scale * 4096 * 10) * MB, 32 | }; 33 | default: 34 | MODEL_ASSERT(false); 35 | } 36 | } 37 | 38 | class Grok : public IModel { 39 | private: 40 | model_archs arch = MODEL_GROK; 41 | std::unique_ptr<model_model_loader> ml; 42 | uint32_t n_layer, n_embd, n_ff, n_vocab, n_head, n_head_kv, n_expert, n_expert_used, n_embd_head_k; 43 | int n_gpu_layer; 44 | bool use_mmap, use_mlock, vocab_only; 45 | model_scratch scratch; 46 | 47 | public: 48 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 49 | bool vocab_only_) override; 50 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 51 | }; 52 | 53 | #endif // GROK_H 54 | -------------------------------------------------------------------------------- /neural_speed/models/model_utils/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/neural-speed/bfd5d3c17dee18a20e2768f855f2d8fe132fc579/neural_speed/models/model_utils/CMakeLists.txt -------------------------------------------------------------------------------- /neural_speed/models/model_utils/pool.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
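// Illustrative usage, a sketch rather than code from this repository: a serving loop
// is expected to funnel requests through a thread-safe serve_pool and drain it in
// first-come-first-served order:
//
//   serve_pool waiting(parse_serve_policy("fcfs"), pool_property::WAITING);
//   waiting.add(seq);                 // enqueue under the pool mutex
//   sequence next;
//   while (waiting.pop(&next)) { /* schedule `next` for prefill */ }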
14 | 15 | #include "models/model_utils/pool.h" 16 | 17 | serve_policy parse_serve_policy(const std::string& policy) { 18 | if (policy == "fcfs") { 19 | return serve_policy::FCFS; 20 | } else { 21 | fprintf(stderr, "Unexpected serve_policy %s!\n", policy.c_str()); 22 | return serve_policy::UNKNOWN; 23 | } 24 | } 25 | 26 | // fcfs_pool 27 | bool fcfs_pool::add(sequence seq) { 28 | context.emplace(seq); 29 | return true; 30 | } 31 | 32 | bool fcfs_pool::pop(sequence* seq) { 33 | if (empty()) { 34 | fprintf(stderr, "%s: pool is empty.\n", __func__); 35 | return false; 36 | } 37 | *seq = context.front(); 38 | context.pop(); 39 | return true; 40 | } 41 | 42 | void fcfs_pool::clear() { 43 | std::queue<sequence> empty_q; 44 | context.swap(empty_q); 45 | } 46 | 47 | bool fcfs_pool::empty() { return context.empty(); } 48 | 49 | int fcfs_pool::size() { return context.size(); } 50 | 51 | // serve_pool 52 | serve_pool::serve_pool(const pool_property& property) { 53 | // default policy = FCFS 54 | std::lock_guard<std::mutex> lock(mtx); 55 | if (internal_pool != nullptr) return; 56 | internal_pool = new fcfs_pool(property); 57 | } 58 | 59 | serve_pool::serve_pool(const serve_policy& policy, const pool_property& property) { 60 | std::lock_guard<std::mutex> lock(mtx); 61 | if (internal_pool != nullptr) return; 62 | switch (policy) { 63 | case serve_policy::FCFS: 64 | internal_pool = new fcfs_pool(property); break; 65 | default: 66 | NE_ASSERT(false); 67 | } 68 | } 69 | 70 | serve_pool::~serve_pool() { 71 | std::lock_guard<std::mutex> lock(mtx); 72 | if (internal_pool != nullptr) { 73 | delete internal_pool; 74 | } 75 | } 76 | 77 | bool serve_pool::add(sequence seq) { 78 | std::lock_guard<std::mutex> lock(mtx); 79 | return internal_pool->add(std::move(seq)); 80 | } 81 | 82 | bool serve_pool::pop(sequence* seq) { 83 | std::lock_guard<std::mutex> lock(mtx); 84 | return internal_pool->pop(seq); 85 | } 86 | 87 | void serve_pool::clear() { 88 | std::lock_guard<std::mutex> lock(mtx); 89 | internal_pool->clear(); 90 | } 91 | 92 | bool serve_pool::empty() { return internal_pool->empty(); } 93 | 94 | int serve_pool::size() { return internal_pool->size(); } 95 | -------------------------------------------------------------------------------- /neural_speed/models/model_utils/pool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
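// Inferred lifecycle -- a reading of the enums below, not documented in the original:
// a sequence normally advances WAITING -> PREFILL -> DECODING -> FINISHED, while the
// serving layer keeps separate WAITING / RUNNING / FINISHED pools and migrates each
// sequence between them as its seq_status changes.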
14 | 15 | #ifndef POOL_H 16 | #define POOL_H 17 | 18 | #include <mutex> 19 | #include <queue> 20 | #include "models/model_utils/model_types.h" 21 | 22 | enum class seq_status : int { 23 | UNKNOWN = 0, 24 | WAITING, 25 | PREFILL, 26 | DECODING, 27 | FINISHED, 28 | }; 29 | 30 | enum class pool_property : int { 31 | WAITING = 0, 32 | RUNNING, 33 | FINISHED, 34 | }; 35 | 36 | enum class serve_policy : int { 37 | UNKNOWN = 0, 38 | FCFS, // first come, first served 39 | }; 40 | 41 | serve_policy parse_serve_policy(const std::string& policy); 42 | 43 | struct sequence { 44 | int request_idx = -1; // -1 means unknown 45 | int64_t receive_time; 46 | int64_t end_time; 47 | std::vector<model_token> prompt_ids; 48 | std::vector<model_token> generated_ids; 49 | uint32_t n_prompt_tokens; 50 | uint32_t n_past; 51 | uint32_t n_total; 52 | uint32_t n_tokens; 53 | generation_config gen_conf; 54 | seq_status status = seq_status::UNKNOWN; 55 | uint64_t query_id; // query_id for the pybind response 56 | }; 57 | 58 | // abstract base class 59 | class pool { 60 | public: 61 | explicit pool(const pool_property& property) : property(property) {} 62 | virtual ~pool() {} 63 | virtual bool add(sequence seq) = 0; 64 | virtual bool pop(sequence* seq) = 0; 65 | virtual void clear() = 0; 66 | virtual bool empty() = 0; 67 | virtual int size() = 0; 68 | 69 | protected: 70 | const pool_property property; 71 | }; 72 | 73 | class fcfs_pool : public pool { 74 | public: 75 | explicit fcfs_pool(const pool_property& property) : pool(property) {} 76 | ~fcfs_pool() {} 77 | bool add(sequence seq) override; 78 | bool pop(sequence* seq) override; 79 | void clear() override; 80 | bool empty() override; 81 | int size() override; 82 | 83 | protected: 84 | std::queue<sequence> context; 85 | }; 86 | 87 | class serve_pool { 88 | public: 89 | explicit serve_pool(const pool_property& property); 90 | serve_pool(const serve_policy& policy, const pool_property& property); 91 | ~serve_pool(); 92 | bool add(sequence seq); 93 | bool pop(sequence* seq); 94 | void clear(); 95 | bool empty(); 96 | int size(); 97 | 98 | protected: 99 | pool* internal_pool = nullptr; 100 | std::mutex mtx; 101 | }; 102 | 103 | #endif // POOL_H 104 | -------------------------------------------------------------------------------- /neural_speed/models/model_utils/quant_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
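// Illustrative call sequence, a sketch only (the way the quant_layer argument is
// constructed is an assumption; this header only declares model_quantize itself):
//
//   quant_params p;                              // populated from CLI flags
//   std::shared_ptr<quant_layer_base> ql = ...;  // per-arch layer config, see quant_config.h
//   if (model_quantize(p, ql) != 0) { /* quantization failed */ }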
14 | #ifndef QUANT_UTILS_H 15 | #define QUANT_UTILS_H 16 | 17 | #include "application/common.h" 18 | #include "models/model_utils/quant_config.h" 19 | 20 | #ifdef MODEL_SHARED 21 | #if defined(_WIN32) && !defined(__MINGW32__) 22 | #ifdef MODEL_BUILD 23 | #define QUANT_API __declspec(dllexport) 24 | #else 25 | #define QUANT_API __declspec(dllimport) 26 | #endif 27 | #else 28 | #define QUANT_API __attribute__((visibility("default"))) 29 | #endif 30 | #else 31 | #define QUANT_API 32 | #endif 33 | 34 | QUANT_API int model_quantize(const quant_params& param, std::shared_ptr<quant_layer_base> quant_layer); 35 | size_t bestla_qpack(const int8_t* src_w, const float* src_scales, const int8_t* src_zps, void* dstpr, 36 | const quant_params_internal params, int nthread, int n, int k, int* g_idx); 37 | size_t bestla_quantize(const float* f32ptr, void* dstpr, const quant_params_internal params, int nthread, size_t n, 38 | size_t k); 39 | QUANT_API bool model_quantize_special(std::ifstream& finp, std::ofstream& fout, const ne_ftype ftype, 40 | const std::vector<std::string>& to_quant, 41 | const std::vector<std::string>& to_skip); 42 | QUANT_API bool whisper_model_quantize(const std::string& fname_inp, const std::string& fname_out, ne_ftype ftype); 43 | #endif // QUANT_UTILS_H 44 | -------------------------------------------------------------------------------- /neural_speed/models/model_utils/util.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #include "util.h" 15 | 16 | int32_t get_num_physical_cores() { 17 | #ifdef __linux__ 18 | // enumerate the set of thread siblings; the number of unique entries is the core count 19 | std::unordered_set<std::string> siblings; 20 | for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) { 21 | std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings"); 22 | if (!thread_siblings.is_open()) { 23 | break; // no more cpus 24 | } 25 | std::string line; 26 | if (std::getline(thread_siblings, line)) { 27 | siblings.insert(line); 28 | } 29 | } 30 | if (!siblings.empty()) { 31 | return static_cast<int32_t>(siblings.size()); 32 | } 33 | #elif defined(__APPLE__) && defined(__MACH__) 34 | int32_t num_physical_cores; 35 | size_t len = sizeof(num_physical_cores); 36 | int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, nullptr, 0); 37 | if (result == 0) { 38 | return num_physical_cores; 39 | } 40 | result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, nullptr, 0); 41 | if (result == 0) { 42 | return num_physical_cores; 43 | } 44 | #elif defined(_WIN32) 45 | // TODO: implement physical-core detection on Windows 46 | #endif 47 | unsigned int n_threads = std::thread::hardware_concurrency(); 48 | return n_threads > 0 ? (n_threads <= 4 ?
n_threads : n_threads / 2) : 4; 49 | } 50 | -------------------------------------------------------------------------------- /neural_speed/models/models.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #ifndef MODELS_H 15 | #define MODELS_H 16 | 17 | #include "models/model_utils/model_types.h" 18 | 19 | struct IModel { 20 | virtual ~IModel() = default;  // allow safe deletion through the interface pointer 21 | virtual void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap, bool use_mlock, 22 | bool vocab_only) = 0; 23 | virtual void load(model_context* ctx, model_progress_callback progress_callback, 24 | void* progress_callback_user_data) = 0; 25 | }; 26 | 27 | #endif // MODELS_H 28 | -------------------------------------------------------------------------------- /neural_speed/models/mpt/mpt.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef MPT_H 16 | #define MPT_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum mpt_model { 22 | MPT_UNKNOWN, 23 | MPT_7B, 24 | MPT_30B, 25 | }; 26 | 27 | static const model_scratch mpt_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 28 | switch (n_layers) { 29 | case 32: 30 | return { 31 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 33 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 34 | }; 35 | case 48: 36 | return { 37 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 38 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 39 | static_cast<size_t>(scratch_size_ratio * 8192) * MB, 40 | }; 41 | default: 42 | MODEL_ASSERT(false); 43 | } 44 | } 45 | 46 | class MPT : public IModel { 47 | private: 48 | model_archs arch = MODEL_MPT; 49 | std::unique_ptr<model_model_loader> ml; 50 | uint32_t n_layer, n_embd, n_ff, n_vocab; 51 | int n_gpu_layer; 52 | bool use_mmap, use_mlock, vocab_only; 53 | model_scratch scratch; 54 | 55 | public: 56 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 57 | bool vocab_only_) override; 58 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 59 | }; 60 | 61 | #endif // MPT_H 62 | -------------------------------------------------------------------------------- /neural_speed/models/phi/phi.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef PHI_H 16 | #define PHI_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum phi_model { 22 | PHI_UNKNOWN, 23 | PHI, 24 | }; 25 | 26 | static const model_scratch phi_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 27 | switch (n_layers) { 28 | case 24: 29 | return { 30 | static_cast<size_t>(scratch_size_ratio * 512) * MB, 31 | static_cast<size_t>(scratch_size_ratio * 512) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 33 | }; 34 | case 32: 35 | return { 36 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 37 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 38 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 39 | }; 40 | default: 41 | MODEL_ASSERT(false); 42 | } 43 | } 44 | 45 | class phi : public IModel { 46 | private: 47 | model_archs name = MODEL_PHI; 48 | std::unique_ptr<model_model_loader> ml; 49 | uint32_t n_layer, n_embd, n_ff, n_vocab; 50 | int n_ctx, n_gpu_layer; 51 | bool use_mmap, use_mlock, vocab_only; 52 | model_scratch scratch; 53 | 54 | public: 55 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 56 | bool vocab_only_) override; 57 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 58 | }; 59 | 60 | #endif // PHI_H 61 | -------------------------------------------------------------------------------- /neural_speed/models/phi/phi3.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef PHI3_H 16 | #define PHI3_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum phi3_model { 22 | PHI3_UNKNOWN, 23 | PHI3, 24 | }; 25 | 26 | static const model_scratch phi3_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 27 | switch (n_layers) { 28 | case 24: 29 | return { 30 | static_cast<size_t>(scratch_size_ratio * 512) * MB, 31 | static_cast<size_t>(scratch_size_ratio * 512) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 33 | }; 34 | case 32: 35 | return { 36 | static_cast<size_t>(scratch_size_ratio * 512) * MB, 37 | static_cast<size_t>(scratch_size_ratio * 512) * MB, 38 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 39 | }; 40 | default: 41 | MODEL_ASSERT(false); 42 | } 43 | } 44 | 45 | class phi3 : public IModel { 46 | private: 47 | model_archs name = MODEL_PHI3; 48 | std::unique_ptr<model_model_loader> ml; 49 | uint32_t n_layer, n_embd, n_ff, n_vocab; 50 | int n_ctx, n_gpu_layer; 51 | bool use_mmap, use_mlock, vocab_only; 52 | model_scratch scratch; 53 | 54 | public: 55 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 56 | bool vocab_only_) override; 57 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 58 | }; 59 | 60 | #endif // PHI3_H 61 | -------------------------------------------------------------------------------- /neural_speed/models/qwen/qwen.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef QWEN_H 16 | #define QWEN_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum QWEN_model { 22 | QWEN_UNKNOWN, 23 | QWEN_7B, 24 | QWEN_14B, 25 | }; 26 | 27 | static const model_scratch qwen_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 28 | switch (n_layers) { 29 | case 40: 30 | return { 31 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 32 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 33 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 34 | }; 35 | case 32: 36 | return { 37 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 38 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 39 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 40 | }; 41 | case 24: 42 | return { 43 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 44 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 45 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 46 | }; 47 | case 28: 48 | return { 49 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 50 | static_cast<size_t>(scratch_size_ratio * 2048) * MB, 51 | static_cast<size_t>(scratch_size_ratio * 4096) * MB, 52 | }; 53 | case 80: 54 | return { 55 | static_cast<size_t>(scratch_size_ratio * 3 * 4096) * MB, 56 | static_cast<size_t>(scratch_size_ratio * 3 * 2048) * MB, 57 | static_cast<size_t>(scratch_size_ratio * 3 * 4096) * MB, 58 | }; 59 | default: 60 | MODEL_ASSERT(false); 61 | } 62 | } 63 | 64 | class QWEN : public IModel { 65 | private: 66 | model_archs arch = MODEL_QWEN; 67 | std::unique_ptr<model_model_loader> ml; 68 | uint32_t n_layer, n_embd, n_ff, n_vocab, n_head, n_head_kv; 69 | int n_gpu_layer; 70 | bool use_mmap, use_mlock, vocab_only; 71 | model_scratch scratch; 72 | 73 | public: 74 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 75 | bool vocab_only_) override; 76 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 77 | }; 78 | 79 | #endif // QWEN_H 80 | -------------------------------------------------------------------------------- /neural_speed/models/requirements/baichuan.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #=============================================================================== 3 | # Copyright (c) 2023 Intel Corporation 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | #=============================================================================== 17 | 18 | # To avoid the error: 'ChatGLMTokenizer' object has no attribute 'sp_tokenizer' 19 | pip install -r "$(dirname "${BASH_SOURCE[0]}")/common.txt" transformers==4.33.1 20 | -------------------------------------------------------------------------------- /neural_speed/models/requirements/baichuan13b-gptq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #=============================================================================== 3 | # Copyright (c) 2023 Intel Corporation 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | #=============================================================================== 17 | 18 | # To avoid the error: 'ChatGLMTokenizer' object has no attribute 'sp_tokenizer' 19 | pip install -r "$(dirname "${BASH_SOURCE[0]}")/common.txt" transformers==4.33.1 20 | -------------------------------------------------------------------------------- /neural_speed/models/requirements/chatglm-6b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #=============================================================================== 3 | # Copyright (c) 2023 Intel Corporation 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | #=============================================================================== 17 | 18 | # To avoid the error: 'ChatGLMTokenizer' object has no attribute 'sp_tokenizer' 19 | pip install -r "$(dirname "${BASH_SOURCE[0]}")/common.txt" transformers==4.33.1 20 | -------------------------------------------------------------------------------- /neural_speed/models/requirements/common.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | accelerate 3 | datasets 4 | einops 5 | gguf 6 | huggingface_hub 7 | lm_eval==0.4.2 8 | matplotlib 9 | numpy 10 | peft 11 | protobuf<3.20 12 | sentencepiece 13 | tiktoken 14 | torch 15 | transformers 16 | transformers_stream_generator 17 | zipfile38 18 | -------------------------------------------------------------------------------- /neural_speed/models/requirements/mistral.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #=============================================================================== 3 | # Copyright (c) 2023 Intel Corporation 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | #=============================================================================== 17 | 18 | pip install -r "$(dirname "${BASH_SOURCE[0]}")/common.txt" "transformers>=4.34.0" 19 | -------------------------------------------------------------------------------- /neural_speed/models/requirements/mixtral-gptq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #=============================================================================== 3 | # Copyright (c) 2023 Intel Corporation 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | #=============================================================================== 17 | 18 | pip install -r "$(dirname "${BASH_SOURCE[0]}")/common.txt" "transformers>=4.34.0" 19 | -------------------------------------------------------------------------------- /neural_speed/models/stablelm/stablelm.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef STABLELM_H 16 | #define STABLELM_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum stablelm_model { 22 | STABLELM_UNKNOWN, 23 | STABLELM_2_1_6B, 24 | STABLELM_2_12B, 25 | STABLELM_3B, 26 | }; 27 | 28 | static const model_scratch stablelm_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 29 | switch (n_layers) { 30 | case 24: // StableLM-2-1.6B & StableLM-2-Zephyr-1.6B 31 | return { 32 | static_cast<size_t>(scratch_size_ratio * 512) * MB, 33 | static_cast<size_t>(scratch_size_ratio * 512) * MB, 34 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 35 | }; 36 | case 32: // StableLM-3B & Stable-Code-3B 37 | return { 38 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 39 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 40 | static_cast<size_t>(scratch_size_ratio * 1024) * MB, 41 | }; 42 | case 40: // StableLM-2-12B 43 | return { 44 | static_cast<size_t>(scratch_size_ratio * 2560) * MB, 45 | static_cast<size_t>(scratch_size_ratio * 2560) * MB, 46 | static_cast<size_t>(scratch_size_ratio * 5120) * MB, 47 | }; 48 | default: 49 | MODEL_ASSERT(false); 50 | } 51 | } 52 | 53 | class stablelm : public IModel { 54 | private: 55 | model_archs name = MODEL_STABLELM; 56 | std::unique_ptr<model_model_loader> ml; 57 | uint32_t n_layer, n_embd, n_ff, n_vocab, n_head, n_head_kv, n_embd_head_k; 58 | int n_ctx, n_gpu_layer; 59 | bool use_mmap, use_mlock, vocab_only; 60 | model_scratch scratch; 61 | 62 | public: 63 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 64 | bool vocab_only_) override; 65 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 66 | }; 67 | 68 | #endif // STABLELM_H 69 | -------------------------------------------------------------------------------- /neural_speed/models/starcoder/starcoder.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #ifndef STARCODER_H 16 | #define STARCODER_H 17 | 18 | #include "models/model_utils/model_files.h" 19 | #include "models/model_utils/model_types.h" 20 | 21 | enum starcoder_model { 22 | STARCODER_UNKNOWN, 23 | STARCODER_7B, 24 | STARCODER_13B, 25 | STARCODER_30B, 26 | STARCODER_65B, 27 | }; 28 | 29 | static const model_scratch starcoder_mem_req(int n_layers, float scratch_size_ratio = 1.0f) { 30 | switch (n_layers) { 31 | case 24: 32 | return { 33 | static_cast<size_t>(scratch_size_ratio * 3072 * 2) * MB, 34 | static_cast<size_t>(scratch_size_ratio * 2048 * 2) * MB, 35 | static_cast<size_t>(scratch_size_ratio * 3072 * 2) * MB, 36 | }; 37 | case 36: 38 | return { 39 | static_cast<size_t>(scratch_size_ratio * 3072 * 2) * MB, 40 | static_cast<size_t>(scratch_size_ratio * 2048 * 2) * MB, 41 | static_cast<size_t>(scratch_size_ratio * 3072 * 2) * MB, 42 | }; 43 | case 40: 44 | return { 45 | static_cast<size_t>(scratch_size_ratio * 3072 * 8) * MB, 46 | static_cast<size_t>(scratch_size_ratio * 2048 * 8) * MB, 47 | static_cast<size_t>(scratch_size_ratio * 3072 * 8) * MB, 48 | }; 49 | default: 50 | MODEL_ASSERT(false); 51 | } 52 | } 53 | 54 | class STARCODER : public IModel { 55 | private: 56 | model_archs arch = MODEL_STARCODER; 57 | std::unique_ptr<model_model_loader> ml; 58 | uint32_t n_layer, n_embd, n_ff, n_vocab; 59 | int n_gpu_layer; 60 | bool use_mmap, use_mlock, vocab_only; 61 | model_scratch scratch; 62 | 63 | public: 64 | void init(const char* path_model, model_context* ctx, int n_gpu_layers, bool use_mmap_, bool use_mlock_, 65 | bool vocab_only_) override; 66 | void load(model_context* ctx, model_progress_callback progress_callback, void* progress_callback_user_data) override; 67 | }; 68 | 69 | #endif // STARCODER_H 70 | -------------------------------------------------------------------------------- /neural_speed/vectors/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(cpu) 16 | if (NS_GPU) 17 | add_subdirectory(gpu) 18 | endif() 19 | 20 | add_library_w_warning(ne_vec ele_reduce.cpp) 21 | target_link_libraries(ne_vec PUBLIC cpu_vec) 22 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | add_library_w_warning(cpu_vec vec_arithmetic.cpp vec_compare.cpp vec_convert.cpp vec_set.cpp vec_store.cpp vec_load.cpp) 16 | set_target_properties(cpu_vec PROPERTIES LINKER_LANGUAGE CXX) 17 | set_property(TARGET cpu_vec PROPERTY POSITION_INDEPENDENT_CODE ON) 18 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef ENGINE_EXECUTOR_INCLUDE_VEC_HPP_ 16 | #define ENGINE_EXECUTOR_INCLUDE_VEC_HPP_ 17 | 18 | #include "vec_arithmetic.hpp" 19 | #include "vec_base.hpp" 20 | #include "vec_compare.hpp" 21 | #include "vec_convert.hpp" 22 | #include "vec_set.hpp" 23 | 24 | #endif // ENGINE_EXECUTOR_INCLUDE_VEC_HPP_ 25 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec.hpp.gch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/neural-speed/bfd5d3c17dee18a20e2768f855f2d8fe132fc579/neural_speed/vectors/cpu/vec.hpp.gch -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_arithmetic.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
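The arithmetic declarations below all operate on fp32x16, the 16-float "virtual register" defined in vec_base.hpp (not shown in this tree). The idea, sketched here with illustrative stand-in types, is that one fp32x16 is a single __m512 when AVX-512 is available and a pair of __m256 halves otherwise, with every operation applied to both halves:

#include <immintrin.h>

struct f32x16_demo {  // stand-in for the real fp32x16 from vec_base.hpp
#if __AVX512F__
  __m512 first;
#else
  __m256 first;   // lanes 0..7
  __m256 second;  // lanes 8..15
#endif
};

inline f32x16_demo add_demo(f32x16_demo a, f32x16_demo b) {
#if __AVX512F__
  return {_mm512_add_ps(a.first, b.first)};
#else
  return {_mm256_add_ps(a.first, b.first), _mm256_add_ps(a.second, b.second)};
#endif
}

This is why every definition in the cpu/ sources comes in an #if __AVX512F__ / #else pair: the AVX2 path simply repeats the same instruction on .first and .second.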
14 | 15 | #ifndef ENGINE_EXECUTOR_INCLUDE_VEC_ARITHMETIC_HPP_ 16 | #define ENGINE_EXECUTOR_INCLUDE_VEC_ARITHMETIC_HPP_ 17 | 18 | #include "vec_base.hpp" 19 | 20 | fp32x16 sub_fp32x16(fp32x16 x, fp32x16 y); 21 | REGISTER_KERNEL_T(sub_fp32x16, fp32x16, fp32x16, fp32x16); 22 | 23 | fp32x16 fmsub_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z); 24 | REGISTER_KERNEL_T(fmsub_fp32x16, fp32x16, fp32x16, fp32x16, fp32x16); 25 | 26 | fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z); 27 | 28 | fp32x16 add_fp32x16(fp32x16 x, fp32x16 y); 29 | REGISTER_KERNEL_T(add_fp32x16, fp32x16, fp32x16, fp32x16); 30 | 31 | fp32x16 fmadd_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z); 32 | REGISTER_KERNEL_T(fmadd_fp32x16, fp32x16, fp32x16, fp32x16, fp32x16); 33 | 34 | fp32x16 mul_fp32x16(fp32x16 x, fp32x16 y); 35 | REGISTER_KERNEL_T(mul_fp32x16, fp32x16, fp32x16, fp32x16); 36 | 37 | fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y); 38 | 39 | template <int rounding> 40 | fp32x16 mul_round_fp32x16(fp32x16 x, fp32x16 y); 41 | 42 | fp32x16 div_fp32x16(fp32x16 x, fp32x16 y); 43 | REGISTER_KERNEL_T(div_fp32x16, fp32x16, fp32x16, fp32x16); 44 | 45 | float reduce_add_fp32x16(fp32x16 x); 46 | REGISTER_KERNEL_T(reduce_add_fp32x16, float, fp32x16); 47 | 48 | fp32x16 sqrt_fp32x16(fp32x16 x); 49 | REGISTER_KERNEL_T(sqrt_fp32x16, fp32x16, fp32x16); 50 | 51 | fp32x16 rsqrt14_fp32x16(fp32x16 x); 52 | REGISTER_KERNEL_T(rsqrt14_fp32x16, fp32x16, fp32x16); 53 | 54 | fp32x16 ceil_fp32x16(fp32x16 x); 55 | REGISTER_KERNEL_T(ceil_fp32x16, fp32x16, fp32x16); 56 | 57 | fp32x16 scale_fp32x16(fp32x16 x, fp32x16 y); 58 | REGISTER_KERNEL_T(scale_fp32x16, fp32x16, fp32x16, fp32x16); 59 | 60 | float dot_fp32x16(fp32x16 x, fp32x16 y); 61 | REGISTER_KERNEL_T(dot_fp32x16, float, fp32x16, fp32x16); 62 | 63 | fp32x16 abs_fp32x16(fp32x16 x); 64 | REGISTER_KERNEL_T(abs_fp32x16, fp32x16, fp32x16); 65 | 66 | #endif // ENGINE_EXECUTOR_INCLUDE_VEC_ARITHMETIC_HPP_ 67 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_compare.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
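reduce_max_fp32x16 in this file shows the standard horizontal-reduction ladder for the AVX2 path: combine the two 256-bit halves, then keep halving the width (256 -> 128 -> 64 -> 32 bits) until one lane remains. The same ladder with addition in place of max, as a standalone illustration:

#include <immintrin.h>

inline float hsum256_demo(__m256 v) {
  const __m128 lo = _mm256_castps256_ps128(v);    // lanes 0..3
  const __m128 hi = _mm256_extractf128_ps(v, 1);  // lanes 4..7
  __m128 s = _mm_add_ps(lo, hi);                  // 8 -> 4 partial sums
  s = _mm_add_ps(s, _mm_movehl_ps(s, s));         // 4 -> 2
  s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 0x55));  // 2 -> 1 (lane 1 folded onto lane 0)
  return _mm_cvtss_f32(s);
}

Each step costs one shuffle plus one arithmetic op; a full 16-wide reduction only adds one more combine up front to merge the two __m256 halves.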
14 | 15 | #include "vec_compare.hpp" 16 | 17 | fp32x16 min_fp32x16(fp32x16 a, fp32x16 b) { 18 | #if __AVX512F__ 19 | return {_mm512_min_ps(a.first, b.first)}; 20 | #else 21 | return {_mm256_min_ps(a.first, b.first), _mm256_min_ps(a.second, b.second)}; 22 | #endif 23 | } 24 | 25 | s32x16 max_s32x16(s32x16 a, s32x16 b) { 26 | #if __AVX512F__ 27 | return {_mm512_max_epi32(a.first, b.first)}; 28 | #else 29 | return {_mm256_max_epi32(a.first, b.first), _mm256_max_epi32(a.second, b.second)}; 30 | #endif 31 | } 32 | 33 | fp32x16 max_fp32x16(fp32x16 a, fp32x16 b) { 34 | #if __AVX512F__ 35 | return {_mm512_max_ps(a.first, b.first)}; 36 | #else 37 | return {_mm256_max_ps(a.first, b.first), _mm256_max_ps(a.second, b.second)}; 38 | #endif 39 | } 40 | 41 | float reduce_max_fp32x16(fp32x16 x) { 42 | #if __AVX512F__ 43 | return {_mm512_reduce_max_ps(x.first)}; 44 | #else 45 | const __m256 x256 = _mm256_max_ps(x.first, x.second); 46 | const __m128 x128 = _mm_max_ps(_mm256_extractf128_ps(x256, 1), _mm256_castps256_ps128(x256)); 47 | const __m128 x64 = _mm_max_ps(x128, _mm_movehl_ps(x128, x128)); 48 | const __m128 x32 = _mm_max_ss(x64, _mm_shuffle_ps(x64, x64, 0x55)); 49 | return _mm_cvtss_f32(x32); 50 | #endif 51 | } 52 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_compare.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef ENGINE_EXECUTOR_INCLUDE_VEC_COMPARE_HPP_ 16 | #define ENGINE_EXECUTOR_INCLUDE_VEC_COMPARE_HPP_ 17 | 18 | #include "vec_base.hpp" 19 | 20 | fp32x16 min_fp32x16(fp32x16 a, fp32x16 b); 21 | 22 | s32x16 max_s32x16(s32x16 a, s32x16 b); 23 | 24 | fp32x16 max_fp32x16(fp32x16 a, fp32x16 b); 25 | 26 | float reduce_max_fp32x16(fp32x16 x); 27 | REGISTER_KERNEL_T(reduce_max_fp32x16, float, fp32x16); 28 | 29 | #endif // ENGINE_EXECUTOR_INCLUDE_VEC_COMPARE_HPP_ 30 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_convert.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #ifndef ENGINE_EXECUTOR_INCLUDE_VEC_CONVERT_HPP_ 16 | #define ENGINE_EXECUTOR_INCLUDE_VEC_CONVERT_HPP_ 17 | 18 | #include "vec_base.hpp" 19 | 20 | template <int rounding> 21 | s32x16 cvt_roundfp32x16_s32x16(fp32x16 a); 22 | template <int rounding> 23 | struct ne_cvt_roundfp32x16_s32x16_kernel_t : public kernel_t { 24 | ne_cvt_roundfp32x16_s32x16_kernel_t() { func_ = cvt_roundfp32x16_s32x16<rounding>; } 25 | }; 26 | 27 | template <int rounding> 28 | s32x16 maskz_cvt_roundfp32x16_s32x16(int mask, fp32x16 a); 29 | bf16x16 cvt_fp32x16_bf16x16(fp32x16 a); 30 | 31 | fp32x16 cvt_bf16x16_fp32x16(bf16x16 a); 32 | 33 | fp32x16 maskz_cvt_bf16x16_fp32x16(int mask, bf16x16 a); 34 | 35 | u8x16 cvt_u32x16_u8x16(u32x16 a); 36 | u8x16 maskz_cvt_u32x16_u8x16(int mask, u32x16 a); 37 | 38 | s8x16 cvt_s32x16_s8x16(s32x16 a); 39 | s8x16 maskz_cvt_s32x16_s8x16(const int mask, s32x16 a); 40 | 41 | void cvtu32x16_store_u8x16(void* base_addr, u32x16 a); 42 | void mask_cvtu32x16_store_u8x16(void* base_addr, int mask, u32x16 a); 43 | #endif // ENGINE_EXECUTOR_INCLUDE_VEC_CONVERT_HPP_ 44 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_load.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "vec_load.hpp" 16 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_load.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
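The load helpers below are typically driven in a chunk-plus-tail loop: process full 16-float blocks with the SIMD path, then finish the remainder with scalar code (ele_reduce.cpp uses exactly this shape). A self-contained sketch of the pattern using raw intrinsics, independent of this header's types:

#include <immintrin.h>
#include <cstddef>

// scales x[0..n) in place; illustrative, not a repo API
inline void scale_inplace_demo(float* x, std::size_t n, float s) {
  std::size_t i = 0;
#if __AVX512F__
  const __m512 vs = _mm512_set1_ps(s);
  for (; i + 16 <= n; i += 16)
    _mm512_storeu_ps(x + i, _mm512_mul_ps(_mm512_loadu_ps(x + i), vs));
#endif
  for (; i < n; ++i) x[i] *= s;  // scalar tail (and full fallback without AVX-512)
}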
14 | 15 | #ifndef ENGINE_EXECUTOR_INCLUDE_VEC_LOAD_HPP_ 16 | #define ENGINE_EXECUTOR_INCLUDE_VEC_LOAD_HPP_ 17 | 18 | #include "vec_base.hpp" 19 | 20 | inline fp32x16 load_fp32x16(void const* mem_addr) { 21 | #if __AVX512F__ 22 | return {_mm512_loadu_ps(mem_addr)}; 23 | #else 24 | float const* mem_addr_fp32 = reinterpret_cast<float const*>(mem_addr); 25 | return {_mm256_loadu_ps(mem_addr_fp32), _mm256_loadu_ps(mem_addr_fp32 + 8)}; 26 | #endif 27 | } 28 | template <> 29 | inline fp32x16 load_kernel_t<fp32x16>(const void* src) { 30 | return load_fp32x16(src); 31 | } 32 | inline fp32x16 mask_load_fp32x16(fp32x16 src, int mask, void const* mem_addr) { 33 | #if __AVX512F__ 34 | return {_mm512_mask_loadu_ps(src.first, mask, mem_addr)}; 35 | #else 36 | float const* mem_addr_fp32 = reinterpret_cast<float const*>(mem_addr); 37 | return {_mm256_loadu_ps(mem_addr_fp32), _mm256_loadu_ps(mem_addr_fp32 + 8)};  // WARNING: this fallback ignores `src` and `mask` and loads all 16 lanes; callers must not rely on masking semantics without AVX-512 38 | #endif 39 | } 40 | 41 | inline bf16x16 load_bf16x16(void const* mem_addr) { 42 | __m256i const* mem_addr_bf16 = reinterpret_cast<__m256i const*>(mem_addr); 43 | return {_mm256_loadu_si256(mem_addr_bf16)}; 44 | } 45 | template <> 46 | inline bf16x16 load_kernel_t<bf16x16>(const void* src) { 47 | return load_bf16x16(src); 48 | } 49 | 50 | inline bf16x16 maskz_load_bf16x16(int mask, void const* mem_addr); 51 | 52 | #endif // ENGINE_EXECUTOR_INCLUDE_VEC_LOAD_HPP_ 53 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_set.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #include "vec_set.hpp" 16 | 17 | fp32x16 set1_fp32x16(const float x) { 18 | #if __AVX512F__ 19 | return {_mm512_set1_ps(x)}; 20 | #else 21 | return {_mm256_set1_ps(x), _mm256_set1_ps(x)}; 22 | #endif 23 | } 24 | 25 | s8x16 set1_s8x16(const int8_t x) { return {_mm_set1_epi8(x)}; } 26 | 27 | s16x16 set1_s16x16(const int16_t x) { return {_mm256_set1_epi16(x)}; } 28 | 29 | fp16x16 set1_fp16x16(const uint16_t x) { return {_mm256_set1_epi16(x)}; } 30 | 31 | s32x16 set1_s32x16(const int32_t x) { 32 | #if __AVX512F__ 33 | return {_mm512_set1_epi32(x)}; 34 | #else 35 | return {_mm256_set1_epi32(x), _mm256_set1_epi32(x)}; 36 | #endif 37 | } 38 | 39 | s32x16 setzero_s32x16() { 40 | #if __AVX512F__ 41 | return {_mm512_setzero_epi32()}; 42 | #else 43 | return {_mm256_setzero_si256(), _mm256_setzero_si256()}; 44 | #endif 45 | } 46 | 47 | fp32x16 setzero_fp32x16() { 48 | #if __AVX512F__ 49 | return {_mm512_setzero_ps()}; 50 | #else 51 | return {_mm256_setzero_ps(), _mm256_setzero_ps()}; 52 | #endif 53 | } 54 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_set.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef ENGINE_EXECUTOR_INCLUDE_VEC_SET_HPP_ 16 | #define ENGINE_EXECUTOR_INCLUDE_VEC_SET_HPP_ 17 | 18 | #include "vec_base.hpp" 19 | 20 | fp32x16 set1_fp32x16(const float x); 21 | REGISTER_KERNEL_T(set1_fp32x16, fp32x16, float); 22 | 23 | s8x16 set1_s8x16(const int8_t x); 24 | REGISTER_KERNEL_T(set1_s8x16, s8x16, int8_t); 25 | 26 | s16x16 set1_s16x16(const int16_t x); 27 | REGISTER_KERNEL_T(set1_s16x16, s16x16, int16_t); 28 | 29 | fp16x16 set1_fp16x16(const uint16_t x); 30 | REGISTER_KERNEL_T(set1_fp16x16, fp16x16, uint16_t); 31 | 32 | s32x16 set1_s32x16(const int32_t x); 33 | REGISTER_KERNEL_T(set1_s32x16, s32x16, int32_t); 34 | 35 | s32x16 setzero_s32x16(); 36 | 37 | fp32x16 setzero_fp32x16(); 38 | 39 | #endif // ENGINE_EXECUTOR_INCLUDE_VEC_SET_HPP_ 40 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_store.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "vec_store.hpp" 16 | -------------------------------------------------------------------------------- /neural_speed/vectors/cpu/vec_store.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef ENGINE_EXECUTOR_INCLUDE_VEC_STORE_HPP_ 16 | #define ENGINE_EXECUTOR_INCLUDE_VEC_STORE_HPP_ 17 | 18 | #include "vec_base.hpp" 19 | 20 | inline void store_s8x16(void* mem_addr, s8x16 a) { _mm_storeu_si128(reinterpret_cast<__m128i*>(mem_addr), a.first); } 21 | inline void store_u8x16(void* mem_addr, u8x16 a) { _mm_storeu_si128(reinterpret_cast<__m128i*>(mem_addr), a.first); } 22 | template <> 23 | inline void store_kernel_t(void* dst, s8x16 src) { 24 | store_s8x16(dst, src); 25 | } 26 | 27 | inline void mask_store_s8x16(void* mem_addr, const int mask, s8x16 a) { 28 | #ifdef __AVX512F__ 29 | _mm_mask_storeu_epi8(mem_addr, mask, a.first); 30 | #else 31 | __m128i mask_reg = 32 | _mm_set_epi8(mask & 32768, mask & 16384, mask & 8192, mask & 4096, mask & 2048, mask & 1024, mask & 512, 33 | mask & 256, mask & 128, mask & 64, mask & 32, mask & 16, mask & 8, mask & 4, mask & 2, mask & 1); 34 | _mm_maskmoveu_si128(a.first, mask_reg, reinterpret_cast(mem_addr)); 35 | #endif 36 | } 37 | 38 | inline void mask_store_u8x16(void* mem_addr, const int mask, u8x16 a) { 39 | #ifdef __AVX512F__ 40 | _mm_mask_storeu_epi8(mem_addr, mask, a.first); 41 | #else 42 | __m128i mask_reg = 43 | _mm_set_epi8(mask & 32768, mask & 16384, mask & 8192, mask & 4096, mask & 2048, mask & 1024, mask & 512, 44 | mask & 256, mask & 128, mask & 64, mask & 32, mask & 16, mask & 8, mask & 4, mask & 2, mask & 1); 45 | _mm_maskmoveu_si128(a.first, mask_reg, reinterpret_cast(mem_addr)); 46 | #endif 47 | } 48 | 49 | inline void store_fp32x16(void* mem_addr, fp32x16 a) { 50 | #ifdef __AVX512F__ 51 | _mm512_storeu_ps(mem_addr, a.first); 52 | #else 53 | float* mem_addr_fp32 = reinterpret_cast(mem_addr); 54 | _mm256_storeu_ps(mem_addr_fp32, a.first); 55 | _mm256_storeu_ps(mem_addr_fp32 + 8, a.second); 56 | #endif 57 | } 58 | 59 | template <> 60 | inline void store_kernel_t(void* dst, fp32x16 src) { 61 | store_fp32x16(dst, src); 62 | } 63 | 64 | inline void store_bf16x16(void* mem_addr, bf16x16 a) { 65 | _mm256_storeu_si256(reinterpret_cast<__m256i*>(mem_addr), a.first); 66 | } 67 | 68 | template <> 69 | inline void store_kernel_t(void* dst, bf16x16 src) { 70 | store_bf16x16(dst, src); 71 | } 72 | 73 | #endif // ENGINE_EXECUTOR_INCLUDE_VEC_STORE_HPP_ 74 | -------------------------------------------------------------------------------- /neural_speed/vectors/ele_reduce.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #include "vectors/cpu/vec.hpp" 15 | #include "vectors/ele_reduce.h" 16 | #include <cmath> 17 | 18 | void ne_vec_norm_f32_(const int n, float* s, const float* x) { 19 | float sum = 0.0; 20 | ne_dot_fp32x16_kernel_t k_t; 21 | for (int i = 0; i < n / 16; ++i) { 22 | float tmp; 23 | k_t(reinterpret_cast<void*>(&tmp), reinterpret_cast<const void*>(x + i * 16), 24 | reinterpret_cast<const void*>(x + i * 16)); 25 | sum += tmp; 26 | } 27 | for (int i = n / 16 * 16; i < n; i++) sum += x[i] * x[i]; 28 | *s = sqrtf(sum); 29 | } 30 | 31 | void ne_vec_sum_f32_(const int n, float* s, const float* x) { 32 | float sum = 0.0; 33 | ne_reduce_add_fp32x16_kernel_t k_t; 34 | for (int i = 0; i < n / 16; ++i) { 35 | float tmp; 36 | k_t(reinterpret_cast<void*>(&tmp), reinterpret_cast<const void*>(x + i * 16)); 37 | sum += tmp; 38 | } 39 | for (int i = n / 16 * 16; i < n; i++) sum += x[i]; 40 | *s = sum; 41 | } 42 | 43 | void ne_vec_max_f32_(const int n, float* s, const float* x) { 44 | float max = -INFINITY; 45 | ne_reduce_max_fp32x16_kernel_t k_t; 46 | for (int i = 0; i < n / 16; ++i) { 47 | float tmp; 48 | k_t(reinterpret_cast<void*>(&tmp), reinterpret_cast<const void*>(x + i * 16)); 49 | max = max > tmp ? max : tmp; 50 | } 51 | for (int i = n / 16 * 16; i < n; i++) { 52 | max = x[i] > max ? x[i] : max; 53 | } 54 | *s = max; 55 | } 56 | 57 | void ne_vec_norm_inv_f32_(const int n, float* s, const float* x) { 58 | ne_vec_norm_f32_(n, s, x); 59 | *s = 1.f / (*s); 60 | } 61 | void ne_vec_sum_ggf_(const int n, double* s, const float* x) { 62 | double sum = 0.0;  // accumulate in double; this _ggf variant exists precisely for the higher-precision path 63 | for (int i = 0; i < n; ++i) { 64 | sum += static_cast<double>(x[i]); 65 | } 66 | *s = sum; 67 | } 68 | -------------------------------------------------------------------------------- /neural_speed/vectors/ele_reduce.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
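This header exports the handful of C-callable reductions implemented in ele_reduce.cpp above; the VEC_API machinery only matters when the library is built as a shared object. A hedged usage sketch, assuming the declarations below are in scope and the ne_vec library is linked:

#include <cstdio>
#include <vector>

int reduce_usage_demo() {
  std::vector<float> x(100, 0.5f);
  float sum = 0.f, mx = 0.f;
  ne_vec_sum_f32_(static_cast<int>(x.size()), &sum, x.data());
  ne_vec_max_f32_(static_cast<int>(x.size()), &mx, x.data());
  std::printf("sum=%f max=%f\n", sum, mx);  // expect 50.0 and 0.5
  return 0;
}

The trailing-underscore names distinguish these CPU entry points from the GPU variants under vectors/gpu, which drop the underscore.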
14 | #pragma once 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | #ifdef VEC_SHARED 21 | #if defined(_WIN32) && !defined(__MINGW32__) 22 | #ifdef VEC_BUILD 23 | #define VEC_API __declspec(dllexport) 24 | #else 25 | #define VEC_API __declspec(dllimport) 26 | #endif 27 | #else 28 | #define VEC_API __attribute__((visibility("default"))) 29 | #endif 30 | #else 31 | #define VEC_API 32 | #endif 33 | VEC_API void ne_vec_norm_f32_(const int n, float* s, const float* x); 34 | VEC_API void ne_vec_sum_f32_(const int n, float* s, const float* x); 35 | 36 | VEC_API void ne_vec_sum_ggf_(const int n, double* s, const float* x); 37 | 38 | VEC_API void ne_vec_max_f32_(const int n, float* s, const float* x); 39 | 40 | VEC_API void ne_vec_norm_inv_f32_(const int n, float* s, const float* x); 41 | 42 | #ifdef __cplusplus 43 | } 44 | #endif 45 | -------------------------------------------------------------------------------- /neural_speed/vectors/gpu/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | cmake_minimum_required(VERSION 3.11) 15 | project(gpu_vectors) 16 | 17 | set (CMAKE_CXX_COMPILER "icpx") 18 | set (CMAKE_C_COMPILER "icx") 19 | add_compile_options(-fsycl) 20 | add_library(gpu_vectors STATIC ele_wise.cpp ele_reduce.cpp) 21 | set_property(TARGET gpu_vectors PROPERTY POSITION_INDEPENDENT_CODE ON) 22 | -------------------------------------------------------------------------------- /neural_speed/vectors/gpu/ele_reduce.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include <iostream> 16 | #include "reduce.h" 17 | 18 | static sycl::queue q = sycl::queue(); 19 | // inline static void ne_vec_norm_f32 (const int n, float * s, const float * x) { ne_vec_dot_f32(n, s, x, x); *s = 20 | // sqrtf(*s); } 21 | 22 | void ne_vec_sum_f32(const int n, float* s, const float* x) { reduce<float, sycl::plus<float>, 16>(n, s, x, q); } 23 | 24 | // inline static void ne_vec_sum_ggf(const int n, ne_float * s, const float * x) { 25 | // ne_float sum = 0.0; 26 | // for (int i = 0; i < n; ++i) { 27 | // sum += (ne_float)x[i]; 28 | // } 29 | // *s = sum; 30 | // } 31 | 32 | void ne_vec_max_f32(const int n, float* s, const float* x) { reduce<float, sycl::maximum<float>, 16>(n, s, x, q); } 33 | 34 | // inline static void ne_vec_norm_inv_f32(const int n, float * s, const float * x) { 35 | // ne_vec_norm_f32(n, s, x); 36 | // *s = 1.f/(*s); 37 | // } 38 | 39 | int main() { 40 | size_t n = 32 * 10; 41 | std::vector<float> h_src(n); 42 | std::vector<float> h_dst(n); 43 | for (size_t i = 0; i < n; i++) { 44 | h_src[i] = 1.f; 45 | } 46 | h_src[1] = 5.f; 47 | ne_vec_max_f32(n, h_dst.data(), h_src.data()); 48 | std::cout << h_dst[0] << std::endl; 49 | return 0; 50 | } 51 | -------------------------------------------------------------------------------- /neural_speed/vectors/gpu/reduce.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #include <cassert> 15 | #include <sycl/sycl.hpp> 16 | #include "sycl/reduction.hpp" 17 | 18 | template <typename T, typename BinaryOperation, int VL> 19 | void reduce(const int n, T* s, const T* x, sycl::queue& q) { 20 | assert(n % VL == 0); 21 | 22 | sycl::buffer buf(const_cast<T*>(x), sycl::range<1>(n)); 23 | sycl::buffer sum_buf(s, sycl::range<1>(1)); 24 | BinaryOperation BOp; 25 | q.submit([&](auto& h) { 26 | sycl::accessor buf_acc(buf, h, sycl::read_only); 27 | auto retr = sycl::reduction(sum_buf, h, BOp); 28 | h.parallel_for(sycl::nd_range<1>{n, 32}, retr, [=](sycl::nd_item<1> item, auto& retr_arg) { 29 | int glob_id = item.get_global_id(0); 30 | retr_arg.combine(buf_acc[glob_id]); 31 | }); 32 | }); 33 | } 34 | -------------------------------------------------------------------------------- /neural_speed/vectors/gpu/test.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
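The kernel in this test is the classic shared-memory tree reduction: the grid launches n/2 work-items, each first combines two global elements, then each block halves the active stride log2(block-size) times, and work-item 0 writes the block's partial result. A scalar model of one block, for illustration (bs must be a power of two):

#include <cstddef>
#include <vector>

inline double block_reduce_model(const std::vector<double>& src, std::size_t g, std::size_t bs) {
  std::vector<double> tmp(bs);
  for (std::size_t l = 0; l < bs; ++l)  // each work-item's first combine of two elements
    tmp[l] = src[g * bs * 2 + l] + src[g * bs * 2 + l + bs];
  for (std::size_t s = bs / 2; s > 0; s >>= 1)  // the in-block halving ladder
    for (std::size_t l = 0; l < s; ++l) tmp[l] += tmp[l + s];
  return tmp[0];  // what work-item 0 stores to dst[g]
}

The host then folds the per-block partials together, which is why main() below finishes the job with a plain loop over h_dst.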
14 | 15 | // Standard C++ includes 16 | #include <iostream> 17 | #include <vector> 18 | 19 | // SYCL include 20 | #include <CL/sycl.hpp> 21 | 22 | constexpr double Pi = 3.1415926535897932384626433; 23 | 24 | template <typename F, typename RA, typename RWA, typename WA> 25 | void reduce(F f, RA src, RWA tmp, WA dst, cl::sycl::nd_item<1> id) { 26 | auto g = id.get_group().get_id(); 27 | auto bs = id.get_local_range().get(0); 28 | auto l = id.get_local_id().get(0); 29 | 30 | auto i = g * bs * 2 + l; 31 | 32 | tmp[l] = f(src[i], src[i + bs]); 33 | 34 | id.barrier(cl::sycl::access::fence_space::local_space); 35 | 36 | // do reduction in shared mem 37 | for (auto s = bs / 2; s > 0; s >>= 1) { 38 | if (l < s) { 39 | tmp[l] = f(tmp[l], tmp[l + s]); 40 | } 41 | id.barrier(cl::sycl::access::fence_space::local_space); 42 | } 43 | 44 | // write result for this block to global mem 45 | if (l == 0) { 46 | dst[g] = tmp[0]; 47 | } 48 | } 49 | 50 | int main() { 51 | using T = double; 52 | 53 | // Size of vectors 54 | size_t n = 8192; 55 | // block size 56 | size_t local_count = 32; 57 | 58 | // Host vectors 59 | std::vector<T> h_src(n); 60 | std::vector<T> h_dst(n); 61 | 62 | // Initialize vectors on host 63 | for (size_t i = 0; i < n; i++) { 64 | auto k = n - i; 65 | h_src[i] = 1.0 / (k * k); 66 | } 67 | 68 | for (size_t i = 0; i < h_dst.size(); i++) { 69 | h_dst[i] = 0; 70 | } 71 | 72 | auto sum = [](auto const& x, auto const& y) { return x + y; }; 73 | 74 | try { 75 | cl::sycl::queue queue{cl::sycl::gpu_selector()}; 76 | std::cout << "Selected platform: " << queue.get_context().get_platform().get_info<cl::sycl::info::platform::name>() 77 | << "\n"; 78 | std::cout << "Selected device: " << queue.get_device().get_info<cl::sycl::info::device::name>() << "\n"; 79 | 80 | cl::sycl::buffer b_src(h_src.data(), n); 81 | cl::sycl::buffer b_dst(h_dst.data(), n); 82 | 83 | cl::sycl::nd_range<1> r(n / 2, local_count); 84 | 85 | queue.submit([&](cl::sycl::handler& cgh) { 86 | auto a_src = b_src.get_access<cl::sycl::access::mode::read>(cgh); 87 | 88 | cl::sycl::accessor<T, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> a_tmp( 89 | cl::sycl::range<1>(local_count), cgh); 90 | 91 | auto a_dst = b_dst.get_access<cl::sycl::access::mode::write>(cgh); 92 | 93 | cgh.parallel_for(r, [=](cl::sycl::nd_item<1> i) { reduce(sum, a_src, a_tmp, a_dst, i); }); 94 | }); 95 | queue.wait(); 96 | } catch (const cl::sycl::exception& e) { 97 | std::cout << "Exception encountered in SYCL: " << e.what() << "\n"; 98 | return -1; 99 | } 100 | 101 | T res = 0.0; 102 | for (size_t i = 0; i < h_dst.size(); i++) { 103 | res = sum(res, h_dst[i]); 104 | } 105 | 106 | std::cout.precision(16); 107 | std::cout << "Riemann zeta(2) approximation by explicit summing:\n"; 108 | std::cout << "result = " << res << "\n"; 109 | std::cout << "exact = " << Pi * Pi / 6.0 << "\n"; 110 | 111 | return 0; 112 | } 113 | -------------------------------------------------------------------------------- /neural_speed/vectors/gpu/vector_func.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
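The helpers in this header wrap ESIMD's simd<T, VL>::copy_from / copy_to, which move VL contiguous elements between USM memory and the register file. A compilable example would need full DPC++/ESIMD kernel scaffolding, so only a hedged in-kernel usage sketch is shown (chunk index i, 16-wide, assuming float* USM pointers src and dst):

// inside an ESIMD kernel body (sketch only):
//   auto v = usm_copy_from<float, 16>(src, i);  // load chunk i into a simd<float, 16>
//   v = vec_tanh<float, 16>(v);                 // elementwise tanh on all 16 lanes
//   usm_copy_to<float, 16>(dst, v, i);          // store chunk i back

vec_tanh computes tanh(x) = (e^{2x} - 1) / (e^{2x} + 1) from a single exp, trading one extra divide for not needing a native tanh on the device.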
14 | #pragma once 15 | #include <sycl/ext/intel/esimd.hpp> 16 | 17 | template <typename T, int VL> 18 | SYCL_EXTERNAL void usm_copy_from(T* src, sycl::ext::intel::esimd::simd<T, VL> vec, int i) SYCL_ESIMD_FUNCTION {  // NOTE: `vec` is taken by value, so the loaded data is not visible to the caller; prefer the returning overload below 19 | vec.copy_from(src + i * VL); 20 | } 21 | 22 | template <typename T, int VL> 23 | SYCL_EXTERNAL sycl::ext::intel::esimd::simd<T, VL> usm_copy_from(T* src, int i) SYCL_ESIMD_FUNCTION { 24 | sycl::ext::intel::esimd::simd<T, VL> vec; 25 | vec.copy_from(src + i * VL); 26 | return vec; 27 | } 28 | 29 | template <typename T, int VL> 30 | SYCL_EXTERNAL void usm_copy_to(T* dst, sycl::ext::intel::esimd::simd<T, VL> vec, int i) SYCL_ESIMD_FUNCTION { 31 | vec.copy_to(dst + i * VL); 32 | } 33 | 34 | template <typename T, int VL> 35 | SYCL_EXTERNAL void set_value(sycl::ext::intel::esimd::simd<T, VL>& vec, T value) SYCL_ESIMD_FUNCTION { 36 | vec = sycl::ext::intel::esimd::simd<T, VL>(value, 0); 37 | } 38 | 39 | template <typename T, int VL> 40 | SYCL_EXTERNAL sycl::ext::intel::esimd::simd<T, VL> set_value(T value) SYCL_ESIMD_FUNCTION { 41 | return sycl::ext::intel::esimd::simd<T, VL>(value, 0); 42 | } 43 | template <typename T, int VL> 44 | SYCL_EXTERNAL sycl::ext::intel::esimd::simd<T, VL> vec_tanh(sycl::ext::intel::esimd::simd<T, VL> src) 45 | SYCL_ESIMD_FUNCTION { 46 | auto exp2x = sycl::ext::intel::esimd::exp(src * 2.f); 47 | return (exp2x - 1.f) / (exp2x + 1.f); 48 | } 49 | -------------------------------------------------------------------------------- /neural_speed/vectors/parallel_for.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Intel Corporation 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | #pragma once 15 | #ifdef GPU_BACKEND 16 | #include <sycl/sycl.hpp> 17 | 18 | template <int VL, typename kernel_t, typename kernel_tail_t> 19 | void parallel_for(sycl::queue& q, size_t size, kernel_t kernel, kernel_tail_t kernel_tail) { 20 | constexpr unsigned GroupSize = 1; 21 | 22 | sycl::range<1> GlobalRange{size / VL}; 23 | sycl::range<1> LocalRange{GroupSize}; 24 | sycl::nd_range<1> Range(GlobalRange, LocalRange); 25 | 26 | sycl::range<1> GlobalRange_tail{size % VL}; 27 | sycl::range<1> LocalRange_tail{GroupSize}; 28 | sycl::nd_range<1> Range_tail(GlobalRange_tail, LocalRange_tail); 29 | 30 | auto e = q.submit([&](sycl::handler& cgh) { cgh.parallel_for(Range, kernel); }); 31 | auto e_tail = q.submit([&](sycl::handler& cgh) { cgh.parallel_for(Range_tail, kernel_tail); }); 32 | e.wait(); 33 | e_tail.wait(); 34 | } 35 | // #endif 36 | 37 | // Example: 38 | // float* input; 39 | // float* output; 40 | // size_t size = 128 + 1; 41 | // size_t VL = 16; 42 | // ...
43 | // Kernel kernel(input, output); 44 | // Kernel_tail kernel_tail(input, output); 45 | // parallel_for<16>(128, kernel, kernel_tail); 46 | template <int VL, typename kernel_t, typename kernel_tail_t> 47 | void parallel_for(size_t size, kernel_t kernel, kernel_tail_t kernel_tail) { 48 | for (size_t i = 0; i + VL <= size; i += VL) {  // stop at the last full VL-wide chunk so the tail loop below does not overlap 49 | kernel(i); 50 | } 51 | for (size_t i = size / VL * VL; i < size; i++) { 52 | kernel_tail(i); 53 | } 54 | } 55 | #endif 56 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | accelerate 3 | cmake 4 | datasets 5 | huggingface_hub 6 | matplotlib 7 | numpy 8 | peft 9 | protobuf<3.20 10 | py-cpuinfo 11 | sentencepiece 12 | setuptools>=61 13 | tiktoken 14 | torch 15 | transformers 16 | transformers_stream_generator 17 | zipfile38 18 | -------------------------------------------------------------------------------- /scripts/cal_acc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import os 15 | import sys 16 | import shutil 17 | import argparse 18 | from ns_evaluator import LMEvalParser 19 | from accuracy import cli_evaluate 20 | 21 | if __name__ == "__main__": 22 | parser = argparse.ArgumentParser(description="Evaluate accuracy for a model") 23 | parser.add_argument('--model_name', type=str, default="~/Llama-2-7b-chat-hf") 24 | parser.add_argument('--tasks', type=str, default="lambada_openai") 25 | parser.add_argument("--clear", action="store_true") 26 | parser.add_argument("--use_gptq", action="store_true") 27 | parser.add_argument("--use_awq", action="store_true") 28 | parser.add_argument("--use_autoround", action="store_true") 29 | parser.add_argument('--batch_size', type=int, default=1) 30 | parser.add_argument('--weight_dtype', type=str, default="fp32") 31 | parser.add_argument('--compute_dtype', type=str, default="fp32") 32 | parser.add_argument('--group_size', type=int, default=32) 33 | parser.add_argument('--use_ggml', action="store_true") 34 | parser.add_argument('--alg', type=str, default="sym") 35 | parser.add_argument('--scale_dtype', type=str, default="fp32") 36 | parser.add_argument('--init_from_bin', type=str, default="default_none") 37 | parser.add_argument('--model_format', type=str, default="neural_speed") 38 | args = parser.parse_args() 39 | 40 | model_args=f'pretrained={args.model_name},model_format={args.model_format},dtype=float32,trust_remote_code=True' 41 | # model_args += f'use_gptq={args.use_gptq},use_awq={args.use_awq},use_autoround={args.use_autoround}' 42 | eval_args = LMEvalParser(model="hf", 43 | model_args=model_args, 44 | tasks=f"{args.tasks}", 45 | device="cpu", 46 | batch_size=args.batch_size, 47 | use_gptq=args.use_gptq, 48 | use_autoround=args.use_autoround, 49 | use_awq=args.use_awq, 50 | weight_dtype=args.weight_dtype, 51 |
compute_dtype=args.compute_dtype, 52 | group_size=args.group_size, 53 | use_ggml=args.use_ggml, 54 | alg=args.alg, 55 | scale_dtype=args.scale_dtype, 56 | init_from_bin=args.init_from_bin 57 | ) 58 | results = cli_evaluate(eval_args) 59 | print(results) 60 | 61 | if args.clear and os.path.isdir('runtime_outs'): 62 | shutil.rmtree('runtime_outs') 63 | -------------------------------------------------------------------------------- /scripts/cal_diff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2023 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import numpy as np 19 | import argparse 20 | from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM 21 | from neural_speed import Model 22 | 23 | def cmpData(numa, numb): 24 | totalErr = ((numa - numb)**2).sum() 25 | totalNum = (numa**2).sum() 26 | diff2 = np.sqrt(totalErr / totalNum) 27 | 28 | cos = np.dot(numa, numb) / (np.linalg.norm(numa) * np.linalg.norm(numb)) 29 | return {"diff2": diff2, "cos": cos} 30 | 31 | 32 | if __name__ == "__main__": 33 | parser = argparse.ArgumentParser(description="Evaluate diff for a model") 34 | parser.add_argument('--model_name', type=str, default="~/Llama-2-7b-chat-hf") 35 | args = parser.parse_args() 36 | 37 | woq_configs = { 38 | "fp32": {"use_quant":False}, 39 | # "ggml_int4": {"compute_dtype":"int8", "weight_dtype":"int4", "use_ggml":True}, 40 | "jblas_int4": {"compute_dtype":"int8", "weight_dtype":"int4"}, 41 | # "jblas_int8": {"compute_dtype":"bf16", "weight_dtype":"int8"}, 42 | 43 | } 44 | prompt = "What is the meaning of life?" 45 | 46 | model_name = args.model_name 47 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 48 | inputs = tokenizer(prompt, return_tensors="pt") 49 | 50 | pt_model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True) 51 | pt_model.eval() 52 | pt_logits = pt_model(input_ids=inputs.input_ids).logits[:, -1] 53 | 54 | for config_type in woq_configs: 55 | itrex_model = Model() 56 | itrex_model.init(model_name, **woq_configs[config_type]) 57 | itrex_logits = itrex_model(inputs.input_ids) 58 | 59 | print(config_type, cmpData(pt_logits.detach().numpy().flatten(), itrex_logits.flatten())) 60 | -------------------------------------------------------------------------------- /scripts/convert.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import sys 17 | from pathlib import Path 18 | from typing import List, Optional 19 | from huggingface_hub import snapshot_download 20 | from neural_speed.convert import convert_model 21 | 22 | def main(args_in: Optional[List[str]] = None) -> None: 23 | parser = argparse.ArgumentParser(description="Convert a PyTorch model to a NE compatible file") 24 | parser.add_argument( 25 | "--outtype", 26 | choices=["f32", "f16"], 27 | help="output format, default: f32", 28 | default="f32", 29 | ) 30 | parser.add_argument( 31 | "--token", 32 | type=str, 33 | help="Access token ID for models that require it (LLaMa2, etc..)", 34 | ) 35 | parser.add_argument( 36 | "--outfile", 37 | type=Path, 38 | required=True, 39 | help="path to write to" 40 | ) 41 | parser.add_argument( 42 | "--format", 43 | type=str, 44 | default="NE", 45 | choices=["NE", "GGUF"], 46 | help="Convert to the GGUF or NE format" 47 | ) 48 | parser.add_argument( 49 | "--use_quantized_model", 50 | action="store_true", 51 | help="use quantized model: awq/gptq/autoround" 52 | ) 53 | parser.add_argument( 54 | "model", 55 | type=Path, 56 | help="directory containing model file or model id" 57 | ) 58 | 59 | args = parser.parse_args(args_in) 60 | 61 | if args.model.exists(): 62 | dir_model = args.model.as_posix() 63 | else: 64 | try: 65 | dir_model = snapshot_download(repo_id=str(args.model), resume_download=True, token=args.token) 66 | except Exception as e: 67 | if getattr(e, "response", None) is not None and e.response.status_code == 401:  # only HTTP errors carry a response; guard so other failures still exit cleanly 68 | print("You are required to input an access token ID for {}, please add it in option --token or download model weights locally".format(args.model)) 69 | sys.exit(f"{e}") 70 | 71 | convert_model(dir_model, args.outfile, args.outtype, format=args.format, use_quantized_model=args.use_quantized_model) 72 | 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /scripts/load_peft_and_merge.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Intel Corporation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | from peft import PeftModel 17 | from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM 18 | 19 | 20 | def main(): 21 | parser = argparse.ArgumentParser(description="Load CausalLM and Peft model, then merge and save.") 22 | parser.add_argument( 23 | "--model_name_or_path", 24 | type=str, 25 | required=True, 26 | help="The model checkpoint for weights initialization."
27 | "Set to model id of huggingface model hub or local path to the model.", 28 | ) 29 | parser.add_argument( 30 | "--peft_name_or_path", 31 | type=str, 32 | required=True, 33 | help="The peft model checkpoint for weights initialization." 34 | "Set to model id of huggingface model hub or local path to the model.", 35 | ) 36 | parser.add_argument("--save_path", type=str, default=None, help="Path to save merged model checkpoint.") 37 | args = parser.parse_args() 38 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) 39 | config = AutoConfig.from_pretrained(args.model_name_or_path) 40 | model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, config=config) 41 | model = PeftModel.from_pretrained(model, args.peft_name_or_path) 42 | model = model.merge_and_unload() 43 | save_path = args.save_path 44 | if save_path is None: 45 | save_path = "./{}_{}".format( 46 | args.model_name_or_path.strip('/').split('/')[-1], 47 | args.peft_name_or_path.strip('/').split('/')[-1]) 48 | tokenizer.save_pretrained(save_path) 49 | model.save_pretrained(save_path) 50 | print(f"Merged model saved in {save_path}") 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /scripts/python_api_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2023 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | import sys 18 | from transformers import AutoTokenizer, TextStreamer 19 | from neural_speed import Model 20 | 21 | if len(sys.argv) != 2: 22 | print("Usage: python python_api_example.py model_path") 23 | model_name = sys.argv[1] 24 | 25 | prompt = "Once upon a time, a little girl" 26 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 27 | inputs = tokenizer(prompt, return_tensors="pt").input_ids 28 | streamer = TextStreamer(tokenizer) 29 | 30 | model = Model() 31 | # If you want to run GPTQ or AWQ models, just set use_gptq = True or use_awq = True. 32 | model.init(model_name, weight_dtype="int4", compute_dtype="int8") 33 | outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True) 34 | -------------------------------------------------------------------------------- /scripts/python_api_example_for_gguf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2023 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | import sys 18 | import argparse 19 | from pathlib import Path 20 | from typing import List, Optional 21 | from transformers import AutoTokenizer, TextStreamer 22 | from neural_speed import Model 23 | 24 | # Usage: 25 | # python python_api_example_for_gguf.py \ 26 | # --model_name falcon \ 27 | # --model_path /model_path/falcon-7b \ 28 | # -m /model_path/falcon-7b/ggml-model-f32.gguf 29 | 30 | def main(args_in: Optional[List[str]] = None) -> None: 31 | parser = argparse.ArgumentParser(description="main program llm running") 32 | parser.add_argument("--model_name", type=str, help="Model name: String", required=True) 33 | parser.add_argument("--model_path", type=Path, help="Path to the model: String", required=True) 34 | parser.add_argument("-m", "--model", type=Path, help="Path to the executed model: String", required=True) 35 | parser.add_argument("--format", 36 | type=str, 37 | default="GGUF", 38 | choices=["NE", "GGUF"], 39 | help="convert to the GGUF or NE format") 40 | parser.add_argument( 41 | "-p", 42 | "--prompt", 43 | type=str, 44 | help="Prompt to start generation with: String (default: empty)", 45 | default="Once upon a time", 46 | ) 47 | 48 | args = parser.parse_args(args_in) 49 | print(args) 50 | 51 | gguf_path = args.model.as_posix() 52 | 53 | prompt = args.prompt 54 | tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) 55 | inputs = tokenizer(prompt, return_tensors="pt").input_ids 56 | streamer = TextStreamer(tokenizer) 57 | 58 | model = Model() 59 | model.init_from_bin(args.model_name, gguf_path) 60 | outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True) 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | -------------------------------------------------------------------------------- /scripts/python_api_example_for_gptq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2023 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | import sys 18 | import argparse 19 | from pathlib import Path 20 | from transformers import AutoTokenizer, TextStreamer 21 | from neural_speed import Model 22 | from typing import List, Optional 23 | 24 | 25 | def main(args_in: Optional[List[str]] = None) -> None: 26 | parser = argparse.ArgumentParser(description="pythonAPI example for gptq") 27 | parser.add_argument("model", type=Path, help="directory containing model file") 28 | parser.add_argument( 29 | "-p", 30 | "--prompt", 31 | type=str, 32 | help="Prompt to start generation with: String (default: empty)", 33 | default="Once upon a time, a little girl", 34 | ) 35 | args = parser.parse_args(args_in) 36 | 37 | prompt = args.prompt 38 | model_name = args.model 39 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 40 | inputs = tokenizer(prompt, return_tensors="pt").input_ids 41 | streamer = TextStreamer(tokenizer) 42 | 43 | model = Model() 44 | # If you want to run AWQ models, just set use_awq = True. 45 | model.init(model_name, weight_dtype="int4", compute_dtype="int8", use_gptq=True) 46 | outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True) 47 | 48 | 49 | if __name__ == "__main__": 50 | main() 51 | -------------------------------------------------------------------------------- /scripts/python_api_example_for_modelscope.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2023 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | import sys 18 | from modelscope import AutoTokenizer 19 | from transformers import TextStreamer 20 | from neural_speed import Model 21 | 22 | if len(sys.argv) != 2: 23 | print("Usage: python python_api_example.py model_path") 24 | model_name = sys.argv[1] 25 | 26 | prompt = "Once upon a time, a little girl" 27 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 28 | inputs = tokenizer(prompt, return_tensors="pt").input_ids 29 | streamer = TextStreamer(tokenizer) 30 | 31 | model = Model() 32 | # If you want to run GPTQ or AWQ models, just set use_gptq = True or use_awq = True. 
33 | model.init(model_name, weight_dtype="int4", compute_dtype="int8", model_hub="modelscope") 34 | outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True) 35 | -------------------------------------------------------------------------------- /scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | lm_eval==0.4.2 2 | -------------------------------------------------------------------------------- /scripts/whisper_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2023 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | import sys 18 | from transformers import AutoTokenizer, TextStreamer 19 | from neural_speed import Model 20 | 21 | if len(sys.argv) != 3: 22 | print("Usage: python whisper_example.py model_path and audio_file") 23 | model_name = sys.argv[1] 24 | audio_file = sys.argv[2] 25 | 26 | model = Model() 27 | model.init(model_name, use_ggml=True) 28 | model(audio_file) 29 | -------------------------------------------------------------------------------- /security.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. 3 | 4 | ## Reporting a Vulnerability 5 | Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 6 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | lm_eval==0.4.2 2 | modelscope==1.13.1 3 | optimum==1.13.2 4 | optimum-intel==1.11.0 5 | zipfile38 6 | -------------------------------------------------------------------------------- /tests/test_modelscope.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (c) 2024 Intel Corporation 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | import sys 18 | from modelscope import AutoTokenizer 19 | from transformers import TextStreamer 20 | from neural_speed import Model 21 | 22 | model_name = "/tf_dataset2/models/pytorch/Qwen-7B" 23 | 24 | prompt = "Once upon a time, a little girl" 25 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 26 | inputs = tokenizer(prompt, return_tensors="pt").input_ids 27 | streamer = TextStreamer(tokenizer) 28 | 29 | model = Model() 30 | # If you want to run GPTQ or AWQ models, just set use_gptq = True or use_awq = True. 31 | model.init(model_name, weight_dtype="int4", compute_dtype="int8", model_hub="modelscope") 32 | outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True) 33 | --------------------------------------------------------------------------------